line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
=head1 NAME |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
Text::Same::FileChunkedSource |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
=head1 DESCRIPTION |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
Objects of this class represent a source of chunks (generally lines) |
8
|
|
|
|
|
|
|
in a file. The "chunks" could potentially be paragraphs or sentences. |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
=head1 SYNOPSIS |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
my $source = new Text::Same::FileChunkedSource(chunks->\@chunks) |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
=head1 METHODS |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
See below. Methods private to this module are prefixed by an |
17
|
|
|
|
|
|
|
underscore. |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
=cut |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
package Text::Same::FileChunkedSource; |
22
|
|
|
|
|
|
|
|
23
|
3
|
|
|
3
|
|
15
|
use warnings; |
|
3
|
|
|
|
|
5
|
|
|
3
|
|
|
|
|
83
|
|
24
|
3
|
|
|
3
|
|
14
|
use strict; |
|
3
|
|
|
|
|
4
|
|
|
3
|
|
|
|
|
231
|
|
25
|
3
|
|
|
3
|
|
107
|
use Carp; |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
258
|
|
26
|
|
|
|
|
|
|
|
27
|
3
|
|
|
3
|
|
28
|
use vars qw($VERSION @ISA); |
|
3
|
|
|
|
|
4
|
|
|
3
|
|
|
|
|
162
|
|
28
|
|
|
|
|
|
|
$VERSION = '0.07'; |
29
|
|
|
|
|
|
|
|
30
|
3
|
|
|
3
|
|
14
|
use Digest::MD5 qw(md5); |
|
3
|
|
|
|
|
5
|
|
|
3
|
|
|
|
|
173
|
|
31
|
|
|
|
|
|
|
|
32
|
3
|
|
|
3
|
|
1807
|
use Text::Same::Util; |
|
3
|
|
|
|
|
8
|
|
|
3
|
|
|
|
|
151
|
|
33
|
3
|
|
|
3
|
|
2069
|
use Text::Same::ChunkedSource; |
|
3
|
|
|
|
|
7
|
|
|
3
|
|
|
|
|
4264
|
|
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
@ISA = qw(Text::Same::ChunkedSource); |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
=head2 new |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
Title : new |
40
|
|
|
|
|
|
|
Usage : $source = new Text::Same::FileChunkedSource(chunks->\@chunks) |
41
|
|
|
|
|
|
|
Function: Creates a new ChunkedSource object from an array |
42
|
|
|
|
|
|
|
Returns : A Text::Same::FileChunkedSource object |
43
|
|
|
|
|
|
|
Args : chunks - an array of strings |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
=cut |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
sub new |
48
|
|
|
|
|
|
|
{ |
49
|
72
|
|
|
72
|
1
|
101
|
my $arg = shift; |
50
|
72
|
|
33
|
|
|
281
|
my $class = ref($arg) || $arg; |
51
|
|
|
|
|
|
|
|
52
|
72
|
|
|
|
|
115
|
my $self = {}; |
53
|
|
|
|
|
|
|
|
54
|
72
|
|
|
|
|
221
|
my %params = @_; |
55
|
|
|
|
|
|
|
|
56
|
72
|
|
|
|
|
185
|
$self->{name} = $params{name}; |
57
|
72
|
|
|
|
|
130
|
$self->{all_chunks} = $params{chunks}; |
58
|
72
|
|
|
|
|
79
|
$self->{all_chunks_count} = scalar(@{$params{chunks}}); |
|
72
|
|
|
|
|
155
|
|
59
|
|
|
|
|
|
|
|
60
|
72
|
|
|
|
|
497
|
return bless $self, $class; |
61
|
|
|
|
|
|
|
} |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
=head2 name |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
Title : name |
66
|
|
|
|
|
|
|
Usage : my $name = $source->name(); |
67
|
|
|
|
|
|
|
Function: return the name of this source - generally the filename |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
=cut |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
sub name |
72
|
|
|
|
|
|
|
{ |
73
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
74
|
0
|
|
|
|
|
0
|
return $self->{name}; |
75
|
|
|
|
|
|
|
} |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
=head2 get_all_chunks |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
Title : get_all_chunks |
80
|
|
|
|
|
|
|
Usage : $all_chunks = $source->get_all_chunks; |
81
|
|
|
|
|
|
|
Function: return (in order) the chunks from this source |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
=cut |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
sub get_all_chunks |
86
|
|
|
|
|
|
|
{ |
87
|
824
|
|
|
824
|
1
|
837
|
my $self = shift; |
88
|
824
|
|
|
|
|
720
|
return @{$self->{all_chunks}}; |
|
824
|
|
|
|
|
3366
|
|
89
|
|
|
|
|
|
|
} |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
=head2 get_chunk_by_indx |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
Title : get_chunk_by_indx |
94
|
|
|
|
|
|
|
Usage : $chunk = $source->get_chunk_by_indx($indx); |
95
|
|
|
|
|
|
|
Function: return the chunk/line at the given index in this source |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
=cut |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
sub get_chunk_by_indx |
100
|
|
|
|
|
|
|
{ |
101
|
762
|
|
|
762
|
1
|
1018
|
my $self = shift; |
102
|
762
|
|
|
|
|
900
|
my $chunk_indx = shift; |
103
|
|
|
|
|
|
|
|
104
|
762
|
|
|
|
|
2413
|
return $self->{all_chunks}[$chunk_indx]; |
105
|
|
|
|
|
|
|
} |
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
=head2 get_all_chunks_count |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
Title : get_all_chunks_count |
110
|
|
|
|
|
|
|
Usage : $count = $source->get_all_chunks_count; |
111
|
|
|
|
|
|
|
Function: return the number of chunks in this source |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
=cut |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
sub get_all_chunks_count |
116
|
|
|
|
|
|
|
{ |
117
|
655
|
|
|
655
|
1
|
695
|
my $self = shift; |
118
|
655
|
|
|
|
|
1990
|
return $self->{all_chunks_count}; |
119
|
|
|
|
|
|
|
} |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
sub _make_chunk_maps |
122
|
|
|
|
|
|
|
{ |
123
|
72
|
|
|
72
|
|
84
|
my $self = shift; |
124
|
72
|
|
|
|
|
84
|
my $options = shift; |
125
|
|
|
|
|
|
|
|
126
|
72
|
|
|
|
|
111
|
my @filtered_chunk_indexes = (); |
127
|
72
|
|
|
|
|
105
|
my %filtered_hash = (); |
128
|
72
|
|
|
|
|
114
|
my %real_index_to_filtered_index = (); |
129
|
72
|
|
|
|
|
108
|
my %filtered_index_to_real_index = (); |
130
|
|
|
|
|
|
|
|
131
|
72
|
|
|
|
|
97
|
my $filtered_chunk_count = 0; |
132
|
|
|
|
|
|
|
|
133
|
72
|
|
|
|
|
103
|
for (my $i = 0; $i < scalar(@{$self->{all_chunks}}); $i++) { |
|
651
|
|
|
|
|
7111
|
|
134
|
579
|
|
|
|
|
1257
|
my $chunk = $self->{all_chunks}[$i]; |
135
|
579
|
100
|
|
|
|
1399
|
if (!is_ignorable($options, $chunk)) { |
136
|
528
|
|
|
|
|
846
|
push @filtered_chunk_indexes, $i; |
137
|
528
|
|
|
|
|
963
|
$real_index_to_filtered_index{$i} = $filtered_chunk_count; |
138
|
528
|
|
|
|
|
927
|
$filtered_index_to_real_index{$filtered_chunk_count} = $i; |
139
|
|
|
|
|
|
|
|
140
|
528
|
|
|
|
|
547
|
push @{$filtered_hash{Text::Same::Util::hash($options, $chunk)}}, $i; |
|
528
|
|
|
|
|
1275
|
|
141
|
528
|
|
|
|
|
1242
|
$filtered_chunk_count++; |
142
|
|
|
|
|
|
|
} |
143
|
|
|
|
|
|
|
} |
144
|
|
|
|
|
|
|
|
145
|
72
|
|
|
|
|
606
|
return \@filtered_chunk_indexes, \%filtered_hash, |
146
|
|
|
|
|
|
|
\%real_index_to_filtered_index, \%filtered_index_to_real_index; |
147
|
|
|
|
|
|
|
} |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
sub _get_map_key_from_options |
151
|
|
|
|
|
|
|
{ |
152
|
4254
|
|
|
4254
|
|
4385
|
my $self = shift; |
153
|
4254
|
|
|
|
|
4253
|
my $options = shift; |
154
|
|
|
|
|
|
|
|
155
|
4254
|
50
|
|
|
|
7688
|
if (!defined $options) { |
156
|
0
|
|
|
|
|
0
|
carp("\$options is undefined"); |
157
|
|
|
|
|
|
|
} |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
return |
160
|
4254
|
100
|
|
|
|
22213
|
("key_" . |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
161
|
|
|
|
|
|
|
($options->{ignore_case} ? "w" : "W") . "_" . |
162
|
|
|
|
|
|
|
($options->{ignore_blanks} ? "b" : "B") . "_" . |
163
|
|
|
|
|
|
|
($options->{ignore_case} ? "i" : "I") . "_" . |
164
|
|
|
|
|
|
|
($options->{ignore_simple} ? $options->{ignore_simple} : "0")); |
165
|
|
|
|
|
|
|
} |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
sub _maybe_make_filtered_maps |
168
|
|
|
|
|
|
|
{ |
169
|
2127
|
|
|
2127
|
|
2395
|
my $self = shift; |
170
|
2127
|
|
|
|
|
2131
|
my $options = shift; |
171
|
|
|
|
|
|
|
|
172
|
2127
|
|
|
|
|
3368
|
my $key = $self->_get_map_key_from_options($options); |
173
|
|
|
|
|
|
|
|
174
|
2127
|
100
|
|
|
|
6495
|
if (!exists $self->{filtered_chunk_indexes}{$key}) { |
175
|
72
|
|
|
|
|
167
|
($self->{filtered_chunk_indexes}{$key}, |
176
|
|
|
|
|
|
|
$self->{filtered_hash}{$key}, |
177
|
|
|
|
|
|
|
$self->{real_to_filtered}{$key}, |
178
|
|
|
|
|
|
|
$self->{filtered_to_real}{$key}) = $self->_make_chunk_maps($options); |
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
} |
181
|
|
|
|
|
|
|
} |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
=head2 get_filtered_chunk_indexes |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
Title : get_filtered_chunk_indexes |
186
|
|
|
|
|
|
|
Usage : $filtered_chunk_indexes = $source->get_filtered_chunk_indexes($options); |
187
|
|
|
|
|
|
|
Function: return (in order) the chunks from this source that match the given |
188
|
|
|
|
|
|
|
options: |
189
|
|
|
|
|
|
|
ignore_case=> (0 or 1) -- ignore case when comparing |
190
|
|
|
|
|
|
|
ignore_blanks=> (0 or 1) -- ignore blank lines when comparing |
191
|
|
|
|
|
|
|
ignore_space=> (0 or 1) -- ignore whitespace in chunks |
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
=cut |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
sub get_filtered_chunk_indexes |
196
|
|
|
|
|
|
|
{ |
197
|
831
|
|
|
831
|
1
|
956
|
my $self = shift; |
198
|
831
|
|
|
|
|
877
|
my $options = shift; |
199
|
|
|
|
|
|
|
|
200
|
831
|
|
|
|
|
1347
|
my $key = $self->_get_map_key_from_options($options); |
201
|
|
|
|
|
|
|
|
202
|
831
|
|
|
|
|
1652
|
$self->_maybe_make_filtered_maps($options); |
203
|
|
|
|
|
|
|
|
204
|
831
|
|
|
|
|
3006
|
return $self->{filtered_chunk_indexes}{$key}; |
205
|
|
|
|
|
|
|
} |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
=head2 get_matching_chunk_indexes |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
Title : get_matching_chunk_indexes |
210
|
|
|
|
|
|
|
Usage : $matches = $source->get_matching_chunk_indexes($options, $text); |
211
|
|
|
|
|
|
|
Function: return (in order) the chunks from this source that match the given |
212
|
|
|
|
|
|
|
text. |
213
|
|
|
|
|
|
|
options: |
214
|
|
|
|
|
|
|
ignore_case=> (0 or 1) -- ignore case when comparing |
215
|
|
|
|
|
|
|
ignore_blanks=> (0 or 1) -- ignore blank lines when comparing |
216
|
|
|
|
|
|
|
ignore_space=> (0 or 1) -- ignore whitespace in chunks |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
=cut |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
sub get_matching_chunk_indexes |
221
|
|
|
|
|
|
|
{ |
222
|
258
|
|
|
258
|
1
|
337
|
my $self = shift; |
223
|
258
|
|
|
|
|
272
|
my $options = shift; |
224
|
258
|
|
|
|
|
345
|
my $text = shift; |
225
|
|
|
|
|
|
|
|
226
|
258
|
|
|
|
|
478
|
my $key = $self->_get_map_key_from_options($options); |
227
|
|
|
|
|
|
|
|
228
|
258
|
|
|
|
|
510
|
$self->_maybe_make_filtered_maps($options); |
229
|
|
|
|
|
|
|
|
230
|
258
|
|
|
|
|
466
|
my $chunk_hash = $self->{filtered_hash}{$key}; |
231
|
|
|
|
|
|
|
|
232
|
258
|
100
|
|
|
|
261
|
return @{$chunk_hash->{hash($options, $text)} || []}; |
|
258
|
|
|
|
|
628
|
|
233
|
|
|
|
|
|
|
} |
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
=head2 _get_filtered_indx_from_real |
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
Title : _get_filtered_indx_from_real |
238
|
|
|
|
|
|
|
Usage : $indx = $source->_get_filtered_indx_from_real($options, $real_indx); |
239
|
|
|
|
|
|
|
Function: for the given index (eg. line number) in this source, return the |
240
|
|
|
|
|
|
|
corresponding index in the list of chunks generated by applying the |
241
|
|
|
|
|
|
|
$options. For example if $options->{ignore_blanks} is true the |
242
|
|
|
|
|
|
|
filtered chunks will contain no blank lines. |
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
eg. input lines: |
245
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
some text on line 0 |
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
some text on line 3 |
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
the real index of "some text on line 3" is 3, but the filtered index is 1 if |
252
|
|
|
|
|
|
|
ignore_blanks is set because the filtered lines are: |
253
|
|
|
|
|
|
|
some text on line 0 |
254
|
|
|
|
|
|
|
some text on line 3 |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
=cut |
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
sub _get_filtered_indx_from_real |
259
|
|
|
|
|
|
|
{ |
260
|
1038
|
|
|
1038
|
|
1065
|
my $self = shift; |
261
|
1038
|
|
|
|
|
1067
|
my $options = shift; |
262
|
1038
|
|
|
|
|
1043
|
my $real_indx = shift; |
263
|
|
|
|
|
|
|
|
264
|
1038
|
|
|
|
|
1742
|
my $key = $self->_get_map_key_from_options($options); |
265
|
|
|
|
|
|
|
|
266
|
1038
|
|
|
|
|
2024
|
$self->_maybe_make_filtered_maps($options); |
267
|
|
|
|
|
|
|
|
268
|
1038
|
|
|
|
|
3029
|
return $self->{real_to_filtered}{$key}{$real_indx}; |
269
|
|
|
|
|
|
|
} |
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
=head2 get_previous_chunk_indx |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
Title : get_previous_chunk_indx |
274
|
|
|
|
|
|
|
Usage : $prev_chunk_indx = |
275
|
|
|
|
|
|
|
$source->get_previous_chunk_indx($options, $chunk_indx); |
276
|
|
|
|
|
|
|
Function: return the previous chunk index from the list of filtered chunk |
277
|
|
|
|
|
|
|
indexes (for the given $options). See discussion above. |
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
=cut |
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
sub get_previous_chunk_indx |
282
|
|
|
|
|
|
|
{ |
283
|
828
|
|
|
828
|
1
|
988
|
my $self = shift; |
284
|
828
|
|
|
|
|
966
|
my $options = shift; |
285
|
828
|
|
|
|
|
932
|
my $chunk_indx = shift; |
286
|
|
|
|
|
|
|
|
287
|
828
|
|
|
|
|
1387
|
my $prev_filtered_indx = |
288
|
|
|
|
|
|
|
$self->_get_filtered_indx_from_real($options, $chunk_indx) - 1; |
289
|
|
|
|
|
|
|
|
290
|
828
|
100
|
|
|
|
1738
|
if ($prev_filtered_indx < 0) { |
291
|
168
|
|
|
|
|
439
|
return undef; |
292
|
|
|
|
|
|
|
} |
293
|
|
|
|
|
|
|
|
294
|
660
|
|
|
|
|
1249
|
return ($self->get_filtered_chunk_indexes($options))->[$prev_filtered_indx]; |
295
|
|
|
|
|
|
|
} |
296
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
=head2 get_next_chunk_indx |
298
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
Title : get_next_chunk_indx |
300
|
|
|
|
|
|
|
Usage : $next_chunk_indx = |
301
|
|
|
|
|
|
|
$source->get_next_chunk_indx($options, $chunk_indx); |
302
|
|
|
|
|
|
|
Function: return the next chunk index from the list of filtered chunk |
303
|
|
|
|
|
|
|
indexes (for the given $options). See discussion above. |
304
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
=cut |
306
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
sub get_next_chunk_indx |
308
|
|
|
|
|
|
|
{ |
309
|
210
|
|
|
210
|
1
|
263
|
my $self = shift; |
310
|
210
|
|
|
|
|
242
|
my $options = shift; |
311
|
210
|
|
|
|
|
223
|
my $chunk_indx = shift; |
312
|
|
|
|
|
|
|
|
313
|
210
|
|
|
|
|
368
|
my $next_filtered_indx = |
314
|
|
|
|
|
|
|
$self->_get_filtered_indx_from_real($options, $chunk_indx) + 1; |
315
|
|
|
|
|
|
|
|
316
|
210
|
100
|
|
|
|
420
|
if ($next_filtered_indx >= $self->get_all_chunks_count()) { |
317
|
75
|
|
|
|
|
190
|
return undef; |
318
|
|
|
|
|
|
|
} |
319
|
|
|
|
|
|
|
|
320
|
135
|
|
|
|
|
253
|
return ($self->get_filtered_chunk_indexes($options))->[$next_filtered_indx]; |
321
|
|
|
|
|
|
|
} |
322
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
=head1 AUTHOR |
324
|
|
|
|
|
|
|
|
325
|
|
|
|
|
|
|
Kim Rutherford |
326
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
=head1 COPYRIGHT & LICENSE |
328
|
|
|
|
|
|
|
|
329
|
|
|
|
|
|
|
Copyright 2005,2006 Kim Rutherford. All rights reserved. |
330
|
|
|
|
|
|
|
|
331
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it |
332
|
|
|
|
|
|
|
under the same terms as Perl itself. |
333
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
=head1 DISCLAIMER |
335
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
This module is provided "as is" without warranty of any kind. It |
337
|
|
|
|
|
|
|
may redistributed under the same conditions as Perl itself. |
338
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
=cut |
340
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
1; |