=head1 NAME

Text::DeDuper - near duplicates detection module

=head1 SYNOPSIS

    use Text::DeDuper;

    $deduper = new Text::DeDuper();
    $deduper->add_doc("doc1", $doc1text);
    $deduper->add_doc("doc2", $doc2text);

    @similar_docs = $deduper->find_similar($doc3text);

    ...

    # delete near duplicates from an array of texts
    $deduper = new Text::DeDuper();
    foreach $text (@texts)
    {
        next if $deduper->find_similar($text);

        $deduper->add_doc($i++, $text);
        push @no_near_duplicates, $text;
    }

=head1 DESCRIPTION

This module uses the resemblance measure as proposed by Andrei Z. Broder et al.
(http://www.ra.ethz.ch/CDstore/www6/Technical/Paper205/Paper205.html) to detect
similar (near-duplicate) documents based on their text.
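
In terms of the implementation below, each document is reduced to the set of
digests of its word n-grams, and the resemblance of two documents A and B is

    resemblance(A, B) = intersection(A, B) / union(A, B)

computed over the two digest sets. For example, two documents with 120 and 130
unique n-gram digests that share 50 of them have resemblance
50 / (120 + 130 - 50) = 0.25, just above the default similarity threshold
of 0.2.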

Note of caution: The module only works correctly with languages where texts can
be tokenised into words by detecting sequences of alphabetical characters.
Therefore it might not provide very good results for e.g. Chinese.

=cut

package Text::DeDuper;

use strict;
use warnings;
use vars qw($VERSION);

use Carp;
use Digest::MD4;
use Encode;

$VERSION = do { my @r = (q$Revision: 1.1 $ =~ /\d+/g); sprintf "%d."."%02d", @r };

=head1 METHODS

=head2 new (CONSTRUCTOR)

    $deduper = new Text::DeDuper();

Create a new DeDuper instance. Supported attributes are described below, in the
I<ATTRIBUTES> section.

=cut

sub new
{
    my $class = shift;
    my %options = @_;

    my $self = bless {
        ngram_size    => 5,
        sim_trsh      => 0.2,
        encoding      => 'utf8',
        _stoplist     => {},
        _digest_count => {},
        _doc_ids      => {},
    }, $class;

    $self->stoplist(  $options{stoplist});
    $self->ngram_size($options{ngram_size});
    $self->sim_trsh(  $options{sim_trsh});
    $self->encoding(  $options{encoding});

    return $self;
}

=head2 add_doc

    $deduper->add_doc($document_id, $document_text);

Add a new document to the DeDuper's database. The C<$document_id> must be
unique for each document; C<add_doc> croaks if an ID is used twice.
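
Since a duplicate ID raises an exception via C<croak>, additions can be
guarded with C<eval> (an illustrative sketch):

    eval { $deduper->add_doc($document_id, $document_text) };
    warn "skipped duplicate id '$document_id'" if $@;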

=cut

sub add_doc
{
    my $self = shift;
    my $docid = shift;
    my $text = shift;

    croak("duplicate document id '$docid'")
        if defined $self->{_digest_count}->{$docid};

    my @tokens = $self->_tokenise($text);
    my @filtered_tokens = $self->_apply_stoplist(@tokens);
    my @digests = $self->_build_ngram_digests(@filtered_tokens);

    # remember the document's digest count and index each digest -> doc IDs
    $self->{_digest_count}->{$docid} = scalar(@digests);
    foreach my $digest (@digests)
    {
        if (not defined $self->{_doc_ids}->{$digest})
            { $self->{_doc_ids}->{$digest} = [ $docid ]; }
        else
            { push @{$self->{_doc_ids}->{$digest}}, $docid; }
    }
}

=head2 find_similar

    @similar_docs = $deduper->find_similar($document_text);

Returns a (possibly empty) array of document IDs of documents in the DeDuper's
database similar to the C<$document_text>. This can very simply be used for
testing whether a near-duplicate document is in the database:

    if ($deduper->find_similar($document_text))
    {
        print "at least one near duplicate found";
    }

=cut

sub find_similar
{
    my $self = shift;
    my $text = shift;

    my @tokens = $self->_tokenise($text);
    my @filtered_tokens = $self->_apply_stoplist(@tokens);
    my @digests = $self->_build_ngram_digests(@filtered_tokens);

    # compute intersection sizes with all documents in the database
    my %intersection_size;
    foreach my $digest (@digests)
    {
        next
            unless defined($self->{_doc_ids}->{$digest});

        foreach my $docid (@{$self->{_doc_ids}->{$digest}})
        {
            if (defined $intersection_size{$docid})
                { $intersection_size{$docid}++; }
            else
                { $intersection_size{$docid} = 1; }
        }
    }

    my @similar;
    foreach my $docid (keys %intersection_size)
    {
        # union size (inclusion-exclusion over the two digest sets)
        my $union_size = scalar(@digests) + $self->{_digest_count}->{$docid} -
            $intersection_size{$docid};
        # resemblance
        my $resemblance = $union_size > 0 ?
            $intersection_size{$docid} / $union_size : 0;
        # return docs with resemblance above threshold
        push @similar, $docid
            if $resemblance > $self->{sim_trsh};
    }

    return @similar;
}

=head2 clean

    $deduper->clean();

Removes all documents from the DeDuper's database.

=cut

sub clean
{
    my $self = shift;

    $self->{_doc_ids} = {};
    $self->{_digest_count} = {};
}

=head1 ATTRIBUTES

Attributes can be set using the constructor:

    $deduper = new Text::DeDuper(
        ngram_size => 4,
        encoding   => 'iso-8859-1'
    );

... or using the object methods:

    $deduper->ngram_size(4);
    $deduper->encoding('iso-8859-1');

The object methods can also be used for retrieving the values of the
attributes:

    $ngram_size = $deduper->ngram_size();
    @stoplist   = $deduper->stoplist();

=over

=item encoding

The character encoding of the processed texts. Must be set to the correct
value so that alphabetical characters can be detected. Accepted values are
those supported by the L<Encode> module (see L<Encode::Supported>).

B<Default value:> 'utf8'

=item sim_trsh

The similarity threshold defines how similar two documents must be to be
considered near duplicates. The boundary values are 0 and 1. A similarity
value of 1 indicates that the documents are exactly the same, while the value
of 0 means that the documents do not share any n-gram.

Any two documents will have a similarity value below the default threshold
unless they share a significant part of text.
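
For example, to only treat documents as near duplicates when they share
considerably more n-grams, the threshold can be raised (0.5 is an
illustrative value, not a recommendation):

    $deduper->sim_trsh(0.5);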

B<Default value:> 0.2

=item ngram_size

The document similarity is based on the information of how many n-grams the
documents have in common. An n-gram is a sequence of any n immediately
subsequent words. For example the text

    she sells sea shells on the sea shore

contains the following 5-grams:

    she sells sea shells on
    sells sea shells on the
    sea shells on the sea
    shells on the sea shore

This attribute specifies the value of n (the size of the n-gram).

B<Default value:> 5

=item stoplist

The stoplist is a list of very frequent words for a given language (for
English e.g. a, the, is, ...). It is a good idea to remove the stoplist words
from texts before similarity is computed, because it is quite likely that two
documents will share n-grams of frequent words even if they are not similar
at all.

The stoplist can be specified both as an array of words and as the name of
a file where the words are stored one per line:

    $deduper->stoplist('a', 'the', 'is', @next_stopwords);
    $deduper->stoplist('/path/to/english_stoplist.txt');

Do not worry if you do not have a stoplist for your language. DeDuper will do
a pretty good job even without the stoplist.

B<Default value:> empty

=back

=cut

sub encoding
{
    my $self = shift;
    my $encoding = shift;
    $self->{encoding} = $encoding
        if defined $encoding;
    return $self->{encoding};
}

sub sim_trsh
{
    my $self = shift;
    my $sim_trsh = shift;
    $self->{sim_trsh} = $sim_trsh
        if defined $sim_trsh;
    return $self->{sim_trsh};
}

sub ngram_size
{
    my $self = shift;
    my $ngram_size = shift;
    $self->{ngram_size} = $ngram_size
        if defined $ngram_size;
    return $self->{ngram_size};
}

sub stoplist
{
    my $self = shift;
    my @stoplist = @_;
    if (@stoplist && defined $stoplist[0])
    {
        # a single argument naming an existing file means "load from a file"
        if (@stoplist == 1 && -f $stoplist[0])
            { $self->_process_stoplist($stoplist[0]); }
        else
            { $self->_process_stoplist(\@stoplist); }
    }
    return sort keys %{$self->{_stoplist}};
}

# process stoplist attribute value
sub _process_stoplist
{
    my $self = shift;
    my $stoplist = shift;

    $self->{_stoplist} = {};

    return unless
        defined $stoplist;

    # if not array, treat as filename
    if (ref($stoplist) ne 'ARRAY')
    {
        open(STOPLIST, '<', $stoplist)
            or croak("can't open '$stoplist' for reading: $!");
        while (<STOPLIST>)
        {
            chomp;
            $self->{_stoplist}->{$_} = 1;
        }
        close(STOPLIST);
    }
    else
    {
        foreach (@$stoplist)
            { $self->{_stoplist}->{$_} = 1; }
    }
}

# convert text into an array of tokens (words)
sub _tokenise
{
    my $self = shift;
    my $text = shift;

    # warnings are disabled around the decode/match below
    # (e.g. for texts with byte sequences invalid in the given encoding)
    no warnings;
    my $dec_text = Encode::decode($self->{encoding}, $text);
    my $lc_text = lc($dec_text);

    my @result;
    while ($lc_text =~ /([[:alnum:]]+)/g)
        { push @result, Encode::encode($self->{encoding}, $1); }
    use warnings;

    return @result;
}

# apply the stoplist to an array of tokens (filter out stop words)
sub _apply_stoplist
{
    my $self = shift;
    my @tokens = @_;

    my @result;
    foreach my $token (@tokens)
    {
        push @result, $token
            unless $self->{_stoplist}->{$token};
    }

    return @result;
}

# convert an array of tokens to an array of unique digests
# of the n-grams built out of the tokens
sub _build_ngram_digests
{
    my $self = shift;
    my @tokens = @_;

    my %digests;
    for my $i (0 .. scalar(@tokens) - $self->{ngram_size})
    {
        my @ngram = @tokens[$i..($i+$self->{ngram_size}-1)];
        my $digest = Digest::MD4::md4_base64(@ngram);
        $digests{$digest} = 1;
    }

    return keys(%digests);
}

1;

__END__