=head1 NAME

Text::DeDuper - near-duplicate detection module

=head1 SYNOPSIS

    use Text::DeDuper;

    $deduper = new Text::DeDuper();
    $deduper->add_doc("doc1", $doc1text);
    $deduper->add_doc("doc2", $doc2text);

    @similar_docs = $deduper->find_similar($doc3text);

    ...

    # delete near duplicates from an array of texts
    $deduper = new Text::DeDuper();
    foreach $text (@texts)
    {
        next if $deduper->find_similar($text);

        $deduper->add_doc($i++, $text);
        push @no_near_duplicates, $text;
    }

=head1 DESCRIPTION

This module uses the resemblance measure proposed by Andrei Z. Broder et al.
(http://www.ra.ethz.ch/CDstore/www6/Technical/Paper205/Paper205.html) to
detect similar (near-duplicate) documents based on their text.
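The resemblance of two documents is the Jaccard coefficient of their n-gram
sets: the number of n-grams the documents share divided by the number of
n-grams they have in total. As a minimal illustration (not part of the
module's interface; C<@digests_a> and C<@digests_b> are assumed to hold the
unique n-gram digests of two documents):

    # resemblance = size(A intersect B) / size(A union B)
    my %in_a         = map { $_ => 1 } @digests_a;
    my $intersection = grep { $in_a{$_} } @digests_b;
    my $union        = @digests_a + @digests_b - $intersection;
    my $resemblance  = $union ? $intersection / $union : 0;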

Note of caution: the module only works correctly with languages in which
texts can be tokenised into words by detecting sequences of alphabetical
characters. Therefore it might not provide very good results for e.g.
Chinese.
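For reference, the tokenisation this implies is roughly the following (the
module's internal C<_tokenise> works along these lines, plus character
encoding handling):

    # lowercase the text and take maximal alphanumeric runs as tokens
    my @tokens = lc($text) =~ /([[:alnum:]]+)/g;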

=cut

package Text::DeDuper;

use strict;
use warnings;
use vars qw($VERSION);

use Carp;
use Digest::MD4;
use Encode;

$VERSION = do { my @r = (q$Revision: 1.1 $ =~ /\d+/g); sprintf "%d."."%02d", @r };

=head1 METHODS

=head2 new (CONSTRUCTOR)

    $deduper = new Text::DeDuper();

Create a new DeDuper instance. Supported attributes are described below,
in the I<ATTRIBUTES> section.

=cut

sub new
{
    my $class   = shift;
    my %options = @_;

    my $self = bless {
        ngram_size    => 5,
        sim_trsh      => 0.2,
        encoding      => 'utf8',
        _stoplist     => {},
        _digest_count => {},
        _doc_ids      => {},
    }, $class;

    $self->stoplist(  $options{stoplist});
    $self->ngram_size($options{ngram_size});
    $self->sim_trsh(  $options{sim_trsh});
    $self->encoding(  $options{encoding});

    return $self;
}

=head2 add_doc

    $deduper->add_doc($document_id, $document_text);

Add a new document to the DeDuper's database. The C<$document_id> must be
unique for each document.
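If a document with the given ID may already be in the database, the croak
can be trapped (a minimal usage sketch, not part of the module's interface):

    eval { $deduper->add_doc($docid, $doctext) };
    warn "document not added: $@" if $@;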

=cut

sub add_doc
{
    my $self  = shift;
    my $docid = shift;
    my $text  = shift;

    croak("duplicate document id '$docid'")
        if defined $self->{_digest_count}->{$docid};

    my @tokens          = $self->_tokenise($text);
    my @filtered_tokens = $self->_apply_stoplist(@tokens);
    my @digests         = $self->_build_ngram_digests(@filtered_tokens);

    # remember the digest count and index each digest under the document id
    $self->{_digest_count}->{$docid} = scalar(@digests);
    foreach my $digest (@digests)
    {
        if (not defined $self->{_doc_ids}->{$digest})
            { $self->{_doc_ids}->{$digest} = [ $docid ]; }
        else
            { push @{$self->{_doc_ids}->{$digest}}, $docid; }
    }
}

=head2 find_similar

    $deduper->find_similar($document_text);

Returns a (possibly empty) array of document IDs of the documents in the
DeDuper's database that are similar to the C<$document_text>. This gives a
simple test of whether a near-duplicate document is in the database:

    if ($deduper->find_similar($document_text))
    {
        print "at least one near duplicate found";
    }

=cut

sub find_similar
{
    my $self = shift;
    my $text = shift;

    my @tokens          = $self->_tokenise($text);
    my @filtered_tokens = $self->_apply_stoplist(@tokens);
    my @digests         = $self->_build_ngram_digests(@filtered_tokens);

    # compute intersection sizes with all documents in the database
    my %intersection_size;
    foreach my $digest (@digests)
    {
        next
            unless defined($self->{_doc_ids}->{$digest});

        foreach my $docid (@{$self->{_doc_ids}->{$digest}})
        {
            if (defined $intersection_size{$docid})
                { $intersection_size{$docid}++; }
            else
                { $intersection_size{$docid} = 1; }
        }
    }

    my @similar;
    foreach my $docid (keys %intersection_size)
    {
        # union size
        my $union_size = scalar(@digests) + $self->{_digest_count}->{$docid}
            - $intersection_size{$docid};
        # resemblance
        my $resemblance = $union_size > 0 ?
            $intersection_size{$docid} / $union_size : 0;
        # return docs with resemblance above the threshold
        push @similar, $docid
            if $resemblance > $self->{sim_trsh};
    }

    return @similar;
}

=head2 clean

    $deduper->clean();

Removes all documents from the DeDuper's database.

=cut

sub clean
{
    my $self = shift;

    $self->{_doc_ids}      = {};
    $self->{_digest_count} = {};
}

=head1 ATTRIBUTES

Attributes can be set using the constructor:

    $deduper = new Text::DeDuper(
        ngram_size => 4,
        encoding   => 'iso-8859-1'
    );

... or using the object methods:

    $deduper->ngram_size(4);
    $deduper->encoding('iso-8859-1');

The object methods can also be used for retrieving the values of the
attributes:

    $ngram_size = $deduper->ngram_size();
    @stoplist   = $deduper->stoplist();

=over

=item encoding

The character encoding of the processed texts. Must be set to the correct
value so that alphabetical characters can be detected. Accepted values are
those supported by the L<Encode> module (see L<Encode::Supported>).

B<Default value:> 'utf8'

=item sim_trsh

The similarity threshold defines how similar two documents must be to be
considered near duplicates. The boundary values are 0 and 1. A similarity
value of 1 indicates that the documents are exactly the same, whereas a
value of 0 means that the documents do not share any n-gram.

Any two documents will have a similarity value below the default threshold
unless they share a significant part of their text.

B<Default value:> 0.2
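For example, if a new document yields 120 n-gram digests, a document in the
database has 100, and the two share 40, the resemblance is
40 / (120 + 100 - 40) = 40/180, roughly 0.22, which is just above the
default threshold, so the two documents count as near duplicates.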

=item ngram_size

The document similarity is based on how many n-grams the two documents have
in common. An n-gram is a sequence of n immediately subsequent words. For
example, the text

    she sells sea shells on the sea shore

contains the following 5-grams:

    she sells sea shells on
    sells sea shells on the
    sea shells on the sea
    shells on the sea shore

This attribute specifies the value of n (the size of the n-grams).

B<Default value:> 5
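Note that a text of t tokens yields at most t - n + 1 distinct n-grams (the
eight-word example above yields 8 - 5 + 1 = 4 five-grams), so texts shorter
than n tokens produce no n-grams at all and can never be matched.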

=item stoplist

The stoplist is a list of very frequent words in a given language (for
English e.g. a, the, is, ...). It is a good idea to remove the stoplist
words from texts before the similarity is computed, because it is quite
likely that two documents will share n-grams of frequent words even if
they are not similar at all.

The stoplist can be specified both as an array of words and as the name of
a file in which the words are stored one per line:

    $deduper->stoplist('a', 'the', 'is', @next_stopwords);
    $deduper->stoplist('/path/to/english_stoplist.txt');

Do not worry if you do not have a stoplist for your language. DeDuper will
do a pretty good job even without one.

B<Default value:> empty

=back

=cut

sub encoding
{
    my $self     = shift;
    my $encoding = shift;

    $self->{encoding} = $encoding
        if defined $encoding;
    return $self->{encoding};
}

sub sim_trsh
{
    my $self     = shift;
    my $sim_trsh = shift;

    $self->{sim_trsh} = $sim_trsh
        if defined $sim_trsh;
    return $self->{sim_trsh};
}

sub ngram_size
{
    my $self       = shift;
    my $ngram_size = shift;

    $self->{ngram_size} = $ngram_size
        if defined $ngram_size;
    return $self->{ngram_size};
}

sub stoplist
{
    my $self     = shift;
    my @stoplist = @_;

    if (@stoplist && defined $stoplist[0])
    {
        # a single argument naming an existing file means
        # "load the stoplist from that file"
        if (@stoplist == 1 && -f $stoplist[0])
            { $self->_process_stoplist($stoplist[0]); }
        else
            { $self->_process_stoplist(\@stoplist); }
    }
    return sort keys %{$self->{_stoplist}};
}

# process the stoplist attribute value
sub _process_stoplist
{
    my $self     = shift;
    my $stoplist = shift;

    $self->{_stoplist} = {};

    return unless
        defined $stoplist;

    # if not an array reference, treat as a filename
    if (ref($stoplist) ne 'ARRAY')
    {
        open(STOPLIST, '<', $stoplist)
            or croak("can't open '$stoplist' for reading: $!");
        while (<STOPLIST>)
        {
            chomp;
            $self->{_stoplist}->{$_} = 1;
        }
        close(STOPLIST);
    }
    else
    {
        foreach (@$stoplist)
            { $self->{_stoplist}->{$_} = 1; }
    }
}

# convert text into an array of tokens (words)
sub _tokenise
{
    my $self = shift;
    my $text = shift;

    no warnings;
    my $dec_text = Encode::decode($self->{encoding}, $text);
    my $lc_text  = lc($dec_text);

    my @result;
    while ($lc_text =~ /([[:alnum:]]+)/g)
        { push @result, Encode::encode($self->{encoding}, $1); }
    use warnings;

    return @result;
}

# apply the stoplist to an array of tokens (filter out stop words)
sub _apply_stoplist
{
    my $self   = shift;
    my @tokens = @_;

    my @result;
    foreach my $token (@tokens)
    {
        push @result, $token
            unless $self->{_stoplist}->{$token};
    }

    return @result;
}

# convert an array of tokens to an array of unique digests
# of the n-grams built out of the tokens
sub _build_ngram_digests
{
    my $self   = shift;
    my @tokens = @_;

    my %digests;
    for my $i (0 .. scalar(@tokens) - $self->{ngram_size})
    {
        my @ngram  = @tokens[$i..($i+$self->{ngram_size}-1)];
        my $digest = Digest::MD4::md4_base64(@ngram);
        $digests{$digest} = 1;
    }

    return keys(%digests);
}

1;

__END__