=head1 NAME

Text::DeDuper - near-duplicate detection module

=head1 SYNOPSIS

    use Text::DeDuper;

    $deduper = new Text::DeDuper();
    $deduper->add_doc("doc1", $doc1text);
    $deduper->add_doc("doc2", $doc2text);

    @similar_docs = $deduper->find_similar($doc3text);

    ...

    # delete near duplicates from an array of texts
    $deduper = new Text::DeDuper();
    foreach $text (@texts)
    {
        next if $deduper->find_similar($text);

        $deduper->add_doc($i++, $text);
        push @no_near_duplicates, $text;
    }

=head1 DESCRIPTION

This module uses the resemblance measure proposed by Andrei Z. Broder et al.
(http://www.ra.ethz.ch/CDstore/www6/Technical/Paper205/Paper205.html) to
detect similar (near-duplicate) documents based on their text.
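The resemblance of two documents is the Jaccard coefficient of their n-gram
sets: the number of n-grams the documents share divided by the number of
n-grams they have in total. As a minimal illustration (not part of the
module's interface; C<@digests_a> and C<@digests_b> are assumed to hold the
unique n-gram digests of two documents):

    # resemblance = size(A intersect B) / size(A union B)
    my %in_a         = map { $_ => 1 } @digests_a;
    my $intersection = grep { $in_a{$_} } @digests_b;
    my $union        = @digests_a + @digests_b - $intersection;
    my $resemblance  = $union ? $intersection / $union : 0;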

Note of caution: the module only works correctly with languages in which
texts can be tokenised into words by detecting sequences of alphabetical
characters. Therefore it might not provide very good results for e.g.
Chinese.
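For reference, the tokenisation this implies is roughly the following (the
module's internal C<_tokenise> works along these lines, plus character
encoding handling):

    # lowercase the text and take maximal alphanumeric runs as tokens
    my @tokens = lc($text) =~ /([[:alnum:]]+)/g;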

=cut

package Text::DeDuper;

use strict;
use warnings;
use vars qw($VERSION);

use Carp;
use Digest::MD4;
use Encode;

$VERSION = do { my @r = (q$Revision: 1.1 $ =~ /\d+/g); sprintf "%d."."%02d", @r };

=head1 METHODS

=head2 new (CONSTRUCTOR)

    $deduper = new Text::DeDuper();

Create a new DeDuper instance. Supported attributes are described below,
in the I<ATTRIBUTES> section.

=cut

sub new
{
    my $class   = shift;
    my %options = @_;

    my $self = bless {
        ngram_size    => 5,
        sim_trsh      => 0.2,
        encoding      => 'utf8',
        _stoplist     => {},
        _digest_count => {},
        _doc_ids      => {},
    }, $class;

    $self->stoplist(  $options{stoplist});
    $self->ngram_size($options{ngram_size});
    $self->sim_trsh(  $options{sim_trsh});
    $self->encoding(  $options{encoding});

    return $self;
}

=head2 add_doc

    $deduper->add_doc($document_id, $document_text);

Add a new document to the DeDuper's database. The C<$document_id> must be
unique for each document.
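If a document with the given ID may already be in the database, the croak
can be trapped (a minimal usage sketch, not part of the module's interface):

    eval { $deduper->add_doc($docid, $doctext) };
    warn "document not added: $@" if $@;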

=cut

sub add_doc
{
    my $self  = shift;
    my $docid = shift;
    my $text  = shift;

    croak("duplicate document id '$docid'")
        if defined $self->{_digest_count}->{$docid};

    my @tokens          = $self->_tokenise($text);
    my @filtered_tokens = $self->_apply_stoplist(@tokens);
    my @digests         = $self->_build_ngram_digests(@filtered_tokens);

    # remember the digest count and index each digest under the document id
    $self->{_digest_count}->{$docid} = scalar(@digests);
    foreach my $digest (@digests)
    {
        if (not defined $self->{_doc_ids}->{$digest})
            { $self->{_doc_ids}->{$digest} = [ $docid ]; }
        else
            { push @{$self->{_doc_ids}->{$digest}}, $docid; }
    }
}

=head2 find_similar

    $deduper->find_similar($document_text);

Returns a (possibly empty) array of document IDs of the documents in the
DeDuper's database that are similar to the C<$document_text>. This gives a
simple test of whether a near-duplicate document is in the database:

    if ($deduper->find_similar($document_text))
    {
        print "at least one near duplicate found";
    }

=cut

sub find_similar
{
    my $self = shift;
    my $text = shift;

    my @tokens          = $self->_tokenise($text);
    my @filtered_tokens = $self->_apply_stoplist(@tokens);
    my @digests         = $self->_build_ngram_digests(@filtered_tokens);

    # compute intersection sizes with all documents in the database
    my %intersection_size;
    foreach my $digest (@digests)
    {
        next
            unless defined($self->{_doc_ids}->{$digest});

        foreach my $docid (@{$self->{_doc_ids}->{$digest}})
        {
            if (defined $intersection_size{$docid})
                { $intersection_size{$docid}++; }
            else
                { $intersection_size{$docid} = 1; }
        }
    }

    my @similar;
    foreach my $docid (keys %intersection_size)
    {
        # union size
        my $union_size = scalar(@digests) + $self->{_digest_count}->{$docid}
            - $intersection_size{$docid};
        # resemblance
        my $resemblance = $union_size > 0 ?
            $intersection_size{$docid} / $union_size : 0;
        # return docs with resemblance above the threshold
        push @similar, $docid
            if $resemblance > $self->{sim_trsh};
    }

    return @similar;
}

=head2 clean

    $deduper->clean();

Removes all documents from the DeDuper's database.

=cut

sub clean
{
    my $self = shift;

    $self->{_doc_ids}      = {};
    $self->{_digest_count} = {};
}

=head1 ATTRIBUTES

Attributes can be set using the constructor:

    $deduper = new Text::DeDuper(
        ngram_size => 4,
        encoding   => 'iso-8859-1'
    );

... or using the object methods:

    $deduper->ngram_size(4);
    $deduper->encoding('iso-8859-1');

The object methods can also be used for retrieving the values of the
attributes:

    $ngram_size = $deduper->ngram_size();
    @stoplist   = $deduper->stoplist();

=over

=item encoding

The character encoding of the processed texts. Must be set to the correct
value so that alphabetical characters can be detected. Accepted values are
those supported by the L<Encode> module (see L<Encode::Supported>).

B<Default value:> 'utf8'

=item sim_trsh

The similarity threshold defines how similar two documents must be to be
considered near duplicates. The boundary values are 0 and 1. A similarity
value of 1 indicates that the documents are exactly the same, whereas a
value of 0 means that the documents do not share any n-gram.

Any two documents will have a similarity value below the default threshold
unless they share a significant part of their text.

B<Default value:> 0.2
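For example, if a new document yields 120 n-gram digests, a document in the
database has 100, and the two share 40, the resemblance is
40 / (120 + 100 - 40) = 40/180, roughly 0.22, which is just above the
default threshold, so the two documents count as near duplicates.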

=item ngram_size

The document similarity is based on how many n-grams the two documents have
in common. An n-gram is a sequence of n immediately subsequent words. For
example, the text

    she sells sea shells on the sea shore

contains the following 5-grams:

    she sells sea shells on
    sells sea shells on the
    sea shells on the sea
    shells on the sea shore

This attribute specifies the value of n (the size of the n-grams).

B<Default value:> 5
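Note that a text of t tokens yields at most t - n + 1 distinct n-grams (the
eight-word example above yields 8 - 5 + 1 = 4 five-grams), so texts shorter
than n tokens produce no n-grams at all and can never be matched.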

=item stoplist

The stoplist is a list of very frequent words in a given language (for
English e.g. a, the, is, ...). It is a good idea to remove the stoplist
words from texts before the similarity is computed, because it is quite
likely that two documents will share n-grams of frequent words even if
they are not similar at all.

The stoplist can be specified both as an array of words and as the name of
a file in which the words are stored one per line:

    $deduper->stoplist('a', 'the', 'is', @next_stopwords);
    $deduper->stoplist('/path/to/english_stoplist.txt');

Do not worry if you do not have a stoplist for your language. DeDuper will
do a pretty good job even without one.

B<Default value:> empty

=back

=cut

sub encoding
{
    my $self     = shift;
    my $encoding = shift;

    $self->{encoding} = $encoding
        if defined $encoding;
    return $self->{encoding};
}

sub sim_trsh
{
    my $self     = shift;
    my $sim_trsh = shift;

    $self->{sim_trsh} = $sim_trsh
        if defined $sim_trsh;
    return $self->{sim_trsh};
}

sub ngram_size
{
    my $self       = shift;
    my $ngram_size = shift;

    $self->{ngram_size} = $ngram_size
        if defined $ngram_size;
    return $self->{ngram_size};
}

sub stoplist
{
    my $self     = shift;
    my @stoplist = @_;

    if (@stoplist && defined $stoplist[0])
    {
        # a single argument naming an existing file means
        # "load the stoplist from that file"
        if (@stoplist == 1 && -f $stoplist[0])
            { $self->_process_stoplist($stoplist[0]); }
        else
            { $self->_process_stoplist(\@stoplist); }
    }
    return sort keys %{$self->{_stoplist}};
}

# process the stoplist attribute value
sub _process_stoplist
{
    my $self     = shift;
    my $stoplist = shift;

    $self->{_stoplist} = {};

    return unless
        defined $stoplist;

    # if not an array reference, treat as a filename
    if (ref($stoplist) ne 'ARRAY')
    {
        open(STOPLIST, '<', $stoplist)
            or croak("can't open '$stoplist' for reading: $!");
        while (<STOPLIST>)
        {
            chomp;
            $self->{_stoplist}->{$_} = 1;
        }
        close(STOPLIST);
    }
    else
    {
        foreach (@$stoplist)
            { $self->{_stoplist}->{$_} = 1; }
    }
}

# convert text into an array of tokens (words)
sub _tokenise
{
    my $self = shift;
    my $text = shift;

    no warnings;
    my $dec_text = Encode::decode($self->{encoding}, $text);
    my $lc_text  = lc($dec_text);

    my @result;
    while ($lc_text =~ /([[:alnum:]]+)/g)
        { push @result, Encode::encode($self->{encoding}, $1); }
    use warnings;

    return @result;
}

# apply the stoplist to an array of tokens (filter out stop words)
sub _apply_stoplist
{
    my $self   = shift;
    my @tokens = @_;

    my @result;
    foreach my $token (@tokens)
    {
        push @result, $token
            unless $self->{_stoplist}->{$token};
    }

    return @result;
}

# convert an array of tokens to an array of unique digests
# of the n-grams built out of the tokens
sub _build_ngram_digests
{
    my $self   = shift;
    my @tokens = @_;

    my %digests;
    for my $i (0 .. scalar(@tokens) - $self->{ngram_size})
    {
        my @ngram  = @tokens[$i..($i+$self->{ngram_size}-1)];
        my $digest = Digest::MD4::md4_base64(@ngram);
        $digests{$digest} = 1;
    }

    return keys(%digests);
}

1;

__END__