File Coverage

blib/lib/File/FindSimilars.pm

Criterion	Covered	Total	%
statement	13	15	86.6
branch			n/a
condition			n/a
subroutine	5	5	100.0
pod			n/a
total	18	20	90.0

line	stmt	sub	time	code
1				package File::FindSimilars;
2
3				# @Author: Tong SUN, (c)2001-2008, all right reserved
4				# @Version: $Date: 2009/01/02 21:10:20 $ $Revision: 2.6 $
5				# @HomeURL: http://xpt.sourceforge.net/
6
7				# {{{ LICENSE:
8
9				#
10				# Permission to use, copy, modify, and distribute this software and its
11				# documentation for any purpose and without fee is hereby granted, provided
12				# that the above copyright notices appear in all copies and that both those
13				# copyright notices and this permission notice appear in supporting
14				# documentation, and that the names of author not be used in advertising or
15				# publicity pertaining to distribution of the software without specific,
16				# written prior permission. Tong Sun makes no representations about the
17				# suitability of this software for any purpose. It is provided "as is"
18				# without express or implied warranty.
19				#
20				# TONG SUN DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
21				# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL ADOBE
22				# SYSTEMS INCORPORATED AND DIGITAL EQUIPMENT CORPORATION BE LIABLE FOR ANY
23				# SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER
24				# RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF
25				# CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
26				# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
27				#
28
29				# }}}
30
31				# {{{ POD, Intro:
32
33				=head1 NAME
34
35				File::FindSimilars - Fast similar-files finder
36
37				=head1 SYNOPSIS
38
39				use File::FindSimilars;
40
41				my $similars_finder =
42				File::FindSimilars->new( { fc_level => $fc_level, } );
43				$similars_finder->find_for(\@ARGV);
44				$similars_finder->similarity_check();
45
46				=head1 DESCRIPTION
47
48				Extremely fast file similarity checker. Similar-sized and similar-named
49				files are picked out as suspicious candidates of duplicated files.
50
51				It uses advanced soundex vector algorithm to determine the similarity
52				between files. Generally it means that if there are n files, each having
53				approximately m words in the file name, the degree of calculation is merely
54
55				O(n^2 * m)
56
57				which is over thousands times faster than any existing file fingerprinting
58				technology.
59
60				=head2 ALGORITHM EXPLANATION
61
62				The self-test output will help you understand what the module do and what
63				would you expect from the outcome.
64
65				$ make test
66				PERL_DL_NONLAZY=1 /usr/bin/perl "-Iblib/lib" "-Iblib/arch" test.pl
67				1..5 todo 2;
68				# Running under perl version 5.010000 for linux
69				# Current time local: Wed Nov 5 17:45:19 2008
70				# Current time GMT: Wed Nov 5 22:45:19 2008
71				# Using Test.pm version 1.25
72				# Testing File::FindSimilars version 2.04
73
74				. . . .
75
76				== Testing 2, files under test/ subdir:
77
78				9 test/(eBook) GNU - Python Standard Library 2001.pdf
79				3 test/Audio Book - The Grey Coloured Bunnie.mp3
80				5 test/ColoredGrayBunny.ogg
81				5 test/GNU - 2001 - Python Standard Library.pdf
82				4 test/GNU - Python Standard Library (2001).rar
83				9 test/LayoutTest.java
84				3 test/PopupTest.java
85				2 test/Python Standard Library.zip
86				ok 2 # (test.pl at line 83 TODO?!)
87
88				Note:
89
90				- The findsimilars script will pick out similar files from them in next test.
91				- Let's assume that the number represent the file size in KB.
92
93				== Testing 3 result should be:
94
95				## =========
96				3 'Audio Book - The Grey Coloured Bunnie.mp3' 'test/'
97				5 'ColoredGrayBunny.ogg' 'test/'
98
99				## =========
100				4 'GNU - Python Standard Library (2001).rar' 'test/'
101				5 'GNU - 2001 - Python Standard Library.pdf' 'test/'
102				ok 3
103
104				Note:
105
106				- There are 2 groups of similar files picked out by the script.
107				- The similar files are picked because their file names look similar.
108				Note that the first group looks different and spells differently too,
109				which means that the script is versatile enough to handle file names that
110				don't have space in it, and robust enough to deal with spelling mistakes.
111				- Apart from the file name, the file size plays an important role as well.
112				- There are 2 files in the second similar files group, the book files group.
113				- The file 'Python Standard Library.zip' is not considered to be similar to
114				the group because its size is not similar to the group.
115
116				== Testing 4, if Python.zip is bigger, result should be:
117
118				## =========
119				3 'Audio Book - The Grey Coloured Bunnie.mp3' 'test/'
120				5 'ColoredGrayBunny.ogg' 'test/'
121
122				## =========
123				4 'Python Standard Library.zip' 'test/'
124				4 'GNU - Python Standard Library (2001).rar' 'test/'
125				5 'GNU - 2001 - Python Standard Library.pdf' 'test/'
126				ok 4
127
128				Note:
129
130				- There are now 3 files in the book files group.
131				- The file 'Python Standard Library.zip' is included in the
132				group because its size is now similar to the group.
133
134				== Testing 5, if Python.zip is even bigger, result should be:
135
136				## =========
137				3 'Audio Book - The Grey Coloured Bunnie.mp3' 'test/'
138				5 'ColoredGrayBunny.ogg' 'test/'
139
140				## =========
141				4 'GNU - Python Standard Library (2001).rar' 'test/'
142				5 'GNU - 2001 - Python Standard Library.pdf' 'test/'
143				6 'Python Standard Library.zip' 'test/'
144				9 '(eBook) GNU - Python Standard Library 2001.pdf' 'test/'
145				ok 5
146
147				Note:
148
149				- There are 4 files in the book files group now.
150				- The file 'Python Standard Library.zip' is still in the group.
151				- But this time, because it is also considered to be similar to the .pdf
152				file (since their size are now similar, 6 vs 9), a 4th file the .pdf one
153				is now included in the book group.
154				- If the size of file 'Python Standard Library.zip' is 12(KB), then the
155				book files group will be split into two. Do you know why and
156				which files each group will contain?
157
158				The File::FindSimilars package comes with a fully functional demo
159				script findsimilars. Please refer to its help file for further
160				explanations.
161
162				This package is highly customizable. Refer to the class method C for
163				details.
164
165				=head1 DEPENDS
166
167				This module depends on L, but not L.
168
169				=cut
170
171				# }}}
172
173				# {{{ Global Declaration:
174
175				# ============================================================== &us ===
176				# ............................................................. Uses ...
177
178				# -- global modules
179	1	1	102328	use strict; # !
	1		2
	1		30
180
181	1	1	5	use Carp;
	1		2
	1		63
182	1	1	5	use Getopt::Long;
	1		5
	1		8
183	1	1	136	use File::Basename;
	1		2
	1		69
184	1	1	328	use Text::Soundex;
	0
	0
185
186				use base qw(Class::Accessor::Fast);
187
188				# -- local modules
189
190				sub dbg_show {};
191				#use MyDbg; $MyDbg::debugging=010;
192
193				# ============================================================== &gv ===
194				# .................................................. Global Varibles ...
195				#
196
197				our @EXPORT = ( ); # may even omit this line
198
199				use vars qw($progname $VERSION $debugging);
200				use vars qw(%config @filequeue @fileInfo %sdxCnt %wrdLst);
201
202				# @fileInfo: List of the following list:
203				my (
204				$N_dName, # dir name
205				$N_fName, # file name
206				$N_fSize, # file size
207				$N_fSdxl, # file soundex list, reference
208				) = (0..9);
209
210				# ============================================================== &cs ===
211				# ................................................. Constant setting ...
212				#
213				$VERSION = sprintf("%d.%02d", q$Revision: 2.6 $ =~ /(\d+)\.(\d+)/);
214
215				# }}}
216
217				# ############################################################## &ss ###
218				# ................................................ Subroutions start ...
219
220				=head1 METHODS
221
222				=head2 File::FindSimilars->new(\%config_param)
223
224				Initialize the object.
225
226				my $similars_finder = File::FindSimilars->new();
227
228				or,
229
230				my $similars_finder = File::FindSimilars->new( {} );
231
232				which are the same as:
233
234				my $similars_finder = File::FindSimilars->new( {
235				soundex_weight => 50, # percentage of weight that soundex takes,
236				# the rest is for file size
237				fc_threshold => 75, # over which files are considered similar
238				delimiter => "\n## =========\n", # delimiter between files output
239				format => "%12d '%s' %s'%s'", # file info print format
240				fc_level => 0, # file comparison level
241				verbose => 0,
242
243				} );
244
245				What shown above are default settings. Any of the C<%config_param> attribute can be omitted when calling the new method.
246
247				The C is the only class method. All the rest methods are object methods.
248
249				=head2 Object attribute: soundex_weight([set_val])
250
251				Percentage of weight that soundex takes, the rest of percentage is for file size.
252
253				Provide the C to change the attribute, omitting it to retrieve the attribute value.
254
255				=head2 Object attribute: fc_threshold([set_val])
256
257				The threshold over which files are considered similar.
258
259				Provide the C to change the attribute, omitting it to retrieve the attribute value.
260
261				=head2 Object attribute: delimiter([set_val])
262
263				Delimiter printed between file info outputs.
264
265				Provide the C to change the attribute, omitting it to retrieve the attribute value.
266
267				=head2 Object attribute: format([set_val])
268
269				Format used to print file info.
270
271				Provide the C to change the attribute, omitting it to retrieve the attribute value.
272
273				=head2 Object attribute: fc_level([set_val])
274
275				File comparison level. Whether to check similar files within the same folder: 0, no; 1, yes.
276
277				Provide the C to change the attribute, omitting it to retrieve the attribute value.
278
279				=head2 Object attribute: verbose([set_val])
280
281				Verbose level. Whether to output progress info: 0, no; 1, yes.
282
283				Provide the C to change the attribute, omitting it to retrieve the attribute value.
284
285				=cut
286
287				File::FindSimilars
288				->mk_accessors(qw(soundex_weight fc_threshold
289				delimiter format fc_level verbose));
290
291				%config =
292				(
293
294				soundex_weight => 50, # percentage of weight that soundex takes,
295				# the rest is for file size
296				fc_threshold => 75, # over which files are considered similar
297				delimiter => "\n## =========\n", # delimiter between files output
298				format => "%12d '%s' %s'%s'", # file info print format
299
300				fc_level => 0, # file comparison level
301				verbose => 0,
302				);
303
304
305				# =========================================================== &s-sub ===
306
307				sub new {
308				ref(my $class = shift)
309				and croak "new is a class method. class name needed.";
310				my ($arg_ref) = @_;
311				my $self = $class->SUPER::new({%config, %$arg_ref});
312				$config{soundex_weight} = $self->soundex_weight;
313				$config{fc_threshold} = $self->fc_threshold;
314				$config{delimiter} = $self->delimiter;
315				$config{format} = $self->format;
316				$config{fc_level} = $self->fc_level;
317				$config{verbose} = $self->verbose;
318				#$config{} = $self->;
319				return $self;
320				}
321
322				# =========================================================== &s-sub ===
323
324				=head2 Object method: find_for($array_ref)
325
326				Set directory queue for similarity checking. Each entry in C<$array_ref>
327				is a directory to check into. E.g.,
328
329				$similars_finder->find_for(\@ARGV);
330
331				=cut
332
333				sub find_for {
334				my ($self, $init_dirs) = @_;
335
336				# threshold $config{fc_threshold}
337				print STDERR "Searching in directory(ies): @$init_dirs with level $config{fc_level}...\n\n"
338				if $config{verbose};
339
340				@filequeue = @fileInfo = ();
341				@filequeue = (@filequeue, map { [$_, ''] } @$init_dirs);
342				process_entries();
343
344				dbg_show(100,"\@fileInfo", @fileInfo);
345				dbg_show(100,"%sdxCnt", %sdxCnt);
346				dbg_show(100,"%wrdLst", %wrdLst);
347				}
348
349				# =========================================================== &s-sub ===
350				# I - Input: global array @filequeue
351				# Input parameters: None
352				#
353				sub process_entries {
354				my($dir, $qf) = ();
355				#warn "] inside process_entries...\n";
356
357				while ($qf = shift @filequeue) {
358				($dir, $_) = ($qf->[0], $qf->[1]);
359				#warn "] inside process_entries loop, $dir, $_, ...\n";
360				next if /^..?$/;
361				my $name = "$dir/$_";
362				#warn "] processing file '$name'.\n";
363				if ($name eq '-/') {
364				# get info from stdin
365				process_stdin();
366				}
367				elsif (-d $name) {
368				# a directory, process it recursively.
369				process_dir($name);
370				}
371				else {
372				process_file($dir, $_);
373				}
374				}
375				}
376
377				# =========================================================== &s-sub ===
378				# D - Process info given from stdin, which should of form same as
379				# find -printf "%p\t%s\n"
380				#
381				sub process_stdin {
382
383				while (<>){
384				croak "Wrong input format: '$_'" unless m{(.*)/(.+?)\t(\d+)$};
385				my ($dn, $fn, $size) = ( $1, $2, $3 );
386				my $fSdxl = [ get_soundex($fn) ]; # file soundex list
387				push @fileInfo, [ $dn, $fn, $size, $fSdxl, ];
388
389				dbg_show(100,"fileInfo",@fileInfo);
390				map { $sdxCnt{$_}++ } @$fSdxl;
391				}
392				}
393
394				# =========================================================== &s-sub ===
395				# D - Process given dir recursively
396				# N - BFS is more memory friendly than DFS
397				#
398				# T - $dir="/home/tong/tmp"
399				sub process_dir {
400				my($dir) = @_;
401				#warn "] processing dir '$dir'...\n";
402
403				opendir(DIR,$dir) \|\| die "File::FindSimilars error: Can't open $dir";
404				my @filenames = readdir(DIR);
405				closedir(DIR);
406
407				# record the dirname/fname pair to queue
408				@filequeue = (@filequeue, map { [$dir, $_] } @filenames);
409				dbg_show(100,"filequeue", @filequeue)
410				}
411
412				# =========================================================== &s-sub ===
413				# S - process_file($dirname, $fname), process file $fname under $dirname
414				# D - Process one file and update global vars
415				# U -
416				#
417				# I - Input parameters:
418				# $dirname: dir name string
419				# $fname: file name string
420				# O - Global vars get updated
421				# fileInfo [ $dirname, $fname, $fsize, [ file_soundex ] ]
422				# T -
423
424				sub process_file {
425				my ($dn, $fn) = @_;
426				my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,@rest) =
427				stat("$dn/$fn");
428				my $fSdxl = [ get_soundex($fn) ]; # file soundex list
429				push @fileInfo, [ $dn, $fn, $size, $fSdxl, ];
430
431				dbg_show(100,"fileInfo",@fileInfo);
432				map { $sdxCnt{$_}++ } @$fSdxl;
433				}
434
435				# =========================================================== &s-sub ===
436				# S - get_soundex($fname), get soundex for file $fname
437				# D - Return a list of soundex of each individual word in file name
438				# U - $aref = [ get_soundex($fname) ];
439				#
440				# I - Input parameters:
441				# $fname: file name string
442				# O - sorted anonymous soundex array w/ duplications removed
443				# T - @out = get_soundex 'Java_RMI - _Remote_Method_Invocation_ch03.tgz';
444				# @out = get_soundex 'ASuchKindOfFile.tgz';
445
446				sub get_soundex {
447				my ($fn) = @_;
448				# split to individual words
449				my @fn_wlist = split /[-_[:cntrl:][:blank:][:punct:][:digit:]]/i, $fn;
450				# discards file extension, if any
451				pop @fn_wlist if @fn_wlist >= 1;
452				# if it is single word, try further decompose SuchKindOfWord
453				@fn_wlist = $fn_wlist[0] =~ /[A-Z][^A-Z]*/g
454				if @fn_wlist == 1 && $fn_wlist[0] =~ /^[A-Z]/;
455				# wash short
456				dbg_show(100,"wlist 0",@fn_wlist);
457				@fn_wlist = arrwash_short(\@fn_wlist);
458				dbg_show(100,"wlist 1",@fn_wlist);
459
460				# language specific handling
461				@fn_wlist = arrwash_lang(\@fn_wlist);
462				dbg_show(100,"wlist 2",@fn_wlist);
463
464				# change word to soundex, record soundex/word in global hash
465				map {
466				if (/[[:alpha:]]/) {
467				my $sdx = soundex($_);
468				$wrdLst{$sdx}{$_}++;
469				s/^.*$/$sdx/;
470				}
471				} @fn_wlist;
472				dbg_show(1,"wrdLst",%wrdLst);
473
474				# wash empty/duplicates
475				@fn_wlist = grep(!/^$/, @fn_wlist);
476				@fn_wlist = arrwash_dup(\@fn_wlist);
477
478				return sort @fn_wlist;
479				}
480
481				# =========================================================== &s-sub ===
482				# S - arrwash_short($arr_ref), wash short from array $arr_ref
483				# D - weed out empty lines and less-than-3-letter words (e.g. ch12)
484				# U - @fn_wlist = arrwash_short(\@fn_wlist);
485				#
486
487				sub arrwash_short($) {
488				my ($arr_ref) = @_;
489				return @$arr_ref unless @$arr_ref >= 1;
490				my @r= grep tr/a-zA-Z// >=3, @$arr_ref;
491				return @r if @r;
492				return @$arr_ref # for upper ASCII
493				if grep(/[\200-\377]/, @$arr_ref);
494				return @r;
495				}
496
497				# =========================================================== &s-sub ===
498				# S - arrwash_dup($arr_ref), wash duplicates from array $arr_ref
499				# D - weed out duplicates
500				# U - @fn_wlist = arrwash_dup(\@fn_wlist);
501				#
502
503				sub arrwash_dup($) {
504				my ($arr_ref) = @_;
505				my %saw;
506				return grep !$saw{$_}++, @$arr_ref;
507				}
508
509				# =========================================================== &s-sub ===
510				# S - arrwash_lang($arr_ref), language specific washing from array $arr_ref
511				# U - @fn_wlist = arrwash_lang(\@fn_wlist);
512				#
513
514				sub arrwash_lang($) {
515				my ($arr_ref) = @_;
516
517				# split Chinese into individual chars
518				my @r;
519				map {
520				if (/[\200-\377]{2}/) {
521				@r = (@r, /[\200-\377]{2}/g);
522				}
523				else {
524				@r = (@r, $_);
525				}
526				} @$arr_ref;
527
528				return @r;
529				}
530
531				=head2 Object method: similarity_check()
532
533				Do similarity check on the queued directories. Print similar files info on
534				stdout according to the configured format and delimiters. E.g.,
535
536				$similars_finder->similarity_check();
537
538				=cut
539
540				# =========================================================== &s-sub ===
541				# S - similarity_check: similarity check on glabal array @fileInfo
542				# U - similarity_check();
543				#
544				# I - Input parameters: None
545				# O - similar files printed on stdout
546
547				sub similarity_check {
548
549				# get a ordered (by soundex count and file name) of file Info array
550				# (Use short file names to compare to long file names)
551				# use Schwartzian Transform to sort on 2 fields for efficiency
552				my @fileInfos = map { $_->[0] }
553				sort { $a->[1] cmp $b->[1] }
554				map { [ $_,
555				sprintf "%3d%6s", $#{$_->[$N_fSdxl]}, $_->[$N_fSdxl][0]
556				] } @fileInfo;
557				dbg_show(100,"\@fileInfos", @fileInfos);
558
559				my @saw = (0) x ($#fileInfos+1);
560				foreach my $ii (0..$#fileInfos) {
561				#warn "] ii=$ii\n";
562				my @similar = ();
563				my $fnl;
564
565				dbg_show(100,"\@fileInfos", $fileInfos[$ii]);
566				push @similar, [$ii, $ii, $fileInfos[$ii]->[$N_fSize] ];
567				foreach my $jj (($ii+1) ..$#fileInfos) {
568				$fnl=0; # 0 is good enough since file at [ii] is
569				# shorter in name than the one at [jj]
570				# don't care about same dir files?
571				next
572				if (!$config{fc_level} && ($fileInfos[$ii]->[$N_dName]
573				eq $fileInfos[$jj]->[$N_dName])) ;
574				if (file_diff(\@fileInfos, $ii, $jj) >= $config{fc_threshold}) {
575				push @similar, [$ii, $jj, $fileInfos[$jj]->[$N_fSize] ];
576				$fnl= length($fileInfos[$jj]->[$N_fName]) if
577				$fnl < length($fileInfos[$jj]->[$N_fName]);
578				}
579				}
580				dbg_show(100,"\@similar", @similar);
581				# output unvisited potential similars by each row, order by fSize
582				@similar = grep {!$saw[$_->[1]]}
583				sort { $a->[2] <=> $b->[2] } @similar;
584				next unless @similar>1;
585				print $config{delimiter};
586				foreach my $similar (@similar) {
587				print file_info(\@fileInfos, $similar->[1], $fnl). "\n";
588				$saw[$similar->[1]]++;
589				}
590				}
591				}
592
593				# =========================================================== &s-sub ===
594				sub file_info ($$$) {
595				my ($fileInfos, $ndx, $fnl) = @_;
596				return sprintf($config{format}, $fileInfos->[$ndx]->[$N_fSize],
597				$fileInfos->[$ndx]->[$N_fName],
598				' ' x ($fnl - length($fileInfos->[$ndx]->[$N_fName])),
599				"$fileInfos->[$ndx]->[$N_dName]");
600				}
601
602				# =========================================================== &s-sub ===
603				# S - file_diff: determind how difference two files are by name & size
604				# U - file_diff($fileInfos, $ndx1, $ndx2);
605				#
606				# I - $fileInfos: reference to @fileInfos
607				# $ndx1, $ndx2: index to the two file in @fileInfos
608				# O - 100%: files are identical
609				# 0%: no similarity at all
610				sub file_diff ($$$) {
611				my ($fileInfos, $ndx1, $ndx2) = @_;
612
613				return 0 unless @{$fileInfos->[$ndx1]->[$N_fSdxl]};
614
615				# find intersection in two soudex array
616				my %count = ();
617				foreach my $element
618				(@{$fileInfos->[$ndx1]->[$N_fSdxl]},
619				@{$fileInfos->[$ndx2]->[$N_fSdxl]}) { $count{$element}++ }
620				# since there is no duplication in each of file soudex
621				my $intersection =
622				grep $count{$_} > 1, keys %count;
623				# return p * normal(\common soudex) + (1-p) * ( 1 - normal(\delta fSize))
624				# so the bigger the return value is, the similar the two files are
625				$intersection *= $config{soundex_weight} /
626				(@{$fileInfos->[$ndx1]->[$N_fSdxl]});
627				dbg_show(100,"intersection", $intersection, $ndx1, $ndx2);
628				my $WeightfSzie = 100 - $config{soundex_weight};
629				my $dfSize = abs($fileInfos->[$ndx1]->[$N_fSize] -
630				$fileInfos->[$ndx2]->[$N_fSize]) * $WeightfSzie /
631				($fileInfos->[$ndx1]->[$N_fSize] + 1);
632				$dfSize = $dfSize > $WeightfSzie ? $WeightfSzie : $dfSize;
633				my $file_diff = $intersection + ($WeightfSzie - $dfSize);
634				if ($file_diff >= $config{fc_threshold}) {
635				dbg_show(010,"file_diff",
636				@{$fileInfos->[$ndx1]},
637				@{$fileInfos->[$ndx2]},
638				$intersection, $dfSize, $file_diff
639				);
640				}
641				return $file_diff;
642				}
643
644
645				1;
646				__END__