File Coverage

blib/lib/File/Searcher/Similars.pm
Criterion Covered Total %
statement 16 18 88.8
branch n/a
condition n/a
subroutine 6 6 100.0
pod n/a
total 22 24 91.6


line stmt bran cond sub pod time code
1             package File::Searcher::Similars;
2              
3             # @Author: Tong SUN, (c)2001-2003, all right reserved
4             # @Version: $Date: 2008/10/31 16:07:34 $ $Revision: 1.27 $
5             # @HomeURL: http://xpt.sourceforge.net/
6              
7             # {{{ LICENSE:
8              
9             #
10             # Permission to use, copy, modify, and distribute this software and its
11             # documentation for any purpose and without fee is hereby granted, provided
12             # that the above copyright notices appear in all copies and that both those
13             # copyright notices and this permission notice appear in supporting
14             # documentation, and that the names of author not be used in advertising or
15             # publicity pertaining to distribution of the software without specific,
16             # written prior permission. Tong Sun makes no representations about the
17             # suitability of this software for any purpose. It is provided "as is"
18             # without express or implied warranty.
19             #
20             # TONG SUN DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
21             # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL ADOBE
22             # SYSTEMS INCORPORATED AND DIGITAL EQUIPMENT CORPORATION BE LIABLE FOR ANY
23             # SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER
24             # RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF
25             # CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
26             # CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
27             #
28              
29             # }}}
30              
31             # {{{ POD, Intro:
32              
33             =head1 NAME
34              
35             File::Searcher::Similars - Fast similar-files finder
36              
37             =head1 SYNOPSIS
38              
39             use File::Searcher::Similars;
40              
41             File::Searcher::Similars->init(0, \@ARGV);
42             similarity_check_name();
43              
44             Similar-sized and similar-named files are picked as suspicious candidates of
45             duplicated files.
46              
47             Please note that this version is deprecated. Future versions are released
48             as File::Find::Similars.
49              
50             =head1 DESCRIPTION
51              
52             Extremely fast file similarity checker. It uses advanced soundex vector
53             algorithm to determine the similarity between files. Generally it means that
54             if there are n files, each having approximately m words, the degree of
55             calculation is merely
56              
57             O(n^2 * m)
58              
59             which is over hundreds times faster than any existing file fingerprinting
60             technology.
61              
62             The self-test output will help you understand what the module do and what
63             would you expect from the outcome.
64              
65             $ make test
66             PERL_DL_NONLAZY=1 /usr/bin/perl "-Iblib/lib" "-Iblib/arch" test.pl
67             1..4
68             # Running under perl version 5.010000 for linux
69             # Current time local: Wed Oct 29 11:35:06 2008
70             # Current time GMT: Wed Oct 29 15:35:06 2008
71             # Using Test.pm version 1.25
72             # Testing File::Searcher::Similars version 1.23
73            
74             == Testing 1, files under test/ subdir:
75            
76             9 test/(eBook) GNU - Python Standard Library 2001.pdf
77             3 test/CardLayoutTest.java
78             5 test/GNU - 2001 - Python Standard Library.pdf
79             4 test/GNU - Python Standard Library (2001).rar
80             9 test/LayoutTest.java
81             3 test/PopupTest.java
82             2 test/Python Standard Library.zip
83             5 test/TestLayout.java
84             ok 1
85            
86             Note:
87            
88             - The fileSimilars.pl script will pick out similar files from them in next test.
89             - Let's assume that the number represent the file size in KB.
90            
91             == Testing 2 result should be:
92            
93             ## =========
94             3 'CardLayoutTest.java' 'test/'
95             5 'TestLayout.java' 'test/'
96            
97             ## =========
98             4 'GNU - Python Standard Library (2001).rar' 'test/'
99             5 'GNU - 2001 - Python Standard Library.pdf' 'test/'
100             ok 2
101            
102             Note:
103            
104             - There are 2 groups of similar files picked out by the script.
105             The second group makes more sense.
106             - The similar files are picked because their file names looks similar.
107             - However, the file size plays an important role as well.
108             - There are 2 files in the second similar files group.
109             - The file 'Python Standard Library.zip' is not considered to be similar to
110             the group because its size is not similar to the group.
111            
112             == Testing 3, if Python.zip is bigger, result should be:
113            
114             ## =========
115             3 'CardLayoutTest.java' 'test/'
116             5 'TestLayout.java' 'test/'
117            
118             ## =========
119             4 'Python Standard Library.zip' 'test/'
120             4 'GNU - Python Standard Library (2001).rar' 'test/'
121             5 'GNU - 2001 - Python Standard Library.pdf' 'test/'
122             ok 3
123            
124             Note:
125            
126             - There are 3 files in the second similar files group.
127             - The file 'Python Standard Library.zip' is now in the 2nd similar files
128             group because its size is now similar to the group.
129            
130             == Testing 4, if Python.zip is even bigger, result should be:
131            
132             ## =========
133             3 'CardLayoutTest.java' 'test/'
134             5 'TestLayout.java' 'test/'
135            
136             ## =========
137             4 'GNU - Python Standard Library (2001).rar' 'test/'
138             5 'GNU - 2001 - Python Standard Library.pdf' 'test/'
139             6 'Python Standard Library.zip' 'test/'
140             9 '(eBook) GNU - Python Standard Library 2001.pdf' 'test/'
141             ok 4
142            
143             Note:
144            
145             - There are 4 files in the second similar files group.
146             - The file 'Python Standard Library.zip' is still in the group.
147             - But this time, because it is also considered to be similar to the .pdf
148             file (since their size are now similar, 6 vs 9), a 4th file the .pdf
149             is now included in the 2nd group.
150             - If the size of file 'Python Standard Library.zip' is 12(KB), then the
151             second similar files group will be split into two. Do you know why and
152             which files each group will contain?
153              
154             The File::Searcher::Similars package comes with a fully functional demo
155             script fileSimilars.pl. Please refer to its help file for further
156             explanations.
157              
158             This package is highly customizable. Refer to hash variable %config and/or
159             the 3 arrwash_ functions for customization hints.
160              
161             =cut
162              
163             # }}}
164              
165             # {{{ global declaration:
166              
167 1     1   6314 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);
  1         2  
  1         101  
168              
169             require Exporter;
170              
171             @ISA = qw(Exporter);
172              
173             @EXPORT = qw(
174             &similarity_check_name
175             );
176              
177             # ============================================================== &us ===
178             # ............................................................. Uses ...
179              
180             # -- global modules
181 1     1   6 use strict; # !
  1         2  
  1         36  
182              
183 1     1   6 use Carp;
  1         6  
  1         69  
184 1     1   1534 use Getopt::Long;
  1         15134  
  1         7  
185 1     1   164 use File::Basename;
  1         2  
  1         111  
186 1     1   1403 use Text::Soundex;
  0            
  0            
187              
188             # -- local modules
189              
190             sub dbg_show {};
191              
192             # -- global variables
193             use vars qw($progname $VERSION $verbose $debugging);
194              
195             # ============================================================== &cs ===
196             # ................................................. Constant setting ...
197             #
198             $VERSION = sprintf("%d.%02d", q$Revision: 1.27 $ =~ /(\d+)\.(\d+)/);
199              
200              
201             # ============================================================== &gv ===
202             # .................................................. Global Varibles ...
203             #
204             use vars qw(%config @filequeue @fileInfo %sdxCnt %wrdLst);
205              
206             $config{WeightSoundex} = 50; # precentage of weight that soundex takes,
207             # the rest is for file size
208             $config{Threshold} = 75; # over which files are considered similar
209             $config{Deliminator} = "\n## =========\n";
210             $config{Format} = "%12d '%s' %s'%s'";
211              
212             # @fileInfo: List of the following list:
213             my (
214             $N_dName, # dir name
215             $N_fName, # file name
216             $N_fSize, # file size
217             $N_fSdxl, # file soundex list, reference
218             ) = (0..9);
219              
220             my $fc_level=0;
221              
222             # }}}
223              
224              
225             # Preloaded methods go here.
226              
227             # ############################################################## &ss ###
228             # ................................................ Subroutions start ...
229              
230             # =========================================================== &s-sub ===
231             # S - File::Searcher::Similars->init($fc_level, \@ARGV);
232             # D - initialize file comparing level and dir queue
233             #
234             # T -
235             sub init ($\@) {
236             my ($mname, $_fc_level, $init_dirs) = @_;
237             $fc_level = $_fc_level; # update module variable
238             #warn "] $mname, $fc_level, $init_dirs\n";
239              
240             @filequeue = @fileInfo = ();
241             @filequeue = (@filequeue, map { [$_, ''] } @$init_dirs);
242             process_entries();
243              
244             dbg_show(100,"\@fileInfo", @fileInfo);
245             dbg_show(100,"%sdxCnt", %sdxCnt);
246             dbg_show(100,"%wrdLst", %wrdLst);
247             }
248              
249             # =========================================================== &s-sub ===
250             # I - Input: global array @filequeue
251             # Input parameters: None
252             #
253             sub process_entries {
254             my($dir, $qf) = ();
255             #warn "] inside process_entries...\n";
256              
257             while ($qf = shift @filequeue) {
258             ($dir, $_) = ($qf->[0], $qf->[1]);
259             #warn "] inside process_entries loop, $dir, $_, ...\n";
260             next if /^..?$/;
261             my $name = "$dir/$_";
262             #warn "] processing file '$name'.\n";
263             if ($name eq '-/') {
264             # get info from stdin
265             process_stdin();
266             }
267             elsif (-d $name) {
268             # a directory, process it recursively.
269             process_dir($name);
270             }
271             else {
272             process_file($dir, $_);
273             }
274             }
275             }
276              
277             # =========================================================== &s-sub ===
278             # D - Process info given from stdin, which should of form same as
279             # find -printf "%p\t%s\n"
280             #
281             sub process_stdin {
282            
283             while (<>){
284             croak "Wrong input format: '$_'" unless m{(.*)/(.+?)\t(\d+)$};
285             my ($dn, $fn, $size) = ( $1, $2, $3 );
286             my $fSdxl = [ get_soundex($fn) ]; # file soundex list
287             push @fileInfo, [ $dn, $fn, $size, $fSdxl, ];
288              
289             dbg_show(100,"fileInfo",@fileInfo);
290             map { $sdxCnt{$_}++ } @$fSdxl;
291             }
292             }
293              
294             # =========================================================== &s-sub ===
295             # D - Process given dir recursively
296             # N - BFS is more memory friendly than DFS
297             #
298             # T - $dir="/home/tong/tmp"
299             sub process_dir {
300             my($dir) = @_;
301             #warn "] processing dir '$dir'...\n";
302              
303             opendir(DIR,$dir) || die "File::Searcher::Similars error: Can't open $dir";
304             my @filenames = readdir(DIR);
305             closedir(DIR);
306              
307             # record the dirname/fname pair to queue
308             @filequeue = (@filequeue, map { [$dir, $_] } @filenames);
309             dbg_show(100,"filequeue", @filequeue)
310             }
311              
312             # =========================================================== &s-sub ===
313             # S - process_file($dirname, $fname), process file $fname under $dirname
314             # D - Process one file and update global vars
315             # U -
316             #
317             # I - Input parameters:
318             # $dirname: dir name string
319             # $fname: file name string
320             # O - Global vars get updated
321             # fileInfo [ $dirname, $fname, $fsize, [ file_soundex ] ]
322             # T -
323              
324             sub process_file {
325             my ($dn, $fn) = @_;
326             my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,@rest) =
327             stat("$dn/$fn");
328             my $fSdxl = [ get_soundex($fn) ]; # file soundex list
329             push @fileInfo, [ $dn, $fn, $size, $fSdxl, ];
330              
331             dbg_show(100,"fileInfo",@fileInfo);
332             map { $sdxCnt{$_}++ } @$fSdxl;
333             }
334              
335             # =========================================================== &s-sub ===
336             # S - get_soundex($fname), get soundex for file $fname
337             # D - Return a list of soundex of each individual word in file name
338             # U - $aref = [ get_soundex($fname) ];
339             #
340             # I - Input parameters:
341             # $fname: file name string
342             # O - sorted anonymous soundex array w/ duplications removed
343             # T - @out = get_soundex 'Java_RMI - _Remote_Method_Invocation_ch03.tgz';
344             # @out = get_soundex 'ASuchKindOfFile.tgz';
345              
346             sub get_soundex {
347             my ($fn) = @_;
348             # split to individual words
349             my @fn_wlist = split /[-_[:cntrl:][:blank:][:punct:][:digit:]]/i, $fn;
350             # discards file extension, if any
351             pop @fn_wlist if @fn_wlist >= 1;
352             # if it is single word, try further decompose SuchKindOfWord
353             @fn_wlist = $fn_wlist[0] =~ /[A-Z][^A-Z]*/g
354             if @fn_wlist == 1 && $fn_wlist[0] =~ /^[A-Z]/;
355             # wash short
356             dbg_show(100,"wlist 0",@fn_wlist);
357             @fn_wlist = arrwash_short(\@fn_wlist);
358             dbg_show(100,"wlist 1",@fn_wlist);
359              
360             # language specific handling
361             @fn_wlist = arrwash_lang(\@fn_wlist);
362             dbg_show(100,"wlist 2",@fn_wlist);
363            
364             # change word to soundex, record soundex/word in global hash
365             map {
366             if (/[[:alpha:]]/) {
367             my $sdx = soundex($_);
368             $wrdLst{$sdx}{$_}++;
369             s/^.*$/$sdx/;
370             }
371             } @fn_wlist;
372             dbg_show(1,"wrdLst",%wrdLst);
373              
374             # wash empty/duplicates
375             @fn_wlist = grep(!/^$/, @fn_wlist);
376             @fn_wlist = arrwash_dup(\@fn_wlist);
377            
378             return sort @fn_wlist;
379             }
380              
381             # =========================================================== &s-sub ===
382             # S - arrwash_short($arr_ref), wash short from array $arr_ref
383             # D - weed out empty lines and less-than-3-letter words (e.g. ch12)
384             # U - @fn_wlist = arrwash_short(\@fn_wlist);
385             #
386              
387             sub arrwash_short($) {
388             my ($arr_ref) = @_;
389             return @$arr_ref unless @$arr_ref >= 1;
390             my @r= grep tr/a-zA-Z// >=3, @$arr_ref;
391             return @r if @r;
392             return @$arr_ref # for upper ASCII
393             if grep(/[\200-\377]/, @$arr_ref);
394             return @r;
395             }
396              
397             # =========================================================== &s-sub ===
398             # S - arrwash_dup($arr_ref), wash duplicates from array $arr_ref
399             # D - weed out duplicates
400             # U - @fn_wlist = arrwash_dup(\@fn_wlist);
401             #
402              
403             sub arrwash_dup($) {
404             my ($arr_ref) = @_;
405             my %saw;
406             return grep !$saw{$_}++, @$arr_ref;
407             }
408              
409             # =========================================================== &s-sub ===
410             # S - arrwash_lang($arr_ref), language specific washing from array $arr_ref
411             # U - @fn_wlist = arrwash_lang(\@fn_wlist);
412             #
413              
414             sub arrwash_lang($) {
415             my ($arr_ref) = @_;
416            
417             # split Chinese into individual chars
418             my @r;
419             map {
420             if (/[\200-\377]{2}/) {
421             @r = (@r, /[\200-\377]{2}/g);
422             }
423             else {
424             @r = (@r, $_);
425             }
426             } @$arr_ref;
427            
428             return @r;
429             }
430              
431             # =========================================================== &s-sub ===
432             # S - similarity_check_name: similarity check on glabal array @fileInfo
433             # U - similarity_check_name();
434             #
435             # I - Input parameters: None
436             # O - similar files printed on stdout
437              
438             sub similarity_check_name {
439              
440             # get a ordered (by soundex count) file Info array
441             # (Use short file names to compare to long file names)
442             my @fileInfos =
443             sort { $#{$a->[$N_fSdxl]} cmp $#{$b->[$N_fSdxl]} } @fileInfo;
444             dbg_show(100,"\@fileInfos", @fileInfos);
445              
446             my @saw = (0) x ($#fileInfos+1);
447             foreach my $ii (0..$#fileInfos) {
448             #warn "] ii=$ii\n";
449             my @similar = ();
450             my $fnl;
451            
452             dbg_show(100,"\@fileInfos", $fileInfos[$ii]);
453             push @similar, [$ii, $ii, $fileInfos[$ii]->[$N_fSize] ];
454             foreach my $jj (($ii+1) ..$#fileInfos) {
455             $fnl=0; # 0 is good enough since file at [ii] is
456             # shorter in name than the one at [jj]
457             #warn "] jj=$jj\n";
458             # don't care about same dir files?
459             next
460             if (!$fc_level && ($fileInfos[$ii]->[$N_dName]
461             eq $fileInfos[$jj]->[$N_dName])) ;
462             if (file_diff(\@fileInfos, $ii, $jj) >= $config{Threshold}) {
463             push @similar, [$ii, $jj, $fileInfos[$jj]->[$N_fSize] ];
464             $fnl= length($fileInfos[$jj]->[$N_fName]) if
465             $fnl < length($fileInfos[$jj]->[$N_fName]);
466             }
467             }
468             dbg_show(100,"\@similar", @similar);
469             # output unvisited potential similars by each row, order by fSize
470             @similar = grep {!$saw[$_->[1]]}
471             sort { $a->[2] <=> $b->[2] } @similar;
472             next unless @similar>1;
473             print $config{Deliminator};
474             foreach my $similar (@similar) {
475             print file_info(\@fileInfos, $similar->[1], $fnl). "\n";
476             $saw[$similar->[1]]++;
477             }
478             }
479             }
480              
481             # =========================================================== &s-sub ===
482             sub file_info ($$$) {
483             my ($fileInfos, $ndx, $fnl) = @_;
484             return sprintf($config{Format}, $fileInfos->[$ndx]->[$N_fSize],
485             $fileInfos->[$ndx]->[$N_fName],
486             ' ' x ($fnl - length($fileInfos->[$ndx]->[$N_fName])),
487             "$fileInfos->[$ndx]->[$N_dName]");
488             }
489              
490             # =========================================================== &s-sub ===
491             # S - file_diff: determind how difference two files are by name & size
492             # U - file_diff($fileInfos, $ndx1, $ndx2);
493             #
494             # I - $fileInfos: reference to @fileInfos
495             # $ndx1, $ndx2: index to the two file in @fileInfos
496             # O - 100%: files are identical
497             # 0%: no similarity at all
498             sub file_diff ($$$) {
499             my ($fileInfos, $ndx1, $ndx2) = @_;
500              
501             return 0 unless @{$fileInfos->[$ndx1]->[$N_fSdxl]};
502            
503             # find intersection in two soudex array
504             my %count = ();
505             foreach my $element
506             (@{$fileInfos->[$ndx1]->[$N_fSdxl]},
507             @{$fileInfos->[$ndx2]->[$N_fSdxl]}) { $count{$element}++ }
508             # since there is no duplication in each of file soudex
509             my $intersection =
510             grep $count{$_} > 1, keys %count;
511             # return p * normal(\common soudex) + (1-p) * ( 1 - normal(\delta fSize))
512             # so the bigger the return value is, the similar the two files are
513             $intersection *= $config{WeightSoundex} /
514             (@{$fileInfos->[$ndx1]->[$N_fSdxl]});
515             dbg_show(100,"intersection", $intersection, $ndx1, $ndx2);
516             my $WeightfSzie = 100 - $config{WeightSoundex};
517             my $dfSize = abs($fileInfos->[$ndx1]->[$N_fSize] -
518             $fileInfos->[$ndx2]->[$N_fSize]) * $WeightfSzie /
519             ($fileInfos->[$ndx1]->[$N_fSize] + 1);
520             $dfSize = $dfSize > $WeightfSzie ? $WeightfSzie : $dfSize;
521             my $file_diff = $intersection + ($WeightfSzie - $dfSize);
522             if ($file_diff >= $config{Threshold}) {
523             dbg_show(010,"file_diff",
524             @{$fileInfos->[$ndx1]},
525             @{$fileInfos->[$ndx2]},
526             $intersection, $dfSize, $file_diff
527             );
528             }
529             return $file_diff;
530             }
531              
532              
533             1;
534             __END__