File Coverage

blib/lib/File/Searcher/Similars.pm

Criterion	Covered	Total	%
statement	16	18	88.8
branch			n/a
condition			n/a
subroutine	6	6	100.0
pod			n/a
total	22	24	91.6

line	stmt	sub	time	code
1				package File::Searcher::Similars;
2
3				# @Author: Tong SUN, (c)2001-2003, all right reserved
4				# @Version: $Date: 2008/10/31 16:07:34 $ $Revision: 1.27 $
5				# @HomeURL: http://xpt.sourceforge.net/
6
7				# {{{ LICENSE:
8
9				#
10				# Permission to use, copy, modify, and distribute this software and its
11				# documentation for any purpose and without fee is hereby granted, provided
12				# that the above copyright notices appear in all copies and that both those
13				# copyright notices and this permission notice appear in supporting
14				# documentation, and that the names of author not be used in advertising or
15				# publicity pertaining to distribution of the software without specific,
16				# written prior permission. Tong Sun makes no representations about the
17				# suitability of this software for any purpose. It is provided "as is"
18				# without express or implied warranty.
19				#
20				# TONG SUN DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
21				# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL ADOBE
22				# SYSTEMS INCORPORATED AND DIGITAL EQUIPMENT CORPORATION BE LIABLE FOR ANY
23				# SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER
24				# RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF
25				# CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
26				# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
27				#
28
29				# }}}
30
31				# {{{ POD, Intro:
32
33				=head1 NAME
34
35				File::Searcher::Similars - Fast similar-files finder
36
37				=head1 SYNOPSIS
38
39				use File::Searcher::Similars;
40
41				File::Searcher::Similars->init(0, \@ARGV);
42				similarity_check_name();
43
44				Similar-sized and similar-named files are picked as suspicious candidates of
45				duplicated files.
46
47				Please note that this version is deprecated. Future versions are released
48				as File::Find::Similars.
49
50				=head1 DESCRIPTION
51
52				Extremely fast file similarity checker. It uses advanced soundex vector
53				algorithm to determine the similarity between files. Generally it means that
54				if there are n files, each having approximately m words, the degree of
55				calculation is merely
56
57				O(n^2 * m)
58
59				which is over hundreds times faster than any existing file fingerprinting
60				technology.
61
62				The self-test output will help you understand what the module do and what
63				would you expect from the outcome.
64
65				$ make test
66				PERL_DL_NONLAZY=1 /usr/bin/perl "-Iblib/lib" "-Iblib/arch" test.pl
67				1..4
68				# Running under perl version 5.010000 for linux
69				# Current time local: Wed Oct 29 11:35:06 2008
70				# Current time GMT: Wed Oct 29 15:35:06 2008
71				# Using Test.pm version 1.25
72				# Testing File::Searcher::Similars version 1.23
73
74				== Testing 1, files under test/ subdir:
75
76				9 test/(eBook) GNU - Python Standard Library 2001.pdf
77				3 test/CardLayoutTest.java
78				5 test/GNU - 2001 - Python Standard Library.pdf
79				4 test/GNU - Python Standard Library (2001).rar
80				9 test/LayoutTest.java
81				3 test/PopupTest.java
82				2 test/Python Standard Library.zip
83				5 test/TestLayout.java
84				ok 1
85
86				Note:
87
88				- The fileSimilars.pl script will pick out similar files from them in next test.
89				- Let's assume that the number represent the file size in KB.
90
91				== Testing 2 result should be:
92
93				## =========
94				3 'CardLayoutTest.java' 'test/'
95				5 'TestLayout.java' 'test/'
96
97				## =========
98				4 'GNU - Python Standard Library (2001).rar' 'test/'
99				5 'GNU - 2001 - Python Standard Library.pdf' 'test/'
100				ok 2
101
102				Note:
103
104				- There are 2 groups of similar files picked out by the script.
105				The second group makes more sense.
106				- The similar files are picked because their file names looks similar.
107				- However, the file size plays an important role as well.
108				- There are 2 files in the second similar files group.
109				- The file 'Python Standard Library.zip' is not considered to be similar to
110				the group because its size is not similar to the group.
111
112				== Testing 3, if Python.zip is bigger, result should be:
113
114				## =========
115				3 'CardLayoutTest.java' 'test/'
116				5 'TestLayout.java' 'test/'
117
118				## =========
119				4 'Python Standard Library.zip' 'test/'
120				4 'GNU - Python Standard Library (2001).rar' 'test/'
121				5 'GNU - 2001 - Python Standard Library.pdf' 'test/'
122				ok 3
123
124				Note:
125
126				- There are 3 files in the second similar files group.
127				- The file 'Python Standard Library.zip' is now in the 2nd similar files
128				group because its size is now similar to the group.
129
130				== Testing 4, if Python.zip is even bigger, result should be:
131
132				## =========
133				3 'CardLayoutTest.java' 'test/'
134				5 'TestLayout.java' 'test/'
135
136				## =========
137				4 'GNU - Python Standard Library (2001).rar' 'test/'
138				5 'GNU - 2001 - Python Standard Library.pdf' 'test/'
139				6 'Python Standard Library.zip' 'test/'
140				9 '(eBook) GNU - Python Standard Library 2001.pdf' 'test/'
141				ok 4
142
143				Note:
144
145				- There are 4 files in the second similar files group.
146				- The file 'Python Standard Library.zip' is still in the group.
147				- But this time, because it is also considered to be similar to the .pdf
148				file (since their size are now similar, 6 vs 9), a 4th file the .pdf
149				is now included in the 2nd group.
150				- If the size of file 'Python Standard Library.zip' is 12(KB), then the
151				second similar files group will be split into two. Do you know why and
152				which files each group will contain?
153
154				The File::Searcher::Similars package comes with a fully functional demo
155				script fileSimilars.pl. Please refer to its help file for further
156				explanations.
157
158				This package is highly customizable. Refer to hash variable %config and/or
159				the 3 arrwash_ functions for customization hints.
160
161				=cut
162
163				# }}}
164
165				# {{{ global declaration:
166
167	1	1	6314	use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);
	1		2
	1		101
168
169				require Exporter;
170
171				@ISA = qw(Exporter);
172
173				@EXPORT = qw(
174				&similarity_check_name
175				);
176
177				# ============================================================== &us ===
178				# ............................................................. Uses ...
179
180				# -- global modules
181	1	1	6	use strict; # !
	1		2
	1		36
182
183	1	1	6	use Carp;
	1		6
	1		69
184	1	1	1534	use Getopt::Long;
	1		15134
	1		7
185	1	1	164	use File::Basename;
	1		2
	1		111
186	1	1	1403	use Text::Soundex;
	0
	0
187
188				# -- local modules
189
190				sub dbg_show {};
191
192				# -- global variables
193				use vars qw($progname $VERSION $verbose $debugging);
194
195				# ============================================================== &cs ===
196				# ................................................. Constant setting ...
197				#
198				$VERSION = sprintf("%d.%02d", q$Revision: 1.27 $ =~ /(\d+)\.(\d+)/);
199
200
201				# ============================================================== &gv ===
202				# .................................................. Global Varibles ...
203				#
204				use vars qw(%config @filequeue @fileInfo %sdxCnt %wrdLst);
205
206				$config{WeightSoundex} = 50; # precentage of weight that soundex takes,
207				# the rest is for file size
208				$config{Threshold} = 75; # over which files are considered similar
209				$config{Deliminator} = "\n## =========\n";
210				$config{Format} = "%12d '%s' %s'%s'";
211
212				# @fileInfo: List of the following list:
213				my (
214				$N_dName, # dir name
215				$N_fName, # file name
216				$N_fSize, # file size
217				$N_fSdxl, # file soundex list, reference
218				) = (0..9);
219
220				my $fc_level=0;
221
222				# }}}
223
224
225				# Preloaded methods go here.
226
227				# ############################################################## &ss ###
228				# ................................................ Subroutions start ...
229
230				# =========================================================== &s-sub ===
231				# S - File::Searcher::Similars->init($fc_level, \@ARGV);
232				# D - initialize file comparing level and dir queue
233				#
234				# T -
235				sub init ($\@) {
236				my ($mname, $_fc_level, $init_dirs) = @_;
237				$fc_level = $_fc_level; # update module variable
238				#warn "] $mname, $fc_level, $init_dirs\n";
239
240				@filequeue = @fileInfo = ();
241				@filequeue = (@filequeue, map { [$_, ''] } @$init_dirs);
242				process_entries();
243
244				dbg_show(100,"\@fileInfo", @fileInfo);
245				dbg_show(100,"%sdxCnt", %sdxCnt);
246				dbg_show(100,"%wrdLst", %wrdLst);
247				}
248
249				# =========================================================== &s-sub ===
250				# I - Input: global array @filequeue
251				# Input parameters: None
252				#
253				sub process_entries {
254				my($dir, $qf) = ();
255				#warn "] inside process_entries...\n";
256
257				while ($qf = shift @filequeue) {
258				($dir, $_) = ($qf->[0], $qf->[1]);
259				#warn "] inside process_entries loop, $dir, $_, ...\n";
260				next if /^..?$/;
261				my $name = "$dir/$_";
262				#warn "] processing file '$name'.\n";
263				if ($name eq '-/') {
264				# get info from stdin
265				process_stdin();
266				}
267				elsif (-d $name) {
268				# a directory, process it recursively.
269				process_dir($name);
270				}
271				else {
272				process_file($dir, $_);
273				}
274				}
275				}
276
277				# =========================================================== &s-sub ===
278				# D - Process info given from stdin, which should of form same as
279				# find -printf "%p\t%s\n"
280				#
281				sub process_stdin {
282
283				while (<>){
284				croak "Wrong input format: '$_'" unless m{(.*)/(.+?)\t(\d+)$};
285				my ($dn, $fn, $size) = ( $1, $2, $3 );
286				my $fSdxl = [ get_soundex($fn) ]; # file soundex list
287				push @fileInfo, [ $dn, $fn, $size, $fSdxl, ];
288
289				dbg_show(100,"fileInfo",@fileInfo);
290				map { $sdxCnt{$_}++ } @$fSdxl;
291				}
292				}
293
294				# =========================================================== &s-sub ===
295				# D - Process given dir recursively
296				# N - BFS is more memory friendly than DFS
297				#
298				# T - $dir="/home/tong/tmp"
299				sub process_dir {
300				my($dir) = @_;
301				#warn "] processing dir '$dir'...\n";
302
303				opendir(DIR,$dir) \|\| die "File::Searcher::Similars error: Can't open $dir";
304				my @filenames = readdir(DIR);
305				closedir(DIR);
306
307				# record the dirname/fname pair to queue
308				@filequeue = (@filequeue, map { [$dir, $_] } @filenames);
309				dbg_show(100,"filequeue", @filequeue)
310				}
311
312				# =========================================================== &s-sub ===
313				# S - process_file($dirname, $fname), process file $fname under $dirname
314				# D - Process one file and update global vars
315				# U -
316				#
317				# I - Input parameters:
318				# $dirname: dir name string
319				# $fname: file name string
320				# O - Global vars get updated
321				# fileInfo [ $dirname, $fname, $fsize, [ file_soundex ] ]
322				# T -
323
324				sub process_file {
325				my ($dn, $fn) = @_;
326				my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,@rest) =
327				stat("$dn/$fn");
328				my $fSdxl = [ get_soundex($fn) ]; # file soundex list
329				push @fileInfo, [ $dn, $fn, $size, $fSdxl, ];
330
331				dbg_show(100,"fileInfo",@fileInfo);
332				map { $sdxCnt{$_}++ } @$fSdxl;
333				}
334
335				# =========================================================== &s-sub ===
336				# S - get_soundex($fname), get soundex for file $fname
337				# D - Return a list of soundex of each individual word in file name
338				# U - $aref = [ get_soundex($fname) ];
339				#
340				# I - Input parameters:
341				# $fname: file name string
342				# O - sorted anonymous soundex array w/ duplications removed
343				# T - @out = get_soundex 'Java_RMI - _Remote_Method_Invocation_ch03.tgz';
344				# @out = get_soundex 'ASuchKindOfFile.tgz';
345
346				sub get_soundex {
347				my ($fn) = @_;
348				# split to individual words
349				my @fn_wlist = split /[-_[:cntrl:][:blank:][:punct:][:digit:]]/i, $fn;
350				# discards file extension, if any
351				pop @fn_wlist if @fn_wlist >= 1;
352				# if it is single word, try further decompose SuchKindOfWord
353				@fn_wlist = $fn_wlist[0] =~ /[A-Z][^A-Z]*/g
354				if @fn_wlist == 1 && $fn_wlist[0] =~ /^[A-Z]/;
355				# wash short
356				dbg_show(100,"wlist 0",@fn_wlist);
357				@fn_wlist = arrwash_short(\@fn_wlist);
358				dbg_show(100,"wlist 1",@fn_wlist);
359
360				# language specific handling
361				@fn_wlist = arrwash_lang(\@fn_wlist);
362				dbg_show(100,"wlist 2",@fn_wlist);
363
364				# change word to soundex, record soundex/word in global hash
365				map {
366				if (/[[:alpha:]]/) {
367				my $sdx = soundex($_);
368				$wrdLst{$sdx}{$_}++;
369				s/^.*$/$sdx/;
370				}
371				} @fn_wlist;
372				dbg_show(1,"wrdLst",%wrdLst);
373
374				# wash empty/duplicates
375				@fn_wlist = grep(!/^$/, @fn_wlist);
376				@fn_wlist = arrwash_dup(\@fn_wlist);
377
378				return sort @fn_wlist;
379				}
380
381				# =========================================================== &s-sub ===
382				# S - arrwash_short($arr_ref), wash short from array $arr_ref
383				# D - weed out empty lines and less-than-3-letter words (e.g. ch12)
384				# U - @fn_wlist = arrwash_short(\@fn_wlist);
385				#
386
387				sub arrwash_short($) {
388				my ($arr_ref) = @_;
389				return @$arr_ref unless @$arr_ref >= 1;
390				my @r= grep tr/a-zA-Z// >=3, @$arr_ref;
391				return @r if @r;
392				return @$arr_ref # for upper ASCII
393				if grep(/[\200-\377]/, @$arr_ref);
394				return @r;
395				}
396
397				# =========================================================== &s-sub ===
398				# S - arrwash_dup($arr_ref), wash duplicates from array $arr_ref
399				# D - weed out duplicates
400				# U - @fn_wlist = arrwash_dup(\@fn_wlist);
401				#
402
403				sub arrwash_dup($) {
404				my ($arr_ref) = @_;
405				my %saw;
406				return grep !$saw{$_}++, @$arr_ref;
407				}
408
409				# =========================================================== &s-sub ===
410				# S - arrwash_lang($arr_ref), language specific washing from array $arr_ref
411				# U - @fn_wlist = arrwash_lang(\@fn_wlist);
412				#
413
414				sub arrwash_lang($) {
415				my ($arr_ref) = @_;
416
417				# split Chinese into individual chars
418				my @r;
419				map {
420				if (/[\200-\377]{2}/) {
421				@r = (@r, /[\200-\377]{2}/g);
422				}
423				else {
424				@r = (@r, $_);
425				}
426				} @$arr_ref;
427
428				return @r;
429				}
430
431				# =========================================================== &s-sub ===
432				# S - similarity_check_name: similarity check on glabal array @fileInfo
433				# U - similarity_check_name();
434				#
435				# I - Input parameters: None
436				# O - similar files printed on stdout
437
438				sub similarity_check_name {
439
440				# get a ordered (by soundex count) file Info array
441				# (Use short file names to compare to long file names)
442				my @fileInfos =
443				sort { $#{$a->[$N_fSdxl]} cmp $#{$b->[$N_fSdxl]} } @fileInfo;
444				dbg_show(100,"\@fileInfos", @fileInfos);
445
446				my @saw = (0) x ($#fileInfos+1);
447				foreach my $ii (0..$#fileInfos) {
448				#warn "] ii=$ii\n";
449				my @similar = ();
450				my $fnl;
451
452				dbg_show(100,"\@fileInfos", $fileInfos[$ii]);
453				push @similar, [$ii, $ii, $fileInfos[$ii]->[$N_fSize] ];
454				foreach my $jj (($ii+1) ..$#fileInfos) {
455				$fnl=0; # 0 is good enough since file at [ii] is
456				# shorter in name than the one at [jj]
457				#warn "] jj=$jj\n";
458				# don't care about same dir files?
459				next
460				if (!$fc_level && ($fileInfos[$ii]->[$N_dName]
461				eq $fileInfos[$jj]->[$N_dName])) ;
462				if (file_diff(\@fileInfos, $ii, $jj) >= $config{Threshold}) {
463				push @similar, [$ii, $jj, $fileInfos[$jj]->[$N_fSize] ];
464				$fnl= length($fileInfos[$jj]->[$N_fName]) if
465				$fnl < length($fileInfos[$jj]->[$N_fName]);
466				}
467				}
468				dbg_show(100,"\@similar", @similar);
469				# output unvisited potential similars by each row, order by fSize
470				@similar = grep {!$saw[$_->[1]]}
471				sort { $a->[2] <=> $b->[2] } @similar;
472				next unless @similar>1;
473				print $config{Deliminator};
474				foreach my $similar (@similar) {
475				print file_info(\@fileInfos, $similar->[1], $fnl). "\n";
476				$saw[$similar->[1]]++;
477				}
478				}
479				}
480
481				# =========================================================== &s-sub ===
482				sub file_info ($$$) {
483				my ($fileInfos, $ndx, $fnl) = @_;
484				return sprintf($config{Format}, $fileInfos->[$ndx]->[$N_fSize],
485				$fileInfos->[$ndx]->[$N_fName],
486				' ' x ($fnl - length($fileInfos->[$ndx]->[$N_fName])),
487				"$fileInfos->[$ndx]->[$N_dName]");
488				}
489
490				# =========================================================== &s-sub ===
491				# S - file_diff: determind how difference two files are by name & size
492				# U - file_diff($fileInfos, $ndx1, $ndx2);
493				#
494				# I - $fileInfos: reference to @fileInfos
495				# $ndx1, $ndx2: index to the two file in @fileInfos
496				# O - 100%: files are identical
497				# 0%: no similarity at all
498				sub file_diff ($$$) {
499				my ($fileInfos, $ndx1, $ndx2) = @_;
500
501				return 0 unless @{$fileInfos->[$ndx1]->[$N_fSdxl]};
502
503				# find intersection in two soudex array
504				my %count = ();
505				foreach my $element
506				(@{$fileInfos->[$ndx1]->[$N_fSdxl]},
507				@{$fileInfos->[$ndx2]->[$N_fSdxl]}) { $count{$element}++ }
508				# since there is no duplication in each of file soudex
509				my $intersection =
510				grep $count{$_} > 1, keys %count;
511				# return p * normal(\common soudex) + (1-p) * ( 1 - normal(\delta fSize))
512				# so the bigger the return value is, the similar the two files are
513				$intersection *= $config{WeightSoundex} /
514				(@{$fileInfos->[$ndx1]->[$N_fSdxl]});
515				dbg_show(100,"intersection", $intersection, $ndx1, $ndx2);
516				my $WeightfSzie = 100 - $config{WeightSoundex};
517				my $dfSize = abs($fileInfos->[$ndx1]->[$N_fSize] -
518				$fileInfos->[$ndx2]->[$N_fSize]) * $WeightfSzie /
519				($fileInfos->[$ndx1]->[$N_fSize] + 1);
520				$dfSize = $dfSize > $WeightfSzie ? $WeightfSzie : $dfSize;
521				my $file_diff = $intersection + ($WeightfSzie - $dfSize);
522				if ($file_diff >= $config{Threshold}) {
523				dbg_show(010,"file_diff",
524				@{$fileInfos->[$ndx1]},
525				@{$fileInfos->[$ndx2]},
526				$intersection, $dfSize, $file_diff
527				);
528				}
529				return $file_diff;
530				}
531
532
533				1;
534				__END__