File Coverage

blib/lib/File/FindSimilars.pm

Criterion	Covered	Total	%
statement	24	158	15.1
branch	0	42	0.0
condition	0	6	0.0
subroutine	8	22	36.3
pod	3	14	21.4
total	35	242	14.4

line	stmt	bran	cond	sub	pod	time	code
1							package File::FindSimilars;
2
3							# @Author: Tong SUN, (c)2001-2016, all right reserved
4							# @Version: $Date: 2015/08/30 13:04:54 $ $Revision: 2.7 $
5							# @HomeURL: http://xpt.sourceforge.net/
6
7							# {{{ LICENSE:
8
9							#
10							# Permission to use, copy, modify, and distribute this software and its
11							# documentation for any purpose and without fee is hereby granted, provided
12							# that the above copyright notices appear in all copies and that both those
13							# copyright notices and this permission notice appear in supporting
14							# documentation, and that the names of author not be used in advertising or
15							# publicity pertaining to distribution of the software without specific,
16							# written prior permission. Tong Sun makes no representations about the
17							# suitability of this software for any purpose. It is provided "as is"
18							# without express or implied warranty.
19							#
20							# TONG SUN DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
21							# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL ADOBE
22							# SYSTEMS INCORPORATED AND DIGITAL EQUIPMENT CORPORATION BE LIABLE FOR ANY
23							# SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER
24							# RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF
25							# CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
26							# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
27							#
28
29							# }}}
30
31							# {{{ POD, Intro:
32
33							=head1 NAME
34
35							File::FindSimilars - Fast similar-files finder
36
37							=head1 SYNOPSIS
38
39							use File::FindSimilars;
40
41							my $similars_finder =
42							File::FindSimilars->new( { fc_level => $fc_level, } );
43							$similars_finder->find_for(\@ARGV);
44							$similars_finder->similarity_check();
45
46							=head1 DESCRIPTION
47
48							Extremely fast file similarity checker. Similar-sized and similar-named
49							files are picked out as suspicious candidates of duplicated files.
50
51							It uses advanced soundex vector algorithm to determine the similarity
52							between files. Generally it means that if there are n files, each having
53							approximately m words in the file name, the degree of calculation is merely
54
55							O(n^2 * m)
56
57							which is over thousands times faster than any existing file fingerprinting
58							technology.
59
60							=head2 ALGORITHM EXPLANATION
61
62							The self-test output will help you understand what the module do and what
63							would you expect from the outcome.
64
65							$ make test
66							PERL_DL_NONLAZY=1 /usr/bin/perl "-Iblib/lib" "-Iblib/arch" test.pl
67							1..5 todo 2;
68							# Running under perl version 5.010000 for linux
69							# Current time local: Wed Nov 5 17:45:19 2008
70							# Current time GMT: Wed Nov 5 22:45:19 2008
71							# Using Test.pm version 1.25
72							# Testing File::FindSimilars version 2.04
73
74							. . . .
75
76							== Testing 2, files under test/ subdir:
77
78							9 test/(eBook) GNU - Python Standard Library 2001.pdf
79							3 test/Audio Book - The Grey Coloured Bunnie.mp3
80							5 test/ColoredGrayBunny.ogg
81							5 test/GNU - 2001 - Python Standard Library.pdf
82							4 test/GNU - Python Standard Library (2001).rar
83							9 test/LayoutTest.java
84							3 test/PopupTest.java
85							2 test/Python Standard Library.zip
86							ok 2 # (test.pl at line 83 TODO?!)
87
88							Note:
89
90							- The findsimilars script will pick out similar files from them in next test.
91							- Let's assume that the number represent the file size in KB.
92
93							== Testing 3 result should be:
94
95							## =========
96							3 'Audio Book - The Grey Coloured Bunnie.mp3' 'test/'
97							5 'ColoredGrayBunny.ogg' 'test/'
98
99							## =========
100							4 'GNU - Python Standard Library (2001).rar' 'test/'
101							5 'GNU - 2001 - Python Standard Library.pdf' 'test/'
102							ok 3
103
104							Note:
105
106							- There are 2 groups of similar files picked out by the script.
107							- The similar files are picked because their file names look similar.
108							Note that the first group looks different and spells differently too,
109							which means that the script is versatile enough to handle file names that
110							don't have space in it, and robust enough to deal with spelling mistakes.
111							- Apart from the file name, the file size plays an important role as well.
112							- There are 2 files in the second similar files group, the book files group.
113							- The file 'Python Standard Library.zip' is not considered to be similar to
114							the group because its size is not similar to the group.
115
116							== Testing 4, if Python.zip is bigger, result should be:
117
118							## =========
119							3 'Audio Book - The Grey Coloured Bunnie.mp3' 'test/'
120							5 'ColoredGrayBunny.ogg' 'test/'
121
122							## =========
123							4 'Python Standard Library.zip' 'test/'
124							4 'GNU - Python Standard Library (2001).rar' 'test/'
125							5 'GNU - 2001 - Python Standard Library.pdf' 'test/'
126							ok 4
127
128							Note:
129
130							- There are now 3 files in the book files group.
131							- The file 'Python Standard Library.zip' is included in the
132							group because its size is now similar to the group.
133
134							== Testing 5, if Python.zip is even bigger, result should be:
135
136							## =========
137							3 'Audio Book - The Grey Coloured Bunnie.mp3' 'test/'
138							5 'ColoredGrayBunny.ogg' 'test/'
139
140							## =========
141							4 'GNU - Python Standard Library (2001).rar' 'test/'
142							5 'GNU - 2001 - Python Standard Library.pdf' 'test/'
143							6 'Python Standard Library.zip' 'test/'
144							9 '(eBook) GNU - Python Standard Library 2001.pdf' 'test/'
145							ok 5
146
147							Note:
148
149							- There are 4 files in the book files group now.
150							- The file 'Python Standard Library.zip' is still in the group.
151							- But this time, because it is also considered to be similar to the .pdf
152							file (since their size are now similar, 6 vs 9), a 4th file the .pdf one
153							is now included in the book group.
154							- If the size of file 'Python Standard Library.zip' is 12(KB), then the
155							book files group will be split into two. Do you know why and
156							which files each group will contain?
157
158							The File::FindSimilars package comes with a fully functional demo
159							script findsimilars. Please refer to its help file for further
160							explanations.
161
162							This package is highly customizable. Refer to the class method C for
163							details.
164
165							=head1 DEPENDS
166
167							This module depends on L, but not L.
168
169							=cut
170
171							# }}}
172
173							# {{{ Global Declaration:
174
175							# ============================================================== &us ===
176							# ............................................................. Uses ...
177
178							# -- global modules
179	1			1		108579	use strict; # !
	1					3
	1					24
180
181	1			1		5	use Carp;
	1					2
	1					57
182	1			1		4	use Getopt::Long;
	1					7
	1					6
183	1			1		103	use File::Basename;
	1					1
	1					66
184	1			1		4	use Text::Soundex;
	1					1
	1					72
185
186	1			1		4	use base qw(Class::Accessor::Fast);
	1					2
	1					122
187
188							# -- local modules
189
190				0	0		sub dbg_show {};
191							#use MyDbg; $MyDbg::debugging=010;
192
193							# ============================================================== &gv ===
194							# .................................................. Global Varibles ...
195							#
196
197							our @EXPORT = ( ); # may even omit this line
198
199	1			1		7	use vars qw($progname $VERSION $debugging);
	1					7
	1					66
200	1			1		6	use vars qw(%config @filequeue @fileInfo %sdxCnt %wrdLst);
	1					1
	1					2161
201
202							# @fileInfo: List of the following list:
203							my (
204							$N_dName, # dir name
205							$N_fName, # file name
206							$N_fSize, # file size
207							$N_fSdxl, # file soundex list, reference
208							) = (0..9);
209
210							# ============================================================== &cs ===
211							# ................................................. Constant setting ...
212							#
213							$VERSION = sprintf("%d.%02d", q$Revision: 2.7 $ =~ /(\d+)\.(\d+)/);
214
215							# }}}
216
217							# ############################################################## &ss ###
218							# ................................................ Subroutions start ...
219
220							=head1 METHODS
221
222							=head2 File::FindSimilars->new(\%config_param)
223
224							Initialize the object.
225
226							my $similars_finder = File::FindSimilars->new();
227
228							or,
229
230							my $similars_finder = File::FindSimilars->new( {} );
231
232							which are the same as:
233
234							my $similars_finder = File::FindSimilars->new( {
235							soundex_weight => 50, # percentage of weight that soundex takes,
236							# the rest is for file size
237							fc_threshold => 75, # over which files are considered similar
238							delimiter => "\n## =========\n", # delimiter between files output
239							format => "%12d '%s' %s'%s'", # file info print format
240							fc_level => 0, # file comparison level
241							verbose => 0,
242
243							} );
244
245							What shown above are default settings. Any of the C<%config_param> attribute can be omitted when calling the new method.
246
247							The C is the only class method. All the rest methods are object methods.
248
249							=head2 Object attribute: soundex_weight([set_val])
250
251							Percentage of weight that soundex takes, the rest of percentage is for file size.
252
253							Provide the C to change the attribute, omitting it to retrieve the attribute value.
254
255							=head2 Object attribute: fc_threshold([set_val])
256
257							The threshold over which files are considered similar.
258
259							Provide the C to change the attribute, omitting it to retrieve the attribute value.
260
261							=head2 Object attribute: delimiter([set_val])
262
263							Delimiter printed between file info outputs.
264
265							Provide the C to change the attribute, omitting it to retrieve the attribute value.
266
267							=head2 Object attribute: format([set_val])
268
269							Format used to print file info.
270
271							Provide the C to change the attribute, omitting it to retrieve the attribute value.
272
273							=head2 Object attribute: fc_level([set_val])
274
275							File comparison level. Whether to check similar files within the same folder: 0, no; 1, yes.
276
277							Provide the C to change the attribute, omitting it to retrieve the attribute value.
278
279							=head2 Object attribute: verbose([set_val])
280
281							Verbose level. Whether to output progress info: 0, no; 1, yes.
282
283							Provide the C to change the attribute, omitting it to retrieve the attribute value.
284
285							=cut
286
287							File::FindSimilars
288							->mk_accessors(qw(soundex_weight fc_threshold
289							delimiter format fc_level verbose));
290
291							%config =
292							(
293
294							soundex_weight => 50, # percentage of weight that soundex takes,
295							# the rest is for file size
296							fc_threshold => 75, # over which files are considered similar
297							delimiter => "\n## =========\n", # delimiter between files output
298							format => "%12d '%s' %s'%s'", # file info print format
299
300							fc_level => 0, # file comparison level
301							verbose => 0,
302							);
303
304
305							# =========================================================== &s-sub ===
306
307							sub new {
308	0	0		0	1		ref(my $class = shift)
309							and croak "new is a class method. class name needed.";
310	0						my ($arg_ref) = @_;
311	0						my $self = $class->SUPER::new({%config, %$arg_ref});
312	0						$config{soundex_weight} = $self->soundex_weight;
313	0						$config{fc_threshold} = $self->fc_threshold;
314	0						$config{delimiter} = $self->delimiter;
315	0						$config{format} = $self->format;
316	0						$config{fc_level} = $self->fc_level;
317	0						$config{verbose} = $self->verbose;
318							#$config{} = $self->;
319	0						return $self;
320							}
321
322							# =========================================================== &s-sub ===
323
324							=head2 Object method: find_for($array_ref)
325
326							Set directory queue for similarity checking. Each entry in C<$array_ref>
327							is a directory to check into. E.g.,
328
329							$similars_finder->find_for(\@ARGV);
330
331							=cut
332
333							sub find_for {
334	0			0	1		my ($self, $init_dirs) = @_;
335
336							# threshold $config{fc_threshold}
337							print STDERR "Searching in directory(ies): @$init_dirs with level $config{fc_level}...\n\n"
338	0	0					if $config{verbose};
339
340	0						@filequeue = @fileInfo = ();
341	0						@filequeue = (@filequeue, map { [$_, ''] } @$init_dirs);
	0
342	0						process_entries();
343
344	0						dbg_show(100,"\@fileInfo", @fileInfo);
345	0						dbg_show(100,"%sdxCnt", %sdxCnt);
346	0						dbg_show(100,"%wrdLst", %wrdLst);
347							}
348
349							# =========================================================== &s-sub ===
350							# I - Input: global array @filequeue
351							# Input parameters: None
352							#
353							sub process_entries {
354	0			0	0		my($dir, $qf) = ();
355							#warn "] inside process_entries...\n";
356
357	0						while ($qf = shift @filequeue) {
358	0						($dir, $_) = ($qf->[0], $qf->[1]);
359							#warn "] inside process_entries loop, $dir, $_, ...\n";
360	0	0					next if /^..?$/;
361	0						my $name = "$dir/$_";
362							#warn "] processing file '$name'.\n";
363	0	0					if ($name eq '-/') {
		0
364							# get info from stdin
365	0						process_stdin();
366							}
367							elsif (-d $name) {
368							# a directory, process it recursively.
369	0						process_dir($name);
370							}
371							else {
372	0						process_file($dir, $_);
373							}
374							}
375							}
376
377							# =========================================================== &s-sub ===
378							# D - Process info given from stdin, which should of form same as
379							# find -printf "%p\t%s\n"
380							#
381							sub process_stdin {
382
383	0			0	0		while (<>){
384	0	0					croak "Wrong input format: '$_'" unless m{(.*)/(.+?)\t(\d+)$};
385	0						my ($dn, $fn, $size) = ( $1, $2, $3 );
386	0						my $fSdxl = [ get_soundex($fn) ]; # file soundex list
387	0						push @fileInfo, [ $dn, $fn, $size, $fSdxl, ];
388
389	0						dbg_show(100,"fileInfo",@fileInfo);
390	0						map { $sdxCnt{$_}++ } @$fSdxl;
	0
391							}
392							}
393
394							# =========================================================== &s-sub ===
395							# D - Process given dir recursively
396							# N - BFS is more memory friendly than DFS
397							#
398							# T - $dir="/home/tong/tmp"
399							sub process_dir {
400	0			0	0		my($dir) = @_;
401							#warn "] processing dir '$dir'...\n";
402
403	0	0					opendir(DIR,$dir) \|\| die "File::FindSimilars error: Can't open $dir";
404	0						my @filenames = readdir(DIR);
405	0						closedir(DIR);
406
407							# record the dirname/fname pair to queue
408	0						@filequeue = (@filequeue, map { [$dir, $_] } @filenames);
	0
409	0						dbg_show(100,"filequeue", @filequeue)
410							}
411
412							# =========================================================== &s-sub ===
413							# S - process_file($dirname, $fname), process file $fname under $dirname
414							# D - Process one file and update global vars
415							# U -
416							#
417							# I - Input parameters:
418							# $dirname: dir name string
419							# $fname: file name string
420							# O - Global vars get updated
421							# fileInfo [ $dirname, $fname, $fsize, [ file_soundex ] ]
422							# T -
423
424							sub process_file {
425	0			0	0		my ($dn, $fn) = @_;
426	0						my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,@rest) =
427							stat("$dn/$fn");
428	0						my $fSdxl = [ get_soundex($fn) ]; # file soundex list
429	0						push @fileInfo, [ $dn, $fn, $size, $fSdxl, ];
430
431	0						dbg_show(100,"fileInfo",@fileInfo);
432	0						map { $sdxCnt{$_}++ } @$fSdxl;
	0
433							}
434
435							# =========================================================== &s-sub ===
436							# S - get_soundex($fname), get soundex for file $fname
437							# D - Return a list of soundex of each individual word in file name
438							# U - $aref = [ get_soundex($fname) ];
439							#
440							# I - Input parameters:
441							# $fname: file name string
442							# O - sorted anonymous soundex array w/ duplications removed
443							# T - @out = get_soundex 'Java_RMI - _Remote_Method_Invocation_ch03.tgz';
444							# @out = get_soundex 'ASuchKindOfFile.tgz';
445
446							sub get_soundex {
447	0			0	0		my ($fn) = @_;
448							# split to individual words
449	0						my @fn_wlist = split /[-_[:cntrl:][:blank:][:punct:][:digit:]]/i, $fn;
450							# discards file extension, if any
451	0	0					pop @fn_wlist if @fn_wlist >= 1;
452							# if it is single word, try further decompose SuchKindOfWord
453	0	0	0				@fn_wlist = $fn_wlist[0] =~ /[A-Z][^A-Z]*/g
454							if @fn_wlist == 1 && $fn_wlist[0] =~ /^[A-Z]/;
455							# wash short
456	0						dbg_show(100,"wlist 0",@fn_wlist);
457	0						@fn_wlist = arrwash_short(\@fn_wlist);
458	0						dbg_show(100,"wlist 1",@fn_wlist);
459
460							# language specific handling
461	0						@fn_wlist = arrwash_lang(\@fn_wlist);
462	0						dbg_show(100,"wlist 2",@fn_wlist);
463
464							# change word to soundex, record soundex/word in global hash
465							map {
466	0	0					if (/[[:alpha:]]/) {
	0
467	0						my $sdx = soundex($_);
468	0						$wrdLst{$sdx}{$_}++;
469	0						s/^.*$/$sdx/;
470							}
471							} @fn_wlist;
472	0						dbg_show(1,"wrdLst",%wrdLst);
473
474							# wash empty/duplicates
475	0						@fn_wlist = grep(!/^$/, @fn_wlist);
476	0						@fn_wlist = arrwash_dup(\@fn_wlist);
477
478	0						return sort @fn_wlist;
479							}
480
481							# =========================================================== &s-sub ===
482							# S - arrwash_short($arr_ref), wash short from array $arr_ref
483							# D - weed out empty lines and less-than-3-letter words (e.g. ch12)
484							# U - @fn_wlist = arrwash_short(\@fn_wlist);
485							#
486
487							sub arrwash_short($) {
488	0			0	0		my ($arr_ref) = @_;
489	0	0					return @$arr_ref unless @$arr_ref >= 1;
490	0						my @r= grep tr/a-zA-Z// >=3, @$arr_ref;
491	0	0					return @r if @r;
492	0	0					return @$arr_ref # for upper ASCII
493							if grep(/[\200-\377]/, @$arr_ref);
494	0						return @r;
495							}
496
497							# =========================================================== &s-sub ===
498							# S - arrwash_dup($arr_ref), wash duplicates from array $arr_ref
499							# D - weed out duplicates
500							# U - @fn_wlist = arrwash_dup(\@fn_wlist);
501							#
502
503							sub arrwash_dup($) {
504	0			0	0		my ($arr_ref) = @_;
505	0						my %saw;
506	0						return grep !$saw{$_}++, @$arr_ref;
507							}
508
509							# =========================================================== &s-sub ===
510							# S - arrwash_lang($arr_ref), language specific washing from array $arr_ref
511							# U - @fn_wlist = arrwash_lang(\@fn_wlist);
512							#
513
514							sub arrwash_lang($) {
515	0			0	0		my ($arr_ref) = @_;
516
517							# split Chinese into individual chars
518	0						my @r;
519							map {
520	0	0					if (/[\200-\377]{2}/) {
	0
521	0						@r = (@r, /[\200-\377]{2}/g);
522							}
523							else {
524	0						@r = (@r, $_);
525							}
526							} @$arr_ref;
527
528	0						return @r;
529							}
530
531							=head2 Object method: similarity_check()
532
533							Do similarity check on the queued directories. Print similar files info on
534							stdout according to the configured format and delimiters. E.g.,
535
536							$similars_finder->similarity_check();
537
538							=cut
539
540							# =========================================================== &s-sub ===
541							# S - similarity_check: similarity check on glabal array @fileInfo
542							# U - similarity_check();
543							#
544							# I - Input parameters: None
545							# O - similar files printed on stdout
546
547							sub similarity_check {
548
549							# get a ordered (by soundex count and file name) of file Info array
550							# (Use short file names to compare to long file names)
551							# use Schwartzian Transform to sort on 2 fields for efficiency
552	0						my @fileInfos = map { $_->[0] }
553	0						sort { $a->[1] cmp $b->[1] }
554	0			0	1		map { [ $_,
555	0						sprintf "%3d%6s", $#{$_->[$N_fSdxl]}, $_->[$N_fSdxl][0]
	0
556							] } @fileInfo;
557	0						dbg_show(100,"\@fileInfos", @fileInfos);
558
559	0						my @saw = (0) x ($#fileInfos+1);
560	0						foreach my $ii (0..$#fileInfos) {
561							#warn "] ii=$ii\n";
562	0						my @similar = ();
563	0						my $fnl;
564
565	0						dbg_show(100,"\@fileInfos", $fileInfos[$ii]);
566	0						push @similar, [$ii, $ii, $fileInfos[$ii]->[$N_fSize] ];
567	0						foreach my $jj (($ii+1) ..$#fileInfos) {
568	0						$fnl=0; # 0 is good enough since file at [ii] is
569							# shorter in name than the one at [jj]
570							# don't care about same dir files?
571							next
572	0	0	0				if (!$config{fc_level} && ($fileInfos[$ii]->[$N_dName]
573							eq $fileInfos[$jj]->[$N_dName])) ;
574	0	0					if (file_diff(\@fileInfos, $ii, $jj) >= $config{fc_threshold}) {
575	0						push @similar, [$ii, $jj, $fileInfos[$jj]->[$N_fSize] ];
576	0	0					$fnl= length($fileInfos[$jj]->[$N_fName]) if
577							$fnl < length($fileInfos[$jj]->[$N_fName]);
578							}
579							}
580	0						dbg_show(100,"\@similar", @similar);
581							# output unvisited potential similars by each row, order by fSize
582	0						@similar = grep {!$saw[$_->[1]]}
583	0						sort { $a->[2] <=> $b->[2] } @similar;
	0
584	0	0					next unless @similar>1;
585	0						print $config{delimiter};
586	0						foreach my $similar (@similar) {
587	0						print file_info(\@fileInfos, $similar->[1], $fnl). "\n";
588	0						$saw[$similar->[1]]++;
589							}
590							}
591							}
592
593							# =========================================================== &s-sub ===
594							sub file_info ($$$) {
595	0			0	0		my ($fileInfos, $ndx, $fnl) = @_;
596	0						return sprintf($config{format}, $fileInfos->[$ndx]->[$N_fSize],
597							$fileInfos->[$ndx]->[$N_fName],
598							' ' x ($fnl - length($fileInfos->[$ndx]->[$N_fName])),
599							"$fileInfos->[$ndx]->[$N_dName]");
600							}
601
602							# =========================================================== &s-sub ===
603							# S - file_diff: determind how difference two files are by name & size
604							# U - file_diff($fileInfos, $ndx1, $ndx2);
605							#
606							# I - $fileInfos: reference to @fileInfos
607							# $ndx1, $ndx2: index to the two file in @fileInfos
608							# O - 100%: files are identical
609							# 0%: no similarity at all
610							sub file_diff ($$$) {
611	0			0	0		my ($fileInfos, $ndx1, $ndx2) = @_;
612
613	0	0					return 0 unless @{$fileInfos->[$ndx1]->[$N_fSdxl]};
	0
614
615							# find intersection in two soudex array
616	0						my %count = ();
617	0						foreach my $element
618	0						(@{$fileInfos->[$ndx1]->[$N_fSdxl]},
619	0						@{$fileInfos->[$ndx2]->[$N_fSdxl]}) { $count{$element}++ }
	0
620							# since there is no duplication in each of file soudex
621							my $intersection =
622	0						grep $count{$_} > 1, keys %count;
623							# return p * normal(\common soudex) + (1-p) * ( 1 - normal(\delta fSize))
624							# so the bigger the return value is, the similar the two files are
625							$intersection *= $config{soundex_weight} /
626	0						(@{$fileInfos->[$ndx1]->[$N_fSdxl]});
	0
627	0						dbg_show(100,"intersection", $intersection, $ndx1, $ndx2);
628	0						my $WeightfSzie = 100 - $config{soundex_weight};
629	0						my $dfSize = abs($fileInfos->[$ndx1]->[$N_fSize] -
630							$fileInfos->[$ndx2]->[$N_fSize]) * $WeightfSzie /
631							($fileInfos->[$ndx1]->[$N_fSize] + 1);
632	0	0					$dfSize = $dfSize > $WeightfSzie ? $WeightfSzie : $dfSize;
633	0						my $file_diff = $intersection + ($WeightfSzie - $dfSize);
634	0	0					if ($file_diff >= $config{fc_threshold}) {
635							dbg_show(010,"file_diff",
636	0						@{$fileInfos->[$ndx1]},
637	0						@{$fileInfos->[$ndx2]},
	0
638							$intersection, $dfSize, $file_diff
639							);
640							}
641	0						return $file_diff;
642							}
643
644
645							1;
646							__END__