| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package File::FindSimilars; |
|
2
|
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
# @Author: Tong SUN, (c)2001-2016, all right reserved |
|
4
|
|
|
|
|
|
|
# @Version: $Date: 2015/08/30 13:04:54 $ $Revision: 2.7 $ |
|
5
|
|
|
|
|
|
|
# @HomeURL: http://xpt.sourceforge.net/ |
|
6
|
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
# {{{ LICENSE: |
|
8
|
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
# |
|
10
|
|
|
|
|
|
|
# Permission to use, copy, modify, and distribute this software and its |
|
11
|
|
|
|
|
|
|
# documentation for any purpose and without fee is hereby granted, provided |
|
12
|
|
|
|
|
|
|
# that the above copyright notices appear in all copies and that both those |
|
13
|
|
|
|
|
|
|
# copyright notices and this permission notice appear in supporting |
|
14
|
|
|
|
|
|
|
# documentation, and that the names of author not be used in advertising or |
|
15
|
|
|
|
|
|
|
# publicity pertaining to distribution of the software without specific, |
|
16
|
|
|
|
|
|
|
# written prior permission. Tong Sun makes no representations about the |
|
17
|
|
|
|
|
|
|
# suitability of this software for any purpose. It is provided "as is" |
|
18
|
|
|
|
|
|
|
# without express or implied warranty. |
|
19
|
|
|
|
|
|
|
# |
|
20
|
|
|
|
|
|
|
# TONG SUN DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL |
|
21
|
|
|
|
|
|
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL ADOBE |
|
22
|
|
|
|
|
|
|
# SYSTEMS INCORPORATED AND DIGITAL EQUIPMENT CORPORATION BE LIABLE FOR ANY |
|
23
|
|
|
|
|
|
|
# SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER |
|
24
|
|
|
|
|
|
|
# RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF |
|
25
|
|
|
|
|
|
|
# CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN |
|
26
|
|
|
|
|
|
|
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
|
27
|
|
|
|
|
|
|
# |
|
28
|
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
# }}} |
|
30
|
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
# {{{ POD, Intro: |
|
32
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
=head1 NAME |
|
34
|
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
File::FindSimilars - Fast similar-files finder |
|
36
|
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
38
|
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
use File::FindSimilars; |
|
40
|
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
my $similars_finder = |
|
42
|
|
|
|
|
|
|
File::FindSimilars->new( { fc_level => $fc_level, } ); |
|
43
|
|
|
|
|
|
|
$similars_finder->find_for(\@ARGV); |
|
44
|
|
|
|
|
|
|
$similars_finder->similarity_check(); |
|
45
|
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
47
|
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
Extremely fast file similarity checker. Similar-sized and similar-named |
|
49
|
|
|
|
|
|
|
files are picked out as suspicious candidates of duplicated files. |
|
50
|
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
It uses advanced soundex vector algorithm to determine the similarity |
|
52
|
|
|
|
|
|
|
between files. Generally it means that if there are n files, each having |
|
53
|
|
|
|
|
|
|
approximately m words in the file name, the degree of calculation is merely |
|
54
|
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
O(n^2 * m) |
|
56
|
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
which is over thousands times faster than any existing file fingerprinting |
|
58
|
|
|
|
|
|
|
technology. |
|
59
|
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
=head2 ALGORITHM EXPLANATION |
|
61
|
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
The self-test output will help you understand what the module do and what |
|
63
|
|
|
|
|
|
|
would you expect from the outcome. |
|
64
|
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
$ make test |
|
66
|
|
|
|
|
|
|
PERL_DL_NONLAZY=1 /usr/bin/perl "-Iblib/lib" "-Iblib/arch" test.pl |
|
67
|
|
|
|
|
|
|
1..5 todo 2; |
|
68
|
|
|
|
|
|
|
# Running under perl version 5.010000 for linux |
|
69
|
|
|
|
|
|
|
# Current time local: Wed Nov 5 17:45:19 2008 |
|
70
|
|
|
|
|
|
|
# Current time GMT: Wed Nov 5 22:45:19 2008 |
|
71
|
|
|
|
|
|
|
# Using Test.pm version 1.25 |
|
72
|
|
|
|
|
|
|
# Testing File::FindSimilars version 2.04 |
|
73
|
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
. . . . |
|
75
|
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
== Testing 2, files under test/ subdir: |
|
77
|
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
9 test/(eBook) GNU - Python Standard Library 2001.pdf |
|
79
|
|
|
|
|
|
|
3 test/Audio Book - The Grey Coloured Bunnie.mp3 |
|
80
|
|
|
|
|
|
|
5 test/ColoredGrayBunny.ogg |
|
81
|
|
|
|
|
|
|
5 test/GNU - 2001 - Python Standard Library.pdf |
|
82
|
|
|
|
|
|
|
4 test/GNU - Python Standard Library (2001).rar |
|
83
|
|
|
|
|
|
|
9 test/LayoutTest.java |
|
84
|
|
|
|
|
|
|
3 test/PopupTest.java |
|
85
|
|
|
|
|
|
|
2 test/Python Standard Library.zip |
|
86
|
|
|
|
|
|
|
ok 2 # (test.pl at line 83 TODO?!) |
|
87
|
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
Note: |
|
89
|
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
- The findsimilars script will pick out similar files from them in next test. |
|
91
|
|
|
|
|
|
|
- Let's assume that the number represent the file size in KB. |
|
92
|
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
== Testing 3 result should be: |
|
94
|
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
## ========= |
|
96
|
|
|
|
|
|
|
3 'Audio Book - The Grey Coloured Bunnie.mp3' 'test/' |
|
97
|
|
|
|
|
|
|
5 'ColoredGrayBunny.ogg' 'test/' |
|
98
|
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
## ========= |
|
100
|
|
|
|
|
|
|
4 'GNU - Python Standard Library (2001).rar' 'test/' |
|
101
|
|
|
|
|
|
|
5 'GNU - 2001 - Python Standard Library.pdf' 'test/' |
|
102
|
|
|
|
|
|
|
ok 3 |
|
103
|
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
Note: |
|
105
|
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
- There are 2 groups of similar files picked out by the script. |
|
107
|
|
|
|
|
|
|
- The similar files are picked because their file names look similar. |
|
108
|
|
|
|
|
|
|
Note that the first group looks different and spells differently too, |
|
109
|
|
|
|
|
|
|
which means that the script is versatile enough to handle file names that |
|
110
|
|
|
|
|
|
|
don't have space in it, and robust enough to deal with spelling mistakes. |
|
111
|
|
|
|
|
|
|
- Apart from the file name, the file size plays an important role as well. |
|
112
|
|
|
|
|
|
|
- There are 2 files in the second similar files group, the book files group. |
|
113
|
|
|
|
|
|
|
- The file 'Python Standard Library.zip' is not considered to be similar to |
|
114
|
|
|
|
|
|
|
the group because its size is not similar to the group. |
|
115
|
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
== Testing 4, if Python.zip is bigger, result should be: |
|
117
|
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
## ========= |
|
119
|
|
|
|
|
|
|
3 'Audio Book - The Grey Coloured Bunnie.mp3' 'test/' |
|
120
|
|
|
|
|
|
|
5 'ColoredGrayBunny.ogg' 'test/' |
|
121
|
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
## ========= |
|
123
|
|
|
|
|
|
|
4 'Python Standard Library.zip' 'test/' |
|
124
|
|
|
|
|
|
|
4 'GNU - Python Standard Library (2001).rar' 'test/' |
|
125
|
|
|
|
|
|
|
5 'GNU - 2001 - Python Standard Library.pdf' 'test/' |
|
126
|
|
|
|
|
|
|
ok 4 |
|
127
|
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
Note: |
|
129
|
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
- There are now 3 files in the book files group. |
|
131
|
|
|
|
|
|
|
- The file 'Python Standard Library.zip' is included in the |
|
132
|
|
|
|
|
|
|
group because its size is now similar to the group. |
|
133
|
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
== Testing 5, if Python.zip is even bigger, result should be: |
|
135
|
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
## ========= |
|
137
|
|
|
|
|
|
|
3 'Audio Book - The Grey Coloured Bunnie.mp3' 'test/' |
|
138
|
|
|
|
|
|
|
5 'ColoredGrayBunny.ogg' 'test/' |
|
139
|
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
## ========= |
|
141
|
|
|
|
|
|
|
4 'GNU - Python Standard Library (2001).rar' 'test/' |
|
142
|
|
|
|
|
|
|
5 'GNU - 2001 - Python Standard Library.pdf' 'test/' |
|
143
|
|
|
|
|
|
|
6 'Python Standard Library.zip' 'test/' |
|
144
|
|
|
|
|
|
|
9 '(eBook) GNU - Python Standard Library 2001.pdf' 'test/' |
|
145
|
|
|
|
|
|
|
ok 5 |
|
146
|
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
Note: |
|
148
|
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
- There are 4 files in the book files group now. |
|
150
|
|
|
|
|
|
|
- The file 'Python Standard Library.zip' is still in the group. |
|
151
|
|
|
|
|
|
|
- But this time, because it is also considered to be similar to the .pdf |
|
152
|
|
|
|
|
|
|
file (since their size are now similar, 6 vs 9), a 4th file the .pdf one |
|
153
|
|
|
|
|
|
|
is now included in the book group. |
|
154
|
|
|
|
|
|
|
- If the size of file 'Python Standard Library.zip' is 12(KB), then the |
|
155
|
|
|
|
|
|
|
book files group will be split into two. Do you know why and |
|
156
|
|
|
|
|
|
|
which files each group will contain? |
|
157
|
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
The File::FindSimilars package comes with a fully functional demo |
|
159
|
|
|
|
|
|
|
script findsimilars. Please refer to its help file for further |
|
160
|
|
|
|
|
|
|
explanations. |
|
161
|
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
This package is highly customizable. Refer to the class method C for |
|
163
|
|
|
|
|
|
|
details. |
|
164
|
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
=head1 DEPENDS |
|
166
|
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
This module depends on L, but not L. |
|
168
|
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
=cut |
|
170
|
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
# }}} |
|
172
|
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
# {{{ Global Declaration: |
|
174
|
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
# ============================================================== &us === |
|
176
|
|
|
|
|
|
|
# ............................................................. Uses ... |
|
177
|
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
# -- global modules |
|
179
|
1
|
|
|
1
|
|
108579
|
use strict; # ! |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
24
|
|
|
180
|
|
|
|
|
|
|
|
|
181
|
1
|
|
|
1
|
|
5
|
use Carp; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
57
|
|
|
182
|
1
|
|
|
1
|
|
4
|
use Getopt::Long; |
|
|
1
|
|
|
|
|
7
|
|
|
|
1
|
|
|
|
|
6
|
|
|
183
|
1
|
|
|
1
|
|
103
|
use File::Basename; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
66
|
|
|
184
|
1
|
|
|
1
|
|
4
|
use Text::Soundex; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
72
|
|
|
185
|
|
|
|
|
|
|
|
|
186
|
1
|
|
|
1
|
|
4
|
use base qw(Class::Accessor::Fast); |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
122
|
|
|
187
|
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
# -- local modules |
|
189
|
|
|
|
|
|
|
|
|
190
|
|
|
|
0
|
0
|
|
sub dbg_show {}; |
|
191
|
|
|
|
|
|
|
#use MyDbg; $MyDbg::debugging=010; |
|
192
|
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
# ============================================================== &gv === |
|
194
|
|
|
|
|
|
|
# .................................................. Global Varibles ... |
|
195
|
|
|
|
|
|
|
# |
|
196
|
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
our @EXPORT = ( ); # may even omit this line |
|
198
|
|
|
|
|
|
|
|
|
199
|
1
|
|
|
1
|
|
7
|
use vars qw($progname $VERSION $debugging); |
|
|
1
|
|
|
|
|
7
|
|
|
|
1
|
|
|
|
|
66
|
|
|
200
|
1
|
|
|
1
|
|
6
|
use vars qw(%config @filequeue @fileInfo %sdxCnt %wrdLst); |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
2161
|
|
|
201
|
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
# @fileInfo: List of the following list: |
|
203
|
|
|
|
|
|
|
my ( |
|
204
|
|
|
|
|
|
|
$N_dName, # dir name |
|
205
|
|
|
|
|
|
|
$N_fName, # file name |
|
206
|
|
|
|
|
|
|
$N_fSize, # file size |
|
207
|
|
|
|
|
|
|
$N_fSdxl, # file soundex list, reference |
|
208
|
|
|
|
|
|
|
) = (0..9); |
|
209
|
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
# ============================================================== &cs === |
|
211
|
|
|
|
|
|
|
# ................................................. Constant setting ... |
|
212
|
|
|
|
|
|
|
# |
|
213
|
|
|
|
|
|
|
$VERSION = sprintf("%d.%02d", q$Revision: 2.7 $ =~ /(\d+)\.(\d+)/); |
|
214
|
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
# }}} |
|
216
|
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
# ############################################################## &ss ### |
|
218
|
|
|
|
|
|
|
# ................................................ Subroutions start ... |
|
219
|
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
=head1 METHODS |
|
221
|
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
=head2 File::FindSimilars->new(\%config_param) |
|
223
|
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
Initialize the object. |
|
225
|
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
my $similars_finder = File::FindSimilars->new(); |
|
227
|
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
or, |
|
229
|
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
my $similars_finder = File::FindSimilars->new( {} ); |
|
231
|
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
which are the same as: |
|
233
|
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
my $similars_finder = File::FindSimilars->new( { |
|
235
|
|
|
|
|
|
|
soundex_weight => 50, # percentage of weight that soundex takes, |
|
236
|
|
|
|
|
|
|
# the rest is for file size |
|
237
|
|
|
|
|
|
|
fc_threshold => 75, # over which files are considered similar |
|
238
|
|
|
|
|
|
|
delimiter => "\n## =========\n", # delimiter between files output |
|
239
|
|
|
|
|
|
|
format => "%12d '%s' %s'%s'", # file info print format |
|
240
|
|
|
|
|
|
|
fc_level => 0, # file comparison level |
|
241
|
|
|
|
|
|
|
verbose => 0, |
|
242
|
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
} ); |
|
244
|
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
What shown above are default settings. Any of the C<%config_param> attribute can be omitted when calling the new method. |
|
246
|
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
The C is the only class method. All the rest methods are object methods. |
|
248
|
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
=head2 Object attribute: soundex_weight([set_val]) |
|
250
|
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
Percentage of weight that soundex takes, the rest of percentage is for file size. |
|
252
|
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
Provide the C to change the attribute, omitting it to retrieve the attribute value. |
|
254
|
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
=head2 Object attribute: fc_threshold([set_val]) |
|
256
|
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
The threshold over which files are considered similar. |
|
258
|
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
Provide the C to change the attribute, omitting it to retrieve the attribute value. |
|
260
|
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
=head2 Object attribute: delimiter([set_val]) |
|
262
|
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
Delimiter printed between file info outputs. |
|
264
|
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
Provide the C to change the attribute, omitting it to retrieve the attribute value. |
|
266
|
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
=head2 Object attribute: format([set_val]) |
|
268
|
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
Format used to print file info. |
|
270
|
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
Provide the C to change the attribute, omitting it to retrieve the attribute value. |
|
272
|
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
=head2 Object attribute: fc_level([set_val]) |
|
274
|
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
File comparison level. Whether to check similar files within the same folder: 0, no; 1, yes. |
|
276
|
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
Provide the C to change the attribute, omitting it to retrieve the attribute value. |
|
278
|
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
=head2 Object attribute: verbose([set_val]) |
|
280
|
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
Verbose level. Whether to output progress info: 0, no; 1, yes. |
|
282
|
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
Provide the C to change the attribute, omitting it to retrieve the attribute value. |
|
284
|
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
=cut |
|
286
|
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
File::FindSimilars |
|
288
|
|
|
|
|
|
|
->mk_accessors(qw(soundex_weight fc_threshold |
|
289
|
|
|
|
|
|
|
delimiter format fc_level verbose)); |
|
290
|
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
%config = |
|
292
|
|
|
|
|
|
|
( |
|
293
|
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
soundex_weight => 50, # percentage of weight that soundex takes, |
|
295
|
|
|
|
|
|
|
# the rest is for file size |
|
296
|
|
|
|
|
|
|
fc_threshold => 75, # over which files are considered similar |
|
297
|
|
|
|
|
|
|
delimiter => "\n## =========\n", # delimiter between files output |
|
298
|
|
|
|
|
|
|
format => "%12d '%s' %s'%s'", # file info print format |
|
299
|
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
fc_level => 0, # file comparison level |
|
301
|
|
|
|
|
|
|
verbose => 0, |
|
302
|
|
|
|
|
|
|
); |
|
303
|
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
|
306
|
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
sub new { |
|
308
|
0
|
0
|
|
0
|
1
|
|
ref(my $class = shift) |
|
309
|
|
|
|
|
|
|
and croak "new is a class method. class name needed."; |
|
310
|
0
|
|
|
|
|
|
my ($arg_ref) = @_; |
|
311
|
0
|
|
|
|
|
|
my $self = $class->SUPER::new({%config, %$arg_ref}); |
|
312
|
0
|
|
|
|
|
|
$config{soundex_weight} = $self->soundex_weight; |
|
313
|
0
|
|
|
|
|
|
$config{fc_threshold} = $self->fc_threshold; |
|
314
|
0
|
|
|
|
|
|
$config{delimiter} = $self->delimiter; |
|
315
|
0
|
|
|
|
|
|
$config{format} = $self->format; |
|
316
|
0
|
|
|
|
|
|
$config{fc_level} = $self->fc_level; |
|
317
|
0
|
|
|
|
|
|
$config{verbose} = $self->verbose; |
|
318
|
|
|
|
|
|
|
#$config{} = $self->; |
|
319
|
0
|
|
|
|
|
|
return $self; |
|
320
|
|
|
|
|
|
|
} |
|
321
|
|
|
|
|
|
|
|
|
322
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
|
323
|
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
=head2 Object method: find_for($array_ref) |
|
325
|
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
Set directory queue for similarity checking. Each entry in C<$array_ref> |
|
327
|
|
|
|
|
|
|
is a directory to check into. E.g., |
|
328
|
|
|
|
|
|
|
|
|
329
|
|
|
|
|
|
|
$similars_finder->find_for(\@ARGV); |
|
330
|
|
|
|
|
|
|
|
|
331
|
|
|
|
|
|
|
=cut |
|
332
|
|
|
|
|
|
|
|
|
333
|
|
|
|
|
|
|
sub find_for { |
|
334
|
0
|
|
|
0
|
1
|
|
my ($self, $init_dirs) = @_; |
|
335
|
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
# threshold $config{fc_threshold} |
|
337
|
|
|
|
|
|
|
print STDERR "Searching in directory(ies): @$init_dirs with level $config{fc_level}...\n\n" |
|
338
|
0
|
0
|
|
|
|
|
if $config{verbose}; |
|
339
|
|
|
|
|
|
|
|
|
340
|
0
|
|
|
|
|
|
@filequeue = @fileInfo = (); |
|
341
|
0
|
|
|
|
|
|
@filequeue = (@filequeue, map { [$_, ''] } @$init_dirs); |
|
|
0
|
|
|
|
|
|
|
|
342
|
0
|
|
|
|
|
|
process_entries(); |
|
343
|
|
|
|
|
|
|
|
|
344
|
0
|
|
|
|
|
|
dbg_show(100,"\@fileInfo", @fileInfo); |
|
345
|
0
|
|
|
|
|
|
dbg_show(100,"%sdxCnt", %sdxCnt); |
|
346
|
0
|
|
|
|
|
|
dbg_show(100,"%wrdLst", %wrdLst); |
|
347
|
|
|
|
|
|
|
} |
|
348
|
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
|
350
|
|
|
|
|
|
|
# I - Input: global array @filequeue |
|
351
|
|
|
|
|
|
|
# Input parameters: None |
|
352
|
|
|
|
|
|
|
# |
|
353
|
|
|
|
|
|
|
sub process_entries { |
|
354
|
0
|
|
|
0
|
0
|
|
my($dir, $qf) = (); |
|
355
|
|
|
|
|
|
|
#warn "] inside process_entries...\n"; |
|
356
|
|
|
|
|
|
|
|
|
357
|
0
|
|
|
|
|
|
while ($qf = shift @filequeue) { |
|
358
|
0
|
|
|
|
|
|
($dir, $_) = ($qf->[0], $qf->[1]); |
|
359
|
|
|
|
|
|
|
#warn "] inside process_entries loop, $dir, $_, ...\n"; |
|
360
|
0
|
0
|
|
|
|
|
next if /^..?$/; |
|
361
|
0
|
|
|
|
|
|
my $name = "$dir/$_"; |
|
362
|
|
|
|
|
|
|
#warn "] processing file '$name'.\n"; |
|
363
|
0
|
0
|
|
|
|
|
if ($name eq '-/') { |
|
|
|
0
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
# get info from stdin |
|
365
|
0
|
|
|
|
|
|
process_stdin(); |
|
366
|
|
|
|
|
|
|
} |
|
367
|
|
|
|
|
|
|
elsif (-d $name) { |
|
368
|
|
|
|
|
|
|
# a directory, process it recursively. |
|
369
|
0
|
|
|
|
|
|
process_dir($name); |
|
370
|
|
|
|
|
|
|
} |
|
371
|
|
|
|
|
|
|
else { |
|
372
|
0
|
|
|
|
|
|
process_file($dir, $_); |
|
373
|
|
|
|
|
|
|
} |
|
374
|
|
|
|
|
|
|
} |
|
375
|
|
|
|
|
|
|
} |
|
376
|
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
|
378
|
|
|
|
|
|
|
# D - Process info given from stdin, which should of form same as |
|
379
|
|
|
|
|
|
|
# find -printf "%p\t%s\n" |
|
380
|
|
|
|
|
|
|
# |
|
381
|
|
|
|
|
|
|
sub process_stdin { |
|
382
|
|
|
|
|
|
|
|
|
383
|
0
|
|
|
0
|
0
|
|
while (<>){ |
|
384
|
0
|
0
|
|
|
|
|
croak "Wrong input format: '$_'" unless m{(.*)/(.+?)\t(\d+)$}; |
|
385
|
0
|
|
|
|
|
|
my ($dn, $fn, $size) = ( $1, $2, $3 ); |
|
386
|
0
|
|
|
|
|
|
my $fSdxl = [ get_soundex($fn) ]; # file soundex list |
|
387
|
0
|
|
|
|
|
|
push @fileInfo, [ $dn, $fn, $size, $fSdxl, ]; |
|
388
|
|
|
|
|
|
|
|
|
389
|
0
|
|
|
|
|
|
dbg_show(100,"fileInfo",@fileInfo); |
|
390
|
0
|
|
|
|
|
|
map { $sdxCnt{$_}++ } @$fSdxl; |
|
|
0
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
} |
|
392
|
|
|
|
|
|
|
} |
|
393
|
|
|
|
|
|
|
|
|
394
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
|
395
|
|
|
|
|
|
|
# D - Process given dir recursively |
|
396
|
|
|
|
|
|
|
# N - BFS is more memory friendly than DFS |
|
397
|
|
|
|
|
|
|
# |
|
398
|
|
|
|
|
|
|
# T - $dir="/home/tong/tmp" |
|
399
|
|
|
|
|
|
|
sub process_dir { |
|
400
|
0
|
|
|
0
|
0
|
|
my($dir) = @_; |
|
401
|
|
|
|
|
|
|
#warn "] processing dir '$dir'...\n"; |
|
402
|
|
|
|
|
|
|
|
|
403
|
0
|
0
|
|
|
|
|
opendir(DIR,$dir) || die "File::FindSimilars error: Can't open $dir"; |
|
404
|
0
|
|
|
|
|
|
my @filenames = readdir(DIR); |
|
405
|
0
|
|
|
|
|
|
closedir(DIR); |
|
406
|
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
# record the dirname/fname pair to queue |
|
408
|
0
|
|
|
|
|
|
@filequeue = (@filequeue, map { [$dir, $_] } @filenames); |
|
|
0
|
|
|
|
|
|
|
|
409
|
0
|
|
|
|
|
|
dbg_show(100,"filequeue", @filequeue) |
|
410
|
|
|
|
|
|
|
} |
|
411
|
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
|
413
|
|
|
|
|
|
|
# S - process_file($dirname, $fname), process file $fname under $dirname |
|
414
|
|
|
|
|
|
|
# D - Process one file and update global vars |
|
415
|
|
|
|
|
|
|
# U - |
|
416
|
|
|
|
|
|
|
# |
|
417
|
|
|
|
|
|
|
# I - Input parameters: |
|
418
|
|
|
|
|
|
|
# $dirname: dir name string |
|
419
|
|
|
|
|
|
|
# $fname: file name string |
|
420
|
|
|
|
|
|
|
# O - Global vars get updated |
|
421
|
|
|
|
|
|
|
# fileInfo [ $dirname, $fname, $fsize, [ file_soundex ] ] |
|
422
|
|
|
|
|
|
|
# T - |
|
423
|
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
sub process_file { |
|
425
|
0
|
|
|
0
|
0
|
|
my ($dn, $fn) = @_; |
|
426
|
0
|
|
|
|
|
|
my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,@rest) = |
|
427
|
|
|
|
|
|
|
stat("$dn/$fn"); |
|
428
|
0
|
|
|
|
|
|
my $fSdxl = [ get_soundex($fn) ]; # file soundex list |
|
429
|
0
|
|
|
|
|
|
push @fileInfo, [ $dn, $fn, $size, $fSdxl, ]; |
|
430
|
|
|
|
|
|
|
|
|
431
|
0
|
|
|
|
|
|
dbg_show(100,"fileInfo",@fileInfo); |
|
432
|
0
|
|
|
|
|
|
map { $sdxCnt{$_}++ } @$fSdxl; |
|
|
0
|
|
|
|
|
|
|
|
433
|
|
|
|
|
|
|
} |
|
434
|
|
|
|
|
|
|
|
|
435
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
|
436
|
|
|
|
|
|
|
# S - get_soundex($fname), get soundex for file $fname |
|
437
|
|
|
|
|
|
|
# D - Return a list of soundex of each individual word in file name |
|
438
|
|
|
|
|
|
|
# U - $aref = [ get_soundex($fname) ]; |
|
439
|
|
|
|
|
|
|
# |
|
440
|
|
|
|
|
|
|
# I - Input parameters: |
|
441
|
|
|
|
|
|
|
# $fname: file name string |
|
442
|
|
|
|
|
|
|
# O - sorted anonymous soundex array w/ duplications removed |
|
443
|
|
|
|
|
|
|
# T - @out = get_soundex 'Java_RMI - _Remote_Method_Invocation_ch03.tgz'; |
|
444
|
|
|
|
|
|
|
# @out = get_soundex 'ASuchKindOfFile.tgz'; |
|
445
|
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
sub get_soundex { |
|
447
|
0
|
|
|
0
|
0
|
|
my ($fn) = @_; |
|
448
|
|
|
|
|
|
|
# split to individual words |
|
449
|
0
|
|
|
|
|
|
my @fn_wlist = split /[-_[:cntrl:][:blank:][:punct:][:digit:]]/i, $fn; |
|
450
|
|
|
|
|
|
|
# discards file extension, if any |
|
451
|
0
|
0
|
|
|
|
|
pop @fn_wlist if @fn_wlist >= 1; |
|
452
|
|
|
|
|
|
|
# if it is single word, try further decompose SuchKindOfWord |
|
453
|
0
|
0
|
0
|
|
|
|
@fn_wlist = $fn_wlist[0] =~ /[A-Z][^A-Z]*/g |
|
454
|
|
|
|
|
|
|
if @fn_wlist == 1 && $fn_wlist[0] =~ /^[A-Z]/; |
|
455
|
|
|
|
|
|
|
# wash short |
|
456
|
0
|
|
|
|
|
|
dbg_show(100,"wlist 0",@fn_wlist); |
|
457
|
0
|
|
|
|
|
|
@fn_wlist = arrwash_short(\@fn_wlist); |
|
458
|
0
|
|
|
|
|
|
dbg_show(100,"wlist 1",@fn_wlist); |
|
459
|
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
# language specific handling |
|
461
|
0
|
|
|
|
|
|
@fn_wlist = arrwash_lang(\@fn_wlist); |
|
462
|
0
|
|
|
|
|
|
dbg_show(100,"wlist 2",@fn_wlist); |
|
463
|
|
|
|
|
|
|
|
|
464
|
|
|
|
|
|
|
# change word to soundex, record soundex/word in global hash |
|
465
|
|
|
|
|
|
|
map { |
|
466
|
0
|
0
|
|
|
|
|
if (/[[:alpha:]]/) { |
|
|
0
|
|
|
|
|
|
|
|
467
|
0
|
|
|
|
|
|
my $sdx = soundex($_); |
|
468
|
0
|
|
|
|
|
|
$wrdLst{$sdx}{$_}++; |
|
469
|
0
|
|
|
|
|
|
s/^.*$/$sdx/; |
|
470
|
|
|
|
|
|
|
} |
|
471
|
|
|
|
|
|
|
} @fn_wlist; |
|
472
|
0
|
|
|
|
|
|
dbg_show(1,"wrdLst",%wrdLst); |
|
473
|
|
|
|
|
|
|
|
|
474
|
|
|
|
|
|
|
# wash empty/duplicates |
|
475
|
0
|
|
|
|
|
|
@fn_wlist = grep(!/^$/, @fn_wlist); |
|
476
|
0
|
|
|
|
|
|
@fn_wlist = arrwash_dup(\@fn_wlist); |
|
477
|
|
|
|
|
|
|
|
|
478
|
0
|
|
|
|
|
|
return sort @fn_wlist; |
|
479
|
|
|
|
|
|
|
} |
|
480
|
|
|
|
|
|
|
|
|
481
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
|
482
|
|
|
|
|
|
|
# S - arrwash_short($arr_ref), wash short from array $arr_ref |
|
483
|
|
|
|
|
|
|
# D - weed out empty lines and less-than-3-letter words (e.g. ch12) |
|
484
|
|
|
|
|
|
|
# U - @fn_wlist = arrwash_short(\@fn_wlist); |
|
485
|
|
|
|
|
|
|
# |
|
486
|
|
|
|
|
|
|
|
|
487
|
|
|
|
|
|
|
sub arrwash_short($) { |
|
488
|
0
|
|
|
0
|
0
|
|
my ($arr_ref) = @_; |
|
489
|
0
|
0
|
|
|
|
|
return @$arr_ref unless @$arr_ref >= 1; |
|
490
|
0
|
|
|
|
|
|
my @r= grep tr/a-zA-Z// >=3, @$arr_ref; |
|
491
|
0
|
0
|
|
|
|
|
return @r if @r; |
|
492
|
0
|
0
|
|
|
|
|
return @$arr_ref # for upper ASCII |
|
493
|
|
|
|
|
|
|
if grep(/[\200-\377]/, @$arr_ref); |
|
494
|
0
|
|
|
|
|
|
return @r; |
|
495
|
|
|
|
|
|
|
} |
|
496
|
|
|
|
|
|
|
|
|
497
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
|
498
|
|
|
|
|
|
|
# S - arrwash_dup($arr_ref), wash duplicates from array $arr_ref |
|
499
|
|
|
|
|
|
|
# D - weed out duplicates |
|
500
|
|
|
|
|
|
|
# U - @fn_wlist = arrwash_dup(\@fn_wlist); |
|
501
|
|
|
|
|
|
|
# |
|
502
|
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
sub arrwash_dup($) { |
|
504
|
0
|
|
|
0
|
0
|
|
my ($arr_ref) = @_; |
|
505
|
0
|
|
|
|
|
|
my %saw; |
|
506
|
0
|
|
|
|
|
|
return grep !$saw{$_}++, @$arr_ref; |
|
507
|
|
|
|
|
|
|
} |
|
508
|
|
|
|
|
|
|
|
|
509
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
|
510
|
|
|
|
|
|
|
# S - arrwash_lang($arr_ref), language specific washing from array $arr_ref |
|
511
|
|
|
|
|
|
|
# U - @fn_wlist = arrwash_lang(\@fn_wlist); |
|
512
|
|
|
|
|
|
|
# |
|
513
|
|
|
|
|
|
|
|
|
514
|
|
|
|
|
|
|
sub arrwash_lang($) { |
|
515
|
0
|
|
|
0
|
0
|
|
my ($arr_ref) = @_; |
|
516
|
|
|
|
|
|
|
|
|
517
|
|
|
|
|
|
|
# split Chinese into individual chars |
|
518
|
0
|
|
|
|
|
|
my @r; |
|
519
|
|
|
|
|
|
|
map { |
|
520
|
0
|
0
|
|
|
|
|
if (/[\200-\377]{2}/) { |
|
|
0
|
|
|
|
|
|
|
|
521
|
0
|
|
|
|
|
|
@r = (@r, /[\200-\377]{2}/g); |
|
522
|
|
|
|
|
|
|
} |
|
523
|
|
|
|
|
|
|
else { |
|
524
|
0
|
|
|
|
|
|
@r = (@r, $_); |
|
525
|
|
|
|
|
|
|
} |
|
526
|
|
|
|
|
|
|
} @$arr_ref; |
|
527
|
|
|
|
|
|
|
|
|
528
|
0
|
|
|
|
|
|
return @r; |
|
529
|
|
|
|
|
|
|
} |
|
530
|
|
|
|
|
|
|
|
|
531
|
|
|
|
|
|
|
=head2 Object method: similarity_check() |
|
532
|
|
|
|
|
|
|
|
|
533
|
|
|
|
|
|
|
Do similarity check on the queued directories. Print similar files info on |
|
534
|
|
|
|
|
|
|
stdout according to the configured format and delimiters. E.g., |
|
535
|
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
$similars_finder->similarity_check(); |
|
537
|
|
|
|
|
|
|
|
|
538
|
|
|
|
|
|
|
=cut |
|
539
|
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
|
541
|
|
|
|
|
|
|
# S - similarity_check: similarity check on glabal array @fileInfo |
|
542
|
|
|
|
|
|
|
# U - similarity_check(); |
|
543
|
|
|
|
|
|
|
# |
|
544
|
|
|
|
|
|
|
# I - Input parameters: None |
|
545
|
|
|
|
|
|
|
# O - similar files printed on stdout |
|
546
|
|
|
|
|
|
|
|
|
547
|
|
|
|
|
|
|
sub similarity_check { |
|
548
|
|
|
|
|
|
|
|
|
549
|
|
|
|
|
|
|
# get a ordered (by soundex count and file name) of file Info array |
|
550
|
|
|
|
|
|
|
# (Use short file names to compare to long file names) |
|
551
|
|
|
|
|
|
|
# use Schwartzian Transform to sort on 2 fields for efficiency |
|
552
|
0
|
|
|
|
|
|
my @fileInfos = map { $_->[0] } |
|
553
|
0
|
|
|
|
|
|
sort { $a->[1] cmp $b->[1] } |
|
554
|
0
|
|
|
0
|
1
|
|
map { [ $_, |
|
555
|
0
|
|
|
|
|
|
sprintf "%3d%6s", $#{$_->[$N_fSdxl]}, $_->[$N_fSdxl][0] |
|
|
0
|
|
|
|
|
|
|
|
556
|
|
|
|
|
|
|
] } @fileInfo; |
|
557
|
0
|
|
|
|
|
|
dbg_show(100,"\@fileInfos", @fileInfos); |
|
558
|
|
|
|
|
|
|
|
|
559
|
0
|
|
|
|
|
|
my @saw = (0) x ($#fileInfos+1); |
|
560
|
0
|
|
|
|
|
|
foreach my $ii (0..$#fileInfos) { |
|
561
|
|
|
|
|
|
|
#warn "] ii=$ii\n"; |
|
562
|
0
|
|
|
|
|
|
my @similar = (); |
|
563
|
0
|
|
|
|
|
|
my $fnl; |
|
564
|
|
|
|
|
|
|
|
|
565
|
0
|
|
|
|
|
|
dbg_show(100,"\@fileInfos", $fileInfos[$ii]); |
|
566
|
0
|
|
|
|
|
|
push @similar, [$ii, $ii, $fileInfos[$ii]->[$N_fSize] ]; |
|
567
|
0
|
|
|
|
|
|
foreach my $jj (($ii+1) ..$#fileInfos) { |
|
568
|
0
|
|
|
|
|
|
$fnl=0; # 0 is good enough since file at [ii] is |
|
569
|
|
|
|
|
|
|
# shorter in name than the one at [jj] |
|
570
|
|
|
|
|
|
|
# don't care about same dir files? |
|
571
|
|
|
|
|
|
|
next |
|
572
|
0
|
0
|
0
|
|
|
|
if (!$config{fc_level} && ($fileInfos[$ii]->[$N_dName] |
|
573
|
|
|
|
|
|
|
eq $fileInfos[$jj]->[$N_dName])) ; |
|
574
|
0
|
0
|
|
|
|
|
if (file_diff(\@fileInfos, $ii, $jj) >= $config{fc_threshold}) { |
|
575
|
0
|
|
|
|
|
|
push @similar, [$ii, $jj, $fileInfos[$jj]->[$N_fSize] ]; |
|
576
|
0
|
0
|
|
|
|
|
$fnl= length($fileInfos[$jj]->[$N_fName]) if |
|
577
|
|
|
|
|
|
|
$fnl < length($fileInfos[$jj]->[$N_fName]); |
|
578
|
|
|
|
|
|
|
} |
|
579
|
|
|
|
|
|
|
} |
|
580
|
0
|
|
|
|
|
|
dbg_show(100,"\@similar", @similar); |
|
581
|
|
|
|
|
|
|
# output unvisited potential similars by each row, order by fSize |
|
582
|
0
|
|
|
|
|
|
@similar = grep {!$saw[$_->[1]]} |
|
583
|
0
|
|
|
|
|
|
sort { $a->[2] <=> $b->[2] } @similar; |
|
|
0
|
|
|
|
|
|
|
|
584
|
0
|
0
|
|
|
|
|
next unless @similar>1; |
|
585
|
0
|
|
|
|
|
|
print $config{delimiter}; |
|
586
|
0
|
|
|
|
|
|
foreach my $similar (@similar) { |
|
587
|
0
|
|
|
|
|
|
print file_info(\@fileInfos, $similar->[1], $fnl). "\n"; |
|
588
|
0
|
|
|
|
|
|
$saw[$similar->[1]]++; |
|
589
|
|
|
|
|
|
|
} |
|
590
|
|
|
|
|
|
|
} |
|
591
|
|
|
|
|
|
|
} |
|
592
|
|
|
|
|
|
|
|
|
593
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
|
594
|
|
|
|
|
|
|
sub file_info ($$$) { |
|
595
|
0
|
|
|
0
|
0
|
|
my ($fileInfos, $ndx, $fnl) = @_; |
|
596
|
0
|
|
|
|
|
|
return sprintf($config{format}, $fileInfos->[$ndx]->[$N_fSize], |
|
597
|
|
|
|
|
|
|
$fileInfos->[$ndx]->[$N_fName], |
|
598
|
|
|
|
|
|
|
' ' x ($fnl - length($fileInfos->[$ndx]->[$N_fName])), |
|
599
|
|
|
|
|
|
|
"$fileInfos->[$ndx]->[$N_dName]"); |
|
600
|
|
|
|
|
|
|
} |
|
601
|
|
|
|
|
|
|
|
|
602
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
|
603
|
|
|
|
|
|
|
# S - file_diff: determind how difference two files are by name & size |
|
604
|
|
|
|
|
|
|
# U - file_diff($fileInfos, $ndx1, $ndx2); |
|
605
|
|
|
|
|
|
|
# |
|
606
|
|
|
|
|
|
|
# I - $fileInfos: reference to @fileInfos |
|
607
|
|
|
|
|
|
|
# $ndx1, $ndx2: index to the two file in @fileInfos |
|
608
|
|
|
|
|
|
|
# O - 100%: files are identical |
|
609
|
|
|
|
|
|
|
# 0%: no similarity at all |
|
610
|
|
|
|
|
|
|
sub file_diff ($$$) { |
|
611
|
0
|
|
|
0
|
0
|
|
my ($fileInfos, $ndx1, $ndx2) = @_; |
|
612
|
|
|
|
|
|
|
|
|
613
|
0
|
0
|
|
|
|
|
return 0 unless @{$fileInfos->[$ndx1]->[$N_fSdxl]}; |
|
|
0
|
|
|
|
|
|
|
|
614
|
|
|
|
|
|
|
|
|
615
|
|
|
|
|
|
|
# find intersection in two soudex array |
|
616
|
0
|
|
|
|
|
|
my %count = (); |
|
617
|
0
|
|
|
|
|
|
foreach my $element |
|
618
|
0
|
|
|
|
|
|
(@{$fileInfos->[$ndx1]->[$N_fSdxl]}, |
|
619
|
0
|
|
|
|
|
|
@{$fileInfos->[$ndx2]->[$N_fSdxl]}) { $count{$element}++ } |
|
|
0
|
|
|
|
|
|
|
|
620
|
|
|
|
|
|
|
# since there is no duplication in each of file soudex |
|
621
|
|
|
|
|
|
|
my $intersection = |
|
622
|
0
|
|
|
|
|
|
grep $count{$_} > 1, keys %count; |
|
623
|
|
|
|
|
|
|
# return p * normal(\common soudex) + (1-p) * ( 1 - normal(\delta fSize)) |
|
624
|
|
|
|
|
|
|
# so the bigger the return value is, the similar the two files are |
|
625
|
|
|
|
|
|
|
$intersection *= $config{soundex_weight} / |
|
626
|
0
|
|
|
|
|
|
(@{$fileInfos->[$ndx1]->[$N_fSdxl]}); |
|
|
0
|
|
|
|
|
|
|
|
627
|
0
|
|
|
|
|
|
dbg_show(100,"intersection", $intersection, $ndx1, $ndx2); |
|
628
|
0
|
|
|
|
|
|
my $WeightfSzie = 100 - $config{soundex_weight}; |
|
629
|
0
|
|
|
|
|
|
my $dfSize = abs($fileInfos->[$ndx1]->[$N_fSize] - |
|
630
|
|
|
|
|
|
|
$fileInfos->[$ndx2]->[$N_fSize]) * $WeightfSzie / |
|
631
|
|
|
|
|
|
|
($fileInfos->[$ndx1]->[$N_fSize] + 1); |
|
632
|
0
|
0
|
|
|
|
|
$dfSize = $dfSize > $WeightfSzie ? $WeightfSzie : $dfSize; |
|
633
|
0
|
|
|
|
|
|
my $file_diff = $intersection + ($WeightfSzie - $dfSize); |
|
634
|
0
|
0
|
|
|
|
|
if ($file_diff >= $config{fc_threshold}) { |
|
635
|
|
|
|
|
|
|
dbg_show(010,"file_diff", |
|
636
|
0
|
|
|
|
|
|
@{$fileInfos->[$ndx1]}, |
|
637
|
0
|
|
|
|
|
|
@{$fileInfos->[$ndx2]}, |
|
|
0
|
|
|
|
|
|
|
|
638
|
|
|
|
|
|
|
$intersection, $dfSize, $file_diff |
|
639
|
|
|
|
|
|
|
); |
|
640
|
|
|
|
|
|
|
} |
|
641
|
0
|
|
|
|
|
|
return $file_diff; |
|
642
|
|
|
|
|
|
|
} |
|
643
|
|
|
|
|
|
|
|
|
644
|
|
|
|
|
|
|
|
|
645
|
|
|
|
|
|
|
1; |
|
646
|
|
|
|
|
|
|
__END__ |