line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package File::FindSimilars; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
# @Author: Tong SUN, (c)2001-2008, all right reserved |
4
|
|
|
|
|
|
|
# @Version: $Date: 2009/01/02 21:10:20 $ $Revision: 2.6 $ |
5
|
|
|
|
|
|
|
# @HomeURL: http://xpt.sourceforge.net/ |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
# {{{ LICENSE: |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
# |
10
|
|
|
|
|
|
|
# Permission to use, copy, modify, and distribute this software and its |
11
|
|
|
|
|
|
|
# documentation for any purpose and without fee is hereby granted, provided |
12
|
|
|
|
|
|
|
# that the above copyright notices appear in all copies and that both those |
13
|
|
|
|
|
|
|
# copyright notices and this permission notice appear in supporting |
14
|
|
|
|
|
|
|
# documentation, and that the names of author not be used in advertising or |
15
|
|
|
|
|
|
|
# publicity pertaining to distribution of the software without specific, |
16
|
|
|
|
|
|
|
# written prior permission. Tong Sun makes no representations about the |
17
|
|
|
|
|
|
|
# suitability of this software for any purpose. It is provided "as is" |
18
|
|
|
|
|
|
|
# without express or implied warranty. |
19
|
|
|
|
|
|
|
# |
20
|
|
|
|
|
|
|
# TONG SUN DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL |
21
|
|
|
|
|
|
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL ADOBE |
22
|
|
|
|
|
|
|
# SYSTEMS INCORPORATED AND DIGITAL EQUIPMENT CORPORATION BE LIABLE FOR ANY |
23
|
|
|
|
|
|
|
# SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER |
24
|
|
|
|
|
|
|
# RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF |
25
|
|
|
|
|
|
|
# CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN |
26
|
|
|
|
|
|
|
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
27
|
|
|
|
|
|
|
# |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
# }}} |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
# {{{ POD, Intro: |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
=head1 NAME |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
File::FindSimilars - Fast similar-files finder |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
=head1 SYNOPSIS |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
use File::FindSimilars; |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
my $similars_finder = |
42
|
|
|
|
|
|
|
File::FindSimilars->new( { fc_level => $fc_level, } ); |
43
|
|
|
|
|
|
|
$similars_finder->find_for(\@ARGV); |
44
|
|
|
|
|
|
|
$similars_finder->similarity_check(); |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
=head1 DESCRIPTION |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
Extremely fast file similarity checker. Similar-sized and similar-named |
49
|
|
|
|
|
|
|
files are picked out as suspicious candidates of duplicated files. |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
It uses advanced soundex vector algorithm to determine the similarity |
52
|
|
|
|
|
|
|
between files. Generally it means that if there are n files, each having |
53
|
|
|
|
|
|
|
approximately m words in the file name, the degree of calculation is merely |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
O(n^2 * m) |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
which is over thousands times faster than any existing file fingerprinting |
58
|
|
|
|
|
|
|
technology. |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
=head2 ALGORITHM EXPLANATION |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
The self-test output will help you understand what the module do and what |
63
|
|
|
|
|
|
|
would you expect from the outcome. |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
$ make test |
66
|
|
|
|
|
|
|
PERL_DL_NONLAZY=1 /usr/bin/perl "-Iblib/lib" "-Iblib/arch" test.pl |
67
|
|
|
|
|
|
|
1..5 todo 2; |
68
|
|
|
|
|
|
|
# Running under perl version 5.010000 for linux |
69
|
|
|
|
|
|
|
# Current time local: Wed Nov 5 17:45:19 2008 |
70
|
|
|
|
|
|
|
# Current time GMT: Wed Nov 5 22:45:19 2008 |
71
|
|
|
|
|
|
|
# Using Test.pm version 1.25 |
72
|
|
|
|
|
|
|
# Testing File::FindSimilars version 2.04 |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
. . . . |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
== Testing 2, files under test/ subdir: |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
9 test/(eBook) GNU - Python Standard Library 2001.pdf |
79
|
|
|
|
|
|
|
3 test/Audio Book - The Grey Coloured Bunnie.mp3 |
80
|
|
|
|
|
|
|
5 test/ColoredGrayBunny.ogg |
81
|
|
|
|
|
|
|
5 test/GNU - 2001 - Python Standard Library.pdf |
82
|
|
|
|
|
|
|
4 test/GNU - Python Standard Library (2001).rar |
83
|
|
|
|
|
|
|
9 test/LayoutTest.java |
84
|
|
|
|
|
|
|
3 test/PopupTest.java |
85
|
|
|
|
|
|
|
2 test/Python Standard Library.zip |
86
|
|
|
|
|
|
|
ok 2 # (test.pl at line 83 TODO?!) |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
Note: |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
- The findsimilars script will pick out similar files from them in next test. |
91
|
|
|
|
|
|
|
- Let's assume that the number represent the file size in KB. |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
== Testing 3 result should be: |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
## ========= |
96
|
|
|
|
|
|
|
3 'Audio Book - The Grey Coloured Bunnie.mp3' 'test/' |
97
|
|
|
|
|
|
|
5 'ColoredGrayBunny.ogg' 'test/' |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
## ========= |
100
|
|
|
|
|
|
|
4 'GNU - Python Standard Library (2001).rar' 'test/' |
101
|
|
|
|
|
|
|
5 'GNU - 2001 - Python Standard Library.pdf' 'test/' |
102
|
|
|
|
|
|
|
ok 3 |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
Note: |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
- There are 2 groups of similar files picked out by the script. |
107
|
|
|
|
|
|
|
- The similar files are picked because their file names look similar. |
108
|
|
|
|
|
|
|
Note that the first group looks different and spells differently too, |
109
|
|
|
|
|
|
|
which means that the script is versatile enough to handle file names that |
110
|
|
|
|
|
|
|
don't have space in it, and robust enough to deal with spelling mistakes. |
111
|
|
|
|
|
|
|
- Apart from the file name, the file size plays an important role as well. |
112
|
|
|
|
|
|
|
- There are 2 files in the second similar files group, the book files group. |
113
|
|
|
|
|
|
|
- The file 'Python Standard Library.zip' is not considered to be similar to |
114
|
|
|
|
|
|
|
the group because its size is not similar to the group. |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
== Testing 4, if Python.zip is bigger, result should be: |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
## ========= |
119
|
|
|
|
|
|
|
3 'Audio Book - The Grey Coloured Bunnie.mp3' 'test/' |
120
|
|
|
|
|
|
|
5 'ColoredGrayBunny.ogg' 'test/' |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
## ========= |
123
|
|
|
|
|
|
|
4 'Python Standard Library.zip' 'test/' |
124
|
|
|
|
|
|
|
4 'GNU - Python Standard Library (2001).rar' 'test/' |
125
|
|
|
|
|
|
|
5 'GNU - 2001 - Python Standard Library.pdf' 'test/' |
126
|
|
|
|
|
|
|
ok 4 |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
Note: |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
- There are now 3 files in the book files group. |
131
|
|
|
|
|
|
|
- The file 'Python Standard Library.zip' is included in the |
132
|
|
|
|
|
|
|
group because its size is now similar to the group. |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
== Testing 5, if Python.zip is even bigger, result should be: |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
## ========= |
137
|
|
|
|
|
|
|
3 'Audio Book - The Grey Coloured Bunnie.mp3' 'test/' |
138
|
|
|
|
|
|
|
5 'ColoredGrayBunny.ogg' 'test/' |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
## ========= |
141
|
|
|
|
|
|
|
4 'GNU - Python Standard Library (2001).rar' 'test/' |
142
|
|
|
|
|
|
|
5 'GNU - 2001 - Python Standard Library.pdf' 'test/' |
143
|
|
|
|
|
|
|
6 'Python Standard Library.zip' 'test/' |
144
|
|
|
|
|
|
|
9 '(eBook) GNU - Python Standard Library 2001.pdf' 'test/' |
145
|
|
|
|
|
|
|
ok 5 |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
Note: |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
- There are 4 files in the book files group now. |
150
|
|
|
|
|
|
|
- The file 'Python Standard Library.zip' is still in the group. |
151
|
|
|
|
|
|
|
- But this time, because it is also considered to be similar to the .pdf |
152
|
|
|
|
|
|
|
file (since their size are now similar, 6 vs 9), a 4th file the .pdf one |
153
|
|
|
|
|
|
|
is now included in the book group. |
154
|
|
|
|
|
|
|
- If the size of file 'Python Standard Library.zip' is 12(KB), then the |
155
|
|
|
|
|
|
|
book files group will be split into two. Do you know why and |
156
|
|
|
|
|
|
|
which files each group will contain? |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
The File::FindSimilars package comes with a fully functional demo |
159
|
|
|
|
|
|
|
script findsimilars. Please refer to its help file for further |
160
|
|
|
|
|
|
|
explanations. |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
This package is highly customizable. Refer to the class method C for |
163
|
|
|
|
|
|
|
details. |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
=head1 DEPENDS |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
This module depends on L, but not L. |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
=cut |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
# }}} |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
# {{{ Global Declaration: |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
# ============================================================== &us === |
176
|
|
|
|
|
|
|
# ............................................................. Uses ... |
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
# -- global modules |
179
|
1
|
|
|
1
|
|
102328
|
use strict; # ! |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
30
|
|
180
|
|
|
|
|
|
|
|
181
|
1
|
|
|
1
|
|
5
|
use Carp; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
63
|
|
182
|
1
|
|
|
1
|
|
5
|
use Getopt::Long; |
|
1
|
|
|
|
|
5
|
|
|
1
|
|
|
|
|
8
|
|
183
|
1
|
|
|
1
|
|
136
|
use File::Basename; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
69
|
|
184
|
1
|
|
|
1
|
|
328
|
use Text::Soundex; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
use base qw(Class::Accessor::Fast); |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
# -- local modules |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
sub dbg_show {}; |
191
|
|
|
|
|
|
|
#use MyDbg; $MyDbg::debugging=010; |
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
# ============================================================== &gv === |
194
|
|
|
|
|
|
|
# .................................................. Global Varibles ... |
195
|
|
|
|
|
|
|
# |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
our @EXPORT = ( ); # may even omit this line |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
use vars qw($progname $VERSION $debugging); |
200
|
|
|
|
|
|
|
use vars qw(%config @filequeue @fileInfo %sdxCnt %wrdLst); |
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
# @fileInfo: List of the following list: |
203
|
|
|
|
|
|
|
my ( |
204
|
|
|
|
|
|
|
$N_dName, # dir name |
205
|
|
|
|
|
|
|
$N_fName, # file name |
206
|
|
|
|
|
|
|
$N_fSize, # file size |
207
|
|
|
|
|
|
|
$N_fSdxl, # file soundex list, reference |
208
|
|
|
|
|
|
|
) = (0..9); |
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
# ============================================================== &cs === |
211
|
|
|
|
|
|
|
# ................................................. Constant setting ... |
212
|
|
|
|
|
|
|
# |
213
|
|
|
|
|
|
|
$VERSION = sprintf("%d.%02d", q$Revision: 2.6 $ =~ /(\d+)\.(\d+)/); |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
# }}} |
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
# ############################################################## &ss ### |
218
|
|
|
|
|
|
|
# ................................................ Subroutions start ... |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
=head1 METHODS |
221
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
=head2 File::FindSimilars->new(\%config_param) |
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
Initialize the object. |
225
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
my $similars_finder = File::FindSimilars->new(); |
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
or, |
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
my $similars_finder = File::FindSimilars->new( {} ); |
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
which are the same as: |
233
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
my $similars_finder = File::FindSimilars->new( { |
235
|
|
|
|
|
|
|
soundex_weight => 50, # percentage of weight that soundex takes, |
236
|
|
|
|
|
|
|
# the rest is for file size |
237
|
|
|
|
|
|
|
fc_threshold => 75, # over which files are considered similar |
238
|
|
|
|
|
|
|
delimiter => "\n## =========\n", # delimiter between files output |
239
|
|
|
|
|
|
|
format => "%12d '%s' %s'%s'", # file info print format |
240
|
|
|
|
|
|
|
fc_level => 0, # file comparison level |
241
|
|
|
|
|
|
|
verbose => 0, |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
} ); |
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
What shown above are default settings. Any of the C<%config_param> attribute can be omitted when calling the new method. |
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
The C is the only class method. All the rest methods are object methods. |
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
=head2 Object attribute: soundex_weight([set_val]) |
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
Percentage of weight that soundex takes, the rest of percentage is for file size. |
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
Provide the C to change the attribute, omitting it to retrieve the attribute value. |
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
=head2 Object attribute: fc_threshold([set_val]) |
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
The threshold over which files are considered similar. |
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
Provide the C to change the attribute, omitting it to retrieve the attribute value. |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
=head2 Object attribute: delimiter([set_val]) |
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
Delimiter printed between file info outputs. |
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
Provide the C to change the attribute, omitting it to retrieve the attribute value. |
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
=head2 Object attribute: format([set_val]) |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
Format used to print file info. |
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
Provide the C to change the attribute, omitting it to retrieve the attribute value. |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
=head2 Object attribute: fc_level([set_val]) |
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
File comparison level. Whether to check similar files within the same folder: 0, no; 1, yes. |
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
Provide the C to change the attribute, omitting it to retrieve the attribute value. |
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
=head2 Object attribute: verbose([set_val]) |
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
Verbose level. Whether to output progress info: 0, no; 1, yes. |
282
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
Provide the C to change the attribute, omitting it to retrieve the attribute value. |
284
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
=cut |
286
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
File::FindSimilars |
288
|
|
|
|
|
|
|
->mk_accessors(qw(soundex_weight fc_threshold |
289
|
|
|
|
|
|
|
delimiter format fc_level verbose)); |
290
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
%config = |
292
|
|
|
|
|
|
|
( |
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
soundex_weight => 50, # percentage of weight that soundex takes, |
295
|
|
|
|
|
|
|
# the rest is for file size |
296
|
|
|
|
|
|
|
fc_threshold => 75, # over which files are considered similar |
297
|
|
|
|
|
|
|
delimiter => "\n## =========\n", # delimiter between files output |
298
|
|
|
|
|
|
|
format => "%12d '%s' %s'%s'", # file info print format |
299
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
fc_level => 0, # file comparison level |
301
|
|
|
|
|
|
|
verbose => 0, |
302
|
|
|
|
|
|
|
); |
303
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
306
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
sub new { |
308
|
|
|
|
|
|
|
ref(my $class = shift) |
309
|
|
|
|
|
|
|
and croak "new is a class method. class name needed."; |
310
|
|
|
|
|
|
|
my ($arg_ref) = @_; |
311
|
|
|
|
|
|
|
my $self = $class->SUPER::new({%config, %$arg_ref}); |
312
|
|
|
|
|
|
|
$config{soundex_weight} = $self->soundex_weight; |
313
|
|
|
|
|
|
|
$config{fc_threshold} = $self->fc_threshold; |
314
|
|
|
|
|
|
|
$config{delimiter} = $self->delimiter; |
315
|
|
|
|
|
|
|
$config{format} = $self->format; |
316
|
|
|
|
|
|
|
$config{fc_level} = $self->fc_level; |
317
|
|
|
|
|
|
|
$config{verbose} = $self->verbose; |
318
|
|
|
|
|
|
|
#$config{} = $self->; |
319
|
|
|
|
|
|
|
return $self; |
320
|
|
|
|
|
|
|
} |
321
|
|
|
|
|
|
|
|
322
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
323
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
=head2 Object method: find_for($array_ref) |
325
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
Set directory queue for similarity checking. Each entry in C<$array_ref> |
327
|
|
|
|
|
|
|
is a directory to check into. E.g., |
328
|
|
|
|
|
|
|
|
329
|
|
|
|
|
|
|
$similars_finder->find_for(\@ARGV); |
330
|
|
|
|
|
|
|
|
331
|
|
|
|
|
|
|
=cut |
332
|
|
|
|
|
|
|
|
333
|
|
|
|
|
|
|
sub find_for { |
334
|
|
|
|
|
|
|
my ($self, $init_dirs) = @_; |
335
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
# threshold $config{fc_threshold} |
337
|
|
|
|
|
|
|
print STDERR "Searching in directory(ies): @$init_dirs with level $config{fc_level}...\n\n" |
338
|
|
|
|
|
|
|
if $config{verbose}; |
339
|
|
|
|
|
|
|
|
340
|
|
|
|
|
|
|
@filequeue = @fileInfo = (); |
341
|
|
|
|
|
|
|
@filequeue = (@filequeue, map { [$_, ''] } @$init_dirs); |
342
|
|
|
|
|
|
|
process_entries(); |
343
|
|
|
|
|
|
|
|
344
|
|
|
|
|
|
|
dbg_show(100,"\@fileInfo", @fileInfo); |
345
|
|
|
|
|
|
|
dbg_show(100,"%sdxCnt", %sdxCnt); |
346
|
|
|
|
|
|
|
dbg_show(100,"%wrdLst", %wrdLst); |
347
|
|
|
|
|
|
|
} |
348
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
350
|
|
|
|
|
|
|
# I - Input: global array @filequeue |
351
|
|
|
|
|
|
|
# Input parameters: None |
352
|
|
|
|
|
|
|
# |
353
|
|
|
|
|
|
|
sub process_entries { |
354
|
|
|
|
|
|
|
my($dir, $qf) = (); |
355
|
|
|
|
|
|
|
#warn "] inside process_entries...\n"; |
356
|
|
|
|
|
|
|
|
357
|
|
|
|
|
|
|
while ($qf = shift @filequeue) { |
358
|
|
|
|
|
|
|
($dir, $_) = ($qf->[0], $qf->[1]); |
359
|
|
|
|
|
|
|
#warn "] inside process_entries loop, $dir, $_, ...\n"; |
360
|
|
|
|
|
|
|
next if /^..?$/; |
361
|
|
|
|
|
|
|
my $name = "$dir/$_"; |
362
|
|
|
|
|
|
|
#warn "] processing file '$name'.\n"; |
363
|
|
|
|
|
|
|
if ($name eq '-/') { |
364
|
|
|
|
|
|
|
# get info from stdin |
365
|
|
|
|
|
|
|
process_stdin(); |
366
|
|
|
|
|
|
|
} |
367
|
|
|
|
|
|
|
elsif (-d $name) { |
368
|
|
|
|
|
|
|
# a directory, process it recursively. |
369
|
|
|
|
|
|
|
process_dir($name); |
370
|
|
|
|
|
|
|
} |
371
|
|
|
|
|
|
|
else { |
372
|
|
|
|
|
|
|
process_file($dir, $_); |
373
|
|
|
|
|
|
|
} |
374
|
|
|
|
|
|
|
} |
375
|
|
|
|
|
|
|
} |
376
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
378
|
|
|
|
|
|
|
# D - Process info given from stdin, which should of form same as |
379
|
|
|
|
|
|
|
# find -printf "%p\t%s\n" |
380
|
|
|
|
|
|
|
# |
381
|
|
|
|
|
|
|
sub process_stdin { |
382
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
while (<>){ |
384
|
|
|
|
|
|
|
croak "Wrong input format: '$_'" unless m{(.*)/(.+?)\t(\d+)$}; |
385
|
|
|
|
|
|
|
my ($dn, $fn, $size) = ( $1, $2, $3 ); |
386
|
|
|
|
|
|
|
my $fSdxl = [ get_soundex($fn) ]; # file soundex list |
387
|
|
|
|
|
|
|
push @fileInfo, [ $dn, $fn, $size, $fSdxl, ]; |
388
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
dbg_show(100,"fileInfo",@fileInfo); |
390
|
|
|
|
|
|
|
map { $sdxCnt{$_}++ } @$fSdxl; |
391
|
|
|
|
|
|
|
} |
392
|
|
|
|
|
|
|
} |
393
|
|
|
|
|
|
|
|
394
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
395
|
|
|
|
|
|
|
# D - Process given dir recursively |
396
|
|
|
|
|
|
|
# N - BFS is more memory friendly than DFS |
397
|
|
|
|
|
|
|
# |
398
|
|
|
|
|
|
|
# T - $dir="/home/tong/tmp" |
399
|
|
|
|
|
|
|
sub process_dir { |
400
|
|
|
|
|
|
|
my($dir) = @_; |
401
|
|
|
|
|
|
|
#warn "] processing dir '$dir'...\n"; |
402
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
opendir(DIR,$dir) || die "File::FindSimilars error: Can't open $dir"; |
404
|
|
|
|
|
|
|
my @filenames = readdir(DIR); |
405
|
|
|
|
|
|
|
closedir(DIR); |
406
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
# record the dirname/fname pair to queue |
408
|
|
|
|
|
|
|
@filequeue = (@filequeue, map { [$dir, $_] } @filenames); |
409
|
|
|
|
|
|
|
dbg_show(100,"filequeue", @filequeue) |
410
|
|
|
|
|
|
|
} |
411
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
413
|
|
|
|
|
|
|
# S - process_file($dirname, $fname), process file $fname under $dirname |
414
|
|
|
|
|
|
|
# D - Process one file and update global vars |
415
|
|
|
|
|
|
|
# U - |
416
|
|
|
|
|
|
|
# |
417
|
|
|
|
|
|
|
# I - Input parameters: |
418
|
|
|
|
|
|
|
# $dirname: dir name string |
419
|
|
|
|
|
|
|
# $fname: file name string |
420
|
|
|
|
|
|
|
# O - Global vars get updated |
421
|
|
|
|
|
|
|
# fileInfo [ $dirname, $fname, $fsize, [ file_soundex ] ] |
422
|
|
|
|
|
|
|
# T - |
423
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
sub process_file { |
425
|
|
|
|
|
|
|
my ($dn, $fn) = @_; |
426
|
|
|
|
|
|
|
my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,@rest) = |
427
|
|
|
|
|
|
|
stat("$dn/$fn"); |
428
|
|
|
|
|
|
|
my $fSdxl = [ get_soundex($fn) ]; # file soundex list |
429
|
|
|
|
|
|
|
push @fileInfo, [ $dn, $fn, $size, $fSdxl, ]; |
430
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
dbg_show(100,"fileInfo",@fileInfo); |
432
|
|
|
|
|
|
|
map { $sdxCnt{$_}++ } @$fSdxl; |
433
|
|
|
|
|
|
|
} |
434
|
|
|
|
|
|
|
|
435
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
436
|
|
|
|
|
|
|
# S - get_soundex($fname), get soundex for file $fname |
437
|
|
|
|
|
|
|
# D - Return a list of soundex of each individual word in file name |
438
|
|
|
|
|
|
|
# U - $aref = [ get_soundex($fname) ]; |
439
|
|
|
|
|
|
|
# |
440
|
|
|
|
|
|
|
# I - Input parameters: |
441
|
|
|
|
|
|
|
# $fname: file name string |
442
|
|
|
|
|
|
|
# O - sorted anonymous soundex array w/ duplications removed |
443
|
|
|
|
|
|
|
# T - @out = get_soundex 'Java_RMI - _Remote_Method_Invocation_ch03.tgz'; |
444
|
|
|
|
|
|
|
# @out = get_soundex 'ASuchKindOfFile.tgz'; |
445
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
sub get_soundex { |
447
|
|
|
|
|
|
|
my ($fn) = @_; |
448
|
|
|
|
|
|
|
# split to individual words |
449
|
|
|
|
|
|
|
my @fn_wlist = split /[-_[:cntrl:][:blank:][:punct:][:digit:]]/i, $fn; |
450
|
|
|
|
|
|
|
# discards file extension, if any |
451
|
|
|
|
|
|
|
pop @fn_wlist if @fn_wlist >= 1; |
452
|
|
|
|
|
|
|
# if it is single word, try further decompose SuchKindOfWord |
453
|
|
|
|
|
|
|
@fn_wlist = $fn_wlist[0] =~ /[A-Z][^A-Z]*/g |
454
|
|
|
|
|
|
|
if @fn_wlist == 1 && $fn_wlist[0] =~ /^[A-Z]/; |
455
|
|
|
|
|
|
|
# wash short |
456
|
|
|
|
|
|
|
dbg_show(100,"wlist 0",@fn_wlist); |
457
|
|
|
|
|
|
|
@fn_wlist = arrwash_short(\@fn_wlist); |
458
|
|
|
|
|
|
|
dbg_show(100,"wlist 1",@fn_wlist); |
459
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
# language specific handling |
461
|
|
|
|
|
|
|
@fn_wlist = arrwash_lang(\@fn_wlist); |
462
|
|
|
|
|
|
|
dbg_show(100,"wlist 2",@fn_wlist); |
463
|
|
|
|
|
|
|
|
464
|
|
|
|
|
|
|
# change word to soundex, record soundex/word in global hash |
465
|
|
|
|
|
|
|
map { |
466
|
|
|
|
|
|
|
if (/[[:alpha:]]/) { |
467
|
|
|
|
|
|
|
my $sdx = soundex($_); |
468
|
|
|
|
|
|
|
$wrdLst{$sdx}{$_}++; |
469
|
|
|
|
|
|
|
s/^.*$/$sdx/; |
470
|
|
|
|
|
|
|
} |
471
|
|
|
|
|
|
|
} @fn_wlist; |
472
|
|
|
|
|
|
|
dbg_show(1,"wrdLst",%wrdLst); |
473
|
|
|
|
|
|
|
|
474
|
|
|
|
|
|
|
# wash empty/duplicates |
475
|
|
|
|
|
|
|
@fn_wlist = grep(!/^$/, @fn_wlist); |
476
|
|
|
|
|
|
|
@fn_wlist = arrwash_dup(\@fn_wlist); |
477
|
|
|
|
|
|
|
|
478
|
|
|
|
|
|
|
return sort @fn_wlist; |
479
|
|
|
|
|
|
|
} |
480
|
|
|
|
|
|
|
|
481
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
482
|
|
|
|
|
|
|
# S - arrwash_short($arr_ref), wash short from array $arr_ref |
483
|
|
|
|
|
|
|
# D - weed out empty lines and less-than-3-letter words (e.g. ch12) |
484
|
|
|
|
|
|
|
# U - @fn_wlist = arrwash_short(\@fn_wlist); |
485
|
|
|
|
|
|
|
# |
486
|
|
|
|
|
|
|
|
487
|
|
|
|
|
|
|
sub arrwash_short($) { |
488
|
|
|
|
|
|
|
my ($arr_ref) = @_; |
489
|
|
|
|
|
|
|
return @$arr_ref unless @$arr_ref >= 1; |
490
|
|
|
|
|
|
|
my @r= grep tr/a-zA-Z// >=3, @$arr_ref; |
491
|
|
|
|
|
|
|
return @r if @r; |
492
|
|
|
|
|
|
|
return @$arr_ref # for upper ASCII |
493
|
|
|
|
|
|
|
if grep(/[\200-\377]/, @$arr_ref); |
494
|
|
|
|
|
|
|
return @r; |
495
|
|
|
|
|
|
|
} |
496
|
|
|
|
|
|
|
|
497
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
498
|
|
|
|
|
|
|
# S - arrwash_dup($arr_ref), wash duplicates from array $arr_ref |
499
|
|
|
|
|
|
|
# D - weed out duplicates |
500
|
|
|
|
|
|
|
# U - @fn_wlist = arrwash_dup(\@fn_wlist); |
501
|
|
|
|
|
|
|
# |
502
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
sub arrwash_dup($) { |
504
|
|
|
|
|
|
|
my ($arr_ref) = @_; |
505
|
|
|
|
|
|
|
my %saw; |
506
|
|
|
|
|
|
|
return grep !$saw{$_}++, @$arr_ref; |
507
|
|
|
|
|
|
|
} |
508
|
|
|
|
|
|
|
|
509
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
510
|
|
|
|
|
|
|
# S - arrwash_lang($arr_ref), language specific washing from array $arr_ref |
511
|
|
|
|
|
|
|
# U - @fn_wlist = arrwash_lang(\@fn_wlist); |
512
|
|
|
|
|
|
|
# |
513
|
|
|
|
|
|
|
|
514
|
|
|
|
|
|
|
sub arrwash_lang($) { |
515
|
|
|
|
|
|
|
my ($arr_ref) = @_; |
516
|
|
|
|
|
|
|
|
517
|
|
|
|
|
|
|
# split Chinese into individual chars |
518
|
|
|
|
|
|
|
my @r; |
519
|
|
|
|
|
|
|
map { |
520
|
|
|
|
|
|
|
if (/[\200-\377]{2}/) { |
521
|
|
|
|
|
|
|
@r = (@r, /[\200-\377]{2}/g); |
522
|
|
|
|
|
|
|
} |
523
|
|
|
|
|
|
|
else { |
524
|
|
|
|
|
|
|
@r = (@r, $_); |
525
|
|
|
|
|
|
|
} |
526
|
|
|
|
|
|
|
} @$arr_ref; |
527
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
return @r; |
529
|
|
|
|
|
|
|
} |
530
|
|
|
|
|
|
|
|
531
|
|
|
|
|
|
|
=head2 Object method: similarity_check() |
532
|
|
|
|
|
|
|
|
533
|
|
|
|
|
|
|
Do similarity check on the queued directories. Print similar files info on |
534
|
|
|
|
|
|
|
stdout according to the configured format and delimiters. E.g., |
535
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
$similars_finder->similarity_check(); |
537
|
|
|
|
|
|
|
|
538
|
|
|
|
|
|
|
=cut |
539
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
541
|
|
|
|
|
|
|
# S - similarity_check: similarity check on glabal array @fileInfo |
542
|
|
|
|
|
|
|
# U - similarity_check(); |
543
|
|
|
|
|
|
|
# |
544
|
|
|
|
|
|
|
# I - Input parameters: None |
545
|
|
|
|
|
|
|
# O - similar files printed on stdout |
546
|
|
|
|
|
|
|
|
547
|
|
|
|
|
|
|
sub similarity_check { |
548
|
|
|
|
|
|
|
|
549
|
|
|
|
|
|
|
# get a ordered (by soundex count and file name) of file Info array |
550
|
|
|
|
|
|
|
# (Use short file names to compare to long file names) |
551
|
|
|
|
|
|
|
# use Schwartzian Transform to sort on 2 fields for efficiency |
552
|
|
|
|
|
|
|
my @fileInfos = map { $_->[0] } |
553
|
|
|
|
|
|
|
sort { $a->[1] cmp $b->[1] } |
554
|
|
|
|
|
|
|
map { [ $_, |
555
|
|
|
|
|
|
|
sprintf "%3d%6s", $#{$_->[$N_fSdxl]}, $_->[$N_fSdxl][0] |
556
|
|
|
|
|
|
|
] } @fileInfo; |
557
|
|
|
|
|
|
|
dbg_show(100,"\@fileInfos", @fileInfos); |
558
|
|
|
|
|
|
|
|
559
|
|
|
|
|
|
|
my @saw = (0) x ($#fileInfos+1); |
560
|
|
|
|
|
|
|
foreach my $ii (0..$#fileInfos) { |
561
|
|
|
|
|
|
|
#warn "] ii=$ii\n"; |
562
|
|
|
|
|
|
|
my @similar = (); |
563
|
|
|
|
|
|
|
my $fnl; |
564
|
|
|
|
|
|
|
|
565
|
|
|
|
|
|
|
dbg_show(100,"\@fileInfos", $fileInfos[$ii]); |
566
|
|
|
|
|
|
|
push @similar, [$ii, $ii, $fileInfos[$ii]->[$N_fSize] ]; |
567
|
|
|
|
|
|
|
foreach my $jj (($ii+1) ..$#fileInfos) { |
568
|
|
|
|
|
|
|
$fnl=0; # 0 is good enough since file at [ii] is |
569
|
|
|
|
|
|
|
# shorter in name than the one at [jj] |
570
|
|
|
|
|
|
|
# don't care about same dir files? |
571
|
|
|
|
|
|
|
next |
572
|
|
|
|
|
|
|
if (!$config{fc_level} && ($fileInfos[$ii]->[$N_dName] |
573
|
|
|
|
|
|
|
eq $fileInfos[$jj]->[$N_dName])) ; |
574
|
|
|
|
|
|
|
if (file_diff(\@fileInfos, $ii, $jj) >= $config{fc_threshold}) { |
575
|
|
|
|
|
|
|
push @similar, [$ii, $jj, $fileInfos[$jj]->[$N_fSize] ]; |
576
|
|
|
|
|
|
|
$fnl= length($fileInfos[$jj]->[$N_fName]) if |
577
|
|
|
|
|
|
|
$fnl < length($fileInfos[$jj]->[$N_fName]); |
578
|
|
|
|
|
|
|
} |
579
|
|
|
|
|
|
|
} |
580
|
|
|
|
|
|
|
dbg_show(100,"\@similar", @similar); |
581
|
|
|
|
|
|
|
# output unvisited potential similars by each row, order by fSize |
582
|
|
|
|
|
|
|
@similar = grep {!$saw[$_->[1]]} |
583
|
|
|
|
|
|
|
sort { $a->[2] <=> $b->[2] } @similar; |
584
|
|
|
|
|
|
|
next unless @similar>1; |
585
|
|
|
|
|
|
|
print $config{delimiter}; |
586
|
|
|
|
|
|
|
foreach my $similar (@similar) { |
587
|
|
|
|
|
|
|
print file_info(\@fileInfos, $similar->[1], $fnl). "\n"; |
588
|
|
|
|
|
|
|
$saw[$similar->[1]]++; |
589
|
|
|
|
|
|
|
} |
590
|
|
|
|
|
|
|
} |
591
|
|
|
|
|
|
|
} |
592
|
|
|
|
|
|
|
|
593
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
594
|
|
|
|
|
|
|
sub file_info ($$$) { |
595
|
|
|
|
|
|
|
my ($fileInfos, $ndx, $fnl) = @_; |
596
|
|
|
|
|
|
|
return sprintf($config{format}, $fileInfos->[$ndx]->[$N_fSize], |
597
|
|
|
|
|
|
|
$fileInfos->[$ndx]->[$N_fName], |
598
|
|
|
|
|
|
|
' ' x ($fnl - length($fileInfos->[$ndx]->[$N_fName])), |
599
|
|
|
|
|
|
|
"$fileInfos->[$ndx]->[$N_dName]"); |
600
|
|
|
|
|
|
|
} |
601
|
|
|
|
|
|
|
|
602
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
603
|
|
|
|
|
|
|
# S - file_diff: determind how difference two files are by name & size |
604
|
|
|
|
|
|
|
# U - file_diff($fileInfos, $ndx1, $ndx2); |
605
|
|
|
|
|
|
|
# |
606
|
|
|
|
|
|
|
# I - $fileInfos: reference to @fileInfos |
607
|
|
|
|
|
|
|
# $ndx1, $ndx2: index to the two file in @fileInfos |
608
|
|
|
|
|
|
|
# O - 100%: files are identical |
609
|
|
|
|
|
|
|
# 0%: no similarity at all |
610
|
|
|
|
|
|
|
sub file_diff ($$$) { |
611
|
|
|
|
|
|
|
my ($fileInfos, $ndx1, $ndx2) = @_; |
612
|
|
|
|
|
|
|
|
613
|
|
|
|
|
|
|
return 0 unless @{$fileInfos->[$ndx1]->[$N_fSdxl]}; |
614
|
|
|
|
|
|
|
|
615
|
|
|
|
|
|
|
# find intersection in two soudex array |
616
|
|
|
|
|
|
|
my %count = (); |
617
|
|
|
|
|
|
|
foreach my $element |
618
|
|
|
|
|
|
|
(@{$fileInfos->[$ndx1]->[$N_fSdxl]}, |
619
|
|
|
|
|
|
|
@{$fileInfos->[$ndx2]->[$N_fSdxl]}) { $count{$element}++ } |
620
|
|
|
|
|
|
|
# since there is no duplication in each of file soudex |
621
|
|
|
|
|
|
|
my $intersection = |
622
|
|
|
|
|
|
|
grep $count{$_} > 1, keys %count; |
623
|
|
|
|
|
|
|
# return p * normal(\common soudex) + (1-p) * ( 1 - normal(\delta fSize)) |
624
|
|
|
|
|
|
|
# so the bigger the return value is, the similar the two files are |
625
|
|
|
|
|
|
|
$intersection *= $config{soundex_weight} / |
626
|
|
|
|
|
|
|
(@{$fileInfos->[$ndx1]->[$N_fSdxl]}); |
627
|
|
|
|
|
|
|
dbg_show(100,"intersection", $intersection, $ndx1, $ndx2); |
628
|
|
|
|
|
|
|
my $WeightfSzie = 100 - $config{soundex_weight}; |
629
|
|
|
|
|
|
|
my $dfSize = abs($fileInfos->[$ndx1]->[$N_fSize] - |
630
|
|
|
|
|
|
|
$fileInfos->[$ndx2]->[$N_fSize]) * $WeightfSzie / |
631
|
|
|
|
|
|
|
($fileInfos->[$ndx1]->[$N_fSize] + 1); |
632
|
|
|
|
|
|
|
$dfSize = $dfSize > $WeightfSzie ? $WeightfSzie : $dfSize; |
633
|
|
|
|
|
|
|
my $file_diff = $intersection + ($WeightfSzie - $dfSize); |
634
|
|
|
|
|
|
|
if ($file_diff >= $config{fc_threshold}) { |
635
|
|
|
|
|
|
|
dbg_show(010,"file_diff", |
636
|
|
|
|
|
|
|
@{$fileInfos->[$ndx1]}, |
637
|
|
|
|
|
|
|
@{$fileInfos->[$ndx2]}, |
638
|
|
|
|
|
|
|
$intersection, $dfSize, $file_diff |
639
|
|
|
|
|
|
|
); |
640
|
|
|
|
|
|
|
} |
641
|
|
|
|
|
|
|
return $file_diff; |
642
|
|
|
|
|
|
|
} |
643
|
|
|
|
|
|
|
|
644
|
|
|
|
|
|
|
|
645
|
|
|
|
|
|
|
1; |
646
|
|
|
|
|
|
|
__END__ |