line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package File::Find::Similars; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
# @Author: Tong SUN, (c)2001-2008, all right reserved |
4
|
|
|
|
|
|
|
# @Version: $Date: 2008/11/03 14:19:45 $ $Revision: 2.4 $ |
5
|
|
|
|
|
|
|
# @HomeURL: http://xpt.sourceforge.net/ |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
# {{{ LICENSE: |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
# |
10
|
|
|
|
|
|
|
# Permission to use, copy, modify, and distribute this software and its |
11
|
|
|
|
|
|
|
# documentation for any purpose and without fee is hereby granted, provided |
12
|
|
|
|
|
|
|
# that the above copyright notices appear in all copies and that both those |
13
|
|
|
|
|
|
|
# copyright notices and this permission notice appear in supporting |
14
|
|
|
|
|
|
|
# documentation, and that the names of author not be used in advertising or |
15
|
|
|
|
|
|
|
# publicity pertaining to distribution of the software without specific, |
16
|
|
|
|
|
|
|
# written prior permission. Tong Sun makes no representations about the |
17
|
|
|
|
|
|
|
# suitability of this software for any purpose. It is provided "as is" |
18
|
|
|
|
|
|
|
# without express or implied warranty. |
19
|
|
|
|
|
|
|
# |
20
|
|
|
|
|
|
|
# TONG SUN DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL |
21
|
|
|
|
|
|
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL ADOBE |
22
|
|
|
|
|
|
|
# SYSTEMS INCORPORATED AND DIGITAL EQUIPMENT CORPORATION BE LIABLE FOR ANY |
23
|
|
|
|
|
|
|
# SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER |
24
|
|
|
|
|
|
|
# RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF |
25
|
|
|
|
|
|
|
# CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN |
26
|
|
|
|
|
|
|
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
27
|
|
|
|
|
|
|
# |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
# }}} |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
# {{{ POD, Intro: |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
=head1 NAME |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
File::Find::Similars - Fast similar-files finder |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
=head1 SYNOPSIS |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
use File::Find::Similars; |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
my $similars_finder = |
42
|
|
|
|
|
|
|
File::Find::Similars->new( { fc_level => $fc_level, } ); |
43
|
|
|
|
|
|
|
$similars_finder->find_for(\@ARGV); |
44
|
|
|
|
|
|
|
$similars_finder->similarity_check(); |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
=head1 DESCRIPTION |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
Extremely fast file similarity checker. Similar-sized and similar-named |
49
|
|
|
|
|
|
|
files are picked out as suspicious candidates of duplicated files. |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
It uses advanced soundex vector algorithm to determine the similarity |
52
|
|
|
|
|
|
|
between files. Generally it means that if there are n files, each having |
53
|
|
|
|
|
|
|
approximately m words in the file name, the degree of calculation is merely |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
O(n^2 * m) |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
which is over thousands times faster than any existing file fingerprinting |
58
|
|
|
|
|
|
|
technology. |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
=head2 ALGORITHM EXPLANATION |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
The self-test output will help you understand what the module do and what |
63
|
|
|
|
|
|
|
would you expect from the outcome. |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
$ make test |
66
|
|
|
|
|
|
|
PERL_DL_NONLAZY=1 /usr/bin/perl "-Iblib/lib" "-Iblib/arch" test.pl |
67
|
|
|
|
|
|
|
1..5 todo 2; |
68
|
|
|
|
|
|
|
# Running under perl version 5.010000 for linux |
69
|
|
|
|
|
|
|
# Current time local: Mon Nov 3 08:57:42 2008 |
70
|
|
|
|
|
|
|
# Current time GMT: Mon Nov 3 13:57:42 2008 |
71
|
|
|
|
|
|
|
# Using Test.pm version 1.25 |
72
|
|
|
|
|
|
|
# Testing File::Find::Similars version 2.03 |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
. . . |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
== Testing 2, files under test/ subdir: |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
9 test/(eBook) GNU - Python Standard Library 2001.pdf |
79
|
|
|
|
|
|
|
3 test/Audio Book - The Grey Coloured Bunnie.mp3 |
80
|
|
|
|
|
|
|
5 test/ColoredGrayBunny.ogg |
81
|
|
|
|
|
|
|
5 test/GNU - 2001 - Python Standard Library.pdf |
82
|
|
|
|
|
|
|
4 test/GNU - Python Standard Library (2001).rar |
83
|
|
|
|
|
|
|
9 test/LayoutTest.java |
84
|
|
|
|
|
|
|
3 test/PopupTest.java |
85
|
|
|
|
|
|
|
2 test/Python Standard Library.zip |
86
|
|
|
|
|
|
|
ok 2 # (test.pl at line 83 TODO?!) |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
Note: |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
- The file-similars script will pick out similar files from them in next test. |
91
|
|
|
|
|
|
|
- Let's assume that the number represent the file size in KB. |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
== Testing 3 result should be: |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
## ========= |
96
|
|
|
|
|
|
|
3 'Audio Book - The Grey Coloured Bunnie.mp3' 'test/' |
97
|
|
|
|
|
|
|
5 'ColoredGrayBunny.ogg' 'test/' |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
## ========= |
100
|
|
|
|
|
|
|
4 'GNU - Python Standard Library (2001).rar' 'test/' |
101
|
|
|
|
|
|
|
5 'GNU - 2001 - Python Standard Library.pdf' 'test/' |
102
|
|
|
|
|
|
|
ok 3 |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
Note: |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
- There are 2 groups of similar files picked out by the script. |
107
|
|
|
|
|
|
|
- The similar files are picked because their file names look similar. |
108
|
|
|
|
|
|
|
Note that the first group looks different and spells differently too, |
109
|
|
|
|
|
|
|
which means that the script is versatile enough to handle file names that |
110
|
|
|
|
|
|
|
don't have space in it, and robust enough to deal with spelling mistakes. |
111
|
|
|
|
|
|
|
- Apart from the file name, the file size plays an important role as well. |
112
|
|
|
|
|
|
|
- There are 2 files in the second similar files group, the book files group. |
113
|
|
|
|
|
|
|
- The file 'Python Standard Library.zip' is not considered to be similar to |
114
|
|
|
|
|
|
|
the group because its size is not similar to the group. |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
== Testing 4, if Python.zip is bigger, result should be: |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
## ========= |
119
|
|
|
|
|
|
|
4 'Python Standard Library.zip' 'test/' |
120
|
|
|
|
|
|
|
4 'GNU - Python Standard Library (2001).rar' 'test/' |
121
|
|
|
|
|
|
|
5 'GNU - 2001 - Python Standard Library.pdf' 'test/' |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
## ========= |
124
|
|
|
|
|
|
|
3 'Audio Book - The Grey Coloured Bunnie.mp3' 'test/' |
125
|
|
|
|
|
|
|
5 'ColoredGrayBunny.ogg' 'test/' |
126
|
|
|
|
|
|
|
ok 4 |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
Note: |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
- The previous second similar files group is now the first. I.e., |
131
|
|
|
|
|
|
|
the order of similar files groups is not important. |
132
|
|
|
|
|
|
|
- There are now 3 files in the book files group. |
133
|
|
|
|
|
|
|
- The file 'Python Standard Library.zip' is included in the |
134
|
|
|
|
|
|
|
group because its size is now similar to the group. |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
== Testing 5, if Python.zip is even bigger, result should be: |
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
## ========= |
139
|
|
|
|
|
|
|
4 'GNU - Python Standard Library (2001).rar' 'test/' |
140
|
|
|
|
|
|
|
5 'GNU - 2001 - Python Standard Library.pdf' 'test/' |
141
|
|
|
|
|
|
|
6 'Python Standard Library.zip' 'test/' |
142
|
|
|
|
|
|
|
9 '(eBook) GNU - Python Standard Library 2001.pdf' 'test/' |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
## ========= |
145
|
|
|
|
|
|
|
3 'Audio Book - The Grey Coloured Bunnie.mp3' 'test/' |
146
|
|
|
|
|
|
|
5 'ColoredGrayBunny.ogg' 'test/' |
147
|
|
|
|
|
|
|
ok 5 |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
Note: |
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
- There are 4 files in the book files group now. |
152
|
|
|
|
|
|
|
- The file 'Python Standard Library.zip' is still in the group. |
153
|
|
|
|
|
|
|
- But this time, because it is also considered to be similar to the .pdf |
154
|
|
|
|
|
|
|
file (since their size are now similar, 6 vs 9), a 4th file the .pdf one |
155
|
|
|
|
|
|
|
is now included in the book group. |
156
|
|
|
|
|
|
|
- If the size of file 'Python Standard Library.zip' is 12(KB), then the |
157
|
|
|
|
|
|
|
book files group will be split into two. Do you know why and |
158
|
|
|
|
|
|
|
which files each group will contain? |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
The File::Find::Similars package comes with a fully functional demo |
161
|
|
|
|
|
|
|
script file-similars. Please refer to its help file for further |
162
|
|
|
|
|
|
|
explanations. |
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
This package is highly customizable. Refer to the class method C for |
165
|
|
|
|
|
|
|
details. |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
=head1 DEPENDS |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
This module depends on L, but not L. |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
=cut |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
# }}} |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
# {{{ Global Declaration: |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
# ============================================================== &us === |
178
|
|
|
|
|
|
|
# ............................................................. Uses ... |
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
# -- global modules |
181
|
1
|
|
|
1
|
|
105454
|
use strict; # ! |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
44
|
|
182
|
|
|
|
|
|
|
|
183
|
1
|
|
|
1
|
|
6
|
use Carp; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
85
|
|
184
|
1
|
|
|
1
|
|
8
|
use Getopt::Long; |
|
1
|
|
|
|
|
5
|
|
|
1
|
|
|
|
|
9
|
|
185
|
1
|
|
|
1
|
|
172
|
use File::Basename; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
86
|
|
186
|
1
|
|
|
1
|
|
463
|
use Text::Soundex; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
use base qw(Class::Accessor::Fast); |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
# -- local modules |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
sub dbg_show {}; |
193
|
|
|
|
|
|
|
#use MyDbg; $MyDbg::debugging=010; |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
# ============================================================== &gv === |
196
|
|
|
|
|
|
|
# .................................................. Global Varibles ... |
197
|
|
|
|
|
|
|
# |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
our @EXPORT = ( ); # may even omit this line |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
use vars qw($progname $VERSION $debugging); |
202
|
|
|
|
|
|
|
use vars qw(%config @filequeue @fileInfo %sdxCnt %wrdLst); |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
# @fileInfo: List of the following list: |
205
|
|
|
|
|
|
|
my ( |
206
|
|
|
|
|
|
|
$N_dName, # dir name |
207
|
|
|
|
|
|
|
$N_fName, # file name |
208
|
|
|
|
|
|
|
$N_fSize, # file size |
209
|
|
|
|
|
|
|
$N_fSdxl, # file soundex list, reference |
210
|
|
|
|
|
|
|
) = (0..9); |
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
# ============================================================== &cs === |
213
|
|
|
|
|
|
|
# ................................................. Constant setting ... |
214
|
|
|
|
|
|
|
# |
215
|
|
|
|
|
|
|
$VERSION = sprintf("%d.%02d", q$Revision: 2.4 $ =~ /(\d+)\.(\d+)/); |
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
# }}} |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
# ############################################################## &ss ### |
220
|
|
|
|
|
|
|
# ................................................ Subroutions start ... |
221
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
=head1 METHODS |
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
=head2 File::Find::Similars->new(\%config_param) |
225
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
Initialize the object. |
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
my $similars_finder = File::Find::Similars->new(); |
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
or, |
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
my $similars_finder = File::Find::Similars->new( {} ); |
233
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
which are the same as: |
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
my $similars_finder = File::Find::Similars->new( { |
237
|
|
|
|
|
|
|
soundex_weight => 50, # percentage of weight that soundex takes, |
238
|
|
|
|
|
|
|
# the rest is for file size |
239
|
|
|
|
|
|
|
fc_threshold => 75, # over which files are considered similar |
240
|
|
|
|
|
|
|
delimiter => "\n## =========\n", # delimiter between files output |
241
|
|
|
|
|
|
|
format => "%12d '%s' %s'%s'", # file info print format |
242
|
|
|
|
|
|
|
fc_level => 0, # file comparison level |
243
|
|
|
|
|
|
|
verbose => 0, |
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
} ); |
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
What shown above are default settings. Any of the C<%config_param> attribute can be omitted when calling the new method. |
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
The C is the only class method. All the rest methods are object methods. |
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
=head2 Object attribute: soundex_weight([set_val]) |
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
Percentage of weight that soundex takes, the rest of percentage is for file size. |
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
Provide the C to change the attribute, omitting it to retrieve the attribute value. |
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
=head2 Object attribute: fc_threshold([set_val]) |
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
The threshold over which files are considered similar. |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
Provide the C to change the attribute, omitting it to retrieve the attribute value. |
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
=head2 Object attribute: delimiter([set_val]) |
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
Delimiter printed between file info outputs. |
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
Provide the C to change the attribute, omitting it to retrieve the attribute value. |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
=head2 Object attribute: format([set_val]) |
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
Format used to print file info. |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
Provide the C to change the attribute, omitting it to retrieve the attribute value. |
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
=head2 Object attribute: fc_level([set_val]) |
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
File comparison level. Whether to check similar files within the same folder: 0, no; 1, yes. |
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
Provide the C to change the attribute, omitting it to retrieve the attribute value. |
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
=head2 Object attribute: verbose([set_val]) |
282
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
Verbose level. Whether to output progress info: 0, no; 1, yes. |
284
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
Provide the C to change the attribute, omitting it to retrieve the attribute value. |
286
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
=cut |
288
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
File::Find::Similars |
290
|
|
|
|
|
|
|
->mk_accessors(qw(soundex_weight fc_threshold |
291
|
|
|
|
|
|
|
delimiter format fc_level verbose)); |
292
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
%config = |
294
|
|
|
|
|
|
|
( |
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
soundex_weight => 50, # percentage of weight that soundex takes, |
297
|
|
|
|
|
|
|
# the rest is for file size |
298
|
|
|
|
|
|
|
fc_threshold => 75, # over which files are considered similar |
299
|
|
|
|
|
|
|
delimiter => "\n## =========\n", # delimiter between files output |
300
|
|
|
|
|
|
|
format => "%12d '%s' %s'%s'", # file info print format |
301
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
fc_level => 0, # file comparison level |
303
|
|
|
|
|
|
|
verbose => 0, |
304
|
|
|
|
|
|
|
); |
305
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
308
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
sub new { |
310
|
|
|
|
|
|
|
ref(my $class = shift) |
311
|
|
|
|
|
|
|
and croak "new is a class method. class name needed."; |
312
|
|
|
|
|
|
|
my ($arg_ref) = @_; |
313
|
|
|
|
|
|
|
my $self = $class->SUPER::new({%config, %$arg_ref}); |
314
|
|
|
|
|
|
|
$config{soundex_weight} = $self->soundex_weight; |
315
|
|
|
|
|
|
|
$config{fc_threshold} = $self->fc_threshold; |
316
|
|
|
|
|
|
|
$config{delimiter} = $self->delimiter; |
317
|
|
|
|
|
|
|
$config{format} = $self->format; |
318
|
|
|
|
|
|
|
$config{fc_level} = $self->fc_level; |
319
|
|
|
|
|
|
|
$config{verbose} = $self->verbose; |
320
|
|
|
|
|
|
|
#$config{} = $self->; |
321
|
|
|
|
|
|
|
return $self; |
322
|
|
|
|
|
|
|
} |
323
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
325
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
=head2 Object method: find_for($array_ref) |
327
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
Set directory queue for similarity checking. Each entry in C<$array_ref> |
329
|
|
|
|
|
|
|
is a directory to check into. E.g., |
330
|
|
|
|
|
|
|
|
331
|
|
|
|
|
|
|
$similars_finder->find_for(\@ARGV); |
332
|
|
|
|
|
|
|
|
333
|
|
|
|
|
|
|
=cut |
334
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
sub find_for { |
336
|
|
|
|
|
|
|
my ($self, $init_dirs) = @_; |
337
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
# threshold $config{fc_threshold} |
339
|
|
|
|
|
|
|
print STDERR "Searching in directory(ies): @$init_dirs with level $config{fc_level}...\n\n" |
340
|
|
|
|
|
|
|
if $config{verbose}; |
341
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
@filequeue = @fileInfo = (); |
343
|
|
|
|
|
|
|
@filequeue = (@filequeue, map { [$_, ''] } @$init_dirs); |
344
|
|
|
|
|
|
|
process_entries(); |
345
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
dbg_show(100,"\@fileInfo", @fileInfo); |
347
|
|
|
|
|
|
|
dbg_show(100,"%sdxCnt", %sdxCnt); |
348
|
|
|
|
|
|
|
dbg_show(100,"%wrdLst", %wrdLst); |
349
|
|
|
|
|
|
|
} |
350
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
352
|
|
|
|
|
|
|
# I - Input: global array @filequeue |
353
|
|
|
|
|
|
|
# Input parameters: None |
354
|
|
|
|
|
|
|
# |
355
|
|
|
|
|
|
|
sub process_entries { |
356
|
|
|
|
|
|
|
my($dir, $qf) = (); |
357
|
|
|
|
|
|
|
#warn "] inside process_entries...\n"; |
358
|
|
|
|
|
|
|
|
359
|
|
|
|
|
|
|
while ($qf = shift @filequeue) { |
360
|
|
|
|
|
|
|
($dir, $_) = ($qf->[0], $qf->[1]); |
361
|
|
|
|
|
|
|
#warn "] inside process_entries loop, $dir, $_, ...\n"; |
362
|
|
|
|
|
|
|
next if /^..?$/; |
363
|
|
|
|
|
|
|
my $name = "$dir/$_"; |
364
|
|
|
|
|
|
|
#warn "] processing file '$name'.\n"; |
365
|
|
|
|
|
|
|
if ($name eq '-/') { |
366
|
|
|
|
|
|
|
# get info from stdin |
367
|
|
|
|
|
|
|
process_stdin(); |
368
|
|
|
|
|
|
|
} |
369
|
|
|
|
|
|
|
elsif (-d $name) { |
370
|
|
|
|
|
|
|
# a directory, process it recursively. |
371
|
|
|
|
|
|
|
process_dir($name); |
372
|
|
|
|
|
|
|
} |
373
|
|
|
|
|
|
|
else { |
374
|
|
|
|
|
|
|
process_file($dir, $_); |
375
|
|
|
|
|
|
|
} |
376
|
|
|
|
|
|
|
} |
377
|
|
|
|
|
|
|
} |
378
|
|
|
|
|
|
|
|
379
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
380
|
|
|
|
|
|
|
# D - Process info given from stdin, which should of form same as |
381
|
|
|
|
|
|
|
# find -printf "%p\t%s\n" |
382
|
|
|
|
|
|
|
# |
383
|
|
|
|
|
|
|
sub process_stdin { |
384
|
|
|
|
|
|
|
|
385
|
|
|
|
|
|
|
while (<>){ |
386
|
|
|
|
|
|
|
croak "Wrong input format: '$_'" unless m{(.*)/(.+?)\t(\d+)$}; |
387
|
|
|
|
|
|
|
my ($dn, $fn, $size) = ( $1, $2, $3 ); |
388
|
|
|
|
|
|
|
my $fSdxl = [ get_soundex($fn) ]; # file soundex list |
389
|
|
|
|
|
|
|
push @fileInfo, [ $dn, $fn, $size, $fSdxl, ]; |
390
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
dbg_show(100,"fileInfo",@fileInfo); |
392
|
|
|
|
|
|
|
map { $sdxCnt{$_}++ } @$fSdxl; |
393
|
|
|
|
|
|
|
} |
394
|
|
|
|
|
|
|
} |
395
|
|
|
|
|
|
|
|
396
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
397
|
|
|
|
|
|
|
# D - Process given dir recursively |
398
|
|
|
|
|
|
|
# N - BFS is more memory friendly than DFS |
399
|
|
|
|
|
|
|
# |
400
|
|
|
|
|
|
|
# T - $dir="/home/tong/tmp" |
401
|
|
|
|
|
|
|
sub process_dir { |
402
|
|
|
|
|
|
|
my($dir) = @_; |
403
|
|
|
|
|
|
|
#warn "] processing dir '$dir'...\n"; |
404
|
|
|
|
|
|
|
|
405
|
|
|
|
|
|
|
opendir(DIR,$dir) || die "File::Find::Similars error: Can't open $dir"; |
406
|
|
|
|
|
|
|
my @filenames = readdir(DIR); |
407
|
|
|
|
|
|
|
closedir(DIR); |
408
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
# record the dirname/fname pair to queue |
410
|
|
|
|
|
|
|
@filequeue = (@filequeue, map { [$dir, $_] } @filenames); |
411
|
|
|
|
|
|
|
dbg_show(100,"filequeue", @filequeue) |
412
|
|
|
|
|
|
|
} |
413
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
415
|
|
|
|
|
|
|
# S - process_file($dirname, $fname), process file $fname under $dirname |
416
|
|
|
|
|
|
|
# D - Process one file and update global vars |
417
|
|
|
|
|
|
|
# U - |
418
|
|
|
|
|
|
|
# |
419
|
|
|
|
|
|
|
# I - Input parameters: |
420
|
|
|
|
|
|
|
# $dirname: dir name string |
421
|
|
|
|
|
|
|
# $fname: file name string |
422
|
|
|
|
|
|
|
# O - Global vars get updated |
423
|
|
|
|
|
|
|
# fileInfo [ $dirname, $fname, $fsize, [ file_soundex ] ] |
424
|
|
|
|
|
|
|
# T - |
425
|
|
|
|
|
|
|
|
426
|
|
|
|
|
|
|
sub process_file { |
427
|
|
|
|
|
|
|
my ($dn, $fn) = @_; |
428
|
|
|
|
|
|
|
my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,@rest) = |
429
|
|
|
|
|
|
|
stat("$dn/$fn"); |
430
|
|
|
|
|
|
|
my $fSdxl = [ get_soundex($fn) ]; # file soundex list |
431
|
|
|
|
|
|
|
push @fileInfo, [ $dn, $fn, $size, $fSdxl, ]; |
432
|
|
|
|
|
|
|
|
433
|
|
|
|
|
|
|
dbg_show(100,"fileInfo",@fileInfo); |
434
|
|
|
|
|
|
|
map { $sdxCnt{$_}++ } @$fSdxl; |
435
|
|
|
|
|
|
|
} |
436
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
438
|
|
|
|
|
|
|
# S - get_soundex($fname), get soundex for file $fname |
439
|
|
|
|
|
|
|
# D - Return a list of soundex of each individual word in file name |
440
|
|
|
|
|
|
|
# U - $aref = [ get_soundex($fname) ]; |
441
|
|
|
|
|
|
|
# |
442
|
|
|
|
|
|
|
# I - Input parameters: |
443
|
|
|
|
|
|
|
# $fname: file name string |
444
|
|
|
|
|
|
|
# O - sorted anonymous soundex array w/ duplications removed |
445
|
|
|
|
|
|
|
# T - @out = get_soundex 'Java_RMI - _Remote_Method_Invocation_ch03.tgz'; |
446
|
|
|
|
|
|
|
# @out = get_soundex 'ASuchKindOfFile.tgz'; |
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
sub get_soundex { |
449
|
|
|
|
|
|
|
my ($fn) = @_; |
450
|
|
|
|
|
|
|
# split to individual words |
451
|
|
|
|
|
|
|
my @fn_wlist = split /[-_[:cntrl:][:blank:][:punct:][:digit:]]/i, $fn; |
452
|
|
|
|
|
|
|
# discards file extension, if any |
453
|
|
|
|
|
|
|
pop @fn_wlist if @fn_wlist >= 1; |
454
|
|
|
|
|
|
|
# if it is single word, try further decompose SuchKindOfWord |
455
|
|
|
|
|
|
|
@fn_wlist = $fn_wlist[0] =~ /[A-Z][^A-Z]*/g |
456
|
|
|
|
|
|
|
if @fn_wlist == 1 && $fn_wlist[0] =~ /^[A-Z]/; |
457
|
|
|
|
|
|
|
# wash short |
458
|
|
|
|
|
|
|
dbg_show(100,"wlist 0",@fn_wlist); |
459
|
|
|
|
|
|
|
@fn_wlist = arrwash_short(\@fn_wlist); |
460
|
|
|
|
|
|
|
dbg_show(100,"wlist 1",@fn_wlist); |
461
|
|
|
|
|
|
|
|
462
|
|
|
|
|
|
|
# language specific handling |
463
|
|
|
|
|
|
|
@fn_wlist = arrwash_lang(\@fn_wlist); |
464
|
|
|
|
|
|
|
dbg_show(100,"wlist 2",@fn_wlist); |
465
|
|
|
|
|
|
|
|
466
|
|
|
|
|
|
|
# change word to soundex, record soundex/word in global hash |
467
|
|
|
|
|
|
|
map { |
468
|
|
|
|
|
|
|
if (/[[:alpha:]]/) { |
469
|
|
|
|
|
|
|
my $sdx = soundex($_); |
470
|
|
|
|
|
|
|
$wrdLst{$sdx}{$_}++; |
471
|
|
|
|
|
|
|
s/^.*$/$sdx/; |
472
|
|
|
|
|
|
|
} |
473
|
|
|
|
|
|
|
} @fn_wlist; |
474
|
|
|
|
|
|
|
dbg_show(1,"wrdLst",%wrdLst); |
475
|
|
|
|
|
|
|
|
476
|
|
|
|
|
|
|
# wash empty/duplicates |
477
|
|
|
|
|
|
|
@fn_wlist = grep(!/^$/, @fn_wlist); |
478
|
|
|
|
|
|
|
@fn_wlist = arrwash_dup(\@fn_wlist); |
479
|
|
|
|
|
|
|
|
480
|
|
|
|
|
|
|
return sort @fn_wlist; |
481
|
|
|
|
|
|
|
} |
482
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
484
|
|
|
|
|
|
|
# S - arrwash_short($arr_ref), wash short from array $arr_ref |
485
|
|
|
|
|
|
|
# D - weed out empty lines and less-than-3-letter words (e.g. ch12) |
486
|
|
|
|
|
|
|
# U - @fn_wlist = arrwash_short(\@fn_wlist); |
487
|
|
|
|
|
|
|
# |
488
|
|
|
|
|
|
|
|
489
|
|
|
|
|
|
|
sub arrwash_short($) { |
490
|
|
|
|
|
|
|
my ($arr_ref) = @_; |
491
|
|
|
|
|
|
|
return @$arr_ref unless @$arr_ref >= 1; |
492
|
|
|
|
|
|
|
my @r= grep tr/a-zA-Z// >=3, @$arr_ref; |
493
|
|
|
|
|
|
|
return @r if @r; |
494
|
|
|
|
|
|
|
return @$arr_ref # for upper ASCII |
495
|
|
|
|
|
|
|
if grep(/[\200-\377]/, @$arr_ref); |
496
|
|
|
|
|
|
|
return @r; |
497
|
|
|
|
|
|
|
} |
498
|
|
|
|
|
|
|
|
499
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
500
|
|
|
|
|
|
|
# S - arrwash_dup($arr_ref), wash duplicates from array $arr_ref |
501
|
|
|
|
|
|
|
# D - weed out duplicates |
502
|
|
|
|
|
|
|
# U - @fn_wlist = arrwash_dup(\@fn_wlist); |
503
|
|
|
|
|
|
|
# |
504
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
sub arrwash_dup($) { |
506
|
|
|
|
|
|
|
my ($arr_ref) = @_; |
507
|
|
|
|
|
|
|
my %saw; |
508
|
|
|
|
|
|
|
return grep !$saw{$_}++, @$arr_ref; |
509
|
|
|
|
|
|
|
} |
510
|
|
|
|
|
|
|
|
511
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
512
|
|
|
|
|
|
|
# S - arrwash_lang($arr_ref), language specific washing from array $arr_ref |
513
|
|
|
|
|
|
|
# U - @fn_wlist = arrwash_lang(\@fn_wlist); |
514
|
|
|
|
|
|
|
# |
515
|
|
|
|
|
|
|
|
516
|
|
|
|
|
|
|
sub arrwash_lang($) { |
517
|
|
|
|
|
|
|
my ($arr_ref) = @_; |
518
|
|
|
|
|
|
|
|
519
|
|
|
|
|
|
|
# split Chinese into individual chars |
520
|
|
|
|
|
|
|
my @r; |
521
|
|
|
|
|
|
|
map { |
522
|
|
|
|
|
|
|
if (/[\200-\377]{2}/) { |
523
|
|
|
|
|
|
|
@r = (@r, /[\200-\377]{2}/g); |
524
|
|
|
|
|
|
|
} |
525
|
|
|
|
|
|
|
else { |
526
|
|
|
|
|
|
|
@r = (@r, $_); |
527
|
|
|
|
|
|
|
} |
528
|
|
|
|
|
|
|
} @$arr_ref; |
529
|
|
|
|
|
|
|
|
530
|
|
|
|
|
|
|
return @r; |
531
|
|
|
|
|
|
|
} |
532
|
|
|
|
|
|
|
|
533
|
|
|
|
|
|
|
=head2 Object method: similarity_check() |
534
|
|
|
|
|
|
|
|
535
|
|
|
|
|
|
|
Do similarity check on the queued directories. Print similar files info on |
536
|
|
|
|
|
|
|
stdout according to the configured format and delimiters. E.g., |
537
|
|
|
|
|
|
|
|
538
|
|
|
|
|
|
|
$similars_finder->similarity_check(); |
539
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
=cut |
541
|
|
|
|
|
|
|
|
542
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
543
|
|
|
|
|
|
|
# S - similarity_check: similarity check on glabal array @fileInfo |
544
|
|
|
|
|
|
|
# U - similarity_check(); |
545
|
|
|
|
|
|
|
# |
546
|
|
|
|
|
|
|
# I - Input parameters: None |
547
|
|
|
|
|
|
|
# O - similar files printed on stdout |
548
|
|
|
|
|
|
|
|
549
|
|
|
|
|
|
|
sub similarity_check { |
550
|
|
|
|
|
|
|
|
551
|
|
|
|
|
|
|
# get a ordered (by soundex count) file Info array |
552
|
|
|
|
|
|
|
# (Use short file names to compare to long file names) |
553
|
|
|
|
|
|
|
my @fileInfos = |
554
|
|
|
|
|
|
|
sort { $#{$a->[$N_fSdxl]} cmp $#{$b->[$N_fSdxl]} } @fileInfo; |
555
|
|
|
|
|
|
|
dbg_show(100,"\@fileInfos", @fileInfos); |
556
|
|
|
|
|
|
|
|
557
|
|
|
|
|
|
|
my @saw = (0) x ($#fileInfos+1); |
558
|
|
|
|
|
|
|
foreach my $ii (0..$#fileInfos) { |
559
|
|
|
|
|
|
|
#warn "] ii=$ii\n"; |
560
|
|
|
|
|
|
|
my @similar = (); |
561
|
|
|
|
|
|
|
my $fnl; |
562
|
|
|
|
|
|
|
|
563
|
|
|
|
|
|
|
dbg_show(100,"\@fileInfos", $fileInfos[$ii]); |
564
|
|
|
|
|
|
|
push @similar, [$ii, $ii, $fileInfos[$ii]->[$N_fSize] ]; |
565
|
|
|
|
|
|
|
foreach my $jj (($ii+1) ..$#fileInfos) { |
566
|
|
|
|
|
|
|
$fnl=0; # 0 is good enough since file at [ii] is |
567
|
|
|
|
|
|
|
# shorter in name than the one at [jj] |
568
|
|
|
|
|
|
|
# don't care about same dir files? |
569
|
|
|
|
|
|
|
next |
570
|
|
|
|
|
|
|
if (!$config{fc_level} && ($fileInfos[$ii]->[$N_dName] |
571
|
|
|
|
|
|
|
eq $fileInfos[$jj]->[$N_dName])) ; |
572
|
|
|
|
|
|
|
if (file_diff(\@fileInfos, $ii, $jj) >= $config{fc_threshold}) { |
573
|
|
|
|
|
|
|
push @similar, [$ii, $jj, $fileInfos[$jj]->[$N_fSize] ]; |
574
|
|
|
|
|
|
|
$fnl= length($fileInfos[$jj]->[$N_fName]) if |
575
|
|
|
|
|
|
|
$fnl < length($fileInfos[$jj]->[$N_fName]); |
576
|
|
|
|
|
|
|
} |
577
|
|
|
|
|
|
|
} |
578
|
|
|
|
|
|
|
dbg_show(100,"\@similar", @similar); |
579
|
|
|
|
|
|
|
# output unvisited potential similars by each row, order by fSize |
580
|
|
|
|
|
|
|
@similar = grep {!$saw[$_->[1]]} |
581
|
|
|
|
|
|
|
sort { $a->[2] <=> $b->[2] } @similar; |
582
|
|
|
|
|
|
|
next unless @similar>1; |
583
|
|
|
|
|
|
|
print $config{delimiter}; |
584
|
|
|
|
|
|
|
foreach my $similar (@similar) { |
585
|
|
|
|
|
|
|
print file_info(\@fileInfos, $similar->[1], $fnl). "\n"; |
586
|
|
|
|
|
|
|
$saw[$similar->[1]]++; |
587
|
|
|
|
|
|
|
} |
588
|
|
|
|
|
|
|
} |
589
|
|
|
|
|
|
|
} |
590
|
|
|
|
|
|
|
|
591
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
592
|
|
|
|
|
|
|
sub file_info ($$$) { |
593
|
|
|
|
|
|
|
my ($fileInfos, $ndx, $fnl) = @_; |
594
|
|
|
|
|
|
|
return sprintf($config{format}, $fileInfos->[$ndx]->[$N_fSize], |
595
|
|
|
|
|
|
|
$fileInfos->[$ndx]->[$N_fName], |
596
|
|
|
|
|
|
|
' ' x ($fnl - length($fileInfos->[$ndx]->[$N_fName])), |
597
|
|
|
|
|
|
|
"$fileInfos->[$ndx]->[$N_dName]"); |
598
|
|
|
|
|
|
|
} |
599
|
|
|
|
|
|
|
|
600
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
601
|
|
|
|
|
|
|
# S - file_diff: determind how difference two files are by name & size |
602
|
|
|
|
|
|
|
# U - file_diff($fileInfos, $ndx1, $ndx2); |
603
|
|
|
|
|
|
|
# |
604
|
|
|
|
|
|
|
# I - $fileInfos: reference to @fileInfos |
605
|
|
|
|
|
|
|
# $ndx1, $ndx2: index to the two file in @fileInfos |
606
|
|
|
|
|
|
|
# O - 100%: files are identical |
607
|
|
|
|
|
|
|
# 0%: no similarity at all |
608
|
|
|
|
|
|
|
sub file_diff ($$$) { |
609
|
|
|
|
|
|
|
my ($fileInfos, $ndx1, $ndx2) = @_; |
610
|
|
|
|
|
|
|
|
611
|
|
|
|
|
|
|
return 0 unless @{$fileInfos->[$ndx1]->[$N_fSdxl]}; |
612
|
|
|
|
|
|
|
|
613
|
|
|
|
|
|
|
# find intersection in two soudex array |
614
|
|
|
|
|
|
|
my %count = (); |
615
|
|
|
|
|
|
|
foreach my $element |
616
|
|
|
|
|
|
|
(@{$fileInfos->[$ndx1]->[$N_fSdxl]}, |
617
|
|
|
|
|
|
|
@{$fileInfos->[$ndx2]->[$N_fSdxl]}) { $count{$element}++ } |
618
|
|
|
|
|
|
|
# since there is no duplication in each of file soudex |
619
|
|
|
|
|
|
|
my $intersection = |
620
|
|
|
|
|
|
|
grep $count{$_} > 1, keys %count; |
621
|
|
|
|
|
|
|
# return p * normal(\common soudex) + (1-p) * ( 1 - normal(\delta fSize)) |
622
|
|
|
|
|
|
|
# so the bigger the return value is, the similar the two files are |
623
|
|
|
|
|
|
|
$intersection *= $config{soundex_weight} / |
624
|
|
|
|
|
|
|
(@{$fileInfos->[$ndx1]->[$N_fSdxl]}); |
625
|
|
|
|
|
|
|
dbg_show(100,"intersection", $intersection, $ndx1, $ndx2); |
626
|
|
|
|
|
|
|
my $WeightfSzie = 100 - $config{soundex_weight}; |
627
|
|
|
|
|
|
|
my $dfSize = abs($fileInfos->[$ndx1]->[$N_fSize] - |
628
|
|
|
|
|
|
|
$fileInfos->[$ndx2]->[$N_fSize]) * $WeightfSzie / |
629
|
|
|
|
|
|
|
($fileInfos->[$ndx1]->[$N_fSize] + 1); |
630
|
|
|
|
|
|
|
$dfSize = $dfSize > $WeightfSzie ? $WeightfSzie : $dfSize; |
631
|
|
|
|
|
|
|
my $file_diff = $intersection + ($WeightfSzie - $dfSize); |
632
|
|
|
|
|
|
|
if ($file_diff >= $config{fc_threshold}) { |
633
|
|
|
|
|
|
|
dbg_show(010,"file_diff", |
634
|
|
|
|
|
|
|
@{$fileInfos->[$ndx1]}, |
635
|
|
|
|
|
|
|
@{$fileInfos->[$ndx2]}, |
636
|
|
|
|
|
|
|
$intersection, $dfSize, $file_diff |
637
|
|
|
|
|
|
|
); |
638
|
|
|
|
|
|
|
} |
639
|
|
|
|
|
|
|
return $file_diff; |
640
|
|
|
|
|
|
|
} |
641
|
|
|
|
|
|
|
|
642
|
|
|
|
|
|
|
|
643
|
|
|
|
|
|
|
1; |
644
|
|
|
|
|
|
|
__END__ |