line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package File::Searcher::Similars; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
# @Author: Tong SUN, (c)2001-2003, all right reserved |
4
|
|
|
|
|
|
|
# @Version: $Date: 2008/10/31 16:07:34 $ $Revision: 1.27 $ |
5
|
|
|
|
|
|
|
# @HomeURL: http://xpt.sourceforge.net/ |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
# {{{ LICENSE: |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
# |
10
|
|
|
|
|
|
|
# Permission to use, copy, modify, and distribute this software and its |
11
|
|
|
|
|
|
|
# documentation for any purpose and without fee is hereby granted, provided |
12
|
|
|
|
|
|
|
# that the above copyright notices appear in all copies and that both those |
13
|
|
|
|
|
|
|
# copyright notices and this permission notice appear in supporting |
14
|
|
|
|
|
|
|
# documentation, and that the names of author not be used in advertising or |
15
|
|
|
|
|
|
|
# publicity pertaining to distribution of the software without specific, |
16
|
|
|
|
|
|
|
# written prior permission. Tong Sun makes no representations about the |
17
|
|
|
|
|
|
|
# suitability of this software for any purpose. It is provided "as is" |
18
|
|
|
|
|
|
|
# without express or implied warranty. |
19
|
|
|
|
|
|
|
# |
20
|
|
|
|
|
|
|
# TONG SUN DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL |
21
|
|
|
|
|
|
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL ADOBE |
22
|
|
|
|
|
|
|
# SYSTEMS INCORPORATED AND DIGITAL EQUIPMENT CORPORATION BE LIABLE FOR ANY |
23
|
|
|
|
|
|
|
# SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER |
24
|
|
|
|
|
|
|
# RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF |
25
|
|
|
|
|
|
|
# CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN |
26
|
|
|
|
|
|
|
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
27
|
|
|
|
|
|
|
# |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
# }}} |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
# {{{ POD, Intro: |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
=head1 NAME |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
File::Searcher::Similars - Fast similar-files finder |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
=head1 SYNOPSIS |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
use File::Searcher::Similars; |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
File::Searcher::Similars->init(0, \@ARGV); |
42
|
|
|
|
|
|
|
similarity_check_name(); |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
Similar-sized and similar-named files are picked as suspicious candidates of |
45
|
|
|
|
|
|
|
duplicated files. |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
Please note that this version is deprecated. Future versions are released |
48
|
|
|
|
|
|
|
as File::Find::Similars. |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
=head1 DESCRIPTION |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
Extremely fast file similarity checker. It uses advanced soundex vector |
53
|
|
|
|
|
|
|
algorithm to determine the similarity between files. Generally it means that |
54
|
|
|
|
|
|
|
if there are n files, each having approximately m words, the degree of |
55
|
|
|
|
|
|
|
calculation is merely |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
O(n^2 * m) |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
which is over hundreds times faster than any existing file fingerprinting |
60
|
|
|
|
|
|
|
technology. |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
The self-test output will help you understand what the module do and what |
63
|
|
|
|
|
|
|
would you expect from the outcome. |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
$ make test |
66
|
|
|
|
|
|
|
PERL_DL_NONLAZY=1 /usr/bin/perl "-Iblib/lib" "-Iblib/arch" test.pl |
67
|
|
|
|
|
|
|
1..4 |
68
|
|
|
|
|
|
|
# Running under perl version 5.010000 for linux |
69
|
|
|
|
|
|
|
# Current time local: Wed Oct 29 11:35:06 2008 |
70
|
|
|
|
|
|
|
# Current time GMT: Wed Oct 29 15:35:06 2008 |
71
|
|
|
|
|
|
|
# Using Test.pm version 1.25 |
72
|
|
|
|
|
|
|
# Testing File::Searcher::Similars version 1.23 |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
== Testing 1, files under test/ subdir: |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
9 test/(eBook) GNU - Python Standard Library 2001.pdf |
77
|
|
|
|
|
|
|
3 test/CardLayoutTest.java |
78
|
|
|
|
|
|
|
5 test/GNU - 2001 - Python Standard Library.pdf |
79
|
|
|
|
|
|
|
4 test/GNU - Python Standard Library (2001).rar |
80
|
|
|
|
|
|
|
9 test/LayoutTest.java |
81
|
|
|
|
|
|
|
3 test/PopupTest.java |
82
|
|
|
|
|
|
|
2 test/Python Standard Library.zip |
83
|
|
|
|
|
|
|
5 test/TestLayout.java |
84
|
|
|
|
|
|
|
ok 1 |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
Note: |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
- The fileSimilars.pl script will pick out similar files from them in next test. |
89
|
|
|
|
|
|
|
- Let's assume that the number represent the file size in KB. |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
== Testing 2 result should be: |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
## ========= |
94
|
|
|
|
|
|
|
3 'CardLayoutTest.java' 'test/' |
95
|
|
|
|
|
|
|
5 'TestLayout.java' 'test/' |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
## ========= |
98
|
|
|
|
|
|
|
4 'GNU - Python Standard Library (2001).rar' 'test/' |
99
|
|
|
|
|
|
|
5 'GNU - 2001 - Python Standard Library.pdf' 'test/' |
100
|
|
|
|
|
|
|
ok 2 |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
Note: |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
- There are 2 groups of similar files picked out by the script. |
105
|
|
|
|
|
|
|
The second group makes more sense. |
106
|
|
|
|
|
|
|
- The similar files are picked because their file names looks similar. |
107
|
|
|
|
|
|
|
- However, the file size plays an important role as well. |
108
|
|
|
|
|
|
|
- There are 2 files in the second similar files group. |
109
|
|
|
|
|
|
|
- The file 'Python Standard Library.zip' is not considered to be similar to |
110
|
|
|
|
|
|
|
the group because its size is not similar to the group. |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
== Testing 3, if Python.zip is bigger, result should be: |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
## ========= |
115
|
|
|
|
|
|
|
3 'CardLayoutTest.java' 'test/' |
116
|
|
|
|
|
|
|
5 'TestLayout.java' 'test/' |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
## ========= |
119
|
|
|
|
|
|
|
4 'Python Standard Library.zip' 'test/' |
120
|
|
|
|
|
|
|
4 'GNU - Python Standard Library (2001).rar' 'test/' |
121
|
|
|
|
|
|
|
5 'GNU - 2001 - Python Standard Library.pdf' 'test/' |
122
|
|
|
|
|
|
|
ok 3 |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
Note: |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
- There are 3 files in the second similar files group. |
127
|
|
|
|
|
|
|
- The file 'Python Standard Library.zip' is now in the 2nd similar files |
128
|
|
|
|
|
|
|
group because its size is now similar to the group. |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
== Testing 4, if Python.zip is even bigger, result should be: |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
## ========= |
133
|
|
|
|
|
|
|
3 'CardLayoutTest.java' 'test/' |
134
|
|
|
|
|
|
|
5 'TestLayout.java' 'test/' |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
## ========= |
137
|
|
|
|
|
|
|
4 'GNU - Python Standard Library (2001).rar' 'test/' |
138
|
|
|
|
|
|
|
5 'GNU - 2001 - Python Standard Library.pdf' 'test/' |
139
|
|
|
|
|
|
|
6 'Python Standard Library.zip' 'test/' |
140
|
|
|
|
|
|
|
9 '(eBook) GNU - Python Standard Library 2001.pdf' 'test/' |
141
|
|
|
|
|
|
|
ok 4 |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
Note: |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
- There are 4 files in the second similar files group. |
146
|
|
|
|
|
|
|
- The file 'Python Standard Library.zip' is still in the group. |
147
|
|
|
|
|
|
|
- But this time, because it is also considered to be similar to the .pdf |
148
|
|
|
|
|
|
|
file (since their size are now similar, 6 vs 9), a 4th file the .pdf |
149
|
|
|
|
|
|
|
is now included in the 2nd group. |
150
|
|
|
|
|
|
|
- If the size of file 'Python Standard Library.zip' is 12(KB), then the |
151
|
|
|
|
|
|
|
second similar files group will be split into two. Do you know why and |
152
|
|
|
|
|
|
|
which files each group will contain? |
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
The File::Searcher::Similars package comes with a fully functional demo |
155
|
|
|
|
|
|
|
script fileSimilars.pl. Please refer to its help file for further |
156
|
|
|
|
|
|
|
explanations. |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
This package is highly customizable. Refer to hash variable %config and/or |
159
|
|
|
|
|
|
|
the 3 arrwash_ functions for customization hints. |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
=cut |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
# }}} |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
# {{{ global declaration: |
166
|
|
|
|
|
|
|
|
167
|
1
|
|
|
1
|
|
6314
|
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
101
|
|
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
require Exporter; |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
@ISA = qw(Exporter); |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
@EXPORT = qw( |
174
|
|
|
|
|
|
|
&similarity_check_name |
175
|
|
|
|
|
|
|
); |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
# ============================================================== &us === |
178
|
|
|
|
|
|
|
# ............................................................. Uses ... |
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
# -- global modules |
181
|
1
|
|
|
1
|
|
6
|
use strict; # ! |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
36
|
|
182
|
|
|
|
|
|
|
|
183
|
1
|
|
|
1
|
|
6
|
use Carp; |
|
1
|
|
|
|
|
6
|
|
|
1
|
|
|
|
|
69
|
|
184
|
1
|
|
|
1
|
|
1534
|
use Getopt::Long; |
|
1
|
|
|
|
|
15134
|
|
|
1
|
|
|
|
|
7
|
|
185
|
1
|
|
|
1
|
|
164
|
use File::Basename; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
111
|
|
186
|
1
|
|
|
1
|
|
1403
|
use Text::Soundex; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
# -- local modules |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
sub dbg_show {}; |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
# -- global variables |
193
|
|
|
|
|
|
|
use vars qw($progname $VERSION $verbose $debugging); |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
# ============================================================== &cs === |
196
|
|
|
|
|
|
|
# ................................................. Constant setting ... |
197
|
|
|
|
|
|
|
# |
198
|
|
|
|
|
|
|
$VERSION = sprintf("%d.%02d", q$Revision: 1.27 $ =~ /(\d+)\.(\d+)/); |
199
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
# ============================================================== &gv === |
202
|
|
|
|
|
|
|
# .................................................. Global Varibles ... |
203
|
|
|
|
|
|
|
# |
204
|
|
|
|
|
|
|
use vars qw(%config @filequeue @fileInfo %sdxCnt %wrdLst); |
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
$config{WeightSoundex} = 50; # precentage of weight that soundex takes, |
207
|
|
|
|
|
|
|
# the rest is for file size |
208
|
|
|
|
|
|
|
$config{Threshold} = 75; # over which files are considered similar |
209
|
|
|
|
|
|
|
$config{Deliminator} = "\n## =========\n"; |
210
|
|
|
|
|
|
|
$config{Format} = "%12d '%s' %s'%s'"; |
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
# @fileInfo: List of the following list: |
213
|
|
|
|
|
|
|
my ( |
214
|
|
|
|
|
|
|
$N_dName, # dir name |
215
|
|
|
|
|
|
|
$N_fName, # file name |
216
|
|
|
|
|
|
|
$N_fSize, # file size |
217
|
|
|
|
|
|
|
$N_fSdxl, # file soundex list, reference |
218
|
|
|
|
|
|
|
) = (0..9); |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
my $fc_level=0; |
221
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
# }}} |
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
# Preloaded methods go here. |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
# ############################################################## &ss ### |
228
|
|
|
|
|
|
|
# ................................................ Subroutions start ... |
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
231
|
|
|
|
|
|
|
# S - File::Searcher::Similars->init($fc_level, \@ARGV); |
232
|
|
|
|
|
|
|
# D - initialize file comparing level and dir queue |
233
|
|
|
|
|
|
|
# |
234
|
|
|
|
|
|
|
# T - |
235
|
|
|
|
|
|
|
sub init ($\@) { |
236
|
|
|
|
|
|
|
my ($mname, $_fc_level, $init_dirs) = @_; |
237
|
|
|
|
|
|
|
$fc_level = $_fc_level; # update module variable |
238
|
|
|
|
|
|
|
#warn "] $mname, $fc_level, $init_dirs\n"; |
239
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
@filequeue = @fileInfo = (); |
241
|
|
|
|
|
|
|
@filequeue = (@filequeue, map { [$_, ''] } @$init_dirs); |
242
|
|
|
|
|
|
|
process_entries(); |
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
dbg_show(100,"\@fileInfo", @fileInfo); |
245
|
|
|
|
|
|
|
dbg_show(100,"%sdxCnt", %sdxCnt); |
246
|
|
|
|
|
|
|
dbg_show(100,"%wrdLst", %wrdLst); |
247
|
|
|
|
|
|
|
} |
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
250
|
|
|
|
|
|
|
# I - Input: global array @filequeue |
251
|
|
|
|
|
|
|
# Input parameters: None |
252
|
|
|
|
|
|
|
# |
253
|
|
|
|
|
|
|
sub process_entries { |
254
|
|
|
|
|
|
|
my($dir, $qf) = (); |
255
|
|
|
|
|
|
|
#warn "] inside process_entries...\n"; |
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
while ($qf = shift @filequeue) { |
258
|
|
|
|
|
|
|
($dir, $_) = ($qf->[0], $qf->[1]); |
259
|
|
|
|
|
|
|
#warn "] inside process_entries loop, $dir, $_, ...\n"; |
260
|
|
|
|
|
|
|
next if /^..?$/; |
261
|
|
|
|
|
|
|
my $name = "$dir/$_"; |
262
|
|
|
|
|
|
|
#warn "] processing file '$name'.\n"; |
263
|
|
|
|
|
|
|
if ($name eq '-/') { |
264
|
|
|
|
|
|
|
# get info from stdin |
265
|
|
|
|
|
|
|
process_stdin(); |
266
|
|
|
|
|
|
|
} |
267
|
|
|
|
|
|
|
elsif (-d $name) { |
268
|
|
|
|
|
|
|
# a directory, process it recursively. |
269
|
|
|
|
|
|
|
process_dir($name); |
270
|
|
|
|
|
|
|
} |
271
|
|
|
|
|
|
|
else { |
272
|
|
|
|
|
|
|
process_file($dir, $_); |
273
|
|
|
|
|
|
|
} |
274
|
|
|
|
|
|
|
} |
275
|
|
|
|
|
|
|
} |
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
278
|
|
|
|
|
|
|
# D - Process info given from stdin, which should of form same as |
279
|
|
|
|
|
|
|
# find -printf "%p\t%s\n" |
280
|
|
|
|
|
|
|
# |
281
|
|
|
|
|
|
|
sub process_stdin { |
282
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
while (<>){ |
284
|
|
|
|
|
|
|
croak "Wrong input format: '$_'" unless m{(.*)/(.+?)\t(\d+)$}; |
285
|
|
|
|
|
|
|
my ($dn, $fn, $size) = ( $1, $2, $3 ); |
286
|
|
|
|
|
|
|
my $fSdxl = [ get_soundex($fn) ]; # file soundex list |
287
|
|
|
|
|
|
|
push @fileInfo, [ $dn, $fn, $size, $fSdxl, ]; |
288
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
dbg_show(100,"fileInfo",@fileInfo); |
290
|
|
|
|
|
|
|
map { $sdxCnt{$_}++ } @$fSdxl; |
291
|
|
|
|
|
|
|
} |
292
|
|
|
|
|
|
|
} |
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
295
|
|
|
|
|
|
|
# D - Process given dir recursively |
296
|
|
|
|
|
|
|
# N - BFS is more memory friendly than DFS |
297
|
|
|
|
|
|
|
# |
298
|
|
|
|
|
|
|
# T - $dir="/home/tong/tmp" |
299
|
|
|
|
|
|
|
sub process_dir { |
300
|
|
|
|
|
|
|
my($dir) = @_; |
301
|
|
|
|
|
|
|
#warn "] processing dir '$dir'...\n"; |
302
|
|
|
|
|
|
|
|
303
|
|
|
|
|
|
|
opendir(DIR,$dir) || die "File::Searcher::Similars error: Can't open $dir"; |
304
|
|
|
|
|
|
|
my @filenames = readdir(DIR); |
305
|
|
|
|
|
|
|
closedir(DIR); |
306
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
# record the dirname/fname pair to queue |
308
|
|
|
|
|
|
|
@filequeue = (@filequeue, map { [$dir, $_] } @filenames); |
309
|
|
|
|
|
|
|
dbg_show(100,"filequeue", @filequeue) |
310
|
|
|
|
|
|
|
} |
311
|
|
|
|
|
|
|
|
312
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
313
|
|
|
|
|
|
|
# S - process_file($dirname, $fname), process file $fname under $dirname |
314
|
|
|
|
|
|
|
# D - Process one file and update global vars |
315
|
|
|
|
|
|
|
# U - |
316
|
|
|
|
|
|
|
# |
317
|
|
|
|
|
|
|
# I - Input parameters: |
318
|
|
|
|
|
|
|
# $dirname: dir name string |
319
|
|
|
|
|
|
|
# $fname: file name string |
320
|
|
|
|
|
|
|
# O - Global vars get updated |
321
|
|
|
|
|
|
|
# fileInfo [ $dirname, $fname, $fsize, [ file_soundex ] ] |
322
|
|
|
|
|
|
|
# T - |
323
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
sub process_file { |
325
|
|
|
|
|
|
|
my ($dn, $fn) = @_; |
326
|
|
|
|
|
|
|
my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,@rest) = |
327
|
|
|
|
|
|
|
stat("$dn/$fn"); |
328
|
|
|
|
|
|
|
my $fSdxl = [ get_soundex($fn) ]; # file soundex list |
329
|
|
|
|
|
|
|
push @fileInfo, [ $dn, $fn, $size, $fSdxl, ]; |
330
|
|
|
|
|
|
|
|
331
|
|
|
|
|
|
|
dbg_show(100,"fileInfo",@fileInfo); |
332
|
|
|
|
|
|
|
map { $sdxCnt{$_}++ } @$fSdxl; |
333
|
|
|
|
|
|
|
} |
334
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
336
|
|
|
|
|
|
|
# S - get_soundex($fname), get soundex for file $fname |
337
|
|
|
|
|
|
|
# D - Return a list of soundex of each individual word in file name |
338
|
|
|
|
|
|
|
# U - $aref = [ get_soundex($fname) ]; |
339
|
|
|
|
|
|
|
# |
340
|
|
|
|
|
|
|
# I - Input parameters: |
341
|
|
|
|
|
|
|
# $fname: file name string |
342
|
|
|
|
|
|
|
# O - sorted anonymous soundex array w/ duplications removed |
343
|
|
|
|
|
|
|
# T - @out = get_soundex 'Java_RMI - _Remote_Method_Invocation_ch03.tgz'; |
344
|
|
|
|
|
|
|
# @out = get_soundex 'ASuchKindOfFile.tgz'; |
345
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
sub get_soundex { |
347
|
|
|
|
|
|
|
my ($fn) = @_; |
348
|
|
|
|
|
|
|
# split to individual words |
349
|
|
|
|
|
|
|
my @fn_wlist = split /[-_[:cntrl:][:blank:][:punct:][:digit:]]/i, $fn; |
350
|
|
|
|
|
|
|
# discards file extension, if any |
351
|
|
|
|
|
|
|
pop @fn_wlist if @fn_wlist >= 1; |
352
|
|
|
|
|
|
|
# if it is single word, try further decompose SuchKindOfWord |
353
|
|
|
|
|
|
|
@fn_wlist = $fn_wlist[0] =~ /[A-Z][^A-Z]*/g |
354
|
|
|
|
|
|
|
if @fn_wlist == 1 && $fn_wlist[0] =~ /^[A-Z]/; |
355
|
|
|
|
|
|
|
# wash short |
356
|
|
|
|
|
|
|
dbg_show(100,"wlist 0",@fn_wlist); |
357
|
|
|
|
|
|
|
@fn_wlist = arrwash_short(\@fn_wlist); |
358
|
|
|
|
|
|
|
dbg_show(100,"wlist 1",@fn_wlist); |
359
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
# language specific handling |
361
|
|
|
|
|
|
|
@fn_wlist = arrwash_lang(\@fn_wlist); |
362
|
|
|
|
|
|
|
dbg_show(100,"wlist 2",@fn_wlist); |
363
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
# change word to soundex, record soundex/word in global hash |
365
|
|
|
|
|
|
|
map { |
366
|
|
|
|
|
|
|
if (/[[:alpha:]]/) { |
367
|
|
|
|
|
|
|
my $sdx = soundex($_); |
368
|
|
|
|
|
|
|
$wrdLst{$sdx}{$_}++; |
369
|
|
|
|
|
|
|
s/^.*$/$sdx/; |
370
|
|
|
|
|
|
|
} |
371
|
|
|
|
|
|
|
} @fn_wlist; |
372
|
|
|
|
|
|
|
dbg_show(1,"wrdLst",%wrdLst); |
373
|
|
|
|
|
|
|
|
374
|
|
|
|
|
|
|
# wash empty/duplicates |
375
|
|
|
|
|
|
|
@fn_wlist = grep(!/^$/, @fn_wlist); |
376
|
|
|
|
|
|
|
@fn_wlist = arrwash_dup(\@fn_wlist); |
377
|
|
|
|
|
|
|
|
378
|
|
|
|
|
|
|
return sort @fn_wlist; |
379
|
|
|
|
|
|
|
} |
380
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
382
|
|
|
|
|
|
|
# S - arrwash_short($arr_ref), wash short from array $arr_ref |
383
|
|
|
|
|
|
|
# D - weed out empty lines and less-than-3-letter words (e.g. ch12) |
384
|
|
|
|
|
|
|
# U - @fn_wlist = arrwash_short(\@fn_wlist); |
385
|
|
|
|
|
|
|
# |
386
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
sub arrwash_short($) { |
388
|
|
|
|
|
|
|
my ($arr_ref) = @_; |
389
|
|
|
|
|
|
|
return @$arr_ref unless @$arr_ref >= 1; |
390
|
|
|
|
|
|
|
my @r= grep tr/a-zA-Z// >=3, @$arr_ref; |
391
|
|
|
|
|
|
|
return @r if @r; |
392
|
|
|
|
|
|
|
return @$arr_ref # for upper ASCII |
393
|
|
|
|
|
|
|
if grep(/[\200-\377]/, @$arr_ref); |
394
|
|
|
|
|
|
|
return @r; |
395
|
|
|
|
|
|
|
} |
396
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
398
|
|
|
|
|
|
|
# S - arrwash_dup($arr_ref), wash duplicates from array $arr_ref |
399
|
|
|
|
|
|
|
# D - weed out duplicates |
400
|
|
|
|
|
|
|
# U - @fn_wlist = arrwash_dup(\@fn_wlist); |
401
|
|
|
|
|
|
|
# |
402
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
sub arrwash_dup($) { |
404
|
|
|
|
|
|
|
my ($arr_ref) = @_; |
405
|
|
|
|
|
|
|
my %saw; |
406
|
|
|
|
|
|
|
return grep !$saw{$_}++, @$arr_ref; |
407
|
|
|
|
|
|
|
} |
408
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
410
|
|
|
|
|
|
|
# S - arrwash_lang($arr_ref), language specific washing from array $arr_ref |
411
|
|
|
|
|
|
|
# U - @fn_wlist = arrwash_lang(\@fn_wlist); |
412
|
|
|
|
|
|
|
# |
413
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
sub arrwash_lang($) { |
415
|
|
|
|
|
|
|
my ($arr_ref) = @_; |
416
|
|
|
|
|
|
|
|
417
|
|
|
|
|
|
|
# split Chinese into individual chars |
418
|
|
|
|
|
|
|
my @r; |
419
|
|
|
|
|
|
|
map { |
420
|
|
|
|
|
|
|
if (/[\200-\377]{2}/) { |
421
|
|
|
|
|
|
|
@r = (@r, /[\200-\377]{2}/g); |
422
|
|
|
|
|
|
|
} |
423
|
|
|
|
|
|
|
else { |
424
|
|
|
|
|
|
|
@r = (@r, $_); |
425
|
|
|
|
|
|
|
} |
426
|
|
|
|
|
|
|
} @$arr_ref; |
427
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
return @r; |
429
|
|
|
|
|
|
|
} |
430
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
432
|
|
|
|
|
|
|
# S - similarity_check_name: similarity check on glabal array @fileInfo |
433
|
|
|
|
|
|
|
# U - similarity_check_name(); |
434
|
|
|
|
|
|
|
# |
435
|
|
|
|
|
|
|
# I - Input parameters: None |
436
|
|
|
|
|
|
|
# O - similar files printed on stdout |
437
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
sub similarity_check_name { |
439
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
# get a ordered (by soundex count) file Info array |
441
|
|
|
|
|
|
|
# (Use short file names to compare to long file names) |
442
|
|
|
|
|
|
|
my @fileInfos = |
443
|
|
|
|
|
|
|
sort { $#{$a->[$N_fSdxl]} cmp $#{$b->[$N_fSdxl]} } @fileInfo; |
444
|
|
|
|
|
|
|
dbg_show(100,"\@fileInfos", @fileInfos); |
445
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
my @saw = (0) x ($#fileInfos+1); |
447
|
|
|
|
|
|
|
foreach my $ii (0..$#fileInfos) { |
448
|
|
|
|
|
|
|
#warn "] ii=$ii\n"; |
449
|
|
|
|
|
|
|
my @similar = (); |
450
|
|
|
|
|
|
|
my $fnl; |
451
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
dbg_show(100,"\@fileInfos", $fileInfos[$ii]); |
453
|
|
|
|
|
|
|
push @similar, [$ii, $ii, $fileInfos[$ii]->[$N_fSize] ]; |
454
|
|
|
|
|
|
|
foreach my $jj (($ii+1) ..$#fileInfos) { |
455
|
|
|
|
|
|
|
$fnl=0; # 0 is good enough since file at [ii] is |
456
|
|
|
|
|
|
|
# shorter in name than the one at [jj] |
457
|
|
|
|
|
|
|
#warn "] jj=$jj\n"; |
458
|
|
|
|
|
|
|
# don't care about same dir files? |
459
|
|
|
|
|
|
|
next |
460
|
|
|
|
|
|
|
if (!$fc_level && ($fileInfos[$ii]->[$N_dName] |
461
|
|
|
|
|
|
|
eq $fileInfos[$jj]->[$N_dName])) ; |
462
|
|
|
|
|
|
|
if (file_diff(\@fileInfos, $ii, $jj) >= $config{Threshold}) { |
463
|
|
|
|
|
|
|
push @similar, [$ii, $jj, $fileInfos[$jj]->[$N_fSize] ]; |
464
|
|
|
|
|
|
|
$fnl= length($fileInfos[$jj]->[$N_fName]) if |
465
|
|
|
|
|
|
|
$fnl < length($fileInfos[$jj]->[$N_fName]); |
466
|
|
|
|
|
|
|
} |
467
|
|
|
|
|
|
|
} |
468
|
|
|
|
|
|
|
dbg_show(100,"\@similar", @similar); |
469
|
|
|
|
|
|
|
# output unvisited potential similars by each row, order by fSize |
470
|
|
|
|
|
|
|
@similar = grep {!$saw[$_->[1]]} |
471
|
|
|
|
|
|
|
sort { $a->[2] <=> $b->[2] } @similar; |
472
|
|
|
|
|
|
|
next unless @similar>1; |
473
|
|
|
|
|
|
|
print $config{Deliminator}; |
474
|
|
|
|
|
|
|
foreach my $similar (@similar) { |
475
|
|
|
|
|
|
|
print file_info(\@fileInfos, $similar->[1], $fnl). "\n"; |
476
|
|
|
|
|
|
|
$saw[$similar->[1]]++; |
477
|
|
|
|
|
|
|
} |
478
|
|
|
|
|
|
|
} |
479
|
|
|
|
|
|
|
} |
480
|
|
|
|
|
|
|
|
481
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
482
|
|
|
|
|
|
|
sub file_info ($$$) { |
483
|
|
|
|
|
|
|
my ($fileInfos, $ndx, $fnl) = @_; |
484
|
|
|
|
|
|
|
return sprintf($config{Format}, $fileInfos->[$ndx]->[$N_fSize], |
485
|
|
|
|
|
|
|
$fileInfos->[$ndx]->[$N_fName], |
486
|
|
|
|
|
|
|
' ' x ($fnl - length($fileInfos->[$ndx]->[$N_fName])), |
487
|
|
|
|
|
|
|
"$fileInfos->[$ndx]->[$N_dName]"); |
488
|
|
|
|
|
|
|
} |
489
|
|
|
|
|
|
|
|
490
|
|
|
|
|
|
|
# =========================================================== &s-sub === |
491
|
|
|
|
|
|
|
# S - file_diff: determind how difference two files are by name & size |
492
|
|
|
|
|
|
|
# U - file_diff($fileInfos, $ndx1, $ndx2); |
493
|
|
|
|
|
|
|
# |
494
|
|
|
|
|
|
|
# I - $fileInfos: reference to @fileInfos |
495
|
|
|
|
|
|
|
# $ndx1, $ndx2: index to the two file in @fileInfos |
496
|
|
|
|
|
|
|
# O - 100%: files are identical |
497
|
|
|
|
|
|
|
# 0%: no similarity at all |
498
|
|
|
|
|
|
|
sub file_diff ($$$) { |
499
|
|
|
|
|
|
|
my ($fileInfos, $ndx1, $ndx2) = @_; |
500
|
|
|
|
|
|
|
|
501
|
|
|
|
|
|
|
return 0 unless @{$fileInfos->[$ndx1]->[$N_fSdxl]}; |
502
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
# find intersection in two soudex array |
504
|
|
|
|
|
|
|
my %count = (); |
505
|
|
|
|
|
|
|
foreach my $element |
506
|
|
|
|
|
|
|
(@{$fileInfos->[$ndx1]->[$N_fSdxl]}, |
507
|
|
|
|
|
|
|
@{$fileInfos->[$ndx2]->[$N_fSdxl]}) { $count{$element}++ } |
508
|
|
|
|
|
|
|
# since there is no duplication in each of file soudex |
509
|
|
|
|
|
|
|
my $intersection = |
510
|
|
|
|
|
|
|
grep $count{$_} > 1, keys %count; |
511
|
|
|
|
|
|
|
# return p * normal(\common soudex) + (1-p) * ( 1 - normal(\delta fSize)) |
512
|
|
|
|
|
|
|
# so the bigger the return value is, the similar the two files are |
513
|
|
|
|
|
|
|
$intersection *= $config{WeightSoundex} / |
514
|
|
|
|
|
|
|
(@{$fileInfos->[$ndx1]->[$N_fSdxl]}); |
515
|
|
|
|
|
|
|
dbg_show(100,"intersection", $intersection, $ndx1, $ndx2); |
516
|
|
|
|
|
|
|
my $WeightfSzie = 100 - $config{WeightSoundex}; |
517
|
|
|
|
|
|
|
my $dfSize = abs($fileInfos->[$ndx1]->[$N_fSize] - |
518
|
|
|
|
|
|
|
$fileInfos->[$ndx2]->[$N_fSize]) * $WeightfSzie / |
519
|
|
|
|
|
|
|
($fileInfos->[$ndx1]->[$N_fSize] + 1); |
520
|
|
|
|
|
|
|
$dfSize = $dfSize > $WeightfSzie ? $WeightfSzie : $dfSize; |
521
|
|
|
|
|
|
|
my $file_diff = $intersection + ($WeightfSzie - $dfSize); |
522
|
|
|
|
|
|
|
if ($file_diff >= $config{Threshold}) { |
523
|
|
|
|
|
|
|
dbg_show(010,"file_diff", |
524
|
|
|
|
|
|
|
@{$fileInfos->[$ndx1]}, |
525
|
|
|
|
|
|
|
@{$fileInfos->[$ndx2]}, |
526
|
|
|
|
|
|
|
$intersection, $dfSize, $file_diff |
527
|
|
|
|
|
|
|
); |
528
|
|
|
|
|
|
|
} |
529
|
|
|
|
|
|
|
return $file_diff; |
530
|
|
|
|
|
|
|
} |
531
|
|
|
|
|
|
|
|
532
|
|
|
|
|
|
|
|
533
|
|
|
|
|
|
|
1; |
534
|
|
|
|
|
|
|
__END__ |