line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# ABSTRACT: This is the application that gets run() by $bin/dupfind |
2
|
|
|
|
|
|
|
|
3
|
6
|
|
|
6
|
|
355630
|
use strict; |
|
6
|
|
|
|
|
15
|
|
|
6
|
|
|
|
|
194
|
|
4
|
6
|
|
|
6
|
|
32
|
use warnings; |
|
6
|
|
|
|
|
11
|
|
|
6
|
|
|
|
|
448
|
|
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
package App::dupfind::App; |
7
|
|
|
|
|
|
|
{ |
8
|
|
|
|
|
|
|
$App::dupfind::App::VERSION = '0.140230'; # TRIAL |
9
|
|
|
|
|
|
|
} |
10
|
|
|
|
|
|
|
|
11
|
6
|
|
|
6
|
|
21
|
BEGIN { select STDERR; $|++; select STDOUT; $|++; } |
|
6
|
|
|
|
|
18
|
|
|
6
|
|
|
|
|
14
|
|
|
6
|
|
|
|
|
128
|
|
12
|
|
|
|
|
|
|
|
13
|
6
|
|
|
6
|
|
155
|
use 5.010; |
|
6
|
|
|
|
|
18
|
|
|
6
|
|
|
|
|
218
|
|
14
|
|
|
|
|
|
|
|
15
|
6
|
|
|
6
|
|
7548
|
use Getopt::Long; |
|
6
|
|
|
|
|
90811
|
|
|
6
|
|
|
|
|
41
|
|
16
|
6
|
|
|
6
|
|
6989
|
use Benchmark ':hireswallclock'; |
|
6
|
|
|
|
|
54383
|
|
|
6
|
|
|
|
|
45
|
|
17
|
|
|
|
|
|
|
|
18
|
6
|
|
|
6
|
|
1034
|
use lib 'lib'; |
|
6
|
|
|
|
|
13
|
|
|
6
|
|
|
|
|
53
|
|
19
|
|
|
|
|
|
|
|
20
|
6
|
|
|
6
|
|
3283
|
use App::dupfind; |
|
6
|
|
|
|
|
22
|
|
|
6
|
|
|
|
|
239
|
|
21
|
6
|
|
|
6
|
|
2696
|
use App::dupfind::Threaded; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
use Moo; |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
exit run() unless caller; |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
has opts => |
28
|
|
|
|
|
|
|
( |
29
|
|
|
|
|
|
|
is => 'rw', |
30
|
|
|
|
|
|
|
required => 1, |
31
|
|
|
|
|
|
|
builder => '_build_opts' |
32
|
|
|
|
|
|
|
); |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
has metrics => |
35
|
|
|
|
|
|
|
( |
36
|
|
|
|
|
|
|
is => 'rw', |
37
|
|
|
|
|
|
|
required => 1, |
38
|
|
|
|
|
|
|
builder => sub |
39
|
|
|
|
|
|
|
{ |
40
|
|
|
|
|
|
|
{ |
41
|
|
|
|
|
|
|
scan_count => 0, |
42
|
|
|
|
|
|
|
size_dup_count => 0, |
43
|
|
|
|
|
|
|
real_dup_count => 0, |
44
|
|
|
|
|
|
|
} |
45
|
|
|
|
|
|
|
} |
46
|
|
|
|
|
|
|
); |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
has benchmarks => |
49
|
|
|
|
|
|
|
( |
50
|
|
|
|
|
|
|
is => 'rw', |
51
|
|
|
|
|
|
|
required => 1, |
52
|
|
|
|
|
|
|
builder => sub |
53
|
|
|
|
|
|
|
{ |
54
|
|
|
|
|
|
|
{ |
55
|
|
|
|
|
|
|
scanfs => { }, |
56
|
|
|
|
|
|
|
prune => { }, |
57
|
|
|
|
|
|
|
weed => { }, |
58
|
|
|
|
|
|
|
digest => { }, |
59
|
|
|
|
|
|
|
remove => { }, |
60
|
|
|
|
|
|
|
run => { }, |
61
|
|
|
|
|
|
|
} |
62
|
|
|
|
|
|
|
} |
63
|
|
|
|
|
|
|
); |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
has deduper => |
66
|
|
|
|
|
|
|
( |
67
|
|
|
|
|
|
|
is => 'ro', |
68
|
|
|
|
|
|
|
required => 1, |
69
|
|
|
|
|
|
|
lazy => 1, |
70
|
|
|
|
|
|
|
builder => sub |
71
|
|
|
|
|
|
|
{ |
72
|
|
|
|
|
|
|
my $self = shift; |
73
|
|
|
|
|
|
|
my $ddclass = $self->opts->{threads} |
74
|
|
|
|
|
|
|
? 'App::dupfind::Threaded' |
75
|
|
|
|
|
|
|
: 'App::dupfind'; |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
$ddclass->new( opts => $self->opts ); |
78
|
|
|
|
|
|
|
} |
79
|
|
|
|
|
|
|
); |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
sub BUILD |
83
|
|
|
|
|
|
|
{ |
84
|
|
|
|
|
|
|
my $self = shift; |
85
|
|
|
|
|
|
|
my $opts = $self->opts; |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
$opts->{threads} //= 0; |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
$opts->{progress} = 1 if $opts->{verbose}; |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
$opts->{remove} = 1 if $opts->{prompt}; |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
$opts->{weed} = 0 if $opts->{weed} =~ /^(n|no)$/i; |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
$opts->{weed} = 1 if !! @{ $opts->{wpass} }; |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
$opts->{wpass} = [ 'first_middle_last' ] |
98
|
|
|
|
|
|
|
if $opts->{weed} && ! @{ $opts->{wpass} }; |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
die 'Bogus cache parameters provided! Just use the defaults buddy.' |
101
|
|
|
|
|
|
|
if $opts->{cachestop} < 0 || $opts->{ramcache} < 0; |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
die 'Invalid cachestop size; must be a small fraction of total ramcache.' |
104
|
|
|
|
|
|
|
if $opts->{ramcache} && $opts->{cachestop} > ( $opts->{ramcache} / 100 ); |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
$opts->{cachesize} = int ( $opts->{ramcache} / $opts->{cachestop} ); |
107
|
|
|
|
|
|
|
} |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
sub _build_opts |
110
|
|
|
|
|
|
|
{ |
111
|
|
|
|
|
|
|
my $self = shift; |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
my $opts = |
114
|
|
|
|
|
|
|
{ |
115
|
|
|
|
|
|
|
bytes => 1024 ** 3, # 1 GB max read |
116
|
|
|
|
|
|
|
cachestop => ( 1024 ** 2 ) * 2, # 2 MB file size limit on cache candidacy |
117
|
|
|
|
|
|
|
dir => undef, |
118
|
|
|
|
|
|
|
format => 'human', # options are "human" or "robot" |
119
|
|
|
|
|
|
|
help => undef, |
120
|
|
|
|
|
|
|
links => 0, |
121
|
|
|
|
|
|
|
maxdepth => 50, |
122
|
|
|
|
|
|
|
progress => 0, |
123
|
|
|
|
|
|
|
prompt => 0, |
124
|
|
|
|
|
|
|
quiet => 0, |
125
|
|
|
|
|
|
|
ramcache => ( 1024 ** 2 ) * 300, # 300 MB default cache size |
126
|
|
|
|
|
|
|
remove => 0, |
127
|
|
|
|
|
|
|
threads => 0, |
128
|
|
|
|
|
|
|
verbose => 0, |
129
|
|
|
|
|
|
|
weed => 1, |
130
|
|
|
|
|
|
|
wpass => [ ], |
131
|
|
|
|
|
|
|
wpsize => 32, |
132
|
|
|
|
|
|
|
}; |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
GetOptions |
135
|
|
|
|
|
|
|
( |
136
|
|
|
|
|
|
|
'bytes|b=i' => \$opts->{bytes}, |
137
|
|
|
|
|
|
|
'cachestop=i' => \$opts->{cachestop}, |
138
|
|
|
|
|
|
|
'dir|d=s' => \$opts->{dir}, |
139
|
|
|
|
|
|
|
'format|f=s' => \$opts->{format}, |
140
|
|
|
|
|
|
|
'help|h|?' => \$opts->{help}, |
141
|
|
|
|
|
|
|
'links|l' => \$opts->{links}, |
142
|
|
|
|
|
|
|
'maxdepth|m=i' => \$opts->{maxdepth}, |
143
|
|
|
|
|
|
|
'progress' => \$opts->{progress}, |
144
|
|
|
|
|
|
|
'prompt|p' => \$opts->{prompt}, |
145
|
|
|
|
|
|
|
'quiet|q' => \$opts->{quiet}, |
146
|
|
|
|
|
|
|
'ramcache=i' => \$opts->{ramcache}, |
147
|
|
|
|
|
|
|
'remove|x' => \$opts->{remove}, |
148
|
|
|
|
|
|
|
'threads|t=i' => \$opts->{threads}, |
149
|
|
|
|
|
|
|
'verbose|v' => \$opts->{verbose}, |
150
|
|
|
|
|
|
|
'weedout|w=s' => \$opts->{weed}, |
151
|
|
|
|
|
|
|
'wpass=s' => $opts->{wpass}, |
152
|
|
|
|
|
|
|
'wpsize=i' => \$opts->{wpsize}, |
153
|
|
|
|
|
|
|
) or exit _usage(); |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
exit _usage() unless defined $opts->{dir}; |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
return $opts; |
158
|
|
|
|
|
|
|
} |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
sub _usage |
161
|
|
|
|
|
|
|
{ |
162
|
|
|
|
|
|
|
# This is just the help message: |
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
require Pod::Usage; |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
Pod::Usage::pod2usage( { -exitval => 1, -verbose => 2 } ) |
167
|
|
|
|
|
|
|
} |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
sub _bench_this |
170
|
|
|
|
|
|
|
{ |
171
|
|
|
|
|
|
|
my ( $self, $mark, $event ) = @_; |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
$self->benchmarks->{ $mark }->{ $event } = Benchmark->new(); |
174
|
|
|
|
|
|
|
} |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
sub _calculate_bench_times |
177
|
|
|
|
|
|
|
{ |
178
|
|
|
|
|
|
|
my $self = shift; |
179
|
|
|
|
|
|
|
my $benchmarks = $self->benchmarks; |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
for my $mark ( keys %$benchmarks ) |
182
|
|
|
|
|
|
|
{ |
183
|
|
|
|
|
|
|
next unless $benchmarks->{ $mark }->{start}; |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
$self->benchmarks->{ $mark }->{result} = |
186
|
|
|
|
|
|
|
timestr timediff |
187
|
|
|
|
|
|
|
( |
188
|
|
|
|
|
|
|
$benchmarks->{ $mark }->{end}, |
189
|
|
|
|
|
|
|
$benchmarks->{ $mark }->{start} |
190
|
|
|
|
|
|
|
); |
191
|
|
|
|
|
|
|
} |
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
$self->benchmarks->{weed}->{result} ||= 'did not weed'; |
194
|
|
|
|
|
|
|
$self->benchmarks->{remove}->{result} ||= 'no deletions'; |
195
|
|
|
|
|
|
|
} |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
sub _run_summary |
198
|
|
|
|
|
|
|
{ |
199
|
|
|
|
|
|
|
my $self = shift; |
200
|
|
|
|
|
|
|
my $opts = $self->opts; |
201
|
|
|
|
|
|
|
my $benchmarks = $self->benchmarks; |
202
|
|
|
|
|
|
|
my $metrics = $self->metrics; |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
my ( $cache_hits, $cache_misses ) = $self->deduper->cache_stats; |
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
$self->_stderr( <<__SUMMARY__ ); |
207
|
|
|
|
|
|
|
------------------------------ |
208
|
|
|
|
|
|
|
** THREADS...............$opts->{threads} |
209
|
|
|
|
|
|
|
** RAM CACHE.............$opts->{ramcache} bytes |
210
|
|
|
|
|
|
|
** CACHE HITS/MISSES.....$cache_hits/$cache_misses |
211
|
|
|
|
|
|
|
** TOTAL FILES SCANNED...$metrics->{scan_count} |
212
|
|
|
|
|
|
|
** TOTAL SAME SIZE.......$metrics->{size_dup_count} |
213
|
|
|
|
|
|
|
** TOTAL ACTUAL DUPES....$metrics->{real_dup_count} |
214
|
|
|
|
|
|
|
-- TIMES -- |
215
|
|
|
|
|
|
|
** TREE SCAN TIME........$benchmarks->{scanfs}->{result} |
216
|
|
|
|
|
|
|
** HARDLINK PRUNE TIME...$benchmarks->{prune}->{result} |
217
|
|
|
|
|
|
|
** WEED-OUT TIME.........$benchmarks->{weed}->{result} |
218
|
|
|
|
|
|
|
** CRYPTO-HASHING TIME...$benchmarks->{digest}->{result} |
219
|
|
|
|
|
|
|
** DELETION TIME.........$benchmarks->{remove}->{result} |
220
|
|
|
|
|
|
|
** TOTAL RUN TIME........$benchmarks->{run}->{result} |
221
|
|
|
|
|
|
|
__SUMMARY__ |
222
|
|
|
|
|
|
|
} |
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
sub run |
225
|
|
|
|
|
|
|
{ |
226
|
|
|
|
|
|
|
my $self = __PACKAGE__->new(); |
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
$self->_bench_this( run => 'start' ); |
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
my( $size_dups, $pruned_dups, $weeded_dups, $digest_dups ); |
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
$size_dups = $self->scanfs; |
233
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
$pruned_dups = $self->prune( $size_dups ); |
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
$weeded_dups = $self->weed( $pruned_dups ) if $self->opts->{weed}; |
237
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
$digest_dups = $self->digest( $weeded_dups // $pruned_dups ); |
239
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
$self->metrics->{real_dup_count} = $self->deduper->count_dups( $digest_dups ); |
241
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
$self->_stderr( '** DISPLAYING OUTPUT', '-' x 30 ); |
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
$self->deduper->show_dups( $digest_dups ); |
245
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
$self->remove( $digest_dups ) if $self->opts->{remove}; |
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
$self->_bench_this( run => 'end' ); |
249
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
$self->_calculate_bench_times; |
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
$self->_run_summary; |
253
|
|
|
|
|
|
|
} |
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
sub scanfs |
256
|
|
|
|
|
|
|
{ |
257
|
|
|
|
|
|
|
my $self = shift; |
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
$self->_stderr( '** SCANNING ALL FILES FOR SIZE DUPLICATES' ); |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
$self->_bench_this( scanfs => 'start' ); |
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
my ( $size_dups, $scan_ct, $size_dup_ct ) = $self->deduper->get_size_dups(); |
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
$self->metrics->{scan_count} = $scan_ct; |
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
$self->metrics->{size_dup_count} = $size_dup_ct; |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
$self->_bench_this( scanfs => 'end' ); |
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
say '** NO DUPLICATES FOUND' and exit unless keys %$size_dups; |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
return $size_dups; |
274
|
|
|
|
|
|
|
} |
275
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
sub prune |
277
|
|
|
|
|
|
|
{ |
278
|
|
|
|
|
|
|
my ( $self, $size_dups ) = @_; |
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
$self->_stderr( '** PRUNING HARD LINKS' ); |
281
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
$self->_bench_this( prune => 'start' ); |
283
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
$size_dups = $self->deduper->toss_out_hardlinks( $size_dups ); |
285
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
$self->_bench_this( prune => 'end' ); |
287
|
|
|
|
|
|
|
|
288
|
|
|
|
|
|
|
say '** NO DUPLICATES FOUND' and exit unless keys %$size_dups; |
289
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
return $size_dups; |
291
|
|
|
|
|
|
|
} |
292
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
sub weed |
294
|
|
|
|
|
|
|
{ |
295
|
|
|
|
|
|
|
my ( $self, $size_dups ) = @_; |
296
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
$self->_stderr( '** WEEDING-OUT FILES THAT ARE OBVIOUSLY DIFFERENT' ); |
298
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
$self->_bench_this( weed => 'start' ); |
300
|
|
|
|
|
|
|
|
301
|
|
|
|
|
|
|
my $weeded_dups = $self->deduper->weed_dups( $size_dups ); |
302
|
|
|
|
|
|
|
|
303
|
|
|
|
|
|
|
$self->_bench_this( weed => 'end' ); |
304
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
say '** NO DUPLICATES FOUND' and exit unless keys %$weeded_dups; |
306
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
return $weeded_dups; |
308
|
|
|
|
|
|
|
} |
309
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
sub digest |
311
|
|
|
|
|
|
|
{ |
312
|
|
|
|
|
|
|
my ( $self, $size_dups ) = @_; |
313
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
$self->_stderr( '** CHECKSUMMING SIZE DUPLICATES' ); |
315
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
$self->_bench_this( digest => 'start' ); |
317
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
my $digest_dups = $self->deduper->digest_dups( $size_dups ); |
319
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
$self->_bench_this( digest => 'end' ); |
321
|
|
|
|
|
|
|
|
322
|
|
|
|
|
|
|
say '** NO DUPLICATES FOUND' and exit unless keys %$digest_dups; |
323
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
return $digest_dups; |
325
|
|
|
|
|
|
|
} |
326
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
sub remove |
328
|
|
|
|
|
|
|
{ |
329
|
|
|
|
|
|
|
my ( $self, $digests ) = @_; |
330
|
|
|
|
|
|
|
|
331
|
|
|
|
|
|
|
$self->_bench_this( remove => 'start' ); |
332
|
|
|
|
|
|
|
|
333
|
|
|
|
|
|
|
$self->deduper->delete_dups( $digests ); |
334
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
$self->_bench_this( remove => 'end' ); |
336
|
|
|
|
|
|
|
} |
337
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
sub _stderr { return if shift->opts->{quiet}; warn "$_\n" for @_ }; |
339
|
|
|
|
|
|
|
|
340
|
|
|
|
|
|
|
__PACKAGE__->meta->make_immutable; |
341
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
1; |
343
|
|
|
|
|
|
|
|
344
|
|
|
|
|
|
|
__END__ |