line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# ABSTRACT: Public methods for the App::dupfind deduplication engine |
2
|
|
|
|
|
|
|
|
3
|
8
|
|
|
8
|
|
4197
|
use strict; |
|
8
|
|
|
|
|
14
|
|
|
8
|
|
|
|
|
268
|
|
4
|
8
|
|
|
8
|
|
36
|
use warnings; |
|
8
|
|
|
|
|
12
|
|
|
8
|
|
|
|
|
349
|
|
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
package App::dupfind::Common;
{
  $App::dupfind::Common::VERSION = '0.140230'; # TRIAL
}

use 5.010;

use Moo;

# xxHash is a fast non-cryptographic hash; used for content digesting below
use Digest::xxHash 'xxhash_hex';

# NOTE(review): 'use lib "lib"' inside an installed module is unusual —
# it only helps when running from an unpacked source tree; confirm intent
use lib 'lib';

# Parsed command-line/config options; supplied by the caller at construction
has opts => ( is => 'ro', required => 1 );

# Private guts of the engine (ftl, stats, _plan_weed_passes, _do_weed_pass, ...)
with 'App::dupfind::Guts';

# Lazily load the progress-bar module only when --progress was requested,
# and only right before the methods that actually display one
before [ qw/ weed_dups digest_dups / ] => sub
{
   require Term::ProgressBar if shift->opts->{progress}
};

# Lazily load Term::Prompt right before interactive deletion
before delete_dups => sub { require Term::Prompt };
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
sub count_dups
{
   # Tally the total number of files held across every dup grouping.
   # $dups is a hashref of identifier => arrayref-of-filenames.

   my ( $self, $dups ) = @_;

   my $total = 0;

   $total += scalar @$_ for values %$dups;

   return $total;
}
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
sub get_size_dups
{
   # Recursively scan $self->opts->{dir}, bucketing regular files by byte
   # size.  Returns ( $size_dups, $scan_count, $dup_count ) where
   # $size_dups maps size => [ files ], buckets with only one member
   # having been discarded (a unique size can't be a duplicate).

   my $self = shift;

   my $size_dups  = {};
   my $scan_count = 0;

   $self->ftl->list_dir
   (
      $self->opts->{dir} =>
      {
         recurse  => 1,
         callback => sub
         {
            # callback args are ( $selfdir, $subdirs, $files );
            # only the file list is needed here
            my $found = $_[2];

            $scan_count += @$found;

            for my $path ( @$found )
            {
               # skip symlinks and anything without a defined size
               next if -l $path || !defined -s $path;

               push @{ $size_dups->{ -s $path } }, $path;
            }
         }
      }
   );

   # a size seen exactly once can't have duplicates — drop the bucket
   for my $size ( keys %$size_dups )
   {
      delete $size_dups->{ $size } if @{ $size_dups->{ $size } } == 1;
   }

   return $size_dups, $scan_count, $self->count_dups( $size_dups );
}
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
sub toss_out_hardlinks
{
   # Collapse hardlinked paths within each size bucket: files sharing a
   # device+inode pair are the same physical file, so only one name per
   # pair is kept.  Buckets reduced to a single physical file are removed.

   my ( $self, $size_dups ) = @_;

   for my $size ( keys %$size_dups )
   {
      my %seen_inode;

      # reverse-sorted iteration means the asciibetically-first name is
      # assigned last and therefore survives for each device+inode pair
      for my $path ( reverse sort @{ $size_dups->{ $size } } )
      {
         $seen_inode{ join '', ( stat $path )[ 0, 1 ] } = $path;
      }

      if ( keys %seen_inode == 1 )
      {
         # everything in this bucket was one physical file — not a dup
         delete $size_dups->{ $size };
      }
      else
      {
         $size_dups->{ $size } = [ values %seen_inode ];
      }
   }

   return $size_dups;
}
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
sub weed_dups
{
   # Run the planned cheap "weed-out" passes over the size buckets to
   # eliminate obvious non-duplicates before expensive hashing.

   my ( $self, $size_dups ) = @_;

   # zero-byte files can't be weeded by content sampling; set them aside
   # and restore them afterward
   my $empties = delete $size_dups->{0};

   my $pass_number = 0;

   for my $weeder ( $self->_plan_weed_passes )
   {
      $self->_do_weed_pass( $size_dups, $weeder, ++$pass_number );
   }

   $size_dups->{0} = $empties if ref $empties;

   return $size_dups;
}
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
sub digest_dups
{
   # Hash every remaining candidate file with xxHash and regroup by digest.
   # Input: $size_dups as size => [ files ].  Returns a hashref of
   # digest => [ files ], with unique-digest entries removed.  Optionally
   # keeps an in-RAM content => digest cache (per size grouping) and a
   # Term::ProgressBar display, both driven by $self->opts.

   my ( $self, $size_dups ) = @_;

   my ( $digests, $progress, $i ) = ( {}, undef, 0 );

   # cache maps raw file CONTENT to its digest; bounded by cachesize
   # (entry count) and cachestop (max file size worth caching)
   my $digest_cache = {};
   my $cache_stop = $self->opts->{cachestop};
   my $max_cache = $self->opts->{cachesize};
   my $ram_caching = !! $self->opts->{ramcache};
   my $cache_size = 0;
   my $cache_hits = 0;
   my $cache_misses = 0;

   # don't bother to hash zero-size files
   $digests->{ xxhash_hex '', 0 } = delete $size_dups->{0}
      if exists $size_dups->{0};

   if ( $self->opts->{progress} )
   {
      # progress bar sized to the total number of files to be hashed;
      # Term::ProgressBar was require'd by the 'before' hook
      my $dup_count = $self->count_dups( $size_dups );

      $progress = Term::ProgressBar->new
      (
         {
            name => ' ...PROGRESS',
            count => $dup_count,
            remove => 1,
         }
      );
   }

   # slurp mode for the reads below (scoped to this sub via local)
   local $/;

   SIZES: for my $size ( keys %$size_dups )
   {
      my $group = $size_dups->{ $size };

      GROUPING: for my $file ( @$group )
      {
         my $digest;

         # unreadable/vanished files are silently skipped by design
         open my $fh, '<', $file or next;

         my $data = <$fh>;

         close $fh;

         if ( $ram_caching )
         {
            # NOTE(review): truthiness test is safe only because xxhash_hex
            # returns a fixed-width hex string (never "" or "0") — confirm
            if ( $digest = $digest_cache->{ $data } )
            {
               $cache_hits++;
            }
            else
            {
               # cache only while under the entry budget and for files
               # small enough to be worth holding in RAM
               if ( $cache_size < $max_cache && $size <= $cache_stop )
               {
                  $digest_cache->{ $data } = $digest = xxhash_hex $data, 0;

                  $cache_size++;

                  $cache_misses++;
               }
               else
               {
                  $digest = xxhash_hex $data, 0;
               }
            }
         }
         else
         {
            $digest = xxhash_hex $data, 0;
         }

         push @{ $digests->{ $digest } }, $file;

         $progress->update( ++$i ) if $progress;
      }

      # same-content files can only share a size, so the cache is reset
      # between size groupings to bound memory use
      $digest_cache = {}; # it's only worthwhile per-size-grouping
      $cache_size = 0;
   }

   # a digest seen exactly once is not a duplicate — drop it
   delete $digests->{ $_ }
      for grep { @{ $digests->{ $_ } } == 1 }
         keys %$digests;

   $self->stats->{cache_hits} = $cache_hits;
   $self->stats->{cache_misses} = $cache_misses;

   return $digests;
}
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
sub sort_dups
{
   # Asciibetically order the members of every dup grouping, in place.
   # Returns the same hashref for caller convenience.

   my ( $self, $dups ) = @_;

   $dups->{ $_ } = [ sort @{ $dups->{ $_ } } ] for keys %$dups;

   return $dups;
}
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
sub show_dups # also calls $self->sort_dups before displaying output
{
   # Print every duplicate grouping, either human-readable or as
   # tab-separated machine output, and return the number of duplicate
   # files (group sizes minus one original each).

   my ( $self, $digests ) = @_;

   my $dupes = 0;

   $digests = $self->sort_dups( $digests );

   # human-readable output
   my $human_formatter = sub
   {
      my ( $digest, $files ) = @_;

      say sprintf 'DUPLICATES (digest: %s | size: %db)', $digest, -s $$files[0];

      say " $_" for @$files;

      say '';
   };

   # machine parseable output
   my $robot_formatter = sub
   {
      my $files = pop;

      say join "\t", @$files;
   };

   my $formatter = $self->opts->{format} eq 'human'
      ? $human_formatter
      : $robot_formatter;

   # groups ordered by their (already sorted) first member's name
   my @ordered = sort { $digests->{ $a }->[0] cmp $digests->{ $b }->[0] }
                 keys %$digests;

   for my $digest ( @ordered )
   {
      my $files = $digests->{ $digest };

      $formatter->( $digest => $files );

      $dupes += @$files - 1;
   }

   return $dupes;
}
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
sub delete_dups
{
   # Remove duplicate files, keeping the first (sorted) member of each
   # digest grouping as the original.  With opts->{prompt}, asks for
   # per-file confirmation via Term::Prompt (loaded by the 'before' hook).

   my ( $self, $digests ) = @_;

   my $removed = 0;

   for my $digest ( keys %$digests )
   {
      my $group = $digests->{ $digest };

      say sprintf 'ORIGINAL (%s) %s', $digest, $group->[0];

      # the first entry survives; everything after it is a removable dupe
      shift @$group;

      DUPE: for my $dup ( @$group )
      {
         if ( $self->opts->{prompt} )
         {
            unless ( Term::Prompt::prompt( 'y', "REMOVE DUPE? $dup", '', 'n' ) )
            {
               say sprintf 'KEPT (%s) %s', $digest, $dup;

               next DUPE;
            }
         }

         unless ( unlink $dup )
         {
            warn "COULD NOT REMOVE $dup! $!";

            next DUPE;
         }

         $removed++;

         say sprintf 'REMOVED (%s) %s', $digest, $dup;
      }

      say '--';
   }

   say "** TOTAL DUPLICATE FILES REMOVED: $removed";
}
304
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
sub cache_stats
{
   # Return the digest-cache hit/miss counters recorded by digest_dups,
   # hits first — callers unpack positionally.

   my $self = shift;

   my $stats = $self->stats;

   return @{ $stats }{ 'cache_hits', 'cache_misses' };
}
312
|
|
|
|
|
|
|
|
313
|
0
|
0
|
|
0
|
1
|
|
sub say_stderr
{
   # Emit each message on STDERR (one per line) unless --quiet is set.
   my $self = shift;

   return if $self->opts->{quiet};

   warn "$_\n" for @_;
}
|
0
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
|
315
|
|
|
|
|
|
|
# NOTE(review): calling ->meta->make_immutable on a Moo class triggers
# inflation to Moose (requires Moose at runtime); plain Moo classes do not
# need this — confirm Moose is an intended dependency before relying on it
__PACKAGE__->meta->make_immutable;

1;

__END__