| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package File::Dedup; |
|
2
|
|
|
|
|
|
|
# ABSTRACT: Deduplicate files across directories |
|
3
|
|
|
|
|
|
|
$File::Dedup::VERSION = '0.007'; |
|
4
|
5
|
|
|
5
|
|
2852
|
use strict; |
|
|
5
|
|
|
|
|
7
|
|
|
|
5
|
|
|
|
|
145
|
|
|
5
|
5
|
|
|
5
|
|
19
|
use warnings; |
|
|
5
|
|
|
|
|
8
|
|
|
|
5
|
|
|
|
|
129
|
|
|
6
|
|
|
|
|
|
|
|
|
7
|
5
|
|
|
5
|
|
4134
|
use Digest::SHA; |
|
|
5
|
|
|
|
|
17331
|
|
|
|
5
|
|
|
|
|
298
|
|
|
8
|
5
|
|
|
5
|
|
43
|
use feature qw(say); |
|
|
5
|
|
|
|
|
7
|
|
|
|
5
|
|
|
|
|
2455
|
|
|
9
|
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
my @VALID_OPTIONS = qw(ask directory group recursive); |
|
11
|
|
|
|
|
|
|
sub new { |
|
12
|
14
|
|
|
14
|
0
|
12528
|
my ($class, %opts) = @_; |
|
13
|
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
die "Must pass a directory to process" |
|
15
|
14
|
100
|
|
|
|
54
|
unless exists $opts{directory}; |
|
16
|
|
|
|
|
|
|
die "Supplied directory argument '$opts{directory}' is not a directory" |
|
17
|
13
|
100
|
|
|
|
179
|
unless -d $opts{directory}; |
|
18
|
|
|
|
|
|
|
warn "Supplied option 'group' not implemented yet" |
|
19
|
12
|
100
|
66
|
|
|
135
|
if exists $opts{group} and defined $opts{group}; |
|
20
|
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
# do not allow undefined options |
|
22
|
12
|
|
|
|
|
39
|
foreach my $opt ( keys %opts ) { |
|
23
|
|
|
|
|
|
|
die "Invalid argument '$opt' passed to new" |
|
24
|
23
|
100
|
|
|
|
29
|
unless grep { $_ eq $opt } @VALID_OPTIONS; |
|
|
92
|
|
|
|
|
140
|
|
|
25
|
|
|
|
|
|
|
} |
|
26
|
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
# default to always asking before purging |
|
28
|
|
|
|
|
|
|
$opts{ask} = 1 |
|
29
|
11
|
100
|
66
|
|
|
61
|
unless exists $opts{ask} && defined $opts{ask}; |
|
30
|
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
# default to non-recursive |
|
32
|
|
|
|
|
|
|
$opts{recursive} = 0 |
|
33
|
11
|
100
|
66
|
|
|
45
|
unless exists $opts{recursive} && defined $opts{recursive}; |
|
34
|
|
|
|
|
|
|
|
|
35
|
11
|
|
|
|
|
37
|
return bless \%opts, $class; |
|
36
|
|
|
|
|
|
|
} |
|
37
|
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
sub directory { |
|
39
|
2
|
|
|
2
|
1
|
18
|
return shift->{directory}; |
|
40
|
|
|
|
|
|
|
} |
|
41
|
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
sub recursive { |
|
43
|
2
|
|
|
2
|
1
|
16
|
return shift->{recursive}; |
|
44
|
|
|
|
|
|
|
} |
|
45
|
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
sub ask { |
|
47
|
15
|
|
|
15
|
1
|
70
|
return shift->{ask}; |
|
48
|
|
|
|
|
|
|
} |
|
49
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
sub group { |
|
51
|
0
|
|
|
0
|
1
|
0
|
return shift->{group}; |
|
52
|
|
|
|
|
|
|
} |
|
53
|
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
sub _file_digest { |
|
55
|
3
|
|
|
3
|
|
3
|
my ($filename) = @_; |
|
56
|
|
|
|
|
|
|
|
|
57
|
3
|
50
|
|
|
|
61
|
open my $fh, '<', $filename |
|
58
|
|
|
|
|
|
|
or die "$!"; |
|
59
|
|
|
|
|
|
|
|
|
60
|
3
|
|
|
|
|
15
|
my $checksum = Digest::SHA->new->addfile($fh)->hexdigest; |
|
61
|
3
|
|
|
|
|
128
|
close($fh); |
|
62
|
|
|
|
|
|
|
|
|
63
|
3
|
|
|
|
|
18
|
return $checksum; |
|
64
|
|
|
|
|
|
|
} |
|
65
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
sub dedup { |
|
67
|
1
|
|
|
1
|
1
|
5
|
my ($self) = @_; |
|
68
|
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
my @results = $self->_dirwalk( |
|
70
|
|
|
|
|
|
|
$self->directory, |
|
71
|
3
|
|
|
3
|
|
9
|
sub { [ $_[0], _file_digest($_[0]) ] }, |
|
72
|
1
|
|
|
1
|
|
1
|
sub { shift; @_ } |
|
|
1
|
|
|
|
|
12
|
|
|
73
|
1
|
|
|
|
|
4
|
); |
|
74
|
5
|
|
|
5
|
|
6705
|
use Data::Dumper; |
|
|
5
|
|
|
|
|
39221
|
|
|
|
5
|
|
|
|
|
5248
|
|
|
75
|
1
|
|
|
|
|
9
|
print Dumper \@results; |
|
76
|
1
|
|
|
|
|
129
|
my %files_by_hashsum; |
|
77
|
1
|
|
|
|
|
4
|
foreach my $result ( @results ) { |
|
78
|
3
|
|
|
|
|
62
|
my ($filename, $digest) = @$result; |
|
79
|
3
|
|
|
|
|
4
|
push @{ $files_by_hashsum{$digest} }, $filename; |
|
|
3
|
|
|
|
|
8
|
|
|
80
|
|
|
|
|
|
|
} |
|
81
|
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
my %duplicates_by_hashsum = |
|
83
|
1
|
|
|
|
|
3
|
map { $_ => [ sort @{$files_by_hashsum{$_}} ] } |
|
|
1
|
|
|
|
|
9
|
|
|
84
|
1
|
|
|
|
|
5
|
grep { @{ $files_by_hashsum{$_} } > 1 } keys %files_by_hashsum; |
|
|
2
|
|
|
|
|
2
|
|
|
|
2
|
|
|
|
|
18
|
|
|
85
|
|
|
|
|
|
|
|
|
86
|
1
|
|
|
|
|
5
|
my @files_to_purge = $self->_handle_duplicates(\%duplicates_by_hashsum); |
|
87
|
1
|
|
|
|
|
5
|
$self->_purge_files(\@files_to_purge); |
|
88
|
|
|
|
|
|
|
|
|
89
|
1
|
|
|
|
|
5
|
return; |
|
90
|
|
|
|
|
|
|
} |
|
91
|
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
sub _handle_duplicates { |
|
93
|
4
|
|
|
4
|
|
39
|
my ($self, $duplicates) = @_; |
|
94
|
4
|
50
|
|
|
|
13
|
return unless keys %$duplicates; |
|
95
|
|
|
|
|
|
|
|
|
96
|
4
|
|
|
|
|
9
|
my @files_to_purge; |
|
97
|
4
|
|
|
|
|
16
|
while ( my ($digest, $files) = each %$duplicates ) { |
|
98
|
7
|
|
|
|
|
7
|
my $to_keep; |
|
99
|
7
|
100
|
|
|
|
17
|
if ( $self->ask ) { |
|
100
|
4
|
|
|
|
|
523
|
say 'The following files are duplicates ' |
|
101
|
|
|
|
|
|
|
. " indicate which one(s) you would like to keep\n" |
|
102
|
|
|
|
|
|
|
. '(-1 to SKIP or CTRL-C to quit):'; |
|
103
|
|
|
|
|
|
|
|
|
104
|
4
|
|
|
|
|
8
|
my $number_of_files = $#{ $files }; |
|
|
4
|
|
|
|
|
10
|
|
|
105
|
4
|
|
|
|
|
11
|
foreach my $i ( 0 .. $number_of_files ) { |
|
106
|
8
|
|
|
|
|
17
|
my $file = $files->[$i]; |
|
107
|
8
|
|
|
|
|
762
|
say "[ $i]\t$file"; |
|
108
|
|
|
|
|
|
|
} |
|
109
|
4
|
|
|
|
|
331
|
say "[ -1]\tSKIP"; |
|
110
|
4
|
|
|
|
|
323
|
say "[C-c]\tQUIT"; |
|
111
|
4
|
|
|
|
|
15
|
$to_keep = _get_numeric_response($number_of_files); |
|
112
|
4
|
100
|
66
|
|
|
53
|
next if ! defined $to_keep || defined $to_keep && $to_keep == -1; |
|
|
|
|
33
|
|
|
|
|
|
113
|
|
|
|
|
|
|
} |
|
114
|
|
|
|
|
|
|
else { # if ask = 0 keep the first duplicate |
|
115
|
3
|
|
|
|
|
5
|
$to_keep = 0; |
|
116
|
|
|
|
|
|
|
} |
|
117
|
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
push @files_to_purge, |
|
119
|
5
|
|
|
|
|
8
|
grep { $_ ne $files->[$to_keep] } @$files; |
|
|
10
|
|
|
|
|
30
|
|
|
120
|
|
|
|
|
|
|
} |
|
121
|
|
|
|
|
|
|
|
|
122
|
4
|
|
|
|
|
34
|
return sort @files_to_purge; |
|
123
|
|
|
|
|
|
|
} |
|
124
|
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
sub _purge_files { |
|
126
|
2
|
|
|
2
|
|
125
|
my ($self, $files) = @_; |
|
127
|
|
|
|
|
|
|
|
|
128
|
2
|
|
|
|
|
6
|
foreach my $file ( @$files ) { |
|
129
|
3
|
|
|
|
|
243
|
print "purging file: $file\n"; |
|
130
|
3
|
|
|
|
|
6
|
my $response; |
|
131
|
3
|
50
|
|
|
|
11
|
if ( $self->ask ) { |
|
132
|
|
|
|
|
|
|
do { |
|
133
|
0
|
|
|
|
|
0
|
print "About to delete '$file'; continue? [Y/n] "; |
|
134
|
0
|
|
|
|
|
0
|
$response = _prompt(); |
|
135
|
|
|
|
|
|
|
} |
|
136
|
0
|
|
|
|
|
0
|
while ( !grep { $response eq $_ } ('y', 'Y', 'n', 'N', '') ); |
|
|
0
|
|
|
|
|
0
|
|
|
137
|
|
|
|
|
|
|
} |
|
138
|
|
|
|
|
|
|
|
|
139
|
3
|
50
|
0
|
|
|
9
|
_delete_file($file) |
|
|
|
|
0
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
140
|
|
|
|
|
|
|
if !$self->ask |
|
141
|
|
|
|
|
|
|
|| ($self->ask |
|
142
|
|
|
|
|
|
|
&& ($response eq '' || $response =~ m/^[yY]$/)); |
|
143
|
|
|
|
|
|
|
} |
|
144
|
|
|
|
|
|
|
|
|
145
|
2
|
|
|
|
|
7
|
return; |
|
146
|
|
|
|
|
|
|
} |
|
147
|
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
sub _delete_file { |
|
149
|
3
|
|
|
3
|
|
6
|
my ($file) = @_; |
|
150
|
|
|
|
|
|
|
|
|
151
|
3
|
50
|
|
|
|
207
|
unlink($file) |
|
152
|
|
|
|
|
|
|
or die "Unable to delete file '$file': $!"; |
|
153
|
|
|
|
|
|
|
} |
|
154
|
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
sub _get_numeric_response { |
|
156
|
4
|
|
|
4
|
|
6
|
my ($max) = @_; |
|
157
|
|
|
|
|
|
|
|
|
158
|
4
|
|
|
|
|
3
|
my $input; |
|
159
|
4
|
|
|
|
|
5
|
my $valid_response = 0; |
|
160
|
4
|
|
|
|
|
3
|
do { |
|
161
|
4
|
|
|
|
|
127
|
print "\n>> "; |
|
162
|
4
|
|
|
|
|
13
|
$input = _prompt(); |
|
163
|
|
|
|
|
|
|
|
|
164
|
4
|
50
|
33
|
|
|
50
|
if ( ! defined $input ) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
165
|
0
|
|
|
|
|
0
|
say 'You did not enter any input.'; |
|
166
|
|
|
|
|
|
|
} |
|
167
|
|
|
|
|
|
|
elsif ( $input !~ m/^\-?\d+$/ ) { |
|
168
|
0
|
|
|
|
|
0
|
say "You must enter a number between 0 and $max"; |
|
169
|
|
|
|
|
|
|
} |
|
170
|
|
|
|
|
|
|
elsif ( $input && $input > $max ) { |
|
171
|
0
|
|
|
|
|
0
|
say "You must enter a number between 0 and $max"; |
|
172
|
|
|
|
|
|
|
} |
|
173
|
|
|
|
|
|
|
else { |
|
174
|
4
|
|
|
|
|
10
|
$valid_response = 1; |
|
175
|
|
|
|
|
|
|
} |
|
176
|
|
|
|
|
|
|
} while( !$valid_response ); |
|
177
|
|
|
|
|
|
|
|
|
178
|
4
|
|
|
|
|
335
|
print "AFTER get_numeric_response: $input\n"; |
|
179
|
4
|
|
|
|
|
13
|
return $input; |
|
180
|
|
|
|
|
|
|
} |
|
181
|
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
sub _prompt { |
|
183
|
0
|
|
|
0
|
|
0
|
my $input = ; |
|
184
|
0
|
|
|
|
|
0
|
chomp($input); |
|
185
|
|
|
|
|
|
|
|
|
186
|
0
|
|
|
|
|
0
|
return $input; |
|
187
|
|
|
|
|
|
|
} |
|
188
|
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
sub _dirwalk { |
|
190
|
4
|
|
|
4
|
|
5
|
my ($self, $top, $filefunc, $dirfunc) = @_; |
|
191
|
|
|
|
|
|
|
|
|
192
|
4
|
100
|
|
|
|
34
|
if ( -d $top ) { |
|
193
|
|
|
|
|
|
|
# stop processing non-recursive searches when a directory that |
|
194
|
|
|
|
|
|
|
# was not the starting directory is encountered |
|
195
|
|
|
|
|
|
|
return |
|
196
|
1
|
50
|
33
|
|
|
3
|
if $top ne $self->directory && !$self->recursive; |
|
197
|
|
|
|
|
|
|
|
|
198
|
1
|
|
|
|
|
2
|
my $DIR; |
|
199
|
1
|
50
|
|
|
|
35
|
unless ( opendir $DIR, $top ) { |
|
200
|
0
|
|
|
|
|
0
|
warn "Couldn't open directory '$top': $!; skipping.\n"; |
|
201
|
0
|
|
|
|
|
0
|
return; |
|
202
|
|
|
|
|
|
|
} |
|
203
|
|
|
|
|
|
|
|
|
204
|
1
|
|
|
|
|
5
|
my @results; |
|
205
|
1
|
|
|
|
|
29
|
while ( my $file = readdir $DIR ) { |
|
206
|
5
|
100
|
|
|
|
19
|
next if $file =~ m/^\./; # ignore hidden files, '.', and '..' |
|
207
|
|
|
|
|
|
|
|
|
208
|
3
|
|
|
|
|
16
|
push @results, $self->_dirwalk("$top/$file", $filefunc, $dirfunc); |
|
209
|
|
|
|
|
|
|
} |
|
210
|
1
|
50
|
|
|
|
5
|
return $dirfunc ? $dirfunc->($top, @results) : (); |
|
211
|
|
|
|
|
|
|
} |
|
212
|
|
|
|
|
|
|
|
|
213
|
3
|
50
|
|
|
|
11
|
return $filefunc ? $filefunc->($top) : (); |
|
214
|
|
|
|
|
|
|
} |
|
215
|
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
1; |
|
217
|
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
__END__ |