line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package File::Dedup; |
2
|
|
|
|
|
|
|
# ABSTRACT: Deduplicate files across directories |
3
|
|
|
|
|
|
|
$File::Dedup::VERSION = '0.007'; |
4
|
5
|
|
|
5
|
|
2852
|
use strict; |
|
5
|
|
|
|
|
7
|
|
|
5
|
|
|
|
|
145
|
|
5
|
5
|
|
|
5
|
|
19
|
use warnings; |
|
5
|
|
|
|
|
8
|
|
|
5
|
|
|
|
|
129
|
|
6
|
|
|
|
|
|
|
|
7
|
5
|
|
|
5
|
|
4134
|
use Digest::SHA; |
|
5
|
|
|
|
|
17331
|
|
|
5
|
|
|
|
|
298
|
|
8
|
5
|
|
|
5
|
|
43
|
use feature qw(say); |
|
5
|
|
|
|
|
7
|
|
|
5
|
|
|
|
|
2455
|
|
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
# Names of the options new() accepts; new() dies on any key not listed here.
my @VALID_OPTIONS = qw(ask directory group recursive); |
11
|
|
|
|
|
|
|
# Constructor.
#
# Takes a flat list of option => value pairs:
#   directory - required; must name an existing directory
#   ask       - defaults to 1 when missing or undefined
#   recursive - defaults to 0 when missing or undefined
#   group     - accepted, but a warning is issued because it is not
#               implemented yet
#
# Dies when 'directory' is absent or is not a directory, or when any key
# outside @VALID_OPTIONS is supplied. Returns the options hash blessed into
# the invoking class.
sub new {
    my ($class, %opts) = @_;

    if ( !exists $opts{directory} ) {
        die "Must pass a directory to process";
    }
    if ( !-d $opts{directory} ) {
        die "Supplied directory argument '$opts{directory}' is not a directory";
    }
    if ( defined $opts{group} ) {
        warn "Supplied option 'group' not implemented yet";
    }

    # Reject any option name that is not explicitly allowed.
    my %is_valid = map { ( $_ => 1 ) } @VALID_OPTIONS;
    for my $name ( keys %opts ) {
        if ( !$is_valid{$name} ) {
            die "Invalid argument '$name' passed to new";
        }
    }

    # Ask before purging unless told otherwise; stay non-recursive unless
    # told otherwise.
    $opts{ask}       //= 1;
    $opts{recursive} //= 0;

    my $self = bless \%opts, $class;
    return $self;
}
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
# Accessor: returns the value stored under the object's 'directory' key.
sub directory {
    my ($self) = @_;
    return $self->{directory};
}
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
# Accessor: returns the value stored under the object's 'recursive' key.
sub recursive {
    my ($self) = @_;
    return $self->{recursive};
}
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
# Accessor: returns the value stored under the object's 'ask' key.
sub ask {
    my ($self) = @_;
    return $self->{ask};
}
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
# Accessor: returns the value stored under the object's 'group' key.
sub group {
    my ($self) = @_;
    return $self->{group};
}
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
# Compute the hexadecimal digest of a file's contents.
#
# Takes the path of the file to read and returns the hex digest produced by
# a default-constructed Digest::SHA object. Dies, naming the file and the
# OS error, when the file cannot be opened.
sub _file_digest {
    my ($filename) = @_;

    open my $fh, '<', $filename
        or die "Unable to open '$filename' for reading: $!";

    # Digest the raw bytes. Without this, platforms that translate line
    # endings on text handles would hash altered content.
    binmode $fh;

    my $checksum = Digest::SHA->new->addfile($fh)->hexdigest;
    close($fh);

    return $checksum;
}
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
# Find duplicate files under the object's directory and purge them.
#
# Walks the directory with _dirwalk, pairing every file path with its
# digest, groups the paths by digest, and keeps only the groups holding
# more than one path (each group sorted). Those groups are handed to
# _handle_duplicates, and whatever it returns is passed to _purge_files.
# Returns nothing.
sub dedup {
    my ($self) = @_;

    my @results = $self->_dirwalk(
        $self->directory,
        sub { [ $_[0], _file_digest($_[0]) ] },    # file  => [ path, digest ]
        sub { shift; @_ },                         # dir   => its children's results
    );

    my %files_by_hashsum;
    foreach my $result ( @results ) {
        my ($filename, $digest) = @$result;
        push @{ $files_by_hashsum{$digest} }, $filename;
    }

    # Only digests shared by two or more files describe duplicates.
    my %duplicates_by_hashsum =
        map  { ( $_ => [ sort @{ $files_by_hashsum{$_} } ] ) }
        grep { @{ $files_by_hashsum{$_} } > 1 }
        keys %files_by_hashsum;

    my @files_to_purge = $self->_handle_duplicates(\%duplicates_by_hashsum);
    $self->_purge_files(\@files_to_purge);

    return;
}
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
# Decide which files of each duplicate group should be purged.
#
# $duplicates is a hash reference mapping a digest to an array reference of
# file names. When the object's 'ask' flag is true, each group is listed
# and _get_numeric_response picks the index of the file to keep; an
# undefined answer or -1 skips the group. When 'ask' is false, the first
# file of the group is kept. Returns the sorted list of every file name
# that differs from the kept one, or nothing when there are no groups.
sub _handle_duplicates {
    my ($self, $duplicates) = @_;
    return unless keys %$duplicates;

    my @files_to_purge;

    GROUP:
    while ( my ($digest, $files) = each %$duplicates ) {
        my $to_keep = 0;    # without prompting, the first duplicate is kept

        if ( $self->ask ) {
            my $last_index = $#{ $files };

            say 'The following files are duplicates '
              . " indicate which one(s) you would like to keep\n"
              . '(-1 to SKIP or CTRL-C to quit):';
            say "[ $_]\t$files->[$_]" for 0 .. $last_index;
            say "[ -1]\tSKIP";
            say "[C-c]\tQUIT";

            $to_keep = _get_numeric_response($last_index);
            next GROUP unless defined $to_keep;
            next GROUP if $to_keep == -1;
        }

        push @files_to_purge,
            grep { $_ ne $files->[$to_keep] } @$files;
    }

    return sort @files_to_purge;
}
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
# Delete the given files, asking for confirmation first when the object's
# 'ask' flag is true.
#
# $files is an array reference of file names. For every file a
# "purging file" line is printed. With 'ask' enabled the user is prompted
# until the answer is one of y, Y, n, N or an empty line; an empty line or
# y/Y confirms the deletion. An undefined answer (no more input available)
# is treated as a decline so that files are never removed without an
# explicit or default confirmation actually being read. Returns nothing.
sub _purge_files {
    my ($self, $files) = @_;

    foreach my $file ( @$files ) {
        print "purging file: $file\n";

        if ( $self->ask ) {
            my $response;
            do {
                print "About to delete '$file'; continue? [Y/n] ";
                $response = _prompt();
            } while ( defined $response
                && !grep { $response eq $_ } ('y', 'Y', 'n', 'N', '') );

            # No answer could be read: leave the file alone.
            next unless defined $response;

            next unless $response eq '' || $response =~ m/^[yY]$/;
        }

        _delete_file($file);
    }

    return;
}
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
# Remove a single file from disk.
#
# Takes the path to remove. Dies with the path and the OS error when
# unlink reports failure; otherwise returns unlink's result.
sub _delete_file {
    my ($path) = @_;

    my $removed = unlink $path;
    if ( !$removed ) {
        die "Unable to delete file '$path': $!";
    }

    return $removed;
}
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
# Prompt until the user enters an acceptable index.
#
# $max is the largest index that may be chosen. An answer is accepted when
# it is an integer from -1 (the caller's "skip" value) up to $max; anything
# else prints a hint and prompts again. Values below -1 are rejected so
# they can never reach the caller as negative array indices. Returns the
# accepted input as read from _prompt.
sub _get_numeric_response {
    my ($max) = @_;

    while (1) {
        print "\n>> ";
        my $input = _prompt();

        if ( !defined $input ) {
            say 'You did not enter any input.';
            next;
        }

        if (   $input !~ m/\A-?\d+\z/
            || $input < -1
            || $input > $max )
        {
            say "You must enter a number between 0 and $max";
            next;
        }

        return $input;
    }
}
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
# Read one line of user input from standard input.
#
# Returns the line with its trailing newline removed, or undef when no
# more input is available. The chomp is skipped for undef so that reaching
# end of input does not raise an "uninitialized value" warning.
sub _prompt {
    my $input = <STDIN>;
    chomp($input) if defined $input;

    return $input;
}
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
# Walk a directory tree, applying callbacks to files and directories.
#
# $top      - path to visit
# $filefunc - optional code ref called with the path of anything that is
#             not a directory; its return value is passed back
# $dirfunc  - optional code ref called with a directory path followed by
#             the collected results of its entries; its return value is
#             passed back
#
# Entries whose names start with a dot are ignored. A directory other than
# the object's starting directory is skipped entirely unless the object's
# 'recursive' flag is true. A directory that cannot be opened produces a
# warning and yields nothing.
sub _dirwalk {
    my ($self, $top, $filefunc, $dirfunc) = @_;

    if ( !-d $top ) {
        return $filefunc ? $filefunc->($top) : ();
    }

    # stop processing non-recursive searches when a directory that
    # was not the starting directory is encountered
    return
        if $top ne $self->directory && !$self->recursive;

    my $DIR;
    unless ( opendir $DIR, $top ) {
        warn "Couldn't open directory '$top': $!; skipping.\n";
        return;
    }

    # Read every entry and release the handle before descending, so the
    # number of open directory handles does not grow with the tree depth.
    my @entries = grep { !m/^\./ } readdir $DIR;    # ignore hidden files, '.', and '..'
    closedir $DIR;

    my @results;
    foreach my $file ( @entries ) {
        push @results, $self->_dirwalk("$top/$file", $filefunc, $dirfunc);
    }

    return $dirfunc ? $dirfunc->($top, @results) : ();
}
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
1; |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
__END__ |