File Coverage

blib/lib/App/UniqFiles.pm
Criterion    Covered  Total      %
statement        168    223   75.3
branch            78    122   63.9
condition         43     65   66.1
subroutine        11     12   91.6
pod                1      1  100.0
total            301    423   71.1


line stmt bran cond sub pod time code
1             package App::UniqFiles;
2              
3 1     1   152193 use 5.010001;
  1         15  
4 1     1   5 use strict;
  1         2  
  1         23  
5 1     1   5 use warnings;
  1         2  
  1         32  
6 1     1   2348 use Log::ger;
  1         59  
  1         7  
7              
8 1     1   284 use Cwd qw(abs_path);
  1         3  
  1         61  
9 1     1   6 use Exporter qw(import);
  1         2  
  1         27  
10 1     1   561 use Perinci::Sub::Util qw(gen_modified_sub);
  1         2578  
  1         169  
11              
12             our $AUTHORITY = 'cpan:PERLANCAR'; # AUTHORITY
13             our $DATE = '2022-11-15'; # DATE
14             our $DIST = 'App-UniqFiles'; # DIST
15             our $VERSION = '0.139'; # VERSION
16              
17             our @EXPORT_OK = qw(uniq_files);
18              
19             our %SPEC;
20              
21             sub _glob {
22 10     10   56 require File::Find;
23              
24 10         22 my $dir;
25             my @res;
26             File::Find::finddepth(
27             sub {
28 30 50   30   324 return if -l $_;
29 30 100       270 return unless -f _;
30 1     1   8 no warnings 'once'; # $File::Find::dir
  1         3  
  1         2664  
31 20         266 push @res, "$File::Find::dir/$_";
32             },
33 10         1152 @_,
34             );
35 10         82 @res;
36             }
37              
38             our %argspec_authoritative_dirs = (
39             authoritative_dirs => {
40             summary => 'Denote director(y|ies) where authoritative/"Original" copies are found',
41             'x.name.is_plural' => 1,
42             'x.name.singular' => 'authoritative_dir',
43             schema => ['array*', of=>'str*'], # XXX dirname
44             cmdline_aliases => {O=>{}},
45             },
46             );
47             our %argspecs_filter = (
48             include_file_patterns => {
49             summary => 'Filename (including path) regex patterns to include',
50             'x.name.is_plural' => 1,
51             'x.name.singular' => 'include_file_pattern',
52             schema => ['array*', of=>'str*'], # XXX re
53             cmdline_aliases => {I=>{}},
54             },
55             exclude_file_patterns => {
56             summary => 'Filename (including path) regex patterns to exclude',
57             'x.name.is_plural' => 1,
58             'x.name.singular' => 'exclude_file_pattern',
59             schema => ['array*', of=>'str*'], # XXX re
60             cmdline_aliases => {X=>{}},
61             },
62             exclude_empty_files => {
63             schema => 'bool*',
64             cmdline_aliases => {Z=>{}},
65             },
66             min_size => {
67             summary => 'Minimum file size to consider',
68             schema => 'filesize*',
69             },
70             max_size => {
71             summary => 'Maximum file size to consider',
72             schema => 'filesize*',
73             },
74             );
75              
76             $SPEC{uniq_files} = {
77             v => 1.1,
78             summary => 'Report duplicate or unique file contents',
79             description => <<'_',
80              
81             Given a list of filenames, will check each file's size and content to find
82             duplicates. The interface is a bit like the `uniq` Unix command-line program.
83              
84             _
85             args => {
86             files => {
87             schema => ['array*' => {of=>'str*'}],
88             req => 1,
89             pos => 0,
90             slurpy => 1,
91             },
92             recurse => {
93             schema => 'bool*',
94             cmdline_aliases => {R=>{}},
95             description => <<'_',
96              
97             If set to true, will recurse into subdirectories.
98              
99             _
100             },
101             group_by_digest => {
102             summary => 'Sort files by their digest (or size, if not computing digest), separate each different digest',
103             schema => 'bool*',
104             },
105             show_digest => {
106             summary => 'Show the digest value (or the size, if not computing digest) for each file',
107             description => <<'_',
108              
109             Note that this routine does not compute digest for files which have unique
110             sizes, so they will show up as empty.
111              
112             _
113             schema => 'true*',
114             },
115             show_size => {
116             summary => 'Show the size for each file',
117             schema => 'true*',
118             },
119             # TODO add option follow_symlinks?
120             report_unique => {
121             schema => [bool => {default=>1}],
122             summary => 'Whether to return unique items',
123             cmdline_aliases => {
124             a => {
125             summary => 'Alias for --report-unique --report-duplicate=1 (report all files)',
126             code => sub {
127             my $args = shift;
128             $args->{report_unique} = 1;
129             $args->{report_duplicate} = 1;
130             },
131             },
132             u => {
133             summary => 'Alias for --report-unique --report-duplicate=0',
134             code => sub {
135             my $args = shift;
136             $args->{report_unique} = 1;
137             $args->{report_duplicate} = 0;
138             },
139             },
140             d => {
141             summary =>
142             'Alias for --noreport-unique --report-duplicate=1',
143             code => sub {
144             my $args = shift;
145             $args->{report_unique} = 0;
146             $args->{report_duplicate} = 1;
147             },
148             },
149             D => {
150             summary =>
151             'Alias for --noreport-unique --report-duplicate=3',
152             code => sub {
153             my $args = shift;
154             $args->{report_unique} = 0;
155             $args->{report_duplicate} = 3;
156             },
157             },
158             },
159             },
160             report_duplicate => {
161             schema => [int => {in=>[0,1,2,3], default=>2}],
162             summary => 'Whether to return duplicate items',
163             description => <<'_',
164              
165             Can be set to either 0, 1, 2, or 3.
166              
167             If set to 0, duplicate items will not be returned.
168              
169             If set to 1 (the default for `dupe-files`), will return all the duplicate
170             files. For example: `file1` contains text 'a', `file2` 'b', `file3` 'a'. Then
171             `file1` and `file3` will be returned.
172              
173             If set to 2 (the default for `uniq-files`), will only return the first of
174             duplicate items. Continuing from previous example, only `file1` will be returned
175             because `file2` is unique and `file3` contains 'a' (already represented by
176             `file1`). If one or more `--authoritative-dir` (`-O`) options are specified,
177             files under these directories will be preferred.
178              
179             If set to 3, will return all but the first of duplicate items. Continuing from
180             previous example: `file3` will be returned. This is useful if you want to keep
181             only one copy of the duplicate content. You can use the output of this routine
182             to `mv` or `rm`. Similar to the previous case, if one or more
183             `--authoritative-dir` (`-O`) options are specified, then files under these
184             directories will not be listed if possible.
185              
186             _
187             cmdline_aliases => {
188             },
189             },
190             algorithm => {
191             schema => ['str*'],
192             summary => "What algorithm is used to compute the digest of the content",
193             description => <<'_',
194              
195             The default is to use `md5`. Some algorithms supported include `crc32`, `sha1`,
196             `sha256`, as well as `Digest` to use Perl <pm:Digest> which supports a lot of
197             other algorithms, e.g. `SHA-1`, `BLAKE2b`.
198              
199             If set to '', 'none', or 'size', the digest will be set to the file size. This
200             means uniqueness will be determined solely from file size. This can be quicker,
201             but it will generate false positives: two files of the same size will be deemed
202             duplicates even though their contents may differ.
203              
204             _
205             },
206             digest_args => {
207             schema => ['array*',
208              
209             # comment out temporarily, Perinci::Sub::GetArgs::Argv
210             # clashes with coerce rules; we should fix
211             # Perinci::Sub::GetArgs::Argv to observe coercion rules
212             # first
213             #of=>'str*',
214              
215             'x.perl.coerce_rules'=>['From_str::comma_sep']],
216             description => <<'_',
217              
218             Some Digest algorithms require arguments; you can pass them here.
219              
220             _
221             cmdline_aliases => {A=>{}},
222             },
223             show_count => {
224             schema => [bool => {default=>0}],
225             summary => "Whether to return each file content's ".
226             "number of occurrences",
227             description => <<'_',
228              
229             1 means the file content is only encountered once (unique), 2 means there is one
230             duplicate, and so on.
231              
232             _
233             cmdline_aliases => {count=>{}, c=>{}},
234             },
235             detail => {
236             summary => 'Show details (a.k.a. --show-digest, --show-size, --show-count)',
237             schema => 'true*',
238             cmdline_aliases => {l=>{}},
239             },
240             %argspec_authoritative_dirs,
241             %argspecs_filter,
242             },
243             examples => [
244             {
245             summary => 'List all files which do not have duplicate contents',
246             src => 'uniq-files *',
247             src_plang => 'bash',
248             test => 0,
249             'x.doc.show_result' => 0,
250             },
251             {
252             summary => 'List all files (recursively, and in detail) which have duplicate contents (all duplicate copies), exclude some files',
253             src => q(uniq-files -R -l -d -X '\.git/' --min-size 10k .),
254             src_plang => 'bash',
255             test => 0,
256             'x.doc.show_result' => 0,
257             },
258             {
259             summary => 'Move all duplicate files (except one copy) in this directory (and subdirectories) to .dupes/',
260             src => 'uniq-files -D -R * | while read f; do mv "$f" .dupes/; done',
261             src_plang => 'bash',
262             test => 0,
263             'x.doc.show_result' => 0,
264             },
265             {
266             summary => 'List number of occurrences of contents for duplicate files',
267             src => 'uniq-files -c *',
268             src_plang => 'bash',
269             test => 0,
270             'x.doc.show_result' => 0,
271             },
272             {
273             summary => 'List number of occurrences of contents for all files',
274             src => 'uniq-files -a -c *',
275             src_plang => 'bash',
276             test => 0,
277             'x.doc.show_result' => 0,
278             },
279             {
280             summary => 'List all files, along with their number of content occurrences and content digest. '.
281             'Use the BLAKE2b digest algorithm, and group the files according to their digest.',
282             src => 'uniq-files -a -c --show-digest -A BLAKE2,blake2b *',
283             src_plang => 'bash',
284             test => 0,
285             'x.doc.show_result' => 0,
286             },
287             ],
288             };
289             sub uniq_files {
290 11     11 1 38875 my %args = @_;
291              
292 11         26 my $files = $args{files};
293 11 50 33     54 return [400, "Please specify files"] if !$files || !@$files;
294 11         18 my $recurse = $args{recurse};
295 11   100     30 my $report_unique = $args{report_unique} // 1;
296 11   100     27 my $report_duplicate = $args{report_duplicate} // 2;
297 11   100     31 my $show_count = $args{show_count} // 0;
298 11   100     28 my $show_digest = $args{show_digest} // 0;
299 11   100     29 my $show_size = $args{show_size} // 0;
300 11         17 my $digest_args = $args{digest_args};
301 11 50 66     37 my $algorithm = $args{algorithm} // ($digest_args ? 'Digest' : 'md5');
302 11         17 my $group_by_digest = $args{group_by_digest};
303              
304 11 50       26 if ($args{detail}) {
305 0         0 $show_digest = 1;
306 0         0 $show_size = 1;
307 0         0 $show_count = 1;
308             }
309              
310             my @authoritative_dirs = $args{authoritative_dirs} && @{$args{authoritative_dirs}} ?
311 11 100 66     30 @{ $args{authoritative_dirs} } : ();
  2         5  
312 11         24 for my $dir (@authoritative_dirs) {
313 2 50       34 (-d $dir) or return [400, "Authoritative dir '$dir' does not exist or not a directory"];
314 2 50       40 my $abs_dir = abs_path $dir or return [400, "Cannot get absolute path for authoritative dir '$dir'"];
315 2         8 $dir = $abs_dir;
316             }
317             #log_trace "authoritative_dirs=%s", \@authoritative_dirs if @authoritative_dirs;
318              
319 11         17 my @include_re;
320 11   50     16 for my $re0 (@{ $args{include_file_patterns} // [] }) {
  11         45  
321 0         0 require Regexp::Util;
322 0         0 my $re;
323 0 0       0 if (ref $re0 eq 'Regexp') {
324 0         0 $re = $re0;
325             } else {
326 0         0 eval { $re = Regexp::Util::deserialize_regexp("qr($re0)") };
  0         0  
327 0 0       0 return [400, "Invalid/unsafe regex pattern in include_file_patterns '$re0': $@"] if $@;
328 0 0       0 return [400, "Unsafe regex pattern (contains embedded code) in include_file_patterns '$re0'"] if Regexp::Util::regexp_seen_evals($re);
329             }
330 0         0 push @include_re, $re;
331             }
332 11         19 my @exclude_re;
333 11   50     16 for my $re0 (@{ $args{exclude_file_patterns} // [] }) {
  11         36  
334 0         0 require Regexp::Util;
335 0         0 my $re;
336 0 0       0 if (ref $re0 eq 'Regexp') {
337 0         0 $re = $re0;
338             } else {
339 0         0 eval { $re = Regexp::Util::deserialize_regexp("qr($re0)") };
  0         0  
340 0 0       0 return [400, "Invalid/unsafe regex pattern in exclude_file_patterns '$re0': $@"] if $@;
341 0 0       0 return [400, "Unsafe regex pattern (contains embedded code) in exclude_file_patterns '$re0'"] if Regexp::Util::regexp_seen_evals($re);
342             }
343 0         0 push @exclude_re, $re;
344             }
345              
346 11 100       25 if ($recurse) {
347             $files = [ map {
348 5 50       10 if (-l $_) {
  35 100       324  
349 0         0 ();
350             } elsif (-d _) {
351 10         34 (_glob($_));
352             } else {
353 25         80 ($_);
354             }
355             } @$files ];
356             }
357              
358             FILTER: {
359 11         17 my $ffiles;
  11         15  
360             FILE:
361 11         23 for my $f (@$files) {
362 87 50       802 if (-l $f) {
363 0         0 log_warn "File '$f' is a symlink, ignored";
364 0         0 next FILE;
365             }
366 87 100       221 if (-d _) {
367 12         55 log_warn "File '$f' is a directory, ignored";
368 12         45 next FILE;
369             }
370 75 50       146 unless (-f _) {
371 0         0 log_warn "File '$f' is not a regular file, ignored";
372 0         0 next FILE;
373             }
374              
375 75 50       162 if (@include_re) {
376 0         0 my $included;
377 0         0 for my $re (@include_re) {
378 0 0       0 if ($f =~ $re) { $included++; last }
  0         0  
  0         0  
379             }
380 0 0       0 unless ($included) {
381 0         0 log_info "File '$f' is not in --include-file-patterns, skipped";
382 0         0 next FILE;
383             }
384             }
385 75 50       138 if (@exclude_re) {
386 0         0 for my $re (@exclude_re) {
387 0 0       0 if ($f =~ $re) {
388 0         0 log_info "File '$f' is in --exclude-file-patterns, skipped";
389 0         0 next FILE;
390             }
391             }
392             }
393              
394 75         622 my $size = -s $f;
395 75 50 33     216 if ($args{exclude_empty_files} && !$size) {
396 0         0 log_info "File '$f' is empty, skipped by option -Z";
397 0         0 next FILE;
398             }
399 75 50 33     149 if ($args{min_size} && $size < $args{min_size}) {
400 0         0 log_info "File '$f' (size=$size) is smaller than min_size ($args{min_size}), skipped";
401 0         0 next FILE;
402             }
403 75 50 33     170 if ($args{max_size} && $size > $args{max_size}) {
404 0         0 log_info "File '$f' (size=$size) is larger than max_size ($args{max_size}), skipped";
405 0         0 next FILE;
406             }
407              
408 75         202 push @$ffiles, $f;
409             }
410 11         27 $files = $ffiles;
411             } # FILTER
412              
413 11         29 my %size_counts; # key = size, value = number of files having that size
414             my %size_files; # key = size, value = [file, ...]
415 11         0 my %file_sizes; # key = filename, value = file size, for caching stat()
416             GET_FILE_SIZES: {
417 11         16 for my $f (@$files) {
  11         19  
418 75         672 my @st = stat $f;
419 75 50       186 unless (@st) {
420 0         0 log_error("Can't stat file `$f`: $!, skipped");
421 0         0 next;
422             }
423 75         166 $size_counts{$st[7]}++;
424 75   100     199 $size_files{$st[7]} //= [];
425 75         89 push @{$size_files{$st[7]}}, $f;
  75         166  
426 75         310 $file_sizes{$f} = $st[7];
427             }
428             }
429              
430 11   66     58 my $calc_digest = !($algorithm eq '' || $algorithm eq 'none' || $algorithm eq 'size');
431              
432             # calculate digest for all files having non-unique sizes
433 11         27 my %digest_counts; # key = digest, value = num of files having that digest
434             my %digest_files; # key = digest, value = [file, ...]
435 11         0 my %file_digests; # key = filename, value = file digest
436             CALC_FILE_DIGESTS: {
437 11 100       16 last unless $calc_digest;
  11         23  
438 10         559 require File::Digest;
439              
440 10         2368 for my $f (@$files) {
441 66 50       147 next unless defined $file_sizes{$f}; # just checking. all files should have sizes.
442 66 100       153 next if $size_counts{ $file_sizes{$f} } == 1; # skip unique file sizes.
443 60         154 my $res = File::Digest::digest_file(
444             file=>$f, algorithm=>$algorithm, digest_args=>$digest_args);
445 60 50       10215 return [500, "Can't calculate digest for file '$f': $res->[0] - $res->[1]"]
446             unless $res->[0] == 200;
447 60         104 my $digest = $res->[2];
448 60         128 $digest_counts{$digest}++;
449 60   100     224 $digest_files{$digest} //= [];
450 60         93 push @{$digest_files{$digest}}, $f;
  60         148  
451 60         169 $file_digests{$f} = $digest;
452             }
453             }
454              
455 11         22 my %file_counts; # key = file name, value = num of files having file content
456 11         22 for my $f (@$files) {
457 75 50       132 next unless defined $file_sizes{$f}; # just checking
458 75 100       122 if (!defined($file_digests{$f})) {
459 15         32 $file_counts{$f} = $size_counts{ $file_sizes{$f} };
460             } else {
461 60         111 $file_counts{$f} = $digest_counts{ $file_digests{$f} };
462             }
463             }
464              
465             SORT_DUPLICATE_FILES: {
466 11 100       16 last unless @authoritative_dirs;
  11         25  
467 2 50       8 my $hash = $calc_digest ? \%digest_files : \%size_files;
468 2         6 for my $key (keys %$hash) {
469 10         15 my @files = @{ $hash->{$key} };
  10         24  
470 10         17 my @abs_files;
471 10 100       24 next unless @files > 1;
472 4         22 for my $file (@files) {
473 12 50       197 my $abs_file = abs_path $file or do {
474 0         0 log_error "Cannot find absolute path for duplicate file '$file', skipping duplicate set %s", \@files;
475             };
476 12         37 push @abs_files, $abs_file;
477             }
478              
479             #log_trace "Duplicate files before sorting: %s", \@files;
480 12         26 @files = map { $files[$_] } sort {
481 4         16 my $file_a = $abs_files[$a];
  10         29  
482 10         13 my $file_a_in_authoritative_dirs = 0;
483 10         13 my $subdir_len_file_a;
484 10         18 for my $d (@authoritative_dirs) {
485 10 50       61 if ($file_a =~ m!\A\Q$d\E(?:/|\z)(.*)!) { $file_a_in_authoritative_dirs++; $subdir_len_file_a = length($1); last }
  0         0  
  0         0  
  0         0  
486             }
487 10         17 my $file_b = $abs_files[$b];
488 10         13 my $file_b_in_authoritative_dirs = 0;
489 10         11 my $subdir_len_file_b;
490 10         15 for my $d (@authoritative_dirs) {
491 10 100       51 if ($file_b =~ m!\A\Q$d\E(?:/|\z)(.*)!) { $file_b_in_authoritative_dirs++; $subdir_len_file_b = length($1); last }
  2         3  
  2         7  
  2         4  
492             }
493             #log_trace " file_a=<$file_a>, in authoritative_dirs? $file_a_in_authoritative_dirs";
494             #log_trace " file_b=<$file_b>, in authoritative_dirs? $file_b_in_authoritative_dirs";
495             # files located near the root of authoritative dir is preferred
496             # to deeper files. this is done by comparing subdir_len
497 10 50       39 ($file_a_in_authoritative_dirs ? $subdir_len_file_a : 9999) <=> ($file_b_in_authoritative_dirs ? $subdir_len_file_b : 9999) ||
    100          
    50          
498             $file_a cmp $file_b;
499             } 0..$#files;
500             #log_trace "Duplicate files after sorting: %s", \@files;
501              
502 4         15 $hash->{$key} = \@files;
503             }
504             }
505              
506             #$log->trace("report_duplicate=$report_duplicate");
507 11         20 my @files;
508 11         60 for my $f (sort keys %file_counts) {
509 75 100       128 if ($file_counts{$f} == 1) {
510             #log_trace "unique file '$f'";
511 24 100       44 push @files, $f if $report_unique;
512             } else {
513             #log_trace "duplicate file '$f'";
514             my $is_first_copy = $calc_digest ?
515             $f eq $digest_files{ $file_digests{$f} }[0] :
516 51 100       105 $f eq $size_files{ $file_sizes{$f} }[0];
517             #log_trace "is first copy? <$is_first_copy>";
518 51 100       115 if ($report_duplicate == 0) {
    100          
    100          
    50          
519             # do not report dupe files
520             } elsif ($report_duplicate == 1) {
521 15         28 push @files, $f;
522             } elsif ($report_duplicate == 2) {
523 21 100       43 push @files, $f if $is_first_copy;
524             } elsif ($report_duplicate == 3) {
525 9 100       23 push @files, $f unless $is_first_copy;
526             } else {
527 0         0 die "Invalid value for --report-duplicate ".
528             "'$report_duplicate', please choose 0/1/2/3";
529             }
530             }
531             }
532              
533             GROUP_FILES_BY_DIGEST: {
534 11 100       22 last unless $group_by_digest;
  11         26  
535             @files = sort {
536 1         9 $file_sizes{$a} <=> $file_sizes{$b} ||
537 20 50 50     52 ($file_digests{$a} // '') cmp ($file_digests{$b} // '')
      50        
538             } @files;
539             }
540              
541 11         28 my @rows;
542             my %resmeta;
543 11         0 my $last_digest;
544 11         15 for my $f (@files) {
545 41   66     89 my $digest = $file_digests{$f} // $file_sizes{$f};
546              
547             # add separator row
548 41 100 100     95 if ($group_by_digest && defined $last_digest && $digest ne $last_digest) {
      100        
549 4 50 33     23 push @rows, ($show_count || $show_digest || $show_size) ? {} : '';
550             }
551              
552 41         52 my $row;
553 41 100 100     135 if ($show_count || $show_digest || $show_size) {
      100        
554 19         49 $row = {file=>$f};
555 19 100       39 $row->{count} = $file_counts{$f} if $show_count;
556 19 100       34 $row->{digest} = $file_digests{$f} if $show_digest;
557 19 100       38 $row->{size} = $file_sizes{$f} if $show_size;
558             } else {
559 22         30 $row = $f;
560             }
561 41         57 push @rows, $row;
562 41         67 $last_digest = $digest;
563             }
564              
565 11         31 $resmeta{'table.fields'} = [qw/file size digest count/];
566              
567 11         146 [200, "OK", \@rows, \%resmeta];
568             }
569              
570             gen_modified_sub(
571             base_name => 'uniq_files',
572             output_name => 'dupe_files',
573             description => <<'_',
574              
575             This is a thin wrapper for <prog:uniq-files>. It defaults `report_unique` to 0
576             and `report_duplicate` to 1.
577              
578             _
579             modify_args => {
580             report_unique => sub {
581             $_[0]{schema} = [bool => {default=>0}];
582             },
583             report_duplicate => sub {
584             $_[0]{schema} = [int => {in=>[0,1,2,3], default=>1}];
585             },
586             },
587             modify_meta => sub {
588             $_[0]{examples} = [
589             {
590             summary => 'List all files (recursively, and in detail) which have duplicate contents (all duplicate copies)',
591             src => 'dupe-files -lR *',
592             src_plang => 'bash',
593             test => 0,
594             'x.doc.show_result' => 0,
595             },
596             ];
597             },
598             output_code => sub {
599 0     0     my %args = @_;
600 0   0       $args{report_unique} //= 0;
601 0   0       $args{report_duplicate} //= 1;
602 0           uniq_files(%args);
603             },
604             );
605              
606             1;
607             # ABSTRACT: Report duplicate or unique file contents
608              
609             __END__
610              
611             =pod
612              
613             =encoding UTF-8
614              
615             =head1 NAME
616              
617             App::UniqFiles - Report duplicate or unique file contents
618              
619             =head1 VERSION
620              
621             This document describes version 0.139 of App::UniqFiles (from Perl distribution App-UniqFiles), released on 2022-11-15.
622              
623             =head1 SYNOPSIS
624              
625             # See uniq-files script
626              
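
The module can also be used directly from Perl. Below is a minimal, illustrative sketch of calling the exported function; the file names are hypothetical:

    use App::UniqFiles qw(uniq_files);

    # By default report_unique=1 and report_duplicate=2, so this lists
    # unique files plus the first copy of each duplicate set.
    my $res = uniq_files(files => ['foo.txt', 'bar.txt', 'baz.txt']);
    if ($res->[0] == 200) {
        print "$_\n" for @{ $res->[2] };
    } else {
        warn "uniq_files failed: $res->[0] $res->[1]\n";
    }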
627             =head1 NOTES
628              
629             =head1 FUNCTIONS
630              
631              
632             =head2 dupe_files
633              
634             Usage:
635              
636             dupe_files(%args) -> [$status_code, $reason, $payload, \%result_meta]
637              
638             Report duplicate or unique file contents.
639              
640             This is a thin wrapper for L<uniq-files>. It defaults C<report_unique> to 0
641             and C<report_duplicate> to 1.
642              
643             This function is not exported.
644              
645             Arguments ('*' denotes required arguments):
646              
647             =over 4
648              
649             =item * B<algorithm> => I<str>
650              
651             What algorithm is used to compute the digest of the content.
652              
653             The default is to use C<md5>. Some algorithms supported include C<crc32>, C<sha1>,
654             C<sha256>, as well as C<Digest> to use Perl L<Digest> which supports a lot of
655             other algorithms, e.g. C<SHA-1>, C<BLAKE2b>.
656              
657             If set to '', 'none', or 'size', the digest will be set to the file size. This
658             means uniqueness will be determined solely from file size. This can be quicker,
659             but it will generate false positives: two files of the same size will be deemed
660             duplicates even though their contents may differ.
661              
662             =item * B<authoritative_dirs> => I<array[str]>
663              
664             Denote director(yE<verbar>ies) where authoritativeE<sol>"Original" copies are found.
665              
666             =item * B<detail> => I<true>
667              
668             Show details (a.k.a. --show-digest, --show-size, --show-count).
669              
670             =item * B<digest_args> => I<array>
671              
672             Some Digest algorithms require arguments; you can pass them here.
673              
674             =item * B<exclude_empty_files> => I<bool>
675              
676             (No description)
677              
678             =item * B<exclude_file_patterns> => I<array[str]>
679              
680             Filename (including path) regex patterns to exclude.
681              
682             =item * B<files>* => I<array[str]>
683              
684             (No description)
685              
686             =item * B<group_by_digest> => I<bool>
687              
688             Sort files by their digest (or size, if not computing digest), separate each different digest.
689              
690             =item * B<include_file_patterns> => I<array[str]>
691              
692             Filename (including path) regex patterns to include.
693              
694             =item * B<max_size> => I<filesize>
695              
696             Maximum file size to consider.
697              
698             =item * B<min_size> => I<filesize>
699              
700             Minimum file size to consider.
701              
702             =item * B<recurse> => I<bool>
703              
704             If set to true, will recurse into subdirectories.
705              
706             =item * B<report_duplicate> => I<int> (default: 1)
707              
708             Whether to return duplicate items.
709              
710             Can be set to either 0, 1, 2, or 3.
711              
712             If set to 0, duplicate items will not be returned.
713              
714             If set to 1 (the default for C<dupe-files>), will return all the duplicate
715             files. For example: C<file1> contains text 'a', C<file2> 'b', C<file3> 'a'. Then
716             C<file1> and C<file3> will be returned.
717              
718             If set to 2 (the default for C<uniq-files>), will only return the first of
719             duplicate items. Continuing from previous example, only C<file1> will be returned
720             because C<file2> is unique and C<file3> contains 'a' (already represented by
721             C<file1>). If one or more C<--authoritative-dir> (C<-O>) options are specified,
722             files under these directories will be preferred.
723              
724             If set to 3, will return all but the first of duplicate items. Continuing from
725             previous example: C<file3> will be returned. This is useful if you want to keep
726             only one copy of the duplicate content. You can use the output of this routine
727             to C<mv> or C<rm>. Similar to the previous case, if one or more
728             C<--authoritative-dir> (C<-O>) options are specified, then files under these
729             directories will not be listed if possible.
730              
731             =item * B<report_unique> => I<bool> (default: 0)
732              
733             Whether to return unique items.
734              
735             =item * B<show_count> => I<bool> (default: 0)
736              
737             Whether to return each file content's number of occurrences.
738              
739             1 means the file content is only encountered once (unique), 2 means there is one
740             duplicate, and so on.
741              
742             =item * B<show_digest> => I<true>
743              
744             Show the digest value (or the size, if not computing digest) for each file.
745              
746             Note that this routine does not compute digest for files which have unique
747             sizes, so they will show up as empty.
748              
749             =item * B<show_size> => I<true>
750              
751             Show the size for each file.
752              
753              
754             =back
755              
756             Returns an enveloped result (an array).
757              
758             First element ($status_code) is an integer containing an HTTP-like status code
759             (200 means OK, 4xx caller error, 5xx function error). Second element
760             ($reason) is a string containing an error message, or something like "OK" if status is
761             200. Third element ($payload) is the actual result, but it is usually not present when the enveloped result is an error response ($status_code is not 2xx). Fourth
762             element (%result_meta) is called result metadata and is optional, a hash
763             that contains extra information, much like how HTTP response headers provide additional metadata.
764              
765             Return value: (any)
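
As an illustrative sketch only, since the function is not exported it can be called with its fully qualified name; the directory names below are hypothetical:

    use App::UniqFiles ();

    # report_unique defaults to 0 and report_duplicate to 1 here,
    # so every duplicate copy is listed
    my $res = App::UniqFiles::dupe_files(
        files   => ['photos', 'backup/photos'],
        recurse => 1,
    );
    print "$_\n" for @{ $res->[2] } if $res->[0] == 200;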
766              
767              
768              
769             =head2 uniq_files
770              
771             Usage:
772              
773             uniq_files(%args) -> [$status_code, $reason, $payload, \%result_meta]
774              
775             Report duplicate or unique file contents.
776              
777             Given a list of filenames, will check each file's size and content to find
778             duplicates. The interface is a bit like the C<uniq> Unix command-line program.
779              
780             This function is not exported by default, but exportable.
781              
782             Arguments ('*' denotes required arguments):
783              
784             =over 4
785              
786             =item * B<algorithm> => I<str>
787              
788             What algorithm is used to compute the digest of the content.
789              
790             The default is to use C<md5>. Some algorithms supported include C<crc32>, C<sha1>,
791             C<sha256>, as well as C<Digest> to use Perl L<Digest> which supports a lot of
792             other algorithms, e.g. C<SHA-1>, C<BLAKE2b>.
793              
794             If set to '', 'none', or 'size', the digest will be set to the file size. This
795             means uniqueness will be determined solely from file size. This can be quicker,
796             but it will generate false positives: two files of the same size will be deemed
797             duplicates even though their contents may differ.
798              
799             =item * B<authoritative_dirs> => I<array[str]>
800              
801             Denote director(yE<verbar>ies) where authoritativeE<sol>"Original" copies are found.
802              
803             =item * B<detail> => I<true>
804              
805             Show details (a.k.a. --show-digest, --show-size, --show-count).
806              
807             =item * B<digest_args> => I<array>
808              
809             Some Digest algorithms require arguments; you can pass them here.
810              
811             =item * B<exclude_empty_files> => I<bool>
812              
813             (No description)
814              
815             =item * B<exclude_file_patterns> => I<array[str]>
816              
817             Filename (including path) regex patterns to exclude.
818              
819             =item * B<files>* => I<array[str]>
820              
821             (No description)
822              
823             =item * B<group_by_digest> => I<bool>
824              
825             Sort files by their digest (or size, if not computing digest), separate each different digest.
826              
827             =item * B<include_file_patterns> => I<array[str]>
828              
829             Filename (including path) regex patterns to include.
830              
831             =item * B<max_size> => I<filesize>
832              
833             Maximum file size to consider.
834              
835             =item * B<min_size> => I<filesize>
836              
837             Minimum file size to consider.
838              
839             =item * B<recurse> => I<bool>
840              
841             If set to true, will recurse into subdirectories.
842              
843             =item * B<report_duplicate> => I<int> (default: 2)
844              
845             Whether to return duplicate items.
846              
847             Can be set to either 0, 1, 2, or 3.
848              
849             If set to 0, duplicate items will not be returned.
850              
851             If set to 1 (the default for C<dupe-files>), will return all the duplicate
852             files. For example: C<file1> contains text 'a', C<file2> 'b', C<file3> 'a'. Then
853             C<file1> and C<file3> will be returned.
854              
855             If set to 2 (the default for C<uniq-files>), will only return the first of
856             duplicate items. Continuing from previous example, only C<file1> will be returned
857             because C<file2> is unique and C<file3> contains 'a' (already represented by
858             C<file1>). If one or more C<--authoritative-dir> (C<-O>) options are specified,
859             files under these directories will be preferred.
860              
861             If set to 3, will return all but the first of duplicate items. Continuing from
862             previous example: C<file3> will be returned. This is useful if you want to keep
863             only one copy of the duplicate content. You can use the output of this routine
864             to C<mv> or C<rm>. Similar to the previous case, if one or more
865             C<--authoritative-dir> (C<-O>) options are specified, then files under these
866             directories will not be listed if possible.
867              
868             =item * B<report_unique> => I<bool> (default: 1)
869              
870             Whether to return unique items.
871              
872             =item * B<show_count> => I<bool> (default: 0)
873              
874             Whether to return each file content's number of occurrences.
875              
876             1 means the file content is only encountered once (unique), 2 means there is one
877             duplicate, and so on.
878              
879             =item * B<show_digest> => I<true>
880              
881             Show the digest value (or the size, if not computing digest) for each file.
882              
883             Note that this routine does not compute digest for files which have unique
884             sizes, so they will show up as empty.
885              
886             =item * B<show_size> => I<true>
887              
888             Show the size for each file.
889              
890              
891             =back
892              
893             Returns an enveloped result (an array).
894              
895             First element ($status_code) is an integer containing an HTTP-like status code
896             (200 means OK, 4xx caller error, 5xx function error). Second element
897             ($reason) is a string containing an error message, or something like "OK" if status is
898             200. Third element ($payload) is the actual result, but it is usually not present when the enveloped result is an error response ($status_code is not 2xx). Fourth
899             element (%result_meta) is called result metadata and is optional, a hash
900             that contains extra information, much like how HTTP response headers provide additional metadata.
901              
902             Return value: (any)
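
An illustrative sketch tying several of the options above together (all paths are hypothetical, and the authoritative directory must exist or the function returns a 400 error):

    use App::UniqFiles qw(uniq_files);

    # List every duplicate copy except the first/preferred one; copies under
    # the authoritative directory are preferred and so are kept off the list.
    # detail => 1 turns each row into a hashref with file/size/digest/count keys.
    my $res = uniq_files(
        files              => ['.'],
        recurse            => 1,
        report_unique      => 0,
        report_duplicate   => 3,
        authoritative_dirs => ['/data/originals'],
        detail             => 1,
    );
    die "uniq_files failed: $res->[1]" unless $res->[0] == 200;
    for my $row (@{ $res->[2] }) {
        printf "%s (size=%d, count=%d)\n", $row->{file}, $row->{size}, $row->{count};
    }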
903              
904             =head1 HOMEPAGE
905              
906             Please visit the project's homepage at L<https://metacpan.org/release/App-UniqFiles>.
907              
908             =head1 SOURCE
909              
910             Source repository is at L<https://github.com/perlancar/perl-App-UniqFiles>.
911              
912             =head1 SEE ALSO
913              
914             L<find-duplicate-filenames> from L<App::FindUtils>
915              
916             L<move-duplicate-files-to> from L<App::DuplicateFilesUtils>, which is basically
917             a shortcut for C<< uniq-files -D -R . | while read f; do mv "$f" SOMEDIR/; done
918             >>.
919              
920             =head1 AUTHOR
921              
922             perlancar <perlancar@cpan.org>
923              
924             =head1 CONTRIBUTOR
925              
926             =for stopwords Steven Haryanto
927              
928             Steven Haryanto <stevenharyanto@gmail.com>
929              
930             =head1 CONTRIBUTING
931              
932              
933             To contribute, you can send patches by email/via RT, or send pull requests on
934             GitHub.
935              
936             Most of the time, you don't need to build the distribution yourself. You can
937             simply modify the code, then test via:
938              
939             % prove -l
940              
941             If you want to build the distribution (e.g. to try to install it locally on your
942             system), you can install L<Dist::Zilla>,
943             L<Dist::Zilla::PluginBundle::Author::PERLANCAR>,
944             L<Pod::Weaver::PluginBundle::Author::PERLANCAR>, and sometimes one or two other
945             Dist::Zilla- and/or Pod::Weaver plugins. Any additional steps required beyond
946             that are considered a bug and can be reported to me.
947              
948             =head1 COPYRIGHT AND LICENSE
949              
950             This software is copyright (c) 2022, 2020, 2019, 2017, 2015, 2014, 2012, 2011 by perlancar <perlancar@cpan.org>.
951              
952             This is free software; you can redistribute it and/or modify it under
953             the same terms as the Perl 5 programming language system itself.
954              
955             =head1 BUGS
956              
957             Please report any bugs or feature requests on the bugtracker website L<https://rt.cpan.org/Public/Dist/Display.html?Name=App-UniqFiles>
958              
959             When submitting a bug or request, please include a test-file or a
960             patch to an existing test-file that illustrates the bug or desired
961             feature.
962              
963             =cut