File Coverage

blib/lib/App/UniqFiles.pm
Criterion    Covered   Total       %
statement        176     232    75.8
branch            79     126    62.7
condition         44      67    65.6
subroutine        11      12    91.6
pod                1       1   100.0
total            311     438    71.0


line stmt bran cond sub pod time code
1             package App::UniqFiles;
2              
3 1     1   148801 use 5.010001;
  1         13  
4 1     1   6 use strict;
  1         2  
  1         18  
5 1     1   5 use warnings;
  1         2  
  1         23  
6 1     1   2273 use Log::ger;
  1         57  
  1         5  
7              
8 1     1   257 use Cwd qw(abs_path);
  1         2  
  1         49  
9 1     1   6 use Exporter qw(import);
  1         2  
  1         24  
10 1     1   626 use Perinci::Sub::Util qw(gen_modified_sub);
  1         2626  
  1         174  
11              
12             our $AUTHORITY = 'cpan:PERLANCAR'; # AUTHORITY
13             our $DATE = '2023-02-06'; # DATE
14             our $DIST = 'App-UniqFiles'; # DIST
15             our $VERSION = '0.141'; # VERSION
16              
17             our @EXPORT_OK = qw(uniq_files);
18              
19             our %SPEC;
20              
21             sub _glob {
22 10     10   64 require File::Find;
23              
24 10         21 my $dir;
25             my @res;
26             File::Find::finddepth(
27             sub {
28 30 50   30   313 return if -l $_;
29 30 100       265 return unless -f _;
30 1     1   8 no warnings 'once'; # $File::Find::dir
  1         2  
  1         2754  
31 20         260 push @res, "$File::Find::dir/$_";
32             },
33 10         1160 @_,
34             );
35 10         83 @res;
36             }
37              
38             our %argspec_authoritative_dirs = (
39             authoritative_dirs => {
40             summary => 'Denote director(y|ies) where authoritative/"Original" copies are found',
41             'x.name.is_plural' => 1,
42             'x.name.singular' => 'authoritative_dir',
43             schema => ['array*', of=>'str*'], # XXX dirname
44             cmdline_aliases => {O=>{}},
45             },
46             );
47             our %argspecs_filter = (
48             include_file_patterns => {
49             summary => 'Filename (including path) regex patterns to include',
50             'x.name.is_plural' => 1,
51             'x.name.singular' => 'include_file_pattern',
52             schema => ['array*', of=>'str*'], # XXX re
53             cmdline_aliases => {I=>{}},
54             },
55             exclude_file_patterns => {
56             summary => 'Filename (including path) regex patterns to exclude',
57             'x.name.is_plural' => 1,
58             'x.name.singular' => 'exclude_file_pattern',
59             schema => ['array*', of=>'str*'], # XXX re
60             cmdline_aliases => {X=>{}},
61             },
62             exclude_empty_files => {
63             schema => 'bool*',
64             cmdline_aliases => {Z=>{}},
65             },
66             min_size => {
67             summary => 'Minimum file size to consider',
68             schema => 'filesize*',
69             },
70             max_size => {
71             summary => 'Maximum file size to consider',
72             schema => 'filesize*',
73             },
74             );
75              
76             $SPEC{uniq_files} = {
77             v => 1.1,
78             summary => 'Report duplicate or unique file contents',
79             description => <<'_',
80              
81             Given a list of filenames, will check each file size and content for duplicate
82             content. Interface is a bit like the `uniq` Unix command-line program.
83              
84             _
85             args => {
86             files => {
87             schema => ['array*' => {of=>'str*'}],
88             req => 1,
89             pos => 0,
90             slurpy => 1,
91             },
92             recurse => {
93             schema => 'bool*',
94             cmdline_aliases => {R=>{}},
95             description => <<'_',
96              
97             If set to true, will recurse into subdirectories.
98              
99             _
100             },
101             group_by_digest => {
102             summary => 'Sort files by their digest (or size, if not computing digest), separating each different digest',
103             schema => 'bool*',
104             },
105             show_digest => {
106             summary => 'Show the digest value (or the size, if not computing digest) for each file',
107             description => <<'_',
108              
109             Note that this routine does not compute digest for files which have unique
110             sizes, so they will show up as empty.
111              
112             _
113             schema => 'true*',
114             },
115             show_size => {
116             summary => 'Show the size for each file',
117             schema => 'true*',
118             },
119             # TODO add option follow_symlinks?
120             report_unique => {
121             schema => [bool => {default=>1}],
122             summary => 'Whether to return unique items',
123             cmdline_aliases => {
124             a => {
125             summary => 'Alias for --report-unique --report-duplicate=1 (report all files)',
126             code => sub {
127             my $args = shift;
128             $args->{report_unique} = 1;
129             $args->{report_duplicate} = 1;
130             },
131             },
132             u => {
133             summary => 'Alias for --report-unique --report-duplicate=0',
134             code => sub {
135             my $args = shift;
136             $args->{report_unique} = 1;
137             $args->{report_duplicate} = 0;
138             },
139             },
140             d => {
141             summary =>
142             'Alias for --noreport-unique --report-duplicate=1',
143             code => sub {
144             my $args = shift;
145             $args->{report_unique} = 0;
146             $args->{report_duplicate} = 1;
147             },
148             },
149             D => {
150             summary =>
151             'Alias for --noreport-unique --report-duplicate=3',
152             code => sub {
153             my $args = shift;
154             $args->{report_unique} = 0;
155             $args->{report_duplicate} = 3;
156             },
157             },
158             },
159             },
160             report_duplicate => {
161             schema => [int => {in=>[0,1,2,3], default=>2}],
162             summary => 'Whether to return duplicate items',
163             description => <<'_',
164              
165             Can be set to either 0, 1, 2, or 3.
166              
167             If set to 0, duplicate items will not be returned.
168              
169             If set to 1 (the default for `dupe-files`), will return all the duplicate
170             files. For example: `file1` contains text 'a', `file2` 'b', `file3` 'a'. Then
171             `file1` and `file3` will be returned.
172              
173             If set to 2 (the default for `uniq-files`), will only return the first of
174             duplicate items. Continuing from the previous example, only `file1` will be returned
175             because `file2` is unique and `file3` contains 'a' (already represented by
176             `file1`). If one or more `--authoritative-dir` (`-O`) options are specified,
177             files under these directories will be preferred.
178              
179             If set to 3, will return all but the first of duplicate items. Continuing from
180             the previous example: `file3` will be returned. This is useful if you want to keep
181             only one copy of the duplicate content. You can use the output of this routine
182             to `mv` or `rm`. Similar to the previous case, if one or more
183             `--authoritative-dir` (`-O`) options are specified, then files under these
184             directories will not be listed if possible.
185              
186             _
187             cmdline_aliases => {
188             },
189             },
190             algorithm => {
191             schema => ['str*'],
192             summary => "What algorithm is used to compute the digest of the content",
193             description => <<'_',
194              
195             The default is to use `md5`. Some algorithms supported include `crc32`, `sha1`,
196             `sha256`, as well as `Digest` to use Perl <pm:Digest> which supports a lot of
197             other algorithms, e.g. `SHA-1`, `BLAKE2b`.
198              
199             If set to '', 'none', or 'size', then the digest will be set to the file size.
200             This means uniqueness will be determined solely from file size. This can be
201             quicker, but can generate false positives: two files of the same size will be
202             deemed duplicates even though their content may differ.
203              
204             If set to 'name', then only filename comparison will be performed. This can of
205             course generate lots of false positives, but in some cases you might want to
206             compare filenames for uniqueness.
207              
208             _
209             },
210             digest_args => {
211             schema => ['array*',
212              
213             # comment out temporarily, Perinci::Sub::GetArgs::Argv
214             # clashes with coerce rules; we should fix
215             # Perinci::Sub::GetArgs::Argv to observe coercion rules
216             # first
217             #of=>'str*',
218              
219             'x.perl.coerce_rules'=>['From_str::comma_sep']],
220             description => <<'_',
221              
222             Some Digest algorithms require arguments; you can pass them here.
223              
224             _
225             cmdline_aliases => {A=>{}},
226             },
227             show_count => {
228             schema => [bool => {default=>0}],
229             summary => "Whether to return each file content's ".
230             "number of occurrence",
231             description => <<'_',
232              
233             1 means the file content is only encountered once (unique), 2 means there is one
234             duplicate, and so on.
235              
236             _
237             cmdline_aliases => {count=>{}, c=>{}},
238             },
239             detail => {
240             summary => 'Show details (a.k.a. --show-digest, --show-size, --show-count)',
241             schema => 'true*',
242             cmdline_aliases => {l=>{}},
243             },
244             %argspec_authoritative_dirs,
245             %argspecs_filter,
246             },
247             examples => [
248             {
249             summary => 'List all files which do not have duplicate contents',
250             src => 'uniq-files *',
251             src_plang => 'bash',
252             test => 0,
253             'x.doc.show_result' => 0,
254             },
255             {
256             summary => 'List all files (recursively, and in detail) which have duplicate contents (all duplicate copies), excluding some files',
257             src => q(uniq-files -R -l -d -X '\.git/' --min-size 10k .),
258             src_plang => 'bash',
259             test => 0,
260             'x.doc.show_result' => 0,
261             },
262             {
263             summary => 'Move all duplicate files (except one copy) in this directory (and subdirectories) to .dupes/',
264             src => 'uniq-files -D -R * | while read f; do mv "$f" .dupes/; done',
265             src_plang => 'bash',
266             test => 0,
267             'x.doc.show_result' => 0,
268             },
269             {
270             summary => 'List number of occurrences of contents for duplicate files',
271             src => 'uniq-files -c *',
272             src_plang => 'bash',
273             test => 0,
274             'x.doc.show_result' => 0,
275             },
276             {
277             summary => 'List number of occurrences of contents for all files',
278             src => 'uniq-files -a -c *',
279             src_plang => 'bash',
280             test => 0,
281             'x.doc.show_result' => 0,
282             },
283             {
284             summary => 'List all files, along with their number of content occurrences and content digest. '.
285             'Use the BLAKE2b digest algorithm and group the files according to their digest.',
286             src => 'uniq-files -a -c --show-digest -A BLAKE2,blake2b *',
287             src_plang => 'bash',
288             test => 0,
289             'x.doc.show_result' => 0,
290             },
291             ],
292             };
293             sub uniq_files {
294 11     11 1 37974 my %args = @_;
295              
296 11         25 my $files = $args{files};
297 11 50 33     62 return [400, "Please specify files"] if !$files || !@$files;
298 11         15 my $recurse = $args{recurse};
299 11   100     32 my $report_unique = $args{report_unique} // 1;
300 11   100     81 my $report_duplicate = $args{report_duplicate} // 2;
301 11   100     41 my $show_count = $args{show_count} // 0;
302 11   100     25 my $show_digest = $args{show_digest} // 0;
303 11   100     32 my $show_size = $args{show_size} // 0;
304 11         15 my $digest_args = $args{digest_args};
305 11 50 66     40 my $algorithm = $args{algorithm} // ($digest_args ? 'Digest' : 'md5');
306 11         17 my $group_by_digest = $args{group_by_digest};
307              
308 11 50       23 if ($args{detail}) {
309 0         0 $show_digest = 1;
310 0         0 $show_size = 1;
311 0         0 $show_count = 1;
312             }
313              
314             my @authoritative_dirs = $args{authoritative_dirs} && @{$args{authoritative_dirs}} ?
315 11 100 66     29 @{ $args{authoritative_dirs} } : ();
  2         5  
316 11         24 for my $dir (@authoritative_dirs) {
317 2 50       33 (-d $dir) or return [400, "Authoritative dir '$dir' does not exist or not a directory"];
318 2 50       62 my $abs_dir = abs_path $dir or return [400, "Cannot get absolute path for authoritative dir '$dir'"];
319 2         8 $dir = $abs_dir;
320             }
321             #log_trace "authoritative_dirs=%s", \@authoritative_dirs if @authoritative_dirs;
322              
323 11         15 my @include_re;
324 11   50     15 for my $re0 (@{ $args{include_file_patterns} // [] }) {
  11         45  
325 0         0 require Regexp::Util;
326 0         0 my $re;
327 0 0       0 if (ref $re0 eq 'Regexp') {
328 0         0 $re = $re0;
329             } else {
330 0         0 eval { $re = Regexp::Util::deserialize_regexp("qr($re0)") };
  0         0  
331 0 0       0 return [400, "Invalid/unsafe regex pattern in include_file_patterns '$re0': $@"] if $@;
332 0 0       0 return [400, "Unsafe regex pattern (contains embedded code) in include_file_patterns '$re0'"] if Regexp::Util::regexp_seen_evals($re);
333             }
334 0         0 push @include_re, $re;
335             }
336 11         19 my @exclude_re;
337 11   50     16 for my $re0 (@{ $args{exclude_file_patterns} // [] }) {
  11         39  
338 0         0 require Regexp::Util;
339 0         0 my $re;
340 0 0       0 if (ref $re0 eq 'Regexp') {
341 0         0 $re = $re0;
342             } else {
343 0         0 eval { $re = Regexp::Util::deserialize_regexp("qr($re0)") };
  0         0  
344 0 0       0 return [400, "Invalid/unsafe regex pattern in exclude_file_patterns '$re0': $@"] if $@;
345 0 0       0 return [400, "Unsafe regex pattern (contains embedded code) in exclude_file_patterns '$re0'"] if Regexp::Util::regexp_seen_evals($re);
346             }
347 0         0 push @exclude_re, $re;
348             }
349              
350 11 100       24 if ($recurse) {
351             $files = [ map {
352 5 50       12 if (-l $_) {
  35 100       330  
353 0         0 ();
354             } elsif (-d _) {
355 10         34 (_glob($_));
356             } else {
357 25         94 ($_);
358             }
359             } @$files ];
360             }
361              
362             FILTER: {
363 11         19 my $ffiles;
  11         15  
364             FILE:
365 11         25 for my $f (@$files) {
366 87 50       778 if (-l $f) {
367 0         0 log_warn "File '$f' is a symlink, ignored";
368 0         0 next FILE;
369             }
370 87 100       218 if (-d _) {
371 12         54 log_warn "File '$f' is a directory, ignored";
372 12         46 next FILE;
373             }
374 75 50       132 unless (-f _) {
375 0         0 log_warn "File '$f' is not a regular file, ignored";
376 0         0 next FILE;
377             }
378              
379 75 50       151 if (@include_re) {
380 0         0 my $included;
381 0         0 for my $re (@include_re) {
382 0 0       0 if ($f =~ $re) { $included++; last }
  0         0  
  0         0  
383             }
384 0 0       0 unless ($included) {
385 0         0 log_info "File '$f' is not in --include-file-patterns, skipped";
386 0         0 next FILE;
387             }
388             }
389 75 50       150 if (@exclude_re) {
390 0         0 for my $re (@exclude_re) {
391 0 0       0 if ($f =~ $re) {
392 0         0 log_info "File '$f' is in --exclude-file-patterns, skipped";
393 0         0 next FILE;
394             }
395             }
396             }
397              
398 75         583 my $size = -s $f;
399 75 50 33     208 if ($args{exclude_empty_files} && !$size) {
400 0         0 log_info "File '$f' is empty, skipped by option -Z";
401 0         0 next FILE;
402             }
403 75 50 33     156 if ($args{min_size} && $size < $args{min_size}) {
404 0         0 log_info "File '$f' (size=$size) is smaller than min_size ($args{min_size}), skipped";
405 0         0 next FILE;
406             }
407 75 50 33     130 if ($args{max_size} && $size > $args{max_size}) {
408 0         0 log_info "File '$f' (size=$size) is larger than max_size ($args{max_size}), skipped";
409 0         0 next FILE;
410             }
411              
412 75         203 push @$ffiles, $f;
413             }
414 11         25 $files = $ffiles;
415             } # FILTER
416              
417 11         17 my %name_files; # key = filename (computed), value = [path, ...]
418             GROUP_FILE_NAMES: {
419 11         15 for my $f (@$files) {
  11         20  
420             #my $path = abs_path($f);
421 75         173 (my $basename = $f) =~ s!.+/!!;
422 75   50     280 $name_files{$basename} //= [];
423 75         154 push @{ $name_files{$basename} }, $f
424 75 50       95 unless grep { $_ eq $f } @{ $name_files{$basename} };
  0         0  
  75         189  
425             }
426             #use DD; dd \%name_files;
427             }
428              
429 11         30 my %size_counts; # key = size, value = number of files having that size
430             my %size_files; # key = size, value = [file, ...]
431 11         0 my %file_sizes; # key = filename, value = file size, for caching stat()
432             GET_FILE_SIZES: {
433 11         14 for my $f (@$files) {
  11         17  
434 75         730 my @st = stat $f;
435 75 50       192 unless (@st) {
436 0         0 log_error("Can't stat file `$f`: $!, skipped");
437 0         0 next;
438             }
439 75         162 $size_counts{$st[7]}++;
440 75   100     220 $size_files{$st[7]} //= [];
441 75         94 push @{$size_files{$st[7]}}, $f;
  75         157  
442 75         214 $file_sizes{$f} = $st[7];
443             }
444             }
445              
446 11   66     70 my $calc_digest = !($algorithm eq '' || $algorithm eq 'none' || $algorithm eq 'size' || $algorithm eq 'name');
447              
448             # calculate digest for all files having non-unique sizes
449 11         25 my %digest_counts; # key = digest, value = num of files having that digest
450             my %digest_files; # key = digest, value = [file, ...]
451 11         0 my %file_digests; # key = filename, value = file digest
452             CALC_FILE_DIGESTS: {
453 11 100       14 last unless $calc_digest;
  11         22  
454 10         522 require File::Digest;
455              
456 10         2197 for my $f (@$files) {
457 66 50       148 next unless defined $file_sizes{$f}; # just checking. all files should have sizes.
458 66 100       147 next if $size_counts{ $file_sizes{$f} } == 1; # skip unique file sizes.
459 60         146 my $res = File::Digest::digest_file(
460             file=>$f, algorithm=>$algorithm, digest_args=>$digest_args);
461 60 50       9846 return [500, "Can't calculate digest for file '$f': $res->[0] - $res->[1]"]
462             unless $res->[0] == 200;
463 60         107 my $digest = $res->[2];
464 60         123 $digest_counts{$digest}++;
465 60   100     207 $digest_files{$digest} //= [];
466 60         89 push @{$digest_files{$digest}}, $f;
  60         145  
467 60         160 $file_digests{$f} = $digest;
468             }
469             }
470              
471 11         21 my %file_counts; # key = file name, value = num of files having file content
472 11         18 for my $f (@$files) {
473 75 50       197 next unless defined $file_sizes{$f}; # just checking
474 75 100       123 if (!defined($file_digests{$f})) {
475 15         35 $file_counts{$f} = $size_counts{ $file_sizes{$f} };
476             } else {
477 60         110 $file_counts{$f} = $digest_counts{ $file_digests{$f} };
478             }
479             }
480              
481             SORT_DUPLICATE_FILES: {
482 11 100       15 last unless @authoritative_dirs;
  11         26  
483 2 0       8 my $hash = $calc_digest ? \%digest_files : $algorithm eq 'name' ? \%name_files : \%size_files;
    50          
484 2         8 for my $key (keys %$hash) {
485 10         31 my @files = @{ $hash->{$key} };
  10         22  
486 10         14 my @abs_files;
487 10 100       22 next unless @files > 1;
488 4         8 for my $file (@files) {
489 12 50       194 my $abs_file = abs_path $file or do {
490 0         0 log_error "Cannot find absolute path for duplicate file '$file', skipping duplicate set %s", \@files;
491             };
492 12         37 push @abs_files, $abs_file;
493             }
494              
495             #log_trace "Duplicate files before sorting: %s", \@files;
496 12         29 @files = map { $files[$_] } sort {
497 4         19 my $file_a = $abs_files[$a];
  10         20  
498 10         13 my $file_a_in_authoritative_dirs = 0;
499 10         15 my $subdir_len_file_a;
500 10         13 for my $d (@authoritative_dirs) {
501 10 50       69 if ($file_a =~ m!\A\Q$d\E(?:/|\z)(.*)!) { $file_a_in_authoritative_dirs++; $subdir_len_file_a = length($1); last }
  0         0  
  0         0  
  0         0  
502             }
503 10         16 my $file_b = $abs_files[$b];
504 10         15 my $file_b_in_authoritative_dirs = 0;
505 10         14 my $subdir_len_file_b;
506 10         11 for my $d (@authoritative_dirs) {
507 10 100       53 if ($file_b =~ m!\A\Q$d\E(?:/|\z)(.*)!) { $file_b_in_authoritative_dirs++; $subdir_len_file_b = length($1); last }
  2         3  
  2         7  
  2         4  
508             }
509             #log_trace " file_a=<$file_a>, in authoritative_dirs? $file_a_in_authoritative_dirs";
510             #log_trace " file_b=<$file_b>, in authoritative_dirs? $file_b_in_authoritative_dirs";
511             # files located near the root of an authoritative dir are preferred
512             # to deeper files. this is done by comparing subdir_len
513 10 50       39 ($file_a_in_authoritative_dirs ? $subdir_len_file_a : 9999) <=> ($file_b_in_authoritative_dirs ? $subdir_len_file_b : 9999) ||
    100          
    50          
514             $file_a cmp $file_b;
515             } 0..$#files;
516             #log_trace "Duplicate files after sorting: %s", \@files;
517              
518 4         14 $hash->{$key} = \@files;
519             }
520             }
521              
522             #$log->trace("report_duplicate=$report_duplicate");
523 11         19 my @files;
524 11         64 for my $f (sort keys %file_counts) {
525 75 100       155 if ($file_counts{$f} == 1) {
526             #log_trace "unique file '$f'";
527 24 100       48 push @files, $f if $report_unique;
528             } else {
529             #log_trace "duplicate file '$f'";
530             my $is_first_copy = $calc_digest ?
531             $f eq $digest_files{ $file_digests{$f} }[0] :
532 51 100       107 $f eq $size_files{ $file_sizes{$f} }[0];
533             #log_trace "is first copy? <$is_first_copy>";
534 51 100       114 if ($report_duplicate == 0) {
    100          
    100          
    50          
535             # do not report dupe files
536             } elsif ($report_duplicate == 1) {
537 15         32 push @files, $f;
538             } elsif ($report_duplicate == 2) {
539 21 100       43 push @files, $f if $is_first_copy;
540             } elsif ($report_duplicate == 3) {
541 9 100       25 push @files, $f unless $is_first_copy;
542             } else {
543 0         0 die "Invalid value for --report-duplicate ".
544             "'$report_duplicate', please choose 0/1/2/3";
545             }
546             }
547             }
548              
549             GROUP_FILES_BY_DIGEST: {
550 11 100       17 last unless $group_by_digest;
  11         24  
551             @files = sort {
552 1         9 $file_sizes{$a} <=> $file_sizes{$b} ||
553 20 50 50     55 ($file_digests{$a} // '') cmp ($file_digests{$b} // '')
      50        
554             } @files;
555             }
556              
557 11         32 my @rows;
558             my %resmeta;
559 11         0 my $last_digest;
560 11         20 for my $f (@files) {
561 41   66     88 my $digest = $file_digests{$f} // $file_sizes{$f};
562              
563             # add separator row
564 41 100 100     98 if ($group_by_digest && defined $last_digest && $digest ne $last_digest) {
      100        
565 4 50 33     16 push @rows, ($show_count || $show_digest || $show_size) ? {} : '';
566             }
567              
568 41         48 my $row;
569 41 100 100     131 if ($show_count || $show_digest || $show_size) {
      100        
570 19         37 $row = {file=>$f};
571 19 100       42 $row->{count} = $file_counts{$f} if $show_count;
572 19 100       35 $row->{digest} = $file_digests{$f} if $show_digest;
573 19 100       32 $row->{size} = $file_sizes{$f} if $show_size;
574             } else {
575 22         30 $row = $f;
576             }
577 41         65 push @rows, $row;
578 41         67 $last_digest = $digest;
579             }
580              
581 11         36 $resmeta{'table.fields'} = [qw/file size digest count/];
582              
583 11         161 [200, "OK", \@rows, \%resmeta];
584             }
585              
586             gen_modified_sub(
587             base_name => 'uniq_files',
588             output_name => 'dupe_files',
589             description => <<'_',
590              
591             This is a thin wrapper for <prog:uniq-files>. It defaults `report_unique` to 0
592             and `report_duplicate` to 1.
593              
594             _
595             modify_args => {
596             report_unique => sub {
597             $_[0]{schema} = [bool => {default=>0}];
598             },
599             report_duplicate => sub {
600             $_[0]{schema} = [int => {in=>[0,1,2,3], default=>1}];
601             },
602             },
603             modify_meta => sub {
604             $_[0]{examples} = [
605             {
606             summary => 'List all files (recursively, and in detail) which have duplicate contents (all duplicate copies)',
607             src => 'dupe-files -lR *',
608             src_plang => 'bash',
609             test => 0,
610             'x.doc.show_result' => 0,
611             },
612             ];
613             },
614             output_code => sub {
615 0     0     my %args = @_;
616 0   0       $args{report_unique} //= 0;
617 0   0       $args{report_duplicate} //= 1;
618 0           uniq_files(%args);
619             },
620             );
621              
622             1;
623             # ABSTRACT: Report duplicate or unique file contents
624              
625             __END__
626              
627             =pod
628              
629             =encoding UTF-8
630              
631             =head1 NAME
632              
633             App::UniqFiles - Report duplicate or unique file contents
634              
635             =head1 VERSION
636              
637             This document describes version 0.141 of App::UniqFiles (from Perl distribution App-UniqFiles), released on 2023-02-06.
638              
639             =head1 SYNOPSIS
640              
641             # See uniq-files script
642              
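The function can also be called directly from Perl. A minimal sketch (the file
names are hypothetical; the defaults report unique files plus the first copy of
each duplicate):

    use App::UniqFiles qw(uniq_files);

    my $res = uniq_files(files => ["foo.txt", "bar.txt", "baz.txt"]);
    if ($res->[0] == 200) {
        print "$_\n" for @{ $res->[2] };  # plain file names when no detail options are set
    } else {
        warn "uniq_files failed: $res->[0] $res->[1]";
    }
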
643             =head1 NOTES
644              
645             =head1 FUNCTIONS
646              
647              
648             =head2 dupe_files
649              
650             Usage:
651              
652             dupe_files(%args) -> [$status_code, $reason, $payload, \%result_meta]
653              
654             Report duplicate or unique file contents.
655              
656             This is a thin wrapper for L<uniq-files>. It defaults C<report_unique> to 0
657             and C<report_duplicate> to 1.
658              
659             This function is not exported.
660              
661             Arguments ('*' denotes required arguments):
662              
663             =over 4
664              
665             =item * B<algorithm> => I<str>
666              
667             What algorithm is used to compute the digest of the content.
668              
669             The default is to use C<md5>. Some algorithms supported include C<crc32>, C<sha1>,
670             C<sha256>, as well as C<Digest> to use Perl L<Digest> which supports a lot of
671             other algorithms, e.g. C<SHA-1>, C<BLAKE2b>.
672              
673             If set to '', 'none', or 'size', then the digest will be set to the file size.
674             This means uniqueness will be determined solely from file size. This can be
675             quicker, but can generate false positives: two files of the same size will be
676             deemed duplicates even though their content may differ.
677              
678             If set to 'name', then only filename comparison will be performed. This can of
679             course generate lots of false positives, but in some cases you might want to
680             compare filenames for uniqueness.
681              
682             =item * B<authoritative_dirs> => I<array[str]>
683              
684             Denote director(yE<verbar>ies) where authoritativeE<sol>"Original" copies are found.
685              
686             =item * B<detail> => I<true>
687              
688             Show details (a.k.a. --show-digest, --show-size, --show-count).
689              
690             =item * B<digest_args> => I<array>
691              
692             Some Digest algorithms require arguments; you can pass them here.
693              
694             =item * B<exclude_empty_files> => I<bool>
695              
696             (No description)
697              
698             =item * B<exclude_file_patterns> => I<array[str]>
699              
700             Filename (including path) regex patterns to exclude.
701              
702             =item * B<files>* => I<array[str]>
703              
704             (No description)
705              
706             =item * B<group_by_digest> => I<bool>
707              
708             Sort files by their digest (or size, if not computing digest), separating each different digest.
709              
710             =item * B<include_file_patterns> => I<array[str]>
711              
712             Filename (including path) regex patterns to include.
713              
714             =item * B<max_size> => I<filesize>
715              
716             Maximum file size to consider.
717              
718             =item * B<min_size> => I<filesize>
719              
720             Minimum file size to consider.
721              
722             =item * B<recurse> => I<bool>
723              
724             If set to true, will recurse into subdirectories.
725              
726             =item * B<report_duplicate> => I<int> (default: 1)
727              
728             Whether to return duplicate items.
729              
730             Can be set to either 0, 1, 2, or 3.
731              
732             If set to 0, duplicate items will not be returned.
733              
734             If set to 1 (the default for C<dupe-files>), will return all the duplicate
735             files. For example: C<file1> contains text 'a', C<file2> 'b', C<file3> 'a'. Then
736             C<file1> and C<file3> will be returned.
737              
738             If set to 2 (the default for C<uniq-files>), will only return the first of
739             duplicate items. Continuing from the previous example, only C<file1> will be returned
740             because C<file2> is unique and C<file3> contains 'a' (already represented by
741             C<file1>). If one or more C<--authoritative-dir> (C<-O>) options are specified,
742             files under these directories will be preferred.
743              
744             If set to 3, will return all but the first of duplicate items. Continuing from
745             the previous example: C<file3> will be returned. This is useful if you want to keep
746             only one copy of the duplicate content. You can use the output of this routine
747             to C<mv> or C<rm>. Similar to the previous case, if one or more
748             C<--authoritative-dir> (C<-O>) options are specified, then files under these
749             directories will not be listed if possible.
750              
751             =item * B<report_unique> => I<bool> (default: 0)
752              
753             Whether to return unique items.
754              
755             =item * B<show_count> => I<bool> (default: 0)
756              
757             Whether to return each file content's number of occurrence.
758              
759             1 means the file content is only encountered once (unique), 2 means there is one
760             duplicate, and so on.
761              
762             =item * B<show_digest> => I<true>
763              
764             Show the digest value (or the size, if not computing digest) for each file.
765              
766             Note that this routine does not compute digest for files which have unique
767             sizes, so they will show up as empty.
768              
769             =item * B<show_size> => I<true>
770              
771             Show the size for each file.
772              
773              
774             =back
775              
776             Returns an enveloped result (an array).
777              
778             First element ($status_code) is an integer containing an HTTP-like status code
779             (200 means OK, 4xx caller error, 5xx function error). Second element ($reason)
780             is a string containing an error message, or something like "OK" if the status is
781             200. Third element ($payload) is the actual result, usually absent when the
782             enveloped result is an error response ($status_code is not 2xx). Fourth element
783             (%result_meta) is optional result metadata: a hash of extra information, much like how HTTP response headers provide additional metadata.
784              
785             Return value: (any)
786              
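As a sketch of the authoritative-directory behaviour described under
C<report_duplicate> above: C<dupe_files()> itself is not exported, so this
hypothetical example calls C<uniq_files()> with the dupe-files defaults. With
C<report_duplicate> set to 3, copies under the authoritative directory are kept
out of the list where possible, so what is listed are the redundant copies.

    use App::UniqFiles qw(uniq_files);

    my $res = uniq_files(
        files              => [glob "*"],
        recurse            => 1,
        report_unique      => 0,
        report_duplicate   => 3,             # all but the preferred copy
        authoritative_dirs => ["originals"], # hypothetical directory to keep
    );
    unlink @{ $res->[2] } if $res->[0] == 200;  # remove the redundant copies
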
787              
788              
789             =head2 uniq_files
790              
791             Usage:
792              
793             uniq_files(%args) -> [$status_code, $reason, $payload, \%result_meta]
794              
795             Report duplicate or unique file contents.
796              
797             Given a list of filenames, will check each file size and content for duplicate
798             content. Interface is a bit like the C<uniq> Unix command-line program.
799              
800             This function is not exported by default, but exportable.
801              
802             Arguments ('*' denotes required arguments):
803              
804             =over 4
805              
806             =item * B<algorithm> => I<str>
807              
808             What algorithm is used to compute the digest of the content.
809              
810             The default is to use C<md5>. Some algorithms supported include C<crc32>, C<sha1>,
811             C<sha256>, as well as C<Digest> to use Perl L<Digest> which supports a lot of
812             other algorithms, e.g. C<SHA-1>, C<BLAKE2b>.
813              
814             If set to '', 'none', or 'size', then the digest will be set to the file size.
815             This means uniqueness will be determined solely from file size. This can be
816             quicker, but can generate false positives: two files of the same size will be
817             deemed duplicates even though their content may differ.
818              
819             If set to 'name', then only filename comparison will be performed. This can of
820             course generate lots of false positives, but in some cases you might want to
821             compare filenames for uniqueness.
822              
823             =item * B<authoritative_dirs> => I<array[str]>
824              
825             Denote director(yE<verbar>ies) where authoritativeE<sol>"Original" copies are found.
826              
827             =item * B<detail> => I<true>
828              
829             Show details (a.k.a. --show-digest, --show-size, --show-count).
830              
831             =item * B<digest_args> => I<array>
832              
833             Some Digest algorithms require arguments; you can pass them here.
834              
835             =item * B<exclude_empty_files> => I<bool>
836              
837             (No description)
838              
839             =item * B<exclude_file_patterns> => I<array[str]>
840              
841             Filename (including path) regex patterns to exclude.
842              
843             =item * B<files>* => I<array[str]>
844              
845             (No description)
846              
847             =item * B<group_by_digest> => I<bool>
848              
849             Sort files by their digest (or size, if not computing digest), separating each different digest.
850              
851             =item * B<include_file_patterns> => I<array[str]>
852              
853             Filename (including path) regex patterns to include.
854              
855             =item * B<max_size> => I<filesize>
856              
857             Maximum file size to consider.
858              
859             =item * B<min_size> => I<filesize>
860              
861             Minimum file size to consider.
862              
863             =item * B<recurse> => I<bool>
864              
865             If set to true, will recurse into subdirectories.
866              
867             =item * B<report_duplicate> => I<int> (default: 2)
868              
869             Whether to return duplicate items.
870              
871             Can be set to either 0, 1, 2, or 3.
872              
873             If set to 0, duplicate items will not be returned.
874              
875             If set to 1 (the default for C<dupe-files>), will return all the duplicate
876             files. For example: C<file1> contains text 'a', C<file2> 'b', C<file3> 'a'. Then
877             C<file1> and C<file3> will be returned.
878              
879             If set to 2 (the default for C<uniq-files>), will only return the first of
880             duplicate items. Continuing from the previous example, only C<file1> will be returned
881             because C<file2> is unique and C<file3> contains 'a' (already represented by
882             C<file1>). If one or more C<--authoritative-dir> (C<-O>) options are specified,
883             files under these directories will be preferred.
884              
885             If set to 3, will return all but the first of duplicate items. Continuing from
886             the previous example: C<file3> will be returned. This is useful if you want to keep
887             only one copy of the duplicate content. You can use the output of this routine
888             to C<mv> or C<rm>. Similar to the previous case, if one or more
889             C<--authoritative-dir> (C<-O>) options are specified, then files under these
890             directories will not be listed if possible.
891              
892             =item * B<report_unique> => I<bool> (default: 1)
893              
894             Whether to return unique items.
895              
896             =item * B<show_count> => I<bool> (default: 0)
897              
898             Whether to return each file content's number of occurrence.
899              
900             1 means the file content is only encountered once (unique), 2 means there is one
901             duplicate, and so on.
902              
903             =item * B<show_digest> => I<true>
904              
905             Show the digest value (or the size, if not computing digest) for each file.
906              
907             Note that this routine does not compute digest for files which have unique
908             sizes, so they will show up as empty.
909              
910             =item * B<show_size> => I<true>
911              
912             Show the size for each file.
913              
914              
915             =back
916              
917             Returns an enveloped result (an array).
918              
919             First element ($status_code) is an integer containing an HTTP-like status code
920             (200 means OK, 4xx caller error, 5xx function error). Second element ($reason)
921             is a string containing an error message, or something like "OK" if the status is
922             200. Third element ($payload) is the actual result, usually absent when the
923             enveloped result is an error response ($status_code is not 2xx). Fourth element
924             (%result_meta) is optional result metadata: a hash of extra information, much like how HTTP response headers provide additional metadata.
925              
926             Return value: (any)
927              
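A sketch of consuming the enveloped result when C<detail> is enabled (the row
keys follow the C<table.fields> metadata returned by the function):

    use App::UniqFiles qw(uniq_files);

    my $res = uniq_files(files => [glob "*"], detail => 1);
    die "uniq_files failed: $res->[1]" unless $res->[0] == 200;
    for my $row (@{ $res->[2] }) {
        # each row is a hashref; digest is empty for files whose size is
        # unique, because no digest is computed for them
        printf "%s  size=%d  count=%d  digest=%s\n",
            $row->{file}, $row->{size}, $row->{count}, $row->{digest} // '';
    }
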
928             =head1 HOMEPAGE
929              
930             Please visit the project's homepage at L<https://metacpan.org/release/App-UniqFiles>.
931              
932             =head1 SOURCE
933              
934             Source repository is at L<https://github.com/perlancar/perl-App-UniqFiles>.
935              
936             =head1 SEE ALSO
937              
938             L<find-duplicate-filenames> from L<App::FindUtils>
939              
940             L<move-duplicate-files-to> from L<App::DuplicateFilesUtils>, which is basically
941             a shortcut for C<< uniq-files -D -R . | while read f; do mv "$f" SOMEDIR/; done
942             >>.
943              
944             =head1 AUTHOR
945              
946             perlancar <perlancar@cpan.org>
947              
948             =head1 CONTRIBUTOR
949              
950             =for stopwords Steven Haryanto
951              
952             Steven Haryanto <stevenharyanto@gmail.com>
953              
954             =head1 CONTRIBUTING
955              
956              
957             To contribute, you can send patches by email/via RT, or send pull requests on
958             GitHub.
959              
960             Most of the time, you don't need to build the distribution yourself. You can
961             simply modify the code, then test via:
962              
963             % prove -l
964              
965             If you want to build the distribution (e.g. to try to install it locally on your
966             system), you can install L<Dist::Zilla>,
967             L<Dist::Zilla::PluginBundle::Author::PERLANCAR>,
968             L<Pod::Weaver::PluginBundle::Author::PERLANCAR>, and sometimes one or two other
969             Dist::Zilla- and/or Pod::Weaver plugins. Any additional steps required beyond
970             that are considered a bug and can be reported to me.
971              
972             =head1 COPYRIGHT AND LICENSE
973              
974             This software is copyright (c) 2023, 2022, 2020, 2019, 2017, 2015, 2014, 2012, 2011 by perlancar <perlancar@cpan.org>.
975              
976             This is free software; you can redistribute it and/or modify it under
977             the same terms as the Perl 5 programming language system itself.
978              
979             =head1 BUGS
980              
981             Please report any bugs or feature requests on the bugtracker website L<https://rt.cpan.org/Public/Dist/Display.html?Name=App-UniqFiles>
982              
983             When submitting a bug or request, please include a test-file or a
984             patch to an existing test-file that illustrates the bug or desired
985             feature.
986              
987             =cut