File Coverage

blib/lib/File/FindUniq.pm
Criterion Covered Total %
statement 188 262 71.7
branch 80 126 63.4
condition 53 86 61.6
subroutine 11 16 68.7
pod 1 1 100.0
total 333 491 67.8


line stmt bran cond sub pod time code
1             package File::FindUniq;
2              
3 2     2   530292 use 5.010001;
  2         9  
4 2     2   12 use strict;
  2         10  
  2         56  
5 2     2   15 use warnings;
  2         5  
  2         127  
6 2     2   4002 use Log::ger;
  2         123  
  2         13  
7              
8 2     2   694 use Cwd qw(abs_path);
  2         3  
  2         158  
9 2     2   11 use Exporter qw(import);
  2         4  
  2         68  
10 2     2   1347 use Perinci::Sub::Util qw(gen_modified_sub);
  2         7327  
  2         419  
11              
12             our $AUTHORITY = 'cpan:PERLANCAR'; # AUTHORITY
13             our $DATE = '2025-05-03'; # DATE
14             our $DIST = 'File-FindUniq'; # DIST
15             our $VERSION = '0.004'; # VERSION
16              
17             sub _glob {
18 10     10   109 require File::Find;
19              
20 10         24 my $dir;
21             my @res;
22             File::Find::finddepth(
23             sub {
24 30 50   30   459 return if -l $_;
25 30 100       271 return unless -f _;
26 2     2   24 no warnings 'once'; # $File::Find::dir
  2         3  
  2         8826  
27 20         4479 push @res, "$File::Find::dir/$_";
28             },
29 10         2052 @_,
30             );
31 10         98 @res;
32             }
33              
34             our @EXPORT_OK = qw(uniq_files dupe_files);
35              
36             our %SPEC;
37              
38             $SPEC{':package'} = {
39             v => 1.1,
40             summary => 'Find unique or duplicate file {contents,names}',
41             };
42              
43             our %argspec_authoritative_dirs = (
44             authoritative_dirs => {
45             summary => 'Denote director(y|ies) where authoritative/"Original" copies are found',
46             'x.name.is_plural' => 1,
47             'x.name.singular' => 'authoritative_dir',
48             schema => ['array*', of=>'str*'], # XXX dirname
49             cmdline_aliases => {O=>{}},
50             },
51             );
52             our %argspecs_filter = (
53             include_file_patterns => {
54             summary => 'Filename (including path) regex patterns to include',
55             'x.name.is_plural' => 1,
56             'x.name.singular' => 'include_file_pattern',
57             schema => ['array*', of=>'str*'], # XXX re
58             cmdline_aliases => {I=>{}},
59             },
60             exclude_file_patterns => {
61             summary => 'Filename (including path) regex patterns to exclude',
62             'x.name.is_plural' => 1,
63             'x.name.singular' => 'exclude_file_pattern',
64             schema => ['array*', of=>'str*'], # XXX re
65             cmdline_aliases => {X=>{}},
66             },
67             exclude_empty_files => {
68             schema => 'bool*',
69             cmdline_aliases => {Z=>{}},
70             },
71             min_size => {
72             summary => 'Minimum file size to consider',
73             schema => 'filesize*',
74             },
75             max_size => {
76             summary => 'Maximum file size to consider',
77             schema => 'filesize*',
78             },
79             );
80              
81             $SPEC{uniq_files} = {
82             v => 1.1,
83             summary => 'Report duplicate or unique files, optionally perform action on them',
84             description => <<'MARKDOWN',
85              
86             Given a list of filenames, will check each file's content (and/or size, and/or
87             only name) to decide whether the file is a duplicate of another.
88              
89             There is a certain amount of flexibility on how duplicate is determined:
90             - when comparing content, various hashing algorithm is supported;
91             - when comparing size, a certain tolerance % is allowed;
92             - when comparing filename, munging can first be done.
93              
94             There is flexibility on what to do with duplicate files:
95             - just print unique/duplicate files (and let other utilities down the pipe deal
96             with them);
97             - move duplicates to some location;
98             - open the files first and prompt for action;
99             - let a Perl code process the files.
100              
101             Interface is loosely based on the `uniq` Unix command-line program.
102              
103             MARKDOWN
104             args => {
105             # actions => {
106             # 'x.name.is_plural' => 1,
107             # 'x.name.singular' => 'action',
108             # summary => 'What action(s) to perform',
109             # schema => ['array*', of=>['str*', in=>[qw/report/]], 'prefilters'=>['Array::check_uniq']],
110             # default => ['report'],
111             # description => <<'MARKDOWN',
112             #
113             #The following actions are available. More than one action can be
114             #
115             #MARKDOWN
116             # tags => ['category:input'],
117             # },
118             files => {
119             schema => ['array*' => {of=>'str*'}],
120             req => 1,
121             pos => 0,
122             slurpy => 1,
123             tags => ['category:input'],
124             },
125              
126             recurse => {
127             schema => 'bool*',
128             cmdline_aliases => {R=>{}},
129             description => <<'MARKDOWN',
130              
131             If set to true, will recurse into subdirectories.
132              
133             MARKDOWN
134             tags => ['category:input'],
135             },
136             group_by_digest => {
137             summary => 'Sort files by its digest (or size, if not computing digest), separate each different digest',
138             schema => 'bool*',
139             },
140             show_digest => {
141             summary => 'Show the digest value (or the size, if not computing digest) for each file',
142             description => <<'MARKDOWN',
143              
144             Note that this routine does not compute digest for files which have unique
145             sizes, so they will show up as empty.
146              
147             MARKDOWN
148             schema => 'true*',
149             },
150             show_size => {
151             summary => 'Show the size for each file',
152             schema => 'true*',
153             },
154             # TODO add option follow_symlinks?
155             report_unique => {
156             schema => [bool => {default=>1}],
157             summary => 'Whether to return unique items',
158             cmdline_aliases => {
159             a => {
160             summary => 'Alias for --report-unique --report-duplicate=1 (report all files)',
161             code => sub {
162             my $args = shift;
163             $args->{report_unique} = 1;
164             $args->{report_duplicate} = 1;
165             },
166             },
167             u => {
168             summary => 'Alias for --report-unique --report-duplicate=0',
169             code => sub {
170             my $args = shift;
171             $args->{report_unique} = 1;
172             $args->{report_duplicate} = 0;
173             },
174             },
175             d => {
176             summary =>
177             'Alias for --noreport-unique --report-duplicate=1',
178             code => sub {
179             my $args = shift;
180             $args->{report_unique} = 0;
181             $args->{report_duplicate} = 1;
182             },
183             },
184             D => {
185             summary =>
186             'Alias for --noreport-unique --report-duplicate=3',
187             code => sub {
188             my $args = shift;
189             $args->{report_unique} = 0;
190             $args->{report_duplicate} = 3;
191             },
192             },
193             },
194             },
195             report_duplicate => {
196             schema => [int => {in=>[0,1,2,3], default=>2}],
197             summary => 'Whether to return duplicate items',
198             description => <<'MARKDOWN',
199              
200             Can be set to either 0, 1, 2, or 3.
201              
202             If set to 0, duplicate items will not be returned.
203              
204             If set to 1 (the default for `dupe-files`), will return all the duplicate
205             files. For example: `file1` contains text 'a', `file2` 'b', `file3` 'a'. Then
206             `file1` and `file3` will be returned.
207              
208             If set to 2 (the default for `uniq-files`), will only return the first of
209             duplicate items. Continuing from previous example, only `file1` will be returned
210             because `file2` is unique and `file3` contains 'a' (already represented by
211             `file1`). If one or more `--authoritative-dir` (`-O`) options are specified,
212             files under these directories will be preferred.
213              
214             If set to 3, will return all but the first of duplicate items. Continuing from
215             previous example: `file3` will be returned. This is useful if you want to keep
216             only one copy of the duplicate content. You can use the output of this routine
217             to `mv` or `rm`. Similar to the previous case, if one or more
218             `--authoritative-dir` (`-O`) options are specified, then files under these
219             directories will not be listed if possible.
220              
221             MARKDOWN
222             cmdline_aliases => {
223             },
224             },
225             algorithm => {
226             schema => ['str*'],
227             summary => "What algorithm is used to compute the digest of the content",
228             description => <<'MARKDOWN',
229              
230             The default is to use `md5`. Some algorithms supported include `crc32`, `sha1`,
231             `sha256`, as well as `Digest` to use Perl <pm:Digest> which supports a lot of
232             other algorithms, e.g. `SHA-1`, `BLAKE2b`.
233              
234             If set to '', 'none', or 'size', then digest will be set to file size. This
235             means uniqueness will be determined solely from file size. This can be quicker
236             but will generate a false positive when two files of the same size are deemed as
237             duplicate even though their content may be different.
238              
239             If set to 'name' then only name comparison will be performed. This of course can
240             potentially generate lots of false positives, but in some cases you might want
241             to compare filename for uniqueness.
242              
243             MARKDOWN
244             },
245             digest_args => {
246             schema => ['array*',
247              
248             # comment out temporarily, Perinci::Sub::GetArgs::Argv
249             # clashes with coerce rules; we should fix
250             # Perinci::Sub::GetArgs::Argv to observe coercion rules
251             # first
252             #of=>'str*',
253              
254             'x.perl.coerce_rules'=>['From_str::comma_sep']],
255             description => <<'MARKDOWN',
256              
257             Some Digest algorithms require arguments, you can pass them here.
258              
259             MARKDOWN
260             cmdline_aliases => {A=>{}},
261             },
262             show_count => {
263             schema => [bool => {default=>0}],
264             summary => "Whether to return each file content's ".
265             "number of occurence",
266             description => <<'MARKDOWN',
267              
268             1 means the file content is only encountered once (unique), 2 means there is one
269             duplicate, and so on.
270              
271             MARKDOWN
272             cmdline_aliases => {count=>{}, c=>{}},
273             },
274             detail => {
275             summary => 'Show details (a.k.a. --show-digest, --show-size, --show-count)',
276             schema => 'true*',
277             cmdline_aliases => {l=>{}},
278             },
279             %argspec_authoritative_dirs,
280             %argspecs_filter,
281             },
282             examples => [
283             {
284             summary => 'List all files which do not have duplicate contents',
285             src => 'uniq-files *',
286             src_plang => 'bash',
287             test => 0,
288             'x.doc.show_result' => 0,
289             },
290             {
291             summary => 'List all files (recursively, and in detail) which have duplicate contents (all duplicate copies), exclude some files',
292             src => q(uniq-files -R -l -d -X '\.git/' --min-size 10k .),
293             src_plang => 'bash',
294             test => 0,
295             'x.doc.show_result' => 0,
296             },
297             {
298             summary => 'Move all duplicate files (except one copy) in this directory (and subdirectories) to .dupes/',
299             src => 'uniq-files -D -R * | while read f; do mv "$f" .dupes/; done',
300             src_plang => 'bash',
301             test => 0,
302             'x.doc.show_result' => 0,
303             },
304             {
305             summary => 'List number of occurences of contents for duplicate files',
306             src => 'uniq-files -c *',
307             src_plang => 'bash',
308             test => 0,
309             'x.doc.show_result' => 0,
310             },
311             {
312             summary => 'List number of occurences of contents for all files',
313             src => 'uniq-files -a -c *',
314             src_plang => 'bash',
315             test => 0,
316             'x.doc.show_result' => 0,
317             },
318             {
319             summary => 'List all files, along with their number of content occurrences and content digest. '.
320             'Use the BLAKE2b digest algorithm. And group the files according to their digest.',
321             src => 'uniq-files -a -c --show-digest -A BLAKE2,blake2b *',
322             src_plang => 'bash',
323             test => 0,
324             'x.doc.show_result' => 0,
325             },
326             ],
327             };
328             sub uniq_files {
329 11     11 1 624989 my %args = @_;
330              
331 11         42 my $files = delete $args{files};
332 11 50 33     93 return [400, "Please specify files"] if !$files || !@$files;
333 11         32 my $recurse = delete($args{recurse});
334 11   100     49 my $report_unique = delete($args{report_unique}) // 1;
335 11   100     42 my $report_duplicate = delete($args{report_duplicate}) // 2;
336 11   100     48 my $show_count = delete($args{show_count}) // 0;
337 11   100     45 my $show_digest = delete($args{show_digest}) // 0;
338 11   100     48 my $show_size = delete($args{show_size}) // 0;
339 11         23 my $digest_args = delete($args{digest_args});
340 11 50 66     58 my $algorithm = delete($args{algorithm}) // ($digest_args ? 'Digest' : 'md5');
341 11         24 my $group_by_digest = delete($args{group_by_digest});
342 11         22 my $detail = delete($args{detail});
343 11         23 my $authoritative_dirs = delete($args{authoritative_dirs});
344 11         24 my $include_file_patterns = delete($args{include_file_patterns});
345 11         21 my $exclude_file_patterns = delete($args{exclude_file_patterns});
346 11   50     59 my $exclude_empty_files = delete($args{exclude_empty_files}) // 0;
347 11         20 my $min_size = delete($args{min_size});
348 11         20 my $max_size = delete($args{max_size});
349             return [400, "Unknown argument(s): ".join(", ", sort keys %args)]
350 11 50       44 if grep {!/\A-/} keys %args;
  0         0  
351              
352 11 50       32 if ($detail) {
353 0         0 $show_digest = 1;
354 0         0 $show_size = 1;
355 0         0 $show_count = 1;
356             }
357              
358 11 100 66     58 my @authoritative_dirs = $authoritative_dirs && @$authoritative_dirs ?
359             @$authoritative_dirs : ();
360 11         56 for my $dir (@authoritative_dirs) {
361 2 50       53 (-d $dir) or return [400, "Authoritative dir '$dir' does not exist or not a directory"];
362 2 50       42 my $abs_dir = abs_path $dir or return [400, "Cannot get absolute path for authoritative dir '$dir'"];
363 2         25 $dir = $abs_dir;
364             }
365             #log_trace "authoritative_dirs=%s", \@authoritative_dirs if @authoritative_dirs;
366              
367 11         20 my @include_re;
368 11   50     19 for my $re0 (@{ $include_file_patterns // [] }) {
  11         66  
369 0         0 require Regexp::Util;
370 0         0 my $re;
371 0 0       0 if (ref $re0 eq 'Regexp') {
372 0         0 $re = $re0;
373             } else {
374 0         0 eval { $re = Regexp::Util::deserialize_regexp("qr($re0)") };
  0         0  
375 0 0       0 return [400, "Invalid/unsafe regex pattern in include_file_patterns '$re0': $@"] if $@;
376 0 0       0 return [400, "Unsafe regex pattern (contains embedded code) in include_file_patterns '$re0'"] if Regexp::Util::regexp_seen_evals($re);
377             }
378 0         0 push @include_re, $re;
379             }
380 11         25 my @exclude_re;
381 11   50     18 for my $re0 (@{ $exclude_file_patterns // [] }) {
  11         54  
382 0         0 require Regexp::Util;
383 0         0 my $re;
384 0 0       0 if (ref $re0 eq 'Regexp') {
385 0         0 $re = $re0;
386             } else {
387 0         0 eval { $re = Regexp::Util::deserialize_regexp("qr($re0)") };
  0         0  
388 0 0       0 return [400, "Invalid/unsafe regex pattern in exclude_file_patterns '$re0': $@"] if $@;
389 0 0       0 return [400, "Unsafe regex pattern (contains embedded code) in exclude_file_patterns '$re0'"] if Regexp::Util::regexp_seen_evals($re);
390             }
391 0         0 push @exclude_re, $re;
392             }
393              
394 11 100       36 if ($recurse) {
395             $files = [ map {
396 5 50       17 if (-l $_) {
  35 100       430  
397 0         0 ();
398             } elsif (-d _) {
399 10         37 (_glob($_));
400             } else {
401 25         63 ($_);
402             }
403             } @$files ];
404             }
405              
406             FILTER: {
407 11         24 my $ffiles;
  11         24  
408             FILE:
409 11         30 for my $f (@$files) {
410 87 50       1065 if (-l $f) {
411 0         0 log_warn "File '$f' is a symlink, ignored";
412 0         0 next FILE;
413             }
414 87 100       200 if (-d _) {
415 12         53 log_warn "File '$f' is a directory, ignored";
416 12         51 next FILE;
417             }
418 75 50       144 unless (-f _) {
419 0         0 log_warn "File '$f' is not a regular file, ignored";
420 0         0 next FILE;
421             }
422              
423 75 50       153 if (@include_re) {
424 0         0 my $included;
425 0         0 for my $re (@include_re) {
426 0 0       0 if ($f =~ $re) { $included++; last }
  0         0  
  0         0  
427             }
428 0 0       0 unless ($included) {
429 0         0 log_info "File '$f' is not in --include-file-patterns, skipped";
430 0         0 next FILE;
431             }
432             }
433 75 50       144 if (@exclude_re) {
434 0         0 for my $re (@exclude_re) {
435 0 0       0 if ($f =~ $re) {
436 0         0 log_info "File '$f' is in --exclude-file-patterns, skipped";
437 0         0 next FILE;
438             }
439             }
440             }
441              
442 75         625 my $size = -s $f;
443 75 50 33     185 if ($exclude_empty_files && !$size) {
444 0         0 log_info "File '$f' is empty, skipped by option -Z";
445 0         0 next FILE;
446             }
447 75 50 33     181 if (defined($min_size) && $size < $min_size) {
448 0         0 log_info "File '$f' (size=$size) is smaller than min_size ($min_size), skipped";
449 0         0 next FILE;
450             }
451 75 50 33     148 if (defined($max_size) && $size > $max_size) {
452 0         0 log_info "File '$f' (size=$size) is larger than max_size ($max_size), skipped";
453 0         0 next FILE;
454             }
455              
456 75         248 push @$ffiles, $f;
457             }
458 11         31 $files = $ffiles;
459             } # FILTER
460              
461 11         27 my %basename_paths; # key = basename (computed), value = [path, ...]
462             my %path_basenames; # key = path, value = basename
463             GROUP_FILE_NAMES: {
464 11         18 for my $f (@$files) {
  11         27  
465             #my $path = abs_path($f);
466 75         203 (my $basename = $f) =~ s!.+/!!;
467 75   50     334 $basename_paths{$basename} //= [];
468 75         208 push @{ $basename_paths{$basename} }, $f
469 75 50       103 unless grep { $_ eq $f } @{ $basename_paths{$basename} };
  0         0  
  75         222  
470 75         179 $path_basenames{$f} = $basename;
471             }
472             }
473             #use DD; print "basename_paths: "; dd \%basename_paths;
474              
475 11         36 my %size_counts; # key = size, value = number of files having that size
476             my %size_paths; # key = size, value = [path, ...]
477 11         0 my %path_sizes; # key = path, value = file size, for caching stat()
478             GET_FILE_SIZES: {
479 11         19 for my $f (@$files) {
  11         29  
480 75         653 my @st = stat $f;
481 75 50       157 unless (@st) {
482 0         0 log_error("Can't stat file `$f`: $!, skipped");
483 0         0 next;
484             }
485 75         171 $size_counts{$st[7]}++;
486 75   100     200 $size_paths{$st[7]} //= [];
487 75         118 push @{$size_paths{$st[7]}}, $f;
  75         158  
488 75         219 $path_sizes{$f} = $st[7];
489             }
490             }
491             #use DD; print "size_paths: "; dd \%size_paths;
492              
493             # calculate digest for all files having non-unique sizes
494 11         4108 my %digest_counts; # key = digest, value = num of files having that digest
495             my %digest_paths; # key = digest, value = [file, ...]
496 11         0 my %path_digests; # key = path, value = file digest
497             CALC_FILE_DIGESTS: {
498 11         18 require File::Digest;
  11         892  
499              
500 11         6795 for my $f (@$files) {
501 75 50       204 next unless defined $path_sizes{$f}; # just checking. all files should have sizes.
502              
503 75         110 my $digest;
504 75 100 66     575 if ($algorithm eq '' || $algorithm eq 'none' || $algorithm eq 'size') {
    50 66        
505 9         13 $digest = $path_sizes{$f};
506             } elsif ($algorithm eq 'name') {
507 0         0 $digest = $path_basenames{$f};
508             } else {
509 66 100       206 next if $size_counts{ $path_sizes{$f} } == 1; # skip unique file sizes.
510 60         176 my $res = File::Digest::digest_file(
511             file=>$f, algorithm=>$algorithm, digest_args=>$digest_args);
512 60 50       24529 return [500, "Can't calculate digest for file '$f': $res->[0] - $res->[1]"]
513             unless $res->[0] == 200;
514 60         147 $digest = $res->[2];
515             }
516 69         164 $digest_counts{$digest}++;
517 69   100     578 $digest_paths{$digest} //= [];
518 69         145 push @{$digest_paths{$digest}}, $f;
  69         176  
519 69         213 $path_digests{$f} = $digest;
520             }
521             }
522             #use DD; print "digest_paths: "; dd \%digest_paths;
523             #use DD; print "path_digests: "; dd \%path_digests;
524              
525 11         23 my %path_counts; # key = path, value = num of files having file content
526 11         31 for my $f (@$files) {
527 75 50       152 next unless defined $path_sizes{$f}; # just checking, all files should have sizes
528 75 100       158 if (!defined($path_digests{$f})) {
529 6         18 $path_counts{$f} = $size_counts{ $path_sizes{$f} };
530             } else {
531 69         153 $path_counts{$f} = $digest_counts{ $path_digests{$f} };
532             }
533             }
534             #use DD; print "path_counts: "; dd \%path_counts;
535              
536             SORT_DUPLICATE_FILES: {
537 11 100       18 last unless @authoritative_dirs;
  11         37  
538 2         6 my $hash = \%digest_paths;
539 2         9 for my $key (keys %$hash) {
540 10         17 my @files = @{ $hash->{$key} };
  10         28  
541 10         18 my @abs_files;
542 10 100       28 next unless @files > 1;
543 4         9 for my $file (@files) {
544 12 50       237 my $abs_file = abs_path $file or do {
545 0         0 log_error "Cannot find absolute path for duplicate file '$file', skipping duplicate set %s", \@files;
546             };
547 12         30 push @abs_files, $abs_file;
548             }
549              
550             #log_trace "Duplicate files before sorting: %s", \@files;
551 12         53 @files = map { $files[$_] } sort {
552 4         23 my $file_a = $abs_files[$a];
  10         21  
553 10         15 my $file_a_in_authoritative_dirs = 0;
554 10         16 my $subdir_len_file_a;
555 10         18 for my $d (@authoritative_dirs) {
556 10 50       93 if ($file_a =~ m!\A\Q$d\E(?:/|\z)(.*)!) { $file_a_in_authoritative_dirs++; $subdir_len_file_a = length($1); last }
  0         0  
  0         0  
  0         0  
557             }
558 10         18 my $file_b = $abs_files[$b];
559 10         17 my $file_b_in_authoritative_dirs = 0;
560 10         17 my $subdir_len_file_b;
561 10         20 for my $d (@authoritative_dirs) {
562 10 100       70 if ($file_b =~ m!\A\Q$d\E(?:/|\z)(.*)!) { $file_b_in_authoritative_dirs++; $subdir_len_file_b = length($1); last }
  2         3  
  2         10  
  2         4  
563             }
564             #log_trace " file_a=<$file_a>, in authoritative_dirs? $file_a_in_authoritative_dirs";
565             #log_trace " file_b=<$file_b>, in authoritative_dirs? $file_b_in_authoritative_dirs";
566             # files located near the root of authoritative dir is preferred
567             # to deeper files. this is done by comparing subdir_len
568 10 50       46 ($file_a_in_authoritative_dirs ? $subdir_len_file_a : 9999) <=> ($file_b_in_authoritative_dirs ? $subdir_len_file_b : 9999) ||
    100          
    50          
569             $file_a cmp $file_b;
570             } 0..$#files;
571             #log_trace "Duplicate files after sorting: %s", \@files;
572              
573 4         19 $hash->{$key} = \@files;
574             }
575             }
576              
577             #$log->trace("report_duplicate=$report_duplicate");
578 11         22 my @files;
579 11         76 for my $f (sort keys %path_counts) {
580 75 100       225 if ($path_counts{$f} == 1) {
581 24         71 log_trace "unique file '$f'";
582 24 100       87 push @files, $f if $report_unique;
583             } else {
584 51         178 log_trace "duplicate file '$f'";
585 51         10952 my $is_first_copy = $f eq $digest_paths{ $path_digests{$f} }[0];
586 51         171 log_trace "is first copy? <$is_first_copy>";
587 51 100       221 if ($report_duplicate == 0) {
    100          
    100          
    50          
588             # do not report dupe files
589             } elsif ($report_duplicate == 1) {
590 15         40 push @files, $f;
591             } elsif ($report_duplicate == 2) {
592 21 100       59 push @files, $f if $is_first_copy;
593             } elsif ($report_duplicate == 3) {
594 9 100       27 push @files, $f unless $is_first_copy;
595             } else {
596 0         0 die "Invalid value for --report-duplicate ".
597             "'$report_duplicate', please choose 0/1/2/3";
598             }
599             }
600             }
601              
602             GROUP_FILES_BY_DIGEST: {
603 11 100       24 last unless $group_by_digest;
  11         33  
604             @files = sort {
605 1         8 $path_sizes{$a} <=> $path_sizes{$b} ||
606 20 50 50     57 ($path_digests{$a} // '') cmp ($path_digests{$b} // '')
      50        
607             } @files;
608             }
609              
610 11         53 my @rows;
611             my %resmeta;
612 11         0 my $last_digest;
613 11         27 for my $f (@files) {
614 41   66     111 my $digest = $path_digests{$f} // $path_sizes{$f};
615              
616             # add separator row
617 41 100 100     121 if ($group_by_digest && defined $last_digest && $digest ne $last_digest) {
      100        
618 4 50 33     4076 push @rows, ($show_count || $show_digest || $show_size) ? {} : '';
619             }
620              
621 41         64 my $row;
622 41 100 100     161 if ($show_count || $show_digest || $show_size) {
      100        
623 19         60 $row = {file=>$f};
624 19 100       76 $row->{count} = $path_counts{$f} if $show_count;
625 19 100       51 $row->{digest} = $path_digests{$f} if $show_digest;
626 19 100       44 $row->{size} = $path_sizes{$f} if $show_size;
627             } else {
628 22         34 $row = $f;
629             }
630 41         78 push @rows, $row;
631 41         105 $last_digest = $digest;
632             }
633              
634 11 100 100     79 $resmeta{'table.fields'} = [qw/file size digest count/]
      100        
635             if $show_count || $show_digest || $show_size;
636              
637 11         331 [200, "OK", \@rows, \%resmeta];
638             }
639              
640             # dupe_files
641             gen_modified_sub(
642             base_name => 'uniq_files',
643             output_name => 'dupe_files',
644             description => <<'MARKDOWN',
645              
646             This is a thin wrapper for <prog:uniq-files>. It defaults `report_unique` to 0
647             and `report_duplicate` to 1.
648              
649             MARKDOWN
650             modify_args => {
651             report_unique => sub {
652             $_[0]{schema} = [bool => {default=>0}];
653             },
654             report_duplicate => sub {
655             $_[0]{schema} = [int => {in=>[0,1,2,3], default=>1}];
656             },
657             },
658             modify_meta => sub {
659             $_[0]{examples} = [
660             {
661             summary => 'List all files (recursively, and in detail) which have duplicate contents (all duplicate copies)',
662             src => 'dupe-files -lR *',
663             src_plang => 'bash',
664             test => 0,
665             'x.doc.show_result' => 0,
666             },
667             ];
668             },
669             output_code => sub {
670 0     0     my %args = @_;
671 0   0       $args{report_unique} //= 0;
672 0   0       $args{report_duplicate} //= 1;
673 0           uniq_files(%args);
674             },
675             );
676              
677             # uniq_filenames
678             gen_modified_sub(
679             base_name => 'uniq_files',
680             output_name => 'uniq_filenames',
681             description => <<'MARKDOWN',
682              
683             This is a thin wrapper for <prog:uniq-files>. It sets `algorithm` to `name`.
684              
685             MARKDOWN
686             remove_args => ['algorithm'],
687             modify_meta => sub {
688             $_[0]{examples} = [
689             {
690             summary => 'Find unique filenames in two directories',
691             src => 'uniq-filenames -uR dir1 dir2',
692             src_plang => 'bash',
693             test => 0,
694             'x.doc.show_result' => 0,
695             },
696             ];
697             },
698             output_code => sub {
699 0     0     my %args = @_;
700 0           uniq_files(%args, algorithm => 'name');
701             },
702             );
703              
704             # dupe_filenames
705             gen_modified_sub(
706             base_name => 'uniq_files',
707             output_name => 'dupe_filenames',
708             description => <<'MARKDOWN',
709              
710             This is a thin wrapper for <prog:uniq-files>. It sets `algorithm` to `name`,
711             defaults `report_unique` to 0 and `report_duplicate` to 1.
712              
713             MARKDOWN
714             remove_args => ['algorithm'],
715             modify_args => {
716             report_unique => sub {
717             $_[0]{schema} = [bool => {default=>0}];
718             },
719             report_duplicate => sub {
720             $_[0]{schema} = [int => {in=>[0,1,2,3], default=>1}];
721             },
722             },
723             modify_meta => sub {
724             $_[0]{examples} = [
725             {
726             summary => 'Find duplicate filenames in two directories',
727             src => 'dupe-filenames -R dir1 dir2',
728             src_plang => 'bash',
729             test => 0,
730             'x.doc.show_result' => 0,
731             },
732             ];
733             },
734             output_code => sub {
735 0     0     my %args = @_;
736 0   0       $args{report_unique} //= 0;
737 0   0       $args{report_duplicate} //= 1;
738 0           uniq_files(%args, algorithm=>'name');
739             },
740             );
741              
742             # uniq_filenames_between_two_dirs
743             gen_modified_sub(
744             base_name => 'uniq_files',
745             output_name => 'uniq_filenames_between_two_dirs',
746             description => <<'MARKDOWN',
747              
748             This is a thin wrapper for <prog:uniq-files>. It sets `algorithm` to `name`,
749             `recurse` to true. It also accepts two directory names instead of one+ dir/file
750             names.
751              
752             MARKDOWN
753             add_args => {
754             dir1 => {
755             schema => 'dirname*',
756             req => 1,
757             pos => 0,
758             },
759             dir2 => {
760             schema => 'dirname*',
761             req => 1,
762             pos => 1,
763             },
764             },
765             remove_args => ['algorithm', 'files', 'recurse'],
766             modify_meta => sub {
767             $_[0]{examples} = [
768             {
769             summary => 'Find unique filenames in two directories',
770             src => 'uniq-filenames-between-two-dirs -u dir1 dir2',
771             src_plang => 'bash',
772             test => 0,
773             'x.doc.show_result' => 0,
774             },
775             ];
776             },
777             output_code => sub {
778 0     0     my %args = @_;
779 0           my $dir1 = delete $args{dir1};
780 0           my $dir2 = delete $args{dir2};
781 0           uniq_files(
782             %args,
783             files => [$dir1, $dir2],
784             algorithm => 'name',
785             recurse => 1,
786             );
787             },
788             );
789              
790             # dupe_filenames_between_two_dirs
791             gen_modified_sub(
792             base_name => 'uniq_files',
793             output_name => 'dupe_filenames_between_two_dirs',
794             description => <<'MARKDOWN',
795              
796             This is a thin wrapper for <prog:uniq-files>. It sets `algorithm` to `name`,
797             `recurse` to true, defaults `report_unique` to 0 and `report_duplicate` to 1. It
798             also accepts two directory names instead of one+ dir/file names.
799              
800             MARKDOWN
801             add_args => {
802             dir1 => {
803             schema => 'dirname*',
804             req => 1,
805             pos => 0,
806             },
807             dir2 => {
808             schema => 'dirname*',
809             req => 1,
810             pos => 1,
811             },
812             },
813             remove_args => ['algorithm', 'files', 'recurse'],
814             modify_args => {
815             report_unique => sub {
816             $_[0]{schema} = [bool => {default=>0}];
817             },
818             report_duplicate => sub {
819             $_[0]{schema} = [int => {in=>[0,1,2,3], default=>1}];
820             },
821             },
822             modify_meta => sub {
823             $_[0]{examples} = [
824             {
825             summary => 'Find duplicate filenames in two directories',
826             src => 'dupe-filenames-between-two-dirs dir1 dir2',
827             src_plang => 'bash',
828             test => 0,
829             'x.doc.show_result' => 0,
830             },
831             ];
832             },
833             output_code => sub {
834 0     0     my %args = @_;
835 0           my $dir1 = delete $args{dir1};
836 0           my $dir2 = delete $args{dir2};
837 0   0       $args{report_unique} //= 0;
838 0   0       $args{report_duplicate} //= 1;
839 0           uniq_files(
840             %args,
841             files => [$dir1, $dir2],
842             algorithm => 'name',
843             recurse => 1,
844             );
845             },
846             );
847              
848              
849             1;
850             # ABSTRACT: Find unique or duplicate file {contents,names}
851              
852             __END__
853              
854             =pod
855              
856             =encoding UTF-8
857              
858             =head1 NAME
859              
860             File::FindUniq - Find unique or duplicate file {contents,names}
861              
862             =head1 VERSION
863              
864             This document describes version 0.004 of File::FindUniq (from Perl distribution File-FindUniq), released on 2025-05-03.
865              
866             =head1 SYNOPSIS
867              
868             Given this directory content:
869              
870             filename size (bytes) content
871             -------- ------------ -------
872             foo 0
873             bar 0
874             baz 3 123
875             qux 3 456
876             quux 3 123
877             sub/foo 5 abcde
878             sub/bar 0
879              
880             To list files and skip duplicate contents:
881              
882             use File::FindUniq qw(dupe_files uniq_files);
883             my $res = uniq_files(files => [glob "*"], recurse=>1);
884             # => [200, "OK", ["bar", "baz", "qux", "sub/foo"], {}]
885             # although bar content (0 bytes) is not unique, it's the first seen copy, so included
886             # foo is deemed as duplicate of bar, so skipped
887             # although baz content ("123") is not unique, it's the first seen copy, so included
888             # quux is deemed as duplicate of baz, so skipped
889             # sub/bar is deemed as duplicate of bar, so skipped
890              
891             To list only duplicate files (including the first copy):
892              
893             my $res = dupe_files(files => [glob "*"], recurse=>1);
894             # => [200, "OK", ["bar", "baz", "foo", "quux", "sub/bar"], {}]
895             # qux's content is unique, so skipped
896             # sub/foo's content is unique, so skipped
897             # foo's content is not unique, but it's the first
898              
899             To only report unique filenames:
900              
901             my $res = uniq_files(files => [glob "*"], recurse=>1,
902             algorithm=>'name');
903             # => [200, "OK", ["bar", "baz", "foo", "quux", "qux"], {}]
904              
905             To report filenames that have duplicates:
906              
907             my $res = dupe_files(files => [glob "*"], recurse=>1,
908             algorithm=>'name');
909             # => [200, "OK", ["bar", "foo", "sub/bar", "sub/foo"], {}]
910              
911             =head1 DESCRIPTION
912              
913             Keywords: unique files, unique file names, duplicate files, duplicate file
914             names.
915              
916             =head1 NOTES
917              
918             =head1 FUNCTIONS
919              
920              
921             =head2 dupe_filenames
922              
923             Usage:
924              
925             dupe_filenames(%args) -> [$status_code, $reason, $payload, \%result_meta]
926              
927             Report duplicate or unique files, optionally perform action on them.
928              
929             This is a thin wrapper for L<uniq-files>. It sets C<algorithm> to C<name>,
930             defaults C<report_unique> to 0 and C<report_duplicate> to 1.
931              
932             This function is not exported.
933              
934             Arguments ('*' denotes required arguments):
935              
936             =over 4
937              
938             =item * B<authoritative_dirs> => I<array[str]>
939              
940             Denote director(yE<verbar>ies) where authoritativeE<sol>"Original" copies are found.
941              
942             =item * B<detail> => I<true>
943              
944             Show details (a.k.a. --show-digest, --show-size, --show-count).
945              
946             =item * B<digest_args> => I<array>
947              
948             Some Digest algorithms require arguments, you can pass them here.
949              
950             =item * B<exclude_empty_files> => I<bool>
951              
952             (No description)
953              
954             =item * B<exclude_file_patterns> => I<array[str]>
955              
956             Filename (including path) regex patterns to exclude.
957              
958             =item * B<files>* => I<array[str]>
959              
960             (No description)
961              
962             =item * B<group_by_digest> => I<bool>
963              
964             Sort files by its digest (or size, if not computing digest), separate each different digest.
965              
966             =item * B<include_file_patterns> => I<array[str]>
967              
968             Filename (including path) regex patterns to include.
969              
970             =item * B<max_size> => I<filesize>
971              
972             Maximum file size to consider.
973              
974             =item * B<min_size> => I<filesize>
975              
976             Minimum file size to consider.
977              
978             =item * B<recurse> => I<bool>
979              
980             If set to true, will recurse into subdirectories.
981              
982             =item * B<report_duplicate> => I<int> (default: 1)
983              
984             Whether to return duplicate items.
985              
986             Can be set to either 0, 1, 2, or 3.
987              
988             If set to 0, duplicate items will not be returned.
989              
990             If set to 1 (the default for C<dupe-files>), will return all the duplicate
991             files. For example: C<file1> contains text 'a', C<file2> 'b', C<file3> 'a'. Then
992             C<file1> and C<file3> will be returned.
993              
994             If set to 2 (the default for C<uniq-files>), will only return the first of
995             duplicate items. Continuing from previous example, only C<file1> will be returned
996             because C<file2> is unique and C<file3> contains 'a' (already represented by
997             C<file1>). If one or more C<--authoritative-dir> (C<-O>) options are specified,
998             files under these directories will be preferred.
999              
1000             If set to 3, will return all but the first of duplicate items. Continuing from
1001             previous example: C<file3> will be returned. This is useful if you want to keep
1002             only one copy of the duplicate content. You can use the output of this routine
1003             to C<mv> or C<rm>. Similar to the previous case, if one or more
1004             C<--authoritative-dir> (C<-O>) options are specified, then files under these
1005             directories will not be listed if possible.
1006              
1007             =item * B<report_unique> => I<bool> (default: 0)
1008              
1009             Whether to return unique items.
1010              
1011             =item * B<show_count> => I<bool> (default: 0)
1012              
1013             Whether to return each file content's number of occurrence.
1014              
1015             1 means the file content is only encountered once (unique), 2 means there is one
1016             duplicate, and so on.
1017              
1018             =item * B<show_digest> => I<true>
1019              
1020             Show the digest value (or the size, if not computing digest) for each file.
1021              
1022             Note that this routine does not compute digest for files which have unique
1023             sizes, so they will show up as empty.
1024              
1025             =item * B<show_size> => I<true>
1026              
1027             Show the size for each file.
1028              
1029              
1030             =back
1031              
1032             Returns an enveloped result (an array).
1033              
1034             First element ($status_code) is an integer containing HTTP-like status code
1035             (200 means OK, 4xx caller error, 5xx function error). Second element
1036             ($reason) is a string containing error message, or something like "OK" if status is
1037             200. Third element ($payload) is the actual result, but usually not present when enveloped result is an error response ($status_code is not 2xx). Fourth
1038             element (%result_meta) is called result metadata and is optional, a hash
1039             that contains extra information, much like how HTTP response headers provide additional metadata.
1040              
1041             Return value: (any)
1042              
1043              
1044              
1045             =head2 dupe_filenames_between_two_dirs
1046              
1047             Usage:
1048              
1049             dupe_filenames_between_two_dirs(%args) -> [$status_code, $reason, $payload, \%result_meta]
1050              
1051             Report duplicate or unique files, optionally perform action on them.
1052              
1053             This is a thin wrapper for L<uniq-files>. It sets C<algorithm> to C<name>,
1054             C<recurse> to true, defaults C<report_unique> to 0 and C<report_duplicate> to 1. It
1055             also accepts two directory names instead of one+ dir/file names.
1056              
1057             This function is not exported.
1058              
1059             Arguments ('*' denotes required arguments):
1060              
1061             =over 4
1062              
1063             =item * B<authoritative_dirs> => I<array[str]>
1064              
1065             Denote director(yE<verbar>ies) where authoritativeE<sol>"Original" copies are found.
1066              
1067             =item * B<detail> => I<true>
1068              
1069             Show details (a.k.a. --show-digest, --show-size, --show-count).
1070              
1071             =item * B<digest_args> => I<array>
1072              
1073             Some Digest algorithms require arguments, you can pass them here.
1074              
1075             =item * B<dir1>* => I<dirname>
1076              
1077             (No description)
1078              
1079             =item * B<dir2>* => I<dirname>
1080              
1081             (No description)
1082              
1083             =item * B<exclude_empty_files> => I<bool>
1084              
1085             (No description)
1086              
1087             =item * B<exclude_file_patterns> => I<array[str]>
1088              
1089             Filename (including path) regex patterns to exclude.
1090              
1091             =item * B<group_by_digest> => I<bool>
1092              
1093             Sort files by its digest (or size, if not computing digest), separate each different digest.
1094              
1095             =item * B<include_file_patterns> => I<array[str]>
1096              
1097             Filename (including path) regex patterns to include.
1098              
1099             =item * B<max_size> => I<filesize>
1100              
1101             Maximum file size to consider.
1102              
1103             =item * B<min_size> => I<filesize>
1104              
1105             Minimum file size to consider.
1106              
1107             =item * B<report_duplicate> => I<int> (default: 1)
1108              
1109             Whether to return duplicate items.
1110              
1111             Can be set to either 0, 1, 2, or 3.
1112              
1113             If set to 0, duplicate items will not be returned.
1114              
1115             If set to 1 (the default for C<dupe-files>), will return all the duplicate
1116             files. For example: C<file1> contains text 'a', C<file2> 'b', C<file3> 'a'. Then
1117             C<file1> and C<file3> will be returned.
1118              
1119             If set to 2 (the default for C<uniq-files>), will only return the first of
1120             duplicate items. Continuing from previous example, only C<file1> will be returned
1121             because C<file2> is unique and C<file3> contains 'a' (already represented by
1122             C<file1>). If one or more C<--authoritative-dir> (C<-O>) options are specified,
1123             files under these directories will be preferred.
1124              
1125             If set to 3, will return all but the first of duplicate items. Continuing from
1126             previous example: C<file3> will be returned. This is useful if you want to keep
1127             only one copy of the duplicate content. You can use the output of this routine
1128             to C<mv> or C<rm>. Similar to the previous case, if one or more
1129             C<--authoritative-dir> (C<-O>) options are specified, then files under these
1130             directories will not be listed if possible.
1131              
1132             =item * B<report_unique> => I<bool> (default: 0)
1133              
1134             Whether to return unique items.
1135              
1136             =item * B<show_count> => I<bool> (default: 0)
1137              
1138             Whether to return each file content's number of occurrence.
1139              
1140             1 means the file content is only encountered once (unique), 2 means there is one
1141             duplicate, and so on.
1142              
1143             =item * B<show_digest> => I<true>
1144              
1145             Show the digest value (or the size, if not computing digest) for each file.
1146              
1147             Note that this routine does not compute digest for files which have unique
1148             sizes, so they will show up as empty.
1149              
1150             =item * B<show_size> => I<true>
1151              
1152             Show the size for each file.
1153              
1154              
1155             =back
1156              
1157             Returns an enveloped result (an array).
1158              
1159             First element ($status_code) is an integer containing HTTP-like status code
1160             (200 means OK, 4xx caller error, 5xx function error). Second element
1161             ($reason) is a string containing error message, or something like "OK" if status is
1162             200. Third element ($payload) is the actual result, but usually not present when enveloped result is an error response ($status_code is not 2xx). Fourth
1163             element (%result_meta) is called result metadata and is optional, a hash
1164             that contains extra information, much like how HTTP response headers provide additional metadata.
1165              
1166             Return value: (any)
1167              
1168              
1169              
1170             =head2 dupe_files
1171              
1172             Usage:
1173              
1174             dupe_files(%args) -> [$status_code, $reason, $payload, \%result_meta]
1175              
1176             Report duplicate or unique files, optionally perform action on them.
1177              
1178             This is a thin wrapper for L<uniq-files>. It defaults C<report_unique> to 0
1179             and C<report_duplicate> to 1.
1180              
1181             This function is not exported by default, but exportable.
1182              
1183             Arguments ('*' denotes required arguments):
1184              
1185             =over 4
1186              
1187             =item * B<algorithm> => I<str>
1188              
1189             What algorithm is used to compute the digest of the content.
1190              
1191             The default is to use C<md5>. Some algorithms supported include C<crc32>, C<sha1>,
1192             C<sha256>, as well as C<Digest> to use Perl L<Digest> which supports a lot of
1193             other algorithms, e.g. C<SHA-1>, C<BLAKE2b>.
1194              
1195             If set to '', 'none', or 'size', then digest will be set to file size. This
1196             means uniqueness will be determined solely from file size. This can be quicker
1197             but will generate a false positive when two files of the same size are deemed as
1198             duplicate even though their content may be different.
1199              
1200             If set to 'name' then only name comparison will be performed. This of course can
1201             potentially generate lots of false positives, but in some cases you might want
1202             to compare filename for uniqueness.
1203              
1204             =item * B<authoritative_dirs> => I<array[str]>
1205              
1206             Denote director(yE<verbar>ies) where authoritativeE<sol>"Original" copies are found.
1207              
1208             =item * B<detail> => I<true>
1209              
1210             Show details (a.k.a. --show-digest, --show-size, --show-count).
1211              
1212             =item * B<digest_args> => I<array>
1213              
1214             Some Digest algorithms require arguments, you can pass them here.
1215              
1216             =item * B<exclude_empty_files> => I<bool>
1217              
1218             (No description)
1219              
1220             =item * B<exclude_file_patterns> => I<array[str]>
1221              
1222             Filename (including path) regex patterns to exclude.
1223              
1224             =item * B<files>* => I<array[str]>
1225              
1226             (No description)
1227              
1228             =item * B<group_by_digest> => I<bool>
1229              
1230             Sort files by its digest (or size, if not computing digest), separate each different digest.
1231              
1232             =item * B<include_file_patterns> => I<array[str]>
1233              
1234             Filename (including path) regex patterns to include.
1235              
1236             =item * B<max_size> => I<filesize>
1237              
1238             Maximum file size to consider.
1239              
1240             =item * B<min_size> => I<filesize>
1241              
1242             Minimum file size to consider.
1243              
1244             =item * B<recurse> => I<bool>
1245              
1246             If set to true, will recurse into subdirectories.
1247              
1248             =item * B<report_duplicate> => I<int> (default: 1)
1249              
1250             Whether to return duplicate items.
1251              
1252             Can be set to either 0, 1, 2, or 3.
1253              
1254             If set to 0, duplicate items will not be returned.
1255              
1256             If set to 1 (the default for C<dupe-files>), will return all the duplicate
1257             files. For example: C<file1> contains text 'a', C<file2> 'b', C<file3> 'a'. Then
1258             C<file1> and C<file3> will be returned.
1259              
1260             If set to 2 (the default for C<uniq-files>), will only return the first of
1261             duplicate items. Continuing from previous example, only C<file1> will be returned
1262             because C<file2> is unique and C<file3> contains 'a' (already represented by
1263             C<file1>). If one or more C<--authoritative-dir> (C<-O>) options are specified,
1264             files under these directories will be preferred.
1265              
1266             If set to 3, will return all but the first of duplicate items. Continuing from
1267             previous example: C<file3> will be returned. This is useful if you want to keep
1268             only one copy of the duplicate content. You can use the output of this routine
1269             to C<mv> or C<rm>. Similar to the previous case, if one or more
1270             C<--authoritative-dir> (C<-O>) options are specified, then files under these
1271             directories will not be listed if possible.
1272              
1273             =item * B<report_unique> => I<bool> (default: 0)
1274              
1275             Whether to return unique items.
1276              
1277             =item * B<show_count> => I<bool> (default: 0)
1278              
1279             Whether to return each file content's number of occurrence.
1280              
1281             1 means the file content is only encountered once (unique), 2 means there is one
1282             duplicate, and so on.
1283              
1284             =item * B<show_digest> => I<true>
1285              
1286             Show the digest value (or the size, if not computing digest) for each file.
1287              
1288             Note that this routine does not compute digest for files which have unique
1289             sizes, so they will show up as empty.
1290              
1291             =item * B<show_size> => I<true>
1292              
1293             Show the size for each file.
1294              
1295              
1296             =back
1297              
1298             Returns an enveloped result (an array).
1299              
1300             First element ($status_code) is an integer containing HTTP-like status code
1301             (200 means OK, 4xx caller error, 5xx function error). Second element
1302             ($reason) is a string containing error message, or something like "OK" if status is
1303             200. Third element ($payload) is the actual result, but usually not present when enveloped result is an error response ($status_code is not 2xx). Fourth
1304             element (%result_meta) is called result metadata and is optional, a hash
1305             that contains extra information, much like how HTTP response headers provide additional metadata.
1306              
1307             Return value: (any)
1308              
1309              
1310              
1311             =head2 uniq_filenames
1312              
1313             Usage:
1314              
1315             uniq_filenames(%args) -> [$status_code, $reason, $payload, \%result_meta]
1316              
1317             Report duplicate or unique files, optionally perform action on them.
1318              
1319             This is a thin wrapper for L<uniq-files>. It sets C<algorithm> to C<name>.
1320              
1321             This function is not exported.
1322              
1323             Arguments ('*' denotes required arguments):
1324              
1325             =over 4
1326              
1327             =item * B<authoritative_dirs> => I<array[str]>
1328              
1329             Denote director(yE<verbar>ies) where authoritativeE<sol>"Original" copies are found.
1330              
1331             =item * B<detail> => I<true>
1332              
1333             Show details (a.k.a. --show-digest, --show-size, --show-count).
1334              
1335             =item * B<digest_args> => I<array>
1336              
1337             Some Digest algorithms require arguments, you can pass them here.
1338              
1339             =item * B<exclude_empty_files> => I<bool>
1340              
1341             (No description)
1342              
1343             =item * B<exclude_file_patterns> => I<array[str]>
1344              
1345             Filename (including path) regex patterns to exclude.
1346              
1347             =item * B<files>* => I<array[str]>
1348              
1349             (No description)
1350              
1351             =item * B<group_by_digest> => I<bool>
1352              
1353             Sort files by its digest (or size, if not computing digest), separate each different digest.
1354              
1355             =item * B<include_file_patterns> => I<array[str]>
1356              
1357             Filename (including path) regex patterns to include.
1358              
1359             =item * B<max_size> => I<filesize>
1360              
1361             Maximum file size to consider.
1362              
1363             =item * B<min_size> => I<filesize>
1364              
1365             Minimum file size to consider.
1366              
1367             =item * B<recurse> => I<bool>
1368              
1369             If set to true, will recurse into subdirectories.
1370              
1371             =item * B<report_duplicate> => I<int> (default: 2)
1372              
1373             Whether to return duplicate items.
1374              
1375             Can be set to either 0, 1, 2, or 3.
1376              
1377             If set to 0, duplicate items will not be returned.
1378              
1379             If set to 1 (the default for C<dupe-files>), will return all the duplicate
1380             files. For example: C<file1> contains text 'a', C<file2> 'b', C<file3> 'a'. Then
1381             C<file1> and C<file3> will be returned.
1382              
1383             If set to 2 (the default for C<uniq-files>), will only return the first of
1384             duplicate items. Continuing from previous example, only C<file1> will be returned
1385             because C<file2> is unique and C<file3> contains 'a' (already represented by
1386             C<file1>). If one or more C<--authoritative-dir> (C<-O>) options are specified,
1387             files under these directories will be preferred.
1388              
1389             If set to 3, will return all but the first of duplicate items. Continuing from
1390             previous example: C<file3> will be returned. This is useful if you want to keep
1391             only one copy of the duplicate content. You can use the output of this routine
1392             to C<mv> or C<rm>. Similar to the previous case, if one or more
1393             C<--authoritative-dir> (C<-O>) options are specified, then files under these
1394             directories will not be listed if possible.
1395              
1396             =item * B<report_unique> => I<bool> (default: 1)
1397              
1398             Whether to return unique items.
1399              
1400             =item * B<show_count> => I<bool> (default: 0)
1401              
1402             Whether to return each file content's number of occurrence.
1403              
1404             1 means the file content is only encountered once (unique), 2 means there is one
1405             duplicate, and so on.
1406              
1407             =item * B<show_digest> => I<true>
1408              
1409             Show the digest value (or the size, if not computing digest) for each file.
1410              
1411             Note that this routine does not compute digest for files which have unique
1412             sizes, so they will show up as empty.
1413              
1414             =item * B<show_size> => I<true>
1415              
1416             Show the size for each file.
1417              
1418              
1419             =back
1420              
1421             Returns an enveloped result (an array).
1422              
1423             First element ($status_code) is an integer containing HTTP-like status code
1424             (200 means OK, 4xx caller error, 5xx function error). Second element
1425             ($reason) is a string containing error message, or something like "OK" if status is
1426             200. Third element ($payload) is the actual result, but usually not present when enveloped result is an error response ($status_code is not 2xx). Fourth
1427             element (%result_meta) is called result metadata and is optional, a hash
1428             that contains extra information, much like how HTTP response headers provide additional metadata.
1429              
1430             Return value: (any)
1431              
1432              
1433              
1434             =head2 uniq_filenames_between_two_dirs
1435              
1436             Usage:
1437              
1438             uniq_filenames_between_two_dirs(%args) -> [$status_code, $reason, $payload, \%result_meta]
1439              
1440             Report duplicate or unique files, optionally perform action on them.
1441              
1442             This is a thin wrapper for L<uniq-files>. It sets C<algorithm> to C<name>,
1443             C<recurse> to true. It also accepts two directory names instead of one+ dir/file
1444             names.
1445              
1446             This function is not exported.
1447              
1448             Arguments ('*' denotes required arguments):
1449              
1450             =over 4
1451              
1452             =item * B<authoritative_dirs> => I<array[str]>
1453              
1454             Denote director(yE<verbar>ies) where authoritativeE<sol>"Original" copies are found.
1455              
1456             =item * B<detail> => I<true>
1457              
1458             Show details (a.k.a. --show-digest, --show-size, --show-count).
1459              
1460             =item * B<digest_args> => I<array>
1461              
1462             Some Digest algorithms require arguments, you can pass them here.
1463              
1464             =item * B<dir1>* => I<dirname>
1465              
1466             (No description)
1467              
1468             =item * B<dir2>* => I<dirname>
1469              
1470             (No description)
1471              
1472             =item * B<exclude_empty_files> => I<bool>
1473              
1474             (No description)
1475              
1476             =item * B<exclude_file_patterns> => I<array[str]>
1477              
1478             Filename (including path) regex patterns to exclude.
1479              
1480             =item * B<group_by_digest> => I<bool>
1481              
1482             Sort files by their digest (or size, if not computing digest), separating files with different digests.
1483              
1484             =item * B<include_file_patterns> => I<array[str]>
1485              
1486             Filename (including path) regex patterns to include.
1487              
1488             =item * B<max_size> => I<filesize>
1489              
1490             Maximum file size to consider.
1491              
1492             =item * B<min_size> => I<filesize>
1493              
1494             Minimum file size to consider.
1495              
1496             =item * B<report_duplicate> => I<int> (default: 2)
1497              
1498             Whether to return duplicate items.
1499              
1500             Can be set to either 0, 1, 2, or 3.
1501              
1502             If set to 0, duplicate items will not be returned.
1503              
1504             If set to 1 (the default for C<dupe-files>), will return all the duplicate
1505             files. For example: C<file1> contains text 'a', C<file2> 'b', C<file3> 'a'. Then
1506             C<file1> and C<file3> will be returned.
1507              
1508             If set to 2 (the default for C<uniq-files>), will only return the first of
1509             duplicate items. Continuing from previous example, only C<file1> will be returned
1510             because C<file2> is unique and C<file3> contains 'a' (already represented by
1511             C<file1>). If one or more C<--authoritative-dir> (C<-O>) options are specified,
1512             files under these directories will be preferred.
1513              
1514             If set to 3, will return all but the first of duplicate items. Continuing from
1515             previous example: C<file3> will be returned. This is useful if you want to keep
1516             only one copy of the duplicate content. You can use the output of this routine
1517             to C<mv> or C<rm>. Similar to the previous case, if one or more
1518             C<--authoritative-dir> (C<-O>) options are specified, then files under these
1519             directories will not be listed if possible.
1520              
1521             =item * B<report_unique> => I<bool> (default: 1)
1522              
1523             Whether to return unique items.
1524              
1525             =item * B<show_count> => I<bool> (default: 0)
1526              
1527             Whether to return each file content's number of occurrence.
1528              
1529             1 means the file content is only encountered once (unique), 2 means there is one
1530             duplicate, and so on.
1531              
1532             =item * B<show_digest> => I<true>
1533              
1534             Show the digest value (or the size, if not computing digest) for each file.
1535              
1536             Note that this routine does not compute digest for files which have unique
1537             sizes, so they will show up as empty.
1538              
1539             =item * B<show_size> => I<true>
1540              
1541             Show the size for each file.
1542              
1543              
1544             =back
1545              
1546             Returns an enveloped result (an array).
1547              
1548             First element ($status_code) is an integer containing HTTP-like status code
1549             (200 means OK, 4xx caller error, 5xx function error). Second element
1550             ($reason) is a string containing error message, or something like "OK" if status is
1551             200. Third element ($payload) is the actual result, but usually not present when enveloped result is an error response ($status_code is not 2xx). Fourth
1552             element (%result_meta) is called result metadata and is optional, a hash
1553             that contains extra information, much like how HTTP response headers provide additional metadata.
1554              
1555             Return value: (any)
1556              
1557              
1558              
1559             =head2 uniq_files
1560              
1561             Usage:
1562              
1563             uniq_files(%args) -> [$status_code, $reason, $payload, \%result_meta]
1564              
1565             Report duplicate or unique files, optionally perform action on them.
1566              
1567             Given a list of filenames, will check each file's content (and/or size, and/or
1568             only name) to decide whether the file is a duplicate of another.
1569              
1570             There is a certain amount of flexibility on how duplicate is determined:
1571             - when comparing content, various hashing algorithms are supported;
1572             - when comparing size, a certain tolerance % is allowed;
1573             - when comparing filename, munging can first be done.
1574              
1575             There is flexibility on what to do with duplicate files:
1576             - just print unique/duplicate files (and let other utilities down the pipe deal
1577             with them);
1578             - move duplicates to some location;
1579             - open the files first and prompt for action;
1580             - let a Perl code process the files.
1581              
1582             Interface is loosely based on the C<uniq> Unix command-line program.
1583              
1584             This function is not exported by default, but exportable.
1585              
1586             Arguments ('*' denotes required arguments):
1587              
1588             =over 4
1589              
1590             =item * B<algorithm> => I<str>
1591              
1592             What algorithm is used to compute the digest of the content.
1593              
1594             The default is to use C<md5>. Some algorithms supported include C<crc32>, C<sha1>,
1595             C<sha256>, as well as C<Digest> to use Perl L<Digest> which supports a lot of
1596             other algorithms, e.g. C<SHA-1>, C<BLAKE2b>.
1597              
1598             If set to '', 'none', or 'size', then digest will be set to file size. This
1599             means uniqueness will be determined solely from file size. This can be quicker
1600             but will generate a false positive when two files of the same size are deemed as
1601             duplicate even though their content may be different.
1602              
1603             If set to 'name' then only name comparison will be performed. This of course can
1604             potentially generate lots of false positives, but in some cases you might want
1605             to compare filename for uniqueness.
1606              
1607             =item * B<authoritative_dirs> => I<array[str]>
1608              
1609             Denote director(yE<verbar>ies) where authoritativeE<sol>"Original" copies are found.
1610              
1611             =item * B<detail> => I<true>
1612              
1613             Show details (a.k.a. --show-digest, --show-size, --show-count).
1614              
1615             =item * B<digest_args> => I<array>
1616              
1617             Some Digest algorithms require arguments, you can pass them here.
1618              
1619             =item * B<exclude_empty_files> => I<bool>
1620              
1621             (No description)
1622              
1623             =item * B<exclude_file_patterns> => I<array[str]>
1624              
1625             Filename (including path) regex patterns to exclude.
1626              
1627             =item * B<files>* => I<array[str]>
1628              
1629             (No description)
1630              
1631             =item * B<group_by_digest> => I<bool>
1632              
1633             Sort files by their digest (or size, if not computing digest), separating files with different digests.
1634              
1635             =item * B<include_file_patterns> => I<array[str]>
1636              
1637             Filename (including path) regex patterns to include.
1638              
1639             =item * B<max_size> => I<filesize>
1640              
1641             Maximum file size to consider.
1642              
1643             =item * B<min_size> => I<filesize>
1644              
1645             Minimum file size to consider.
1646              
1647             =item * B<recurse> => I<bool>
1648              
1649             If set to true, will recurse into subdirectories.
1650              
1651             =item * B<report_duplicate> => I<int> (default: 2)
1652              
1653             Whether to return duplicate items.
1654              
1655             Can be set to either 0, 1, 2, or 3.
1656              
1657             If set to 0, duplicate items will not be returned.
1658              
1659             If set to 1 (the default for C<dupe-files>), will return all the duplicate
1660             files. For example: C<file1> contains text 'a', C<file2> 'b', C<file3> 'a'. Then
1661             C<file1> and C<file3> will be returned.
1662              
1663             If set to 2 (the default for C<uniq-files>), will only return the first of
1664             duplicate items. Continuing from previous example, only C<file1> will be returned
1665             because C<file2> is unique and C<file3> contains 'a' (already represented by
1666             C<file1>). If one or more C<--authoritative-dir> (C<-O>) options are specified,
1667             files under these directories will be preferred.
1668              
1669             If set to 3, will return all but the first of duplicate items. Continuing from
1670             previous example: C<file3> will be returned. This is useful if you want to keep
1671             only one copy of the duplicate content. You can use the output of this routine
1672             to C<mv> or C<rm>. Similar to the previous case, if one or more
1673             C<--authoritative-dir> (C<-O>) options are specified, then files under these
1674             directories will not be listed if possible.
1675              
1676             =item * B<report_unique> => I<bool> (default: 1)
1677              
1678             Whether to return unique items.
1679              
1680             =item * B<show_count> => I<bool> (default: 0)
1681              
1682             Whether to return each file content's number of occurrence.
1683              
1684             1 means the file content is only encountered once (unique), 2 means there is one
1685             duplicate, and so on.
1686              
1687             =item * B<show_digest> => I<true>
1688              
1689             Show the digest value (or the size, if not computing digest) for each file.
1690              
1691             Note that this routine does not compute digest for files which have unique
1692             sizes, so they will show up as empty.
1693              
1694             =item * B<show_size> => I<true>
1695              
1696             Show the size for each file.
1697              
1698              
1699             =back
1700              
1701             Returns an enveloped result (an array).
1702              
1703             First element ($status_code) is an integer containing HTTP-like status code
1704             (200 means OK, 4xx caller error, 5xx function error). Second element
1705             ($reason) is a string containing error message, or something like "OK" if status is
1706             200. Third element ($payload) is the actual result, but usually not present when enveloped result is an error response ($status_code is not 2xx). Fourth
1707             element (%result_meta) is called result metadata and is optional, a hash
1708             that contains extra information, much like how HTTP response headers provide additional metadata.
1709              
1710             Return value: (any)
1711              
1712             =head1 HOMEPAGE
1713              
1714             Please visit the project's homepage at L<https://metacpan.org/release/File-FindUniq>.
1715              
1716             =head1 SOURCE
1717              
1718             Source repository is at L<https://github.com/perlancar/perl-File-FindUniq>.
1719              
1720             =head1 SEE ALSO
1721              
1722             L<App::FindUtils>
1723              
1724             L<move-duplicate-files-to> from L<App::DuplicateFilesUtils>, which is basically
1725             a shortcut for C<< uniq-files -D -R . | while read f; do mv "$f" SOMEDIR/; done
1726             >>.
1727              
1728             =head1 AUTHOR
1729              
1730             perlancar <perlancar@cpan.org>
1731              
1732             =head1 CONTRIBUTING
1733              
1734              
1735             To contribute, you can send patches by email/via RT, or send pull requests on
1736             GitHub.
1737              
1738             Most of the time, you don't need to build the distribution yourself. You can
1739             simply modify the code, then test via:
1740              
1741             % prove -l
1742              
1743             If you want to build the distribution (e.g. to try to install it locally on your
1744             system), you can install L<Dist::Zilla>,
1745             L<Dist::Zilla::PluginBundle::Author::PERLANCAR>,
1746             L<Pod::Weaver::PluginBundle::Author::PERLANCAR>, and sometimes one or two other
1747             Dist::Zilla- and/or Pod::Weaver plugins. Any additional steps required beyond
1748             that are considered a bug and can be reported to me.
1749              
1750             =head1 COPYRIGHT AND LICENSE
1751              
1752             This software is copyright (c) 2025 by perlancar <perlancar@cpan.org>.
1753              
1754             This is free software; you can redistribute it and/or modify it under
1755             the same terms as the Perl 5 programming language system itself.
1756              
1757             =head1 BUGS
1758              
1759             Please report any bugs or feature requests on the bugtracker website L<https://rt.cpan.org/Public/Dist/Display.html?Name=File-FindUniq>
1760              
1761             When submitting a bug or request, please include a test-file or a
1762             patch to an existing test-file that illustrates the bug or desired
1763             feature.
1764              
1765             =cut