File Coverage

blib/lib/File/FindUniq.pm
Criterion Covered Total %
statement 188 262 71.7
branch 80 126 63.4
condition 53 86 61.6
subroutine 11 16 68.7
pod 1 1 100.0
total 333 491 67.8


line stmt bran cond sub pod time code
1             package File::FindUniq;
2              
3 2     2   530292 use 5.010001;
  2         9  
4 2     2   12 use strict;
  2         10  
  2         56  
5 2     2   15 use warnings;
  2         5  
  2         127  
6 2     2   4002 use Log::ger;
  2         123  
  2         13  
7              
8 2     2   694 use Cwd qw(abs_path);
  2         3  
  2         158  
9 2     2   11 use Exporter qw(import);
  2         4  
  2         68  
10 2     2   1347 use Perinci::Sub::Util qw(gen_modified_sub);
  2         7327  
  2         419  
11              
12             our $AUTHORITY = 'cpan:PERLANCAR'; # AUTHORITY
13             our $DATE = '2025-05-03'; # DATE
14             our $DIST = 'File-FindUniq'; # DIST
15             our $VERSION = '0.004'; # VERSION
16              
17             sub _glob {
18 10     10   109 require File::Find;
19              
20 10         24 my $dir;
21             my @res;
22             File::Find::finddepth(
23             sub {
24 30 50   30   459 return if -l $_;
25 30 100       271 return unless -f _;
26 2     2   24 no warnings 'once'; # $File::Find::dir
  2         3  
  2         8826  
27 20         4479 push @res, "$File::Find::dir/$_";
28             },
29 10         2052 @_,
30             );
31 10         98 @res;
32             }
33              
34             our @EXPORT_OK = qw(uniq_files dupe_files);
35              
36             our %SPEC;
37              
38             $SPEC{':package'} = {
39             v => 1.1,
40             summary => 'Find unique or duplicate file {contents,names}',
41             };
42              
43             our %argspec_authoritative_dirs = (
44             authoritative_dirs => {
45             summary => 'Denote director(y|ies) where authoritative/"Original" copies are found',
46             'x.name.is_plural' => 1,
47             'x.name.singular' => 'authoritative_dir',
48             schema => ['array*', of=>'str*'], # XXX dirname
49             cmdline_aliases => {O=>{}},
50             },
51             );
52             our %argspecs_filter = (
53             include_file_patterns => {
54             summary => 'Filename (including path) regex patterns to include',
55             'x.name.is_plural' => 1,
56             'x.name.singular' => 'include_file_pattern',
57             schema => ['array*', of=>'str*'], # XXX re
58             cmdline_aliases => {I=>{}},
59             },
60             exclude_file_patterns => {
61             summary => 'Filename (including path) regex patterns to exclude',
62             'x.name.is_plural' => 1,
63             'x.name.singular' => 'exclude_file_pattern',
64             schema => ['array*', of=>'str*'], # XXX re
65             cmdline_aliases => {X=>{}},
66             },
67             exclude_empty_files => {
68             schema => 'bool*',
69             cmdline_aliases => {Z=>{}},
70             },
71             min_size => {
72             summary => 'Minimum file size to consider',
73             schema => 'filesize*',
74             },
75             max_size => {
76             summary => 'Maximum file size to consider',
77             schema => 'filesize*',
78             },
79             );
80              
81             $SPEC{uniq_files} = {
82             v => 1.1,
83             summary => 'Report duplicate or unique files, optionally perform action on them',
84             description => <<'MARKDOWN',
85              
86             Given a list of filenames, will check each file's content (and/or size, and/or
87             only name) to decide whether the file is a duplicate of another.
88              
89             There is a certain amount of flexibility on how duplicate is determined:
90             - when comparing content, various hashing algorithm is supported;
91             - when comparing size, a certain tolerance % is allowed;
92             - when comparing filename, munging can first be done.
93              
94             There is flexibility on what to do with duplicate files:
95             - just print unique/duplicate files (and let other utilities down the pipe deal
96             with them);
97             - move duplicates to some location;
98             - open the files first and prompt for action;
99             - let a Perl code process the files.
100              
101             Interface is loosely based on the `uniq` Unix command-line program.
102              
103             MARKDOWN
104             args => {
105             # actions => {
106             # 'x.name.is_plural' => 1,
107             # 'x.name.singular' => 'action',
108             # summary => 'What action(s) to perform',
109             # schema => ['array*', of=>['str*', in=>[qw/report/]], 'prefilters'=>['Array::check_uniq']],
110             # default => ['report'],
111             # description => <<'MARKDOWN',
112             #
113             #The following actions are available. More than one action can be
114             #
115             #MARKDOWN
116             # tags => ['category:input'],
117             # },
118             files => {
119             schema => ['array*' => {of=>'str*'}],
120             req => 1,
121             pos => 0,
122             slurpy => 1,
123             tags => ['category:input'],
124             },
125              
126             recurse => {
127             schema => 'bool*',
128             cmdline_aliases => {R=>{}},
129             description => <<'MARKDOWN',
130              
131             If set to true, will recurse into subdirectories.
132              
133             MARKDOWN
134             tags => ['category:input'],
135             },
136             group_by_digest => {
137             summary => 'Sort files by its digest (or size, if not computing digest), separate each different digest',
138             schema => 'bool*',
139             },
140             show_digest => {
141             summary => 'Show the digest value (or the size, if not computing digest) for each file',
142             description => <<'MARKDOWN',
143              
144             Note that this routine does not compute digest for files which have unique
145             sizes, so they will show up as empty.
146              
147             MARKDOWN
148             schema => 'true*',
149             },
150             show_size => {
151             summary => 'Show the size for each file',
152             schema => 'true*',
153             },
154             # TODO add option follow_symlinks?
155             report_unique => {
156             schema => [bool => {default=>1}],
157             summary => 'Whether to return unique items',
158             cmdline_aliases => {
159             a => {
160             summary => 'Alias for --report-unique --report-duplicate=1 (report all files)',
161             code => sub {
162             my $args = shift;
163             $args->{report_unique} = 1;
164             $args->{report_duplicate} = 1;
165             },
166             },
167             u => {
168             summary => 'Alias for --report-unique --report-duplicate=0',
169             code => sub {
170             my $args = shift;
171             $args->{report_unique} = 1;
172             $args->{report_duplicate} = 0;
173             },
174             },
175             d => {
176             summary =>
177             'Alias for --noreport-unique --report-duplicate=1',
178             code => sub {
179             my $args = shift;
180             $args->{report_unique} = 0;
181             $args->{report_duplicate} = 1;
182             },
183             },
184             D => {
185             summary =>
186             'Alias for --noreport-unique --report-duplicate=3',
187             code => sub {
188             my $args = shift;
189             $args->{report_unique} = 0;
190             $args->{report_duplicate} = 3;
191             },
192             },
193             },
194             },
195             report_duplicate => {
196             schema => [int => {in=>[0,1,2,3], default=>2}],
197             summary => 'Whether to return duplicate items',
198             description => <<'MARKDOWN',
199              
200             Can be set to either 0, 1, 2, or 3.
201              
202             If set to 0, duplicate items will not be returned.
203              
204             If set to 1 (the default for `dupe-files`), will return all the duplicate
205             files. For example: `file1` contains text 'a', `file2` 'b', `file3` 'a'. Then
206             `file1` and `file3` will be returned.
207              
208             If set to 2 (the default for `uniq-files`), will only return the first of
209             duplicate items. Continuing from previous example, only `file1` will be returned
210             because `file2` is unique and `file3` contains 'a' (already represented by
211             `file1`). If one or more `--authoritative-dir` (`-O`) options are specified,
212             files under these directories will be preferred.
213              
214             If set to 3, will return all but the first of duplicate items. Continuing from
215             previous example: `file3` will be returned. This is useful if you want to keep
216             only one copy of the duplicate content. You can use the output of this routine
217             to `mv` or `rm`. Similar to the previous case, if one or more
218             `--authoritative-dir` (`-O`) options are specified, then files under these
219             directories will not be listed if possible.
220              
221             MARKDOWN
222             cmdline_aliases => {
223             },
224             },
225             algorithm => {
226             schema => ['str*'],
227             summary => "What algorithm is used to compute the digest of the content",
228             description => <<'MARKDOWN',
229              
230             The default is to use `md5`. Some algorithms supported include `crc32`, `sha1`,
231             `sha256`, as well as `Digest` to use Perl <pm:Digest> which supports a lot of
232             other algorithms, e.g. `SHA-1`, `BLAKE2b`.
233              
234             If set to '', 'none', or 'size', then digest will be set to file size. This
235             means uniqueness will be determined solely from file size. This can be quicker
236             but will generate a false positive when two files of the same size are deemed as
237             duplicate even though their content may be different.
238              
239             If set to 'name' then only name comparison will be performed. This of course can
240             potentially generate lots of false positives, but in some cases you might want
241             to compare filename for uniqueness.
242              
243             MARKDOWN
244             },
245             digest_args => {
246             schema => ['array*',
247              
248             # comment out temporarily, Perinci::Sub::GetArgs::Argv
249             # clashes with coerce rules; we should fix
250             # Perinci::Sub::GetArgs::Argv to observe coercion rules
251             # first
252             #of=>'str*',
253              
254             'x.perl.coerce_rules'=>['From_str::comma_sep']],
255             description => <<'MARKDOWN',
256              
257             Some Digest algorithms require arguments, you can pass them here.
258              
259             MARKDOWN
260             cmdline_aliases => {A=>{}},
261             },
262             show_count => {
263             schema => [bool => {default=>0}],
264             summary => "Whether to return each file content's ".
265             "number of occurence",
266             description => <<'MARKDOWN',
267              
268             1 means the file content is only encountered once (unique), 2 means there is one
269             duplicate, and so on.
270              
271             MARKDOWN
272             cmdline_aliases => {count=>{}, c=>{}},
273             },
274             detail => {
275             summary => 'Show details (a.k.a. --show-digest, --show-size, --show-count)',
276             schema => 'true*',
277             cmdline_aliases => {l=>{}},
278             },
279             %argspec_authoritative_dirs,
280             %argspecs_filter,
281             },
282             examples => [
283             {
284             summary => 'List all files which do not have duplicate contents',
285             src => 'uniq-files *',
286             src_plang => 'bash',
287             test => 0,
288             'x.doc.show_result' => 0,
289             },
290             {
291             summary => 'List all files (recursively, and in detail) which have duplicate contents (all duplicate copies), exclude some files',
292             src => q(uniq-files -R -l -d -X '\.git/' --min-size 10k .),
293             src_plang => 'bash',
294             test => 0,
295             'x.doc.show_result' => 0,
296             },
297             {
298             summary => 'Move all duplicate files (except one copy) in this directory (and subdirectories) to .dupes/',
299             src => 'uniq-files -D -R * | while read f; do mv "$f" .dupes/; done',
300             src_plang => 'bash',
301             test => 0,
302             'x.doc.show_result' => 0,
303             },
304             {
305             summary => 'List number of occurences of contents for duplicate files',
306             src => 'uniq-files -c *',
307             src_plang => 'bash',
308             test => 0,
309             'x.doc.show_result' => 0,
310             },
311             {
312             summary => 'List number of occurences of contents for all files',
313             src => 'uniq-files -a -c *',
314             src_plang => 'bash',
315             test => 0,
316             'x.doc.show_result' => 0,
317             },
318             {
319             summary => 'List all files, along with their number of content occurrences and content digest. '.
320             'Use the BLAKE2b digest algorithm. And group the files according to their digest.',
321             src => 'uniq-files -a -c --show-digest -A BLAKE2,blake2b *',
322             src_plang => 'bash',
323             test => 0,
324             'x.doc.show_result' => 0,
325             },
326             ],
327             };
328             sub uniq_files {
329 11     11 1 624989 my %args = @_;
330              
331 11         42 my $files = delete $args{files};
332 11 50 33     93 return [400, "Please specify files"] if !$files || !@$files;
333 11         32 my $recurse = delete($args{recurse});
334 11   100     49 my $report_unique = delete($args{report_unique}) // 1;
335 11   100     42 my $report_duplicate = delete($args{report_duplicate}) // 2;
336 11   100     48 my $show_count = delete($args{show_count}) // 0;
337 11   100     45 my $show_digest = delete($args{show_digest}) // 0;
338 11   100     48 my $show_size = delete($args{show_size}) // 0;
339 11         23 my $digest_args = delete($args{digest_args});
340 11 50 66     58 my $algorithm = delete($args{algorithm}) // ($digest_args ? 'Digest' : 'md5');
341 11         24 my $group_by_digest = delete($args{group_by_digest});
342 11         22 my $detail = delete($args{detail});
343 11         23 my $authoritative_dirs = delete($args{authoritative_dirs});
344 11         24 my $include_file_patterns = delete($args{include_file_patterns});
345 11         21 my $exclude_file_patterns = delete($args{exclude_file_patterns});
346 11   50     59 my $exclude_empty_files = delete($args{exclude_empty_files}) // 0;
347 11         20 my $min_size = delete($args{min_size});
348 11         20 my $max_size = delete($args{max_size});
349             return [400, "Unknown argument(s): ".join(", ", sort keys %args)]
350 11 50       44 if grep {!/\A-/} keys %args;
  0         0  
351              
352 11 50       32 if ($detail) {
353 0         0 $show_digest = 1;
354 0         0 $show_size = 1;
355 0         0 $show_count = 1;
356             }
357              
358 11 100 66     58 my @authoritative_dirs = $authoritative_dirs && @$authoritative_dirs ?
359             @$authoritative_dirs : ();
360 11         56 for my $dir (@authoritative_dirs) {
361 2 50       53 (-d $dir) or return [400, "Authoritative dir '$dir' does not exist or not a directory"];
362 2 50       42 my $abs_dir = abs_path $dir or return [400, "Cannot get absolute path for authoritative dir '$dir'"];
363 2         25 $dir = $abs_dir;
364             }
365             #log_trace "authoritative_dirs=%s", \@authoritative_dirs if @authoritative_dirs;
366              
367 11         20 my @include_re;
368 11   50     19 for my $re0 (@{ $include_file_patterns // [] }) {
  11         66  
369 0         0 require Regexp::Util;
370 0         0 my $re;
371 0 0       0 if (ref $re0 eq 'Regexp') {
372 0         0 $re = $re0;
373             } else {
374 0         0 eval { $re = Regexp::Util::deserialize_regexp("qr($re0)") };
  0         0  
375 0 0       0 return [400, "Invalid/unsafe regex pattern in include_file_patterns '$re0': $@"] if $@;
376 0 0       0 return [400, "Unsafe regex pattern (contains embedded code) in include_file_patterns '$re0'"] if Regexp::Util::regexp_seen_evals($re);
377             }
378 0         0 push @include_re, $re;
379             }
380 11         25 my @exclude_re;
381 11   50     18 for my $re0 (@{ $exclude_file_patterns // [] }) {
  11         54  
382 0         0 require Regexp::Util;
383 0         0 my $re;
384 0 0       0 if (ref $re0 eq 'Regexp') {
385 0         0 $re = $re0;
386             } else {
387 0         0 eval { $re = Regexp::Util::deserialize_regexp("qr($re0)") };
  0         0  
388 0 0       0 return [400, "Invalid/unsafe regex pattern in exclude_file_patterns '$re0': $@"] if $@;
389 0 0       0 return [400, "Unsafe regex pattern (contains embedded code) in exclude_file_patterns '$re0'"] if Regexp::Util::regexp_seen_evals($re);
390             }
391 0         0 push @exclude_re, $re;
392             }
393              
394 11 100       36 if ($recurse) {
395             $files = [ map {
396 5 50       17 if (-l $_) {
  35 100       430  
397 0         0 ();
398             } elsif (-d _) {
399 10         37 (_glob($_));
400             } else {
401 25         63 ($_);
402             }
403             } @$files ];
404             }
405              
406             FILTER: {
407 11         24 my $ffiles;
  11         24  
408             FILE:
409 11         30 for my $f (@$files) {
410 87 50       1065 if (-l $f) {
411 0         0 log_warn "File '$f' is a symlink, ignored";
412 0         0 next FILE;
413             }
414 87 100       200 if (-d _) {
415 12         53 log_warn "File '$f' is a directory, ignored";
416 12         51 next FILE;
417             }
418 75 50       144 unless (-f _) {
419 0         0 log_warn "File '$f' is not a regular file, ignored";
420 0         0 next FILE;
421             }
422              
423 75 50       153 if (@include_re) {
424 0         0 my $included;
425 0         0 for my $re (@include_re) {
426 0 0       0 if ($f =~ $re) { $included++; last }
  0         0  
  0         0  
427             }
428 0 0       0 unless ($included) {
429 0         0 log_info "File '$f' is not in --include-file-patterns, skipped";
430 0         0 next FILE;
431             }
432             }
433 75 50       144 if (@exclude_re) {
434 0         0 for my $re (@exclude_re) {
435 0 0       0 if ($f =~ $re) {
436 0         0 log_info "File '$f' is in --exclude-file-patterns, skipped";
437 0         0 next FILE;
438             }
439             }
440             }
441              
442 75         625 my $size = -s $f;
443 75 50 33     185 if ($exclude_empty_files && !$size) {
444 0         0 log_info "File '$f' is empty, skipped by option -Z";
445 0         0 next FILE;
446             }
447 75 50 33     181 if (defined($min_size) && $size < $min_size) {
448 0         0 log_info "File '$f' (size=$size) is smaller than min_size ($min_size), skipped";
449 0         0 next FILE;
450             }
451 75 50 33     148 if (defined($max_size) && $size > $max_size) {
452 0         0 log_info "File '$f' (size=$size) is larger than max_size ($max_size), skipped";
453 0         0 next FILE;
454             }
455              
456 75         248 push @$ffiles, $f;
457             }
458 11         31 $files = $ffiles;
459             } # FILTER
460              
461 11         27 my %basename_paths; # key = basename (computed), value = [path, ...]
462             my %path_basenames; # key = path, value = basename
463             GROUP_FILE_NAMES: {
464 11         18 for my $f (@$files) {
  11         27  
465             #my $path = abs_path($f);
466 75         203 (my $basename = $f) =~ s!.+/!!;
467 75   50     334 $basename_paths{$basename} //= [];
468 75         208 push @{ $basename_paths{$basename} }, $f
469 75 50       103 unless grep { $_ eq $f } @{ $basename_paths{$basename} };
  0         0  
  75         222  
470 75         179 $path_basenames{$f} = $basename;
471             }
472             }
473             #use DD; print "basename_paths: "; dd \%basename_paths;
474              
475 11         36 my %size_counts; # key = size, value = number of files having that size
476             my %size_paths; # key = size, value = [path, ...]
477 11         0 my %path_sizes; # key = path, value = file size, for caching stat()
478             GET_FILE_SIZES: {
479 11         19 for my $f (@$files) {
  11         29  
480 75         653 my @st = stat $f;
481 75 50       157 unless (@st) {
482 0         0 log_error("Can't stat file `$f`: $!, skipped");
483 0         0 next;
484             }
485 75         171 $size_counts{$st[7]}++;
486 75   100     200 $size_paths{$st[7]} //= [];
487 75         118 push @{$size_paths{$st[7]}}, $f;
  75         158  
488 75         219 $path_sizes{$f} = $st[7];
489             }
490             }
491             #use DD; print "size_paths: "; dd \%size_paths;
492              
493             # calculate digest for all files having non-unique sizes
494 11         4108 my %digest_counts; # key = digest, value = num of files having that digest
495             my %digest_paths; # key = digest, value = [file, ...]
496 11         0 my %path_digests; # key = path, value = file digest
497             CALC_FILE_DIGESTS: {
498 11         18 require File::Digest;
  11         892  
499              
500 11         6795 for my $f (@$files) {
501 75 50       204 next unless defined $path_sizes{$f}; # just checking. all files should have sizes.
502              
503 75         110 my $digest;
504 75 100 66     575 if ($algorithm eq '' || $algorithm eq 'none' || $algorithm eq 'size') {
    50 66        
505 9         13 $digest = $path_sizes{$f};
506             } elsif ($algorithm eq 'name') {
507 0         0 $digest = $path_basenames{$f};
508             } else {
509 66 100       206 next if $size_counts{ $path_sizes{$f} } == 1; # skip unique file sizes.
510 60         176 my $res = File::Digest::digest_file(
511             file=>$f, algorithm=>$algorithm, digest_args=>$digest_args);
512 60 50       24529 return [500, "Can't calculate digest for file '$f': $res->[0] - $res->[1]"]
513             unless $res->[0] == 200;
514 60         147 $digest = $res->[2];
515             }
516 69         164 $digest_counts{$digest}++;
517 69   100     578 $digest_paths{$digest} //= [];
518 69         145 push @{$digest_paths{$digest}}, $f;
  69         176  
519 69         213 $path_digests{$f} = $digest;
520             }
521             }
522             #use DD; print "digest_paths: "; dd \%digest_paths;
523             #use DD; print "path_digests: "; dd \%path_digests;
524              
525 11         23 my %path_counts; # key = path, value = num of files having file content
526 11         31 for my $f (@$files) {
527 75 50       152 next unless defined $path_sizes{$f}; # just checking, all files should have sizes
528 75 100       158 if (!defined($path_digests{$f})) {
529 6         18 $path_counts{$f} = $size_counts{ $path_sizes{$f} };
530             } else {
531 69         153 $path_counts{$f} = $digest_counts{ $path_digests{$f} };
532             }
533             }
534             #use DD; print "path_counts: "; dd \%path_counts;
535              
536             SORT_DUPLICATE_FILES: {
537 11 100       18 last unless @authoritative_dirs;
  11         37  
538 2         6 my $hash = \%digest_paths;
539 2         9 for my $key (keys %$hash) {
540 10         17 my @files = @{ $hash->{$key} };
  10         28  
541 10         18 my @abs_files;
542 10 100       28 next unless @files > 1;
543 4         9 for my $file (@files) {
544 12 50       237 my $abs_file = abs_path $file or do {
545 0         0 log_error "Cannot find absolute path for duplicate file '$file', skipping duplicate set %s", \@files;
546             };
547 12         30 push @abs_files, $abs_file;
548             }
549              
550             #log_trace "Duplicate files before sorting: %s", \@files;
551 12         53 @files = map { $files[$_] } sort {
552 4         23 my $file_a = $abs_files[$a];
  10         21  
553 10         15 my $file_a_in_authoritative_dirs = 0;
554 10         16 my $subdir_len_file_a;
555 10         18 for my $d (@authoritative_dirs) {
556 10 50       93 if ($file_a =~ m!\A\Q$d\E(?:/|\z)(.*)!) { $file_a_in_authoritative_dirs++; $subdir_len_file_a = length($1); last }
  0         0  
  0         0  
  0         0  
557             }
558 10         18 my $file_b = $abs_files[$b];
559 10         17 my $file_b_in_authoritative_dirs = 0;
560 10         17 my $subdir_len_file_b;
561 10         20 for my $d (@authoritative_dirs) {
562 10 100       70 if ($file_b =~ m!\A\Q$d\E(?:/|\z)(.*)!) { $file_b_in_authoritative_dirs++; $subdir_len_file_b = length($1); last }
  2         3  
  2         10  
  2         4  
563             }
564             #log_trace " file_a=<$file_a>, in authoritative_dirs? $file_a_in_authoritative_dirs";
565             #log_trace " file_b=<$file_b>, in authoritative_dirs? $file_b_in_authoritative_dirs";
566             # files located near the root of authoritative dir is preferred
567             # to deeper files. this is done by comparing subdir_len
568 10 50       46 ($file_a_in_authoritative_dirs ? $subdir_len_file_a : 9999) <=> ($file_b_in_authoritative_dirs ? $subdir_len_file_b : 9999) ||
    100          
    50          
569             $file_a cmp $file_b;
570             } 0..$#files;
571             #log_trace "Duplicate files after sorting: %s", \@files;
572              
573 4         19 $hash->{$key} = \@files;
574             }
575             }
576              
577             #$log->trace("report_duplicate=$report_duplicate");
578 11         22 my @files;
579 11         76 for my $f (sort keys %path_counts) {
580 75 100       225 if ($path_counts{$f} == 1) {
581 24         71 log_trace "unique file '$f'";
582 24 100       87 push @files, $f if $report_unique;
583             } else {
584 51         178 log_trace "duplicate file '$f'";
585 51         10952 my $is_first_copy = $f eq $digest_paths{ $path_digests{$f} }[0];
586 51         171 log_trace "is first copy? <$is_first_copy>";
587 51 100       221 if ($report_duplicate == 0) {
    100          
    100          
    50          
588             # do not report dupe files
589             } elsif ($report_duplicate == 1) {
590 15         40 push @files, $f;
591             } elsif ($report_duplicate == 2) {
592 21 100       59 push @files, $f if $is_first_copy;
593             } elsif ($report_duplicate == 3) {
594 9 100       27 push @files, $f unless $is_first_copy;
595             } else {
596 0         0 die "Invalid value for --report-duplicate ".
597             "'$report_duplicate', please choose 0/1/2/3";
598             }
599             }
600             }
601              
602             GROUP_FILES_BY_DIGEST: {
603 11 100       24 last unless $group_by_digest;
  11         33  
604             @files = sort {
605 1         8 $path_sizes{$a} <=> $path_sizes{$b} ||
606 20 50 50     57 ($path_digests{$a} // '') cmp ($path_digests{$b} // '')
      50        
607             } @files;
608             }
609              
610 11         53 my @rows;
611             my %resmeta;
612 11         0 my $last_digest;
613 11         27 for my $f (@files) {
614 41   66     111 my $digest = $path_digests{$f} // $path_sizes{$f};
615              
616             # add separator row
617 41 100 100     121 if ($group_by_digest && defined $last_digest && $digest ne $last_digest) {
      100        
618 4 50 33     4076 push @rows, ($show_count || $show_digest || $show_size) ? {} : '';
619             }
620              
621 41         64 my $row;
622 41 100 100     161 if ($show_count || $show_digest || $show_size) {
      100        
623 19         60 $row = {file=>$f};
624 19 100       76 $row->{count} = $path_counts{$f} if $show_count;
625 19 100       51 $row->{digest} = $path_digests{$f} if $show_digest;
626 19 100       44 $row->{size} = $path_sizes{$f} if $show_size;
627             } else {
628 22         34 $row = $f;
629             }
630 41         78 push @rows, $row;
631 41         105 $last_digest = $digest;
632             }
633              
634 11 100 100     79 $resmeta{'table.fields'} = [qw/file size digest count/]
      100        
635             if $show_count || $show_digest || $show_size;
636              
637 11         331 [200, "OK", \@rows, \%resmeta];
638             }
639              
640             # dupe_files
641             gen_modified_sub(
642             base_name => 'uniq_files',
643             output_name => 'dupe_files',
644             description => <<'MARKDOWN',
645              
646             This is a thin wrapper for <prog:uniq-files>. It defaults `report_unique` to 0
647             and `report_duplicate` to 1.
648              
649             MARKDOWN
650             modify_args => {
651             report_unique => sub {
652             $_[0]{schema} = [bool => {default=>0}];
653             },
654             report_duplicate => sub {
655             $_[0]{schema} = [int => {in=>[0,1,2,3], default=>1}];
656             },
657             },
658             modify_meta => sub {
659             $_[0]{examples} = [
660             {
661             summary => 'List all files (recursively, and in detail) which have duplicate contents (all duplicate copies)',
662             src => 'dupe-files -lR *',
663             src_plang => 'bash',
664             test => 0,
665             'x.doc.show_result' => 0,
666             },
667             ];
668             },
669             output_code => sub {
670 0     0     my %args = @_;
671 0   0       $args{report_unique} //= 0;
672 0   0       $args{report_duplicate} //= 1;
673 0           uniq_files(%args);
674             },
675             );
676              
677             # uniq_filenames
678             gen_modified_sub(
679             base_name => 'uniq_files',
680             output_name => 'uniq_filenames',
681             description => <<'MARKDOWN',
682              
683             This is a thin wrapper for <prog:uniq-files>. It sets `algorithm` to `name`.
684              
685             MARKDOWN
686             remove_args => ['algorithm'],
687             modify_meta => sub {
688             $_[0]{examples} = [
689             {
690             summary => 'Find unique filenames in two directories',
691             src => 'uniq-filenames -uR dir1 dir2',
692             src_plang => 'bash',
693             test => 0,
694             'x.doc.show_result' => 0,
695             },
696             ];
697             },
698             output_code => sub {
699 0     0     my %args = @_;
700 0           uniq_files(%args, algorithm => 'name');
701             },
702             );
703              
704             # dupe_filenames
705             gen_modified_sub(
706             base_name => 'uniq_files',
707             output_name => 'dupe_filenames',
708             description => <<'MARKDOWN',
709              
710             This is a thin wrapper for <prog:uniq-files>. It sets `algorithm` to `name`,
711             defaults `report_unique` to 0 and `report_duplicate` to 1.
712              
713             MARKDOWN
714             remove_args => ['algorithm'],
715             modify_args => {
716             report_unique => sub {
717             $_[0]{schema} = [bool => {default=>0}];
718             },
719             report_duplicate => sub {
720             $_[0]{schema} = [int => {in=>[0,1,2,3], default=>1}];
721             },
722             },
723             modify_meta => sub {
724             $_[0]{examples} = [
725             {
726             summary => 'Find duplicate filenames in two directories',
727             src => 'dupe-filenames -R dir1 dir2',
728             src_plang => 'bash',
729             test => 0,
730             'x.doc.show_result' => 0,
731             },
732             ];
733             },
734             output_code => sub {
735 0     0     my %args = @_;
736 0   0       $args{report_unique} //= 0;
737 0   0       $args{report_duplicate} //= 1;
738 0           uniq_files(%args, algorithm=>'name');
739             },
740             );
741              
742             # uniq_filenames_between_two_dirs
743             gen_modified_sub(
744             base_name => 'uniq_files',
745             output_name => 'uniq_filenames_between_two_dirs',
746             description => <<'MARKDOWN',
747              
748             This is a thin wrapper for <prog:uniq-files>. It sets `algorithm` to `name`,
749             `recurse` to true. It also accepts two directory names instead of one+ dir/file
750             names.
751              
752             MARKDOWN
753             add_args => {
754             dir1 => {
755             schema => 'dirname*',
756             req => 1,
757             pos => 0,
758             },
759             dir2 => {
760             schema => 'dirname*',
761             req => 1,
762             pos => 1,
763             },
764             },
765             remove_args => ['algorithm', 'files', 'recurse'],
766             modify_meta => sub {
767             $_[0]{examples} = [
768             {
769             summary => 'Find unique filenames in two directories',
770             src => 'uniq-filenames-between-two-dirs -u dir1 dir2',
771             src_plang => 'bash',
772             test => 0,
773             'x.doc.show_result' => 0,
774             },
775             ];
776             },
777             output_code => sub {
778 0     0     my %args = @_;
779 0           my $dir1 = delete $args{dir1};
780 0           my $dir2 = delete $args{dir2};
781 0           uniq_files(
782             %args,
783             files => [$dir1, $dir2],
784             algorithm => 'name',
785             recurse => 1,
786             );
787             },
788             );
789              
790             # dupe_filenames_between_two_dirs
791             gen_modified_sub(
792             base_name => 'uniq_files',
793             output_name => 'dupe_filenames_between_two_dirs',
794             description => <<'MARKDOWN',
795              
796             This is a thin wrapper for <prog:uniq-files>. It sets `algorithm` to `name`,
797             `recurse` to true, defaults `report_unique` to 0 and `report_duplicate` to 1. It
798             also accepts two directory names instead of one+ dir/file names.
799              
800             MARKDOWN
801             add_args => {
802             dir1 => {
803             schema => 'dirname*',
804             req => 1,
805             pos => 0,
806             },
807             dir2 => {
808             schema => 'dirname*',
809             req => 1,
810             pos => 1,
811             },
812             },
813             remove_args => ['algorithm', 'files', 'recurse'],
814             modify_args => {
815             report_unique => sub {
816             $_[0]{schema} = [bool => {default=>0}];
817             },
818             report_duplicate => sub {
819             $_[0]{schema} = [int => {in=>[0,1,2,3], default=>1}];
820             },
821             },
822             modify_meta => sub {
823             $_[0]{examples} = [
824             {
825             summary => 'Find duplicate filenames in two directories',
826             src => 'dupe-filenames-between-two-dirs dir1 dir2',
827             src_plang => 'bash',
828             test => 0,
829             'x.doc.show_result' => 0,
830             },
831             ];
832             },
833             output_code => sub {
834 0     0     my %args = @_;
835 0           my $dir1 = delete $args{dir1};
836 0           my $dir2 = delete $args{dir2};
837 0   0       $args{report_unique} //= 0;
838 0   0       $args{report_duplicate} //= 1;
839 0           uniq_files(
840             %args,
841             files => [$dir1, $dir2],
842             algorithm => 'name',
843             recurse => 1,
844             );
845             },
846             );
847              
848              
849             1;
850             # ABSTRACT: Find unique or duplicate file {contents,names}
851              
852             __END__
853              
854             =pod
855              
856             =encoding UTF-8
857              
858             =head1 NAME
859              
860             File::FindUniq - Find unique or duplicate file {contents,names}
861              
862             =head1 VERSION
863              
864             This document describes version 0.004 of File::FindUniq (from Perl distribution File-FindUniq), released on 2025-05-03.
865              
866             =head1 SYNOPSIS
867              
868             Given this directory content:
869              
870             filename size (bytes) content
871             -------- ------------ -------
872             foo 0
873             bar 0
874             baz 3 123
875             qux 3 456
876             quux 3 123
877             sub/foo 5 abcde
878             sub/bar 0
879              
880             To list files and skip duplicate contents:
881              
882             use File::FindUniq qw(dupe_files uniq_files);
883             my $res = uniq_files(files => [glob "*"], recurse=>1);
884             # => [200, "OK", ["bar", "baz", "qux", "sub/foo"], {}]
885             # although bar content (0 bytes) is not unique, it's the first seen copy, so included
886             # foo is deemed as duplicate of bar, so skipped
887             # although baz content ("123") is not unique, it's the first seen copy, so included
888             # quux is deemed as duplicate of baz, so skipped
889             # sub/bar is deemed as duplicate of bar, so skipped
890              
891             To list only duplicate files (including the first copy):
892              
893             my $res = dupe_files(files => [glob "*"], recurse=>1);
894             # => [200, "OK", ["bar", "baz", "foo", "quux", "sub/bar"], {}]
895             # qux's content is unique, so skipped
896             # sub/foo's content is unique, so skipped
897             # foo's content is not unique, but it's the first
898              
899             To only report unique filenames:
900              
901             my $res = uniq_files(files => [glob "*"], recurse=>1,
902             algorithm=>'name');
903             # => [200, "OK", ["bar", "baz", "foo", "quux", "qux"], {}]
904              
905             To report filenames that have duplicates:
906              
907             my $res = dupe_files(files => [glob "*"], recurse=>1,
908             algorithm=>'name');
909             # => [200, "OK", ["bar", "foo", "sub/bar", "sub/foo"], {}]
910              
911             =head1 DESCRIPTION
912              
913             Keywords: unique files, unique file names, duplicate files, duplicate file
914             names.
915              
916             =head1 NOTES
917              
918             =head1 FUNCTIONS
919              
920              
921             =head2 dupe_filenames
922              
923             Usage:
924              
925             dupe_filenames(%args) -> [$status_code, $reason, $payload, \%result_meta]
926              
927             Report duplicate or unique files, optionally perform action on them.
928              
929             This is a thin wrapper for L<uniq-files>. It sets C<algorithm> to C<name>,
930             defaults C<report_unique> to 0 and C<report_duplicate> to 1.
931              
932             This function is not exported.
933              
934             Arguments ('*' denotes required arguments):
935              
936             =over 4
937              
938             =item * B<authoritative_dirs> => I<array[str]>
939              
940             Denote director(yE<verbar>ies) where authoritativeE<sol>"Original" copies are found.
941              
942             =item * B<detail> => I<true>
943              
944             Show details (a.k.a. --show-digest, --show-size, --show-count).
945              
946             =item * B<digest_args> => I<array>
947              
948             Some Digest algorithms require arguments, you can pass them here.
949              
950             =item * B<exclude_empty_files> => I<bool>
951              
952             (No description)
953              
954             =item * B<exclude_file_patterns> => I<array[str]>
955              
956             Filename (including path) regex patterns to exclude.
957              
958             =item * B<files>* => I<array[str]>
959              
960             (No description)
961              
962             =item * B<group_by_digest> => I<bool>
963              
964             Sort files by its digest (or size, if not computing digest), separate each different digest.
965              
966             =item * B<include_file_patterns> => I<array[str]>
967              
968             Filename (including path) regex patterns to include.
969              
970             =item * B<max_size> => I<filesize>
971              
972             Maximum file size to consider.
973              
974             =item * B<min_size> => I<filesize>
975              
976             Minimum file size to consider.
977              
978             =item * B<recurse> => I<bool>
979              
980             If set to true, will recurse into subdirectories.
981              
982             =item * B<report_duplicate> => I<int> (default: 1)
983              
984             Whether to return duplicate items.
985              
986             Can be set to either 0, 1, 2, or 3.
987              
988             If set to 0, duplicate items will not be returned.
989              
990             If set to 1 (the default for C<dupe-files>), will return all the duplicate
991             files. For example: C<file1> contains text 'a', C<file2> 'b', C<file3> 'a'. Then
992             C<file1> and C<file3> will be returned.
993              
994             If set to 2 (the default for C<uniq-files>), will only return the first of
995             duplicate items. Continuing from previous example, only C<file1> will be returned
996             because C<file2> is unique and C<file3> contains 'a' (already represented by
997             C<file1>). If one or more C<--authoritative-dir> (C<-O>) options are specified,
998             files under these directories will be preferred.
999              
1000             If set to 3, will return all but the first of duplicate items. Continuing from
1001             previous example: C<file3> will be returned. This is useful if you want to keep
1002             only one copy of the duplicate content. You can use the output of this routine
1003             to C<mv> or C<rm>. Similar to the previous case, if one or more
1004             C<--authoritative-dir> (C<-O>) options are specified, then files under these
1005             directories will not be listed if possible.
1006              
1007             =item * B<report_unique> => I<bool> (default: 0)
1008              
1009             Whether to return unique items.
1010              
1011             =item * B<show_count> => I<bool> (default: 0)
1012              
1013             Whether to return each file content's number of occurrence.
1014              
1015             1 means the file content is only encountered once (unique), 2 means there is one
1016             duplicate, and so on.
1017              
1018             =item * B<show_digest> => I<true>
1019              
1020             Show the digest value (or the size, if not computing digest) for each file.
1021              
1022             Note that this routine does not compute digest for files which have unique
1023             sizes, so they will show up as empty.
1024              
1025             =item * B<show_size> => I<true>
1026              
1027             Show the size for each file.
1028              
1029              
1030             =back
1031              
1032             Returns an enveloped result (an array).
1033              
1034             First element ($status_code) is an integer containing HTTP-like status code
1035             (200 means OK, 4xx caller error, 5xx function error). Second element
1036             ($reason) is a string containing error message, or something like "OK" if status is
1037             200. Third element ($payload) is the actual result, but usually not present when enveloped result is an error response ($status_code is not 2xx). Fourth
1038             element (%result_meta) is called result metadata and is optional, a hash
1039             that contains extra information, much like how HTTP response headers provide additional metadata.
1040              
1041             Return value: (any)
1042              
1043              
1044              
1045             =head2 dupe_filenames_between_two_dirs
1046              
1047             Usage:
1048              
1049             dupe_filenames_between_two_dirs(%args) -> [$status_code, $reason, $payload, \%result_meta]
1050              
1051             Report duplicate or unique files, optionally perform action on them.
1052              
1053             This is a thin wrapper for L<uniq-files>. It sets C<algorithm> to C<name>,
1054             C<recurse> to true, defaults C<report_unique> to 0 and C<report_duplicate> to 1. It
1055             also accepts two directory names instead of one+ dir/file names.
1056              
1057             This function is not exported.
1058              
1059             Arguments ('*' denotes required arguments):
1060              
1061             =over 4
1062              
1063             =item * B<authoritative_dirs> => I<array[str]>
1064              
1065             Denote director(yE<verbar>ies) where authoritativeE<sol>"Original" copies are found.
1066              
1067             =item * B<detail> => I<true>
1068              
1069             Show details (a.k.a. --show-digest, --show-size, --show-count).
1070              
1071             =item * B<digest_args> => I<array>
1072              
1073             Some Digest algorithms require arguments, you can pass them here.
1074              
1075             =item * B<dir1>* => I<dirname>
1076              
1077             (No description)
1078              
1079             =item * B<dir2>* => I<dirname>
1080              
1081             (No description)
1082              
1083             =item * B<exclude_empty_files> => I<bool>
1084              
1085             (No description)
1086              
1087             =item * B<exclude_file_patterns> => I<array[str]>
1088              
1089             Filename (including path) regex patterns to exclude.
1090              
1091             =item * B<group_by_digest> => I<bool>
1092              
1093             Sort files by its digest (or size, if not computing digest), separate each different digest.
1094              
1095             =item * B<include_file_patterns> => I<array[str]>
1096              
1097             Filename (including path) regex patterns to include.
1098              
1099             =item * B<max_size> => I<filesize>
1100              
1101             Maximum file size to consider.
1102              
1103             =item * B<min_size> => I<filesize>
1104              
1105             Minimum file size to consider.
1106              
1107             =item * B<report_duplicate> => I<int> (default: 1)
1108              
1109             Whether to return duplicate items.
1110              
1111             Can be set to either 0, 1, 2, or 3.
1112              
1113             If set to 0, duplicate items will not be returned.
1114              
1115             If set to 1 (the default for C<dupe-files>), will return all the duplicate
1116             files. For example: C<file1> contains text 'a', C<file2> 'b', C<file3> 'a'. Then
1117             C<file1> and C<file3> will be returned.
1118              
1119             If set to 2 (the default for C<uniq-files>), will only return the first of
1120             duplicate items. Continuing from previous example, only C<file1> will be returned
1121             because C<file2> is unique and C<file3> contains 'a' (already represented by
1122             C<file1>). If one or more C<--authoritative-dir> (C<-O>) options are specified,
1123             files under these directories will be preferred.
1124              
1125             If set to 3, will return all but the first of duplicate items. Continuing from
1126             previous example: C<file3> will be returned. This is useful if you want to keep
1127             only one copy of the duplicate content. You can use the output of this routine
1128             to C<mv> or C<rm>. Similar to the previous case, if one or more
1129             C<--authoritative-dir> (C<-O>) options are specified, then files under these
1130             directories will not be listed if possible.
1131              
1132             =item * B<report_unique> => I<bool> (default: 0)
1133              
1134             Whether to return unique items.
1135              
1136             =item * B<show_count> => I<bool> (default: 0)
1137              
1138             Whether to return each file content's number of occurrence.
1139              
1140             1 means the file content is only encountered once (unique), 2 means there is one
1141             duplicate, and so on.
1142              
1143             =item * B<show_digest> => I<true>
1144              
1145             Show the digest value (or the size, if not computing digest) for each file.
1146              
1147             Note that this routine does not compute digest for files which have unique
1148             sizes, so they will show up as empty.
1149              
1150             =item * B<show_size> => I<true>
1151              
1152             Show the size for each file.
1153              
1154              
1155             =back
1156              
1157             Returns an enveloped result (an array).
1158              
1159             First element ($status_code) is an integer containing HTTP-like status code
1160             (200 means OK, 4xx caller error, 5xx function error). Second element
1161             ($reason) is a string containing error message, or something like "OK" if status is
1162             200. Third element ($payload) is the actual result, but usually not present when enveloped result is an error response ($status_code is not 2xx). Fourth
1163             element (%result_meta) is called result metadata and is optional, a hash
1164             that contains extra information, much like how HTTP response headers provide additional metadata.
1165              
1166             Return value: (any)
1167              
1168              
1169              
1170             =head2 dupe_files
1171              
1172             Usage:
1173              
1174             dupe_files(%args) -> [$status_code, $reason, $payload, \%result_meta]
1175              
1176             Report duplicate or unique files, optionally perform action on them.
1177              
1178             This is a thin wrapper for L<uniq-files>. It defaults C<report_unique> to 0
1179             and C<report_duplicate> to 1.
1180              
1181             This function is not exported by default, but exportable.
1182              
1183             Arguments ('*' denotes required arguments):
1184              
1185             =over 4
1186              
1187             =item * B<algorithm> => I<str>
1188              
1189             What algorithm is used to compute the digest of the content.
1190              
1191             The default is to use C<md5>. Some algorithms supported include C<crc32>, C<sha1>,
1192             C<sha256>, as well as C<Digest> to use Perl L<Digest> which supports a lot of
1193             other algorithms, e.g. C<SHA-1>, C<BLAKE2b>.
1194              
1195             If set to '', 'none', or 'size', then digest will be set to file size. This
1196             means uniqueness will be determined solely from file size. This can be quicker
1197             but will generate a false positive when two files of the same size are deemed as
1198             duplicate even though their content may be different.
1199              
1200             If set to 'name' then only name comparison will be performed. This of course can
1201             potentially generate lots of false positives, but in some cases you might want
1202             to compare filename for uniqueness.
1203              
1204             =item * B<authoritative_dirs> => I<array[str]>
1205              
1206             Denote director(yE<verbar>ies) where authoritativeE<sol>"Original" copies are found.
1207              
1208             =item * B<detail> => I<true>
1209              
1210             Show details (a.k.a. --show-digest, --show-size, --show-count).
1211              
1212             =item * B<digest_args> => I<array>
1213              
1214             Some Digest algorithms require arguments, you can pass them here.
1215              
1216             =item * B<exclude_empty_files> => I<bool>
1217              
1218             (No description)
1219              
1220             =item * B<exclude_file_patterns> => I<array[str]>
1221              
1222             Filename (including path) regex patterns to exclude.
1223              
1224             =item * B<files>* => I<array[str]>
1225              
1226             (No description)
1227              
1228             =item * B<group_by_digest> => I<bool>
1229              
1230             Sort files by its digest (or size, if not computing digest), separate each different digest.
1231              
1232             =item * B<include_file_patterns> => I<array[str]>
1233              
1234             Filename (including path) regex patterns to include.
1235              
1236             =item * B<max_size> => I<filesize>
1237              
1238             Maximum file size to consider.
1239              
1240             =item * B<min_size> => I<filesize>
1241              
1242             Minimum file size to consider.
1243              
1244             =item * B<recurse> => I<bool>
1245              
1246             If set to true, will recurse into subdirectories.
1247              
1248             =item * B<report_duplicate> => I<int> (default: 1)
1249              
1250             Whether to return duplicate items.
1251              
1252             Can be set to either 0, 1, 2, or 3.
1253              
1254             If set to 0, duplicate items will not be returned.
1255              
1256             If set to 1 (the default for C<dupe-files>), will return all the duplicate
1257             files. For example: C<file1> contains text 'a', C<file2> 'b', C<file3> 'a'. Then
1258             C<file1> and C<file3> will be returned.
1259              
1260             If set to 2 (the default for C<uniq-files>), will only return the first of
1261             duplicate items. Continuing from previous example, only C<file1> will be returned
1262             because C<file2> is unique and C<file3> contains 'a' (already represented by
1263             C<file1>). If one or more C<--authoritative-dir> (C<-O>) options are specified,
1264             files under these directories will be preferred.
1265              
1266             If set to 3, will return all but the first of duplicate items. Continuing from
1267             previous example: C<file3> will be returned. This is useful if you want to keep
1268             only one copy of the duplicate content. You can use the output of this routine
1269             to C<mv> or C<rm>. Similar to the previous case, if one or more
1270             C<--authoritative-dir> (C<-O>) options are specified, then files under these
1271             directories will not be listed if possible.
1272              
1273             =item * B<report_unique> => I<bool> (default: 0)
1274              
1275             Whether to return unique items.
1276              
1277             =item * B<show_count> => I<bool> (default: 0)
1278              
1279             Whether to return each file content's number of occurrence.
1280              
1281             1 means the file content is only encountered once (unique), 2 means there is one
1282             duplicate, and so on.
1283              
1284             =item * B<show_digest> => I<true>
1285              
1286             Show the digest value (or the size, if not computing digest) for each file.
1287              
1288             Note that this routine does not compute digest for files which have unique
1289             sizes, so they will show up as empty.
1290              
1291             =item * B<show_size> => I<true>
1292              
1293             Show the size for each file.
1294              
1295              
1296             =back
1297              
1298             Returns an enveloped result (an array).
1299              
1300             First element ($status_code) is an integer containing HTTP-like status code
1301             (200 means OK, 4xx caller error, 5xx function error). Second element
1302             ($reason) is a string containing error message, or something like "OK" if status is
1303             200. Third element ($payload) is the actual result, but usually not present when enveloped result is an error response ($status_code is not 2xx). Fourth
1304             element (%result_meta) is called result metadata and is optional, a hash
1305             that contains extra information, much like how HTTP response headers provide additional metadata.
1306              
1307             Return value: (any)
1308              
1309              
1310              
1311             =head2 uniq_filenames
1312              
1313             Usage:
1314              
1315             uniq_filenames(%args) -> [$status_code, $reason, $payload, \%result_meta]
1316              
1317             Report duplicate or unique files, optionally perform action on them.
1318              
1319             This is a thin wrapper for L<uniq-files>. It sets C<algorithm> to C<name>.
1320              
1321             This function is not exported.
1322              
1323             Arguments ('*' denotes required arguments):
1324              
1325             =over 4
1326              
1327             =item * B<authoritative_dirs> => I<array[str]>
1328              
1329             Denote director(yE<verbar>ies) where authoritativeE<sol>"Original" copies are found.
1330              
1331             =item * B<detail> => I<true>
1332              
1333             Show details (a.k.a. --show-digest, --show-size, --show-count).
1334              
1335             =item * B<digest_args> => I<array>
1336              
1337             Some Digest algorithms require arguments, you can pass them here.
1338              
1339             =item * B<exclude_empty_files> => I<bool>
1340              
1341             (No description)
1342              
1343             =item * B<exclude_file_patterns> => I<array[str]>
1344              
1345             Filename (including path) regex patterns to exclude.
1346              
1347             =item * B<files>* => I<array[str]>
1348              
1349             (No description)
1350              
1351             =item * B<group_by_digest> => I<bool>
1352              
1353             Sort files by its digest (or size, if not computing digest), separate each different digest.
1354              
1355             =item * B<include_file_patterns> => I<array[str]>
1356              
1357             Filename (including path) regex patterns to include.
1358              
1359             =item * B<max_size> => I<filesize>
1360              
1361             Maximum file size to consider.
1362              
1363             =item * B<min_size> => I<filesize>
1364              
1365             Minimum file size to consider.
1366              
1367             =item * B<recurse> => I<bool>
1368              
1369             If set to true, will recurse into subdirectories.
1370              
1371             =item * B<report_duplicate> => I<int> (default: 2)
1372              
1373             Whether to return duplicate items.
1374              
1375             Can be set to either 0, 1, 2, or 3.
1376              
1377             If set to 0, duplicate items will not be returned.
1378              
1379             If set to 1 (the default for C<dupe-files>), will return all the duplicate
1380             files. For example: C<file1> contains text 'a', C<file2> 'b', C<file3> 'a'. Then
1381             C<file1> and C<file3> will be returned.
1382              
1383             If set to 2 (the default for C<uniq-files>), will only return the first of
1384             duplicate items. Continuing from previous example, only C<file1> will be returned
1385             because C<file2> is unique and C<file3> contains 'a' (already represented by
1386             C<file1>). If one or more C<--authoritative-dir> (C<-O>) options are specified,
1387             files under these directories will be preferred.
1388              
1389             If set to 3, will return all but the first of duplicate items. Continuing from
1390             previous example: C<file3> will be returned. This is useful if you want to keep
1391             only one copy of the duplicate content. You can use the output of this routine
1392             to C<mv> or C<rm>. Similar to the previous case, if one or more
1393             C<--authoritative-dir> (C<-O>) options are specified, then files under these
1394             directories will not be listed if possible.
1395              
1396             =item * B<report_unique> => I<bool> (default: 1)
1397              
1398             Whether to return unique items.
1399              
1400             =item * B<show_count> => I<bool> (default: 0)
1401              
1402             Whether to return each file content's number of occurrence.
1403              
1404             1 means the file content is only encountered once (unique), 2 means there is one
1405             duplicate, and so on.
1406              
1407             =item * B<show_digest> => I<true>
1408              
1409             Show the digest value (or the size, if not computing digest) for each file.
1410              
1411             Note that this routine does not compute digest for files which have unique
1412             sizes, so they will show up as empty.
1413              
1414             =item * B<show_size> => I<true>
1415              
1416             Show the size for each file.
1417              
1418              
1419             =back
1420              
1421             Returns an enveloped result (an array).
1422              
1423             First element ($status_code) is an integer containing HTTP-like status code
1424             (200 means OK, 4xx caller error, 5xx function error). Second element
1425             ($reason) is a string containing error message, or something like "OK" if status is
1426             200. Third element ($payload) is the actual result, but usually not present when enveloped result is an error response ($status_code is not 2xx). Fourth
1427             element (%result_meta) is called result metadata and is optional, a hash
1428             that contains extra information, much like how HTTP response headers provide additional metadata.
1429              
1430             Return value: (any)
1431              
1432              
1433              
1434             =head2 uniq_filenames_between_two_dirs
1435              
1436             Usage:
1437              
1438             uniq_filenames_between_two_dirs(%args) -> [$status_code, $reason, $payload, \%result_meta]
1439              
1440             Report duplicate or unique files, optionally perform action on them.
1441              
1442             This is a thin wrapper for L<uniq-files>. It sets C<algorithm> to C<name>,
1443             C<recurse> to true. It also accepts two directory names instead of one+ dir/file
1444             names.
1445              
1446             This function is not exported.
1447              
1448             Arguments ('*' denotes required arguments):
1449              
1450             =over 4
1451              
1452             =item * B<authoritative_dirs> => I<array[str]>
1453              
1454             Denote director(yE<verbar>ies) where authoritativeE<sol>"Original" copies are found.
1455              
1456             =item * B<detail> => I<true>
1457              
1458             Show details (a.k.a. --show-digest, --show-size, --show-count).
1459              
1460             =item * B<digest_args> => I<array>
1461              
1462             Some Digest algorithms require arguments, you can pass them here.
1463              
1464             =item * B<dir1>* => I<dirname>
1465              
1466             (No description)
1467              
1468             =item * B<dir2>* => I<dirname>
1469              
1470             (No description)
1471              
1472             =item * B<exclude_empty_files> => I<bool>
1473              
1474             (No description)
1475              
1476             =item * B<exclude_file_patterns> => I<array[str]>
1477              
1478             Filename (including path) regex patterns to exclude.
1479              
1480             =item * B<group_by_digest> => I<bool>
1481              
1482             Sort files by their digest (or size, if not computing digest), separating files with different digests.
1483              
1484             =item * B<include_file_patterns> => I<array[str]>
1485              
1486             Filename (including path) regex patterns to include.
1487              
1488             =item * B<max_size> => I<filesize>
1489              
1490             Maximum file size to consider.
1491              
1492             =item * B<min_size> => I<filesize>
1493              
1494             Minimum file size to consider.
1495              
1496             =item * B<report_duplicate> => I<int> (default: 2)
1497              
1498             Whether to return duplicate items.
1499              
1500             Can be set to either 0, 1, 2, or 3.
1501              
1502             If set to 0, duplicate items will not be returned.
1503              
1504             If set to 1 (the default for C<dupe-files>), will return all the duplicate
1505             files. For example: C<file1> contains text 'a', C<file2> 'b', C<file3> 'a'. Then
1506             C<file1> and C<file3> will be returned.
1507              
1508             If set to 2 (the default for C<uniq-files>), will only return the first of
1509             duplicate items. Continuing from previous example, only C<file1> will be returned
1510             because C<file2> is unique and C<file3> contains 'a' (already represented by
1511             C<file1>). If one or more C<--authoritative-dir> (C<-O>) options are specified,
1512             files under these directories will be preferred.
1513              
1514             If set to 3, will return all but the first of duplicate items. Continuing from
1515             previous example: C<file3> will be returned. This is useful if you want to keep
1516             only one copy of the duplicate content. You can use the output of this routine
1517             to C<mv> or C<rm>. Similar to the previous case, if one or more
1518             C<--authoritative-dir> (C<-O>) options are specified, then files under these
1519             directories will not be listed if possible.
1520              
1521             =item * B<report_unique> => I<bool> (default: 1)
1522              
1523             Whether to return unique items.
1524              
1525             =item * B<show_count> => I<bool> (default: 0)
1526              
1527             Whether to return each file content's number of occurrence.
1528              
1529             1 means the file content is only encountered once (unique), 2 means there is one
1530             duplicate, and so on.
1531              
1532             =item * B<show_digest> => I<true>
1533              
1534             Show the digest value (or the size, if not computing digest) for each file.
1535              
1536             Note that this routine does not compute digest for files which have unique
1537             sizes, so they will show up as empty.
1538              
1539             =item * B<show_size> => I<true>
1540              
1541             Show the size for each file.
1542              
1543              
1544             =back
1545              
1546             Returns an enveloped result (an array).
1547              
1548             First element ($status_code) is an integer containing HTTP-like status code
1549             (200 means OK, 4xx caller error, 5xx function error). Second element
1550             ($reason) is a string containing error message, or something like "OK" if status is
1551             200. Third element ($payload) is the actual result, but usually not present when enveloped result is an error response ($status_code is not 2xx). Fourth
1552             element (%result_meta) is called result metadata and is optional, a hash
1553             that contains extra information, much like how HTTP response headers provide additional metadata.
1554              
1555             Return value: (any)
1556              
1557              
1558              
1559             =head2 uniq_files
1560              
1561             Usage:
1562              
1563             uniq_files(%args) -> [$status_code, $reason, $payload, \%result_meta]
1564              
1565             Report duplicate or unique files, optionally perform action on them.
1566              
1567             Given a list of filenames, will check each file's content (and/or size, and/or
1568             only name) to decide whether the file is a duplicate of another.
1569              
1570             There is a certain amount of flexibility on how duplicate is determined:
1571             - when comparing content, various hashing algorithms are supported;
1572             - when comparing size, a certain tolerance % is allowed;
1573             - when comparing filename, munging can first be done.
1574              
1575             There is flexibility on what to do with duplicate files:
1576             - just print unique/duplicate files (and let other utilities down the pipe deal
1577             with them);
1578             - move duplicates to some location;
1579             - open the files first and prompt for action;
1580             - let a Perl code process the files.
1581              
1582             Interface is loosely based on the C<uniq> Unix command-line program.
1583              
1584             This function is not exported by default, but exportable.
1585              
1586             Arguments ('*' denotes required arguments):
1587              
1588             =over 4
1589              
1590             =item * B<algorithm> => I<str>
1591              
1592             What algorithm is used to compute the digest of the content.
1593              
1594             The default is to use C<md5>. Some algorithms supported include C<crc32>, C<sha1>,
1595             C<sha256>, as well as C<Digest> to use Perl L<Digest> which supports a lot of
1596             other algorithms, e.g. C<SHA-1>, C<BLAKE2b>.
1597              
1598             If set to '', 'none', or 'size', then digest will be set to file size. This
1599             means uniqueness will be determined solely from file size. This can be quicker
1600             but will generate a false positive when two files of the same size are deemed as
1601             duplicate even though their content may be different.
1602              
1603             If set to 'name' then only name comparison will be performed. This of course can
1604             potentially generate lots of false positives, but in some cases you might want
1605             to compare filename for uniqueness.
1606              
1607             =item * B<authoritative_dirs> => I<array[str]>
1608              
1609             Denote director(yE<verbar>ies) where authoritativeE<sol>"Original" copies are found.
1610              
1611             =item * B<detail> => I<true>
1612              
1613             Show details (a.k.a. --show-digest, --show-size, --show-count).
1614              
1615             =item * B<digest_args> => I<array>
1616              
1617             Some Digest algorithms require arguments, you can pass them here.
1618              
1619             =item * B<exclude_empty_files> => I<bool>
1620              
1621             (No description)
1622              
1623             =item * B<exclude_file_patterns> => I<array[str]>
1624              
1625             Filename (including path) regex patterns to exclude.
1626              
1627             =item * B<files>* => I<array[str]>
1628              
1629             (No description)
1630              
1631             =item * B<group_by_digest> => I<bool>
1632              
1633             Sort files by their digest (or size, if not computing digest), separating files with different digests.
1634              
1635             =item * B<include_file_patterns> => I<array[str]>
1636              
1637             Filename (including path) regex patterns to include.
1638              
1639             =item * B<max_size> => I<filesize>
1640              
1641             Maximum file size to consider.
1642              
1643             =item * B<min_size> => I<filesize>
1644              
1645             Minimum file size to consider.
1646              
1647             =item * B<recurse> => I<bool>
1648              
1649             If set to true, will recurse into subdirectories.
1650              
1651             =item * B<report_duplicate> => I<int> (default: 2)
1652              
1653             Whether to return duplicate items.
1654              
1655             Can be set to either 0, 1, 2, or 3.
1656              
1657             If set to 0, duplicate items will not be returned.
1658              
1659             If set to 1 (the default for C<dupe-files>), will return all the duplicate
1660             files. For example: C<file1> contains text 'a', C<file2> 'b', C<file3> 'a'. Then
1661             C<file1> and C<file3> will be returned.
1662              
1663             If set to 2 (the default for C<uniq-files>), will only return the first of
1664             duplicate items. Continuing from previous example, only C<file1> will be returned
1665             because C<file2> is unique and C<file3> contains 'a' (already represented by
1666             C<file1>). If one or more C<--authoritative-dir> (C<-O>) options are specified,
1667             files under these directories will be preferred.
1668              
1669             If set to 3, will return all but the first of duplicate items. Continuing from
1670             previous example: C<file3> will be returned. This is useful if you want to keep
1671             only one copy of the duplicate content. You can use the output of this routine
1672             to C<mv> or C<rm>. Similar to the previous case, if one or more
1673             C<--authoritative-dir> (C<-O>) options are specified, then files under these
1674             directories will not be listed if possible.
1675              
1676             =item * B<report_unique> => I<bool> (default: 1)
1677              
1678             Whether to return unique items.
1679              
1680             =item * B<show_count> => I<bool> (default: 0)
1681              
1682             Whether to return each file content's number of occurrence.
1683              
1684             1 means the file content is only encountered once (unique), 2 means there is one
1685             duplicate, and so on.
1686              
1687             =item * B<show_digest> => I<true>
1688              
1689             Show the digest value (or the size, if not computing digest) for each file.
1690              
1691             Note that this routine does not compute digest for files which have unique
1692             sizes, so they will show up as empty.
1693              
1694             =item * B<show_size> => I<true>
1695              
1696             Show the size for each file.
1697              
1698              
1699             =back
1700              
1701             Returns an enveloped result (an array).
1702              
1703             First element ($status_code) is an integer containing HTTP-like status code
1704             (200 means OK, 4xx caller error, 5xx function error). Second element
1705             ($reason) is a string containing error message, or something like "OK" if status is
1706             200. Third element ($payload) is the actual result, but usually not present when enveloped result is an error response ($status_code is not 2xx). Fourth
1707             element (%result_meta) is called result metadata and is optional, a hash
1708             that contains extra information, much like how HTTP response headers provide additional metadata.
1709              
1710             Return value: (any)
1711              
1712             =head1 HOMEPAGE
1713              
1714             Please visit the project's homepage at L<https://metacpan.org/release/File-FindUniq>.
1715              
1716             =head1 SOURCE
1717              
1718             Source repository is at L<https://github.com/perlancar/perl-File-FindUniq>.
1719              
1720             =head1 SEE ALSO
1721              
1722             L<App::FindUtils>
1723              
1724             L<move-duplicate-files-to> from L<App::DuplicateFilesUtils>, which is basically
1725             a shortcut for C<< uniq-files -D -R . | while read f; do mv "$f" SOMEDIR/; done
1726             >>.
1727              
1728             =head1 AUTHOR
1729              
1730             perlancar <perlancar@cpan.org>
1731              
1732             =head1 CONTRIBUTING
1733              
1734              
1735             To contribute, you can send patches by email/via RT, or send pull requests on
1736             GitHub.
1737              
1738             Most of the time, you don't need to build the distribution yourself. You can
1739             simply modify the code, then test via:
1740              
1741             % prove -l
1742              
1743             If you want to build the distribution (e.g. to try to install it locally on your
1744             system), you can install L<Dist::Zilla>,
1745             L<Dist::Zilla::PluginBundle::Author::PERLANCAR>,
1746             L<Pod::Weaver::PluginBundle::Author::PERLANCAR>, and sometimes one or two other
1747             Dist::Zilla- and/or Pod::Weaver plugins. Any additional steps required beyond
1748             that are considered a bug and can be reported to me.
1749              
1750             =head1 COPYRIGHT AND LICENSE
1751              
1752             This software is copyright (c) 2025 by perlancar <perlancar@cpan.org>.
1753              
1754             This is free software; you can redistribute it and/or modify it under
1755             the same terms as the Perl 5 programming language system itself.
1756              
1757             =head1 BUGS
1758              
1759             Please report any bugs or feature requests on the bugtracker website L<https://rt.cpan.org/Public/Dist/Display.html?Name=File-FindUniq>
1760              
1761             When submitting a bug or request, please include a test-file or a
1762             patch to an existing test-file that illustrates the bug or desired
1763             feature.
1764              
1765             =cut