File Coverage

blib/lib/App/UniqFiles.pm
Criterion    Covered  Total      %
statement        168    223   75.3
branch            78    122   63.9
condition         43     65   66.1
subroutine        11     12   91.6
pod                1      1  100.0
total            301    423   71.1


line stmt bran cond sub pod time code
1             package App::UniqFiles;
2              
3 1     1   152193 use 5.010001;
  1         15  
4 1     1   5 use strict;
  1         2  
  1         23  
5 1     1   5 use warnings;
  1         2  
  1         32  
6 1     1   2348 use Log::ger;
  1         59  
  1         7  
7              
8 1     1   284 use Cwd qw(abs_path);
  1         3  
  1         61  
9 1     1   6 use Exporter qw(import);
  1         2  
  1         27  
10 1     1   561 use Perinci::Sub::Util qw(gen_modified_sub);
  1         2578  
  1         169  
11              
12             our $AUTHORITY = 'cpan:PERLANCAR'; # AUTHORITY
13             our $DATE = '2022-11-15'; # DATE
14             our $DIST = 'App-UniqFiles'; # DIST
15             our $VERSION = '0.139'; # VERSION
16              
17             our @EXPORT_OK = qw(uniq_files);
18              
19             our %SPEC;
20              
21             sub _glob {
22 10     10   56 require File::Find;
23              
24 10         22 my $dir;
25             my @res;
26             File::Find::finddepth(
27             sub {
28 30 50   30   324 return if -l $_;
29 30 100       270 return unless -f _;
30 1     1   8 no warnings 'once'; # $File::Find::dir
  1         3  
  1         2664  
31 20         266 push @res, "$File::Find::dir/$_";
32             },
33 10         1152 @_,
34             );
35 10         82 @res;
36             }
37              
38             our %argspec_authoritative_dirs = (
39             authoritative_dirs => {
40             summary => 'Denote director(y|ies) where authoritative/"Original" copies are found',
41             'x.name.is_plural' => 1,
42             'x.name.singular' => 'authoritative_dir',
43             schema => ['array*', of=>'str*'], # XXX dirname
44             cmdline_aliases => {O=>{}},
45             },
46             );
47             our %argspecs_filter = (
48             include_file_patterns => {
49             summary => 'Filename (including path) regex patterns to include',
50             'x.name.is_plural' => 1,
51             'x.name.singular' => 'include_file_pattern',
52             schema => ['array*', of=>'str*'], # XXX re
53             cmdline_aliases => {I=>{}},
54             },
55             exclude_file_patterns => {
56             summary => 'Filename (including path) regex patterns to exclude',
57             'x.name.is_plural' => 1,
58             'x.name.singular' => 'exclude_file_pattern',
59             schema => ['array*', of=>'str*'], # XXX re
60             cmdline_aliases => {X=>{}},
61             },
62             exclude_empty_files => {
63             schema => 'bool*',
64             cmdline_aliases => {Z=>{}},
65             },
66             min_size => {
67             summary => 'Minimum file size to consider',
68             schema => 'filesize*',
69             },
70             max_size => {
71             summary => 'Maximum file size to consider',
72             schema => 'filesize*',
73             },
74             );
75              
76             $SPEC{uniq_files} = {
77             v => 1.1,
78             summary => 'Report duplicate or unique file contents',
79             description => <<'_',
80              
81             Given a list of filenames, will check each file's size and content to find
82             duplicates. The interface is a bit like the `uniq` Unix command-line program.
83              
84             _
85             args => {
86             files => {
87             schema => ['array*' => {of=>'str*'}],
88             req => 1,
89             pos => 0,
90             slurpy => 1,
91             },
92             recurse => {
93             schema => 'bool*',
94             cmdline_aliases => {R=>{}},
95             description => <<'_',
96              
97             If set to true, will recurse into subdirectories.
98              
99             _
100             },
101             group_by_digest => {
102             summary => 'Sort files by their digest (or size, if not computing digest), separate each different digest',
103             schema => 'bool*',
104             },
105             show_digest => {
106             summary => 'Show the digest value (or the size, if not computing digest) for each file',
107             description => <<'_',
108              
109             Note that this routine does not compute digest for files which have unique
110             sizes, so they will show up as empty.
111              
112             _
113             schema => 'true*',
114             },
115             show_size => {
116             summary => 'Show the size for each file',
117             schema => 'true*',
118             },
119             # TODO add option follow_symlinks?
120             report_unique => {
121             schema => [bool => {default=>1}],
122             summary => 'Whether to return unique items',
123             cmdline_aliases => {
124             a => {
125             summary => 'Alias for --report-unique --report-duplicate=1 (report all files)',
126             code => sub {
127             my $args = shift;
128             $args->{report_unique} = 1;
129             $args->{report_duplicate} = 1;
130             },
131             },
132             u => {
133             summary => 'Alias for --report-unique --report-duplicate=0',
134             code => sub {
135             my $args = shift;
136             $args->{report_unique} = 1;
137             $args->{report_duplicate} = 0;
138             },
139             },
140             d => {
141             summary =>
142             'Alias for --noreport-unique --report-duplicate=1',
143             code => sub {
144             my $args = shift;
145             $args->{report_unique} = 0;
146             $args->{report_duplicate} = 1;
147             },
148             },
149             D => {
150             summary =>
151             'Alias for --noreport-unique --report-duplicate=3',
152             code => sub {
153             my $args = shift;
154             $args->{report_unique} = 0;
155             $args->{report_duplicate} = 3;
156             },
157             },
158             },
159             },
160             report_duplicate => {
161             schema => [int => {in=>[0,1,2,3], default=>2}],
162             summary => 'Whether to return duplicate items',
163             description => <<'_',
164              
165             Can be set to either 0, 1, 2, or 3.
166              
167             If set to 0, duplicate items will not be returned.
168              
169             If set to 1 (the default for `dupe-files`), will return all the duplicate
170             files. For example: `file1` contains text 'a', `file2` 'b', `file3` 'a'. Then
171             `file1` and `file3` will be returned.
172              
173             If set to 2 (the default for `uniq-files`), will only return the first of
174             duplicate items. Continuing from previous example, only `file1` will be returned
175             because `file2` is unique and `file3` contains 'a' (already represented by
176             `file1`). If one or more `--authoritative-dir` (`-O`) options are specified,
177             files under these directories will be preferred.
178              
179             If set to 3, will return all but the first of duplicate items. Continuing from
180             previous example: `file3` will be returned. This is useful if you want to keep
181             only one copy of the duplicate content. You can use the output of this routine
182             to `mv` or `rm`. Similar to the previous case, if one or more
183             `--authoritative-dir` (`-O`) options are specified, then files under these
184             directories will not be listed if possible.
185              
186             _
187             cmdline_aliases => {
188             },
189             },
190             algorithm => {
191             schema => ['str*'],
192             summary => "What algorithm is used to compute the digest of the content",
193             description => <<'_',
194              
195             The default is to use `md5`. Some algorithms supported include `crc32`, `sha1`,
196             `sha256`, as well as `Digest` to use Perl <pm:Digest> which supports a lot of
197             other algorithms, e.g. `SHA-1`, `BLAKE2b`.
198              
199             If set to '', 'none', or 'size', the digest will be set to the file size. This
200             means uniqueness will be determined solely from file size. This can be quicker,
201             but it will generate false positives: two files of the same size will be deemed
202             duplicates even though their contents may differ.
203              
204             _
205             },
206             digest_args => {
207             schema => ['array*',
208              
209             # comment out temporarily, Perinci::Sub::GetArgs::Argv
210             # clashes with coerce rules; we should fix
211             # Perinci::Sub::GetArgs::Argv to observe coercion rules
212             # first
213             #of=>'str*',
214              
215             'x.perl.coerce_rules'=>['From_str::comma_sep']],
216             description => <<'_',
217              
218             Some Digest algorithms require arguments; you can pass them here.
219              
220             _
221             cmdline_aliases => {A=>{}},
222             },
223             show_count => {
224             schema => [bool => {default=>0}],
225             summary => "Whether to return each file content's ".
226             "number of occurrences",
227             description => <<'_',
228              
229             1 means the file content is only encountered once (unique), 2 means there is one
230             duplicate, and so on.
231              
232             _
233             cmdline_aliases => {count=>{}, c=>{}},
234             },
235             detail => {
236             summary => 'Show details (a.k.a. --show-digest, --show-size, --show-count)',
237             schema => 'true*',
238             cmdline_aliases => {l=>{}},
239             },
240             %argspec_authoritative_dirs,
241             %argspecs_filter,
242             },
243             examples => [
244             {
245             summary => 'List all files which do not have duplicate contents',
246             src => 'uniq-files *',
247             src_plang => 'bash',
248             test => 0,
249             'x.doc.show_result' => 0,
250             },
251             {
252             summary => 'List all files (recursively, and in detail) which have duplicate contents (all duplicate copies), exclude some files',
253             src => q(uniq-files -R -l -d -X '\.git/' --min-size 10k .),
254             src_plang => 'bash',
255             test => 0,
256             'x.doc.show_result' => 0,
257             },
258             {
259             summary => 'Move all duplicate files (except one copy) in this directory (and subdirectories) to .dupes/',
260             src => 'uniq-files -D -R * | while read f; do mv "$f" .dupes/; done',
261             src_plang => 'bash',
262             test => 0,
263             'x.doc.show_result' => 0,
264             },
265             {
266             summary => 'List number of occurrences of contents for duplicate files',
267             src => 'uniq-files -c *',
268             src_plang => 'bash',
269             test => 0,
270             'x.doc.show_result' => 0,
271             },
272             {
273             summary => 'List number of occurrences of contents for all files',
274             src => 'uniq-files -a -c *',
275             src_plang => 'bash',
276             test => 0,
277             'x.doc.show_result' => 0,
278             },
279             {
280             summary => 'List all files, along with their number of content occurrences and content digest. '.
281             'Use the BLAKE2b digest algorithm, and group the files according to their digest.',
282             src => 'uniq-files -a -c --show-digest -A BLAKE2,blake2b *',
283             src_plang => 'bash',
284             test => 0,
285             'x.doc.show_result' => 0,
286             },
287             ],
288             };
289             sub uniq_files {
290 11     11 1 38875 my %args = @_;
291              
292 11         26 my $files = $args{files};
293 11 50 33     54 return [400, "Please specify files"] if !$files || !@$files;
294 11         18 my $recurse = $args{recurse};
295 11   100     30 my $report_unique = $args{report_unique} // 1;
296 11   100     27 my $report_duplicate = $args{report_duplicate} // 2;
297 11   100     31 my $show_count = $args{show_count} // 0;
298 11   100     28 my $show_digest = $args{show_digest} // 0;
299 11   100     29 my $show_size = $args{show_size} // 0;
300 11         17 my $digest_args = $args{digest_args};
301 11 50 66     37 my $algorithm = $args{algorithm} // ($digest_args ? 'Digest' : 'md5');
302 11         17 my $group_by_digest = $args{group_by_digest};
303              
304 11 50       26 if ($args{detail}) {
305 0         0 $show_digest = 1;
306 0         0 $show_size = 1;
307 0         0 $show_count = 1;
308             }
309              
310             my @authoritative_dirs = $args{authoritative_dirs} && @{$args{authoritative_dirs}} ?
311 11 100 66     30 @{ $args{authoritative_dirs} } : ();
  2         5  
312 11         24 for my $dir (@authoritative_dirs) {
313 2 50       34 (-d $dir) or return [400, "Authoritative dir '$dir' does not exist or not a directory"];
314 2 50       40 my $abs_dir = abs_path $dir or return [400, "Cannot get absolute path for authoritative dir '$dir'"];
315 2         8 $dir = $abs_dir;
316             }
317             #log_trace "authoritative_dirs=%s", \@authoritative_dirs if @authoritative_dirs;
318              
319 11         17 my @include_re;
320 11   50     16 for my $re0 (@{ $args{include_file_patterns} // [] }) {
  11         45  
321 0         0 require Regexp::Util;
322 0         0 my $re;
323 0 0       0 if (ref $re0 eq 'Regexp') {
324 0         0 $re = $re0;
325             } else {
326 0         0 eval { $re = Regexp::Util::deserialize_regexp("qr($re0)") };
  0         0  
327 0 0       0 return [400, "Invalid/unsafe regex pattern in include_file_patterns '$re0': $@"] if $@;
328 0 0       0 return [400, "Unsafe regex pattern (contains embedded code) in include_file_patterns '$re0'"] if Regexp::Util::regexp_seen_evals($re);
329             }
330 0         0 push @include_re, $re;
331             }
332 11         19 my @exclude_re;
333 11   50     16 for my $re0 (@{ $args{exclude_file_patterns} // [] }) {
  11         36  
334 0         0 require Regexp::Util;
335 0         0 my $re;
336 0 0       0 if (ref $re0 eq 'Regexp') {
337 0         0 $re = $re0;
338             } else {
339 0         0 eval { $re = Regexp::Util::deserialize_regexp("qr($re0)") };
  0         0  
340 0 0       0 return [400, "Invalid/unsafe regex pattern in exclude_file_patterns '$re0': $@"] if $@;
341 0 0       0 return [400, "Unsafe regex pattern (contains embedded code) in exclude_file_patterns '$re0'"] if Regexp::Util::regexp_seen_evals($re);
342             }
343 0         0 push @exclude_re, $re;
344             }
345              
346 11 100       25 if ($recurse) {
347             $files = [ map {
348 5 50       10 if (-l $_) {
  35 100       324  
349 0         0 ();
350             } elsif (-d _) {
351 10         34 (_glob($_));
352             } else {
353 25         80 ($_);
354             }
355             } @$files ];
356             }
357              
358             FILTER: {
359 11         17 my $ffiles;
  11         15  
360             FILE:
361 11         23 for my $f (@$files) {
362 87 50       802 if (-l $f) {
363 0         0 log_warn "File '$f' is a symlink, ignored";
364 0         0 next FILE;
365             }
366 87 100       221 if (-d _) {
367 12         55 log_warn "File '$f' is a directory, ignored";
368 12         45 next FILE;
369             }
370 75 50       146 unless (-f _) {
371 0         0 log_warn "File '$f' is not a regular file, ignored";
372 0         0 next FILE;
373             }
374              
375 75 50       162 if (@include_re) {
376 0         0 my $included;
377 0         0 for my $re (@include_re) {
378 0 0       0 if ($f =~ $re) { $included++; last }
  0         0  
  0         0  
379             }
380 0 0       0 unless ($included) {
381 0         0 log_info "File '$f' is not in --include-file-patterns, skipped";
382 0         0 next FILE;
383             }
384             }
385 75 50       138 if (@exclude_re) {
386 0         0 for my $re (@exclude_re) {
387 0 0       0 if ($f =~ $re) {
388 0         0 log_info "File '$f' is in --exclude-file-patterns, skipped";
389 0         0 next FILE;
390             }
391             }
392             }
393              
394 75         622 my $size = -s $f;
395 75 50 33     216 if ($args{exclude_empty_files} && !$size) {
396 0         0 log_info "File '$f' is empty, skipped by option -Z";
397 0         0 next FILE;
398             }
399 75 50 33     149 if ($args{min_size} && $size < $args{min_size}) {
400 0         0 log_info "File '$f' (size=$size) is smaller than min_size ($args{min_size}), skipped";
401 0         0 next FILE;
402             }
403 75 50 33     170 if ($args{max_size} && $size > $args{max_size}) {
404 0         0 log_info "File '$f' (size=$size) is larger than max_size ($args{max_size}), skipped";
405 0         0 next FILE;
406             }
407              
408 75         202 push @$ffiles, $f;
409             }
410 11         27 $files = $ffiles;
411             } # FILTER
412              
413 11         29 my %size_counts; # key = size, value = number of files having that size
414             my %size_files; # key = size, value = [file, ...]
415 11         0 my %file_sizes; # key = filename, value = file size, for caching stat()
416             GET_FILE_SIZES: {
417 11         16 for my $f (@$files) {
  11         19  
418 75         672 my @st = stat $f;
419 75 50       186 unless (@st) {
420 0         0 log_error("Can't stat file `$f`: $!, skipped");
421 0         0 next;
422             }
423 75         166 $size_counts{$st[7]}++;
424 75   100     199 $size_files{$st[7]} //= [];
425 75         89 push @{$size_files{$st[7]}}, $f;
  75         166  
426 75         310 $file_sizes{$f} = $st[7];
427             }
428             }
429              
430 11   66     58 my $calc_digest = !($algorithm eq '' || $algorithm eq 'none' || $algorithm eq 'size');
431              
432             # calculate digest for all files having non-unique sizes
433 11         27 my %digest_counts; # key = digest, value = num of files having that digest
434             my %digest_files; # key = digest, value = [file, ...]
435 11         0 my %file_digests; # key = filename, value = file digest
436             CALC_FILE_DIGESTS: {
437 11 100       16 last unless $calc_digest;
  11         23  
438 10         559 require File::Digest;
439              
440 10         2368 for my $f (@$files) {
441 66 50       147 next unless defined $file_sizes{$f}; # just checking. all files should have sizes.
442 66 100       153 next if $size_counts{ $file_sizes{$f} } == 1; # skip unique file sizes.
443 60         154 my $res = File::Digest::digest_file(
444             file=>$f, algorithm=>$algorithm, digest_args=>$digest_args);
445 60 50       10215 return [500, "Can't calculate digest for file '$f': $res->[0] - $res->[1]"]
446             unless $res->[0] == 200;
447 60         104 my $digest = $res->[2];
448 60         128 $digest_counts{$digest}++;
449 60   100     224 $digest_files{$digest} //= [];
450 60         93 push @{$digest_files{$digest}}, $f;
  60         148  
451 60         169 $file_digests{$f} = $digest;
452             }
453             }
454              
455 11         22 my %file_counts; # key = file name, value = num of files having file content
456 11         22 for my $f (@$files) {
457 75 50       132 next unless defined $file_sizes{$f}; # just checking
458 75 100       122 if (!defined($file_digests{$f})) {
459 15         32 $file_counts{$f} = $size_counts{ $file_sizes{$f} };
460             } else {
461 60         111 $file_counts{$f} = $digest_counts{ $file_digests{$f} };
462             }
463             }
464              
465             SORT_DUPLICATE_FILES: {
466 11 100       16 last unless @authoritative_dirs;
  11         25  
467 2 50       8 my $hash = $calc_digest ? \%digest_files : \%size_files;
468 2         6 for my $key (keys %$hash) {
469 10         15 my @files = @{ $hash->{$key} };
  10         24  
470 10         17 my @abs_files;
471 10 100       24 next unless @files > 1;
472 4         22 for my $file (@files) {
473 12 50       197 my $abs_file = abs_path $file or do {
474 0         0 log_error "Cannot find absolute path for duplicate file '$file', skipping duplicate set %s", \@files;
475             };
476 12         37 push @abs_files, $abs_file;
477             }
478              
479             #log_trace "Duplicate files before sorting: %s", \@files;
480 12         26 @files = map { $files[$_] } sort {
481 4         16 my $file_a = $abs_files[$a];
  10         29  
482 10         13 my $file_a_in_authoritative_dirs = 0;
483 10         13 my $subdir_len_file_a;
484 10         18 for my $d (@authoritative_dirs) {
485 10 50       61 if ($file_a =~ m!\A\Q$d\E(?:/|\z)(.*)!) { $file_a_in_authoritative_dirs++; $subdir_len_file_a = length($1); last }
  0         0  
  0         0  
  0         0  
486             }
487 10         17 my $file_b = $abs_files[$b];
488 10         13 my $file_b_in_authoritative_dirs = 0;
489 10         11 my $subdir_len_file_b;
490 10         15 for my $d (@authoritative_dirs) {
491 10 100       51 if ($file_b =~ m!\A\Q$d\E(?:/|\z)(.*)!) { $file_b_in_authoritative_dirs++; $subdir_len_file_b = length($1); last }
  2         3  
  2         7  
  2         4  
492             }
493             #log_trace " file_a=<$file_a>, in authoritative_dirs? $file_a_in_authoritative_dirs";
494             #log_trace " file_b=<$file_b>, in authoritative_dirs? $file_b_in_authoritative_dirs";
495             # files located near the root of authoritative dir is preferred
496             # to deeper files. this is done by comparing subdir_len
497 10 50       39 ($file_a_in_authoritative_dirs ? $subdir_len_file_a : 9999) <=> ($file_b_in_authoritative_dirs ? $subdir_len_file_b : 9999) ||
    100          
    50          
498             $file_a cmp $file_b;
499             } 0..$#files;
500             #log_trace "Duplicate files after sorting: %s", \@files;
501              
502 4         15 $hash->{$key} = \@files;
503             }
504             }
505              
506             #$log->trace("report_duplicate=$report_duplicate");
507 11         20 my @files;
508 11         60 for my $f (sort keys %file_counts) {
509 75 100       128 if ($file_counts{$f} == 1) {
510             #log_trace "unique file '$f'";
511 24 100       44 push @files, $f if $report_unique;
512             } else {
513             #log_trace "duplicate file '$f'";
514             my $is_first_copy = $calc_digest ?
515             $f eq $digest_files{ $file_digests{$f} }[0] :
516 51 100       105 $f eq $size_files{ $file_sizes{$f} }[0];
517             #log_trace "is first copy? <$is_first_copy>";
518 51 100       115 if ($report_duplicate == 0) {
    100          
    100          
    50          
519             # do not report dupe files
520             } elsif ($report_duplicate == 1) {
521 15         28 push @files, $f;
522             } elsif ($report_duplicate == 2) {
523 21 100       43 push @files, $f if $is_first_copy;
524             } elsif ($report_duplicate == 3) {
525 9 100       23 push @files, $f unless $is_first_copy;
526             } else {
527 0         0 die "Invalid value for --report-duplicate ".
528             "'$report_duplicate', please choose 0/1/2/3";
529             }
530             }
531             }
532              
533             GROUP_FILES_BY_DIGEST: {
534 11 100       22 last unless $group_by_digest;
  11         26  
535             @files = sort {
536 1         9 $file_sizes{$a} <=> $file_sizes{$b} ||
537 20 50 50     52 ($file_digests{$a} // '') cmp ($file_digests{$b} // '')
      50        
538             } @files;
539             }
540              
541 11         28 my @rows;
542             my %resmeta;
543 11         0 my $last_digest;
544 11         15 for my $f (@files) {
545 41   66     89 my $digest = $file_digests{$f} // $file_sizes{$f};
546              
547             # add separator row
548 41 100 100     95 if ($group_by_digest && defined $last_digest && $digest ne $last_digest) {
      100        
549 4 50 33     23 push @rows, ($show_count || $show_digest || $show_size) ? {} : '';
550             }
551              
552 41         52 my $row;
553 41 100 100     135 if ($show_count || $show_digest || $show_size) {
      100        
554 19         49 $row = {file=>$f};
555 19 100       39 $row->{count} = $file_counts{$f} if $show_count;
556 19 100       34 $row->{digest} = $file_digests{$f} if $show_digest;
557 19 100       38 $row->{size} = $file_sizes{$f} if $show_size;
558             } else {
559 22         30 $row = $f;
560             }
561 41         57 push @rows, $row;
562 41         67 $last_digest = $digest;
563             }
564              
565 11         31 $resmeta{'table.fields'} = [qw/file size digest count/];
566              
567 11         146 [200, "OK", \@rows, \%resmeta];
568             }
569              
570             gen_modified_sub(
571             base_name => 'uniq_files',
572             output_name => 'dupe_files',
573             description => <<'_',
574              
575             This is a thin wrapper for <prog:uniq-files>. It defaults `report_unique` to 0
576             and `report_duplicate` to 1.
577              
578             _
579             modify_args => {
580             report_unique => sub {
581             $_[0]{schema} = [bool => {default=>0}];
582             },
583             report_duplicate => sub {
584             $_[0]{schema} = [int => {in=>[0,1,2,3], default=>1}];
585             },
586             },
587             modify_meta => sub {
588             $_[0]{examples} = [
589             {
590             summary => 'List all files (recursively, and in detail) which have duplicate contents (all duplicate copies)',
591             src => 'dupe-files -lR *',
592             src_plang => 'bash',
593             test => 0,
594             'x.doc.show_result' => 0,
595             },
596             ];
597             },
598             output_code => sub {
599 0     0     my %args = @_;
600 0   0       $args{report_unique} //= 0;
601 0   0       $args{report_duplicate} //= 1;
602 0           uniq_files(%args);
603             },
604             );
605              
606             1;
607             # ABSTRACT: Report duplicate or unique file contents
608              
609             __END__
610              
611             =pod
612              
613             =encoding UTF-8
614              
615             =head1 NAME
616              
617             App::UniqFiles - Report duplicate or unique file contents
618              
619             =head1 VERSION
620              
621             This document describes version 0.139 of App::UniqFiles (from Perl distribution App-UniqFiles), released on 2022-11-15.
622              
623             =head1 SYNOPSIS
624              
625             # See uniq-files script
626              
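
The module can also be used directly from Perl. Below is a minimal, illustrative sketch of calling the exported function; the file names are hypothetical:

    use App::UniqFiles qw(uniq_files);

    # By default report_unique=1 and report_duplicate=2, so this lists
    # unique files plus the first copy of each duplicate set.
    my $res = uniq_files(files => ['foo.txt', 'bar.txt', 'baz.txt']);
    if ($res->[0] == 200) {
        print "$_\n" for @{ $res->[2] };
    } else {
        warn "uniq_files failed: $res->[0] $res->[1]\n";
    }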
627             =head1 NOTES
628              
629             =head1 FUNCTIONS
630              
631              
632             =head2 dupe_files
633              
634             Usage:
635              
636             dupe_files(%args) -> [$status_code, $reason, $payload, \%result_meta]
637              
638             Report duplicate or unique file contents.
639              
640             This is a thin wrapper for L<uniq-files>. It defaults C<report_unique> to 0
641             and C<report_duplicate> to 1.
642              
643             This function is not exported.
644              
645             Arguments ('*' denotes required arguments):
646              
647             =over 4
648              
649             =item * B<algorithm> => I<str>
650              
651             What algorithm is used to compute the digest of the content.
652              
653             The default is to use C<md5>. Some algorithms supported include C<crc32>, C<sha1>,
654             C<sha256>, as well as C<Digest> to use Perl L<Digest> which supports a lot of
655             other algorithms, e.g. C<SHA-1>, C<BLAKE2b>.
656              
657             If set to '', 'none', or 'size', the digest will be set to the file size. This
658             means uniqueness will be determined solely from file size. This can be quicker,
659             but it will generate false positives: two files of the same size will be deemed
660             duplicates even though their contents may differ.
661              
662             =item * B<authoritative_dirs> => I<array[str]>
663              
664             Denote director(yE<verbar>ies) where authoritativeE<sol>"Original" copies are found.
665              
666             =item * B<detail> => I<true>
667              
668             Show details (a.k.a. --show-digest, --show-size, --show-count).
669              
670             =item * B<digest_args> => I<array>
671              
672             Some Digest algorithms require arguments; you can pass them here.
673              
674             =item * B<exclude_empty_files> => I<bool>
675              
676             (No description)
677              
678             =item * B<exclude_file_patterns> => I<array[str]>
679              
680             Filename (including path) regex patterns to exclude.
681              
682             =item * B<files>* => I<array[str]>
683              
684             (No description)
685              
686             =item * B<group_by_digest> => I<bool>
687              
688             Sort files by their digest (or size, if not computing digest), separate each different digest.
689              
690             =item * B<include_file_patterns> => I<array[str]>
691              
692             Filename (including path) regex patterns to include.
693              
694             =item * B<max_size> => I<filesize>
695              
696             Maximum file size to consider.
697              
698             =item * B<min_size> => I<filesize>
699              
700             Minimum file size to consider.
701              
702             =item * B<recurse> => I<bool>
703              
704             If set to true, will recurse into subdirectories.
705              
706             =item * B<report_duplicate> => I<int> (default: 1)
707              
708             Whether to return duplicate items.
709              
710             Can be set to either 0, 1, 2, or 3.
711              
712             If set to 0, duplicate items will not be returned.
713              
714             If set to 1 (the default for C<dupe-files>), will return all the duplicate
715             files. For example: C<file1> contains text 'a', C<file2> 'b', C<file3> 'a'. Then
716             C<file1> and C<file3> will be returned.
717              
718             If set to 2 (the default for C<uniq-files>), will only return the first of
719             duplicate items. Continuing from previous example, only C<file1> will be returned
720             because C<file2> is unique and C<file3> contains 'a' (already represented by
721             C<file1>). If one or more C<--authoritative-dir> (C<-O>) options are specified,
722             files under these directories will be preferred.
723              
724             If set to 3, will return all but the first of duplicate items. Continuing from
725             previous example: C<file3> will be returned. This is useful if you want to keep
726             only one copy of the duplicate content. You can use the output of this routine
727             to C<mv> or C<rm>. Similar to the previous case, if one or more
728             C<--authoritative-dir> (C<-O>) options are specified, then files under these
729             directories will not be listed if possible.
730              
731             =item * B<report_unique> => I<bool> (default: 0)
732              
733             Whether to return unique items.
734              
735             =item * B<show_count> => I<bool> (default: 0)
736              
737             Whether to return each file content's number of occurrences.
738              
739             1 means the file content is only encountered once (unique), 2 means there is one
740             duplicate, and so on.
741              
742             =item * B<show_digest> => I<true>
743              
744             Show the digest value (or the size, if not computing digest) for each file.
745              
746             Note that this routine does not compute digest for files which have unique
747             sizes, so they will show up as empty.
748              
749             =item * B<show_size> => I<true>
750              
751             Show the size for each file.
752              
753              
754             =back
755              
756             Returns an enveloped result (an array).
757              
758             First element ($status_code) is an integer containing an HTTP-like status code
759             (200 means OK, 4xx caller error, 5xx function error). Second element
760             ($reason) is a string containing an error message, or something like "OK" if status is
761             200. Third element ($payload) is the actual result, but it is usually not present when the enveloped result is an error response ($status_code is not 2xx). Fourth
762             element (%result_meta) is called result metadata and is optional, a hash
763             that contains extra information, much like how HTTP response headers provide additional metadata.
764              
765             Return value: (any)
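
As an illustrative sketch only, since the function is not exported it can be called with its fully qualified name; the directory names below are hypothetical:

    use App::UniqFiles ();

    # report_unique defaults to 0 and report_duplicate to 1 here,
    # so every duplicate copy is listed
    my $res = App::UniqFiles::dupe_files(
        files   => ['photos', 'backup/photos'],
        recurse => 1,
    );
    print "$_\n" for @{ $res->[2] } if $res->[0] == 200;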
766              
767              
768              
769             =head2 uniq_files
770              
771             Usage:
772              
773             uniq_files(%args) -> [$status_code, $reason, $payload, \%result_meta]
774              
775             Report duplicate or unique file contents.
776              
777             Given a list of filenames, will check each file's size and content to find
778             duplicates. The interface is a bit like the C<uniq> Unix command-line program.
779              
780             This function is not exported by default, but exportable.
781              
782             Arguments ('*' denotes required arguments):
783              
784             =over 4
785              
786             =item * B<algorithm> => I<str>
787              
788             What algorithm is used to compute the digest of the content.
789              
790             The default is to use C<md5>. Some algorithms supported include C<crc32>, C<sha1>,
791             C<sha256>, as well as C<Digest> to use Perl L<Digest> which supports a lot of
792             other algorithms, e.g. C<SHA-1>, C<BLAKE2b>.
793              
794             If set to '', 'none', or 'size', the digest will be set to the file size. This
795             means uniqueness will be determined solely from file size. This can be quicker,
796             but it will generate false positives: two files of the same size will be deemed
797             duplicates even though their contents may differ.
798              
799             =item * B<authoritative_dirs> => I<array[str]>
800              
801             Denote director(yE<verbar>ies) where authoritativeE<sol>"Original" copies are found.
802              
803             =item * B<detail> => I<true>
804              
805             Show details (a.k.a. --show-digest, --show-size, --show-count).
806              
807             =item * B<digest_args> => I<array>
808              
809             Some Digest algorithms require arguments; you can pass them here.
810              
811             =item * B<exclude_empty_files> => I<bool>
812              
813             (No description)
814              
815             =item * B<exclude_file_patterns> => I<array[str]>
816              
817             Filename (including path) regex patterns to exclude.
818              
819             =item * B<files>* => I<array[str]>
820              
821             (No description)
822              
823             =item * B<group_by_digest> => I<bool>
824              
825             Sort files by their digest (or size, if not computing digest), separate each different digest.
826              
827             =item * B<include_file_patterns> => I<array[str]>
828              
829             Filename (including path) regex patterns to include.
830              
831             =item * B<max_size> => I<filesize>
832              
833             Maximum file size to consider.
834              
835             =item * B<min_size> => I<filesize>
836              
837             Minimum file size to consider.
838              
839             =item * B<recurse> => I<bool>
840              
841             If set to true, will recurse into subdirectories.
842              
843             =item * B<report_duplicate> => I<int> (default: 2)
844              
845             Whether to return duplicate items.
846              
847             Can be set to either 0, 1, 2, or 3.
848              
849             If set to 0, duplicate items will not be returned.
850              
851             If set to 1 (the default for C<dupe-files>), will return all the duplicate
852             files. For example: C<file1> contains text 'a', C<file2> 'b', C<file3> 'a'. Then
853             C<file1> and C<file3> will be returned.
854              
855             If set to 2 (the default for C<uniq-files>), will only return the first of
856             duplicate items. Continuing from previous example, only C<file1> will be returned
857             because C<file2> is unique and C<file3> contains 'a' (already represented by
858             C<file1>). If one or more C<--authoritative-dir> (C<-O>) options are specified,
859             files under these directories will be preferred.
860              
861             If set to 3, will return all but the first of duplicate items. Continuing from
862             previous example: C<file3> will be returned. This is useful if you want to keep
863             only one copy of the duplicate content. You can use the output of this routine
864             to C<mv> or C<rm>. Similar to the previous case, if one or more
865             C<--authoritative-dir> (C<-O>) options are specified, then files under these
866             directories will not be listed if possible.
867              
868             =item * B<report_unique> => I<bool> (default: 1)
869              
870             Whether to return unique items.
871              
872             =item * B<show_count> => I<bool> (default: 0)
873              
874             Whether to return each file content's number of occurrences.
875              
876             1 means the file content is only encountered once (unique), 2 means there is one
877             duplicate, and so on.
878              
879             =item * B<show_digest> => I<true>
880              
881             Show the digest value (or the size, if not computing digest) for each file.
882              
883             Note that this routine does not compute digest for files which have unique
884             sizes, so they will show up as empty.
885              
886             =item * B<show_size> => I<true>
887              
888             Show the size for each file.
889              
890              
891             =back
892              
893             Returns an enveloped result (an array).
894              
895             First element ($status_code) is an integer containing an HTTP-like status code
896             (200 means OK, 4xx caller error, 5xx function error). Second element
897             ($reason) is a string containing an error message, or something like "OK" if status is
898             200. Third element ($payload) is the actual result, but it is usually not present when the enveloped result is an error response ($status_code is not 2xx). Fourth
899             element (%result_meta) is called result metadata and is optional, a hash
900             that contains extra information, much like how HTTP response headers provide additional metadata.
901              
902             Return value: (any)
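
An illustrative sketch tying several of the options above together (all paths are hypothetical, and the authoritative directory must exist or the function returns a 400 error):

    use App::UniqFiles qw(uniq_files);

    # List every duplicate copy except the first/preferred one; copies under
    # the authoritative directory are preferred and so are kept off the list.
    # detail => 1 turns each row into a hashref with file/size/digest/count keys.
    my $res = uniq_files(
        files              => ['.'],
        recurse            => 1,
        report_unique      => 0,
        report_duplicate   => 3,
        authoritative_dirs => ['/data/originals'],
        detail             => 1,
    );
    die "uniq_files failed: $res->[1]" unless $res->[0] == 200;
    for my $row (@{ $res->[2] }) {
        printf "%s (size=%d, count=%d)\n", $row->{file}, $row->{size}, $row->{count};
    }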
903              
904             =head1 HOMEPAGE
905              
906             Please visit the project's homepage at L<https://metacpan.org/release/App-UniqFiles>.
907              
908             =head1 SOURCE
909              
910             Source repository is at L<https://github.com/perlancar/perl-App-UniqFiles>.
911              
912             =head1 SEE ALSO
913              
914             L<find-duplicate-filenames> from L<App::FindUtils>
915              
916             L<move-duplicate-files-to> from L<App::DuplicateFilesUtils>, which is basically
917             a shortcut for C<< uniq-files -D -R . | while read f; do mv "$f" SOMEDIR/; done
918             >>.
919              
920             =head1 AUTHOR
921              
922             perlancar <perlancar@cpan.org>
923              
924             =head1 CONTRIBUTOR
925              
926             =for stopwords Steven Haryanto
927              
928             Steven Haryanto <stevenharyanto@gmail.com>
929              
930             =head1 CONTRIBUTING
931              
932              
933             To contribute, you can send patches by email/via RT, or send pull requests on
934             GitHub.
935              
936             Most of the time, you don't need to build the distribution yourself. You can
937             simply modify the code, then test via:
938              
939             % prove -l
940              
941             If you want to build the distribution (e.g. to try to install it locally on your
942             system), you can install L<Dist::Zilla>,
943             L<Dist::Zilla::PluginBundle::Author::PERLANCAR>,
944             L<Pod::Weaver::PluginBundle::Author::PERLANCAR>, and sometimes one or two other
945             Dist::Zilla- and/or Pod::Weaver plugins. Any additional steps required beyond
946             that are considered a bug and can be reported to me.
947              
948             =head1 COPYRIGHT AND LICENSE
949              
950             This software is copyright (c) 2022, 2020, 2019, 2017, 2015, 2014, 2012, 2011 by perlancar <perlancar@cpan.org>.
951              
952             This is free software; you can redistribute it and/or modify it under
953             the same terms as the Perl 5 programming language system itself.
954              
955             =head1 BUGS
956              
957             Please report any bugs or feature requests on the bugtracker website L<https://rt.cpan.org/Public/Dist/Display.html?Name=App-UniqFiles>
958              
959             When submitting a bug or request, please include a test-file or a
960             patch to an existing test-file that illustrates the bug or desired
961             feature.
962              
963             =cut