File Coverage

blib/lib/App/UniqFiles.pm
Criterion    Covered   Total       %
statement        176     232    75.8
branch            79     126    62.7
condition         44      67    65.6
subroutine        11      12    91.6
pod                1       1   100.0
total            311     438    71.0


line stmt bran cond sub pod time code
1             package App::UniqFiles;
2              
3 1     1   148801 use 5.010001;
  1         13  
4 1     1   6 use strict;
  1         2  
  1         18  
5 1     1   5 use warnings;
  1         2  
  1         23  
6 1     1   2273 use Log::ger;
  1         57  
  1         5  
7              
8 1     1   257 use Cwd qw(abs_path);
  1         2  
  1         49  
9 1     1   6 use Exporter qw(import);
  1         2  
  1         24  
10 1     1   626 use Perinci::Sub::Util qw(gen_modified_sub);
  1         2626  
  1         174  
11              
12             our $AUTHORITY = 'cpan:PERLANCAR'; # AUTHORITY
13             our $DATE = '2023-02-06'; # DATE
14             our $DIST = 'App-UniqFiles'; # DIST
15             our $VERSION = '0.141'; # VERSION
16              
17             our @EXPORT_OK = qw(uniq_files);
18              
19             our %SPEC;
20              
21             sub _glob {
22 10     10   64 require File::Find;
23              
24 10         21 my $dir;
25             my @res;
26             File::Find::finddepth(
27             sub {
28 30 50   30   313 return if -l $_;
29 30 100       265 return unless -f _;
30 1     1   8 no warnings 'once'; # $File::Find::dir
  1         2  
  1         2754  
31 20         260 push @res, "$File::Find::dir/$_";
32             },
33 10         1160 @_,
34             );
35 10         83 @res;
36             }
37              
38             our %argspec_authoritative_dirs = (
39             authoritative_dirs => {
40             summary => 'Denote director(y|ies) where authoritative/"Original" copies are found',
41             'x.name.is_plural' => 1,
42             'x.name.singular' => 'authoritative_dir',
43             schema => ['array*', of=>'str*'], # XXX dirname
44             cmdline_aliases => {O=>{}},
45             },
46             );
47             our %argspecs_filter = (
48             include_file_patterns => {
49             summary => 'Filename (including path) regex patterns to include',
50             'x.name.is_plural' => 1,
51             'x.name.singular' => 'include_file_pattern',
52             schema => ['array*', of=>'str*'], # XXX re
53             cmdline_aliases => {I=>{}},
54             },
55             exclude_file_patterns => {
56             summary => 'Filename (including path) regex patterns to exclude',
57             'x.name.is_plural' => 1,
58             'x.name.singular' => 'exclude_file_pattern',
59             schema => ['array*', of=>'str*'], # XXX re
60             cmdline_aliases => {X=>{}},
61             },
62             exclude_empty_files => {
63             schema => 'bool*',
64             cmdline_aliases => {Z=>{}},
65             },
66             min_size => {
67             summary => 'Minimum file size to consider',
68             schema => 'filesize*',
69             },
70             max_size => {
71             summary => 'Maximum file size to consider',
72             schema => 'filesize*',
73             },
74             );
75              
76             $SPEC{uniq_files} = {
77             v => 1.1,
78             summary => 'Report duplicate or unique file contents',
79             description => <<'_',
80              
81             Given a list of filenames, will check each file size and content for duplicate
82             content. Interface is a bit like the `uniq` Unix command-line program.
83              
84             _
85             args => {
86             files => {
87             schema => ['array*' => {of=>'str*'}],
88             req => 1,
89             pos => 0,
90             slurpy => 1,
91             },
92             recurse => {
93             schema => 'bool*',
94             cmdline_aliases => {R=>{}},
95             description => <<'_',
96              
97             If set to true, will recurse into subdirectories.
98              
99             _
100             },
101             group_by_digest => {
102             summary => 'Sort files by their digest (or size, if not computing digest), separating each different digest',
103             schema => 'bool*',
104             },
105             show_digest => {
106             summary => 'Show the digest value (or the size, if not computing digest) for each file',
107             description => <<'_',
108              
109             Note that this routine does not compute digest for files which have unique
110             sizes, so they will show up as empty.
111              
112             _
113             schema => 'true*',
114             },
115             show_size => {
116             summary => 'Show the size for each file',
117             schema => 'true*',
118             },
119             # TODO add option follow_symlinks?
120             report_unique => {
121             schema => [bool => {default=>1}],
122             summary => 'Whether to return unique items',
123             cmdline_aliases => {
124             a => {
125             summary => 'Alias for --report-unique --report-duplicate=1 (report all files)',
126             code => sub {
127             my $args = shift;
128             $args->{report_unique} = 1;
129             $args->{report_duplicate} = 1;
130             },
131             },
132             u => {
133             summary => 'Alias for --report-unique --report-duplicate=0',
134             code => sub {
135             my $args = shift;
136             $args->{report_unique} = 1;
137             $args->{report_duplicate} = 0;
138             },
139             },
140             d => {
141             summary =>
142             'Alias for --noreport-unique --report-duplicate=1',
143             code => sub {
144             my $args = shift;
145             $args->{report_unique} = 0;
146             $args->{report_duplicate} = 1;
147             },
148             },
149             D => {
150             summary =>
151             'Alias for --noreport-unique --report-duplicate=3',
152             code => sub {
153             my $args = shift;
154             $args->{report_unique} = 0;
155             $args->{report_duplicate} = 3;
156             },
157             },
158             },
159             },
160             report_duplicate => {
161             schema => [int => {in=>[0,1,2,3], default=>2}],
162             summary => 'Whether to return duplicate items',
163             description => <<'_',
164              
165             Can be set to either 0, 1, 2, or 3.
166              
167             If set to 0, duplicate items will not be returned.
168              
169             If set to 1 (the default for `dupe-files`), will return all the duplicate
170             files. For example: `file1` contains text 'a', `file2` 'b', `file3` 'a'. Then
171             `file1` and `file3` will be returned.
172              
173             If set to 2 (the default for `uniq-files`), will only return the first of
174             duplicate items. Continuing from the previous example, only `file1` will be returned
175             because `file2` is unique and `file3` contains 'a' (already represented by
176             `file1`). If one or more `--authoritative-dir` (`-O`) options are specified,
177             files under these directories will be preferred.
178              
179             If set to 3, will return all but the first of duplicate items. Continuing from
180             the previous example: `file3` will be returned. This is useful if you want to keep
181             only one copy of the duplicate content. You can use the output of this routine
182             to `mv` or `rm`. Similar to the previous case, if one or more
183             `--authoritative-dir` (`-O`) options are specified, then files under these
184             directories will not be listed if possible.
185              
186             _
187             cmdline_aliases => {
188             },
189             },
190             algorithm => {
191             schema => ['str*'],
192             summary => "What algorithm is used to compute the digest of the content",
193             description => <<'_',
194              
195             The default is to use `md5`. Some algorithms supported include `crc32`, `sha1`,
196             `sha256`, as well as `Digest` to use Perl <pm:Digest> which supports a lot of
197             other algorithms, e.g. `SHA-1`, `BLAKE2b`.
198              
199             If set to '', 'none', or 'size', then the digest will be set to the file size.
200             This means uniqueness will be determined solely from file size. This can be
201             quicker, but can generate false positives: two files of the same size will be
202             deemed duplicates even though their content may differ.
203              
204             If set to 'name', then only filename comparison will be performed. This can of
205             course generate lots of false positives, but in some cases you might want to
206             compare filenames for uniqueness.
207              
208             _
209             },
210             digest_args => {
211             schema => ['array*',
212              
213             # comment out temporarily, Perinci::Sub::GetArgs::Argv
214             # clashes with coerce rules; we should fix
215             # Perinci::Sub::GetArgs::Argv to observe coercion rules
216             # first
217             #of=>'str*',
218              
219             'x.perl.coerce_rules'=>['From_str::comma_sep']],
220             description => <<'_',
221              
222             Some Digest algorithms require arguments; you can pass them here.
223              
224             _
225             cmdline_aliases => {A=>{}},
226             },
227             show_count => {
228             schema => [bool => {default=>0}],
229             summary => "Whether to return each file content's ".
230             "number of occurrence",
231             description => <<'_',
232              
233             1 means the file content is only encountered once (unique), 2 means there is one
234             duplicate, and so on.
235              
236             _
237             cmdline_aliases => {count=>{}, c=>{}},
238             },
239             detail => {
240             summary => 'Show details (a.k.a. --show-digest, --show-size, --show-count)',
241             schema => 'true*',
242             cmdline_aliases => {l=>{}},
243             },
244             %argspec_authoritative_dirs,
245             %argspecs_filter,
246             },
247             examples => [
248             {
249             summary => 'List all files which do not have duplicate contents',
250             src => 'uniq-files *',
251             src_plang => 'bash',
252             test => 0,
253             'x.doc.show_result' => 0,
254             },
255             {
256             summary => 'List all files (recursively, and in detail) which have duplicate contents (all duplicate copies), excluding some files',
257             src => q(uniq-files -R -l -d -X '\.git/' --min-size 10k .),
258             src_plang => 'bash',
259             test => 0,
260             'x.doc.show_result' => 0,
261             },
262             {
263             summary => 'Move all duplicate files (except one copy) in this directory (and subdirectories) to .dupes/',
264             src => 'uniq-files -D -R * | while read f; do mv "$f" .dupes/; done',
265             src_plang => 'bash',
266             test => 0,
267             'x.doc.show_result' => 0,
268             },
269             {
270             summary => 'List number of occurrences of contents for duplicate files',
271             src => 'uniq-files -c *',
272             src_plang => 'bash',
273             test => 0,
274             'x.doc.show_result' => 0,
275             },
276             {
277             summary => 'List number of occurrences of contents for all files',
278             src => 'uniq-files -a -c *',
279             src_plang => 'bash',
280             test => 0,
281             'x.doc.show_result' => 0,
282             },
283             {
284             summary => 'List all files, along with their number of content occurrences and content digest. '.
285             'Use the BLAKE2b digest algorithm and group the files according to their digest.',
286             src => 'uniq-files -a -c --show-digest -A BLAKE2,blake2b *',
287             src_plang => 'bash',
288             test => 0,
289             'x.doc.show_result' => 0,
290             },
291             ],
292             };
293             sub uniq_files {
294 11     11 1 37974 my %args = @_;
295              
296 11         25 my $files = $args{files};
297 11 50 33     62 return [400, "Please specify files"] if !$files || !@$files;
298 11         15 my $recurse = $args{recurse};
299 11   100     32 my $report_unique = $args{report_unique} // 1;
300 11   100     81 my $report_duplicate = $args{report_duplicate} // 2;
301 11   100     41 my $show_count = $args{show_count} // 0;
302 11   100     25 my $show_digest = $args{show_digest} // 0;
303 11   100     32 my $show_size = $args{show_size} // 0;
304 11         15 my $digest_args = $args{digest_args};
305 11 50 66     40 my $algorithm = $args{algorithm} // ($digest_args ? 'Digest' : 'md5');
306 11         17 my $group_by_digest = $args{group_by_digest};
307              
308 11 50       23 if ($args{detail}) {
309 0         0 $show_digest = 1;
310 0         0 $show_size = 1;
311 0         0 $show_count = 1;
312             }
313              
314             my @authoritative_dirs = $args{authoritative_dirs} && @{$args{authoritative_dirs}} ?
315 11 100 66     29 @{ $args{authoritative_dirs} } : ();
  2         5  
316 11         24 for my $dir (@authoritative_dirs) {
317 2 50       33 (-d $dir) or return [400, "Authoritative dir '$dir' does not exist or not a directory"];
318 2 50       62 my $abs_dir = abs_path $dir or return [400, "Cannot get absolute path for authoritative dir '$dir'"];
319 2         8 $dir = $abs_dir;
320             }
321             #log_trace "authoritative_dirs=%s", \@authoritative_dirs if @authoritative_dirs;
322              
323 11         15 my @include_re;
324 11   50     15 for my $re0 (@{ $args{include_file_patterns} // [] }) {
  11         45  
325 0         0 require Regexp::Util;
326 0         0 my $re;
327 0 0       0 if (ref $re0 eq 'Regexp') {
328 0         0 $re = $re0;
329             } else {
330 0         0 eval { $re = Regexp::Util::deserialize_regexp("qr($re0)") };
  0         0  
331 0 0       0 return [400, "Invalid/unsafe regex pattern in include_file_patterns '$re0': $@"] if $@;
332 0 0       0 return [400, "Unsafe regex pattern (contains embedded code) in include_file_patterns '$re0'"] if Regexp::Util::regexp_seen_evals($re);
333             }
334 0         0 push @include_re, $re;
335             }
336 11         19 my @exclude_re;
337 11   50     16 for my $re0 (@{ $args{exclude_file_patterns} // [] }) {
  11         39  
338 0         0 require Regexp::Util;
339 0         0 my $re;
340 0 0       0 if (ref $re0 eq 'Regexp') {
341 0         0 $re = $re0;
342             } else {
343 0         0 eval { $re = Regexp::Util::deserialize_regexp("qr($re0)") };
  0         0  
344 0 0       0 return [400, "Invalid/unsafe regex pattern in exclude_file_patterns '$re0': $@"] if $@;
345 0 0       0 return [400, "Unsafe regex pattern (contains embedded code) in exclude_file_patterns '$re0'"] if Regexp::Util::regexp_seen_evals($re);
346             }
347 0         0 push @exclude_re, $re;
348             }
349              
350 11 100       24 if ($recurse) {
351             $files = [ map {
352 5 50       12 if (-l $_) {
  35 100       330  
353 0         0 ();
354             } elsif (-d _) {
355 10         34 (_glob($_));
356             } else {
357 25         94 ($_);
358             }
359             } @$files ];
360             }
361              
362             FILTER: {
363 11         19 my $ffiles;
  11         15  
364             FILE:
365 11         25 for my $f (@$files) {
366 87 50       778 if (-l $f) {
367 0         0 log_warn "File '$f' is a symlink, ignored";
368 0         0 next FILE;
369             }
370 87 100       218 if (-d _) {
371 12         54 log_warn "File '$f' is a directory, ignored";
372 12         46 next FILE;
373             }
374 75 50       132 unless (-f _) {
375 0         0 log_warn "File '$f' is not a regular file, ignored";
376 0         0 next FILE;
377             }
378              
379 75 50       151 if (@include_re) {
380 0         0 my $included;
381 0         0 for my $re (@include_re) {
382 0 0       0 if ($f =~ $re) { $included++; last }
  0         0  
  0         0  
383             }
384 0 0       0 unless ($included) {
385 0         0 log_info "File '$f' is not in --include-file-patterns, skipped";
386 0         0 next FILE;
387             }
388             }
389 75 50       150 if (@exclude_re) {
390 0         0 for my $re (@exclude_re) {
391 0 0       0 if ($f =~ $re) {
392 0         0 log_info "File '$f' is in --exclude-file-patterns, skipped";
393 0         0 next FILE;
394             }
395             }
396             }
397              
398 75         583 my $size = -s $f;
399 75 50 33     208 if ($args{exclude_empty_files} && !$size) {
400 0         0 log_info "File '$f' is empty, skipped by option -Z";
401 0         0 next FILE;
402             }
403 75 50 33     156 if ($args{min_size} && $size < $args{min_size}) {
404 0         0 log_info "File '$f' (size=$size) is smaller than min_size ($args{min_size}), skipped";
405 0         0 next FILE;
406             }
407 75 50 33     130 if ($args{max_size} && $size > $args{max_size}) {
408 0         0 log_info "File '$f' (size=$size) is larger than max_size ($args{max_size}), skipped";
409 0         0 next FILE;
410             }
411              
412 75         203 push @$ffiles, $f;
413             }
414 11         25 $files = $ffiles;
415             } # FILTER
416              
417 11         17 my %name_files; # key = filename (computed), value = [path, ...]
418             GROUP_FILE_NAMES: {
419 11         15 for my $f (@$files) {
  11         20  
420             #my $path = abs_path($f);
421 75         173 (my $basename = $f) =~ s!.+/!!;
422 75   50     280 $name_files{$basename} //= [];
423 75         154 push @{ $name_files{$basename} }, $f
424 75 50       95 unless grep { $_ eq $f } @{ $name_files{$basename} };
  0         0  
  75         189  
425             }
426             #use DD; dd \%name_files;
427             }
428              
429 11         30 my %size_counts; # key = size, value = number of files having that size
430             my %size_files; # key = size, value = [file, ...]
431 11         0 my %file_sizes; # key = filename, value = file size, for caching stat()
432             GET_FILE_SIZES: {
433 11         14 for my $f (@$files) {
  11         17  
434 75         730 my @st = stat $f;
435 75 50       192 unless (@st) {
436 0         0 log_error("Can't stat file `$f`: $!, skipped");
437 0         0 next;
438             }
439 75         162 $size_counts{$st[7]}++;
440 75   100     220 $size_files{$st[7]} //= [];
441 75         94 push @{$size_files{$st[7]}}, $f;
  75         157  
442 75         214 $file_sizes{$f} = $st[7];
443             }
444             }
445              
446 11   66     70 my $calc_digest = !($algorithm eq '' || $algorithm eq 'none' || $algorithm eq 'size' || $algorithm eq 'name');
447              
448             # calculate digest for all files having non-unique sizes
449 11         25 my %digest_counts; # key = digest, value = num of files having that digest
450             my %digest_files; # key = digest, value = [file, ...]
451 11         0 my %file_digests; # key = filename, value = file digest
452             CALC_FILE_DIGESTS: {
453 11 100       14 last unless $calc_digest;
  11         22  
454 10         522 require File::Digest;
455              
456 10         2197 for my $f (@$files) {
457 66 50       148 next unless defined $file_sizes{$f}; # just checking. all files should have sizes.
458 66 100       147 next if $size_counts{ $file_sizes{$f} } == 1; # skip unique file sizes.
459 60         146 my $res = File::Digest::digest_file(
460             file=>$f, algorithm=>$algorithm, digest_args=>$digest_args);
461 60 50       9846 return [500, "Can't calculate digest for file '$f': $res->[0] - $res->[1]"]
462             unless $res->[0] == 200;
463 60         107 my $digest = $res->[2];
464 60         123 $digest_counts{$digest}++;
465 60   100     207 $digest_files{$digest} //= [];
466 60         89 push @{$digest_files{$digest}}, $f;
  60         145  
467 60         160 $file_digests{$f} = $digest;
468             }
469             }
470              
471 11         21 my %file_counts; # key = file name, value = num of files having file content
472 11         18 for my $f (@$files) {
473 75 50       197 next unless defined $file_sizes{$f}; # just checking
474 75 100       123 if (!defined($file_digests{$f})) {
475 15         35 $file_counts{$f} = $size_counts{ $file_sizes{$f} };
476             } else {
477 60         110 $file_counts{$f} = $digest_counts{ $file_digests{$f} };
478             }
479             }
480              
481             SORT_DUPLICATE_FILES: {
482 11 100       15 last unless @authoritative_dirs;
  11         26  
483 2 0       8 my $hash = $calc_digest ? \%digest_files : $algorithm eq 'name' ? \%name_files : \%size_files;
    50          
484 2         8 for my $key (keys %$hash) {
485 10         31 my @files = @{ $hash->{$key} };
  10         22  
486 10         14 my @abs_files;
487 10 100       22 next unless @files > 1;
488 4         8 for my $file (@files) {
489 12 50       194 my $abs_file = abs_path $file or do {
490 0         0 log_error "Cannot find absolute path for duplicate file '$file', skipping duplicate set %s", \@files;
491             };
492 12         37 push @abs_files, $abs_file;
493             }
494              
495             #log_trace "Duplicate files before sorting: %s", \@files;
496 12         29 @files = map { $files[$_] } sort {
497 4         19 my $file_a = $abs_files[$a];
  10         20  
498 10         13 my $file_a_in_authoritative_dirs = 0;
499 10         15 my $subdir_len_file_a;
500 10         13 for my $d (@authoritative_dirs) {
501 10 50       69 if ($file_a =~ m!\A\Q$d\E(?:/|\z)(.*)!) { $file_a_in_authoritative_dirs++; $subdir_len_file_a = length($1); last }
  0         0  
  0         0  
  0         0  
502             }
503 10         16 my $file_b = $abs_files[$b];
504 10         15 my $file_b_in_authoritative_dirs = 0;
505 10         14 my $subdir_len_file_b;
506 10         11 for my $d (@authoritative_dirs) {
507 10 100       53 if ($file_b =~ m!\A\Q$d\E(?:/|\z)(.*)!) { $file_b_in_authoritative_dirs++; $subdir_len_file_b = length($1); last }
  2         3  
  2         7  
  2         4  
508             }
509             #log_trace " file_a=<$file_a>, in authoritative_dirs? $file_a_in_authoritative_dirs";
510             #log_trace " file_b=<$file_b>, in authoritative_dirs? $file_b_in_authoritative_dirs";
511             # files located near the root of an authoritative dir are preferred
512             # to deeper files. this is done by comparing subdir_len
513 10 50       39 ($file_a_in_authoritative_dirs ? $subdir_len_file_a : 9999) <=> ($file_b_in_authoritative_dirs ? $subdir_len_file_b : 9999) ||
    100          
    50          
514             $file_a cmp $file_b;
515             } 0..$#files;
516             #log_trace "Duplicate files after sorting: %s", \@files;
517              
518 4         14 $hash->{$key} = \@files;
519             }
520             }
521              
522             #$log->trace("report_duplicate=$report_duplicate");
523 11         19 my @files;
524 11         64 for my $f (sort keys %file_counts) {
525 75 100       155 if ($file_counts{$f} == 1) {
526             #log_trace "unique file '$f'";
527 24 100       48 push @files, $f if $report_unique;
528             } else {
529             #log_trace "duplicate file '$f'";
530             my $is_first_copy = $calc_digest ?
531             $f eq $digest_files{ $file_digests{$f} }[0] :
532 51 100       107 $f eq $size_files{ $file_sizes{$f} }[0];
533             #log_trace "is first copy? <$is_first_copy>";
534 51 100       114 if ($report_duplicate == 0) {
    100          
    100          
    50          
535             # do not report dupe files
536             } elsif ($report_duplicate == 1) {
537 15         32 push @files, $f;
538             } elsif ($report_duplicate == 2) {
539 21 100       43 push @files, $f if $is_first_copy;
540             } elsif ($report_duplicate == 3) {
541 9 100       25 push @files, $f unless $is_first_copy;
542             } else {
543 0         0 die "Invalid value for --report-duplicate ".
544             "'$report_duplicate', please choose 0/1/2/3";
545             }
546             }
547             }
548              
549             GROUP_FILES_BY_DIGEST: {
550 11 100       17 last unless $group_by_digest;
  11         24  
551             @files = sort {
552 1         9 $file_sizes{$a} <=> $file_sizes{$b} ||
553 20 50 50     55 ($file_digests{$a} // '') cmp ($file_digests{$b} // '')
      50        
554             } @files;
555             }
556              
557 11         32 my @rows;
558             my %resmeta;
559 11         0 my $last_digest;
560 11         20 for my $f (@files) {
561 41   66     88 my $digest = $file_digests{$f} // $file_sizes{$f};
562              
563             # add separator row
564 41 100 100     98 if ($group_by_digest && defined $last_digest && $digest ne $last_digest) {
      100        
565 4 50 33     16 push @rows, ($show_count || $show_digest || $show_size) ? {} : '';
566             }
567              
568 41         48 my $row;
569 41 100 100     131 if ($show_count || $show_digest || $show_size) {
      100        
570 19         37 $row = {file=>$f};
571 19 100       42 $row->{count} = $file_counts{$f} if $show_count;
572 19 100       35 $row->{digest} = $file_digests{$f} if $show_digest;
573 19 100       32 $row->{size} = $file_sizes{$f} if $show_size;
574             } else {
575 22         30 $row = $f;
576             }
577 41         65 push @rows, $row;
578 41         67 $last_digest = $digest;
579             }
580              
581 11         36 $resmeta{'table.fields'} = [qw/file size digest count/];
582              
583 11         161 [200, "OK", \@rows, \%resmeta];
584             }
585              
586             gen_modified_sub(
587             base_name => 'uniq_files',
588             output_name => 'dupe_files',
589             description => <<'_',
590              
591             This is a thin wrapper for <prog:uniq-files>. It defaults `report_unique` to 0
592             and `report_duplicate` to 1.
593              
594             _
595             modify_args => {
596             report_unique => sub {
597             $_[0]{schema} = [bool => {default=>0}];
598             },
599             report_duplicate => sub {
600             $_[0]{schema} = [int => {in=>[0,1,2,3], default=>1}];
601             },
602             },
603             modify_meta => sub {
604             $_[0]{examples} = [
605             {
606             summary => 'List all files (recursively, and in detail) which have duplicate contents (all duplicate copies)',
607             src => 'dupe-files -lR *',
608             src_plang => 'bash',
609             test => 0,
610             'x.doc.show_result' => 0,
611             },
612             ];
613             },
614             output_code => sub {
615 0     0     my %args = @_;
616 0   0       $args{report_unique} //= 0;
617 0   0       $args{report_duplicate} //= 1;
618 0           uniq_files(%args);
619             },
620             );
621              
622             1;
623             # ABSTRACT: Report duplicate or unique file contents
624              
625             __END__
626              
627             =pod
628              
629             =encoding UTF-8
630              
631             =head1 NAME
632              
633             App::UniqFiles - Report duplicate or unique file contents
634              
635             =head1 VERSION
636              
637             This document describes version 0.141 of App::UniqFiles (from Perl distribution App-UniqFiles), released on 2023-02-06.
638              
639             =head1 SYNOPSIS
640              
641             # See uniq-files script
642              
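The function can also be called directly from Perl. A minimal sketch (the file
names are hypothetical; the defaults report unique files plus the first copy of
each duplicate):

    use App::UniqFiles qw(uniq_files);

    my $res = uniq_files(files => ["foo.txt", "bar.txt", "baz.txt"]);
    if ($res->[0] == 200) {
        print "$_\n" for @{ $res->[2] };  # plain file names when no detail options are set
    } else {
        warn "uniq_files failed: $res->[0] $res->[1]";
    }
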
643             =head1 NOTES
644              
645             =head1 FUNCTIONS
646              
647              
648             =head2 dupe_files
649              
650             Usage:
651              
652             dupe_files(%args) -> [$status_code, $reason, $payload, \%result_meta]
653              
654             Report duplicate or unique file contents.
655              
656             This is a thin wrapper for L<uniq-files>. It defaults C<report_unique> to 0
657             and C<report_duplicate> to 1.
658              
659             This function is not exported.
660              
661             Arguments ('*' denotes required arguments):
662              
663             =over 4
664              
665             =item * B<algorithm> => I<str>
666              
667             What algorithm is used to compute the digest of the content.
668              
669             The default is to use C<md5>. Some algorithms supported include C<crc32>, C<sha1>,
670             C<sha256>, as well as C<Digest> to use Perl L<Digest> which supports a lot of
671             other algorithms, e.g. C<SHA-1>, C<BLAKE2b>.
672              
673             If set to '', 'none', or 'size', then the digest will be set to the file size.
674             This means uniqueness will be determined solely from file size. This can be
675             quicker, but can generate false positives: two files of the same size will be
676             deemed duplicates even though their content may differ.
677              
678             If set to 'name', then only filename comparison will be performed. This can of
679             course generate lots of false positives, but in some cases you might want to
680             compare filenames for uniqueness.
681              
682             =item * B<authoritative_dirs> => I<array[str]>
683              
684             Denote director(yE<verbar>ies) where authoritativeE<sol>"Original" copies are found.
685              
686             =item * B<detail> => I<true>
687              
688             Show details (a.k.a. --show-digest, --show-size, --show-count).
689              
690             =item * B<digest_args> => I<array>
691              
692             Some Digest algorithms require arguments; you can pass them here.
693              
694             =item * B<exclude_empty_files> => I<bool>
695              
696             (No description)
697              
698             =item * B<exclude_file_patterns> => I<array[str]>
699              
700             Filename (including path) regex patterns to exclude.
701              
702             =item * B<files>* => I<array[str]>
703              
704             (No description)
705              
706             =item * B<group_by_digest> => I<bool>
707              
708             Sort files by their digest (or size, if not computing digest), separating each different digest.
709              
710             =item * B<include_file_patterns> => I<array[str]>
711              
712             Filename (including path) regex patterns to include.
713              
714             =item * B<max_size> => I<filesize>
715              
716             Maximum file size to consider.
717              
718             =item * B<min_size> => I<filesize>
719              
720             Minimum file size to consider.
721              
722             =item * B<recurse> => I<bool>
723              
724             If set to true, will recurse into subdirectories.
725              
726             =item * B<report_duplicate> => I<int> (default: 1)
727              
728             Whether to return duplicate items.
729              
730             Can be set to either 0, 1, 2, or 3.
731              
732             If set to 0, duplicate items will not be returned.
733              
734             If set to 1 (the default for C<dupe-files>), will return all the duplicate
735             files. For example: C<file1> contains text 'a', C<file2> 'b', C<file3> 'a'. Then
736             C<file1> and C<file3> will be returned.
737              
738             If set to 2 (the default for C<uniq-files>), will only return the first of
739             duplicate items. Continuing from the previous example, only C<file1> will be returned
740             because C<file2> is unique and C<file3> contains 'a' (already represented by
741             C<file1>). If one or more C<--authoritative-dir> (C<-O>) options are specified,
742             files under these directories will be preferred.
743              
744             If set to 3, will return all but the first of duplicate items. Continuing from
745             the previous example: C<file3> will be returned. This is useful if you want to keep
746             only one copy of the duplicate content. You can use the output of this routine
747             to C<mv> or C<rm>. Similar to the previous case, if one or more
748             C<--authoritative-dir> (C<-O>) options are specified, then files under these
749             directories will not be listed if possible.
750              
751             =item * B<report_unique> => I<bool> (default: 0)
752              
753             Whether to return unique items.
754              
755             =item * B<show_count> => I<bool> (default: 0)
756              
757             Whether to return each file content's number of occurrence.
758              
759             1 means the file content is only encountered once (unique), 2 means there is one
760             duplicate, and so on.
761              
762             =item * B<show_digest> => I<true>
763              
764             Show the digest value (or the size, if not computing digest) for each file.
765              
766             Note that this routine does not compute digest for files which have unique
767             sizes, so they will show up as empty.
768              
769             =item * B<show_size> => I<true>
770              
771             Show the size for each file.
772              
773              
774             =back
775              
776             Returns an enveloped result (an array).
777              
778             First element ($status_code) is an integer containing an HTTP-like status code
779             (200 means OK, 4xx caller error, 5xx function error). Second element ($reason)
780             is a string containing an error message, or something like "OK" if the status is
781             200. Third element ($payload) is the actual result, usually absent when the
782             enveloped result is an error response ($status_code is not 2xx). Fourth element
783             (%result_meta) is optional result metadata: a hash of extra information, much like how HTTP response headers provide additional metadata.
784              
785             Return value: (any)
786              
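As a sketch of the authoritative-directory behaviour described under
C<report_duplicate> above: C<dupe_files()> itself is not exported, so this
hypothetical example calls C<uniq_files()> with the dupe-files defaults. With
C<report_duplicate> set to 3, copies under the authoritative directory are kept
out of the list where possible, so what is listed are the redundant copies.

    use App::UniqFiles qw(uniq_files);

    my $res = uniq_files(
        files              => [glob "*"],
        recurse            => 1,
        report_unique      => 0,
        report_duplicate   => 3,             # all but the preferred copy
        authoritative_dirs => ["originals"], # hypothetical directory to keep
    );
    unlink @{ $res->[2] } if $res->[0] == 200;  # remove the redundant copies
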
787              
788              
789             =head2 uniq_files
790              
791             Usage:
792              
793             uniq_files(%args) -> [$status_code, $reason, $payload, \%result_meta]
794              
795             Report duplicate or unique file contents.
796              
797             Given a list of filenames, will check each file size and content for duplicate
798             content. Interface is a bit like the C<uniq> Unix command-line program.
799              
800             This function is not exported by default, but exportable.
801              
802             Arguments ('*' denotes required arguments):
803              
804             =over 4
805              
806             =item * B<algorithm> => I<str>
807              
808             What algorithm is used to compute the digest of the content.
809              
810             The default is to use C<md5>. Some algorithms supported include C<crc32>, C<sha1>,
811             C<sha256>, as well as C<Digest> to use Perl L<Digest> which supports a lot of
812             other algorithms, e.g. C<SHA-1>, C<BLAKE2b>.
813              
814             If set to '', 'none', or 'size', then the digest will be set to the file size.
815             This means uniqueness will be determined solely from file size. This can be
816             quicker, but can generate false positives: two files of the same size will be
817             deemed duplicates even though their content may differ.
818              
819             If set to 'name', then only filename comparison will be performed. This can of
820             course generate lots of false positives, but in some cases you might want to
821             compare filenames for uniqueness.
822              
823             =item * B<authoritative_dirs> => I<array[str]>
824              
825             Denote director(yE<verbar>ies) where authoritativeE<sol>"Original" copies are found.
826              
827             =item * B<detail> => I<true>
828              
829             Show details (a.k.a. --show-digest, --show-size, --show-count).
830              
831             =item * B<digest_args> => I<array>
832              
833             Some Digest algorithms require arguments; you can pass them here.
834              
835             =item * B<exclude_empty_files> => I<bool>
836              
837             (No description)
838              
839             =item * B<exclude_file_patterns> => I<array[str]>
840              
841             Filename (including path) regex patterns to exclude.
842              
843             =item * B<files>* => I<array[str]>
844              
845             (No description)
846              
847             =item * B<group_by_digest> => I<bool>
848              
849             Sort files by their digest (or size, if not computing digest), separating each different digest.
850              
851             =item * B<include_file_patterns> => I<array[str]>
852              
853             Filename (including path) regex patterns to include.
854              
855             =item * B<max_size> => I<filesize>
856              
857             Maximum file size to consider.
858              
859             =item * B<min_size> => I<filesize>
860              
861             Minimum file size to consider.
862              
863             =item * B<recurse> => I<bool>
864              
865             If set to true, will recurse into subdirectories.
866              
867             =item * B<report_duplicate> => I<int> (default: 2)
868              
869             Whether to return duplicate items.
870              
871             Can be set to either 0, 1, 2, or 3.
872              
873             If set to 0, duplicate items will not be returned.
874              
875             If set to 1 (the default for C<dupe-files>), will return all the duplicate
876             files. For example: C<file1> contains text 'a', C<file2> 'b', C<file3> 'a'. Then
877             C<file1> and C<file3> will be returned.
878              
879             If set to 2 (the default for C<uniq-files>), will only return the first of
880             duplicate items. Continuing from the previous example, only C<file1> will be returned
881             because C<file2> is unique and C<file3> contains 'a' (already represented by
882             C<file1>). If one or more C<--authoritative-dir> (C<-O>) options are specified,
883             files under these directories will be preferred.
884              
885             If set to 3, will return all but the first of duplicate items. Continuing from
886             the previous example: C<file3> will be returned. This is useful if you want to keep
887             only one copy of the duplicate content. You can use the output of this routine
888             to C<mv> or C<rm>. Similar to the previous case, if one or more
889             C<--authoritative-dir> (C<-O>) options are specified, then files under these
890             directories will not be listed if possible.
891              
892             =item * B<report_unique> => I<bool> (default: 1)
893              
894             Whether to return unique items.
895              
896             =item * B<show_count> => I<bool> (default: 0)
897              
898             Whether to return each file content's number of occurrence.
899              
900             1 means the file content is only encountered once (unique), 2 means there is one
901             duplicate, and so on.
902              
903             =item * B<show_digest> => I<true>
904              
905             Show the digest value (or the size, if not computing digest) for each file.
906              
907             Note that this routine does not compute digest for files which have unique
908             sizes, so they will show up as empty.
909              
910             =item * B<show_size> => I<true>
911              
912             Show the size for each file.
913              
914              
915             =back
916              
917             Returns an enveloped result (an array).
918              
919             First element ($status_code) is an integer containing an HTTP-like status code
920             (200 means OK, 4xx caller error, 5xx function error). Second element ($reason)
921             is a string containing an error message, or something like "OK" if the status is
922             200. Third element ($payload) is the actual result, usually absent when the
923             enveloped result is an error response ($status_code is not 2xx). Fourth element
924             (%result_meta) is optional result metadata: a hash of extra information, much like how HTTP response headers provide additional metadata.
925              
926             Return value: (any)
927              
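A sketch of consuming the enveloped result when C<detail> is enabled (the row
keys follow the C<table.fields> metadata returned by the function):

    use App::UniqFiles qw(uniq_files);

    my $res = uniq_files(files => [glob "*"], detail => 1);
    die "uniq_files failed: $res->[1]" unless $res->[0] == 200;
    for my $row (@{ $res->[2] }) {
        # each row is a hashref; digest is empty for files whose size is
        # unique, because no digest is computed for them
        printf "%s  size=%d  count=%d  digest=%s\n",
            $row->{file}, $row->{size}, $row->{count}, $row->{digest} // '';
    }
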
928             =head1 HOMEPAGE
929              
930             Please visit the project's homepage at L<https://metacpan.org/release/App-UniqFiles>.
931              
932             =head1 SOURCE
933              
934             Source repository is at L<https://github.com/perlancar/perl-App-UniqFiles>.
935              
936             =head1 SEE ALSO
937              
938             L<find-duplicate-filenames> from L<App::FindUtils>
939              
940             L<move-duplicate-files-to> from L<App::DuplicateFilesUtils>, which is basically
941             a shortcut for C<< uniq-files -D -R . | while read f; do mv "$f" SOMEDIR/; done
942             >>.
943              
944             =head1 AUTHOR
945              
946             perlancar <perlancar@cpan.org>
947              
948             =head1 CONTRIBUTOR
949              
950             =for stopwords Steven Haryanto
951              
952             Steven Haryanto <stevenharyanto@gmail.com>
953              
954             =head1 CONTRIBUTING
955              
956              
957             To contribute, you can send patches by email/via RT, or send pull requests on
958             GitHub.
959              
960             Most of the time, you don't need to build the distribution yourself. You can
961             simply modify the code, then test via:
962              
963             % prove -l
964              
965             If you want to build the distribution (e.g. to try to install it locally on your
966             system), you can install L<Dist::Zilla>,
967             L<Dist::Zilla::PluginBundle::Author::PERLANCAR>,
968             L<Pod::Weaver::PluginBundle::Author::PERLANCAR>, and sometimes one or two other
969             Dist::Zilla- and/or Pod::Weaver plugins. Any additional steps required beyond
970             that are considered a bug and can be reported to me.
971              
972             =head1 COPYRIGHT AND LICENSE
973              
974             This software is copyright (c) 2023, 2022, 2020, 2019, 2017, 2015, 2014, 2012, 2011 by perlancar <perlancar@cpan.org>.
975              
976             This is free software; you can redistribute it and/or modify it under
977             the same terms as the Perl 5 programming language system itself.
978              
979             =head1 BUGS
980              
981             Please report any bugs or feature requests on the bugtracker website L<https://rt.cpan.org/Public/Dist/Display.html?Name=App-UniqFiles>
982              
983             When submitting a bug or request, please include a test-file or a
984             patch to an existing test-file that illustrates the bug or desired
985             feature.
986              
987             =cut