File Coverage

blib/lib/App/CSVUtils/csv_setop.pm

Criterion	Covered	Total	%
statement	11	11	100.0
branch			n/a
condition			n/a
subroutine	4	4	100.0
pod			n/a
total	15	15	100.0

line	stmt	sub	time	code
1				package App::CSVUtils::csv_setop;
2
3	1	1	4886	use 5.010001;
	1		3
4	1	1	6	use strict;
	1		1
	1		21
5	1	1	4	use warnings;
	1		3
	1		66
6
7				our $AUTHORITY = 'cpan:PERLANCAR'; # AUTHORITY
8				our $DATE = '2023-04-01'; # DATE
9				our $DIST = 'App-CSVUtils'; # DIST
10				our $VERSION = '1.024'; # VERSION
11
12	1		1707	use App::CSVUtils qw(
13				gen_csv_util
14	1	1	5	);
	1		2
15
16				gen_csv_util(
17				name => 'csv_setop',
18				summary => 'Set operation (union/unique concatenation of rows, intersection/common rows, difference of rows) against several CSV files',
19				description => <<'_',
20
21				This utility lets you perform one of several set options against several CSV
22				files:
23				- union
24				- intersection
25				- difference
26				- symmetric difference
27
28				Example input:
29
30				# file1.csv
31				a,b,c
32				1,2,3
33				4,5,6
34				7,8,9
35
36				# file2.csv
37				a,b,c
38				1,2,3
39				4,5,7
40				7,8,9
41
42				Output of intersection (`--intersect file1.csv file2.csv`), which will return
43				common rows between the two files:
44
45				a,b,c
46				1,2,3
47				7,8,9
48
49				Output of union (`--union file1.csv file2.csv`), which will return all rows with
50				duplicate removed:
51
52				a,b,c
53				1,2,3
54				4,5,6
55				4,5,7
56				7,8,9
57
58				Output of difference (`--diff file1.csv file2.csv`), which will return all rows
59				in the first file but not in the second:
60
61				a,b,c
62				4,5,6
63
64				Output of symmetric difference (`--symdiff file1.csv file2.csv`), which will
65				return all rows in the first file not in the second, as well as rows in the
66				second not in the first:
67
68				a,b,c
69				4,5,6
70				4,5,7
71
72				You can specify `--compare-fields` to only consider some fields only, for
73				example `--union --compare-fields a,b file1.csv file2.csv`:
74
75				a,b,c
76				1,2,3
77				4,5,6
78				7,8,9
79
80				Each field specified in `--compare-fields` can be specified using
81				`F1:OTHER1,F2:OTHER2,...` format to refer to different field names or indexes in
82				each file, for example if `file3.csv` is:
83
84				# file3.csv
85				Ei,Si,Bi
86				1,3,2
87				4,7,5
88				7,9,8
89
90				Then `--union --compare-fields a:Ei,b:Bi file1.csv file3.csv` will result in:
91
92				a,b,c
93				1,2,3
94				4,5,6
95				7,8,9
96
97				Finally you can print out only certain fields using `--result-fields`.
98
99				_
100				add_args => {
101				op => {
102				summary => 'Set operation to perform',
103				schema => ['str*', in=>[qw/intersect union diff symdiff/]],
104				req => 1,
105				cmdline_aliases => {
106				intersect => {is_flag=>1, summary=>'Shortcut for --op=intersect', code=>sub{ $_[0]{op} = 'intersect' }},
107				union => {is_flag=>1, summary=>'Shortcut for --op=union' , code=>sub{ $_[0]{op} = 'union' }},
108				diff => {is_flag=>1, summary=>'Shortcut for --op=diff' , code=>sub{ $_[0]{op} = 'diff' }},
109				symdiff => {is_flag=>1, summary=>'Shortcut for --op=symdiff' , code=>sub{ $_[0]{op} = 'symdiff' }},
110				},
111				},
112				ignore_case => {
113				schema => 'bool*',
114				cmdline_aliases => {i=>{}},
115				},
116				compare_fields => {
117				schema => ['str*'],
118				},
119				result_fields => {
120				schema => ['str*'],
121				},
122				},
123
124				links => [
125				{url=>'prog:setop'},
126				],
127
128				reads_multiple_csv => 1,
129
130				tags => ['category:combining', 'set'],
131
132				on_begin => sub {
133				my $r = shift;
134
135				# check arguments
136				die [400, "Please specify at least 2 files"]
137				unless @{ $r->{util_args}{input_filenames} } >= 2;
138
139				# these are the keys we add to the stash
140				$r->{all_input_data_rows} = []; # array of all data rows, one elem for each input file
141				$r->{all_input_fields} = []; # array of input_fields, one elem for each input file
142				$r->{all_input_fields_idx} = []; # array of input_fields_idx, one elem for each input file
143				},
144
145				on_input_header_row => sub {
146				my $r = shift;
147
148				$r->{all_input_fields} [ $r->{input_filenum}-1 ] = $r->{input_fields};
149				$r->{all_input_fields_idx}[ $r->{input_filenum}-1 ] = $r->{input_fields_idx};
150				$r->{all_input_data_rows} [ $r->{input_filenum}-1 ] = [];
151				},
152
153				on_input_data_row => sub {
154				my $r = shift;
155
156				push @{ $r->{all_input_data_rows}[ $r->{input_filenum}-1 ] },
157				$r->{input_row};
158				},
159
160				after_close_input_files => sub {
161				require Tie::IxHash;
162
163				my $r = shift;
164
165				my $op = $r->{util_args}{op};
166				my $ci = $r->{util_args}{ignore_case};
167				my $num_files = @{ $r->{util_args}{input_filenames} };
168
169				my @compare_fields; # elem = [fieldname-for-file1, fieldname-for-file2, ...]
170				if (defined $r->{util_args}{compare_fields}) {
171				my @ff = ref($r->{util_args}{compare_fields}) eq 'ARRAY' ?
172				@{$r->{util_args}{compare_fields}} : split(/,/, $r->{util_args}{compare_fields});
173				for my $field_idx (0..$#ff) {
174				my @ff2 = split /:/, $ff[$field_idx];
175				for (@ff2+1 .. $num_files) {
176				push @ff2, $ff2[0];
177				}
178				$compare_fields[$field_idx] = \@ff2;
179				}
180				# XXX check that specified fields exist
181				} else {
182				for my $field_idx (0..$#{ $r->{all_input_fields}[0] }) {
183				$compare_fields[$field_idx] = [
184				map { $r->{all_input_fields}[0][$field_idx] } 0..$num_files-1];
185				}
186				}
187
188				my @result_fields; # elem = fieldname, ...
189				if (defined $r->{util_args}{result_fields}) {
190				@result_fields = ref($r->{util_args}{result_fields}) eq 'ARRAY' ?
191				@{$r->{util_args}{result_fields}} : split(/,/, $r->{util_args}{result_fields});
192				# XXX check that specified fields exist
193				} else {
194				@result_fields = @{ $r->{all_input_fields}[0] };
195				}
196				$r->{output_fields} = \@result_fields;
197
198				tie my(%res), 'Tie::IxHash';
199
200				my $code_get_compare_key = sub {
201				my ($file_idx, $row_idx) = @_;
202				my $row = $r->{all_input_data_rows}[$file_idx][$row_idx];
203				my $key = join "\|", map {
204				my $field = $compare_fields[$_][$file_idx];
205				my $field_idx = $r->{all_input_fields_idx}[$file_idx]{$field};
206				my $val = defined $field_idx ? $row->[$field_idx] : "";
207				$val = uc $val if $ci;
208				$val;
209				} 0..$#compare_fields;
210				#say "D:compare_key($file_idx, $row_idx)=<$key>";
211				$key;
212				};
213
214				my $code_print_result_row = sub {
215				my ($file_idx, $row) = @_;
216				my @res_row = map {
217				my $field = $result_fields[$_];
218				my $field_idx = $r->{all_input_fields_idx}[$file_idx]{$field};
219				defined $field_idx ? $row->[$field_idx] : "";
220				} 0..$#result_fields;
221				$r->{code_print_row}->(\@res_row);
222				};
223
224				if ($op eq 'intersect') {
225				for my $file_idx (0..$num_files-1) {
226				if ($file_idx == 0) {
227				for my $row_idx (0..$#{ $r->{all_input_data_rows}[$file_idx] }) {
228				my $key = $code_get_compare_key->($file_idx, $row_idx);
229				$res{$key} //= [1, $row_idx]; # [num_of_occurrence, row_idx]
230				}
231				} else {
232				for my $row_idx (0..$#{ $r->{all_input_data_rows}[$file_idx] }) {
233				my $key = $code_get_compare_key->($file_idx, $row_idx);
234				if ($res{$key} && $res{$key}[0] == $file_idx) {
235				$res{$key}[0]++;
236				}
237				}
238				}
239
240				# print result
241				if ($file_idx == $num_files-1) {
242				for my $key (keys %res) {
243				$code_print_result_row->(
244				0, $r->{all_input_data_rows}[0][$res{$key}[1]])
245				if $res{$key}[0] == $num_files;
246				}
247				}
248				} # for file_idx
249
250				} elsif ($op eq 'union') {
251
252				for my $file_idx (0..$num_files-1) {
253				for my $row_idx (0..$#{ $r->{all_input_data_rows}[$file_idx] }) {
254				my $key = $code_get_compare_key->($file_idx, $row_idx);
255				next if $res{$key}++;
256				my $row = $r->{all_input_data_rows}[$file_idx][$row_idx];
257				$code_print_result_row->($file_idx, $row);
258				}
259				} # for file_idx
260
261				} elsif ($op eq 'diff') {
262
263				for my $file_idx (0..$num_files-1) {
264				if ($file_idx == 0) {
265				for my $row_idx (0..$#{ $r->{all_input_data_rows}[$file_idx] }) {
266				my $key = $code_get_compare_key->($file_idx, $row_idx);
267				$res{$key} //= [$file_idx, $row_idx];
268				}
269				} else {
270				for my $row_idx (0..$#{ $r->{all_input_data_rows}[$file_idx] }) {
271				my $key = $code_get_compare_key->($file_idx, $row_idx);
272				delete $res{$key};
273				}
274				}
275
276				# print result
277				if ($file_idx == $num_files-1) {
278				for my $key (keys %res) {
279				my ($file_idx, $row_idx) = @{ $res{$key} };
280				$code_print_result_row->(
281				0, $r->{all_input_data_rows}[$file_idx][$row_idx]);
282				}
283				}
284				} # for file_idx
285
286				} elsif ($op eq 'symdiff') {
287
288				for my $file_idx (0..$num_files-1) {
289				if ($file_idx == 0) {
290				for my $row_idx (0..$#{ $r->{all_input_data_rows}[$file_idx] }) {
291				my $key = $code_get_compare_key->($file_idx, $row_idx);
292				$res{$key} //= [1, $file_idx, $row_idx]; # [num_of_occurrence, file_idx, row_idx]
293				}
294				} else {
295				for my $row_idx (0..$#{ $r->{all_input_data_rows}[$file_idx] }) {
296				my $key = $code_get_compare_key->($file_idx, $row_idx);
297				if (!$res{$key}) {
298				$res{$key} = [1, $file_idx, $row_idx];
299				} else {
300				$res{$key}[0]++;
301				}
302				}
303				}
304
305				# print result
306				if ($file_idx == $num_files-1) {
307				for my $key (keys %res) {
308				my ($num_occur, $file_idx, $row_idx) = @{ $res{$key} };
309				$code_print_result_row->(
310				0, $r->{all_input_data_rows}[$file_idx][$row_idx])
311				if $num_occur == 1;
312				}
313				}
314				} # for file_idx
315
316				} else {
317
318				die [400, "Unknown/unimplemented op '$op'"];
319
320				}
321
322				#use DD; dd +{
323				# compare_fields => \@compare_fields,
324				# result_fields => \@result_fields,
325				# all_input_data_rows=>$r->{all_input_data_rows},
326				# all_input_fields=>$r->{all_input_fields},
327				# all_input_fields_idx=>$r->{all_input_fields_idx},
328				#};
329				},
330				);
331
332				1;
333				# ABSTRACT: Set operation (union/unique concatenation of rows, intersection/common rows, difference of rows) against several CSV files
334
335				__END__
336
337				=pod
338
339				=encoding UTF-8
340
341				=head1 NAME
342
343				App::CSVUtils::csv_setop - Set operation (union/unique concatenation of rows, intersection/common rows, difference of rows) against several CSV files
344
345				=head1 VERSION
346
347				This document describes version 1.024 of App::CSVUtils::csv_setop (from Perl distribution App-CSVUtils), released on 2023-04-01.
348
349				=head1 FUNCTIONS
350
351
352				=head2 csv_setop
353
354				Usage:
355
356				csv_setop(%args) -> [$status_code, $reason, $payload, \%result_meta]
357
358				Set operation (unionE<sol>unique concatenation of rows, intersectionE<sol>common rows, difference of rows) against several CSV files.
359
360				This utility lets you perform one of several set options against several CSV
361				files:
362				- union
363				- intersection
364				- difference
365				- symmetric difference
366
367				Example input:
368
369				# file1.csv
370				a,b,c
371				1,2,3
372				4,5,6
373				7,8,9
374
375				# file2.csv
376				a,b,c
377				1,2,3
378				4,5,7
379				7,8,9
380
381				Output of intersection (C<--intersect file1.csv file2.csv>), which will return
382				common rows between the two files:
383
384				a,b,c
385				1,2,3
386				7,8,9
387
388				Output of union (C<--union file1.csv file2.csv>), which will return all rows with
389				duplicate removed:
390
391				a,b,c
392				1,2,3
393				4,5,6
394				4,5,7
395				7,8,9
396
397				Output of difference (C<--diff file1.csv file2.csv>), which will return all rows
398				in the first file but not in the second:
399
400				a,b,c
401				4,5,6
402
403				Output of symmetric difference (C<--symdiff file1.csv file2.csv>), which will
404				return all rows in the first file not in the second, as well as rows in the
405				second not in the first:
406
407				a,b,c
408				4,5,6
409				4,5,7
410
411				You can specify C<--compare-fields> to only consider some fields only, for
412				example C<--union --compare-fields a,b file1.csv file2.csv>:
413
414				a,b,c
415				1,2,3
416				4,5,6
417				7,8,9
418
419				Each field specified in C<--compare-fields> can be specified using
420				C<F1:OTHER1,F2:OTHER2,...> format to refer to different field names or indexes in
421				each file, for example if C<file3.csv> is:
422
423				# file3.csv
424				Ei,Si,Bi
425				1,3,2
426				4,7,5
427				7,9,8
428
429				Then C<--union --compare-fields a:Ei,b:Bi file1.csv file3.csv> will result in:
430
431				a,b,c
432				1,2,3
433				4,5,6
434				7,8,9
435
436				Finally you can print out only certain fields using C<--result-fields>.
437
438				This function is not exported.
439
440				Arguments ('*' denotes required arguments):
441
442				=over 4
443
444				=item * B<compare_fields> => I<str>
445
446				(No description)
447
448				=item * B<ignore_case> => I<bool>
449
450				(No description)
451
452				=item * B<inplace> => I<true>
453
454				Output to the same file as input.
455
456				Normally, you output to a different file than input. If you try to output to the
457				same file (C<-o INPUT.csv -O>) you will clobber the input file; thus the utility
458				prevents you from doing it. However, with this C<--inplace> option, you can
459				output to the same file. Like perl's C<-i> option, this will first output to a
460				temporary file in the same directory as the input file then rename to the final
461				file at the end. You cannot specify output file (C<-o>) when using this option,
462				but you can specify backup extension with C<-b> option.
463
464				Some caveats:
465
466				=over
467
468				=item * if input file is a symbolic link, it will be replaced with a regular file;
469
470				=item * renaming (implemented using C<rename()>) can fail if input filename is too long;
471
472				=item * value specified in C<-b> is currently not checked for acceptable characters;
473
474				=item * things can also fail if permissions are restrictive;
475
476				=back
477
478				=item * B<inplace_backup_ext> => I<str> (default: "")
479
480				Extension to add for backup of input file.
481
482				In inplace mode (C<--inplace>), if this option is set to a non-empty string, will
483				rename the input file using this extension as a backup. The old existing backup
484				will be overwritten, if any.
485
486				=item * B<input_escape_char> => I<str>
487
488				Specify character to escape value in field in input CSV, will be passed to Text::CSV_XS.
489
490				Defaults to C<\\> (backslash). Overrides C<--input-tsv> option.
491
492				=item * B<input_filenames> => I<array[filename]> (default: ["-"])
493
494				Input CSV files.
495
496				Use C<-> to read from stdin.
497
498				Encoding of input file is assumed to be UTF-8.
499
500				=item * B<input_header> => I<bool> (default: 1)
501
502				Specify whether input CSV has a header row.
503
504				By default, the first row of the input CSV will be assumed to contain field
505				names (and the second row contains the first data row). When you declare that
506				input CSV does not have header row (C<--no-input-header>), the first row of the
507				CSV is assumed to contain the first data row. Fields will be named C<field1>,
508				C<field2>, and so on.
509
510				=item * B<input_quote_char> => I<str>
511
512				Specify field quote character in input CSV, will be passed to Text::CSV_XS.
513
514				Defaults to C<"> (double quote). Overrides C<--input-tsv> option.
515
516				=item * B<input_sep_char> => I<str>
517
518				Specify field separator character in input CSV, will be passed to Text::CSV_XS.
519
520				Defaults to C<,> (comma). Overrides C<--input-tsv> option.
521
522				=item * B<input_tsv> => I<true>
523
524				Inform that input file is in TSV (tab-separated) format instead of CSV.
525
526				Overriden by C<--input-sep-char>, C<--input-quote-char>, C<--input-escape-char>
527				options. If one of those options is specified, then C<--input-tsv> will be
528				ignored.
529
530				=item * B<op>* => I<str>
531
532				Set operation to perform.
533
534				=item * B<output_always_quote> => I<bool> (default: 0)
535
536				Whether to always quote values.
537
538				When set to false (the default), values are quoted only when necessary:
539
540				field1,field2,"field three contains comma (,)",field4
541
542				When set to true, then all values will be quoted:
543
544				"field1","field2","field three contains comma (,)","field4"
545
546				=item * B<output_escape_char> => I<str>
547
548				Specify character to escape value in field in output CSV, will be passed to Text::CSV_XS.
549
550				This is like C<--input-escape-char> option but for output instead of input.
551
552				Defaults to C<\\> (backslash). Overrides C<--output-tsv> option.
553
554				=item * B<output_filename> => I<filename>
555
556				Output filename.
557
558				Use C<-> to output to stdout (the default if you don't specify this option).
559
560				Encoding of output file is assumed to be UTF-8.
561
562				=item * B<output_header> => I<bool>
563
564				Whether output CSV should have a header row.
565
566				By default, a header row will be output I<if> input CSV has header row. Under
567				C<--output-header>, a header row will be output even if input CSV does not have
568				header row (value will be something like "col0,col1,..."). Under
569				C<--no-output-header>, header row will I<not> be printed even if input CSV has
570				header row. So this option can be used to unconditionally add or remove header
571				row.
572
573				=item * B<output_quote_char> => I<str>
574
575				Specify field quote character in output CSV, will be passed to Text::CSV_XS.
576
577				This is like C<--input-quote-char> option but for output instead of input.
578
579				Defaults to C<"> (double quote). Overrides C<--output-tsv> option.
580
581				=item * B<output_quote_empty> => I<bool> (default: 0)
582
583				Whether to quote empty values.
584
585				When set to false (the default), empty values are not quoted:
586
587				field1,field2,,field4
588
589				When set to true, then empty values will be quoted:
590
591				field1,field2,"",field4
592
593				=item * B<output_sep_char> => I<str>
594
595				Specify field separator character in output CSV, will be passed to Text::CSV_XS.
596
597				This is like C<--input-sep-char> option but for output instead of input.
598
599				Defaults to C<,> (comma). Overrides C<--output-tsv> option.
600
601				=item * B<output_tsv> => I<bool>
602
603				Inform that output file is TSV (tab-separated) format instead of CSV.
604
605				This is like C<--input-tsv> option but for output instead of input.
606
607				Overriden by C<--output-sep-char>, C<--output-quote-char>, C<--output-escape-char>
608				options. If one of those options is specified, then C<--output-tsv> will be
609				ignored.
610
611				=item * B<overwrite> => I<bool>
612
613				Whether to override existing output file.
614
615				=item * B<result_fields> => I<str>
616
617				(No description)
618
619
620				=back
621
622				Returns an enveloped result (an array).
623
624				First element ($status_code) is an integer containing HTTP-like status code
625				(200 means OK, 4xx caller error, 5xx function error). Second element
626				($reason) is a string containing error message, or something like "OK" if status is
627				200. Third element ($payload) is the actual result, but usually not present when enveloped result is an error response ($status_code is not 2xx). Fourth
628				element (%result_meta) is called result metadata and is optional, a hash
629				that contains extra information, much like how HTTP response headers provide additional metadata.
630
631				Return value: (any)
632
633				=head1 HOMEPAGE
634
635				Please visit the project's homepage at L<https://metacpan.org/release/App-CSVUtils>.
636
637				=head1 SOURCE
638
639				Source repository is at L<https://github.com/perlancar/perl-App-CSVUtils>.
640
641				=head1 AUTHOR
642
643				perlancar <perlancar@cpan.org>
644
645				=head1 CONTRIBUTING
646
647
648				To contribute, you can send patches by email/via RT, or send pull requests on
649				GitHub.
650
651				Most of the time, you don't need to build the distribution yourself. You can
652				simply modify the code, then test via:
653
654				% prove -l
655
656				If you want to build the distribution (e.g. to try to install it locally on your
657				system), you can install L<Dist::Zilla>,
658				L<Dist::Zilla::PluginBundle::Author::PERLANCAR>,
659				L<Pod::Weaver::PluginBundle::Author::PERLANCAR>, and sometimes one or two other
660				Dist::Zilla- and/or Pod::Weaver plugins. Any additional steps required beyond
661				that are considered a bug and can be reported to me.
662
663				=head1 COPYRIGHT AND LICENSE
664
665				This software is copyright (c) 2023, 2022, 2021, 2020, 2019, 2018, 2017, 2016 by perlancar <perlancar@cpan.org>.
666
667				This is free software; you can redistribute it and/or modify it under
668				the same terms as the Perl 5 programming language system itself.
669
670				=head1 BUGS
671
672				Please report any bugs or feature requests on the bugtracker website L<https://rt.cpan.org/Public/Dist/Display.html?Name=App-CSVUtils>
673
674				When submitting a bug or request, please include a test-file or a
675				patch to an existing test-file that illustrates the bug or desired
676				feature.
677
678				=cut