File Coverage

blib/lib/App/CSVUtils/csv_concat.pm

Criterion	Covered	Total	%
statement	14	14	100.0
branch			n/a
condition			n/a
subroutine	5	5	100.0
pod			n/a
total	19	19	100.0

line	stmt	sub	time	code
1				package App::CSVUtils::csv_concat;
2
3	1	1	4169	use 5.010001;
	1		4
4	1	1	6	use strict;
	1		1
	1		27
5	1	1	4	use warnings;
	1		3
	1		22
6	1	1	12	use Log::ger;
	1		2
	1		32
7
8				our $AUTHORITY = 'cpan:PERLANCAR'; # AUTHORITY
9				our $DATE = '2023-03-31'; # DATE
10				our $DIST = 'App-CSVUtils'; # DIST
11				our $VERSION = '1.023'; # VERSION
12
13	1		524	use App::CSVUtils qw(
14				gen_csv_util
15	1	1	268	);
	1		1
16				# Declare the csv_concat utility. The hooks below first collect the header row and filehandle of every input file, then read the data rows themselves and re-emit them under the combined field list.
17				gen_csv_util(
18				name => 'csv_concat',
19				summary => 'Concatenate several CSV files together, '.
20				'collecting all the fields',
21				description => <<'_',
22
23				Example, concatenating this CSV:
24
25				col1,col2
26				1,2
27				3,4
28
29				and:
30
31				col2,col4
32				a,b
33				c,d
34				e,f
35
36				and:
37
38				col3
39				X
40				Y
41
42				will result in:
43
44				col1,col2,col4,col3
45				1,2,,
46				3,4,,
47				,a,b,
48				,c,d,
49				,e,f,
50				,,,X
51				,,,Y
52
53				Keywords: join, merge
54
55				_
56				add_args => {
57				%App::CSVUtils::argspecopt_with_data_rows,
58				},
59				tags => ['category:combining', 'join', 'merge'],
60
61				reads_multiple_csv => 1,
62				# Hook (named before_open_input_files): set up the accumulators used by the later hooks.
63				before_open_input_files => sub {
64				my $r = shift;
65
66				# these keys are added to the stash ($r) by this utility
67				$r->{all_input_fields} = []; # one arrayref of field names per input file, in input order
68				$r->{all_input_fh} = []; # the filehandle of each input file, same order
69				},
70				# Hook (named on_input_header_row): record each file's header and handle.
71				on_input_header_row => sub {
72				my $r = shift;
73
74				# Once the header row of an input file has been read, remember its
75				# field list and its filehandle so the data rows can be read later.
76				# Rows cannot be printed yet: the output header is the union of the
77				# fields of all files, which is only known after every header is seen.
78
79				push @{ $r->{all_input_fields} }, $r->{input_fields};
80				push @{ $r->{all_input_fh} }, $r->{input_fh};
81				$r->{wants_skip_file}++; # NOTE(review): presumably asks the framework not to read this file's data rows now; confirm in App::CSVUtils
82				},
83				# Hook (named after_close_input_files): build the combined header, then read and print every data row.
84				after_close_input_files => sub {
85				my $r = shift;
86
87				# build the output field list: union of all input fields, in order of first appearance
88				$r->{output_fields} = [];
89				$r->{output_fields_idx} = {}; # field name => column index in output_fields
90				for my $i (0 .. $#{ $r->{all_input_fields} }) {
91				my $input_fields = $r->{all_input_fields}[$i];
92				for my $j (0 .. $#{ $input_fields }) {
93				my $field = $input_fields->[$j];
94				unless (exists $r->{output_fields_idx}{$field}) { # hash lookup instead of scanning output_fields; the index holds exactly the fields added so far
95				push @{ $r->{output_fields} }, $field;
96				$r->{output_fields_idx}{$field} = $#{ $r->{output_fields} };
97				}
98				}
99				}
100
101				# read the data rows of each file in input order and print them re-mapped to the output columns
102				my $csv = $r->{input_parser};
103				for my $i (0 .. $#{ $r->{all_input_fh} }) {
104				log_trace "[%d/%d] Adding rows from file #%d ...",
105				$i+1, scalar(@{$r->{all_input_fh}}), $i+1;
106				my $fh = $r->{all_input_fh}[$i];
107				my $input_fields = $r->{all_input_fields}[$i];
108				while (my $row = $csv->getline($fh)) {
109				my $combined_row = [("") x @{ $r->{output_fields} }]; # start from an all-empty row as wide as the combined header
110				for my $j (0 .. $#{ $input_fields }) {
111				my $field = $input_fields->[$j];
112				$combined_row->[ $r->{output_fields_idx}{$field} ] = $row->[$j]; # place the value under its field's output column
113				}
114				$r->{code_print_row}->($combined_row); # row printer taken from the stash
115				}
116				} # for all input fh
117				},
118				);
119
120				1;
121				# ABSTRACT: Concatenate several CSV files together, collecting all the fields
122
123				__END__
124
125				=pod
126
127				=encoding UTF-8
128
129				=head1 NAME
130
131				App::CSVUtils::csv_concat - Concatenate several CSV files together, collecting all the fields
132
133				=head1 VERSION
134
135				This document describes version 1.023 of App::CSVUtils::csv_concat (from Perl distribution App-CSVUtils), released on 2023-03-31.
136
137				=head1 FUNCTIONS
138
139
140				=head2 csv_concat
141
142				Usage:
143
144				csv_concat(%args) -> [$status_code, $reason, $payload, \%result_meta]
145
146				Concatenate several CSV files together, collecting all the fields.
147
148				Example, concatenating this CSV:
149
150				col1,col2
151				1,2
152				3,4
153
154				and:
155
156				col2,col4
157				a,b
158				c,d
159				e,f
160
161				and:
162
163				col3
164				X
165				Y
166
167				will result in:
168
169				col1,col2,col4,col3
170				1,2,,
171				3,4,,
172				,a,b,
173				,c,d,
174				,e,f,
175				,,,X
176				,,,Y
177
178				Keywords: join, merge
179
180				This function is not exported.
181
182				Arguments ('*' denotes required arguments):
183
184				=over 4
185
186				=item * B<inplace> => I<true>
187
188				Output to the same file as input.
189
190				Normally, you output to a different file than input. If you try to output to the
191				same file (C<-o INPUT.csv -O>) you will clobber the input file; thus the utility
192				prevents you from doing it. However, with this C<--inplace> option, you can
193				output to the same file. Like perl's C<-i> option, this will first output to a
194				temporary file in the same directory as the input file then rename to the final
195				file at the end. You cannot specify output file (C<-o>) when using this option,
196				but you can specify backup extension with C<-b> option.
197
198				Some caveats:
199
200				=over
201
202				=item * if input file is a symbolic link, it will be replaced with a regular file;
203
204				=item * renaming (implemented using C<rename()>) can fail if input filename is too long;
205
206				=item * value specified in C<-b> is currently not checked for acceptable characters;
207
208				=item * things can also fail if permissions are restrictive;
209
210				=back
211
212				=item * B<inplace_backup_ext> => I<str> (default: "")
213
214				Extension to add for backup of input file.
215
216				In inplace mode (C<--inplace>), if this option is set to a non-empty string, will
217				rename the input file using this extension as a backup. The old existing backup
218				will be overwritten, if any.
219
220				=item * B<input_escape_char> => I<str>
221
222				Specify character to escape value in field in input CSV, will be passed to Text::CSV_XS.
223
224				Defaults to C<\\> (backslash). Overrides C<--input-tsv> option.
225
226				=item * B<input_filenames> => I<array[filename]> (default: ["-"])
227
228				Input CSV files.
229
230				Use C<-> to read from stdin.
231
232				Encoding of input file is assumed to be UTF-8.
233
234				=item * B<input_header> => I<bool> (default: 1)
235
236				Specify whether input CSV has a header row.
237
238				By default, the first row of the input CSV will be assumed to contain field
239				names (and the second row contains the first data row). When you declare that
240				input CSV does not have header row (C<--no-input-header>), the first row of the
241				CSV is assumed to contain the first data row. Fields will be named C<field1>,
242				C<field2>, and so on.
243
244				=item * B<input_quote_char> => I<str>
245
246				Specify field quote character in input CSV, will be passed to Text::CSV_XS.
247
248				Defaults to C<"> (double quote). Overrides C<--input-tsv> option.
249
250				=item * B<input_sep_char> => I<str>
251
252				Specify field separator character in input CSV, will be passed to Text::CSV_XS.
253
254				Defaults to C<,> (comma). Overrides C<--input-tsv> option.
255
256				=item * B<input_tsv> => I<true>
257
258				Inform that input file is in TSV (tab-separated) format instead of CSV.
259
260				Overridden by C<--input-sep-char>, C<--input-quote-char>, C<--input-escape-char>
261				options. If one of those options is specified, then C<--input-tsv> will be
262				ignored.
263
264				=item * B<output_always_quote> => I<bool> (default: 0)
265
266				Whether to always quote values.
267
268				When set to false (the default), values are quoted only when necessary:
269
270				field1,field2,"field three contains comma (,)",field4
271
272				When set to true, then all values will be quoted:
273
274				"field1","field2","field three contains comma (,)","field4"
275
276				=item * B<output_escape_char> => I<str>
277
278				Specify character to escape value in field in output CSV, will be passed to Text::CSV_XS.
279
280				This is like C<--input-escape-char> option but for output instead of input.
281
282				Defaults to C<\\> (backslash). Overrides C<--output-tsv> option.
283
284				=item * B<output_filename> => I<filename>
285
286				Output filename.
287
288				Use C<-> to output to stdout (the default if you don't specify this option).
289
290				Encoding of output file is assumed to be UTF-8.
291
292				=item * B<output_header> => I<bool>
293
294				Whether output CSV should have a header row.
295
296				By default, a header row will be output I<if> input CSV has header row. Under
297				C<--output-header>, a header row will be output even if input CSV does not have
298				header row (value will be something like "col0,col1,..."). Under
299				C<--no-output-header>, header row will I<not> be printed even if input CSV has
300				header row. So this option can be used to unconditionally add or remove header
301				row.
302
303				=item * B<output_quote_char> => I<str>
304
305				Specify field quote character in output CSV, will be passed to Text::CSV_XS.
306
307				This is like C<--input-quote-char> option but for output instead of input.
308
309				Defaults to C<"> (double quote). Overrides C<--output-tsv> option.
310
311				=item * B<output_quote_empty> => I<bool> (default: 0)
312
313				Whether to quote empty values.
314
315				When set to false (the default), empty values are not quoted:
316
317				field1,field2,,field4
318
319				When set to true, then empty values will be quoted:
320
321				field1,field2,"",field4
322
323				=item * B<output_sep_char> => I<str>
324
325				Specify field separator character in output CSV, will be passed to Text::CSV_XS.
326
327				This is like C<--input-sep-char> option but for output instead of input.
328
329				Defaults to C<,> (comma). Overrides C<--output-tsv> option.
330
331				=item * B<output_tsv> => I<bool>
332
333				Inform that output file is TSV (tab-separated) format instead of CSV.
334
335				This is like C<--input-tsv> option but for output instead of input.
336
337				Overridden by C<--output-sep-char>, C<--output-quote-char>, C<--output-escape-char>
338				options. If one of those options is specified, then C<--output-tsv> will be
339				ignored.
340
341				=item * B<overwrite> => I<bool>
342
343				Whether to overwrite existing output file.
344
345				=item * B<with_data_rows> => I<bool>
346
347				Whether to also output data rows.
348
349
350				=back
351
352				Returns an enveloped result (an array).
353
354				First element ($status_code) is an integer containing HTTP-like status code
355				(200 means OK, 4xx caller error, 5xx function error). Second element
356				($reason) is a string containing error message, or something like "OK" if status is
357				200. Third element ($payload) is the actual result, but usually not present when enveloped result is an error response ($status_code is not 2xx). Fourth
358				element (%result_meta) is called result metadata and is optional, a hash
359				that contains extra information, much like how HTTP response headers provide additional metadata.
360
361				Return value: (any)
362
363				=head1 IMPLEMENTATION NOTES
364
365				We first read only the header rows for all input files, while collecting the
366				input filehandles. Then we read the data rows of all the files ourselves.
367
368				=head1 HOMEPAGE
369
370				Please visit the project's homepage at L<https://metacpan.org/release/App-CSVUtils>.
371
372				=head1 SOURCE
373
374				Source repository is at L<https://github.com/perlancar/perl-App-CSVUtils>.
375
376				=head1 AUTHOR
377
378				perlancar <perlancar@cpan.org>
379
380				=head1 CONTRIBUTING
381
382
383				To contribute, you can send patches by email/via RT, or send pull requests on
384				GitHub.
385
386				Most of the time, you don't need to build the distribution yourself. You can
387				simply modify the code, then test via:
388
389				% prove -l
390
391				If you want to build the distribution (e.g. to try to install it locally on your
392				system), you can install L<Dist::Zilla>,
393				L<Dist::Zilla::PluginBundle::Author::PERLANCAR>,
394				L<Pod::Weaver::PluginBundle::Author::PERLANCAR>, and sometimes one or two other
395				Dist::Zilla- and/or Pod::Weaver plugins. Any additional steps required beyond
396				that are considered a bug and can be reported to me.
397
398				=head1 COPYRIGHT AND LICENSE
399
400				This software is copyright (c) 2023, 2022, 2021, 2020, 2019, 2018, 2017, 2016 by perlancar <perlancar@cpan.org>.
401
402				This is free software; you can redistribute it and/or modify it under
403				the same terms as the Perl 5 programming language system itself.
404
405				=head1 BUGS
406
407				Please report any bugs or feature requests on the bugtracker website L<https://rt.cpan.org/Public/Dist/Display.html?Name=App-CSVUtils>
408
409				When submitting a bug or request, please include a test-file or a
410				patch to an existing test-file that illustrates the bug or desired
411				feature.
412
413				=cut