File Coverage

blib/lib/App/CSVUtils/csv_concat.pm
Criterion Covered Total %
statement 14 14 100.0
branch n/a
condition n/a
subroutine 5 5 100.0
pod n/a
total 19 19 100.0


line stmt bran cond sub pod time code
1             package App::CSVUtils::csv_concat;
2              
3 1     1   4169 use 5.010001;
  1         4  
4 1     1   6 use strict;
  1         1  
  1         27  
5 1     1   4 use warnings;
  1         3  
  1         22  
6 1     1   12 use Log::ger;
  1         2  
  1         32  
7              
8             our $AUTHORITY = 'cpan:PERLANCAR'; # AUTHORITY
9             our $DATE = '2023-03-31'; # DATE
10             our $DIST = 'App-CSVUtils'; # DIST
11             our $VERSION = '1.023'; # VERSION
12              
13 1         524 use App::CSVUtils qw(
14             gen_csv_util
15 1     1   268 );
  1         1  
16              
17             gen_csv_util( # declare the csv_concat utility: metadata plus the hook callbacks below
18             name => 'csv_concat',
19             summary => 'Concatenate several CSV files together, '.
20             'collecting all the fields',
21             description => <<'_',
22              
23             Example, concatenating this CSV:
24              
25             col1,col2
26             1,2
27             3,4
28              
29             and:
30              
31             col2,col4
32             a,b
33             c,d
34             e,f
35              
36             and:
37              
38             col3
39             X
40             Y
41              
42             will result in:
43              
44             col1,col2,col4,col3
45             1,2,,
46             3,4,,
47             ,a,b,
48             ,c,d,
49             ,e,f,
50             ,,,X
51             ,,,Y
52              
53             Keywords: join, merge
54              
55             _
56             add_args => {
57             %App::CSVUtils::argspecopt_with_data_rows, # NOTE(review): with_data_rows is accepted but never read in this block -- confirm intent
58             },
59             tags => ['category:combining', 'join', 'merge'],
60              
61             reads_multiple_csv => 1,
62             # hook: create the two stash arrays that on_input_header_row fills in
63             before_open_input_files => sub {
64             my $r = shift;
65              
66             # per-file header fields and filehandles, indexed by input file order
67             $r->{all_input_fields} = [];
68             $r->{all_input_fh} = [];
69             },
70             # hook: called with the stash after a file's header row has been read
71             on_input_header_row => sub {
72             my $r = shift;
73              
74             # record this file's fields and its filehandle so the data rows can be
75             # read later by after_close_input_files. rows cannot be printed yet
76             # because the output header is the union of the fields of all files,
77             # which is only known once every header has been seen.
78              
79             push @{ $r->{all_input_fields} }, $r->{input_fields};
80             push @{ $r->{all_input_fh} }, $r->{input_fh};
81             $r->{wants_skip_file}++; # presumably stops the framework from reading this file's data rows itself -- confirm in App::CSVUtils
82             },
83             # hook: build the combined field list, then read and print every data row
84             after_close_input_files => sub {
85             my $r = shift;
86              
87             # collect all output fields, in order of first appearance across the files
88             $r->{output_fields} = [];
89             $r->{output_fields_idx} = {};
90             for my $i (0 .. $#{ $r->{all_input_fields} }) {
91             my $input_fields = $r->{all_input_fields}[$i];
92             for my $j (0 .. $#{ $input_fields }) {
93             my $field = $input_fields->[$j];
94             unless (exists $r->{output_fields_idx}{$field}) { # the index hash doubles as the "seen" set; avoids rescanning output_fields
95             push @{ $r->{output_fields} }, $field;
96             $r->{output_fields_idx}{$field} = $#{ $r->{output_fields} };
97             }
98             }
99             }
100              
101             # print all the data rows, file by file, each widened to the full field list
102             my $csv = $r->{input_parser};
103             for my $i (0 .. $#{ $r->{all_input_fh} }) {
104             log_trace "[%d/%d] Adding rows from file #%d ...",
105             $i+1, scalar(@{$r->{all_input_fh}}), $i+1;
106             my $fh = $r->{all_input_fh}[$i];
107             my $input_fields = $r->{all_input_fields}[$i];
108             while (my $row = $csv->getline($fh)) {
109             my $combined_row = [("") x @{ $r->{output_fields} }]; # columns this file lacks stay ""
110             for my $j (0 .. $#{ $input_fields }) {
111             my $field = $input_fields->[$j];
112             $combined_row->[ $r->{output_fields_idx}{$field} ] = $row->[$j] // ""; # a row shorter than its header yields "" too, not undef
113             }
114             $r->{code_print_row}->($combined_row);
115             }
116             } # for all input fh
117             },
118             );
119              
120             1;
121             # ABSTRACT: Concatenate several CSV files together, collecting all the fields
122              
123             __END__
124              
125             =pod
126              
127             =encoding UTF-8
128              
129             =head1 NAME
130              
131             App::CSVUtils::csv_concat - Concatenate several CSV files together, collecting all the fields
132              
133             =head1 VERSION
134              
135             This document describes version 1.023 of App::CSVUtils::csv_concat (from Perl distribution App-CSVUtils), released on 2023-03-31.
136              
137             =head1 FUNCTIONS
138              
139              
140             =head2 csv_concat
141              
142             Usage:
143              
144             csv_concat(%args) -> [$status_code, $reason, $payload, \%result_meta]
145              
146             Concatenate several CSV files together, collecting all the fields.
147              
148             Example, concatenating this CSV:
149              
150             col1,col2
151             1,2
152             3,4
153              
154             and:
155              
156             col2,col4
157             a,b
158             c,d
159             e,f
160              
161             and:
162              
163             col3
164             X
165             Y
166              
167             will result in:
168              
169             col1,col2,col4,col3
170             1,2,,
171             3,4,,
172             ,a,b,
173             ,c,d,
174             ,e,f,
175             ,,,X
176             ,,,Y
177              
178             Keywords: join, merge
179              
180             This function is not exported.
181              
182             Arguments ('*' denotes required arguments):
183              
184             =over 4
185              
186             =item * B<inplace> => I<true>
187              
188             Output to the same file as input.
189              
190             Normally, you output to a different file than input. If you try to output to the
191             same file (C<-o INPUT.csv -O>) you will clobber the input file; thus the utility
192             prevents you from doing it. However, with this C<--inplace> option, you can
193             output to the same file. Like perl's C<-i> option, this will first output to a
194             temporary file in the same directory as the input file then rename to the final
195             file at the end. You cannot specify output file (C<-o>) when using this option,
196             but you can specify backup extension with C<-b> option.
197              
198             Some caveats:
199              
200             =over
201              
202             =item * if input file is a symbolic link, it will be replaced with a regular file;
203              
204             =item * renaming (implemented using C<rename()>) can fail if input filename is too long;
205              
206             =item * value specified in C<-b> is currently not checked for acceptable characters;
207              
208             =item * things can also fail if permissions are restrictive;
209              
210             =back
211              
212             =item * B<inplace_backup_ext> => I<str> (default: "")
213              
214             Extension to add for backup of input file.
215              
216             In inplace mode (C<--inplace>), if this option is set to a non-empty string, will
217             rename the input file using this extension as a backup. The old existing backup
218             will be overwritten, if any.
219              
220             =item * B<input_escape_char> => I<str>
221              
222             Specify character to escape value in field in input CSV, will be passed to Text::CSV_XS.
223              
224             Defaults to C<\\> (backslash). Overrides C<--input-tsv> option.
225              
226             =item * B<input_filenames> => I<array[filename]> (default: ["-"])
227              
228             Input CSV files.
229              
230             Use C<-> to read from stdin.
231              
232             Encoding of input file is assumed to be UTF-8.
233              
234             =item * B<input_header> => I<bool> (default: 1)
235              
236             Specify whether input CSV has a header row.
237              
238             By default, the first row of the input CSV will be assumed to contain field
239             names (and the second row contains the first data row). When you declare that
240             input CSV does not have header row (C<--no-input-header>), the first row of the
241             CSV is assumed to contain the first data row. Fields will be named C<field1>,
242             C<field2>, and so on.
243              
244             =item * B<input_quote_char> => I<str>
245              
246             Specify field quote character in input CSV, will be passed to Text::CSV_XS.
247              
248             Defaults to C<"> (double quote). Overrides C<--input-tsv> option.
249              
250             =item * B<input_sep_char> => I<str>
251              
252             Specify field separator character in input CSV, will be passed to Text::CSV_XS.
253              
254             Defaults to C<,> (comma). Overrides C<--input-tsv> option.
255              
256             =item * B<input_tsv> => I<true>
257              
258             Inform that input file is in TSV (tab-separated) format instead of CSV.
259              
260             Overridden by C<--input-sep-char>, C<--input-quote-char>, C<--input-escape-char>
261             options. If one of those options is specified, then C<--input-tsv> will be
262             ignored.
263              
264             =item * B<output_always_quote> => I<bool> (default: 0)
265              
266             Whether to always quote values.
267              
268             When set to false (the default), values are quoted only when necessary:
269              
270             field1,field2,"field three contains comma (,)",field4
271              
272             When set to true, then all values will be quoted:
273              
274             "field1","field2","field three contains comma (,)","field4"
275              
276             =item * B<output_escape_char> => I<str>
277              
278             Specify character to escape value in field in output CSV, will be passed to Text::CSV_XS.
279              
280             This is like C<--input-escape-char> option but for output instead of input.
281              
282             Defaults to C<\\> (backslash). Overrides C<--output-tsv> option.
283              
284             =item * B<output_filename> => I<filename>
285              
286             Output filename.
287              
288             Use C<-> to output to stdout (the default if you don't specify this option).
289              
290             Encoding of output file is assumed to be UTF-8.
291              
292             =item * B<output_header> => I<bool>
293              
294             Whether output CSV should have a header row.
295              
296             By default, a header row will be output I<if> input CSV has header row. Under
297             C<--output-header>, a header row will be output even if input CSV does not have
298             header row (value will be something like "col0,col1,..."). Under
299             C<--no-output-header>, header row will I<not> be printed even if input CSV has
300             header row. So this option can be used to unconditionally add or remove header
301             row.
302              
303             =item * B<output_quote_char> => I<str>
304              
305             Specify field quote character in output CSV, will be passed to Text::CSV_XS.
306              
307             This is like C<--input-quote-char> option but for output instead of input.
308              
309             Defaults to C<"> (double quote). Overrides C<--output-tsv> option.
310              
311             =item * B<output_quote_empty> => I<bool> (default: 0)
312              
313             Whether to quote empty values.
314              
315             When set to false (the default), empty values are not quoted:
316              
317             field1,field2,,field4
318              
319             When set to true, then empty values will be quoted:
320              
321             field1,field2,"",field4
322              
323             =item * B<output_sep_char> => I<str>
324              
325             Specify field separator character in output CSV, will be passed to Text::CSV_XS.
326              
327             This is like C<--input-sep-char> option but for output instead of input.
328              
329             Defaults to C<,> (comma). Overrides C<--output-tsv> option.
330              
331             =item * B<output_tsv> => I<bool>
332              
333             Inform that output file is TSV (tab-separated) format instead of CSV.
334              
335             This is like C<--input-tsv> option but for output instead of input.
336              
337             Overridden by C<--output-sep-char>, C<--output-quote-char>, C<--output-escape-char>
338             options. If one of those options is specified, then C<--output-tsv> will be
339             ignored.
340              
341             =item * B<overwrite> => I<bool>
342              
343             Whether to overwrite existing output file.
344              
345             =item * B<with_data_rows> => I<bool>
346              
347             Whether to also output data rows.
348              
349              
350             =back
351              
352             Returns an enveloped result (an array).
353              
354             First element ($status_code) is an integer containing HTTP-like status code
355             (200 means OK, 4xx caller error, 5xx function error). Second element
356             ($reason) is a string containing error message, or something like "OK" if status is
357             200. Third element ($payload) is the actual result, but usually not present when enveloped result is an error response ($status_code is not 2xx). Fourth
358             element (%result_meta) is called result metadata and is optional, a hash
359             that contains extra information, much like how HTTP response headers provide additional metadata.
360              
361             Return value: (any)
362              
363             =head1 IMPLEMENTATION NOTES
364              
365             We first read only the header rows for all input files, while collecting the
366             input filehandles. Then we read the data rows of all the files ourselves.
367              
368             =head1 HOMEPAGE
369              
370             Please visit the project's homepage at L<https://metacpan.org/release/App-CSVUtils>.
371              
372             =head1 SOURCE
373              
374             Source repository is at L<https://github.com/perlancar/perl-App-CSVUtils>.
375              
376             =head1 AUTHOR
377              
378             perlancar <perlancar@cpan.org>
379              
380             =head1 CONTRIBUTING
381              
382              
383             To contribute, you can send patches by email/via RT, or send pull requests on
384             GitHub.
385              
386             Most of the time, you don't need to build the distribution yourself. You can
387             simply modify the code, then test via:
388              
389             % prove -l
390              
391             If you want to build the distribution (e.g. to try to install it locally on your
392             system), you can install L<Dist::Zilla>,
393             L<Dist::Zilla::PluginBundle::Author::PERLANCAR>,
394             L<Pod::Weaver::PluginBundle::Author::PERLANCAR>, and sometimes one or two other
395             Dist::Zilla- and/or Pod::Weaver plugins. Any additional steps required beyond
396             that are considered a bug and can be reported to me.
397              
398             =head1 COPYRIGHT AND LICENSE
399              
400             This software is copyright (c) 2023, 2022, 2021, 2020, 2019, 2018, 2017, 2016 by perlancar <perlancar@cpan.org>.
401              
402             This is free software; you can redistribute it and/or modify it under
403             the same terms as the Perl 5 programming language system itself.
404              
405             =head1 BUGS
406              
407             Please report any bugs or feature requests on the bugtracker website L<https://rt.cpan.org/Public/Dist/Display.html?Name=App-CSVUtils>.
408              
409             When submitting a bug or request, please include a test-file or a
410             patch to an existing test-file that illustrates the bug or desired
411             feature.
412              
413             =cut