File Coverage

blib/lib/App/CSVUtils/csv_sort_rows.pm
Criterion Covered Total %
statement 96 102 94.1
branch 45 62 72.5
condition 8 9 88.8
subroutine 10 10 100.0
pod 0 3 0.0
total 159 186 85.4


line stmt bran cond sub pod time code
1             package App::CSVUtils::csv_sort_rows;
2              
3 1     1   7417 use 5.010001;
  1         4  
4 1     1   6 use strict;
  1         2  
  1         26  
5 1     1   9 use warnings;
  1         3  
  1         23  
6 1     1   5 use Log::ger;
  1         2  
  1         5  
7              
8             our $AUTHORITY = 'cpan:PERLANCAR'; # AUTHORITY
9             our $DATE = '2023-04-01'; # DATE
10             our $DIST = 'App-CSVUtils'; # DIST
11             our $VERSION = '1.024'; # VERSION
12              
13 1         1205 use App::CSVUtils qw(
14             gen_csv_util
15             compile_eval_code
16 1     1   256 );
  1         3  
17              
18             sub on_input_header_row {
19 14     14 0 29 my $r = shift;
20 14 100       48 $r->{wants_input_row_as_hashref}++ if $r->{util_args}{hash};
21             }
22              
23             sub on_input_data_row {
24 42     42 0 76 my $r = shift;
25              
26             # keys we add to the stash
27 42   100     118 $r->{input_rows} //= [];
28 42 100       79 if ($r->{wants_input_row_as_hashref}) {
29 9   100     36 $r->{input_rows_as_hashref} //= [];
30             }
31              
32 42         58 push @{ $r->{input_rows} }, $r->{input_row};
  42         80  
33 42 100       101 if ($r->{wants_input_row_as_hashref}) {
34 9         13 push @{ $r->{input_rows_as_hashref} }, $r->{input_row_as_hashref};
  9         27  
35             }
36             }
37              
38             sub after_close_input_files {
39 14     14 0 22 my $r = shift;
40              
41             # we do the actual sorting here after collecting all the rows
42              
43             # whether we should compute keys
44 14         23 my @keys;
45 14 100       36 if ($r->{util_args}{key}) {
46 4         13 my $code_gen_key = compile_eval_code($r->{util_args}{key}, 'key');
47 4 100       9 for my $row (@{ $r->{util_args}{hash} ? $r->{input_rows_as_hashref} : $r->{input_rows} }) {
  4         16  
48 12         19 local $_ = $row;
49 12         231 push @keys, $code_gen_key->($row);
50             }
51             }
52              
53             # if user doesn't specify any by_* args, try to set a default if we can
54 14 50       53 if (!(grep {defined} ($r->{util_args}{by_code}, $r->{util_args}{by_sortsub}, $r->{util_args}{by_fields}))) {
  42         97  
55             # if there is only a single field, use it
56 0 0       0 if (@{ $r->{input_fields} } == 1) {
  0         0  
57 0         0 my $field = $r->{input_fields}[0];
58             $r->{util_args}{by_fields} = App::CSVUtils::_is_numeric_field(
59 0 0       0 $r->{input_rows}, 0) ? ["+$field"] : [$field];
60             }
61             }
62              
63 14         23 my $sorted_rows;
64 14 100 100     66 if ($r->{util_args}{by_code} || $r->{util_args}{by_sortsub}) {
    50          
65              
66 6         8 my $code0;
67 6 100       17 if ($r->{util_args}{by_code}) {
    50          
68 4         12 $code0 = compile_eval_code($r->{util_args}{by_code}, 'by_code');
69             } elsif (defined $r->{util_args}{by_sortsub}) {
70 2         8 require Sort::Sub;
71             $code0 = Sort::Sub::get_sorter(
72 2         9 $r->{util_args}{by_sortsub}, $r->{util_args}{sortsub_args});
73             }
74              
75 6         88 my $sort_indices;
76             my $code;
77 6 100       20 if (@keys) {
    100          
78             # compare two sort keys ($a & $b) are indices
79 4         7 $sort_indices++;
80             $code = sub {
81 11     11   22 local $main::a = $keys[$a];
82 11         15 local $main::b = $keys[$b];
83             #log_trace "a=<$main::a> vs b=<$main::b>";
84 11         130 $code0->($main::a, $main::b);
85 4         13 };
86             } elsif ($r->{util_args}{hash}) {
87 1         2 $sort_indices++;
88             $code = sub {
89 3     3   6 local $main::a = $r->{input_rows_as_hashref}[$a];
90 3         4 local $main::b = $r->{input_rows_as_hashref}[$b];
91             #log_trace "a=<%s> vs b=<%s>", $main::a, $main::b;
92 3         64 $code0->($main::a, $main::b);
93 1         4 };
94             } else {
95 1         2 $code = $code0;
96             }
97              
98 6 100       13 if ($sort_indices) {
99 5         10 my @sorted_indices = sort { local $main::a=$a; local $main::b=$b; $code->($main::a,$main::b) } 0..$#{$r->{input_rows}};
  14         135  
  14         28  
  14         21  
  5         21  
100 5         52 $sorted_rows = [map {$r->{input_rows}[$_]} @sorted_indices];
  15         52  
101             } else {
102 1         2 $sorted_rows = [sort { local $main::a=$a; local $main::b=$b; $code->($main::a,$main::b) } @{$r->{input_rows}}];
  3         5  
  3         4  
  3         58  
  1         4  
103             }
104              
105             } elsif ($r->{util_args}{by_fields}) {
106              
107 8         14 my @fields;
108 8         13 my $code_str = "";
109 8         12 for my $field_spec (@{ $r->{util_args}{by_fields} }) {
  8         21  
110 9         49 my ($prefix, $field) = $field_spec =~ /\A([+~-]?)(.+)/;
111 9         29 my $field_idx = App::CSVUtils::_find_field($r->{input_fields}, $field);
112 9   50     24 $prefix //= "";
113 9 100       27 if ($prefix eq '+') {
    100          
    100          
    50          
114 2 100       12 $code_str .= ($code_str ? " || " : "") .
115             "(\$a->[$field_idx] <=> \$b->[$field_idx])";
116             } elsif ($prefix eq '-') {
117 2 50       13 $code_str .= ($code_str ? " || " : "") .
118             "(\$b->[$field_idx] <=> \$a->[$field_idx])";
119             } elsif ($prefix eq '') {
120 4 100       9 if ($r->{util_args}{ci}) {
121 1 50       7 $code_str .= ($code_str ? " || " : "") .
122             "(lc(\$a->[$field_idx]) cmp lc(\$b->[$field_idx]))";
123             } else {
124 3 50       13 $code_str .= ($code_str ? " || " : "") .
125             "(\$a->[$field_idx] cmp \$b->[$field_idx])";
126             }
127             } elsif ($prefix eq '~') {
128 1 50       4 if ($r->{util_args}{ci}) {
129 0 0       0 $code_str .= ($code_str ? " || " : "") .
130             "(lc(\$b->[$field_idx]) cmp lc(\$a->[$field_idx]))";
131             } else {
132 1 50       7 $code_str .= ($code_str ? " || " : "") .
133             "(\$b->[$field_idx] cmp \$a->[$field_idx])";
134             }
135             }
136             }
137 8         32 my $code = compile_eval_code($code_str, 'from sort_by_fields');
138 8         17 $sorted_rows = [sort { local $main::a = $a; local $main::b = $b; $code->($main::a, $main::b) } @{$r->{input_rows}}];
  22         38  
  22         27  
  22         477  
  8         32  
139              
140             } else {
141              
142 0         0 die [400, "Please specify by_fields or by_sortsub or by_code"];
143              
144             }
145              
146 14 100       37 if ($main::_CSV_SORTED_ROWS) {
147 2         10 require Data::Cmp;
148             #use DD; dd $r->{input_rows}; print "\n"; dd $sorted_rows;
149 2 100       13 if (Data::Cmp::cmp_data($r->{input_rows}, $sorted_rows)) {
150             # not sorted
151 1 50       92 $r->{result} = [400, "NOT sorted", $r->{util_args}{quiet} ? undef : "Rows are NOT sorted"];
152             } else {
153             # sorted
154 1 50       299 $r->{result} = [200, "Sorted", $r->{util_args}{quiet} ? undef : "Rows are sorted"];
155             }
156             } else {
157 12         25 for my $row (@$sorted_rows) {
158 36         96 $r->{code_print_row}->($row);
159             }
160             }
161             }
162              
163             gen_csv_util(
164             name => 'csv_sort_rows',
165             summary => 'Sort CSV rows',
166             description => <<'_',
167              
168             This utility sorts the rows in the CSV. Example input CSV:
169              
170             name,age
171             Andy,20
172             Dennis,15
173             Ben,30
174             Jerry,30
175              
176             Example output CSV (using `--by-field +age` which means by age numerically and
177             ascending):
178              
179             name,age
180             Dennis,15
181             Andy,20
182             Ben,30
183             Jerry,30
184              
185             Example output CSV (using `--by-field -age`, which means by age numerically and
186             descending):
187              
188             name,age
189             Ben,30
190             Jerry,30
191             Andy,20
192             Dennis,15
193              
194             Example output CSV (using `--by-field name`, which means by name ascibetically
195             and ascending):
196              
197             name,age
198             Andy,20
199             Ben,30
200             Dennis,15
201             Jerry,30
202              
203             Example output CSV (using `--by-field ~name`, which means by name ascibetically
204             and descending):
205              
206             name,age
207             Jerry,30
208             Dennis,15
209             Ben,30
210             Andy,20
211              
212             Example output CSV (using `--by-field +age --by-field ~name`):
213              
214             name,age
215             Dennis,15
216             Andy,20
217             Jerry,30
218             Ben,30
219              
220             You can also reverse the sort order (`-r`) or sort case-insensitively (`-i`).
221              
222             For more flexibility, instead of `--by-field` you can use `--by-code`:
223              
224             Example output `--by-code '$a->[1] <=> $b->[1] || $b->[0] cmp $a->[0]'` (which
225             is equivalent to `--by-field +age --by-field ~name`):
226              
227             name,age
228             Dennis,15
229             Andy,20
230             Jerry,30
231             Ben,30
232              
233             If you use `--hash`, your code will receive the rows to be compared as hashref,
234             e.g. `--hash --by-code '$a->{age} <=> $b->{age} || $b->{name} cmp $a->{name}'.
235              
236             A third alternative is to sort using <pm:Sort::Sub> routines. Example output
237             (using `--by-sortsub 'by_length<r>' --key '$_->[0]'`, which is to say to sort by
238             descending length of name):
239              
240             name,age
241             Dennis,15
242             Jerry,30
243             Andy,20
244             Ben,30
245              
246             If none of the `--by-*` options are specified, the utility will bail unless
247             there's a default that can be used, e.g. when CSV has a single field then that
248             field will be used.
249              
250             _
251              
252             add_args => {
253             %App::CSVUtils::argspecopt_hash,
254             %App::CSVUtils::argspecs_sort_rows,
255             },
256              
257             tags => ['category:sorting'],
258              
259             on_input_header_row => \&App::CSVUtils::csv_sort_rows::on_input_header_row,
260              
261             on_input_data_row => \&App::CSVUtils::csv_sort_rows::on_input_data_row,
262              
263             after_close_input_files => \&App::CSVUtils::csv_sort_rows::after_close_input_files,
264              
265             );
266              
267             1;
268             # ABSTRACT: Sort CSV rows
269              
270             __END__
271              
272             =pod
273              
274             =encoding UTF-8
275              
276             =head1 NAME
277              
278             App::CSVUtils::csv_sort_rows - Sort CSV rows
279              
280             =head1 VERSION
281              
282             This document describes version 1.024 of App::CSVUtils::csv_sort_rows (from Perl distribution App-CSVUtils), released on 2023-04-01.
283              
284             =for Pod::Coverage ^(on|after|before)_.+$
285              
286             =head1 FUNCTIONS
287              
288              
289             =head2 csv_sort_rows
290              
291             Usage:
292              
293             csv_sort_rows(%args) -> [$status_code, $reason, $payload, \%result_meta]
294              
295             Sort CSV rows.
296              
297             This utility sorts the rows in the CSV. Example input CSV:
298              
299             name,age
300             Andy,20
301             Dennis,15
302             Ben,30
303             Jerry,30
304              
305             Example output CSV (using C<--by-field +age> which means by age numerically and
306             ascending):
307              
308             name,age
309             Dennis,15
310             Andy,20
311             Ben,30
312             Jerry,30
313              
314             Example output CSV (using C<--by-field -age>, which means by age numerically and
315             descending):
316              
317             name,age
318             Ben,30
319             Jerry,30
320             Andy,20
321             Dennis,15
322              
323             Example output CSV (using C<--by-field name>, which means by name ascibetically
324             and ascending):
325              
326             name,age
327             Andy,20
328             Ben,30
329             Dennis,15
330             Jerry,30
331              
332             Example output CSV (using C<--by-field ~name>, which means by name ascibetically
333             and descending):
334              
335             name,age
336             Jerry,30
337             Dennis,15
338             Ben,30
339             Andy,20
340              
341             Example output CSV (using C<--by-field +age --by-field ~name>):
342              
343             name,age
344             Dennis,15
345             Andy,20
346             Jerry,30
347             Ben,30
348              
349             You can also reverse the sort order (C<-r>) or sort case-insensitively (C<-i>).
350              
351             For more flexibility, instead of C<--by-field> you can use C<--by-code>:
352              
353             Example output C<< --by-code '$a-E<gt>[1] E<lt>=E<gt> $b-E<gt>[1] || $b-E<gt>[0] cmp $a-E<gt>[0]' >> (which
354             is equivalent to C<--by-field +age --by-field ~name>):
355              
356             name,age
357             Dennis,15
358             Andy,20
359             Jerry,30
360             Ben,30
361              
362             If you use C<--hash>, your code will receive the rows to be compared as hashref,
363             e.g. `--hash --by-code '$a->{age} <=> $b->{age} || $b->{name} cmp $a->{name}'.
364              
365             A third alternative is to sort using L<Sort::Sub> routines. Example output
366             (using C<< --by-sortsub 'by_lengthE<lt>rE<gt>' --key '$_-E<gt>[0]' >>, which is to say to sort by
367             descending length of name):
368              
369             name,age
370             Dennis,15
371             Jerry,30
372             Andy,20
373             Ben,30
374              
375             If none of the C<--by-*> options are specified, the utility will bail unless
376             there's a default that can be used, e.g. when CSV has a single field then that
377             field will be used.
378              
379             This function is not exported.
380              
381             Arguments ('*' denotes required arguments):
382              
383             =over 4
384              
385             =item * B<by_code> => I<str|code>
386              
387             Sort by using Perl code.
388              
389             C<$a> and C<$b> (or the first and second argument) will contain the two rows to be
390             compared. Which are arrayrefs; or if C<--hash> (C<-H>) is specified, hashrefs; or
391             if C<--key> is specified, whatever the code in C<--key> returns.
392              
393             =item * B<by_fields> => I<array[str]>
394              
395             Sort by a list of field specifications.
396              
397             Each field specification is a field name with an optional prefix. C<FIELD>
398             (without prefix) means sort asciibetically ascending (smallest to largest),
399             C<~FIELD> means sort asciibetically descending (largest to smallest), C<+FIELD>
400             means sort numerically ascending, C<-FIELD> means sort numerically descending.
401              
402             =item * B<by_sortsub> => I<str>
403              
404             Sort using a Sort::Sub routine.
405              
406             When sorting rows, usually combined with C<--key> because most Sort::Sub routine
407             expects a string to be compared against.
408              
409             When sorting fields, the Sort::Sub routine will get the field name as argument.
410              
411             =item * B<ci> => I<bool>
412              
413             (No description)
414              
415             =item * B<hash> => I<bool>
416              
417             Provide row in $_ as hashref instead of arrayref.
418              
419             =item * B<inplace> => I<true>
420              
421             Output to the same file as input.
422              
423             Normally, you output to a different file than input. If you try to output to the
424             same file (C<-o INPUT.csv -O>) you will clobber the input file; thus the utility
425             prevents you from doing it. However, with this C<--inplace> option, you can
426             output to the same file. Like perl's C<-i> option, this will first output to a
427             temporary file in the same directory as the input file then rename to the final
428             file at the end. You cannot specify output file (C<-o>) when using this option,
429             but you can specify backup extension with C<-b> option.
430              
431             Some caveats:
432              
433             =over
434              
435             =item * if input file is a symbolic link, it will be replaced with a regular file;
436              
437             =item * renaming (implemented using C<rename()>) can fail if input filename is too long;
438              
439             =item * value specified in C<-b> is currently not checked for acceptable characters;
440              
441             =item * things can also fail if permissions are restrictive;
442              
443             =back
444              
445             =item * B<inplace_backup_ext> => I<str> (default: "")
446              
447             Extension to add for backup of input file.
448              
449             In inplace mode (C<--inplace>), if this option is set to a non-empty string, will
450             rename the input file using this extension as a backup. The old existing backup
451             will be overwritten, if any.
452              
453             =item * B<input_escape_char> => I<str>
454              
455             Specify character to escape value in field in input CSV, will be passed to Text::CSV_XS.
456              
457             Defaults to C<\\> (backslash). Overrides C<--input-tsv> option.
458              
459             =item * B<input_filename> => I<filename> (default: "-")
460              
461             Input CSV file.
462              
463             Use C<-> to read from stdin.
464              
465             Encoding of input file is assumed to be UTF-8.
466              
467             =item * B<input_header> => I<bool> (default: 1)
468              
469             Specify whether input CSV has a header row.
470              
471             By default, the first row of the input CSV will be assumed to contain field
472             names (and the second row contains the first data row). When you declare that
473             input CSV does not have header row (C<--no-input-header>), the first row of the
474             CSV is assumed to contain the first data row. Fields will be named C<field1>,
475             C<field2>, and so on.
476              
477             =item * B<input_quote_char> => I<str>
478              
479             Specify field quote character in input CSV, will be passed to Text::CSV_XS.
480              
481             Defaults to C<"> (double quote). Overrides C<--input-tsv> option.
482              
483             =item * B<input_sep_char> => I<str>
484              
485             Specify field separator character in input CSV, will be passed to Text::CSV_XS.
486              
487             Defaults to C<,> (comma). Overrides C<--input-tsv> option.
488              
489             =item * B<input_tsv> => I<true>
490              
491             Inform that input file is in TSV (tab-separated) format instead of CSV.
492              
493             Overriden by C<--input-sep-char>, C<--input-quote-char>, C<--input-escape-char>
494             options. If one of those options is specified, then C<--input-tsv> will be
495             ignored.
496              
497             =item * B<key> => I<str|code>
498              
499             Generate sort keys with this Perl code.
500              
501             If specified, then will compute sort keys using Perl code and sort using the
502             keys. Relevant when sorting using C<--by-code> or C<--by-sortsub>. If specified,
503             then instead of row when sorting rows, the code (or Sort::Sub routine) will
504             receive these sort keys to sort against.
505              
506             The code will receive the row (arrayref, or if -H is specified, hashref) as the
507             argument.
508              
509             =item * B<output_always_quote> => I<bool> (default: 0)
510              
511             Whether to always quote values.
512              
513             When set to false (the default), values are quoted only when necessary:
514              
515             field1,field2,"field three contains comma (,)",field4
516              
517             When set to true, then all values will be quoted:
518              
519             "field1","field2","field three contains comma (,)","field4"
520              
521             =item * B<output_escape_char> => I<str>
522              
523             Specify character to escape value in field in output CSV, will be passed to Text::CSV_XS.
524              
525             This is like C<--input-escape-char> option but for output instead of input.
526              
527             Defaults to C<\\> (backslash). Overrides C<--output-tsv> option.
528              
529             =item * B<output_filename> => I<filename>
530              
531             Output filename.
532              
533             Use C<-> to output to stdout (the default if you don't specify this option).
534              
535             Encoding of output file is assumed to be UTF-8.
536              
537             =item * B<output_header> => I<bool>
538              
539             Whether output CSV should have a header row.
540              
541             By default, a header row will be output I<if> input CSV has header row. Under
542             C<--output-header>, a header row will be output even if input CSV does not have
543             header row (value will be something like "col0,col1,..."). Under
544             C<--no-output-header>, header row will I<not> be printed even if input CSV has
545             header row. So this option can be used to unconditionally add or remove header
546             row.
547              
548             =item * B<output_quote_char> => I<str>
549              
550             Specify field quote character in output CSV, will be passed to Text::CSV_XS.
551              
552             This is like C<--input-quote-char> option but for output instead of input.
553              
554             Defaults to C<"> (double quote). Overrides C<--output-tsv> option.
555              
556             =item * B<output_quote_empty> => I<bool> (default: 0)
557              
558             Whether to quote empty values.
559              
560             When set to false (the default), empty values are not quoted:
561              
562             field1,field2,,field4
563              
564             When set to true, then empty values will be quoted:
565              
566             field1,field2,"",field4
567              
568             =item * B<output_sep_char> => I<str>
569              
570             Specify field separator character in output CSV, will be passed to Text::CSV_XS.
571              
572             This is like C<--input-sep-char> option but for output instead of input.
573              
574             Defaults to C<,> (comma). Overrides C<--output-tsv> option.
575              
576             =item * B<output_tsv> => I<bool>
577              
578             Inform that output file is TSV (tab-separated) format instead of CSV.
579              
580             This is like C<--input-tsv> option but for output instead of input.
581              
582             Overriden by C<--output-sep-char>, C<--output-quote-char>, C<--output-escape-char>
583             options. If one of those options is specified, then C<--output-tsv> will be
584             ignored.
585              
586             =item * B<overwrite> => I<bool>
587              
588             Whether to override existing output file.
589              
590             =item * B<reverse> => I<bool>
591              
592             (No description)
593              
594             =item * B<sortsub_args> => I<hash>
595              
596             Arguments to pass to Sort::Sub routine.
597              
598              
599             =back
600              
601             Returns an enveloped result (an array).
602              
603             First element ($status_code) is an integer containing HTTP-like status code
604             (200 means OK, 4xx caller error, 5xx function error). Second element
605             ($reason) is a string containing error message, or something like "OK" if status is
606             200. Third element ($payload) is the actual result, but usually not present when enveloped result is an error response ($status_code is not 2xx). Fourth
607             element (%result_meta) is called result metadata and is optional, a hash
608             that contains extra information, much like how HTTP response headers provide additional metadata.
609              
610             Return value: (any)
611              
612             =head1 HOMEPAGE
613              
614             Please visit the project's homepage at L<https://metacpan.org/release/App-CSVUtils>.
615              
616             =head1 SOURCE
617              
618             Source repository is at L<https://github.com/perlancar/perl-App-CSVUtils>.
619              
620             =head1 AUTHOR
621              
622             perlancar <perlancar@cpan.org>
623              
624             =head1 CONTRIBUTING
625              
626              
627             To contribute, you can send patches by email/via RT, or send pull requests on
628             GitHub.
629              
630             Most of the time, you don't need to build the distribution yourself. You can
631             simply modify the code, then test via:
632              
633             % prove -l
634              
635             If you want to build the distribution (e.g. to try to install it locally on your
636             system), you can install L<Dist::Zilla>,
637             L<Dist::Zilla::PluginBundle::Author::PERLANCAR>,
638             L<Pod::Weaver::PluginBundle::Author::PERLANCAR>, and sometimes one or two other
639             Dist::Zilla- and/or Pod::Weaver plugins. Any additional steps required beyond
640             that are considered a bug and can be reported to me.
641              
642             =head1 COPYRIGHT AND LICENSE
643              
644             This software is copyright (c) 2023, 2022, 2021, 2020, 2019, 2018, 2017, 2016 by perlancar <perlancar@cpan.org>.
645              
646             This is free software; you can redistribute it and/or modify it under
647             the same terms as the Perl 5 programming language system itself.
648              
649             =head1 BUGS
650              
651             Please report any bugs or feature requests on the bugtracker website L<https://rt.cpan.org/Public/Dist/Display.html?Name=App-CSVUtils>
652              
653             When submitting a bug or request, please include a test-file or a
654             patch to an existing test-file that illustrates the bug or desired
655             feature.
656              
657             =cut