File Coverage

blib/lib/Text/AutoCSV.pm
Criterion Covered Total %
statement 1480 1730 85.5
branch 686 958 71.6
condition 188 291 64.6
subroutine 111 117 94.8
pod 43 43 100.0
total 2508 3139 79.9


line stmt bran cond sub pod time code
1             #!/usr/bin/perl
2             # ABSTRACT: helper module to automate the use of Text::CSV
3              
4             # vim:tw=100
5              
6             # Text/AutoCSV.pm
7              
8             #
9             # Written by Sébastien Millet
10             # March, July, August, September 2016
11             # January, February 2017
12             #
13              
14             package Text::AutoCSV;
15             $Text::AutoCSV::VERSION = '1.1.9';
16             my $PKG = "Text::AutoCSV";
17              
18 20     20   879687 use strict;
  20         65  
  20         510  
19 17     17   83 use warnings;
  17         33  
  17         904  
20              
21             require Exporter;
22             our @ISA = 'Exporter';
23             our @EXPORT_OK = qw(remove_accents);
24              
25 17     17   91 use Carp;
  17         33  
  17         995  
26 17     17   5429 use Params::Validate qw(validate validate_pos :types);
  17         112305  
  17         3238  
27 17     17   5606 use List::MoreUtils qw(first_index indexes);
  17         113952  
  17         156  
28 17     17   11361 use Fcntl qw(SEEK_SET);
  17         34  
  17         791  
29 17     17   4861 use File::BOM;
  17         398401  
  17         868  
30 17     17   7409 use Text::CSV;
  17         231961  
  17         780  
31 17     17   9579 use DateTime;
  17         6663742  
  17         958  
32             # DateTime::Format::Strptime 1.70 does not work properly with us.
33             # Actually all version as of 1.63 are fine, except 1.70.
34 17     17   8364 use DateTime::Format::Strptime 1.71;
  17         885728  
  17         125  
35 17     17   7995 use Class::Struct;
  17         25020  
  17         102  
36 17     17   7503 use Unicode::Normalize;
  17         26892  
  17         988  
37             # lock_keys is used to prevent accessing non existing keys
38             # Credits: 3381159 on http://stackoverflow.com
39             # "make perl shout when trying to access undefined hash key"
40 17     17   5396 use Hash::Util qw(lock_keys);
  17         33754  
  17         97  
41              
42             # FIXME
43             # Not needed in release -> should be always commented unless at dev time
44             #use feature qw(say);
45             #use Data::Dumper;
46             #$Data::Dumper::Sortkeys = 1;
47              
48             # Set to 1 if you wish to turn on debug without touching caller's code
49             our $ALWAYS_DEBUG = 0;
50              
51             # Keep it set to 0 unless you know what you're doing!
52             # Note
53             # Taken into account only if debug is set.
54             my $DEBUG_DATETIME_FORMATS = 0;
55             # The below is taken into account only if $DEBUG_DATETIME_FORMATS is set.
56             # It'll resqult in even more debug output. It becomes really MASSIVE debug output.
57             my $DEBUG_DATETIME_FORMATS_EVEN_MORE = 0;
58              
59             #
60             # Uncomment to replace carp and croak with cluck and confess, respectively
61             # Also reachable with perl option:
62             # -MCarp=verbose
63             # See 'perldoc Carp'.
64             #
65             #$Carp::Verbose = 1;
66              
67              
68             # * *************** *
69             # * BEHAVIOR TUNING *
70             # * *************** *
71              
72              
73             # * **************************************************** *
74             # * ALL THE VARIABLES BELOW ARE RATHER LOW LEVEL. *
75             # * IF YOU UPDATE IT, IT WILL LIKELY BREAK THE TEST PLAN *
76             # * **************************************************** *
77              
78             my $DEF_SEARCH_CASE = 0; # Case insensitive search by default
79             my $DEF_SEARCH_TRIM = 1; # Trim values by default
80             my $DEF_SEARCH_IGNORE_ACCENTS = 1; # Ignore accents
81             my $DEF_SEARCH_IGNORE_EMPTY = 1; # Ignore empty strings in searches by default
82             my $DEF_SEARCH_VALUE_IF_NOT_FOUND = undef; # If not found, returned field value is undef
83             my $DEF_SEARCH_VALUE_IF_AMBIGUOUS = undef; # If more than one record found by search (when a
84             # unique value is expected), return undef
85             my $DEF_SEARCH_IGNORE_AMBIGUOUS = 1; # By default, ignore the fact that multiple records are
86             # found by search and return the first record found
87              
88             my $DETECT_ENCODING = 1;
89              
90             my $DEFAULT_IN_ENCODING = 'UTF-8,latin1';
91              
92             # By default, input encoding detected is used for output.
93             # -> the constant below is used if and only if:
94             # Inbound encoding is unknown
95             # No providing of out_encoding attribute (out_encoding takes precedence when provided)
96             my $DEFAULT_OUT_ENCODING = 'UTF-8';
97              
98             my $DEFAULT_ESCAPE_CHAR = '\\';
99             my $DEFAULT_QUOTE_CHAR = '"';
100              
101             #
102             # The code that workarounds $FIX_PERLMONKS_823214 (see below) makes sense only under plain
103             # Windows.
104             #
105             # "Plain" Windows?
106             # This code MUST NOT be executed under cygwin because cygwin uses unix line breaks. This is
107             # why we detect /mswin/. Would we detect /win/, we'd catch cygwin, too, and we don't want
108             # that.
109             #
110             my $OS_IS_PLAIN_WINDOWS = !! ($^O =~ /mswin/i);
111              
112             #
113             # Shall we fix the issue reported as #823214 in PerlMonks? See
114             # http://www.perlmonks.org/?node_id=823214
115             #
116             # In brief (in case the link above would be broken one day):
117             # Under Windows, output mode set to UTF-16LE produces line breaks made of octets "0d 0a 00",
118             # whereas it should be "0d 00 0a 00".
119             #
120             # The code also fixes UTF-16BE (but it was not tested).
121             #
122             my $FIX_PERLMONKS_823214 = 1;
123              
124              
125             # * **** *
126             # * CODE *
127             # * **** *
128              
129              
130             sub ERR_UNKNOWN_FIELD() { 0 }
131              
132             # Store meta-data about each column
133             struct ColData => {
134             field_name => '$',
135             header_text => '$',
136             description => '$',
137             dt_format => '$',
138             dt_locale => '$',
139             multiline => '$'
140             };
141              
142             #
143             # Enumeration of ef_type member below
144             # Alternative:
145             # use enum (...)
146             #
147             # But it is not also by default on my distro and installing a package for 3 constants, I find it
148             # a bit overkill!
149             #
150             my ($EF_LINK, $EF_FUNC, $EF_COPY) = 0..2;
151             struct ExtraField => {
152             ef_type => '$',
153             self_name => '$',
154             description => '$',
155              
156             check_field_existence => '$',
157              
158             # For when ef_type is set to $EF_LINK
159              
160             link_self_search => '$',
161             link_remote_obj => '$',
162             link_remote_search => '$',
163             link_remote_read => '$',
164             link_vlookup_opts => '%',
165              
166             # For when ef_type is set to $EF_FUNC
167              
168             func_sub => '$',
169              
170             # For when ef_type is set to $EF_COPY
171              
172             copy_source => '$',
173             copy_sub => '$'
174              
175             };
176              
177             my $SEARCH_VALIDATE_OPTIONS = {
178             value_if_not_found => {type => UNDEF | SCALAR, optional => 1},
179             value_if_found => {type => UNDEF | SCALAR, optional => 1},
180             value_if_ambiguous => {type => UNDEF | SCALAR, optional => 1},
181             ignore_ambiguous => {type => BOOLEAN, optional => 1},
182             case => {type => BOOLEAN, optional => 1},
183             trim => {type => BOOLEAN, optional => 1},
184             ignore_empty => {type => BOOLEAN, optional => 1},
185             ignore_accents => {type => BOOLEAN, optional => 1}
186             };
187              
188             sub _is_utf8 {
189 424     424   828 my $e = shift;
190              
191 424 100       3130 return 1 if $e =~ m/^(utf-?8|ucs-?8)/i;
192 32         94 return 0;
193             }
194              
195             # To replace // in old perls: return the first non-undef value in provided list
196             sub _get_def {
197 12079     12079   19554 for (@_) {
198 18709 100       40127 return $_ if defined($_);
199             }
200 895         1451 return undef;
201             }
202              
203             sub _print {
204 15     15   20 my $self = shift;
205 15         21 my $t = shift;
206              
207 15         26 my $infoh = $self->{infoh};
208 15 50       27 return if ref $infoh ne 'GLOB';
209              
210 0         0 print($infoh $t);
211             }
212              
213             sub _printf {
214 17     17   21 my $self = shift;
215              
216 17         20 my $infoh = $self->{infoh};
217 17 50       53 return if ref $infoh ne 'GLOB';
218              
219 0         0 printf($infoh @_);
220             }
221              
222             sub _print_warning {
223 67     67   103 my $self = shift;
224 67         93 my $warning_message = shift;
225 67         93 my $dont_wrap = shift;
226              
227 67 100       161 my $msg = ($dont_wrap ? $warning_message : "$PKG: warning: $warning_message");
228 67 100       4313 carp $msg unless $self->{quiet};
229             }
230              
231             sub _close_inh {
232 316     316   467 my $self = shift;
233              
234 316 100       3175 close $self->{_inh} if $self->{_close_inh_when_finished};
235 316         1218 $self->{_inh} = undef;
236 316         602 $self->{_close_inh_when_finished} = undef;
237             }
238              
239             sub _close_outh {
240 127     127   197 my $self = shift;
241              
242 127 50 66     11912 close $self->{outh} if defined($self->{outh}) and $self->{_close_outh_when_finished};
243 127         413 $self->{outh} = undef;
244 127         278 $self->{_close_outh_when_finished} = undef;
245             }
246              
247             sub _print_error {
248 81     81   316 my ($self, $error_message, $dont_stop, $err_code, $err_extra) = @_;
249              
250 81         194 my $msg = "$PKG: error: $error_message";
251              
252 81 100 100     267 if (defined($err_code) and !$self->{quiet} and $self->{croak_if_error}) {
      100        
253 5 50       15 if ($err_code == ERR_UNKNOWN_FIELD) {
254 5         10 my %f = %{$err_extra};
  5         20  
255 5         9 my @cols;
256 5         18 for my $n (keys %f) {
257 15         30 $cols[$f{$n}] = $n;
258             }
259 5         17 $self->_print($self->get_in_file_disp() . " column - field name correspondance:\n");
260 5         13 $self->_print("COL # FIELD\n");
261 5         13 $self->_print("----- -----\n");
262 5         15 for my $i (0..$#cols) {
263 17 100       45 $self->_printf("%05d %s\n", $i, (defined($cols[$i]) ? $cols[$i] : ''));
264             }
265             } else {
266 0         0 confess "Unknown error code: '$err_code'\n";
267             }
268             }
269              
270 81 100 100     287 if ($self->{croak_if_error} and !$dont_stop) {
271 30         92 $self->_close_read(1);
272 30         84 $self->_close_inh();
273 30         72 $self->_close_outh();
274 30         91 $self->_status_reset(1);
275 30         4526 croak $msg;
276             }
277 51         132 $self->_print_warning($msg, 1);
278             }
279              
280             #
281             # Return the string passed in argument with all accents removed from characters.
282             # Do it in a rather general and reliable way, not tied to latin1.
283             # Tested on latin1 and latin2 character sets.
284             #
285             # Credits:
286             # http://stackoverflow.com/questions/17561839/remove-accents-from-accented-characters
287             #
288             sub remove_accents {
289 3163     3163 1 25231 validate_pos(@_, {type => SCALAR});
290              
291 3163         7569 my $s = $_[0];
292 3163         11025 my $r = NFKD($s);
293 3163         7004 $r =~ s/\p{Nonspacing_Mark}//g;
294 3163         6659 return $r;
295             }
296              
297             sub _detect_csv_sep {
298 262     262   435 my $ST_OUTSIDE = 0;
299 262         435 my $ST_INSIDE = 1;
300              
301 262         605 my ($self, $escape_char, $quote_char, $sep) = @_;
302              
303 262         491 my $_debugh = $self->{_debugh};
304 262         425 my $inh = $self->{_inh};
305 262         413 my $_debug = $self->{_debug};
306              
307 262         401 delete $self->{_inh_header};
308              
309 262 100       596 $escape_char = $DEFAULT_ESCAPE_CHAR unless defined($escape_char);
310              
311 262 50       612 $self->_print_error("illegal \$escape_char: '$escape_char' (length >= 2)"), return 0
312             if length($escape_char) >= 2;
313              
314 262 50       528 $self->_print_error("$PKG: error: illegal \$quote_char '$quote_char' (length >= 2)"), return 0
315             if length($quote_char) >= 2;
316              
317 262 50       553 $escape_char = '--' if $escape_char eq '';
318 262 50       511 $quote_char = '--' if $quote_char eq '';
319              
320             # FIXME (?)
321             # Avoid inlined magic values for separator auto-detection.
322             # Issue is, as you can see below, the behavior is also hard-coded and not straightforward to
323             # render 'tunable' ("," and ";" take precedence over "\t").
324 262         857 my %Seps = (
325             ";" => 0,
326             "," => 0,
327             "\t" => 0
328             );
329              
330 262         2921 my $h = <$inh>;
331 262 50       2566 if ($self->{inh_is_stdin}) {
332 0         0 $self->{_inh_header} = $h;
333 0 0       0 print($_debugh "Input is STDIN => saving header line to re-read it " .
334             "later (in-memory)\n") if $_debug;
335             } else {
336 262         1974 seek $inh, 0, SEEK_SET;
337 262 50       699 print($_debugh "Input is not STDIN => using seek function to rewind " .
338             "read head after header line reading\n") if $_debug;
339             }
340              
341 262         581 chomp $h;
342 262         439 my $status = $ST_OUTSIDE;
343 262         587 my $l = length($h);
344 262         392 my $c = 0;
345 262         600 while ($c < $l) {
346 4808         6085 my $ch = substr($h, $c, 1);
347 4808         5203 my $chnext = '';
348 4808 100       8032 $chnext = substr($h, $c + 1, 1) if ($c < $l - 1);
349 4808 100       7524 if ($status == $ST_INSIDE) {
    50          
350 1521 50 66     2933 if ($ch eq $escape_char and $chnext eq $quote_char) {
    100          
351 0         0 $c += 2;
352             } elsif ($ch eq $quote_char) {
353 197         231 $status = $ST_OUTSIDE;
354 197         306 $c++;
355             } else {
356 1324         1982 $c++;
357             }
358             } elsif ($status == $ST_OUTSIDE) {
359 3287 50 33     7145 if ($ch eq $escape_char and ($chnext eq $quote_char or
    100 66        
    100          
360             exists $Seps{$chnext})) {
361 0         0 $c += 2;
362             } elsif (exists $Seps{$ch}) {
363 946         1175 $Seps{$ch}++;
364 946         1410 $c++;
365             } elsif ($ch eq $quote_char) {
366 197         226 $status = $ST_INSIDE;
367 197         315 $c++;
368             } else {
369 2144         3386 $c++;
370             }
371             }
372             }
373              
374 262 100 100     1342 if ($Seps{";"} == 0 and $Seps{","} >= 1) {
    100 66        
    50 33        
      33        
375 147         260 $$sep = ",";
376 147         636 return 1;
377             } elsif ($Seps{","} == 0 and $Seps{";"} >= 1) {
378 106         183 $$sep = ";";
379 106         434 return 1;
380             } elsif ($Seps{","} == 0 and $Seps{";"} == 0 and $Seps{"\t"} >= 1) {
381 0         0 $$sep = "\t";
382 0         0 return 1;
383             } else {
384              
385             # Check the case where there is one unique column, in which case,
386             # assume comma separator.
387 9         19 my $h_no_accnt = remove_accents($h);
388 9 100       56 if ($h_no_accnt =~ m/^[[:alnum:]_]+$/i) {
389 3         8 $$sep = ",";
390 3         15 return 1;
391             }
392              
393 6         10 $$sep = "";
394 6 50       13 if ($_debug) {
395 0         0 for my $k (keys %Seps) {
396 0         0 print($_debugh "\$Seps{'$k'} = $Seps{$k}\n");
397             }
398             }
399 6         23 return 0;
400             }
401             }
402              
403             sub _reopen_input {
404 652     652   964 my $self = shift;
405              
406 652         967 my $in_file = $self->{in_file};
407              
408 652         804 my $inh;
409 652 50       13337 if (!open($inh, "<", $in_file)) {
410 0         0 $self->_print_error("unable to open file '$in_file': $!");
411 0         0 return undef;
412             }
413 652 50       1847 if (!$self->{_leave_encoding_alone}) {
414              
415             confess "Oups! _inh_encoding_string undef?"
416 652 50       1329 unless defined($self->{_inh_encoding_string});
417              
418 652         4216 binmode $inh, $self->{_inh_encoding_string};
419             }
420              
421 652         29166 return $inh;
422             }
423              
424             # Abstraction layer, not useful Today, could bring added value when looking into Text::CSV I/O
425             sub _mygetline {
426 14863     14863   26338 my ($csvobj, $fh) = @_;
427              
428 14863         300278 return $csvobj->getline($fh);
429             }
430              
431             sub _detect_meta {
432 456     456   955 my ($self, $quote_char, $sep_char) = @_;
433              
434 456         755 my $in_file = $self->{in_file};
435 456         688 my $_debug = $self->{_debug};
436 456         693 my $_debugh = $self->{_debugh};
437              
438 456 100       896 return if $self->{_int_one_pass};
439 436 100       1018 return if $self->{_detect_meta_done}; # Sans jeu de mot...
440              
441 300 100       717 if (!defined($self->{escape_char})) {
442 298         742 $self->_register_pass("detect escape character");
443              
444 298         418 my $flag = 0;
445 298         693 my $inh = $self->_reopen_input();
446 298 50       752 if (defined($inh)) {
447 298         3892 while (my $l = <$inh>) {
448 7131         16218 chomp $l;
449              
450             # Very heuristic criteria...
451             # Tant pis.
452             # $flag = 1 if $l =~ m/(?<!$sep_char)$quote_char$quote_char(?!$sep_char)/;
453             # $flag = 1 if $l =~ m/(\\$quote_char|\\\\)/;
454 7131 100       24067 $flag = 1 if $l =~ m/(\\$quote_char)/;
455              
456             }
457 298         2174 close $inh;
458             }
459 298 100       1193 $self->{escape_char} = ($flag ? '\\' : '"');
460 298 50       1002 print($_debugh " detected escape_char: '$self->{escape_char}'\n") if $_debug;
461             }
462              
463 300 50       634 print($_debugh " using escape_char: '$self->{escape_char}' " .
464             "to further examine input (is_always_quoted, multiline)\n") if $_debug;
465              
466 300         444 my $is_always_quoted = 0;
467 300         743 my $inh = $self->_reopen_input();
468 300         484 my @multiline;
469 300 50       707 if (defined($inh)) {
470 300         782 $self->_register_pass("detect is_always_quoted and multiline");
471             my $csv = Text::CSV->new({sep_char => $sep_char,
472             allow_whitespace => 1, binary => 1, auto_diag => 0,
473             quote_char => $quote_char, escape_char => $self->{escape_char},
474 300         3106 keep_meta_info => 1,
475             allow_loose_escapes => 1});
476 300         56555 my $nb_rows = 0;
477 300         490 $is_always_quoted = 1;
478 300         675 while (my $ar = _mygetline($csv, $inh)) {
479 7129         190964 $nb_rows++;
480              
481 7129         8760 my @a = @{$ar};
  7129         15824  
482 7129         10602 my $e = $#a;
483 7129         12760 for my $i (0..$e) {
484 44737 100       73856 $is_always_quoted = 0 unless $csv->is_quoted($i);
485 44737 100       351542 $multiline[$i] = 1 if $a[$i] =~ m/\n/;
486             }
487             }
488 300         9064 $self->{_nb_rows} = $nb_rows;
489 300         3509 close $inh;
490             }
491 300         1150 $self->{_multiline} = [ @multiline ];
492              
493 300 50       720 print($_debugh " is_always_quoted: $is_always_quoted\n") if $_debug;
494 300         665 $self->{_is_always_quoted} = $is_always_quoted;
495              
496 300         1405 $self->{_detect_meta_done} = 1;
497             }
498              
499             sub _register_pass {
500 1269     1269   2339 my ($self, $pass_name) = @_;
501 1269         2074 my $_debug = $self->{_debug};
502 1269         1808 my $_debugh = $self->{_debugh};
503              
504 1269         2043 $self->{_pass_count}++;
505              
506 1269 50       2809 return unless $_debug;
507              
508 0         0 print($_debugh "Pass #" . $self->{_pass_count} . " ($pass_name) done\n");
509             }
510              
511             sub _update_in_mem_record_count {
512 510     510   1093 my ($self, $nonexistent_arg) = @_;
513 510         905 my $_debug = $self->{_debug};
514 510         846 my $_debugh = $self->{_debugh};
515              
516 510 50       1056 confess "Hey! what is this second argument?" if defined($nonexistent_arg);
517              
518 510         673 my $new_count = $#{$self->{_flat}} + 1;
  510         1086  
519              
520 510         861 my $updated_max = 0;
521 510 100       1386 if ($new_count > $self->get_max_in_mem_record_count()) {
522 155         486 $self->_set_max_in_mem_record_count($new_count);
523 155         273 $updated_max = 1;
524             }
525              
526 510         1247 $self->{_in_mem_record_count} = $new_count;
527 510 50       1238 if ($_debug) {
528 0         0 print($_debugh "_in_mem_record_count updated, set to $new_count");
529 0 0       0 print($_debugh " (also updated max)") if $updated_max;
530 0         0 print($_debugh "\n");
531             }
532             }
533              
534             sub _detect_inh_encoding {
535 320     320   1287 my ($self, $enc, $via, $in_file, $detect_enc) = @_;
536 320         672 my $_debug = $self->{_debug};
537 320         508 my $_debugh = $self->{_debugh};
538              
539 320 100 66     1101 $enc = $DEFAULT_IN_ENCODING if !defined($enc) or $enc eq '';
540              
541 320         2061 my @encodings = split(/\s*,\s*/, $enc);
542              
543 320 50       932 confess "Oups! No encoding to try?" if $#encodings < 0;
544              
545 320 50       728 print($_debugh "[ST] _detect_inh_encoding(): start\n") if $_debug;
546              
547 320         497 my $wrn = 0;
548 320         777 my $m;
549             my $m0;
550 320         0 my $ee;
551 320         684 for my $e (@encodings) {
552 332         514 $ee = $e;
553 332         692 my $viadef = _get_def($via, '');
554 332         985 $m = ":encoding($e)$viadef";
555 332 100       801 $m0 = $m unless defined($m0);
556              
557 332 100       676 last unless $detect_enc;
558              
559 320 50 33     1342 confess "Oups! in_file not defined?" if !defined($in_file) or $in_file eq '';
560              
561 320 50       641 print($_debugh " Checking encoding '$e' / '$m'\n") if $_debug;
562 320         465 $wrn = 0;
563              
564 320         1138 $self->_register_pass("check $e encoding");
565              
566 320         522 my $utf8_bom = 0;
567 320 100       733 if (_is_utf8($e)) {
568 296 50       6325 if (open my $fh, '<:raw', $in_file) {
569 296         622 my $bom;
570 296         4840 read $fh, $bom, 3;
571 296 100 66     1554 if (length($bom) == 3 and $bom eq "\xef\xbb\xbf") {
572 12 100       41 if (!defined($via)) {
573 10         34 $m .= ":via(File::BOM)";
574             }
575             }
576 296         2429 close $fh;
577             }
578             }
579              
580 320         641 my $inh;
581 320 50       4998 if (!open($inh, "<", $in_file)) {
582 0         0 $self->_print_error("unable to open file '$in_file': $!");
583 0         0 return ($encodings[0], $m0);
584             }
585 16     16   103 binmode $inh, $m;
  16         26  
  16         98  
  320         3529  
586              
587             # TURN OFF WARNINGS OUTPUT
588              
589             {
590 320         34466 local $SIG{__WARN__} = sub {
591 51     51   892 $wrn++;
592             # Uncomment only for debug!
593             # Otherwise you'll get quite a good deal of output at each execution :-)
594             # print(STDERR @_);
595 320         2319 };
596 320         4433 while (<$inh>) { }
597             }
598              
599             # WARNINGS ARE BACK ON
600              
601 320         15184 close $inh;
602 320 50       973 print($_debugh " '$m' counts $wrn warning(s)\n") if $_debug;
603              
604 320 100       1326 last if $wrn == 0;
605             }
606              
607 320 50       793 if ($wrn >= 1) {
608 0         0 $self->_print_warning("encoding warnings encountered during initial check, " .
609             "using '$encodings[0]'");
610 0         0 return ($encodings[0], $m0);
611             }
612              
613 320 50       670 confess "Oups! undef encoding string?" unless defined($m);
614              
615 320 50       656 print($_debugh " Detected encoding string '$ee' / '$m'\n") if $_debug;
616 320         1325 return ($ee, $m);
617             }
618              
619             #
620             # Each of these functions brings status to the next value (current status + 1).
621             # Each of these functions returns 0 if an error occured, 1 if all good
622             #
623             my @status_forward_functions = (
624             "_S1_init_input", # To go from S0 to S1
625             "_S2_init_fields_from_header", # To go form S1 to S2
626             "_S3_init_fields_extra", # To go from S2 to S3
627             "_S4_read_all_in_mem", # To go from S3 to S4
628             );
629              
630             sub _status_reset {
631 494     494   770 my $self = shift;
632              
633 494         3224 validate_pos(@_, {type => SCALAR, optional => 1});
634 494         1706 my $called_from_print_error = _get_def($_[0], 0);
635              
636 494 100 100     1742 if (defined($self->{_status}) and $self->{_status} == 4) {
637 18 100       46 unless ($called_from_print_error) {
638 16         27 my $msg = "in-memory CSV content discarded, will have to re-read input";
639 16         59 $self->_print_warning($msg);
640             }
641 18         159 $self->{_flat} = [ ];
642 18         61 $self->_update_in_mem_record_count();
643             }
644              
645 494         993 $self->{_status} = 0;
646 494 100       961 return 0 if $called_from_print_error;
647 464         1069 return $self->_status_forward('S1');
648             }
649              
650             sub _status_forward {
651 4084     4084   5417 my $self = shift;
652              
653 4084         7464 return $self->___status_move(@_, 1);
654             }
655              
656             sub _status_backward {
657 139     139   195 my $self = shift;
658              
659 139         310 return $self->___status_move(@_, -1);
660             }
661              
662             # You should not call ___status_move() in the code, that is why the name is prefixed with 3
663             # underscores! Only _status_forward and _status_backward should call it.
664             sub ___status_move {
665 4223     4223   7145 my ($self, $target, $step) = @_;
666              
667 4223         6270 my $_debug = $self->{_debug};
668 4223         5514 my $_debugh = $self->{_debugh};
669              
670 4223 50 66     17189 if (!defined($step) or ($step != -1 and $step != 1)) {
      33        
671 0         0 confess "Oups! \$step has a wrong value: '$step'";
672             }
673              
674 4223         5201 my $n;
675 4223 50       17241 confess "Oups! illegal status string: '$target'" unless ($n) = $target =~ m/^S(\d)$/;
676              
677 4223 100       8602 if ($self->{_read_in_progress}) {
678 1         4 $self->_print_error("illegal call while read is in progress, " .
679             "would lead to infinite recursion", 0);
680 0         0 confess "Aborted.";
681             }
682              
683 4222 100       7047 if ($step == -1) {
684 139 100       313 if ($n < $self->{_status}) {
685 19 100       49 if ($self->{_status} == 4) {
686 16 50       40 print($_debugh "[ST] Requested status $n but will go to status 0\n") if $_debug;
687 16         47 return $self->_status_reset();
688             }
689 3         5 $self->{_status} = $n ;
690 3 50       7 print($_debugh "[ST] New status: ". $self->{_status} . "\n") if $_debug;
691             }
692 123         336 return 1;
693             }
694              
695 4083 100       8334 if ($self->{_status} < $n) {
696 869 50       1584 print($_debugh "[ST] Current status: ". $self->{_status} . "\n") if $_debug;
697             }
698              
699 4083 100 100     10040 if ($self->{_status} <= 1 and $n >= 2 and $self->{_int_one_pass} and
      100        
      100        
700             $self->get_pass_count() >= 1) {
701 12         23 my $msg = "one_pass set, unable to read input again";
702 12 50       46 $self->_print_error($msg), return 0 if $self->{one_pass};
703 0 0       0 $self->_print_warning($msg) if !$self->{one_pass};
704             }
705              
706 4071         7551 while ($self->{_status} < $n) {
707              
708 1347         2548 my $funcname = $status_forward_functions[$self->{_status}];
709 1347 50       2661 confess "Oups! Unknown status?" unless defined($funcname);
710              
711 1347 50       2236 print($_debugh "[ST] Now executing $funcname\n") if $_debug;
712              
713 1347 50       5495 if (my $member_function = $self->can($funcname)) {
714 1347 100       3075 return 0 unless $self->$member_function();
715             } else {
716 0         0 confess "Could not find method $funcname in $PKG!";
717             }
718              
719 1328         2485 $self->{_status} += $step;
720 1328 50       4024 print($_debugh "[ST] New status: ". $self->{_status} . "\n") if $_debug;
721             }
722              
723 4052         9545 return 1;
724             }
725              
726             sub new {
727 333     333 1 255769 my ($class, @args) = @_;
728              
729 333         39585 @args = validate(@args,
730             { in_file => {type => SCALAR, optional => 1},
731             infoh => {type => UNDEF | GLOBREF, default => \*STDERR, optional => 1},
732             verbose => {type => BOOLEAN, default => 0, optional => 1},
733             quiet => {type => BOOLEAN, optional => 1},
734             croak_if_error => {type => BOOLEAN, default => 1, optional => 1},
735             inh => {type => GLOBREF, optional => 1},
736             in_csvobj => {type => OBJECT, optional => 1},
737             sep_char => {type => SCALAR, optional => 1},
738             quote_char => {type => SCALAR, optional => 1},
739             escape_char => {type => SCALAR, optional => 1},
740             has_headers => {type => BOOLEAN, default => 1, optional => 1},
741             out_has_headers => {type => UNDEF | BOOLEAN, default => undef, optional => 1},
742             fields_ar => {type => ARRAYREF, optional => 1},
743             fields_hr => {type => HASHREF, optional => 1},
744             fields_column_names => {type => ARRAYREF, optional => 1},
745             search_case => {type => SCALAR, optional => 1},
746             search_trim => {type => SCALAR, optional => 1},
747             search_ignore_empty => {type => SCALAR, optional => 1},
748             search_ignore_accents => {type => SCALAR, optional => 1},
749             search_ignore_ambiguous => {type => SCALAR, optional => 1},
750             search_value_if_not_found => {type => SCALAR, optional => 1},
751             search_value_if_found => {type => SCALAR, optional => 1},
752             search_value_if_ambiguous => {type => SCALAR, optional => 1},
753             walker_hr => {type => CODEREF, optional => 1},
754             walker_ar => {type => CODEREF, optional => 1},
755             read_post_update_hr => {type => CODEREF, optional => 1},
756             write_filter_hr => {type => CODEREF, optional => 1},
757             out_filter => {type => CODEREF, optional => 1},
758             write_fields => {type => ARRAYREF, optional => 1},
759             out_orderby => {type => ARRAYREF, optional => 1},
760             out_fields => {type => ARRAYREF, optional => 1},
761             out_file => {type => SCALAR, optional => 1},
762             out_always_quote => {type => BOOLEAN, optional => 1},
763             out_sep_char => {type => SCALAR, optional => 1},
764             out_quote_char => {type => SCALAR, optional => 1},
765             out_escape_char => {type => SCALAR, optional => 1},
766             out_dates_format => {type => SCALAR, optional => 1},
767             out_dates_locale => {type => SCALAR, optional => 1},
768             encoding => {type => SCALAR, optional => 1},
769             via => {type => SCALAR, optional => 1},
770             out_encoding => {type => SCALAR, optional => 1},
771             dont_mess_with_encoding => {type => BOOLEAN, optional => 1},
772             one_pass => {type => BOOLEAN, optional => 1},
773             no_undef => {type => BOOLEAN, optional => 1},
774             fields_dates => {type => ARRAYREF, optional => 1},
775             fields_dates_auto => {type => BOOLEAN, optional => 1},
776             fields_dates_auto_optimize => {type => BOOLEAN, optional => 1},
777             dates_formats_to_try => {type => ARRAYREF, optional => 1},
778             dates_formats_to_try_supp => {type => ARRAYREF, optional => 1},
779             dates_ignore_trailing_chars => {type => BOOLEAN, optional => 1},
780             dates_search_time => {type => BOOLEAN, optional => 1},
781             dates_locales => {type => SCALAR, optional => 1},
782             out_utf8_bom => {type => SCALAR, optional => 1},
783             dates_zeros_ok => {type => SCALAR, default => 1, optional => 1},
784             _debug => {type => BOOLEAN, default => 0, optional => 1},
785             _debug_read => {type => BOOLEAN, default => 0, optional => 1},
786             _debug_extra_fields => {type => BOOLEAN, optional => 1},
787             _debugh => {type => UNDEF | GLOBREF, optional => 1}
788             }
789             );
790              
791 329         7092 my $self = { @args };
792              
793 329         686 my @fields = keys %{$self};
  329         1229  
794              
795             # croak_if_error
796              
797 329         815 my $croak_if_error = $self->{croak_if_error};
798              
799             # verbose and _debug management
800              
801 329 50       1220 $self->{_debugh} = $self->{infoh} if !defined($self->{_debugh});
802 329 50       879 $self->{_debug} = 1 if $ALWAYS_DEBUG;
803 329         624 my $_debug = $self->{_debug};
804 329 50       714 $self->{verbose} = 1 if $_debug;
805 329         541 my $verbose = $self->{verbose};
806              
807 329         502 my $_debugh = $self->{_debugh};
808              
809 329         618 bless $self, $class;
810              
811             # fields_ar, fields_hr
812              
813 329 100       1086 if (defined($self->{fields_ar}) +
814             defined($self->{fields_hr}) +
815             defined($self->{fields_column_names})
816             >= 2) {
817 1         7 $self->_print_error("mixed use of fields_ar, fields_hr and fields_column_names. " .
818             "Use one at a time.");
819             }
820 329 100 100     1050 if (defined($self->{fields_ar}) and !defined($self->{fields_hr})) {
821 2         4 my @f = @{$self->{fields_ar}};
  2         6  
822 2         3 my %h;
823 2         6 for my $e (@f) {
824 6         15 $h{$e} = "^$e\$";
825             }
826 2         6 $self->{fields_hr} = \%h;
827             }
828 329 100       749 if (!$self->{has_headers}) {
829 13 100       33 if (defined($self->{fields_ar})) {
830 1         5 $self->_print_error("fields_ar irrelevant if CSV file has no headers");
831 1         175 return undef;
832             }
833 12 100       29 if (defined($self->{fields_hr})) {
834 1         4 $self->_print_error("fields_hr irrelevant if CSV file has no headers");
835 1         5 return undef;
836             }
837             }
838              
839             # in_file or inh
840              
841 327         733 $self->{_flat} = [ ];
842              
843 327         686 $self->{_read_update_after_hr} = { };
844 327         652 $self->{_write_update_before_hr} = { };
845              
846 327         1074 $self->_update_in_mem_record_count();
847              
848 327 100       882 return undef unless $self->_status_reset();
849              
850 318 50       672 $self->_debug_show_members() if $_debug;
851              
852 318 100       728 if ($self->{dates_zeros_ok}) {
853             $self->{_refsub_is_datetime_empty} = sub {
854 9637     9637   16324 my $v = $_[0];
855 9637 100       29969 if ($v !~ m/[1-9]/) {
856 4207 100       10364 return 1 if $v =~ m/^[^0:]*0+[^0:]+0+[^0:]+0+/;
857             }
858 9634         30100 return 0;
859             }
860 316         1466 }
861              
862 318         5336 return $self;
863             };
864              
865             #
866             # Return 0 if error, 1 if all good
867             #
868             # Do all low level activities associated to input:
869             # I/O init
870             # Detect encoding
871             # Detect CSV separator
872             # Detect escape character
873             #
874             sub _S1_init_input {
875 470     470   726 my $self = shift;
876              
877 470         786 my $croak_if_error = $self->{croak_if_error};
878 470         751 my $_debug = $self->{_debug};
879 470         675 my $_debugh = $self->{_debugh};
880              
881 470 100       952 $self->{in_file} = '' unless defined($self->{in_file});
882 470         946 $self->{_close_inh_when_finished} = 0;
883              
884             $self->{_leave_encoding_alone} = $self->{dont_mess_with_encoding}
885 470 50       1027 if defined($self->{dont_mess_with_encoding});
886              
887 470         1341 $self->{_int_one_pass} = _get_def($self->{one_pass}, 0);
888 470         878 my $in_file_disp;
889              
890              
891             #
892             # LOW LEVEL INIT STEP 1 OF 4
893             #
894             # Manage I/O (= in most cases, open input file...)
895             #
896              
897 470 100       993 if (defined($self->{inh})) {
898 4 50       13 $self->{_leave_encoding_alone} = 1 unless defined($self->{dont_mess_with_encoding});
899 4         10 $in_file_disp = _get_def($self->{in_file}, '<?>');
900 4 50       12 $self->{_int_one_pass} = 1 unless defined($self->{one_pass});
901 4         8 $self->{_inh} = $self->{inh};
902             } else {
903 466 50       1201 $self->{_leave_encoding_alone} = 0 unless defined($self->{dont_mess_with_encoding});
904 466         803 my $in_file = $self->{in_file};
905 466         670 my $inh;
906 466 50       972 if ($in_file eq '') {
907 0         0 $inh = \*STDIN;
908 0         0 $self->{inh_is_stdin} = 1;
909 0 0       0 $self->{_int_one_pass} = 1 unless defined($self->{one_pass});
910 0         0 $in_file_disp = '<stdin>';
911             } else {
912 466 100       17077 if (!open($inh, '<', $in_file)) {
913 3         52 $self->_print_error("unable to open file '$in_file': $!");
914 3         77 return 0;
915             }
916 463         1180 $in_file_disp = $in_file;
917 463         1080 $self->{_close_inh_when_finished} = 1;
918             }
919 463         1005 $self->{_inh} = $inh;
920             }
921              
922 467 50       1021 confess "Oups! in_file_disp not defined?" unless defined($in_file_disp);
923 467         964 $self->{_in_file_disp} = $in_file_disp;
924              
925              
926             #
927             # LOW LEVEL INIT STEP 2 OF 4
928             #
929             # "Detection" of encoding
930             #
931             # WARNING
932             # As explained in the manual, it is a very partial and limited detection...
933             #
934              
935 467 100       1012 unless ($self->{_leave_encoding_alone}) {
936 463 100       1032 unless ($self->{_init_input_already_called}) {
937             my ($e, $m) = $self->_detect_inh_encoding($self->{encoding}, $self->{via},
938 320 100       1815 $self->{in_file}, ($self->{_int_one_pass} ? 0 : $DETECT_ENCODING));
939 320         1083 $self->{_inh_encoding} = $e;
940 320         706 $self->{_inh_encoding_string} = $m;
941             }
942              
943 463         3422 binmode $self->{_inh}, $self->{_inh_encoding_string};
944             print($_debugh "Input encoding: '" . $self->{_inh_encoding} . "' / '" .
945 463 50       21442 $self->{_inh_encoding_string} . "'\n") if $_debug;
946              
947             }
948              
949 467 100       1313 $self->{out_file} = '' unless defined($self->{out_file});
950              
951              
952             #
953             # LOW LEVEL INIT STEP 3 OF 4
954             #
955             # Detection of CSV separator and escape character
956             #
957              
958 467         639 my $sep_char;
959 467         805 my $escape_char = $self->{escape_char};
960 467 100       1180 $self->{quote_char} = $DEFAULT_QUOTE_CHAR unless defined($self->{quote_char});
961 467         776 my $quote_char = $self->{quote_char};
962 467 100       1044 unless (defined($self->{in_csvobj})) {
963 462 100       913 if (defined($self->{sep_char})) {
964 200         395 $sep_char = $self->{sep_char};
965 200 50       473 print($_debugh "-- $in_file_disp: CSV separator set to \"") if $_debug;
966             } else {
967             # The test below (on _init_input_already_called) shoud be useless.
968             # Left for the sake of robustness.
969 262 50       550 unless ($self->{_init_input_already_called}) {
970 262 100       823 if (!$self->_detect_csv_sep($escape_char, $quote_char, \$sep_char)) {
971 6         20 $self->_print_error("'$in_file_disp': cannot detect CSV separator");
972 0         0 return 0;
973             }
974 256 50       567 print($_debugh "-- $in_file_disp: CSV separator detected to \"") if $_debug;
975 256         581 $self->{sep_char} = $sep_char;
976             }
977             }
978 456 50       865 print($_debugh _render($sep_char) . "\"\n") if $_debug;
979              
980 456         1433 $self->_detect_meta($quote_char, $sep_char);
981              
982             $self->{_in_csvobj} = Text::CSV->new({sep_char => $sep_char,
983             allow_whitespace => 1, binary => 1, auto_diag => 0,
984             quote_char => $quote_char, escape_char => $self->{escape_char},
985 456         3227 allow_loose_escapes => 1});
986 456 50       75000 unless (defined($self->{_in_csvobj})) {
987 0         0 $self->_print_error("error creating input Text::CSV object");
988 0         0 return 0;
989             }
990              
991             } else {
992 5         8 $self->{_in_csvobj} = $self->{in_csvobj};
993             }
994              
995 461         888 $self->{_init_input_already_called} = 1;
996              
997 461         1297 return 1;
998             }
999              
1000             sub get_in_file_disp {
1001 3663     3663 1 4836 my $self = shift;
1002              
1003 3663         19857 validate_pos(@_);
1004              
1005 3663         9455 my $in_file_disp = _get_def($self->{_in_file_disp}, '?');
1006 3663         6690 return $in_file_disp;
1007             }
1008              
1009             sub get_sep_char {
1010 0     0 1 0 my $self = shift;
1011              
1012 0         0 validate_pos(@_);
1013              
1014 0         0 return $self->{sep_char};
1015             }
1016              
1017             sub get_escape_char {
1018 9     9 1 37 my $self = shift;
1019              
1020 9         53 validate_pos(@_);
1021              
1022 9         52 return $self->{escape_char};
1023             }
1024              
1025             sub get_in_encoding {
1026 27     27 1 6959 my $self = shift;
1027              
1028 27         271 validate_pos(@_);
1029              
1030 27         118 return _get_def($self->{_inh_encoding}, '');
1031             }
1032              
1033             sub get_is_always_quoted {
1034 15     15 1 64 my $self = shift;
1035              
1036 15         88 validate_pos(@_);
1037              
1038 15         73 return $self->{_is_always_quoted};
1039             }
1040              
1041             sub get_pass_count {
1042 43     43 1 687 my $self = shift;
1043              
1044 43         233 validate_pos(@_);
1045              
1046 43         137 return _get_def($self->{_pass_count}, 0);
1047             }
1048              
1049             sub get_in_mem_record_count {
1050 0     0 1 0 my $self = shift;
1051              
1052 0         0 validate_pos(@_);
1053              
1054 0         0 return ($self->{_in_mem_record_count}, 0);
1055             }
1056              
1057             sub get_max_in_mem_record_count {
1058 513     513 1 2254 my $self = shift;
1059              
1060 513         3038 validate_pos(@_);
1061              
1062 513         2118 return _get_def($self->{_max_in_mem_record_count}, 0);
1063             }
1064              
1065             sub _set_max_in_mem_record_count {
1066 155     155   279 my $self = shift;
1067              
1068 155         1537 validate_pos(@_, {type => SCALAR});
1069              
1070 155         538 $self->{_max_in_mem_record_count} = $_[0];
1071             }
1072              
1073             sub get_fields_names {
1074 19     19 1 3445 my $self = shift;
1075              
1076 19         122 validate_pos(@_);
1077              
1078 19 50       58 return () unless $self->_status_forward('S3');
1079 19         31 return @{$self->{_columns}};
  19         89  
1080             }
1081              
1082             sub get_field_name {
1083 1     1 1 475 my $self = shift;
1084              
1085 1         11 validate_pos(@_, {type => SCALAR});
1086              
1087 1         3 my ($n) = @_;
1088              
1089 1 50       3 return undef unless $self->_status_forward('S3');
1090 1         4 return $self->{_columns}->[$n];
1091             }
1092              
1093             sub get_coldata {
1094 1     1 1 2 my $self = shift;
1095              
1096 1         7 validate_pos(@_);
1097              
1098 1 50       8 return () unless $self->_status_forward('S3');
1099 1         2 my @ret;
1100 1         3 for (@{$self->{_coldata}}) {
  1         2  
1101 4         135 push @ret, [
1102             $_->field_name,
1103             $_->header_text,
1104             $_->description,
1105             $_->dt_format,
1106             $_->dt_locale,
1107             $_->multiline];
1108             }
1109              
1110 1         26 return @ret;
1111             }
1112              
1113             sub get_stats {
1114 2     2 1 12 my $self = shift;
1115              
1116 2         11 validate_pos(@_);
1117              
1118 2 50       7 return () unless defined($self->{_stats});
1119 2         3 return %{$self->{_stats}};
  2         7  
1120             }
1121              
1122             sub get_nb_rows {
1123 1     1 1 2 my $self = shift;
1124              
1125 1         7 validate_pos(@_);
1126              
1127 1         3 return $self->{_nb_rows};
1128             }
1129              
1130             sub _debug_show_members {
1131 0     0   0 my ($self) = @_;
1132 0         0 my $_debugh = $self->{_debugh};
1133 0 0       0 my @a = @{$self->{fields_ar}} if defined($self->{fields_ar});
  0         0  
1134 0 0       0 my @c = @{$self->{fields_column_names}} if defined($self->{fields_column_names});
  0         0  
1135 0 0       0 my %h = %{$self->{fields_hr}} if defined($self->{fields_hr});
  0         0  
1136              
1137 0         0 print($_debugh "-- _debug_show_members() start\n");
1138 0         0 print($_debugh " croak_if_error $self->{croak_if_error}\n");
1139 0         0 print($_debugh " verbose $self->{verbose}\n");
1140 0         0 print($_debugh " _debug $self->{_debug}\n");
1141 0         0 print($_debugh " _debug_read $self->{_debug_read}\n");
1142 0         0 print($_debugh " infoh $self->{infoh}\n");
1143 0         0 print($_debugh " _debugh $_debugh\n");
1144 0         0 print($_debugh " inh: $self->{_inh}\n");
1145 0         0 print($_debugh " in_file_disp " . $self->get_in_file_disp() . "\n");
1146 0         0 print($_debugh " _in_csvobj $self->{_in_csvobj}\n");
1147 0         0 print($_debugh " has_headers $self->{has_headers}\n");
1148 0         0 print($_debugh " fields_ar:\n");
1149 0         0 for my $e (@a) {
1150 0         0 print($_debugh " '$e'\n");
1151             }
1152 0         0 print($_debugh " fields_hr:\n");
1153 0         0 for my $e (keys %h) {
1154 0         0 print($_debugh " '$e' => '$h{$e}'\n");
1155             }
1156 0         0 print($_debugh " fields_column_names:\n");
1157 0         0 for my $e (@c) {
1158 0         0 print($_debugh " '$e'\n");
1159             }
1160 0         0 print($_debugh "-- _debug_show_members() end\n");
1161             }
1162              
1163             #
1164             # Check headers in CSV header line
1165             # Used to increase robustness by relying on header title rather than
1166             # column number.
1167             #
1168             # Return 1 if success (all fields found), 0 otherwise.
1169             #
1170             sub _process_header {
1171 12     12   30 my $self = shift;
1172 12         19 my @headers = @{shift(@_)};
  12         43  
1173 12         22 my %fields_h = %{shift(@_)};
  12         69  
1174 12         26 my $retval = shift;
1175              
1176 12         23 my @tmp = keys %{$retval};
  12         51  
1177              
1178 12         38 my $in_file_disp = $self->get_in_file_disp();
1179              
1180 12 50       40 confess '$_[4] must be an empty by-ref hash' if $#tmp >= 0;
1181              
1182 12         25 my $e = 0;
1183 12         43 for my $k (keys %fields_h) {
1184 48         79 my $v = $fields_h{$k};
1185              
1186 48     462   205 my @all_idx = indexes { /$v/i } @headers;
  462         2576  
1187 48 50       164 if ($#all_idx >= 1) {
1188 0         0 $self->_print_error("file $in_file_disp: " .
1189             "more than one column matches the criteria '$v'");
1190 0         0 $e++;
1191             }
1192 48     258   172 my $idx = first_index { /$v/i } @headers;
  258         1541  
1193 48 50       206 if ($idx < 0) {
1194 0         0 $self->_print_error("file $in_file_disp: unable to find field '$v'");
1195 0         0 $e++;
1196             } else {
1197 48         117 $retval->{$k} = $idx;
1198             }
1199             }
1200              
1201 12 50       73 return ($e >= 1 ? 0 : 1);
1202             }
1203              
1204             sub set_walker_hr {
1205 2     2 1 1800 my $self = shift;
1206 2         24 validate_pos(@_, {type => UNDEF | CODEREF, optional => 1});
1207              
1208 2         8 my ($walker_hr) = @_;
1209              
1210 2 50       11 return undef unless $self->_status_forward('S2');
1211 2 50       7 return undef unless $self->_status_backward('S2');
1212 2         3 $self->{walker_hr} = $walker_hr;
1213              
1214 2         6 return $self;
1215             }
1216              
1217             sub set_walker_ar {
1218 2     2 1 1460 my $self = shift;
1219 2         24 validate_pos(@_, {type => UNDEF | CODEREF, optional => 1});
1220              
1221 2         8 my ($walker_ar) = @_;
1222              
1223 2 50       6 return undef unless $self->_status_forward('S2');
1224 2 50       5 return undef unless $self->_status_backward('S2');
1225 2         4 $self->{walker_ar} = $walker_ar;
1226              
1227 2         4 return $self;
1228             }
1229              
1230              
1231             # * *************************************** *
1232             # * BEGINNING OF DATE FORMAT DETECTION CODE *
1233             # * *************************************** *
1234              
1235              
1236             #
1237             # The '%m.%d.%y' is not at its "logical" location. It is done to make sure the order in which
1238             # entries are written does not impact the result.
1239             #
1240             # It could occur because there is some code that correlates an entry containing %y with another
1241             # one that would contain %Y. The %Y will be called the master, the %y will be called the slave.
1242             # It is important to match such entries, otherwise an identified format with %y would always be
1243             # ambiguous with the same written with %Y.
1244             #
1245             # IMPORTANT
1246             # The list below is written almost as-is in the POD at the bottom of this file.
1247             #
1248             my @DATES_DEFAULT_FORMATS_TO_TRY = (
1249             '',
1250             '%Y-%m-%d',
1251             '%Y.%m.%d',
1252             '%Y/%m/%d',
1253              
1254             '%m.%d.%y',
1255              
1256             '%m-%d-%Y',
1257             '%m.%d.%Y',
1258             '%m/%d/%Y',
1259             '%d-%m-%Y',
1260             '%d.%m.%Y',
1261             '%d/%m/%Y',
1262              
1263             '%m-%d-%y',
1264             '%m/%d/%y',
1265             '%d-%m-%y',
1266             '%d.%m.%y',
1267             '%d/%m/%y',
1268              
1269             '%Y%m%d%H%M%S',
1270              
1271             # Localizaed formats
1272              
1273             '%b %d, %Y',
1274             '%b %d %Y',
1275             '%b %d %T %Z %Y',
1276             '%d %b %Y',
1277             '%d %b, %Y'
1278             );
1279              
1280             #
1281             # IMPORTANT
1282             # Under Linux, $START is useless. Strptime will match a format exactly as it is, and a tring
1283             # like "01/01/16 13:00:00" won't match with "%T". Under Windows, Strptime is capable of doing
1284             # a match by ignoring characters at the beginning, thus "01/01/2016 13:00:00" for example will
1285             # return success when matched against "%T".
1286             # Possibly it has to do with versionning of Strptime, not Linux versus Windows as such. Any
1287             # way, this difference had to be dealt with.
1288             #
1289             # The flexibility under Windows would screw the code logic so I had to add the prefix string
1290             # below, to avoid unexpected success on match.
1291             #
1292             my $START = '<';
1293              
1294             struct RecordCounter => {
1295             count_ok => '$',
1296             count_ko => '$',
1297             has_searched_time => '$',
1298              
1299             format => '$',
1300             locale => '$',
1301              
1302             has_found_time => '$',
1303             format_with_addition_of_time => '$',
1304             locale_with_addition_of_time => '$',
1305             parser_with_addition_of_time => '$'
1306             };
1307              
1308             struct Format => {
1309             id => '$',
1310             format => '$',
1311             locale => '$',
1312             parser => '$',
1313             index_slave => '$',
1314             index_master => '$'
1315             };
1316              
1317             sub _col_dispname {
1318 539     539   897 my ($self, $n) = @_;
1319              
1320 539         664 my $col;
1321              
1322             #
1323             # IMPORTANT
1324             #
1325             # We cannot execute here a command like
1326             # $self->_status_forward('S3');
1327             # (to ensure _columns is well defined) because _col_dispname is called by
1328             # _detect_dates_formats that is in turn called by _S3_init_fields_extra. A call to
1329             # _status_forward would trigger a never-ending call loop.
1330             #
1331 539         1200 my $cols = _get_def($self->{'_columns'}, $self->{'_S2_columns'});
1332              
1333 539 50       1000 if ($self->{has_headers}) {
1334 539         860 $col = $cols->[$n];
1335 539 50       874 $col = "<UNDEF>" unless defined($col);
1336             } else {
1337 0         0 $col = "[$n]";
1338             }
1339 539         1061 return $col;
1340             }
1341              
1342             # Used by test plan only...
1343             sub _dds {
1344 45     45   1111 my $self = shift;
1345              
1346 45 50       107 return undef unless $self->_status_forward('S3');
1347 43 50       145 return undef unless defined($self->{_dates_detailed_status});
1348 43         333 return $self->{_dates_detailed_status};
1349             }
1350              
1351             sub _detect_dates_formats {
1352 354     354   536 my $self = shift;
1353              
1354 354 100       933 return if $self->{_detect_dates_formats_has_run};
1355 287         545 $self->{_detect_dates_formats_has_run} = 1;
1356 287 100       656 my @fields_dates = @{$self->{fields_dates}} if defined($self->{fields_dates});
  11         34  
1357 287 100 100     1348 return unless @fields_dates or $self->{fields_dates_auto};
1358              
1359 54 50       157 if ($self->{_int_one_pass}) {
1360 0         0 $self->_print_error("date format detection disallowed when one_pass is set");
1361 0         0 return;
1362             }
1363              
1364 54         109 my $_debug = $self->{_debug};
1365 54         92 my $_debugh = $self->{_debugh};
1366 54   33     171 my $debug_fmt = ($_debug and $DEBUG_DATETIME_FORMATS);
1367              
1368 54         158 $self->_register_pass("detect date format");
1369              
1370             #
1371             # Why re-opening the input?
1372             # I tried two other ways that never worked on some OSes (like freebsd) and/or with older perl
1373             # versions.
1374             #
1375             # 1) The "tell" tactic
1376             # Recording at the beginning of the function the file position with
1377             # my $pos = tell($self->{inh});
1378             # ... and then recalling with a seek instruction is the most logical.
1379             # But it didn't work = sometimes, reading would go back to first row (the headers) instead
1380             # of the second row, could not figure out why (it would work on my Ubuntu 16.04 / perl 5.22, but
1381             # would fail with other OSes and/or perl versions).
1382             #
1383             # 2) The "complete rewind" tactic
1384             # I then undertook to do (at the end of detection function):
1385             # seek $inh, 0, SEEK_SET;
1386             # $incsv->getline($inh) if $self->{has_headers};
1387             # based on the assumption that a seek to zero would behave differently from a seek to an
1388             # arbitrary position.
1389             # But still, it would sometimes fail....
1390             #
1391              
1392 54         131 my $inh = $self->_reopen_input();
1393 54         111 my $incsv = $self->{_in_csvobj};
1394 54 50       223 _mygetline($incsv, $inh) if $self->{has_headers};
1395              
1396 54         2360 my $formats_to_try = $self->{dates_formats_to_try};
1397 54         95 my $ignore_trailing_chars = $self->{dates_ignore_trailing_chars};
1398 54         105 my $search_time = $self->{dates_search_time};
1399 54         89 my $localizations = $self->{dates_locales};
1400              
1401 54         79 my %regular_named_fields = %{$self->{_regular_named_fields}};
  54         437  
1402              
1403 54         134 my $refsub_is_datetime_empty = $self->{_refsub_is_datetime_empty};
1404              
1405 54         88 my @fields_to_detect_format;
1406 54 100       195 if (defined($self->{fields_dates})) {
    50          
1407 11         21 my $count_field_not_found = 0;
1408 11         18 my %column_seen;
1409 11         18 for my $f (@{$self->{fields_dates}}) {
  11         40  
1410 23 100       55 if (!exists $regular_named_fields{$f}) {
1411 1         10 $self->_print_error("fields_dates: unknown field: '$f'",
1412             1, ERR_UNKNOWN_FIELD, { %regular_named_fields } );
1413 1         34 $count_field_not_found++;
1414 1         3 next;
1415             }
1416 22         36 my $n = $regular_named_fields{$f};
1417 22 50       56 if (exists $column_seen{$n}) {
1418 0         0 $self->_print_warning("field '$f' already seen");
1419 0         0 next;
1420             }
1421 22         41 $column_seen{$n} = 1;
1422 22         45 push @fields_to_detect_format, $n;
1423             }
1424 11 100       34 $self->_print_error("non existent field(s) encountered, aborted") if $count_field_not_found;
1425             } elsif ($self->{fields_dates_auto}) {
1426 43         168 my @k = keys %regular_named_fields;
1427 43         193 @fields_to_detect_format = (0..$#k);
1428             } else {
1429 0         0 confess "Hey! check this code, man";
1430             }
1431              
1432             #
1433             # FIXME?
1434             # Sort by column number of not?
1435             #
1436             # At this moment in time, the author inclines to answer "yes".
1437             # But I must admit it is rather arbitrary decision for now.
1438             #
1439 53         299 @fields_to_detect_format = sort { $a <=> $b } @fields_to_detect_format;
  753         868  
1440              
1441 2         5 my @dates_formats_supp = @{$self->{dates_formats_to_try_supp}}
1442 53 100       143 if defined($self->{dates_formats_to_try_supp});
1443              
1444 53 100       367 $formats_to_try = [ @DATES_DEFAULT_FORMATS_TO_TRY ] unless defined($formats_to_try);
1445 53         96 $formats_to_try = [ @{$formats_to_try}, @dates_formats_supp ];
  53         219  
1446 53         143 my %seen;
1447 53         105 my $f2 = [ ];
1448 53         149 for (@${formats_to_try}) {
1449 978 50       1460 push @{$f2}, $_ unless exists($seen{$_});
  978         1290  
1450 978         1693 $seen{$_} = undef;
1451             }
1452 53         122 $formats_to_try = $f2;
1453              
1454 53 100       134 $ignore_trailing_chars = 1 unless defined($ignore_trailing_chars);
1455 53 100       153 $search_time = 1 unless defined($search_time);
1456              
1457 53 100       122 my $stop = ($ignore_trailing_chars ? '' : '>');
1458              
1459             #
1460             # The code below (from # AMB to # AMB-END) aims to remove ambiguity that comes from %Y versus %y.
1461             # That is: provided you have (among others) the formats to try
1462             # '%d-%m-%Y'
1463             # and
1464             # '%d-%m-%y'
1465             # then if parsing 4-digit-year dates (like '31-12-2016'), the two formats will work and you'll end
1466             # up with an ambiguity. To be precise, there'll be no ambiguity if the date is followed by a time,
1467             # but if the date is alone, both formats will work.
1468             #
1469             # Thanks to the below code, the member 'index_slave' (and its counterpart index_master) is populated
1470             # and later, if such an ambiguity is detected, the upper case version (the one containing upper case
1471             # '%Y') will be kept and the other one will be discarded.
1472             #
1473             # NOTE
1474             # Such an ambiguity can exist only when ignore_trailing_chars is set. Otherwise, the remaining two
1475             # digits make the date parsing fail in the '%y' case.
1476             #
1477             # The other members of the 'Format' object are used to work "normally", independently from this
1478             # ambiguity removal feature.
1479             #
1480              
1481             # WIP = Work In Progress...
1482 53         147 my @formats_wip;
1483 53 100       134 my @locales = split(/,\s*/, $localizations) if defined($localizations);
1484 53         87 for my $f (@{$formats_to_try}) {
  53         127  
1485 978 100       2006 my $has_localized_item = ($f =~ m/%a|%A|%b|%B|%c|%\+/ ? 1 : 0);
1486 978 100 100     1634 unless (@locales and $has_localized_item) {
1487 968         1524 push @formats_wip, [$f, ''];
1488 968         1322 next;
1489             }
1490 10         33 push @formats_wip, [$f, $_] foreach @locales;
1491             }
1492              
1493             # AMB
1494 53         99 my @formats;
1495             my %mates;
1496 53         151 for my $i (0..$#formats_wip) {
1497 988         2210 my $fstr = $formats_wip[$i]->[0];
1498 988         1415 my $floc = $formats_wip[$i]->[1];
1499              
1500             # FIXME
1501             # Will not manage correctly a string like
1502             # '%%Y'
1503             # that means (when used with Strptime), the litteral string '%Y' with no substitution.
1504             # Such cases will be complicated to fix, as it'll require to do a kind-of
1505             # Strptime-equivalent parsing of the string, and I find it a bit overkill.
1506             #
1507             # I prefer to push back in caller world saying
1508             # "Hey, if using constructs like '%%Y', you'll be in trouble."
1509 988         1287 my $m = $fstr;
1510 988         3725 $m =~ s/%y//ig;
1511 988         1696 $m .= $floc;
1512              
1513 988         1374 my $index_slave = -1;
1514 988         1150 my $index_master = -1;
1515 988 100       1989 if (exists $mates{$m}) {
1516 262         584 my $alt_fstr = $formats_wip[$mates{$m}]->[0];
1517 262 100       720 my $m_lower = ($fstr =~ m/%y/ ? 1 : 0);
1518 262 100       564 my $m_upper = ($fstr =~ m/%Y/ ? 1 : 0);
1519 262 100       510 my $a_lower = ($alt_fstr =~ m/%y/ ? 1 : 0);
1520 262 100       616 my $a_upper = ($alt_fstr =~ m/%Y/ ? 1 : 0);
1521              
1522             # We ignore the weird cases where we'd have both %y and %Y in a format string.
1523              
1524 262 100 66     2167 if (!$m_lower and $m_upper and $a_lower and !$a_upper) {
    50 66        
      33        
      33        
      33        
      33        
1525 45         100 $index_slave = $mates{$m};
1526 45         807 $formats[$mates{$m}]->index_master($i);
1527             } elsif ($m_lower and !$m_upper and !$a_lower and $a_upper) {
1528 217         377 $index_master = $mates{$m};
1529 217         3539 $formats[$mates{$m}]->index_slave($i);
1530             }
1531              
1532             } else {
1533 726         1469 $mates{$m} = $i;
1534             }
1535              
1536 988         4093 my %strptime_opts = (pattern => $START . $fstr . $stop);
1537 988 100       2313 $strptime_opts{locale} = $floc if $floc ne '';
1538 988 100       5465 my $format = Format->new(
1539             id => "$i",
1540             format => $fstr,
1541             locale => $floc,
1542             parser => ($fstr ne '' ?
1543             DateTime::Format::Strptime->new(%strptime_opts) :
1544             undef),
1545             index_slave => $index_slave,
1546             index_master => $index_master
1547             );
1548 988         1078022 push @formats, $format;
1549             }
1550 53         236 for my $i (0..$#formats) {
1551 988         10442 my $format = $formats[$i];
1552              
1553             # If a master could be itself the slave of another entry, that'd make it a hierarchical
1554             # relation tree with multiple levels. It is not possible, only a direct, unique
1555             # master-slave relation can be managed here.
1556 988 50 66     11977 confess "Inonsistent data, check this module's code urgently!"
1557             if $format->index_slave >= 0 and $format->index_master >= 0;
1558              
1559 988 100       21334 if ($format->index_slave >= 0) {
1560 262         4607 my $mate = $formats[$format->index_slave];
1561 262 50 33     4188 if ($mate->index_master != $i or $mate->index_slave != -1) {
1562 0         0 confess "Inonsistent data (2), check this module's code urgently!"
1563             }
1564             }
1565              
1566 988 100       20435 if ($format->index_master >= 0) {
1567 262         4330 my $mate = $formats[$format->index_master];
1568 262 50 33     4128 if ($mate->index_slave != $i or $mate->index_master != -1) {
1569 0         0 confess "Inonsistent data (3), check this module's code urgently!"
1570             }
1571             }
1572              
1573             }
1574 53 50       464 if ($debug_fmt) {
1575 0         0 for (@formats) {
1576 0         0 my ($idx, $rel) = (-1, "");
1577 0 0       0 $idx = $_->index_slave, $rel = "S: " if $_->index_slave >= 0;
1578 0 0       0 $idx = $_->index_master, $rel = "M: " if $_->index_master >= 0;
1579 0         0 printf($_debugh "%-18s %s %2d", "'" . $_->format . "'", $rel, $idx);
1580 0 0       0 print($_debugh ": '" . $formats[$idx]->format . "'") if $idx >= 0;
1581 0         0 print($_debugh "\n");
1582             }
1583             }
1584             # AMB-END
1585              
1586 53         118 my %records;
1587             my $record_number;
1588 53         92 my $count_gotit = 0;
1589 53         74 my $count_ambiguous = 0;
1590 53         85 my $count_nodate = 0;
1591 53         80 my $count_empty = 0;
1592 53         72 my $has_signaled_can_start_recording_data = 0;
1593 53         140 $self->{_line_after_which_recording_can_start} = 0;
1594              
1595             #
1596             # Seems a weird optimization here, but it is very important.
1597             # In some cases, divides execution time (to detect date format on big files
1598             # containing numerous fields) by 10.
1599             #
1600             # When evaluates to true, it means the input column has no identified date format, meaning,
1601             # no further check to do.
1602             #
1603 53         96 my @cache_nodate;
1604              
1605 53         185 while (my $f = _mygetline($incsv, $inh)) {
1606 4650         145432 $record_number++;
1607              
1608 4650 50       8680 if ($debug_fmt) {
1609 0         0 print($_debugh "RECORD $record_number:\n");
1610 0         0 for (0 .. @$f - 1) { printf($_debugh " %02d: '%s'\n", $_, $f->[$_]); }
  0         0  
1611             }
1612              
1613 4650         8426 for my $n (@fields_to_detect_format) {
1614 31445 100       53619 next if $cache_nodate[$n];
1615              
1616 16908         23080 my $v = $f->[$n];
1617 16908 100       25435 $v = '' unless defined($v);
1618 16908 100       30164 next if $v eq '';
1619 9406 100 100     24886 next if defined($refsub_is_datetime_empty) and $refsub_is_datetime_empty->($v);
1620              
1621 9403 50       16351 if ($debug_fmt) {
1622 0         0 my $col = $self->_col_dispname($n);
1623 0         0 print($_debugh "Line $record_number, column '$col':\n");
1624             }
1625              
1626 9403         16398 for my $fmt (@formats) {
1627 158464         1938371 my $fid = $fmt->id;
1628 158464         2580620 my $fstr = $fmt->format;
1629              
1630 158464 50       961742 $self->_debug_output_fmt('** pre ', $fmt, $records{$n}->{$fid}) if $debug_fmt;
1631              
1632             $records{$n}->{$fid} = RecordCounter->new(
1633             count_ok => 0,
1634             count_ko => 0,
1635             has_searched_time => 0,
1636              
1637             format => undef,
1638             locale => undef,
1639              
1640             has_found_time => 0,
1641             format_with_addition_of_time => undef,
1642             locale_with_addition_of_time => undef,
1643             parser_with_addition_of_time => undef
1644 158464 100       402666 ) unless defined($records{$n}->{$fid});
1645              
1646 158464 100       2396526 unless ($records{$n}->{$fid}->count_ko) {
1647 19561         156101 my $is_ok = &_try_parser($fmt, $records{$n}->{$fid}, $START . $v . $stop);
1648              
1649 19561 100       39790 if (!$is_ok) {
1650 7789         10262 my $give_up_time = 0;
1651 7789 100 66     97784 if ($records{$n}->{$fid}->count_ko == 0 and
      100        
1652             $records{$n}->{$fid}->has_searched_time and
1653             $records{$n}->{$fid}->has_found_time) {
1654 77   100     4020 $give_up_time = (defined($fmt->parser) and
1655             defined($fmt->parser->parse_datetime($START . $v . $stop))
1656             ?
1657             1 : 0);
1658 77 100       40222 if ($give_up_time) {
1659 4         74 $records{$n}->{$fid}->has_found_time(0);
1660 4         28 $is_ok = 1;
1661             }
1662             }
1663             }
1664              
1665 19561 100 100     213960 if ($is_ok or !$ignore_trailing_chars) {
1666 14159   100     185995 my $incr = (defined($fmt->parser) and $is_ok ? 1: 0);
1667              
1668 14159 100       281680 unless ($records{$n}->{$fid}->has_searched_time) {
1669 3156         53415 $records{$n}->{$fid}->has_searched_time(1);
1670              
1671             croak "Inconsistent status! Issue in module code not in caller's!"
1672 3156 50       50360 if $records{$n}->{$fid}->count_ok != 0;
1673              
1674 3156 100       21664 if ($search_time) {
    100          
1675              
1676 1980 50       3617 print($_debugh " Search time in '$v', format '$fstr'\n")
1677             if $debug_fmt;
1678              
1679 1980         24457 my $t = $self->_guess_time_format($fstr, $fmt->locale, $v, $stop);
1680 1980 100       42084 $records{$n}->{$fid}->has_found_time((defined($t) ? 1 : 0));
1681 1980 100       16509 if (defined($t)) {
    100          
1682 254         3868 $records{$n}->{$fid}->format_with_addition_of_time($t->[0]);
1683 254         4761 $records{$n}->{$fid}->locale_with_addition_of_time($t->[1]);
1684 254         4595 $records{$n}->{$fid}->parser_with_addition_of_time($t->[2]);
1685 254         1744 $incr = 1;
1686             } elsif ($fstr eq '') {
1687 200         3084 $records{$n}->{$fid}->count_ko(1);
1688             }
1689             } elsif ($fstr eq '') {
1690 78         932 $records{$n}->{$fid}->count_ko(1);
1691             }
1692              
1693             }
1694              
1695 14159         240714 $records{$n}->{$fid}->count_ok($records{$n}->{$fid}->count_ok + $incr);
1696              
1697 14159 100 100     181498 $records{$n}->{$fid}->count_ko($records{$n}->{$fid}->count_ko + 1)
1698             if !$incr and !$is_ok;
1699              
1700 14159 100       45090 if ($incr) {
1701             # We remove the slave if master is fine.
1702             # Depending on the order in which parsing got done, the master could
1703             # pop up first, or the slave, that is why we need manage both cases.
1704 9161 100 100     115808 if ($fmt->index_slave >= 0 or $fmt->index_master >= 0) {
1705 6843 100       202853 my $has_slave = ($fmt->index_slave >= 0 ? 1 : 0);
1706 6843 100       119149 my $idx = ($has_slave ? $fmt->index_slave : $fmt->index_master);
1707 6843         114981 my $mate = $formats[$idx]->id;
1708 6843 100       50518 if (exists $records{$n}->{$mate}) {
1709 6689 100       12247 if ($has_slave) {
1710 2239 100       30531 if ($records{$n}->{$mate}->count_ko == 0) {
1711             # I am the master: I remove the slave
1712 2         53 $records{$n}->{$mate}->count_ko(1);
1713             }
1714             } else {
1715 4450 50 66     56764 if ($records{$n}->{$mate}->count_ko == 0 and
      66        
1716             $records{$n}->{$mate}->count_ok >= 1 and
1717             $records{$n}->{$fid}->count_ko == 0) {
1718 117         6413 $records{$n}->{$fid}->count_ko(1);
1719             }
1720             }
1721             }
1722             }
1723             }
1724              
1725             } else {
1726 5402         69351 $records{$n}->{$fid}->count_ko($records{$n}->{$fid}->count_ko + 1);
1727             }
1728             }
1729              
1730 158464 50       1103496 $self->_debug_output_fmt(' post', $fmt, $records{$n}->{$fid}) if $debug_fmt;
1731              
1732             }
1733             }
1734              
1735 4650         7265 $count_gotit = 0;
1736 4650         6724 $count_ambiguous = 0;
1737 4650         6590 $count_empty = 0;
1738 4650         9138 for my $n (@fields_to_detect_format) {
1739 31445 100       53472 next if $cache_nodate[$n];
1740              
1741 16908         21910 my $candidate = 0;
1742 16908         21516 my $tt = 0;
1743 16908         24498 for my $fmt (@formats) {
1744 259112         3130002 my $fid = $fmt->id;
1745 259112         1561608 my $rec = $records{$n}->{$fid};
1746 259112 100       410964 next unless defined($rec);
1747              
1748 200290         2343828 my $ok = $rec->count_ok;
1749 200290         3145687 my $ko = $rec->count_ko;
1750              
1751 200290 50 66     1352272 confess "Oups. Check this module code urgently!" if $ok == 0 and $ko == 0;
1752 200290         239261 $tt += $ok + $ko;
1753              
1754 200290 100 100     407111 $candidate++ if $ok >= 1 and $ko == 0;
1755             }
1756 16908 100       31488 if ($candidate == 1) {
    100          
    100          
1757 9602         13880 $count_gotit++;
1758             } elsif ($candidate >= 2) {
1759 2269         3620 $count_ambiguous++;
1760             } elsif ($tt != 0) {
1761 230         282 $count_nodate++;
1762 230         423 $cache_nodate[$n] = 1;
1763             } else {
1764 4807         6511 $count_empty++;
1765             }
1766             }
1767              
1768 4650 50       8491 if ($debug_fmt) {
1769 0         0 print($_debugh "\$count_gotit = $count_gotit\n");
1770 0         0 print($_debugh "\$count_ambiguous = $count_ambiguous\n");
1771 0         0 print($_debugh "\$count_nodate = $count_nodate\n");
1772 0         0 print($_debugh "\$count_empty = $count_empty\n");
1773             }
1774              
1775 4650         7169 my $can_start_recording_data = 0;
1776 4650 100 100     23340 $can_start_recording_data = 1
      100        
1777             if $count_gotit + $count_ambiguous + $count_nodate >= 1 and
1778             !$count_ambiguous and !$count_empty;
1779              
1780 4650 100 100     27375 if ($can_start_recording_data and !$has_signaled_can_start_recording_data) {
1781 31         60 $has_signaled_can_start_recording_data = 1;
1782              
1783 31 50       85 print($_debugh "Can start recording (all dates formats detection closed) " .
1784             "after record #$record_number\n") if $_debug;
1785              
1786 31         79 $self->{_line_after_which_recording_can_start} = $record_number;
1787 31 100       113 last unless $self->{fields_dates_auto};
1788 24 100       196 last if $self->{fields_dates_auto_optimize};
1789             }
1790             }
1791              
1792 53         3318 close $inh;
1793              
1794 53         156 my %dates_detailed_status;
1795             my @dates_formats;
1796 53         109 my $check_empty = 0;
1797 53         86 my $check_nodate = 0;
1798 53         85 my $check_ambiguous = 0;
1799 53         111 my $check_gotit = 0;
1800 53         128 for my $n (@fields_to_detect_format) {
1801 532         662 my @formats_ok;
1802 532         650 my $tt = 0;
1803 532         626 for my $fid (sort keys %{$records{$n}}) {
  532         4453  
1804 8465         73766 my $rec = $records{$n}->{$fid};
1805 8465 100 100     99187 if ($rec->count_ok >= 1 and $rec->count_ko == 0) {
1806              
1807 298         9864 my ($fstr, $floc) = ($rec->format, $rec->locale);
1808 298 100       6337 ($fstr, $floc) = (
1809             $rec->format_with_addition_of_time,
1810             $rec->locale_with_addition_of_time
1811             ) if $rec->has_found_time;
1812              
1813 298         5808 push @formats_ok, [$fstr, $floc];
1814             }
1815 8465         139405 $tt += $rec->count_ok + $rec->count_ko;
1816             }
1817 532         5100 my $is_ok = 0;
1818 532         631 my $format;
1819 532         820 my $locale = '';
1820 532 100 100     1961 if ($#formats_ok < 0 and $tt == 0) {
    100          
    100          
1821 16         29 $format = "Z";
1822 16         28 $check_empty++;
1823             } elsif ($#formats_ok < 0) {
1824 230         307 $format = "N";
1825 230         289 $check_nodate++;
1826             } elsif ($#formats_ok > 0) {
1827 12         21 $format = "A";
1828 12         19 $check_ambiguous++;
1829             } else {
1830 274         386 $is_ok = 1;
1831 274         506 $format = $formats_ok[0]->[0];
1832 274         376 $locale = $formats_ok[0]->[1];
1833 274         384 $check_gotit++;
1834             }
1835 532         1111 my $col = $self->_col_dispname($n);
1836              
1837 532 50       1589 $dates_detailed_status{$col} = $format unless exists $dates_detailed_status{$col};
1838 532 100 66     2036 $dates_formats[$n] = [ $format, $locale ] if $is_ok and !defined($dates_formats[$n]);
1839             }
1840 53         185 $dates_detailed_status{'.'} = $self->{_line_after_which_recording_can_start};
1841              
1842 53 50 66     479 if ($check_empty != $count_empty or $check_nodate != $count_nodate or
      66        
      33        
1843             $check_ambiguous != $count_ambiguous or $check_gotit != $count_gotit) {
1844             # The below condition can happen with an empty CSV (empty file (no header) or
1845             # only a header line).
1846 1 50 33     37 unless (!$count_empty and !$check_nodate and !$count_nodate and
      33        
      33        
      33        
      33        
      33        
1847             !$check_ambiguous and !$count_ambiguous and !$check_gotit and !$count_gotit) {
1848 0         0 print(STDERR "\$check_empty = $check_empty\n");
1849 0         0 print(STDERR "\$count_empty = $count_empty\n");
1850 0         0 print(STDERR "\$check_nodate = $check_nodate\n");
1851 0         0 print(STDERR "\$count_nodate = $count_nodate\n");
1852 0         0 print(STDERR "\$check_ambiguous = $check_ambiguous\n");
1853 0         0 print(STDERR "\$count_ambiguous = $count_ambiguous\n");
1854 0         0 print(STDERR "\$check_gotit = $check_gotit\n");
1855 0         0 print(STDERR "\$count_gotit = $count_gotit\n");
1856 0         0 confess "Oups! Check immediately this module code, man!";
1857             }
1858             }
1859              
1860 53 50       173 if ($debug_fmt) {
1861             # A very detailed debug output
1862 0         0 for my $n (@fields_to_detect_format) {
1863 0         0 my $col = $self->_col_dispname($n);
1864 0         0 print($_debugh "$col\n");
1865 0         0 printf($_debugh " %-25s %3s %3s\n", "format", "OK", "KO");
1866 0         0 for my $fid (sort keys %{$records{$n}}) {
  0         0  
1867 0         0 my $rec = $records{$n}->{$fid};
1868 0         0 my $cc = '';
1869 0 0 0     0 $cc = "(" . $rec->locale . ")" if defined($rec->locale) and $rec->locale ne '';
1870 0         0 printf($_debugh " %-25s %3d %3d\n",
1871             $rec->format . $cc, $rec->count_ok, $rec->count_ko);
1872             }
1873             }
1874             }
1875             # Not a typo - displaying it IN ADDITION to debug output above is done on purpose...
1876 53 50       141 if ($_debug) {
1877             # A shorter (as compared to above) output of outcome of DateTime detection
1878 0         0 print($_debugh "Result of DateTime detection:\n");
1879 0         0 printf($_debugh "%-3s %-25s %-30s %s\n", '###', 'FIELD', 'DATETIME FORMAT',
1880             'DATETIME LOCALE');
1881 0         0 for my $n (@fields_to_detect_format) {
1882 0         0 my ($fmt, $loc) = ('<undef>', '<undef>');
1883 0 0       0 if (defined($dates_formats[$n])) {
1884 0         0 ($fmt, $loc) = @{$dates_formats[$n]}[0, 1];
  0         0  
1885             }
1886 0         0 printf($_debugh "%03d %-25s %-30s %s\n", $n, $self->_col_dispname($n), $fmt, $loc);
1887             }
1888             }
1889              
1890 53 100       177 if (!$self->{fields_dates_auto}) {
1891 10         21 my $e = 0;
1892 10         25 for my $n (@fields_to_detect_format) {
1893 20 100       50 next if defined($dates_formats[$n]);
1894 7         19 $self->_print_error("unable to detect DateTime format of field '" .
1895             $self->_col_dispname($n) . "'", 1);
1896 7         226 $e++;
1897             }
1898 10 100       38 $self->_print_error("$e field(s) encountered with unknown DateTime format") if $e;
1899             }
1900              
1901 51         629 $self->{_dates_detailed_status} = { %dates_detailed_status };
1902 51         71913 $self->{_dates_formats} = [ @dates_formats ];
1903             }
1904              
1905             sub _debug_output_fmt {
1906 0     0   0 my ($self, $prefix, $fmt, $rec) = @_;
1907              
1908 0         0 my $_debugh = $self->{_debugh};
1909              
1910 0         0 my ($fstr, $floc) = ($fmt->format, $fmt->locale);
1911 0 0 0     0 ($fstr, $floc) = (
1912             '<+T>' . $rec->format_with_addition_of_time,
1913             $rec->locale_with_addition_of_time
1914             ) if defined($rec) and $rec->has_found_time;
1915              
1916 0         0 my $locstr = '';
1917 0 0 0     0 $locstr = "(" . $floc . ")" if defined($floc) and $floc ne '';
1918              
1919 0 0       0 my $tmpok = $rec->count_ok if defined($rec);
1920 0 0       0 $tmpok = '<undef>' unless defined($tmpok);
1921 0 0       0 my $tmpko = $rec->count_ko if defined($rec);
1922 0 0       0 $tmpko = '<undef>' unless defined($tmpko);
1923              
1924 0         0 print($_debugh "$prefix (format '$fstr$locstr': OK = $tmpok, KO = $tmpko)\n");
1925             }
1926              
1927             # When no parse can be done (parser to test is undef), return 1
1928             sub _try_parser {
1929 19561     19561   36197 my ($fmt, $rec, $value_to_parse) = @_;
1930              
1931 19561         251597 my $parser = $fmt->parser;
1932 19561 100       329188 $parser = $rec->parser_with_addition_of_time if $rec->has_found_time;
1933              
1934 19561         199756 my $is_ok = 1;
1935 19561 100       63674 $is_ok = (defined($parser->parse_datetime($value_to_parse)) ? 1 : 0) if $parser;
    100          
1936              
1937 19561 100       6762982 unless (defined($rec->format)) {
1938 8465         145360 $rec->format($fmt->format);
1939 8465         255757 $rec->locale($fmt->locale);
1940             }
1941              
1942 19561         268220 return $is_ok;
1943             }
1944              
1945             sub _guess_time_format {
1946              
1947             # IMPORTANT
1948             # Formats are tested in the order of the list below, and the first one that succeeds stops the
1949             # tests.
1950             # That makes the order of the elements important: %R would match any value that'd also match
1951             # %T, that'd cause to return %R whereas %T would be possible. Same with AM/PM formats. Thus
1952             # the longest patterns appear first.
1953 1980     1980   15671 my @T = (
1954             '%I:%M:%S %p',
1955             '%I:%M %p',
1956             '%I:%M:%S%p',
1957             '%I:%M%p',
1958             '%T',
1959             '%R'
1960             );
1961              
1962 1980         4508 my ($self, $format, $locale, $v, $stop) = @_;
1963              
1964 1980         3199 my $_debugh = $self->{_debugh};
1965 1980   33     4426 my $debug_fmt = ($self->{_debug} and $DEBUG_DATETIME_FORMATS);
1966              
1967 1980 100       4052 return undef if $format =~ /:/;
1968              
1969 1976         2690 my $sep;
1970 1976 100       3837 if ($format eq '') {
1971 325         639 $sep = '';
1972             } else {
1973 1651 100       9128 unless ((undef, $sep) = $v =~ /(^|\d([^0-9:]+))(\d{1,2}):(\d{1,2})(\D|$)/) {
1974 393 100       852 if ($v =~ /\d{4}:\d{2}(\D|$)/) {
1975 29         52 $sep = '';
1976             } else {
1977              
1978 364 50       644 print($_debugh "_guess_time_format(): separator candidate not found in '$v'\n")
1979             if $debug_fmt;
1980              
1981 364         861 return undef;
1982             }
1983             }
1984             }
1985 1612 100       4326 $sep = '' unless defined($sep);
1986              
1987             #
1988             # IMPORTANT
1989             #
1990             # The code below allows to successfully detect DateTime format when
1991             # the first lines contain things like:
1992             # Jan 20 2017 2:00AM
1993             # that could lead to a separator set to ' ' while actually it should be ' '. In this case
1994             # if the double-space is kept, then a later value of
1995             # Jan 20 2017 10:00AM
1996             # won't be parsed.
1997             #
1998             # See t/11-bugfix.t, BUG 5, for an explanation of why the line below.
1999             #
2000              
2001             # More generic code, but will also break some separators like ' ' (4 spaces)
2002             # $sep = substr($sep, 0, length($sep) - 1) if length($sep) >= 2 and substr($sep, -2) eq ' ';
2003 1612 100       3365 $sep = ' ' if $sep eq ' ';
2004              
2005 1612 50       2884 if ($debug_fmt) {
2006 0         0 print($_debugh " _guess_time_format(): Searching time in '$v'\n");
2007             }
2008              
2009 1612         2649 for my $t (@T) {
2010 9193         527128 my $increased_format = "$format$sep$t";
2011              
2012 9193 50       19654 print($_debugh " _guess_time_format(): Trying format '$increased_format'\n") if $debug_fmt;
2013              
2014 9193         26155 my %opts = (pattern => $START . $increased_format . $stop);
2015 9193 100 66     33050 $opts{locale} = $locale if defined($locale) and $locale ne '';
2016 9193         32325 my $parser_of_increased_format = DateTime::Format::Strptime->new(%opts);
2017 9193 100       10184123 next unless defined($parser_of_increased_format->parse_datetime($START . $v . $stop));
2018              
2019 254 50       151930 if ($debug_fmt) {
2020 0         0 print($_debugh " _guess_time_format(): found time in '$v'\n");
2021 0         0 print($_debugh " Initial format: '$format'\n");
2022 0         0 print($_debugh " Increased format: '$increased_format'\n");
2023             }
2024              
2025 254         1391 return [$increased_format, $locale, $parser_of_increased_format];
2026             }
2027 1358         98908 return undef;
2028             }
2029              
2030              
2031             # * ********************************* *
2032             # * END OF DATE FORMAT DETECTION CODE *
2033             # * ********************************* *
2034              
2035              
2036             # Take the string of a header in $_ and replace it with the corresponding field name
2037             sub _header_to_field_name {
2038 1265     1265   1978 $_ = remove_accents($_);
2039 1265         2436 s/[^[:alnum:]_]//gi;
2040 1265         6799 s/^.*$/\U$&/;
2041             }
2042              
2043             # Return 0 if error, 1 if all good
2044             sub _S2_init_fields_from_header {
2045 355     355   602 my $self = shift;
2046              
2047 355         645 my $has_headers = $self->{has_headers};
2048 355         582 my $_debug = $self->{_debug};
2049 355         553 my $_debugh = $self->{_debugh};
2050              
2051 355         828 my $in_file_disp = $self->get_in_file_disp();
2052              
2053 355         641 my $inh = $self->{_inh};
2054 355         557 my $incsv = $self->{_in_csvobj};
2055              
2056 355         685 $self->{_row_read} = 0;
2057              
2058 355         648 my @columns;
2059             my @headers;
2060 355 100       763 if ($has_headers) {
2061              
2062             print($_debugh "$PKG: '$in_file_disp': will parse header line to get column names\n")
2063 344 50       812 if $self->{_debug_read};
2064              
2065 344         579 $self->{_row_read}++;
2066              
2067             print($_debugh "$PKG: '$in_file_disp': will read line #" . $self->{_row_read} . "\n")
2068 344 50       753 if $self->{_debug_read};
2069              
2070 344 50       768 if (defined($self->{_inh_header})) {
2071 0         0 my $l = $self->{_inh_header};
2072 0         0 my $inmemh;
2073 0 0       0 if (!open ($inmemh, '<', \$l)) {
2074 0         0 $self->_print_error("can't open header line in-memory. CSV read aborted.");
2075 0         0 return 0;
2076             }
2077 0         0 @headers = @{_mygetline($incsv, $inmemh)};
  0         0  
2078             } else {
2079 344         689 my $r = _mygetline($incsv, $inh);
2080 342 50       16073 @headers = @{$r} if defined($r);
  342         1071  
2081             }
2082             print($_debugh "Line " . $self->{_row_read} . ":\n--\n" . join('::', @headers) . "\n--\n")
2083 342 50       1114 if $self->{_debug_read};
2084             }
2085              
2086 353 100 100     1547 if ($has_headers and !defined($self->{fields_column_names})) {
2087 336         525 my %indexes;
2088 336 100       812 if (defined($self->{fields_hr})) {
2089 12 50       87 if (!$self->_process_header(\@headers, $self->{fields_hr}, \%indexes)) {
2090 0         0 $self->_print_error("missing headers. CSV read aborted.");
2091 0         0 return 0;
2092             }
2093 12 50       44 if ($_debug) {
2094 0         0 print($_debugh " \%indexes:\n");
2095 0         0 for my $k (sort keys %indexes) {
2096 0         0 print($_debugh " \t$k => $indexes{$k}\n");
2097             }
2098             }
2099 12         80 for (sort keys %indexes) {
2100 48 50       84 next if $_ eq '';
2101 48         90 $columns[$indexes{$_}] = $_;
2102             }
2103             } else {
2104 324         764 @columns = @headers;
2105 324         617 map { _header_to_field_name } @columns;
  1265         7496  
2106             }
2107             }
2108              
2109 353 100       1064 @columns = @{$self->{fields_column_names}} if defined($self->{fields_column_names});
  14         41  
2110              
2111             # Avoid undef in column names... I prefer empty strings
2112 353 100       702 @columns = map { defined($_) ? $_ : '' } @columns;
  1389         3105  
2113              
2114 353 50       828 if ($_debug) {
2115 0         0 print($_debugh "-- CSV headers management\n");
2116 0 0       0 if (@columns) {
2117 0         0 printf($_debugh " %-3s %-40s %-40s\n", 'COL', 'CSV Header', 'Hash Key');
2118 0         0 for my $i (0..$#columns) {
2119 0         0 my $h = '';
2120 0 0       0 $h = $headers[$i] if defined($headers[$i]);
2121 0         0 printf($_debugh " %03d %-40s %-40s\n", $i, "'$h'", "'$columns[$i]'");
2122             }
2123             } else {
2124 0         0 print($_debugh " No headers\n");
2125             }
2126             }
2127              
2128 353         548 my %regular_named_fields;
2129 353         953 for my $i (0..$#columns) {
2130 1389 100 66     5084 $regular_named_fields{$columns[$i]} = $i if defined($columns[$i]) and $columns[$i] ne '';
2131             }
2132 353         1716 $self->{_regular_named_fields} = { %regular_named_fields };
2133 353         1253 $self->{_S2_columns} = [ @columns ];
2134 353 100       1245 $self->{_S2_headers} = [ @headers ] if $has_headers;
2135              
2136 353         1368 return 1;
2137             }
2138              
2139             sub out_header {
2140 8     8 1 11 my $self = shift;
2141 8         73 validate_pos(@_, {type => SCALAR}, {type => SCALAR});
2142              
2143 8         21 my ($field, $header) = @_;
2144 8 100       20 $self->{_out_headers} = { } unless exists $self->{_out_headers};
2145              
2146             $self->_print_warning("out_header: field $field already set")
2147 8 50       14 if exists $self->{_out_headers}->{$field};
2148              
2149 8         15 $self->{_out_headers}->{$field} = $header;
2150              
2151 8         24 return $self;
2152             }
2153              
2154             # Return 0 if error, 1 if all good
2155             sub _S3_init_fields_extra {
2156 355     355   572 my $self = shift;
2157              
2158 355         659 my $_debug = $self->{_debug};
2159 355         579 my $_debugh = $self->{_debugh};
2160              
2161 355         547 my $verbose = $self->{verbose};
2162              
2163 355         532 my $has_headers = $self->{has_headers};
2164              
2165 355         520 my %named_fields = %{$self->{_regular_named_fields}};
  355         1597  
2166 355         666 my @columns = @{$self->{_S2_columns}};
  355         949  
2167 355 100       826 my @headers = @{$self->{_S2_headers}} if $has_headers;
  344         860  
2168              
2169 355         514 my @extra_fields_indexes;
2170 355 100       773 my @extra_fields_definitions_list = @{$self->{_extra_fields}} if exists $self->{_extra_fields};
  36         72  
2171 355         505 my %extra_fields_definitions;
2172              
2173 355 100       839 my @multiline = @{$self->{_multiline}} if defined($self->{_multiline});
  340         664  
2174              
2175 355         487 my @coldata;
2176 355         824 for my $i (0..$#columns) {
2177 1395         45332 my $col = $columns[$i];
2178 1395 100       2682 my $h = $headers[$i] if $has_headers;
2179 1395 100       23430 push @coldata, ColData->new(
2180             field_name => $col,
2181             header_text => $h,
2182             description => '',
2183             multiline => ($multiline[$i] ? 'm' : '1')
2184             );
2185             }
2186              
2187 355         15896 for my $edef (@extra_fields_definitions_list) {
2188 92         4275 my $c = $edef->check_field_existence;
2189 92 100       668 if (defined($c)) {
2190 80 100       174 unless (exists $named_fields{$c}) {
2191 9         121 $self->_print_error("unknown field '" . $edef->check_field_existence . "'",
2192             0, ERR_UNKNOWN_FIELD, { %named_fields } );
2193 8         23 next;
2194             }
2195             }
2196              
2197 83         155 my @e_eclated = $edef;
2198              
2199 83 100 100     1102 if ($edef->ef_type == $EF_LINK and $edef->link_remote_read eq '*') {
2200 2         76 my @cols = $edef->link_remote_obj->get_fields_names();
2201              
2202 2         3 @e_eclated = ();
2203 2         9 my %nf = %named_fields;
2204              
2205 2         5 for my $c (@cols) {
2206              
2207 4         130 my $ex_base = $edef->self_name . $c;
2208 4         35 my $ex_target = $ex_base;
2209 4         8 my $i = 1;
2210 4         15 while (exists $nf{$ex_target}) {
2211 1         2 $i++;
2212 1         4 $ex_target = $ex_base . '_' . $i;
2213             }
2214              
2215 4         60 my $e = ExtraField->new(
2216             ef_type => $EF_LINK,
2217             self_name => $ex_target,
2218             description => $edef->description . " ($c)",
2219              
2220             link_self_search => $edef->link_self_search,
2221             link_remote_obj => $edef->link_remote_obj,
2222             link_remote_search => $edef->link_remote_search,
2223             link_remote_read => $c,
2224              
2225             link_vlookup_opts => $edef->link_vlookup_opts
2226             );
2227 4         361 push @e_eclated, $e;
2228 4         14 $nf{$ex_target} = undef;
2229             }
2230             }
2231              
2232 83         1348 for my $e1 (@e_eclated) {
2233 85 100       1249 if (exists $named_fields{$e1->self_name}) {
2234 6         101 $self->_print_error("extra field: duplicate field name: '" . $e1->self_name . "'");
2235 6         15 next;
2236             }
2237              
2238 79         599 my $index_of_new_element = $#columns + 1;
2239 79         128 push @extra_fields_indexes, $index_of_new_element;
2240 79         1017 $columns[$index_of_new_element] = $e1->self_name;
2241 79         1335 $named_fields{$e1->self_name} = $index_of_new_element;
2242 79         1398 $extra_fields_definitions{$e1->self_name} = $e1;
2243              
2244 79 50       1403 push @headers, $e1->self_name if $has_headers;
2245 79         1328 push @coldata, ColData->new(
2246             field_name => $e1->self_name,
2247             header_text => $e1->self_name,
2248             description => $e1->description,
2249             multiline => '?'
2250             );
2251             }
2252              
2253             }
2254 354 100       3365 $self->{_headers} = [ @headers ] if $has_headers;
2255 354         792 $self->{_extra_fields_indexes} = [ @extra_fields_indexes ];
2256 354         1079 $self->{_columns} = [ @columns ];
2257 354         933 $self->{_extra_fields_definitions} = { %extra_fields_definitions };
2258              
2259 354         1631 $self->{_named_fields} = { %named_fields };
2260              
2261 354         1338 $self->_detect_dates_formats();
2262              
2263 351         5487 $self->{_read_update_after_ar} = [ ];
2264 351         3290 $self->{_write_update_before_ar} = [ ];
2265 351 100       918 my @dates_formats = @{$self->{_dates_formats}} if defined($self->{_dates_formats});
  60         253  
2266 351         1019 for my $i (0..$#columns) {
2267 1455         2163 my $dt_format;
2268             my $dt_locale;
2269 1455 100       2735 if (defined($dates_formats[$i])) {
2270 330         663 $dt_format = $dates_formats[$i]->[0];
2271 330         654 $dt_locale = $dates_formats[$i]->[1];
2272             }
2273 1455         23224 $coldata[$i]->dt_format($dt_format);
2274 1455         26390 $coldata[$i]->dt_locale($dt_locale);
2275              
2276 1455 100       9824 next unless defined($dt_format);
2277              
2278 330         478 my %opts_in;
2279 330 100 66     1369 $opts_in{locale} = $dt_locale if defined($dt_locale) and $dt_locale ne '';
2280              
2281 330         1369 my $obj_strptime_in = DateTime::Format::Strptime->new(pattern => $dt_format, %opts_in);
2282              
2283 330         361097 my %opts_out;
2284 330 50       960 my $loc_out = (exists $self->{out_dates_locale} ? $self->{out_dates_locale} : $dt_locale);
2285 330 100 66     1625 $opts_out{locale} = $loc_out if defined($loc_out) and $loc_out ne '';
2286             my $obj_strptime_out = DateTime::Format::Strptime->new(
2287 330 50       1326 pattern => (exists $self->{out_dates_format} ? $self->{out_dates_format} :$dt_format),
2288             %opts_out
2289             );
2290              
2291 330         339767 my $refsub_is_datetime_empty = $self->{_refsub_is_datetime_empty};
2292 330         1111 my $in_file_disp = $self->get_in_file_disp();
2293              
2294             $self->{_read_update_after_ar}->[$i] = sub {
2295 352 100 66 352   1818 return undef if !defined($_) or $_ eq '' or
      33        
      66        
2296             (defined($refsub_is_datetime_empty) and $refsub_is_datetime_empty->($_));
2297              
2298 315         552 my $s = $_[0];
2299 315         660 my $field = _get_def($_[1], '<?>');
2300              
2301 315         937 my $dt = $obj_strptime_in->parse_datetime($_);
2302              
2303 315 0 33     175487 if ($_debug and $DEBUG_DATETIME_FORMATS and $DEBUG_DATETIME_FORMATS_EVEN_MORE) {
      33        
2304 0 0       0 print($_debugh "-- Record " . $s->get_recnum() .
2305             ", field '$field':\n String parsed: '$_'\n Parse format: '$dt_format'\n" .
2306             " DateTime obj: <" . (defined($dt) ? $dt . '' : 'undef') . ">\n");
2307             }
2308              
2309 315 100       706 if (!defined($dt)) {
2310 2         10 my $recnum = $s->get_recnum();
2311 2 50       7 if ($verbose) {
2312 0         0 $s->_print("$PKG: " .
2313             "$in_file_disp: record $recnum: field $field: unable to parse DateTime\n");
2314 0         0 $s->_print("$PKG: field: '$_'\n");
2315 0         0 $s->_print("$PKG: format: '$dt_format'\n");
2316 0 0       0 $s->_print("$PKG: " .
2317             "locale: '" . ($dt_locale eq '' ? '<none>' : $dt_locale) . "'\n");
2318 0         0 $s->_print("$PKG: " .
2319             "Probable cause: when detecting DateTime format, $PKG will stop reading\n");
2320 0         0 $s->_print("$PKG: " .
2321             "input as soon as the format is worked out. If a value found later\n");
2322 0         0 $s->_print("$PKG: " .
2323             "turns out to use another DateTime format, it'll generate a DateTime\n");
2324 0         0 $s->_print("$PKG: parse error, as is the case now.\n");
2325 0         0 $s->_print_error("unable to parse DateTime");
2326             } else {
2327 2         20 $s->_print_error("$in_file_disp: record $recnum: field $field: " .
2328             "unable to parse DateTime '$_'");
2329             }
2330             }
2331              
2332 314         631 return $dt;
2333 330         2611 };
2334             $self->{_write_update_before_ar}->[$i] = sub {
2335 96 100   96   194 return '' unless defined($_);
2336 87 100       202 return $_ if !ref($_);
2337 81 50       279 return $_ unless $_->isa('DateTime');
2338              
2339 81         236 my $str = $obj_strptime_out->format_datetime($_);
2340              
2341 81 50       17709 if (!defined($str)) {
2342 0         0 my $s = $_[0];
2343 0         0 my $recnum = $s->get_recnum();
2344 0         0 my $field = _get_def($_[1], '<?>');
2345 0         0 $s->_print_error("$in_file_disp: record $recnum: field $field: " .
2346             "unable to print DateTime '$_'")
2347             }
2348              
2349 81         181 return $str;
2350 330         1697 };
2351             }
2352              
2353 351         1366 $self->{_coldata} = [ @coldata ];
2354              
2355 351         1330 my @loop = (
2356             ['_read_update_after_hr', '_read_update_after_ar', 'read post'],
2357             ['_write_update_before_hr', '_write_update_before_ar', 'write pre']
2358             );
2359 351         956 for my $ii (0..$#loop) {
2360 702         1076 my $l = $loop[$ii];
2361              
2362 702         1263 my $ht = $self->{$l->[0]};
2363 702         933 my @subrefs = @{$self->{$l->[1]}};
  702         1435  
2364 702         961 for my $field (keys %{$ht}) {
  702         1453  
2365 70 50       141 unless (exists $named_fields{$field}) {
2366 0         0 $self->_print_error($l->[2] . ": unknown field '$field'",
2367             0, ERR_UNKNOWN_FIELD, { %named_fields } );
2368 0         0 next;
2369             }
2370              
2371 70         123 my $i = $named_fields{$field};
2372              
2373 70         87 my @allsubs;
2374 70         82 push @allsubs, @{$ht->{$field}};
  70         120  
2375 70 100       151 if (defined($subrefs[$i])) {
2376 2 50       6 unshift @allsubs, $subrefs[$i] if $ii == 0;
2377 2 50       6 push @allsubs, $subrefs[$i] if $ii == 1;
2378             }
2379              
2380             my $finalsub = sub {
2381 196     196   308 for my $s (@allsubs) {
2382 262         778 $_ = $s->(@_);
2383             }
2384 189         3877 return $_;
2385 70         216 };
2386 70         145 $subrefs[$i] = $finalsub;
2387              
2388             }
2389 702         1915 $self->{$l->[1]} = [ @subrefs ];
2390             }
2391              
2392 351         1492 my $tmp = _get_def($self->{out_fields}, $self->{write_fields});
2393 351 100       1034 my @wf = @{$tmp} if defined($tmp);
  6         13  
2394 351         570 my $count_field_not_found = 0;
2395 351         683 for (@wf) {
2396 16 100 66     90 next if !defined($_) or $_ eq '' or exists $named_fields{$_};
      100        
2397 3         5 $count_field_not_found++;
2398 3         18 $self->_print_error("out_fields: unknown field '$_'",
2399             1, ERR_UNKNOWN_FIELD, { %named_fields } );
2400             }
2401 351 100       756 if ($count_field_not_found) {
2402 2         5 $self->_print_error("non existent field(s) encountered");
2403 1         5 delete $self->{out_fields};
2404 1         3 delete $self->{write_fields};
2405             }
2406              
2407 350 100       847 my %sh = %{$self->{_out_headers}} if defined($self->{_out_headers});
  4         13  
2408 350         525 $count_field_not_found = 0;
2409 350         1157 for (keys %sh) {
2410 8 100 33     35 next if !defined($_) or $_ eq '' or exists $named_fields{$_};
      66        
2411 2         3 $count_field_not_found++;
2412 2         12 $self->_print_error("out_header: unknown field '$_'",
2413             1, ERR_UNKNOWN_FIELD, { %named_fields } );
2414             }
2415 350 100       697 $self->_print_error("non existent field(s) encountered") if $count_field_not_found;
2416              
2417 349         2306 return 1;
2418             }
2419              
2420             #
2421             # Return 0 if there's no more records (error or eof reached), 1 if a record got read
2422             # successfully.
2423             #
2424             # If return value is 1:
2425             # $$ref_ar and $$ref_hr are set to array ref and hash ref of the record, respectively
2426             #
2427             # If return value is 0:
2428             # $$ref_ar and $$ref_hr are set to undef if an error occured
2429             # $$ref_ar and $$ref_hr are set to a scalar if eof reached
2430             #
2431             sub _read_one_record_from_input {
2432 2341     2341   4158 my ($self, $ref_ar, $ref_row_hr) = @_;
2433              
2434 2341         3236 my $_debug = $self->{_debug};
2435 2341         2856 my $_debug_extra_fields = $self->{_debug_extra_fields};
2436 2341         3166 my $_debugh = $self->{_debugh};
2437              
2438 2341         4125 my $in_file_disp = $self->get_in_file_disp();
2439              
2440 2341         3603 my $incsv = $self->{_in_csvobj};
2441 2341         2762 my $ar;
2442              
2443             print($_debugh "$PKG: '$in_file_disp': will read line #" . ($self->{_row_read} + 1) . "\n")
2444 2341 50       4000 if $self->{_debug_read};
2445              
2446 2341 100       4187 unless ($ar = _mygetline($incsv, $self->{_inh})) {
2447 286 50       10010 if (!$incsv->eof()) {
2448 0         0 my ($code, $str, $pos) = $incsv->error_diag();
2449 0         0 $self->_print_error("$code: $str, record " . $incsv->record_number . ", position $pos");
2450 0         0 $$ref_ar = undef;
2451 0         0 $$ref_row_hr = undef;
2452             } else {
2453 286         1780 $$ref_ar = 1;
2454 286         709 $$ref_row_hr = 1;
2455             }
2456              
2457 286         859 $self->_close_inh();
2458              
2459 286         944 return 0;
2460             }
2461              
2462 2055         59052 $self->{_row_read}++;
2463              
2464 2055         2767 my %named_fields = %{$self->{_named_fields}};
  2055         8734  
2465              
2466 2055 50       4666 if ($self->{_debug_read}) {
2467 0         0 print($_debugh "Line " . $self->{_row_read} . ":\n--\n");
2468 0         0 for (sort keys %named_fields) {
2469 0         0 my $c = _get_def($ar->[$named_fields{$_}], '<undef>');
2470 0         0 print($_debugh " $_ => '" . $c . "'\n");
2471             }
2472             }
2473              
2474 2055         2927 my $columns_ar = $self->{_columns};
2475              
2476 2055         2633 my $no_undef = $self->{no_undef};
2477 2055 100       3244 if ($no_undef) {
2478 30         37 for (0..$#{$columns_ar}) {
  30         71  
2479 324 100       607 $ar->[$_] = '' unless defined($ar->[$_]);
2480             }
2481             }
2482              
2483 2055         2857 my $row_hr = { };
2484             $row_hr->{$_} = $ar->[$self->{_regular_named_fields}->{$_}]
2485 2055         2483 foreach keys %{$self->{_regular_named_fields}};
  2055         13216  
2486              
2487 2055         3891 my $rpost = $self->{_read_update_after_ar};
2488 2055         2541 for my $i (0..$#{$columns_ar}) {
  2055         4135  
2489 10586         12777 my $subref = $rpost->[$i];
2490 10586 100       17217 next unless defined($subref);
2491              
2492 479         594 do {
2493 479         712 my $field = $columns_ar->[$i];
2494 479         809 local $_ = $ar->[$i];
2495 479         930 my $new_val = $subref->($self, $field);
2496 474         878 $ar->[$i] = $new_val;
2497 474 50       1684 $row_hr->{$field} = $new_val if defined($field);
2498             }
2499              
2500             }
2501              
2502 2050         2627 for my $i (@{$self->{_extra_fields_indexes}}) {
  2050         3584  
2503 442         634 my $name = $columns_ar->[$i];
2504 442         692 my $e = $self->{_extra_fields_definitions}->{$name};
2505              
2506 442 50       723 print($_debugh "Extra field: #$i: $name\n") if $_debug_extra_fields;
2507              
2508 442         534 my $value;
2509 442 100       7218 if ($e->ef_type == $EF_LINK) {
    100          
    50          
2510              
2511 238 50       1804 print($_debugh " linked field\n") if $_debug_extra_fields;
2512              
2513 238         3293 my $remobj = $e->link_remote_obj;
2514             $value = $remobj->vlookup(
2515             $e->link_remote_search,
2516 238         4381 $ar->[$named_fields{$e->link_self_search}],
2517             $e->link_remote_read,
2518             $e->link_vlookup_opts
2519             );
2520              
2521             } elsif ($e->ef_type == $EF_FUNC) {
2522              
2523 51 50       1176 print($_debugh " computed field\n") if $_debug_extra_fields;
2524              
2525 51         666 $value = $e->func_sub->($name, $row_hr, $self->{_stats});
2526              
2527             } elsif ($e->ef_type == $EF_COPY) {
2528              
2529 153 50       5733 print($_debugh " copy field\n") if $_debug_extra_fields;
2530              
2531 153         1898 my $input = $row_hr->{$e->copy_source};
2532 153 50 33     1072 $input = '' if !defined($input) and $no_undef;
2533 153 100       1907 if (defined($e->copy_sub)) {
2534 57         395 local $_ = $input;
2535 57         742 $value = $e->copy_sub->();
2536             } else {
2537 96         603 $value = $input;
2538             }
2539              
2540 153 50       970 print($_debugh " in: '$input', out: '$value'\n") if $_debug_extra_fields;
2541              
2542             } else {
2543 0         0 confess "Unknown ef_type '" . $e->ef_type . "', check this module' code urgently!";
2544             }
2545              
2546 440 100 100     1688 $value = '' if !defined($value) and $no_undef;
2547 440         731 $ar->[$i] = $value;
2548 440         780 $row_hr->{$name} = $value;
2549              
2550 440 50       913 print($_debugh " $name => '$value'\n") if $_debug_extra_fields;
2551              
2552             }
2553              
2554 2048 100       3785 if (defined($self->{read_post_update_hr})) {
2555 33         79 $self->{read_post_update_hr}->($row_hr, $self->{_stats}, $self->get_recnum());
2556 33         288 $ar->[$named_fields{$_}] = $row_hr->{$_} foreach keys %named_fields;
2557             }
2558              
2559 2048 100       5593 lock_keys(%$row_hr) if $self->{croak_if_error};
2560              
2561             $self->{walker_ar}->($ar, $self->{_stats}, $self->get_recnum())
2562 2048 100       12845 if defined($self->{walker_ar});
2563             $self->{walker_hr}->($row_hr, $self->{_stats}, $self->get_recnum())
2564 2048 100       4067 if defined($self->{walker_hr});
2565              
2566 2047         3211 $$ref_ar = $ar;
2567 2047         4113 $$ref_row_hr = $row_hr;
2568              
2569 2047         5682 return 1;
2570             }
2571              
2572             sub _open_read {
2573 297     297   468 my $self = shift;
2574              
2575 297         597 my $verbose = $self->{verbose};
2576 297         696 my $in_file_disp = $self->get_in_file_disp();
2577              
2578 297         642 $self->{_stats} = { };
2579 297         574 $self->{_read_in_progress} = 1;
2580              
2581 297 50       692 $self->_print("-- $in_file_disp reading start\n") if $verbose;
2582             }
2583              
2584             sub _close_read {
2585 316     316   483 my $self = shift;
2586 316         491 my $keep_quiet = shift;
2587              
2588 316         567 my $verbose = $self->{verbose};
2589 316         642 my $in_file_disp = $self->get_in_file_disp();
2590              
2591 316         570 $self->{_read_in_progress} = 0;
2592              
2593 316 50 33     778 if ($verbose and !$keep_quiet) {
2594 0         0 $self->_print("-- $in_file_disp reading end: " . $self->{_row_read} . " row(s) read\n");
2595 0         0 for my $k (sort keys %{$self->{_stats}}) {
  0         0  
2596 0         0 $self->_printf(" %7d %s\n", $self->{_stats}->{$k}, $k);
2597             }
2598             }
2599              
2600 316         609 $self->{_nb_rows} = $self->{_row_read};
2601             }
2602              
2603             # Return 0 if error, 1 if all good
2604             sub _S4_read_all_in_mem {
2605 167     167   320 my $self = shift;
2606              
2607 167         495 $self->_register_pass("_S4_read_all_in_mem()");
2608              
2609 167         524 $self->_open_read();
2610              
2611 167         311 my $ar;
2612             my $row_hr;
2613 167         547 while ($self->_read_one_record_from_input(\$ar, \$row_hr)) {
2614              
2615 1553         2045 push @{$self->{_flat}}, $ar;
  1553         4063  
2616              
2617             }
2618              
2619 165 50       473 my $retcode = (defined($ar) ? 1 : 0);
2620 165         515 $self->_update_in_mem_record_count();
2621              
2622 165         557 $self->_close_read();
2623              
2624 165         415 return $retcode;
2625             }
2626              
2627             sub _chain_array {
2628 24     24   146 return split(/\s*->\s*/, $_[0]);
2629             }
2630              
2631             sub _chain_str {
2632 2     2   7 return join('->', @_);
2633             }
2634              
2635             sub field_add_link {
2636 25     25 1 4309 my $self = shift;
2637              
2638 25         852 validate_pos(@_, {type => UNDEF | SCALAR}, {type => SCALAR}, {type => SCALAR | OBJECT},
2639             {type => HASHREF, optional => 1});
2640              
2641 22         92 my ($new_field, $chain, $obj, $param_opts) = @_;
2642              
2643 22         44 my $croak_if_error = $self->{croak_if_error};
2644 22         39 my $_debug = $self->{_debug};
2645 22         35 my $_debugh = $self->{_debugh};
2646              
2647 22         57 my @c = _chain_array($chain);
2648 22 100       60 $new_field = $c[2] unless defined($new_field);
2649              
2650 22 50       51 print($_debugh "Registering new linked field, new_field = '$new_field', chain = '$chain'\n")
2651             if $_debug;
2652              
2653 22 100 66     104 unless (@c == 3 and $c[2] ne '') {
2654 1         8 $self->_print_error("wrong links chain parameter: '$chain', " .
2655             "look for CHAIN in Text::AutoCSV manual for help");
2656 1         12 return undef;
2657             }
2658              
2659 21 50       59 return undef unless $self->_status_forward('S2');
2660 21 50       54 return undef unless $self->_status_backward('S2');
2661              
2662 21 100       52 my @tmp = %{$param_opts} if $param_opts;
  13         37  
2663 21         513 my %opts = validate(@tmp, $SEARCH_VALIDATE_OPTIONS);
2664              
2665 20         75 my $target_name = '';
2666 20 100       55 if (ref $obj eq '') {
2667 19         30 my $in_file = $obj;
2668 19         31 $target_name = $in_file;
2669              
2670             #
2671             # TODO (?)
2672             #
2673             # Take into account the fact that the OS' file system is case insensitive. At the
2674             # moment, two different strings (even if identical in a case insensitive comparison)
2675             # will be managed as being distinct.
2676             # I put a question mark in this TO DO - after all, the user of this module had better
2677             # use same case when dealing with multiple links of the same file.
2678             #
2679             # Also, tuning this module' behavior depending on the OS' characteristics would be not
2680             # ideal, it'd add a level of complexity to understand how it works and what to expect.
2681             #
2682 19 100 100     66 if (exists $self->{_obj} and exists $self->{_obj}->{$in_file}) {
2683              
2684 4 50       110 print(
2685             $_debugh
2686             "field_add_link: file '$in_file': re-using existing Text::AutoCSV object\n"
2687             ) if $_debug;
2688              
2689 4         15 $obj = $self->{_obj}->{$in_file};
2690             } else {
2691              
2692 15 50       36 print($_debugh "field_add_link: file '$in_file': creating new Text::AutoCSV object\n")
2693             if $_debug;
2694              
2695 15 100       58 $self->{_obj} = { } unless exists $self->{_obj};
2696              
2697             #
2698             # The created Text::AutoCSV must be created with the same search options as what is
2699             # currently found in $self.
2700             #
2701             # Why?
2702             # Because the link is populated doing a vlookup on the remote object ($obj below),
2703             # not on $self. Therefore, if we don't "propagate" search options from $self to
2704             # $obj, search tunnings won't work as user would expect.
2705             #
2706 15         24 my %search_opts;
2707 15         40 for (qw(search_case search_trim search_ignore_empty search_ignore_accents
2708             search_value_if_not_found search_value_if_ambiguous search_ignore_ambiguous)) {
2709             # We assign depending on whether or not the attribute EXISTS - the definedness
2710             # is not appropriate, in case an attribute would have been assigned to undef.
2711 105 100       189 $search_opts{$_} = $self->{$_} if exists $self->{$_};
2712             }
2713              
2714             $obj = Text::AutoCSV->new(
2715             in_file => $in_file,
2716             verbose => $self->{verbose},
2717             infoh => $self->{infoh},
2718             _debug => $self->{debug},
2719             _debugh => $self->{debugh},
2720 15         103 %search_opts
2721             );
2722 15         61 $self->{_obj}->{$in_file} = $obj;
2723             }
2724             } else {
2725 1         2 $target_name = '(object)';
2726 1 50       4 print($_debugh "field_add_link: Text::AutoCSV object provided\n") if $_debug;
2727             }
2728              
2729 20 100       64 $self->{_extra_fields} = [ ] unless exists $self->{_extra_fields};
2730              
2731 20         35 push @{$self->{_extra_fields}}, ExtraField->new(
  20         516  
2732             ef_type => $EF_LINK,
2733             self_name => $new_field,
2734             description => "link: $target_name, chain: $chain",
2735             check_field_existence => $c[0],
2736              
2737             link_self_search => $c[0],
2738             link_remote_obj => $obj,
2739             link_remote_search => $c[1],
2740             link_remote_read => $c[2],
2741              
2742             link_vlookup_opts => \%opts
2743             );
2744              
2745 20         2417 return $self;
2746             }
2747              
2748             sub links {
2749 2     2 1 5 my $self = shift;
2750              
2751 2         31 validate_pos(@_, {type => UNDEF | SCALAR}, {type => SCALAR}, {type => SCALAR | OBJECT},
2752             {type => HASHREF, optional => 1});
2753              
2754 2         7 my $prefix_field = shift;
2755 2         4 my $chain = shift;
2756 2         5 my ($obj, $param_opts) = @_;
2757              
2758 2         5 my @c = _chain_array($chain);
2759              
2760 2 50 33     17 if (@c != 2 or $c[0] eq '' or $c[1] eq '') {
      33        
2761 0         0 $self->_print_error("wrong links chain parameter: '$chain', " .
2762             "look for JOINCHAIN in Text::AutoCSV manual for help");
2763 0         0 return undef;
2764             }
2765              
2766 2 100       6 $prefix_field = '' unless defined($prefix_field);
2767 2         6 my $chain2 = _chain_str(@c, '*');
2768              
2769 2         8 return $self->field_add_link($prefix_field, $chain2, @_);
2770             }
2771              
2772             sub field_add_computed {
2773 7     7 1 2550 my $self = shift;
2774              
2775 7         217 validate_pos(@_, {type => SCALAR}, {type => CODEREF});
2776 6         26 my ($new_field, $func) = @_;
2777              
2778 6         15 my $croak_if_error = $self->{croak_if_error};
2779              
2780 6         12 my $_debug = $self->{_debug};
2781 6         12 my $_debugh = $self->{_debugh};
2782              
2783 6 50       20 print($_debugh "Registering new computed field, new_field = '$new_field'\n") if $_debug;
2784              
2785 6 100       19 return undef unless $self->_status_forward('S2');
2786 5 50       13 return undef unless $self->_status_backward('S2');
2787              
2788 5         10 push @{$self->{_extra_fields}}, ExtraField->new(
  5         108  
2789             ef_type => $EF_FUNC,
2790             self_name => $new_field,
2791             description => "computed",
2792              
2793             func_sub => $func
2794             );
2795              
2796 5         464 return $self;
2797             }
2798              
2799             sub field_add_copy {
2800 14     14 1 8764 my $self = shift;
2801              
2802 14         213 validate_pos(@_, {type => SCALAR}, {type => SCALAR}, {type => CODEREF, optional => 1});
2803 14         62 my ($new_field, $copy_source, $func) = @_;
2804              
2805 14         32 my $croak_if_error = $self->{croak_if_error};
2806              
2807 14         26 my $_debug = $self->{_debug};
2808 14         25 my $_debugh = $self->{_debugh};
2809              
2810 14 50       36 print($_debugh "Registering field copy, new_field = '$new_field' copied from '$copy_source'\n")
2811             if $_debug;
2812              
2813 14 100       34 return undef unless $self->_status_forward('S2');
2814 12 50       35 return undef unless $self->_status_backward('S2');
2815              
2816 12 100       22 push @{$self->{_extra_fields}}, ExtraField->new(
  12         305  
2817             ef_type => $EF_COPY,
2818             self_name => $new_field,
2819             description => "copy of $copy_source " . (defined($func) ? '(with sub)' : '(no sub)'),
2820             check_field_existence => $copy_source,
2821              
2822             copy_source => $copy_source,
2823             copy_sub => $func
2824             );
2825              
2826 12         1147 return $self;
2827             }
2828              
2829             sub in_map {
2830 15     15 1 1062 my $self = shift;
2831              
2832 15         39 return $self->read_update_after(@_);
2833             }
2834              
2835             sub read_update_after {
2836 16     16 1 46 my $self = shift;
2837 16         179 validate_pos(@_, {type => SCALAR}, {type => CODEREF});
2838              
2839 16         55 my ($field, $subref) = @_;
2840              
2841 16         32 my $_debug = $self->{_debug};
2842 16         24 my $_debugh = $self->{_debugh};
2843              
2844 16 50       39 return undef unless $self->_status_forward('S2');
2845 16 50       40 return undef unless $self->_status_backward('S2');
2846              
2847 16 50       37 print($_debugh "Registering read_post_update subref for field '$field'\n") if $_debug;
2848              
2849             $self->{_read_update_after_hr}->{$field} = [ ]
2850 16 100       53 unless defined($self->{_read_update_after_hr}->{$field});
2851              
2852 16         21 push @{$self->{_read_update_after_hr}->{$field}}, $subref;
  16         44  
2853              
2854 16         188 return $self;
2855             }
2856              
2857             sub out_map {
2858 11     11 1 24 my $self = shift;
2859              
2860 11         31 return $self->write_update_before(@_);
2861             }
2862              
2863             sub write_update_before {
2864 12     12 1 21 my $self = shift;
2865 12         147 validate_pos(@_, {type => SCALAR}, {type => CODEREF});
2866              
2867 12         68 my ($field, $subref) = @_;
2868              
2869 12         26 my $_debug = $self->{_debug};
2870 12         23 my $_debugh = $self->{_debugh};
2871              
2872 12 50       35 return undef unless $self->_status_forward('S2');
2873 12 50       34 return undef unless $self->_status_backward('S2');
2874              
2875 12 50       36 print($_debugh "Registering write_pre_update subref for field '$field'\n") if $_debug;
2876              
2877             $self->{_write_update_before_hr}->{$field} = [ ]
2878 12 100       42 unless defined($self->{_write_update_before_hr}->{$field});
2879              
2880 12         24 push @{$self->{_write_update_before_hr}->{$field}}, $subref;
  12         31  
2881              
2882 12         98 return $self;
2883             }
2884              
2885             sub reset_next_record_hr {
2886 189     189 1 317 my $self = shift;
2887              
2888 189         869 validate_pos(@_);
2889              
2890 189         507 $self->{_current_record} = undef;
2891              
2892 189         313 return $self;
2893             }
2894              
2895             sub _create_internal_column_name_from_its_number {
2896 1120     1120   2658 return sprintf("__%04i__", $_[0]);
2897             }
2898              
2899             sub _ar_to_hr {
2900 2079     2079   2565 my $self = shift;
2901              
2902 2079         14622 validate_pos(@_, {type => ARRAYREF});
2903              
2904 2079         5186 my ($ar) = @_;
2905 2079         2603 my $last_elem_index = scalar(@{$ar}) - 1;
  2079         3428  
2906              
2907 2079         2998 my $nr = $self->{_named_fields};
2908 2079         2499 my %h;
2909             my %n_seen;
2910 2079         2286 for (keys %{$nr}) {
  2079         5571  
2911 9934         18379 $h{$_} = $ar->[$nr->{$_}];
2912 9934         16214 undef $n_seen{$nr->{$_}};
2913             }
2914 2079         4437 for my $i (0..$last_elem_index) {
2915 11001 100       18595 if (!exists($n_seen{$i})) {
2916 1120         1509 my $k = _create_internal_column_name_from_its_number($i);
2917 1120 50       3161 $h{$k} = $ar->[$i] if !exists $h{$k};
2918             }
2919             }
2920              
2921 2079 100       5685 lock_keys(%h) if $self->{croak_if_error};
2922              
2923 2079         16022 return \%h;
2924             }
2925              
2926             sub get_next_record_hr {
2927 2001     2001 1 2852 my $self = shift;
2928              
2929 2001         13072 validate_pos(@_, {type => SCALARREF, optional => 1});
2930              
2931 2001         4485 my $refkey = $_[0];
2932              
2933 2001 50       3628 return undef unless $self->_status_forward('S4');
2934              
2935 1998 100       3831 if (!defined($self->{_current_record})) {
2936 186         341 $self->{_current_record} = 0;
2937             } else {
2938 1812         2408 $self->{_current_record}++;
2939             }
2940              
2941 1998         3250 my $ar = $self->{_flat}->[$self->{_current_record}];
2942 1998 100       3183 if (!defined($ar)) {
2943 186         310 $self->{_current_record} = undef;
2944 186         285 $$refkey = undef;
2945 186         495 return undef;
2946             }
2947              
2948 1812         2623 $$refkey = $self->{_current_record};
2949              
2950 1812         3202 return $self->_ar_to_hr($ar);
2951             }
2952              
2953             sub read {
2954 60     60 1 19497 my $self = shift;
2955              
2956 60         417 validate_pos(@_);
2957              
2958 60 50       185 return undef unless $self->_status_backward('S3');
2959 60 100       132 return undef unless $self->_status_forward('S3');
2960              
2961 51         175 $self->_register_pass("read()");
2962              
2963 51         172 $self->_open_read();
2964              
2965 51         84 my $ar;
2966             my $row_hr;
2967 51         142 while ($self->_read_one_record_from_input(\$ar, \$row_hr)) {
2968             # Ben oui quoi... qu'est-ce que l'on peut bien faire d'autre ?
2969             }
2970              
2971 49         151 $self->_close_read();
2972 49 50       109 return undef unless defined($ar);
2973              
2974 49 50       127 return undef unless $self->_status_reset();
2975              
2976 49         244 return $self;
2977             }
2978              
2979             #
2980             # Initially, _read_all_in_mem was intended for the test plan.
2981             #
2982             # Turned out to be sometimes useful for user, thus, is no longer private since 1.1.5.
2983             # Private version below is kept for compatibility.
2984             #
2985             sub read_all_in_mem {
2986 1     1 1 4 my $self = shift;
2987              
2988 1         3 return $self->_read_all_in_mem();
2989             }
2990              
2991             sub _read_all_in_mem {
2992 9     9   1002 my $self = shift;
2993              
2994 9 50       17 return 0 unless $self->_status_backward('S3');
2995 9 50       21 return 0 unless $self->_status_forward('S4');
2996              
2997 9         63 return $self;
2998             }
2999              
3000             sub _render {
3001 0     0   0 my $v = $_[0];
3002              
3003 0 0 0     0 if (length($v) == 1 and ord($v) < 32) {
3004 0         0 my $n = ord($v);
3005 0 0       0 return '\n' if $n == 10;
3006 0 0       0 return '\r' if $n == 13;
3007 0 0       0 return '\t' if $n == 9;
3008 0 0       0 return '\f' if $n == 12;
3009 0 0       0 return '\b' if $n == 8;
3010 0 0       0 return '\a' if $n == 7;
3011 0 0       0 return '\e' if $n == 27;
3012 0         0 return '\0' . oct($n);
3013             }
3014 0         0 return $v;
3015             }
3016              
3017             sub print_id {
3018 0     0 1 0 my $self = shift;
3019              
3020 0         0 $self->_printf("-- " . $self->get_in_file_disp() . ":\n");
3021 0         0 $self->_printf("sep_char: " . _render($self->get_sep_char()) . "\n");
3022 0         0 $self->_printf("escape_char: " . _render($self->get_escape_char()) . "\n");
3023 0         0 $self->_printf("in_encoding: " . _render($self->get_in_encoding()) . "\n");
3024 0 0       0 $self->_printf("is_always_quoted: " . ($self->get_is_always_quoted() ? 'yes' : 'no') . "\n");
3025              
3026 0         0 my @coldata = $self->get_coldata();
3027 0         0 my @disp;
3028 0         0 push @disp, [ '#', 'FIELD', 'HEADER', 'EXT DATA', 'DATETIME FORMAT', 'DATETIME LOCALE' ];
3029 0         0 push @disp, [ map { my $s = $_; $s =~ s/./-/g; $s } @{$disp[0]} ];
  0         0  
  0         0  
  0         0  
  0         0  
3030 0         0 for my $i (0..$#coldata) {
3031 0         0 my $col = $coldata[$i];
3032              
3033 0         0 my @row;
3034 0         0 push @row, "$i";
3035 0 0       0 push @row, (defined($col->[$_]) ? ($col->[$_] . '') : '') for (0..4);
3036 0         0 map { s/\n/\\n/g; s/\r/\\r/g; s/\t/\\t/g } @row;
  0         0  
  0         0  
  0         0  
3037 0         0 push @disp, [ @row ];
3038             }
3039 0         0 my $n = @{$disp[-1]};
  0         0  
3040 0         0 my @max = (-1) x $n;
3041 0         0 for my $l (@disp) {
3042 0 0       0 do { $max[$_] = length($l->[$_]) if $max[$_] < length($l->[$_]) } for (0 .. $n - 1);
  0         0  
3043             }
3044 0         0 my $s = join(' ', map { "%-${_}s" } @max);
  0         0  
3045 0         0 $self->_print("\n");
3046 0         0 $self->_printf("$s\n", @{$_}) for (@disp);
  0         0  
3047             }
3048              
3049             sub set_out_file {
3050 2     2 1 5260 my $self = shift;
3051 2         31 validate_pos(@_, {type => SCALAR});
3052              
3053 2         10 my ($out_file) = @_;
3054 2         8 $self->{out_file} = $out_file;
3055              
3056 2         11 return $self;
3057             }
3058              
3059             # Subrefs set with out_map
3060             sub _execute_write_update_before {
3061 367     367   614 my ($self, $ar) = @_;
3062              
3063 367         529 my $columns_ar = $self->{_columns};
3064              
3065 367         511 my $wpre = $self->{_write_update_before_ar};
3066 367         487 for my $i (0..$#{$columns_ar}) {
  367         753  
3067 1177         1495 my $subref = $wpre->[$i];
3068 1177 100       2077 next unless defined($subref);
3069              
3070 159         181 do {
3071 159         257 local $_ = $ar->[$i];
3072 159         212 my $field = $columns_ar->[$i];
3073 159         301 my $new_val = $subref->($self, $field);
3074 156         367 $ar->[$i] = $new_val;
3075             }
3076              
3077             }
3078             }
3079              
3080             # Take into account write_fields if it got set
3081             sub _apply_write_fields {
3082 466     466   797 my ($self, $ar) = @_;
3083              
3084 466         564 my @final;
3085              
3086 466         1308 my $tmp = _get_def($self->{out_fields}, $self->{write_fields});
3087 466 100       1066 my @wf = @{$tmp} if defined($tmp);
  16         26  
3088              
3089 466 100       1001 return unless @wf;
3090              
3091 16         18 my %named_fields = %{$self->{_named_fields}};
  16         63  
3092 16         37 for my $i (0..$#wf) {
3093 40         47 my $field = $wf[$i];
3094 40 100 66     108 my $tmp = $ar->[$named_fields{$field}] if defined($field) and $field ne '';
3095              
3096             # Put here any post-processing of value
3097             # WARNING
3098             # $tmp can be undef
3099             # ...
3100              
3101 40         71 $final[$i] = $tmp;
3102             }
3103 16         50 $_[1] = [ @final ];
3104             }
3105              
3106             sub write {
3107 109     109 1 23249 my $self = shift;
3108              
3109 109         728 validate_pos(@_);
3110              
3111 109 50       390 return undef unless $self->_status_forward('S3');
3112              
3113 104         211 my $verbose = $self->{verbose};
3114 104         170 my $_debug = $self->{_debug};
3115 104         175 my $_debugh = $self->{_debugh};
3116              
3117 104         195 my $out_file = $self->{out_file};
3118              
3119 104         176 my %stats;
3120              
3121 104 50       213 $self->_print("-- $out_file writing start\n") if $verbose;
3122 104         157 my $rows_written = 0;
3123              
3124 104         164 my $outh = $self->{outh};
3125              
3126 104         201 $self->{_close_outh_when_finished} = 0;
3127 104 50       245 unless (defined($outh)) {
3128 104 50       286 if ($out_file eq '') {
3129 0         0 $outh = \*STDOUT;
3130             } else {
3131 104 50       7450 unless (open($outh, '>', $out_file)) {
3132 0         0 $self->_print_error("unable to open file '$out_file': $!");
3133 0         0 return undef;
3134             }
3135 104         449 $self->{_close_outh_when_finished} = 1;
3136             }
3137 104         226 $self->{outh} = $outh;
3138             }
3139              
3140 104 50       264 unless ($self->{_leave_encoding_alone}) {
3141             my $enc = (defined($self->{_inh_encoding}) ?
3142             $self->{_inh_encoding} :
3143 104 50       341 $DEFAULT_OUT_ENCODING);
3144              
3145             # out_encoding option takes precedence
3146 104 100       258 $enc = $self->{out_encoding} if defined($self->{out_encoding});
3147 104         289 my $m = ":encoding($enc)";
3148 104 50 66     264 if (_is_utf8($enc) and $self->{out_utf8_bom}) {
3149 0         0 $m .= ':via(File::BOM)';
3150             }
3151              
3152 104 50 33     423 if ($OS_IS_PLAIN_WINDOWS and $FIX_PERLMONKS_823214) {
3153             # Tested with UTF-16LE, NOT tested with UTF-16BE (it should be the same story)
3154 0 0       0 $m = ":raw:perlio:$m:crlf" if $enc =~ /^utf-?16/i;
3155             }
3156              
3157 104         982 binmode $outh, $m;
3158 104 50       10140 print($_debugh "Encoding string used for output: $m\n") if $_debug;
3159             }
3160              
3161 104         224 my $escape_char = $self->{escape_char};
3162 104         195 my $quote_char = $self->{quote_char};
3163              
3164 104         160 my %opts;
3165 104         202 $opts{binary} = 1;
3166 104         199 $opts{eol} = "\n";
3167              
3168 104 50       350 $opts{sep_char} = $self->{sep_char} if defined($self->{sep_char});
3169 104 100       244 $opts{sep_char} = $self->{out_sep_char} if defined($self->{out_sep_char});
3170              
3171 104 50       316 $opts{quote_char} = $self->{quote_char} if defined($self->{quote_char});
3172 104 50       240 $opts{quote_char} = $self->{out_quote_char} if defined($self->{out_quote_char});
3173              
3174 104 100       286 $opts{escape_char} = $self->{escape_char} if defined($self->{escape_char});
3175 104 100       219 $opts{escape_char} = $self->{out_escape_char} if defined($self->{out_escape_char});
3176              
3177 104         211 $opts{always_quote} = $self->{_is_always_quoted};
3178 104 100       215 $opts{always_quote} = $self->{out_always_quote} if defined($self->{out_always_quote});
3179              
3180 104         774 my $csvout = Text::CSV->new({ %opts });
3181 104 50       16000 if (!defined($csvout)) {
3182 0         0 $self->_print_error("error creating output Text::CSV object");
3183 0         0 return undef;
3184             }
3185              
3186 104         440 my $write_filter_hr = _get_def($self->{out_filter}, $self->{write_filter_hr});
3187              
3188 104 100 66     673 if (($self->{has_headers} and
      100        
      100        
3189             !(defined($self->{out_has_headers}) and !$self->{out_has_headers}))
3190             or $self->{out_has_headers}) {
3191 102         215 my $ar = [ ];
3192 102 100       245 if ($self->{has_headers}) {
3193 98         192 $ar = $self->{_headers};
3194             } else {
3195 4         7 my $nf = $self->{_named_fields};
3196 4         6 $ar->[$nf->{$_}] = $_ for (keys %{$nf});
  4         20  
3197             }
3198              
3199 102 100       271 if (exists $self->{_out_headers}) {
3200 3         4 my $h = $self->{_out_headers};
3201 3         5 for (keys %{$self->{_named_fields}}) {
  3         10  
3202 12 100       22 if (exists $h->{$_}) {
3203 5         13 $ar->[$self->{_named_fields}->{$_}] = $h->{$_};
3204             }
3205             }
3206             }
3207              
3208 102         347 $self->_apply_write_fields($ar);
3209              
3210 102         1568 $csvout->print($outh, $ar);
3211 102         1115 $rows_written++;
3212             }
3213              
3214 104         182 my $do_status_reset = 0;
3215              
3216              
3217             #
3218             # FIXME!!!
3219             #
3220             # Instead of this duplication of code, provide AutoCSV with a "create iterator sub" feature to
3221             # iterate over all records, whatever is going on behind the scene (in-memory or read input).
3222             #
3223             # Such an iterator would also benefit to module users.
3224             #
3225              
3226              
3227 104 100       280 if ($self->{_status} == 4) {
3228              
3229             #
3230             # The content is available in-memory: we write from what we have in-memory then...
3231             #
3232              
3233 25         80 my @keys = $self->get_keys();
3234 25         48 my @ordered_keys = @keys;
3235 25 100       65 if (exists $self->{'out_orderby'}) {
3236 1         2 my @orderby = @{$self->{'out_orderby'}};
  1         2  
3237             @ordered_keys = sort {
3238 1         4 for my $f (@orderby) {
  7         12  
3239 8         12 my $cmp = $self->get_cell($a, $f) cmp $self->get_cell($b, $f);
3240 8 100       20 return $cmp if $cmp;
3241             }
3242 0         0 return 0;
3243             } @keys;
3244             }
3245              
3246             # for my $k ($self->get_keys()) {
3247 25         54 for my $k (@ordered_keys) {
3248 92         179 my $hr = $self->get_row_hr($k);
3249 92 50       173 if (defined($write_filter_hr)) {
3250 0 0       0 next unless $write_filter_hr->($hr);
3251             }
3252 92         117 my $ar = [ @{$self->get_row_ar($k)} ];
  92         158  
3253              
3254 92         229 $self->_execute_write_update_before($ar);
3255 92         182 $self->_apply_write_fields($ar);
3256              
3257 92         483 $csvout->print($outh, $ar);
3258 92         747 $rows_written++;
3259             }
3260              
3261             } else {
3262              
3263             #
3264             # No in-memory content available: we read and write in parallel.
3265             #
3266              
3267 79         275 $self->_register_pass("write()");
3268              
3269 79         257 $self->_open_read();
3270 79         143 my $ar;
3271             my $row_hr;
3272 79         255 while ($self->_read_one_record_from_input(\$ar, \$row_hr)) {
3273 301 100       557 if (defined($write_filter_hr)) {
3274 46 100       103 next unless $write_filter_hr->($row_hr, \%stats, $self->get_recnum());
3275             }
3276 275         504 $ar = [ @{$ar} ];
  275         655  
3277              
3278 275         831 $self->_execute_write_update_before($ar);
3279 272         617 $self->_apply_write_fields($ar);
3280              
3281 272         1571 $csvout->print($outh, $ar);
3282 272         2270 $rows_written++;
3283             }
3284 72         254 $self->_close_read();
3285              
3286 72         127 $do_status_reset = 1
3287             }
3288              
3289 97         267 $self->_close_outh();
3290              
3291 97 50       286 if ($verbose) {
3292 0         0 $self->_print("-- $out_file writing end: $rows_written row(s) written\n");
3293 0         0 for my $k (sort keys %stats) {
3294 0         0 $self->_printf(" %7d %s\n", $stats{$k}, $k);
3295             }
3296             }
3297              
3298 97 100       229 if ($do_status_reset) {
3299 72 50       231 return undef unless $self->_status_reset();
3300             }
3301 97         1192 return $self;
3302             }
3303              
3304              
3305              
3306             #
3307             # * *** ***************************************************************************
3308             # * *** ***************************************************************************
3309             # * OBJ ***************************************************************************
3310             # * *** ***************************************************************************
3311             # * *** ***************************************************************************
3312             #
3313              
3314             #
3315             # The subs below assume Text::AutoCSV can be in status S4 = all in memory.
3316             #
3317              
3318              
3319             sub get_keys {
3320 32     32 1 2003 my $self = shift;
3321 32         216 validate_pos(@_);
3322              
3323 32 50       103 return undef unless $self->_status_forward('S4');
3324              
3325 32         52 my $last_key = @{$self->{_flat}} - 1;
  32         82  
3326 32         85 my @r = (0..$last_key);
3327              
3328 32         81 return @r;
3329             }
3330              
3331             sub get_row_ar {
3332 364     364 1 1051 my $self = shift;
3333 364         2436 validate_pos(@_, {type => SCALAR});
3334 364         860 my ($key) = @_;
3335              
3336 364 50       703 return undef unless $self->_status_forward('S4');
3337              
3338 364 50       669 unless (defined($key)) {
3339 0         0 $self->_print_error("get_row_ar(): \$key is not defined!");
3340 0         0 return undef;
3341             }
3342              
3343 364 100       758 $self->_print_error("unknown row '$key'") unless defined($self->{_flat}->[$key]);
3344 364         721 return $self->{_flat}->[$key];
3345             }
3346              
3347             sub get_row_hr {
3348 269     269 1 403 my $self = shift;
3349 269         2074 validate_pos(@_, {type => SCALAR});
3350 269         717 my ($key) = @_;
3351              
3352 269         553 my $ar = $self->get_row_ar($key);
3353 269 100       498 return undef unless defined($ar);
3354              
3355 267         533 return $self->_ar_to_hr($ar);
3356             }
3357              
3358             #
3359             # Could be made much more efficient (directly read $self->{_flat} instead of calling get_row_hr
3360             # that itself calls get_row_ar).
3361             # I leave it as is because get_hr_all is not good practice (it is not scalable), it was
3362             # primarily done to ease test plan.
3363             #
3364             # By the way I may make it one day not available by default, requesting caller to tune some
3365             # variable (like { $Text::AutoCSV::i_am_the_test_plan = 1 }) to expose it.
3366             #
3367             sub get_hr_all {
3368 108     108 1 2663 my $self = shift;
3369 108         771 validate_pos(@_);
3370              
3371 108         244 my @resp;
3372 108         384 $self->reset_next_record_hr();
3373 108         327 while (my $hr = $self->get_next_record_hr()) {
3374 414         1013 push @resp, $hr;
3375             }
3376 105         631 return @resp;
3377             }
3378              
3379             sub get_recnum {
3380 190     190 1 300 my $self = shift;
3381 190         969 validate_pos(@_);
3382              
3383 190 50       501 return -1 unless $self->{_read_in_progress};
3384 190         362 return _get_def($self->{_row_read}, -1);
3385             }
3386              
3387             sub _check_for_search {
3388 906     906   1584 my ($self, $field) = @_;
3389 906 50       1558 return undef unless $self->_status_forward('S4');
3390              
3391 905 100       2450 return 1 if exists $self->{_named_fields}->{$field};
3392             $self->_print_error("search: unknown field '$field'",
3393 6         34 0, ERR_UNKNOWN_FIELD, $self->{_named_fields});
3394             }
3395              
3396             sub get_cell {
3397 20     20 1 561 my $self = shift;
3398 20         174 validate_pos(@_, {type => SCALAR}, {type => SCALAR});
3399 20         57 my ($key, $field) = @_;
3400              
3401 20 50       34 return undef unless $self->_check_for_search($field);
3402 19         36 my $row = $self->get_row_hr($key);
3403 19 100       44 return $row unless defined($row);
3404 18         61 return $row->{$field};
3405             }
3406              
3407             sub get_values {
3408 9     9 1 3458 my $self = shift;
3409 9         127 validate_pos(@_, {type => SCALAR}, {type => UNDEF | CODEREF, optional => 1});
3410 9         32 my ($field, $filter_subref) = @_;
3411              
3412 9 50       28 return undef unless $self->_check_for_search($field);
3413              
3414 9         18 my @values;
3415 9         32 $self->reset_next_record_hr();
3416 9         23 while (my $hr = $self->get_next_record_hr()) {
3417 53 100       94 if (defined($filter_subref)) {
3418 23         40 local $_ = $hr->{$field};
3419 23 100       40 next unless $filter_subref->();
3420             }
3421 42         171 push @values, $hr->{$field};
3422             }
3423 9         47 return @values;
3424             }
3425              
3426             sub _get_hash_and_projector {
3427 561     561   926 my ($self, $field, $arg_opts) = @_;
3428              
3429 561         774 my $_debug = $self->{_debug};
3430 561         654 my $_debugh = $self->{_debugh};
3431              
3432 561 50       905 my %opts = %{$arg_opts} if defined($arg_opts);
  561         1100  
3433              
3434 561         1685 my $opt_case = _get_def($opts{'case'}, $self->{search_case}, $DEF_SEARCH_CASE);
3435 561         1546 my $opt_trim = _get_def($opts{'trim'}, $self->{search_trim}, $DEF_SEARCH_TRIM);
3436             my $opt_ignore_empty = _get_def($opts{'ignore_empty'}, $self->{search_ignore_empty},
3437 561         1486 $DEF_SEARCH_IGNORE_EMPTY);
3438             my $opt_ignacc = _get_def($opts{'ignore_accents'}, $self->{search_ignore_accents},
3439 561         1396 $DEF_SEARCH_IGNORE_ACCENTS);
3440              
3441 561         1315 my $opts_stringified = $opt_case . $opt_trim . $opt_ignore_empty . $opt_ignacc;
3442 561         899 my $hash_name = "_h${field}_${opts_stringified}";
3443 561         798 my $projector_name = "_p${field}_${opts_stringified}";
3444              
3445 561 100 66     1917 if (exists $self->{$hash_name} and exists $self->{$projector_name}) {
    50 33        
3446 489 50       766 print($_debugh "Search by key '$field': using existing hash and projector (" .
3447             "$hash_name, $projector_name)\n") if $_debug;
3448 489         1315 return ($hash_name, $projector_name);
3449             } elsif (exists $self->{$hash_name} or exists $self->{$projector_name}) {
3450 0         0 confess "Man, check your $PKG module code now!";
3451             }
3452              
3453 72 50       147 print($_debugh "Search by key '$field': building hash\n") if $_debug;
3454              
3455             #
3456             # Projectors
3457             #
3458             # The projector contains subs to derivate the search key from the field value.
3459             # At the moment it is used to manage with case / without case searches and with trim / without trim
3460             # searches (meaning, ignoring spaces at beginning and end of fields)
3461             #
3462             # Why naming it a projector?
3463             # Because if you run it twice on a value, the second run should produce the same result, meaning:
3464             # p(p(x)) = p(x) whatever x
3465             #
3466              
3467 72         100 my @projectors;
3468              
3469             # Add case removal in the projector function list
3470 72 100   1479   295 push @projectors, sub { return lc(shift); } unless $opt_case;
  1479         3507  
3471              
3472             # Add trim in the projector function list
3473 72 100       148 if ($opt_trim) {
3474             push @projectors,
3475             sub {
3476 1479     1479   6391 my $v = shift;
3477 1479         4668 $v =~ s/^\s+|\s+$//g;
3478 1479         3245 return $v;
3479 57         170 };
3480             }
3481              
3482             # Add remove_accents in the projector function list
3483 72 100   1886   239 push @projectors, sub { return remove_accents(shift); } if $opt_ignacc;
  1886         3049  
3484              
3485             my $projector = sub {
3486 1906     1906   3318 my $v = _get_def($_[0], '');
3487 1906         3587 $v = $_->($v) foreach (@projectors);
3488 1906         3177 return $v;
3489 72         190 };
3490              
3491             #
3492             # Filter
3493             #
3494             # As opposed to projectors above (where a search key is transformed), the idea now is to ignore
3495             # certain keys when doing a search.
3496             # At the moment, used to manage searches with / without empty values.
3497             #
3498             # That is to say: shall we use empty value as a regular value to search on, as in
3499             # my @results = $self->search('FIELDNAME', '');
3500             # ?
3501             #
3502             # Right now we don't use an array-based construct, that'd allow to chain filters with one another
3503             # (as we now have only one filter to deal with), later, we may use an array of filters, as done with
3504             # projectors...
3505             #
3506              
3507 72         102 my $filter;
3508 72 100       141 if ($opt_ignore_empty) {
3509 1268     1268   2771 $filter = sub { return $_[0] ne ''; }
3510 61         179 } else {
3511 77     77   123 $filter = sub { return 1; }
3512 11         30 }
3513              
3514 72         136 my %h;
3515             my $k;
3516 72         224 $self->reset_next_record_hr();
3517 72         202 while (my $hr = $self->get_next_record_hr(\$k)) {
3518 1345         2072 my $kv = $hr->{$field};
3519 1345         2090 my $p = $projector->($kv);
3520 1345 100       2153 unless ($filter->($p)) {
3521 76 50       145 print($_debugh "Ignoring key value '$p' in hash build\n") if $_debug;
3522 76         218 next;
3523             }
3524 1269         1689 push @{$h{$p}}, $k;
  1269         5686  
3525             }
3526 72         494 for (keys %h) {
3527 1178         1330 @{$h{$_}} = sort { $a <=> $b } @{$h{$_}};
  1178         2026  
  103         222  
  1178         2065  
3528             }
3529              
3530 72         193 $self->{_hash_build_count}++;
3531 72         805 $self->{$hash_name} = { %h };
3532 72         218 $self->{$projector_name} = $projector;
3533 72         525 return ($hash_name, $projector_name);
3534             }
3535              
3536             sub _get_hash_build_count {
3537 6     6   878 my $self = shift;
3538              
3539 6         18 return _get_def($self->{_hash_build_count}, 0);
3540             }
3541              
3542             sub search {
3543 563     563 1 8507 my $self = shift;
3544 563         4719 validate_pos(@_,
3545             {type => SCALAR}, {type => UNDEF | SCALAR}, {type => UNDEF | HASHREF, optional => 1});
3546 563         1685 my ($field, $value, $param_opts) = @_;
3547              
3548 563         1140 my $croak_if_error = $self->{croak_if_error};
3549              
3550             #
3551             # FIXME?
3552             # A bit overkill to check options each time search is called...
3553             # To be thought about.
3554             #
3555              
3556 563 100       1053 my @tmp = %{$param_opts} if $param_opts;
  421         924  
3557 563         7579 my %opts = validate(@tmp, $SEARCH_VALIDATE_OPTIONS);
3558              
3559 562 50       1930 return undef unless $self->_check_for_search($field);
3560              
3561             # $self->_print_error("undef value in search call") if !defined($value);
3562 561 50       959 $value = '' unless defined($value);
3563              
3564 561         1094 my ($hash_name, $projector_name) = $self->_get_hash_and_projector($field, \%opts);
3565              
3566 561         1358 my $ret = $self->{$hash_name}->{$self->{$projector_name}->($value)};
3567              
3568 561 100       1590 return $ret if defined($ret);
3569 185         516 return [ ];
3570             }
3571              
3572             sub search_1hr {
3573 22     22 1 5923 my $self = shift;
3574 22         275 validate_pos(@_,
3575             {type => SCALAR}, {type => UNDEF | SCALAR}, {type => UNDEF | HASHREF, optional => 1});
3576 22         77 my ($field, $value, $arg_opts) = @_;
3577              
3578 22         49 my $r = $self->search($field, $value, $arg_opts);
3579              
3580 22 100       57 return undef unless defined($r->[0]);
3581              
3582 20         39 my $opts = _get_def($arg_opts, { });
3583             my $opt_ignore_ambiguous = _get_def($opts->{'ignore_ambiguous'},
3584 20         61 $self->{'search_ignore_ambiguous'}, $DEF_SEARCH_IGNORE_AMBIGUOUS);
3585              
3586 20 100 100     34 return undef if @{$r} >= 2 and !$opt_ignore_ambiguous;
  20         83  
3587 14         42 return $self->get_row_hr($r->[0]);
3588             }
3589              
3590             sub vlookup {
3591 317     317 1 21706 my $self = shift;
3592 317         4400 validate_pos(@_, {type => SCALAR}, {type => UNDEF | SCALAR}, {type => SCALAR},
3593             {type => UNDEF | HASHREF, optional => 1});
3594 317         1207 my ($searched_field, $value, $target_field, $arg_opts) = @_;
3595              
3596 317         735 my $r = $self->search($searched_field, $value, $arg_opts);
3597 315 50       674 return undef unless $self->_check_for_search($target_field);
3598              
3599 314         627 my $opts = _get_def($arg_opts, { });
3600 314 100 66     745 unless (defined($r->[0])) {
3601             return (exists $opts->{'value_if_not_found'} ? $opts->{'value_if_not_found'} :
3602 143 100       510 $self->{'search_value_if_not_found'});
3603             } elsif (@{$r} >= 2) {
3604             my $opt_ignore_ambiguous = _get_def($opts->{'ignore_ambiguous'},
3605             $self->{'search_ignore_ambiguous'}, $DEF_SEARCH_IGNORE_AMBIGUOUS);
3606             return (exists $opts->{'value_if_ambiguous'} ? $opts->{'value_if_ambiguous'} :
3607             $self->{'search_value_if_ambiguous'}) if !$opt_ignore_ambiguous;
3608             }
3609              
3610 144 100       285 return $opts->{value_if_found} if exists $opts->{value_if_found};
3611 140 50       239 return $self->{search_value_if_found} if exists $opts->{search_value_if_found};
3612              
3613 140         352 my $hr = $self->get_row_hr($r->[0]);
3614              
3615 140         591 return $hr->{$target_field};
3616             }
3617              
3618             1;
3619              
3620             __END__
3621              
3622             =pod
3623              
3624             =encoding UTF-8
3625              
3626             =head1 NAME
3627              
3628             Text::AutoCSV - helper module to automate the use of Text::CSV
3629              
3630             =head1 VERSION
3631              
3632             version 1.1.9
3633              
3634             =head1 SYNOPSIS
3635              
3636             By default, Text::AutoCSV will detect the following characteristics of the input:
3637              
3638             - The separator, among ",", ";" and "\t" (tab)
3639              
3640             - The escape character, among '"' (double-quote) and '\\' (backslash)
3641              
3642             - Try UTF-8 and if it fails, fall back on latin1
3643              
3644             - Read the header line and compute field names
3645              
3646             - If asked to (see L</fields_dates_auto>), detect any field that contains a DateTime value, trying
3647             20 date formats, possibly followed by a time (6 time formats tested)
3648              
3649             - If asked to (see L</fields_dates>), detect DateTime format of certain fields, croak if no DateTime
3650             format can be worked out
3651              
3652             - Fields identified as containing a DateTime value (L</fields_dates_auto> or L</fields_dates>) are
3653             stored as DateTime objects by default
3654              
3655             Text::AutoCSV also provides methods to search on fields (using cached hash tables) and it can
3656             populate the value of "remote" fields, made from joining 2 CSV files with a key-value search
3657              
3658             =head2 General
3659              
3660             use Text::AutoCSV;
3661              
3662             Text::AutoCSV->new()->write(); # Read CSV data from std input, write to std output
3663              
3664             Text::AutoCSV->new(in_file => 'f.csv')->write(); # Read CSV data from f.csv, write to std output
3665              
3666             # Read CSV data from f.csv, write to g.csv
3667             Text::AutoCSV->new(in_file => 'f.csv', out_file => 'g.csv')->write();
3668              
3669             # "Rewrite" CSV file by printing out records as a list (separated by line breaks) of field
3670             # name followed by its value.
3671             my $csv = Text::AutoCSV->new(in_file => 'in.csv', walker_hr => \&walk);
3672             my @cols = $csv->get_fields_names();
3673             $csv->read();
3674             sub walk {
3675             my %rec = %{$_[0]};
3676             for (@cols) {
3677             next if $_ eq '';
3678             print("$_ => ", $rec{$_}, "\n");
3679             }
3680             print("\n");
3681             }
3682              
3683             =head2 OBJ-ish functions
3684              
3685             # Identify column internal names with more flexibility as the default mechanism
3686             my $csv = Text::AutoCSV->new(in_file => 'zips.csv',
3687             fields_hr => {'CITY' => '^(city|town)', 'ZIPCODE' => '^zip(code)?$'});
3688             # Get zipcode of Claix
3689             my $z = $csv->vlookup('CITY', ' claix ', 'ZIPCODE');
3690              
3691             my $csv = Text::AutoCSV->new(in_file => 'zips.csv');
3692             # Get zipcode of Claix
3693             my $z = $csv->vlookup('CITY', ' claix ', 'ZIPCODE');
3694             # Same as above, but vlookup is strict for case and spaces around
3695             my $csv = Text::AutoCSV->new(in_file => 'zips.csv', search_case => 1, search_trim => 0);
3696             my $z = $csv->vlookup('CITY', 'Claix', 'ZIPCODE');
3697              
3698             # Create field 'MYCITY' made by taking pers.csv' ZIP column value, looking it up in the
3699             # ZIPCODE columns of zips.csv, taking CITY colmun value and naming it 'MYCITY'. Output is
3700             # written in std output.
3701             # If a zipcode is ambiguous, say it.
3702             Text::AutoCSV->new(in_file => 'pers.csv')
3703             ->field_add_link('MYCITY', 'ZIP->ZIPCODE->CITY', 'zips.csv',
3704             { ignore_ambiguous => 0, value_if_ambiguous => '<duplicate zipcode found!>' })->write();
3705              
3706             # Note the above can also be written using Text::AutoCSV level attributes:
3707             Text::AutoCSV->new(in_file => 'pers.csv',
3708             search_ignore_ambiguous => 0, search_value_if_ambiguous => '<duplicate zipcode found!>')
3709             ->field_add_link('MYCITY', 'ZIP->ZIPCODE->CITY', 'zips.csv')->write();
3710              
3711             # Create 'MYCITY' field as above, then display some statistics
3712             my $nom_compose = 0;
3713             my $zip_not_found = 0;
3714             Text::AutoCSV->new(in_file => 'pers.csv', walker_hr => \&walk)
3715             ->field_add_link('MYCITY', 'ZIP->ZIPCODE->CITY', 'zips.csv')->read();
3716             sub walk {
3717             my $hr = shift;
3718             $nom_compose++ if $hr->{'NAME'} =~ m/[- ]/;
3719             $zip_not_found++ unless defined($hr->{'MYCITY'});
3720             }
3721             print("Number of persons with a multi-part name: $nom_compose\n");
3722             print("Number of persons with unknown zipcode: $zip_not_found\n");
3723              
3724             =head2 Updating
3725              
3726             Text::AutoCSV->new(in_file => 'names.csv', out_file => 'ucnames.csv',
3727             read_post_update_hr => \&updt)->write();
3728             sub updt { $_[0]->{'LASTNAME'} =~ s/^.*$/\U&/; }
3729              
3730             Text::AutoCSV->new(in_file => 'squares.csv', out_file => 'checkedsquares.csv',
3731             out_filter => \&wf)->write();
3732             sub wf { return ($_[0]->{'X'} ** 2 == $_[0]->{'SQUAREOFX'}); }
3733              
3734             # Add a field for the full name, made of the concatenation of the
3735             # first name and the last name.
3736             # Also display stats about empty full names.
3737             Text::AutoCSV->new(in_file => 'dirpeople.csv', out_file => 'dirwithfn.csv', verbose => 1)
3738             ->field_add_computed('FULLNAME', \&calc_fn)->write();
3739             sub calc_fn {
3740             my ($field, $hr, $stats) = @_;
3741             my $fn = $hr->{'FIRSTNAME'} . ' ' . uc($hr->{'LASTNAME'});
3742             $stats->{'empty full name'}++ if $fn eq ' ';
3743             return $fn;
3744             }
3745              
3746             # Read a file with a lot of columns and keep only 2 columns in output
3747             Text::AutoCSV->new(in_file => 'big.csv', out_file => 'addr.csv',
3748             out_fields => ['NAME', 'ADDRESS'])
3749             ->out_header('ADDRESS', 'Postal Address')
3750             ->write();
3751              
3752             =head2 Datetime management
3753              
3754             # Detect any field containing a DateTime value and convert it to yyyy-mm-dd whatever the
3755             # input format is.
3756             Text::AutoCSV->new(in_file => 'in.csv', out_file => 'out.csv', fields_dates_auto => 1,
3757             out_dates_format => '%F')->write();
3758              
3759             # Detect any field containing a DateTime value and convert it to a US DateTime whatever the
3760             # input format is.
3761             Text::AutoCSV->new(in_file => 'in.csv', out_file => 'out.csv', fields_dates_auto => 1,
3762             out_dates_format => '%b %d, %Y, %I:%M:%S %p', out_dates_locale => 'en')->write();
3763              
3764             # Find dates of specific formats and convert it into yyyy-mm-dd
3765             Text::AutoCSV->new(in_file => 'raw.csv', out_file => 'cooked.csv',
3766             dates_formats_to_try => ['%d_%m_%Y', '%m_%d_%Y', '%Y_%m_%d'],
3767             out_dates_format => '%F')->write();
3768              
3769             # Take the dates on columns 'LASTLOGIN' and 'CREATIONDATE' and convert it into French dates
3770             # (day/month/year).
3771             # Text::AutoCSV will croak if LASTLOGIN or CREATIONDATE do not contain a DateTime format.
3772             # By default, Text::AutoCSV will try 20 different formats.
3773             Text::AutoCSV->new(in_file => 'in.csv', out_file => 'out.csv',
3774             fields_dates => ['LASTLOGIN', 'CREATIONDATE'], out_dates_format => '%d/%m/%Y')->write();
3775              
3776             # Convert 2 DateTime fields into unix standard epoch
3777             # Write -1 if DateTime is empty.
3778             sub toepoch { return $_->epoch() if $_; -1; }
3779             Text::AutoCSV->new(in_file => 'stats.csv', out_file => 'stats-epoch.csv',
3780             fields_dates => ['ATIME', 'MTIME'])
3781             ->in_map('ATIME', \&toepoch)
3782             ->in_map('MTIME', \&toepoch)
3783             ->write();
3784              
3785             # Do the other way round from above: convert 2 fields containing unix standard epoch into a
3786             # string displaying a human-readable DateTime.
3787             my $formatter = DateTime::Format::Strptime->new(pattern => 'DATE=%F, TIME=%T');
3788             sub fromepoch {
3789             return $formatter->format_datetime(DateTime->from_epoch(epoch => $_)) if $_ >= 0;
3790             '';
3791             }
3792             $csv = Text::AutoCSV->new(in_file => 'stats-epoch.csv', out_file => 'stats2.csv')
3793             ->in_map('ATIME', \&fromepoch)
3794             ->in_map('MTIME', \&fromepoch)
3795             ->write();
3796              
3797             =head2 Miscellaneous
3798              
3799             use Text::AutoCSV 'remove_accents';
3800             # Output 'Francais: etre elementaire, Tcheque: sluzba dum' followed by a new line.
3801             print remove_accents("Français: être élémentaire, Tchèque: služba dům"), "\n";
3802              
3803             =for Pod::Coverage ERR_UNKNOWN_FIELD
3804              
3805             =head1 NAME
3806              
3807             Text::AutoCSV - helper module to automate the use of Text::CSV
3808              
3809             =head1 METHODS
3810              
3811             =head2 new
3812              
3813             my $csv = Text::AutoCSV->new(%attr);
3814              
3815             (Class method) Returns a new instance of Text::AutoCSV. The object attributes are described by the
3816             hash C<%attr> (can be empty).
3817              
3818             Currently the following attributes are available:
3819              
3820             =over 4
3821              
3822             =item Preliminary note about L</fields_hr>, L</fields_ar> and L</fields_column_names> attributes
3823              
3824             By default, Text::AutoCSV assumes the input has a header and will use the field values of this first
3825             line (the header) to work out the column internal names. These internal names are used everywhere in
3826             Text::AutoCSV to designate columns.
3827              
3828             The values are transformed as follows:
3829              
3830             - All accents are removed using the exportable function L</remove_accents>.
3831              
3832             - Any non-alphanumeric character is removed (except underscore) and all letters are switched to
3833             upper case. The regex to do this is
3834              
3835             s/[^[:alnum:]_]//gi; s/^.*$/\U$&/;
3836              
3837             Thus a header line of
3838              
3839             'Office Number 1,Office_2,Personal Number'
3840              
3841             will produce the internal column names
3842              
3843             'OFFICENUMBER1' (first column)
3844              
3845             'OFFICE_2' (second column)
3846              
3847             'PERSONALNUMBER' (third column).
3848              
3849             The attribute L</fields_hr>, L</fields_ar> or L</fields_column_names> (only one of the three is
3850             useful at a time) allows to change this behavior.
3851              
3852             B<NOTE>
3853              
3854             The removal of accents is *not* a conversion to us-ascii, see L</remove_accents> for details.
3855              
3856             =item Preliminary note about fields reading
3857              
3858             Functions that are given a field name (L</get_cell>, L</vlookup>, L</field_add_copy>, ...) raise an
3859             error if the field requested does not exist.
3860              
3861             B<SO WILL THE HASHREFS GIVEN BY Text::AutoCSV:> when a function returns a hashref (L</search_1hr>,
3862             L</get_row_hr>, ...), the hash is locked with the C<lock_keys> function of C<Hash::Util>. Any
3863             attempt to read a non-existing key from the hash causes a croak. This feature is de-activated if you
3864             specified C<croak_if_error =E<gt> 0> when creating Text::AutoCSV object.
3865              
3866             =item in_file
3867              
3868             The name of the file to read CSV data from.
3869              
3870             If not specified or empty, read standard input.
3871              
3872             Example:
3873              
3874             my $csv = Text::AutoCSV->new(in_file => 'in.csv');
3875              
3876             =item inh
3877              
3878             File handle to read CSV data from.
3879             Normally you don't want to specify this attribute.
3880              
3881             C<inh> is useful if you don't like the way Text::AutoCSV opens the input file for you.
3882              
3883             Example:
3884              
3885             open my $inh, "producecsv.sh|";
3886             my $csv = Text::AutoCSV->new(inh => $inh);
3887              
3888             =item encoding
3889              
3890             Comma-separated list of encodings to try to read input.
3891              
3892             Note that finding the correct encoding of any given input is overkill. This script just tries
3893             encodings one after the other, and selects the first one that does not trigger a warning during
3894             reading of input. If all produce warnings, select the first one.
3895              
3896             The encoding chosen is used in output, unless attribute L</out_encoding> is specified.
3897              
3898             Value by default: 'UTF-8,latin1'
3899              
3900             B<IMPORTANT>
3901              
3902             If one tries something like C<encoding =E<gt> 'latin1,UTF-8'>, it'll almost never detect UTF-8
3903             because latin1 rarely triggers warnings during reading. It tends to be also true with encodings like
3904             UTF-16 that can remain happy with various inputs (sometimes resulting in Western languages turned
3905             into Chinese text).
3906              
3907             Ultimately this attribute should be used with a unique value. The result when using more than one
3908             value can produce weird results and should be considered B<experimental>.
3909              
3910             Example:
3911              
3912             my $csv = Text::AutoCSV->new(in_file => 'w.csv', encoding => 'UTF-16');
3913              
3914             =item via
3915              
3916             Adds a C<via> to the file opening instruction performed by Text::AutoCSV. You don't want to use it
3917             under normal circumstances.
3918              
3919             The value should start with a ':' character (Text::AutoCSV won't add one for you).
3920              
3921             Value by default: none
3922              
3923             Example:
3924              
3925             my $csv = Text::AutoCSV->new(in_file => 'in.csv', via => ':raw:perlio:UTF-32:crlf');
3926              
3927             =item dont_mess_with_encoding
3928              
3929             If true, just ignore completely encoding and don't try to alter I/O operations with encoding
3930             considerations (using C<binmode> instruction). Note that if inh attribute is specified, then
3931             Text::AutoCSV will consider the caller manages encoding for himself and dont_mess_with_encoding will
3932             be automatically set, too.
3933              
3934             B<IMPORTANT>
3935              
3936             This attribute does not mean perl will totally ignore encoding and would consider character strings
3937             as bytes for example. The meaning of L</dont_mess_with_encoding> is that Text::AutoCSV itself will
3938             totally ignore encoding matters, and leave it entirely to Perl' default.
3939              
3940             Value by default:
3941              
3942             0 if inh attribute is not set
3943             1 if inh attribute is set
3944              
3945             Example:
3946              
3947             my $csv = Text::AutoCSV->new(in_file => 'in.csv', dont_mess_with_encoding => 1);
3948              
3949             =item sep_char
3950              
3951             Specify the CSV separator character. Turns off separator auto-detection. This attribute is passed as
3952             is to C<Text::CSV-E<gt>new()>.
3953              
3954             Example:
3955              
3956             my $csv = Text::AutoCSV->new(in_file => 'in.csv', sep_char => ';');
3957              
3958             =item quote_char
3959              
3960             Specify the field quote character. This attribute is passed as is to C<Text::CSV-E<gt>new()>.
3961              
3962             Value by default: double quote ('"')
3963              
3964             Example:
3965              
3966             my $csv = Text::AutoCSV->new(in_file => 'in.csv', quote_char => '\'');
3967              
3968             =item escape_char
3969              
3970             Specify the escape character. Turns off escape character auto-detection. This attribute is passed as
3971             is to C<Text::CSV-E<gt>new()>.
3972              
3973             Value by default: backslash ('\\')
3974              
3975             Example:
3976              
3977             my $csv = Text::AutoCSV->new(in_file => 'in.csv', escape_char => '"');
3978              
3979             =item in_csvobj
3980              
3981             Text::CSV object to use.
3982             Normally you don't want to specify this attribute.
3983              
3984             By default, Text::AutoCSV will manage creating such an object and will work hard to detect the
3985             parameters it requires.
3986              
3987             Defining C<in_csvobj> attribute turns off separator character and escape character auto-detection.
3988              
3989             Using this attribute workarounds Text::AutoCSV philosophy a bit, but you may need it in case
3990             Text::AutoCSV behavior is not suitable for Text::CSV creation.
3991              
3992             Example:
3993              
3994             my $tcsv = Text::CSV->new();
3995             my $acsv = Text::AutoCSV->new(in_file => 'in.csv', in_csvobj => $tcsv);
3996              
3997             =item has_headers
3998              
3999             If true, Text::AutoCSV assumes the input has a header line.
4000              
4001             Value by default: 1
4002              
4003             Example:
4004              
4005             my $csv = Text::AutoCSV->new(in_file => 'in.csv', has_headers => 0);
4006              
4007             =item fields_hr
4008              
4009             (Only if input has a header line) Hash ref that contains column internal names along with a regular
4010             expression to find it in the header line.
4011             For example if you have:
4012              
4013             my $csv = Text::AutoCSV->new(in_file => 'in.csv',
4014             fields_hr => {'PHONE OFFICE' => '^office phone nu',
4015             'PHONE PERSONAL' => '^personal phone nu'});
4016              
4017             And the header line is
4018              
4019             'Personal Phone Number,Office Phone Number'
4020              
4021             the column name 'PHONE OFFICE' will designate the second column and the column name 'PHONE PERSONAL'
4022             will designate the first column.
4023              
4024             You can choose column names like 'Phone Office' and 'Phone Personal' as well.
4025              
4026             The regex search is case insensitive.
4027              
4028             =item fields_ar
4029              
4030             (Only if input has a header line) Array ref that contains column internal names. The array is used
4031             to create a hash ref of the same kind as L</fields_hr>, by wrapping the column name in a regex. The
4032             names are surrounded by a leading '^' and a trailing '$', meaning, the name must match the entire
4033             field name.
4034              
4035             For example
4036              
4037             fields_ar => ['OFFICENUMBER', 'PERSONALNUMBER']
4038              
4039             is strictly equivalent to
4040              
4041             fields_hr => {'OFFICENUMBER' => '^officenumber$', 'PERSONALNUMBER' = '^personalnumber$'}
4042              
4043             The regex search is case insensitive.
4044              
4045             C<fields_ar> is useful if the internal names are identical to the file column names. It avoids
4046             repeating the names over and over as would happen if using L</fields_hr> attribute.
4047              
4048             I<NOTE>
4049              
4050             You might wonder why using fields_ar as opposed to Text::AutoCSV default' mechanism. There are two
4051             reasons for that:
4052              
4053             1- Text::AutoCSV removes spaces from column names, and some people may want another behavior. A
4054             header name of 'Phone Number' will get an internal column name of 'PHONENUMBER' (default behavior,
4055             if none of fields_hr, fields_ar and fields_column_names attributes is specified), and one may prefer
4056             'PHONE NUMBER' or 'phone number' or whatsoever.
4057              
4058             2- By specifying a list of columns using either of fields_hr or fields_ar, you not only map column
4059             names as found in the header line to internal column names: you also I<request> these columns to be
4060             available. If one of the requested columns cannot be found, Text::AutoCSV will croak (default) or
4061             print an error and return an undef object (if created with C<croak_if_error =E<gt> 0>).
4062              
4063             =item fields_column_names
4064              
4065             Array ref of column internal names, in the order of columns in file. This attribute works like the
4066             C<column_names> attribute of Text::CSV. It'll just assign names to columns one by one, regardless of
4067             what the header line contains. It'll work also if the file has no header line.
4068              
4069             Example:
4070              
4071             my $csv = Text::AutoCSV->new(in_file => 'in.csv',
4072             fields_column_names => ['My COL1', '', 'My COL3']);
4073              
4074             =item out_file
4075              
4076             Output file when executing the L</write> method.
4077              
4078             If not specified or empty, write to standard output.
4079              
4080             Example:
4081              
4082             my $csv = Text::AutoCSV->new(in_file => 'in.csv', out_file => 'out.csv');
4083              
4084             =item outh
4085              
4086             File handle to write CSV data to when executing the L</write> method.
4087             Normally you don't want to specify this attribute.
4088              
4089             C<outh> is useful if you don't like the way Text::AutoCSV opens the output file for you.
4090              
4091             Example:
4092              
4093             my $outh = open "myin.csv', ">>";
4094             my $csv = Text::AutoCSV->new(in_file => 'in.csv', has_headers => 0, outh => $outh);
4095              
4096             =item out_encoding
4097              
4098             Enforce the encoding of output.
4099              
4100             Value by default: input encoding
4101              
4102             Example:
4103              
4104             my $csv = Text::AutoCSV->new(in_file => 'in.csv', out_file => 'out.csv',
4105             out_encoding => 'UTF-16');
4106              
4107             =item out_utf8_bom
4108              
4109             Enforce BOM (Byte-Order-Mark) on output, when it is UTF8. If output encoding is not UTF-8, this
4110             attribute is ignored.
4111              
4112             B<NOTE>
4113              
4114             UTF-8 needs no BOM (there is no Byte-Order in UTF-8), and in practice, UTF8-encoded files rarely
4115             have a BOM.
4116              
4117             Using this attribute is not recommended. It is provided for the sake of completeness, and also to
4118             produce Unicode files Microsoft EXCEL will be happy to read.
4119              
4120             At first sight it would seem more logical to make EXCEL happy with something like this:
4121              
4122             out_encoding => 'UTF-16'
4123              
4124             But... While EXCEL will identify UTF-16 and read it as such, it will not take into account the BOM
4125             found at the beginning. In the end the first cell will have 2 useless characters prepended. The only
4126             solution the author knows to workaround this issue if to use UTF-8 as output encoding, and enforce a
4127             BOM. That is, use:
4128              
4129             ..., out_encoding => 'UTF-8', out_utf8_bom => 1, ...
4130              
4131             =item out_sep_char
4132              
4133             Enforce the output CSV separator character.
4134              
4135             Value by default: input separator
4136              
4137             Example:
4138              
4139             my $csv = Text::AutoCSV->new(in_file => 'in.csv', out_file => 'out.csv', out_sep_char => ',');
4140              
4141             =item out_quote_char
4142              
4143             Enforce the output CSV quote character.
4144              
4145             Value by default: input quote character
4146              
4147             Example:
4148              
4149             my $csv = Text::AutoCSV->new(in_file => 'in.csv', out_file => 'out.csv', out_quote_char => '"');
4150              
4151             =item out_escape_char
4152              
4153             Enforce the output CSV escape character.
4154              
4155             Value by default: input escape character
4156              
4157             Example:
4158              
4159             my $csv = Text::AutoCSV->new(in_file => 'in.csv', out_file => 'out.csv',
4160             out_escape_char_char => '\\');
4161              
4162             =item out_always_quote
4163              
4164             If true, quote all fields of output (set always_quote of Text::CSV).
4165              
4166             If false, don't quote all fields of output (don't set C<always_quote> of Text::CSV).
4167              
4168             Value by default: same as what is found in input
4169              
4170             While reading input, Text::AutoCSV works out whether or not all fields were quoted. If yes, then the
4171             output Text::CSV object has the always_quote attribute set, if no, then the output Text::CSV object
4172             does not have this attribute set.
4173              
4174             Example:
4175              
4176             my $csv = Text::AutoCSV->new(in_file => 'in.csv', out_file => 'out.csv', out_always_quote => 1);
4177              
4178             =item out_has_headers
4179              
4180             If true, when writing output, write a header line on first line.
4181              
4182             If false, when writing output, don't write a header line on first line.
4183              
4184             Value by default: same as has_headers attribute
4185              
4186             Example 1
4187              
4188             Read standard input and write to standard output, removing the header line.
4189              
4190             Text::AutoCSV->new(out_has_headers => 0)->write();
4191              
4192             Example 2
4193              
4194             Read standard input and write to standard output, adding a header line.
4195              
4196             Text::AutoCSV->new(fields_column_names => ['MYCOL1', 'MYCOL2'], out_has_headers => 1)->write();
4197              
4198             =item no_undef
4199              
4200             If true, non-existent column values are set to an empty string instead of undef. It is also done on
4201             extra fields that happen to have an undef value (for example when the target of a linked field is
4202             not found).
4203              
4204             Note this attribute does not work on callback functions output set with L</in_map>: for example
4205             empty DateTime values (on fields identified as containing a date/time, see C<dates_*> attributes
4206             below) are set to C<undef>, even while C<no_undef> is set. Indeed setting it to an empty string
4207             while non-empty values would contain a Datetime object would not be clean. An empty value in a
4208             placeholder containing an object must be undef.
4209              
4210             Since version 1.1.5 of Text::AutoCSV, C<no_undef> is examined when sending parameter ($_) to
4211             L</in_map> callback: an undef value is now passed as is (as undef), unless C<no_undef> is set. If
4212             C<no_undef> is set, and field value is undef, then $_ is set to the empty string ('') when calling
4213             callback defined by L</in_map>. This new behavior was put in place to be consistent with what is
4214             being done with DateTime values.
4215              
4216             Value by default: 0
4217              
4218             Example:
4219              
4220             my $csv = Text::AutoCSV->new(in_file => 'in.csv', no_undef => 1);
4221              
4222             =item read_post_update_hr
4223              
4224             To be set to a ref sub. Each time a record is read from input, call C<read_post_update_hr> to update
4225             the hash ref of the record. The sub is called with 2 arguments: the hash ref to the record value and
4226             the hash ref to stats.
4227              
4228             The stats allow to count events and are printed in the end of reading in case Text::AutoCSV is
4229             called in verbose mode (C<verbose =E<gt> 1>).
4230              
4231             For example, the C<read_post_update_hr> below will turn column 'CITY' values in upper case and count
4232             occurences of empty cities in stat display:
4233              
4234             Text::AutoCSV->new(in_file => 'addresses.csv', read_post_update_hr => \&updt, verbose => 1)
4235             ->write();
4236             sub updt {
4237             my ($hr, $stats) = @_;
4238             $hr->{'CITY'} =~ s/^.*$/\U$&/;
4239             $stats->{'empty city encountered'}++ if $hr->{'CITY'} eq '';
4240             }
4241              
4242             B<IMPORTANT>
4243              
4244             You cannot create a field this way. To create a field, you have to use the member functions
4245             L</field_add_link>, L</field_add_copy> or L</field_add_computed>.
4246              
4247             B<NOTE>
4248              
4249             If you wish to manage some updates at field level, consider registering update functions with
4250             L</in_map> and L</out_map> member functions. These functions register callbacks that work at field
4251             level and with $_ variable (thus the callback function invoked is AutoCSV-agnostic).
4252              
4253             L</in_map> updates a field after read, L</out_map> updates the field content before writing it.
4254              
4255             =item walker_hr
4256              
4257             To set to a sub ref that'll be executed each time a record is read from input. It is executed after
4258             L</read_post_update_hr>. The sub is called with 2 arguments: the hash ref to the record value and
4259             the hash ref to stats.
4260              
4261             Note L</read_post_update_hr> is meant for updating record fields just after reading, whereas
4262             L</walker_hr> is read-only.
4263              
4264             The stats allow to count events and are printed in the end of reading in case Text::AutoCSV is
4265             called in verbose mode (C<verbose =E<gt> 1>). If the L</verbose> attribute is not set, the stats are
4266             not displayed, however you can get stats by calling the get_stats function.
4267              
4268             The example below will count in the stats the number of records where the 'CITY' field is empty.
4269             Thanks to C<verbose =E<gt> 1> attribute, at the end of reading the stats are displayed.
4270              
4271             my $csv = Text::AutoCSV->new(in_file => 'addresses.csv', walker_hr => \&walk1,
4272             verbose => 1)->read();
4273             sub walk1 {
4274             my ($hr, $stats) = @_;
4275             $stats->{'empty city'}++ if $hr->{'CITY'} eq '';
4276             }
4277              
4278             =item walker_ar
4279              
4280             To set to a sub ref that'll be executed each time a record is read from input. It is executed after
4281             L</read_post_update_hr>. The sub is called with 2 arguments: the array ref to the record value and
4282             the hash ref to stats.
4283              
4284             Note L</read_post_update_hr> is meant for updating record fields just after reading, whereas
4285             C<walker_hr> is read-only.
4286              
4287             The stats allow to count events and are printed in the end of reading in case Text::AutoCSV is
4288             called in verbose mode (C<verbose =E<gt> 1>). If the L</verbose> attribute is not set, the stats are
4289             lost.
4290              
4291             The array ref contains values in their natural order in the CSV. To be used with the column names,
4292             you have to use L</get_fields_names> member function.
4293              
4294             The example below will count in the stats the number of records where the 'CITY' field is empty.
4295             Thanks to C<verbose =E<gt> 1> attribute, at the end of reading the stats are displayed. It produces
4296             the exact same result as the example in walker_hr attribute, but it uses walker_ar.
4297              
4298             use List::MoreUtils qw(first_index);
4299             my $csv = Text::AutoCSV->new(in_file => 'addresses.csv', walker_ar => \&walk2, verbose => 1);
4300             my @cols = $csv->get_fields_names();
4301             my $idxCITY = first_index { /^city$/i } @cols;
4302             die "No city field!??" if $idxCITY < 0;
4303             $csv->read();
4304             sub walk2 {
4305             my ($ar, $stats) = @_;
4306             $stats->{'empty city'}++ if $ar->[$idxCITY] eq '';
4307             }
4308              
4309             =item write_filter_hr
4310              
4311             Alias of L</out_filter>.
4312              
4313             =item out_filter
4314              
4315             To be set to a ref sub. Before writing a record to output, C<out_filter> is called and the record
4316             gets writen only if C<out_filter> return value is true. The sub is called with 1 argument: the hash
4317             ref to the record value.
4318              
4319             For example, if you want to output only records where the 'CITY' column value is Grenoble:
4320              
4321             Text::AutoCSV->new(in_file => 'addresses.csv', out_file => 'grenoble.csv',
4322             out_filter => \&filt)->write();
4323             sub filt {
4324             my $hr = shift;
4325             return 1 if $hr->{'CITY'} =~ /^grenoble$/i;
4326             return 0;
4327             }
4328              
4329             =item write_fields
4330              
4331             Alias of L</out_fields>.
4332              
4333             =item out_fields
4334              
4335             Set to an array ref. List fields to write to output.
4336              
4337             Fields are written in their order in the array ref, the first CSV column being the first element in
4338             the array, and so on. Fields not listed in B<out_fields> are not written in output.
4339              
4340             You can use empty field names to have empty columns in output.
4341              
4342             Value by default: none, meaning, all fields are output in their natural order. What is natural
4343             order? It is the input order for fields that were read from input, and the order in which they got
4344             created for created fields.
4345              
4346             Example:
4347              
4348             Text::AutoCSV->new(in_file => 'allinfos.csv', out_file => 'only-addresses.csv',
4349             out_fields => [ 'NAME', 'ADDRESS' ] )->write();
4350              
4351             =item out_orderby
4352              
4353             Array reference to a list of fields to sort output with.
4354              
4355             At the moment this feature is a bit of a hack (no option to make sort descending or ascending,
4356             numeric or text, and it is not part of test plan).
4357              
4358             Example:
4359              
4360             Text::AutoCSV->new(in_file => 'names.csv', out_file => 'sortednames.csv',
4361             out_orderby => [ 'LASTNAME', 'FIRSTNAME']);
4362              
4363             =item search_case
4364              
4365             If true, searches are case sensitive by default. Searches are done by the member functions
4366             L</search>, L</search_1hr>, L</vlookup>, and linked fields (L</field_add_link>).
4367              
4368             The search functions can also be called with the option L</case>, that takes precedence over the
4369             object-level C<search_case> attribute value. See L</vlookup> help.
4370              
4371             Value by default: 0 (by default searches are case insensitive)
4372              
4373             Example:
4374              
4375             my $csv = Text::AutoCSV->new(in_file => 'in.csv', search_case => 1);
4376              
4377             =item search_trim
4378              
4379             If true, searches ignore the presence of leading or trailing spaces in values.
4380              
4381             The search functions can also be called with the option L</trim>, that takes precedence over the
4382             object-level C<search_trim> attribute value. See L</vlookup> help.
4383              
4384             Value by default: 1 (by default searches ignore leading and trailing spaces)
4385              
4386             Example:
4387              
4388             my $csv = Text::AutoCSV->new(in_file => 'in.csv', search_trim => 0);
4389              
4390             =item search_ignore_empty
4391              
4392             If true, empty fields are not included in the search indexes.
4393              
4394             The search functions can also be called with the option L</ignore_empty>, that takes precedence over
4395             the object-level C<search_ignore_empty> attribute value. See L</vlookup> help.
4396              
4397             Value by default: 1 (by default, search of the value '' will find nothing)
4398              
4399             Example:
4400              
4401             my $csv = Text::AutoCSV->new(in_file => 'in.csv', search_ignore_empty => 0);
4402              
4403             =item search_ignore_accents
4404              
4405             If true, accents are ignored by search indexes.
4406              
4407             The search functions can also be called with the option L</ignore_accents>, that takes precedence
4408             over the object-level C<search_ignore_accents> attribute value. See L</vlookup> help.
4409              
4410             Value by default: 1 (by default, accents are ignored by search functions)
4411              
4412             Example:
4413              
4414             my $csv = Text::AutoCSV->new(in_file => 'in.csv', search_ignore_accents => 0);
4415              
4416             =item search_value_if_not_found
4417              
4418             When a search is done with a unique value to return (field_add_link member function behavior or
4419             return value of vlookup), default value of option L</value_if_not_found>. See L</vlookup>.
4420              
4421             =item search_value_if_found
4422              
4423             When a search is done with a unique value to return (field_add_link member function behavior or
4424             return value of vlookup), default value of option L</value_if_found>. See L</vlookup>.
4425              
4426             B<IMPORTANT>
4427              
4428             This attribute is extremly unusual. Once you've provided it, all vlookups and the target field value
4429             of fields created with field_add_link will all be populated with the value provided with this
4430             option.
4431              
4432             Don't use it unless you know what you are doing.
4433              
4434             =item search_ignore_ambiguous
4435              
4436             When a search is done with a unique value to return (field_add_link member function behavior or
4437             return value of search_1hr and vlookup), default value of option L</ignore_ambiguous>. See
4438             L</vlookup>.
4439              
4440             =item search_value_if_ambiguous
4441              
4442             When a search is done with a unique value to return (field_add_link member function behavior or
4443             return value of vlookup), default value of option L</value_if_ambiguous>. See L</vlookup>.
4444              
4445             =item fields_dates
4446              
4447             Array ref of field names that contain a date.
4448              
4449             Once the formats of these fields is known (auto-detection by default), each of these fields will get
4450             a specific L</in_map> sub that converts the text in a DateTime object and a L</out_map> sub that
4451             converts back from DateTime to text.
4452              
4453             B<NOTE>
4454              
4455             The L</out_map> given to a DateTime field is "defensive code": normally, L</in_map> converts text
4456             into a DateTime object and L</out_map> does the opposite, it takes a DateTime object and converts it
4457             to text. If ever L</out_map> encounters a value that is not a DateTime object, it'll just stringify
4458             it (evaluation in a string context), without calling its DateTime formatter.
4459              
4460             If the format cannot be detected for a given field, output an error message and as always when an
4461             error occurs, croak (unless L</croak_if_error> got set to 0).
4462              
4463             Value by default: none
4464              
4465             Example:
4466              
4467             my $csv = Text::AutoCSV->new(in_file => 'logins.csv',
4468             fields_dates => ['LASTLOGIN', 'CREATIONDATE']);
4469              
4470             =item fields_dates_auto
4471              
4472             Boolean value. If set to 1, will detect dates formats on all fields. Fields in which a DateTime
4473             format got detected are then managed as if they had been being listed in L</fields_dates> attribute:
4474             they get an appropriate L</in_map> sub and a L</out_map> sub to convert to and from DateTime (see
4475             L</fields_dates> attribute above).
4476              
4477             C<fields_dates_auto> looks for DateTime on all fields, but it expects nothing: it won't raise an
4478             error if no field is found that contains DateTime.
4479              
4480             Value by default: 0
4481              
4482             Example:
4483              
4484             my $csv = Text::AutoCSV->new(in_file => 'logins.csv', fields_dates_auto => 1);
4485              
4486             =item fields_dates_auto_optimize
4487              
4488             Relevant only if L</fields_dates_auto> is set.
4489              
4490             Normally when L</fields_dates_auto> is set, the input is read completely to make sure auto-detection
4491             produces a reliable result. If C<fields_dates_auto_optimize> is set, this reading pass will stop as
4492             soon as there is no ambiguity left. That is, for every fields in input, the date format (or the fact
4493             that no date format is suitable) is known.
4494              
4495             Using this option is a bit risky because it could trigger a date format detection that later in the
4496             input, would turn out to be wrong. Should that be the case, strange errors will occur, that are not
4497             easy to understand. Use it at your own risk.
4498              
4499             Value by default: 0
4500              
4501             Example:
4502              
4503             my $csv = Text::AutoCSV->new(in_file => 'logins.csv', fields_dates_auto => 1,
4504             fields_dates_auto_optimize => 1);
4505              
4506             =item dates_formats_to_try
4507              
4508             Array ref of string formats.
4509              
4510             Text::AutoCSV has a default built-in list of 20 date formats to try and 6 time formats (also it'll
4511             combine any date format with any time format).
4512              
4513             C<dates_formats_to_try> will replace Text::AutoCSV default format-list will the one you specify, in
4514             case the default would not produce the results you expect.
4515              
4516             The formats are written in Strptime format.
4517              
4518             Value by default (see below about the role of the pseudo-format ''):
4519              
4520             [ '',
4521             '%Y-%m-%d',
4522             '%Y.%m.%d',
4523             '%Y/%m/%d',
4524             '%m.%d.%y',
4525             '%m-%d-%Y',
4526             '%m.%d.%Y',
4527             '%m/%d/%Y',
4528             '%d-%m-%Y',
4529             '%d.%m.%Y',
4530             '%d/%m/%Y',
4531             '%m-%d-%y',
4532             '%m/%d/%y',
4533             '%d-%m-%y',
4534             '%d.%m.%y',
4535             '%d/%m/%y',
4536             '%Y%m%d%H%M%S',
4537             '%b %d, %Y',
4538             '%b %d %Y',
4539             '%b %d %T %Z %Y',
4540             '%d %b %Y',
4541             '%d %b, %Y' ]
4542              
4543             B<IMPORTANT>
4544              
4545             The empty format (empty string) has a special meaning: when specified, Text::AutoCSV will be able to
4546             identify fields that contain only a time (not preceeded by a date).
4547              
4548             B<Note>
4549              
4550             Format identification is over only when there is no more ambiguity. So the usual pitfall of US
4551             versus French dates (month-day versus day-month) gets resolved only when a date is encountered that
4552             disambiguates it (a date of 13th of the month or later).
4553              
4554             Example with a weird format that uses underscores to separate elements, using either US (month, day,
4555             year), French (day, month, year), or international (year, month, day) order:
4556              
4557             my $csv = Text::AutoCSV->new(in_file => 'logins.csv',
4558             dates_formats_to_try => ['%d_%m_%Y', '%m_%d_%Y', '%Y_%m_%d']);
4559              
4560             =item dates_formats_to_try_supp
4561              
4562             Same as L</dates_formats_to_try> but instead of replacing the default list of formats used during
4563             detection, it is added to this default list.
4564              
4565             You want to use this attribute if you need a specific DateTime format while continuing to benefit
4566             from the default list.
4567              
4568             B<IMPORTANT>
4569              
4570             Text::AutoCSV will identify a given Datetime format only when there is no ambiguity, meaning, one
4571             unique Datetime format matches (all other failed). Adding a format that already exists in the
4572             default list will prevent the format from being identified, as it'll always be ambiguous. See
4573             L</dates_formats_to_try> for the default list of formats.
4574              
4575             Example:
4576              
4577             my $csv = Text::AutoCSV->new(in_file => 'logins.csv',
4578             dates_formats_to_try_supp => ['%d_%m_%Y', '%m_%d_%Y', '%Y_%m_%d']);
4579              
4580             =item dates_ignore_trailing_chars
4581              
4582             If set to 1, DateTime auto-detection will ignore trailing text that may follow detected
4583             DateTime-like text.
4584              
4585             Value by default: 1 (do ignore trailing chars)
4586              
4587             my $csv = Text::AutoCSV->new(in_file => 'logins.csv', dates_ignore_trailing_chars => 0);
4588              
4589             =item dates_search_time
4590              
4591             If set to 1, look for times when detecting DateTime format. That is, whenever a date format
4592             candidate is found, a longer candidate that also contains a time (after the date) is tested.
4593              
4594             Value by default: 1 (do look for times when auto-detecting DateTime formats)
4595              
4596             Example:
4597              
4598             my $csv = Text::AutoCSV->new(in_file => 'logins.csv', dates_search_time => 0);
4599              
4600             =item dates_locales
4601              
4602             Comma-separated string of locales to test when detecting DateTime formats. Ultimately, Text::AutoCSV
4603             will try all combinations of date formats, times and locales.
4604              
4605             Value by default: none (use perl default locale)
4606              
4607             Example:
4608              
4609             my $csv = Text::AutoCSV->new(in_file => 'logins.csv', dates_locales => 'fr,de,en');
4610              
4611             =item dates_zeros_ok
4612              
4613             Boolean. If true, a date made only of 0s is regarded as being empty.
4614              
4615             For example if C<dates_zeros_ok> is False, then a date like 0000-00-00 will be always incorrect (as
4616             the day and month are out of bounds), therefore a format like '%Y-%m-%d' will never match for the
4617             field.
4618              
4619             Conversely if C<dates_zeros_ok> is true, then a date like 0000-00-00 will be processed as if being
4620             the empty string, thus the detection of format will work and when parsed, this "full of zeros" dates
4621             will be processed the same way as the empty string (= value will be undef).
4622              
4623             B<IMPORTANT>
4624              
4625             "0s dates" are evaluated to undef when parsed, thus when converted back to text (out_map), they are
4626             set to an empty string, not to the original value.
4627              
4628             Value by default: 1
4629              
4630             Example:
4631              
4632             my $csv = Text::AutoCSV->new(in_file => 'in.csv', dates_zeros_ok => 0);
4633              
4634             =item out_dates_format
4635              
4636             Enforce the format of dates in output, for all fields that contain a DateTime value.
4637              
4638             The format is written in Strptime format.
4639              
4640             Value by default: none (by default, use format detected on input)
4641              
4642             Example:
4643              
4644             # Detect any field containing a DateTime value and convert it to yyyy-mm-dd whatever the
4645             # input format is.
4646             Text::AutoCSV->new(in_file => 'in.csv', out_file => 'out.csv', fields_dates_auto => 1,
4647             out_dates_format => '%F')->write();
4648              
4649             =item out_dates_locale
4650              
4651             Taken into account only if L</out_dates_format> is used.
4652              
4653             Sets the locale to apply on L</out_dates_format>.
4654              
4655             Value by default: none (by default, use the locale detected on input)
4656              
4657             Example:
4658              
4659             # Detect any field containing a DateTime value and convert it to a US DateTime whatever the
4660             # input format is.
4661             Text::AutoCSV->new(in_file => 'in.csv', out_file => 'out.csv', fields_dates_auto => 1,
4662             out_dates_format => '%b %d, %Y, %I:%M:%S %p', out_dates_locale => 'en')->write();
4663              
4664             =item croak_if_error
4665              
4666             If true, stops the program execution in case of error.
4667              
4668             B<IMPORTANT>
4669              
4670             Value by default: 1
4671              
4672             If set to zero (C<croak_if_error =E<gt> 0>), errors are displayed as warnings. This printing can
4673             then be affected by setting the L</quiet> attribute.
4674              
4675             =item verbose
4676              
4677             If true, get Text::AutoCSV to be a bit talkative instead of speaking only when warnings and errors
4678             occur. Verbose output is printed to STDERR by default, this can be tuned with the L</infoh>
4679             attribute.
4680              
4681             Value by default: 0
4682              
4683             Example:
4684              
4685             my $csv = Text::AutoCSV->new(in_file => 'in.csv', verbose => 1);
4686              
4687             =item infoh
4688              
4689             File handle to display program's verbose output. Has effect *mainly* with attribute
4690             C<verbose =E<gt> 1>.
4691              
4692             Note B<infoh> is used to display extra information in case of error (if a field does not exist,
4693             Text::AutoCSV will display the list of existing fields). If you don't want such output, you can set
4694             C<infoh> to undef.
4695              
4696             Value by default: \*STDERR
4697              
4698             Example:
4699              
4700             open my $infoh, ">", "log.txt";
4701             my $csv = Text::AutoCSV->new(in_file => 'in.csv', infoh => $infoh);
4702              
4703             =item quiet
4704              
4705             If true, don't display warnings and errors, unless croaking.
4706              
4707             If L</croak_if_error> attribute is set (as per default), still, Text::AutoCSV will produce output
4708             (on STDERR) when croaking miserably.
4709              
4710             When using C<croak_if_error =E<gt> 0>, errors are processed as warnings and if L</quiet> is set (in
4711             addition to L</croak_if_error> being set to 0), there'll be no output. Note this way of working is
4712             not recommended, as things can go wrong without any notice to the caller.
4713              
4714             Example:
4715              
4716             my $csv = Text::AutoCSV->new(in_file => 'in.csv', quiet => 1);
4717              
4718             =item one_pass
4719              
4720             If true, Text::AutoCSV will perform one reading of the input. If other readings are triggered, it'll
4721             raise an error and no reading will be done. Should that be the case (you ask Text::AutoCSV to do
4722             something that'll trigger more than one reading of input), Text::AutoCSV will croak as is always the
4723             case if an error occurs.
4724              
4725             Normally Text::AutoCSV will do multiple reads of input to work out certain characteristics of the
4726             CSV: guess of encoding and guess of escape character.
4727              
4728             Also if member functions like L</field_add_link>, L</field_add_copy>, L</field_add_computed>,
4729             L</read> or L</write> are called after input has already been read, it'll trigger further reads as
4730             needed.
4731              
4732             If one wishes a unique read of the input to occur, one_pass attribute is to be set.
4733              
4734             When true, encoding will be assumed to be the first one in the provided list (L</encoding>
4735             attribute), if no encoding attribute is provided, it'll be the first one in the default list, to
4736             date, it is UTF-8.
4737              
4738             When true, and if attribute L</escape_char> is not set, escape_char will be assumed to be '\\'
4739             (backslash).
4740              
4741             By default, one_pass is set if inh attribute is set (caller provides the input file handle of input)
4742             or if input file is stdin (in_file attribute not set or set to an empty string).
4743              
4744             Value by default:
4745              
4746             0 if inh attribute is not set and in_file attribute is set to a non empty string
4747             1 if inh attribute is set or in_file is not set or set to an empty string
4748              
4749             Example:
4750              
4751             my $csv = Text::AutoCSV->new(in_file => 'in.csv', one_pass => 1);
4752              
4753             =back
4754              
4755             =head2 read
4756              
4757             $csv->read();
4758              
4759             Read input entirely.
4760              
4761             B<Return value>
4762              
4763             Returns the object itself in case of success.
4764             Returns undef if error.
4765              
4766             Callback functions (when defined) are invoked, in the following order:
4767              
4768             L</read_post_update_hr>, intended to do updates on fields values after each record read
4769              
4770             L</walker_ar>, called after each record read, with an array ref of fields values
4771              
4772             L</walker_hr>, called after each record read, with a hash ref of fields values
4773              
4774             Example:
4775              
4776             # Do nothing - just check CSV can be read successfully
4777             Text::AutoCSV->new(in_file => 'in.csv')->read();
4778              
4779             =head2 read_all_in_mem
4780              
4781             $csv->read_all_in_mem();
4782              
4783             Created in version 1.1.5. Before, existed only as _read_all_in_mem, meaning, was private.
4784              
4785             Read input entirely, as with L</read> function, but enforcing content to be kept in-memory.
4786              
4787             Having the content kept in-memory is implied by search functions (L</vlookup> for example). With
4788             C<read_all_in_mem> you can enforce this behavior without doing a fake search.
4789              
4790             =head2 reset_next_record_hr
4791              
4792             $csv->reset_next_record_hr();
4793              
4794             Reset the internal status to start from the beginning with L</get_next_record_hr>. Used in
4795             conjunction with L</get_next_record_hr>.
4796              
4797             =head2 get_next_record_hr
4798              
4799             my $hr = $csv->get_next_record_hr(\$opt_key);
4800              
4801             Get the next record content as a hash ref. C<$hr> is undef when the end of records has been reached.
4802              
4803             When specified, C<$opt_key> is set to the current (returned) record key.
4804              
4805             B<NOTE>
4806              
4807             You do not need to call L</reset_next_record_hr> once before using C<get_next_record_hr>.
4808              
4809             Therefore L</reset_next_record_hr> is useful only if you wish to restart from the beginning before
4810             you've reached the end of the records.
4811              
4812             B<NOTE bis>
4813              
4814             L</walker_hr> allows to execute some code each time a record is read, and it better fits with
4815             Text::AutoCSV philosophy. Using a loop with C<get_next_record_hr> is primarily meant for
4816             Text::AutoCSV internal usage. Also when using this mechanism, you get very close to original
4817             Text::CSV logic, that makes Text::AutoCSV less useful.
4818              
4819             B<Return value>
4820              
4821             A hashref of the record, or undef once there's no more record to return.
4822              
4823             Example:
4824              
4825             while (my $hr = $csv->get_next_record_hr()) {
4826             say Dumper($hr);
4827             }
4828              
4829             =head2 write
4830              
4831             $csv->write();
4832              
4833             Write input into output.
4834              
4835             B<Return value>
4836              
4837             Returns the object itself in case of success.
4838             Returns undef if error.
4839              
4840             - If the content is not in-memory at the time write() is called:
4841              
4842             Each record is read (with call of L</read_post_update_hr>, L</walker_ar> and L</walker_hr>) and then
4843             written. The read-and-write is done in sequence, each record is written to output before the next
4844             record is read from input.
4845              
4846             - If the content is in-memory at the time write() is called:
4847              
4848             No L</read> operation is performed, instead, records are directly written to output.
4849              
4850             If defined, L</out_filter> is called for each record. If the return value of L</out_filter> is
4851             false, the record is not written.
4852              
4853             Example:
4854              
4855             # Copy input to output.
4856             # As CSV is parsed in-between, this copy also checks a number of characteristics about the
4857             # input, as opposed to a plain file copy operation.
4858             Text::AutoCSV->new(in_file => 'in.csv', out_file => 'out.csv')->write();
4859              
4860             =head2 out_header
4861              
4862             $csv->out_header($field, $header);
4863              
4864             Set the header text of C<$field> to C<$header>.
4865              
4866             By default, the input header value is rewritten as is to output. C<out_header> allows you to change
4867             it.
4868              
4869             B<Return value>
4870              
4871             Returns the object itself.
4872              
4873             Example:
4874              
4875             Text::AutoCSV->new(in_file => 'in.csv', out_file => 'out.csv')
4876             ->out_header('LOGIN', 'Login')
4877             ->out_header('FULLNAME', 'Full Name')
4878             ->write();
4879              
4880             =head2 print_id
4881              
4882             $csv->print_id();
4883              
4884             Print out a description of input. Write to \*STDERR by default or to L</infoh> attribute if set.
4885              
4886             The description consists in a list of a few characteristics (CSV separator and the like) followed by
4887             the list of columns with the details of each.
4888              
4889             Example of output:
4890              
4891             If you go to the C<utils> directory of this module and execute the following:
4892              
4893             ./csvcopy.pl -i f1.csv -l "1:,A->B,f2.csv" --id
4894              
4895             You will get this output:
4896              
4897             -- f1.csv:
4898             sep_char: ,
4899             escape_char: \
4900             in_encoding: UTF-8
4901             is_always_quoted: no
4902              
4903             # FIELD HEADER EXT DATA DATETIME FORMAT DATETIME LOCALE
4904             - ----- ------ -------- --------------- ---------------
4905             0 TIMESTAMP timestamp %Y%m%d%H%M%S
4906             1 A a
4907             2 B b
4908             3 C c
4909             4 D d %d/%m/%Y
4910             5 1:SITE 1:SITE link: f2.csv, chain: A->B->* (SITE)
4911             6 1:B 1:B link: f2.csv, chain: A->B->* (B)
4912              
4913             =head2 field_add_computed
4914              
4915             $csv->field_add_computed($new_field, $subref);
4916              
4917             C<$new_field> is the name of the created field.
4918              
4919             C<$subref> is a reference to a sub that'll calculate the new field value.
4920              
4921             B<Return value>
4922              
4923             Returns the object itself in case of success.
4924             Returns undef if error.
4925              
4926             Add a field calculated from other fields values. The subref runs like this:
4927              
4928             sub func {
4929             # $new_field is the name of the field (allows to use one subref for more than one field
4930             # calculation).
4931             # $hr is a hash ref of fields values.
4932             # $stats is a hash ref that gets printed (if Text::AutoCSV is created with verbose => 1)
4933             # in the end.
4934             my ($new_field, $hr, $stats) = @_;
4935              
4936             my $field_value;
4937             # ... compute $field_value
4938              
4939             return $field_value;
4940             }
4941              
4942             Example:
4943              
4944             # Add a field for the full name, made of the concatenation of the
4945             # first name and the last name.
4946             Text::AutoCSV->new(in_file => 'dirpeople.csv', out_file => 'dirwithfn.csv', verbose => 1)
4947             ->field_add_computed('FULLNAME', \&calc_fn)->write();
4948             sub calc_fn {
4949             my ($new_field, $hr, $stats) = @_;
4950             die "Man, you are in serious trouble!" unless $new_field eq 'FULLNAME';
4951             my $fn = $hr->{'FIRSTNAME'} . ' ' . uc($hr->{'LASTNAME'});
4952             $stats->{'empty full name'}++ if $fn eq ' ';
4953             return $fn;
4954             }
4955              
4956             =head2 field_add_copy
4957              
4958             $csv->field_add_copy($new_field, $src_field, $opt_subref);
4959              
4960             C<$new_field> if the name of the new field.
4961              
4962             C<$src_field> is the name of the field being copied.
4963              
4964             C<$opt_subref> is optional. It is a reference to a sub that takes one string (the value of
4965             C<$src_field>) and returns a string (the value assigned to C<$new_field>).
4966              
4967             B<Return value>
4968              
4969             Returns the object itself in case of success.
4970             Returns undef if error.
4971              
4972             C<field_add_copy> is a special case of L</field_add_computed>. The advantage of C<field_add_copy> is
4973             that it relies on a sub that is Text::AutoCSV "unaware", just taking one string as input and
4974             returning another string as output.
4975              
4976             B<IMPORTANT>
4977              
4978             The current field value is passed to C<field_add_copy> in $_.
4979              
4980             A call to
4981              
4982             $csv->field_add_copy($new_field, $src_field, $subref);
4983              
4984             is equivalent to
4985              
4986             $csv->field_add_computed($new_field, \&subref2);
4987             sub subref2 {
4988             my (undef, $hr) = @_;
4989             local $_ = $hr->{$src_field};
4990             return $subref->();
4991             }
4992              
4993             Example of a field copy + pass copied field in upper case and surround content with <<>>:
4994              
4995             my $csv = Text::AutoCSV->new(in_file => 'dirpeople.csv', out_file => 'd2.csv');
4996             $csv->field_add_copy('UCLAST', 'LASTNAME', \&myfunc);
4997             $csv->write();
4998             sub myfunc { s/^.*$/<<\U$&>>/; $_; }
4999              
5000             Note that the calls can be chained as most member functions return the object itself upon success.
5001             The example above is equivalent to:
5002              
5003             Text::AutoCSV->new(in_file => 'dirpeople.csv', out_file => 'd2.csv')
5004             ->field_add_copy('UCLAST', 'LASTNAME', \&myfunc)
5005             ->write();
5006             sub myfunc { s/^.*$/<<\U$&>>/; $_; }
5007              
5008             =head2 field_add_link
5009              
5010             $csv->field_add_link($new_field, $chain, $linked_file, \%opts);
5011              
5012             C<$new_field> is the name of the new field.
5013              
5014             C<$chain> is the CHAIN of the link, that is: 'LOCAL->REMOTE->PICK' where:
5015              
5016             C<LOCAL> is the field name to read the value from.
5017              
5018             C<REMOTE> is the linked field to find the value in. This field belongs to $linked_file.
5019              
5020             C<PICK> is the field from which to read the value of, in the record found by the search. This field
5021             belongs to $linked_file.
5022              
5023             If $new_field is undef, the new field name is the name of the third field of $chain (PICK).
5024              
5025             C<$linked_file> is the name of the linked file, that gets read in a Text::AutoCSV object created
5026             on-the-fly to do the search on. C<$linked_file> can also be a Text::AutoCSV object that you created
5027             yourself, allowing for more flexibility. Example:
5028              
5029             my $lcsv = Text::AutoCSV->new(in_file => 'logins.csv', case => 1);
5030             $csv->field_add_link('NAME', 'ID->LOGIN->DISPLAYNAME', $lcsv);
5031              
5032             C<\%opts> is a hash ref of optional attributes. The same values can be provided as with vlookup.
5033              
5034             =over 4
5035              
5036             =item trim
5037              
5038             If set to 1, searches will ignore leading and trailing spaces. That is, a C<LOCAL> value of ' x '
5039             will match with a C<REMOTE> value of 'x'.
5040              
5041             If option is not present, use L</search_value_if_not_found> attribute of object (default value: 1).
5042              
5043             Example:
5044              
5045             $csv->field_add_link('NAME', 'ID->LOGIN->DISPLAYNAME', 'logins.csv',
5046             { trim => 0 });
5047              
5048             =item case
5049              
5050             If set to 1, searches will take the case into account. That is, a C<LOCAL> value of 'X' will B<not>
5051             match with a C<REMOTE> value of 'x'.
5052              
5053             If option is not present, use L</search_case> attribute of object (default value: 0).
5054              
5055             Example:
5056              
5057             $csv->field_add_link('NAME', 'ID->LOGIN->DISPLAYNAME', 'logins.csv',
5058             { case => 1 });
5059              
5060             =item ignore_empty
5061              
5062             If set to 1, empty values won't match. That is, a C<LOCAL> value of '' will not match with a
5063             C<REMOTE> value of ''.
5064              
5065             If option is not present, use L</search_ignore_empty> attribute of object (default value: 1).
5066              
5067             Example:
5068              
5069             $csv->field_add_link('NAME', 'ID->LOGIN->DISPLAYNAME', 'logins.csv',
5070             { ignore_empty => 0 });
5071              
5072             =item value_if_not_found
5073              
5074             If the searched value is not found, the value of the field is undef, that produces an empty string
5075             at write time. Instead, you can specify the value.
5076              
5077             If option is not present, use L</search_value_if_not_found> attribute of object (default value:
5078             undef).
5079              
5080             Example:
5081              
5082             $csv->field_add_link('NAME', 'ID->LOGIN->DISPLAYNAME', 'logins.csv',
5083             { value_if_not_found => '<not found!>' });
5084              
5085             =item value_if_found
5086              
5087             If the searched value is found, you can specify the value to return.
5088              
5089             If option is not present, use L</search_value_if_found> attribute of object (default value: none).
5090              
5091             B<NOTE>
5092              
5093             Although the C<PICK> field is ignored when using this option, you must specify it any way.
5094              
5095             Example:
5096              
5097             $csv->field_add_link('NAME', 'ID->LOGIN->DISPLAYNAME', 'logins.csv',
5098             { value_if_not_found => '0', value_if_found => '1' });
5099              
5100             =item value_if_ambiguous
5101              
5102             If the searched value is found in more than one record, the value of the field is undef, that
5103             produces an empty string at write time. Instead, you can specify the value.
5104              
5105             If option is not present, use L</search_value_if_ambiguous> attribute of object (default value:
5106             undef).
5107              
5108             Example:
5109              
5110             $csv->field_add_link('NAME', 'ID->LOGIN->DISPLAYNAME', 'logins.csv',
5111             { value_if_ambiguous => '<ambiguous!>' });
5112              
5113             =item ignore_ambiguous
5114              
5115             Boolean value. If ignore_ambiguous is true and the searched value is found in more than one record,
5116             then, silently fall back on returning the value of the first record. Obviously if
5117             C<ignore_ambiguous> is true, then the value of L</value_if_ambiguous> is ignored.
5118              
5119             If option is not present, use L</search_ignore_ambiguous> attribute of object (default value: 1).
5120              
5121             Example:
5122              
5123             $csv->field_add_link('NAME', 'ID->LOGIN->DISPLAYNAME', 'logins.csv',
5124             { ignore_ambiguous => 1 });
5125              
5126             Example with multiple options:
5127              
5128             $csv->field_add_link('NAME', 'ID->LOGIN->DISPLAYNAME', 'logins.csv',
5129             { value_if_not_found => '?', ignore_ambiguous => 1 });
5130              
5131             =back
5132              
5133             B<Return value>
5134              
5135             Returns the object itself in case of success.
5136             Returns undef if error.
5137              
5138             Example of field_add_link usage:
5139              
5140             my $nom_compose = 0;
5141             my $zip_not_found = 0;
5142             Text::AutoCSV->new(in_file => 'pers.csv', walker_hr => \&walk)
5143             ->field_add_link('MYCITY', 'ZIP->ZIPCODE->CITY', 'zips.csv')->read();
5144             sub walk {
5145             my $hr = shift;
5146             $nom_compose++ if $hr->{'NAME'} =~ m/[- ]/;
5147             $zip_not_found++ unless defined($hr->{'MYCITY'});
5148             }
5149             print("Number of persons with a multi-part name: $nom_compose\n");
5150             print("Number of persons with unknown zipcode: $zip_not_found\n");
5151              
5152             =head2 links
5153              
5154             $csv->links($prefix, $chain, $linked_file, \%opts);
5155              
5156             C<$prefix> is the name to add to joined fields
5157              
5158             C<$chain> is the JOINCHAIN of the link, that is: 'LOCAL->REMOTE' where:
5159              
5160             C<LOCAL> is the field name to read the value from.
5161              
5162             C<REMOTE> is the linked field to find the value in. This field belongs to $linked_file.
5163              
5164             As opposed to L</field_add_link>, there is no C<PICK> part, as all fields of target are read.
5165              
5166             As opposed to Text::AutoCSV habits of croaking whenever a field name is duplicate, here, the
5167             duplicates are resolved by appending _2 to the joined field name if it already exists. If _2 already
5168             exists, too, then _3 is appended instead, and so on, until a non-duplicate is found. This mechanism
5169             is executed given the difficulty to control all field names when joining CSVs.
5170              
5171             C<$linked_file> and C<\%opts> work exactly the same way as for L</field_add_link>, see
5172             L</field_add_link> for help.
5173              
5174             B<Return value>
5175              
5176             Returns the object itself in case of success.
5177             Returns undef if error.
5178              
5179             B<NOTE>
5180              
5181             This function used to be called C<join> but got renamed to avoid clash with perl' builtin C<join>.
5182              
5183             Example:
5184              
5185             Text::AutoCSV->new(in_file => 'pers.csv', out_file => 'pers_with_city.csv')
5186             ->links('Read from zips.csv:', 'ZIP->ZIPCODE', 'zips.csv')->write();
5187              
5188             =head2 get_in_encoding
5189              
5190             my $enc = $csv->get_in_encoding();
5191              
5192             Return the string of input encoding, for example 'latin2' or 'UTF-8', etc.
5193              
5194             =head2 get_in_file_disp
5195              
5196             my $f = $csv->get_in_file_disp();
5197              
5198             Return the printable name of in_file.
5199              
5200             =head2 get_sep_char
5201              
5202             my $s = $csv->get_sep_char();
5203              
5204             Return the string of the input CSV separator character, for example ',' or ';'.
5205              
5206             =head2 get_escape_char
5207              
5208             my $e = $csv->get_escape_char();
5209              
5210             Return the string of the input escape character, for example '"' or '\\'.
5211              
5212             =head2 get_is_always_quoted
5213              
5214             my $a = $csv->get_is_always_quoted();
5215              
5216             Return 1 if all fields of input are always quoted, 0 otherwise.
5217              
5218             =head2 get_coldata
5219              
5220             my @cd = get_coldata();
5221              
5222             Return an array that describes each column, from the first one (column 0) to the last.
5223              
5224             Each element of the array is itself an array ref that contains 5 elements:
5225              
5226             0: Name of the field (as accessed in *_hr functions)
5227             1: Content of the field in the header line (if input has a header line)
5228             2: Column content type, shows some meta-data of fields created with field_add_* functions
5229             3: Datetime format detected, if ever, in the format Strptime
5230             4: Locale of DateTime format detected, if ever
5231             5: Multiline field: '1' if not, 'm' if newlines encountered in the field
5232              
5233             =head2 get_pass_count
5234              
5235             my $n = $csv->get_pass_count();
5236              
5237             Return the number of input readings done. Useful only if you're interested in Text::AutoCSV
5238             internals.
5239              
5240             =head2 get_in_mem_record_count
5241              
5242             my $m = $csv->get_in_mem_record_count();
5243              
5244             Return the number of records currently stored in-memory. Useful only if you're interested in
5245             Text::AutoCSV internals.
5246              
5247             =head2 get_max_in_mem_record_count
5248              
5249             my $mm = $csv->get_max_in_mem_record_count();
5250              
5251             Return the maximum number of records ever stored in-memory. Indeed this number can decrease: certain
5252             functions like field_add* member-functions discard in-memory content. Useful only if you're
5253             interested in Text::AutoCSV internals.
5254              
5255             =head2 get_fields_names
5256              
5257             my @f = $csv->get_fields_names();
5258              
5259             Return an array of the internal names of the columns.
5260              
5261             =head2 get_field_name
5262              
5263             my $name = $csv->get_field_name($n);
5264              
5265             Return the C<$n>-th column name, the first column being number 0.
5266              
5267             Example:
5268              
5269             # Get the field name of the third column
5270             my $col = $csv->get_field_name(2);
5271              
5272             =head2 get_stats
5273              
5274             my %stats = $csv->get_stats();
5275              
5276             Certain callback functions provide a parameter to record event count: L</field_add_computed>,
5277             L</read_post_update_hr>, L</walker_ar> and L</walker_hr>. By default, these stats are displayed if
5278             Text::AutoCSV got created with attribute C<verbose =E<gt> 1>. get_stats() returns the statistics
5279             hash of the object.
5280              
5281             B<IMPORTANT>
5282              
5283             As opposed to most functions that trigger input reading automatically (search functions and other
5284             get_* functions), C<get_stats> just returns you the stats as it is, regardless of whether some
5285             execution already occured.
5286              
5287             =head2 get_nb_rows
5288              
5289             my $nb_rows = $csv->get_nb_rows();
5290              
5291             Gives the number of rows of the input. Does not trigger any reading - just provides the number of
5292             rows as known at the moment of the call. If unknown, return undef. Typically, the number of rows is
5293             known after doing the initial detection of CSV options (escape character, etc.), or, after doing one
5294             complete reading.
5295              
5296             The header line counts for one row.
5297              
5298             B<IMPORTANT>
5299              
5300             As some fields can contain new lines, this number is not necessarily identical to the number of
5301             lines.
5302              
5303             =head2 set_walker_ar
5304              
5305             $csv->set_walker_ar($subref);
5306              
5307             Normally one wants to define it at object creation time using L</walker_ar> attribute.
5308             C<set_walker_ar> allows to assign the attribute walker_ar after object creation.
5309              
5310             See attribute L</walker_ar> for help about the way C<$subref> should work.
5311              
5312             B<Return value>
5313              
5314             Returns the object itself in case of success.
5315             Returns undef if error.
5316              
5317             Example:
5318              
5319             # Calculate the total of the two first columns, the first column being money in and the
5320             # second one being money out.
5321             my ($actif, $passif) = (0, 0);
5322             $csv->set_walker_ar(sub { my $ar = $_[0]; $actif += $ar->[0]; $passif += $ar->[1]; })->read();
5323             print("Actif = $actif\n");
5324             print("Passif = $passif\n");
5325              
5326             =head2 set_walker_hr
5327              
5328             $csv->set_walker_hr($subref);
5329              
5330             Normally one wants to define it at object creation time using L</walker_hr> attribute.
5331             C<set_walker_hr> allows to assign the attribute L</walker_hr> after object creation.
5332              
5333             See attribute L</walker_hr> for help about the way C<$subref> should work.
5334              
5335             B<Return value>
5336              
5337             Returns the object itself in case of success.
5338             Returns undef if error.
5339              
5340             Example:
5341              
5342             my $csv = Text::AutoCSV->new(in_file => 'directory.csv', verbose => 1);
5343              
5344             # ...
5345              
5346             $csv->set_walker_hr(
5347             sub {
5348             my ($hr, $stat) = @_;
5349             $stat{'not capital name'}++, return if $hr->{'NAME'} ne uc($hr->{'NAME'});
5350             $stat{'name is capital letters'}++;
5351             }
5352             )->read();
5353              
5354             =head2 set_out_file
5355              
5356             $csv->set_out_file($out_file);
5357              
5358             Normally one wants to define it at object creation time using L</out_file> attribute.
5359             C<set_out_file> allows to assign the attribute L</out_file> after object creation. It is set to
5360             C<$out_file> value.
5361              
5362             B<Return value>
5363              
5364             Returns the object itself in case of success.
5365             Returns undef if error.
5366              
5367             Example:
5368              
5369             $csv->set_out_file('mycopy.csv')->write();
5370              
5371             =head2 get_keys
5372              
5373             my @allkeys = $csv->get_keys();
5374              
5375             Returns an array of all the record keys of input. A record key is a unique identifier that
5376             designates the record.
5377              
5378             At the moment it is just an integer being the record number, the first one (that comes after the
5379             header line) being of number 0. For example if $csv input is made of one header line and 3 records
5380             (that is, a 4-line file typically, if no record contains a line break), $csv->get_keys() returns
5381              
5382             (0, 1, 2)
5383              
5384             B<IMPORTANT>
5385              
5386             If not yet done, this function causes the input to be read entirely and stored in-memory.
5387              
5388             =head2 get_hr_all
5389              
5390             my @allin = $csv->get_hr_all();
5391              
5392             Returns an array of all record contents of the input, each record being a hash ref.
5393              
5394             B<IMPORTANT>
5395              
5396             If not yet done, this function causes the input to be read entirely and stored in-memory.
5397              
5398             =head2 get_row_ar
5399              
5400             my $row_ar = $csv->get_row_ar($record_key);
5401              
5402             Returns an array ref of the record designated by C<$record_key>.
5403              
5404             Example:
5405              
5406             # Get content (as array ref) of last record
5407             my @allkeys = $csv->get_keys();
5408             my $lastk = $allkeys[-1];
5409             my $lastrec_ar = $csv->get_row_ar($lastk);
5410              
5411             B<IMPORTANT>
5412              
5413             If not yet done, this function causes the input to be read entirely and stored in-memory.
5414              
5415             =head2 get_row_hr
5416              
5417             my $row_hr = $csv->get_row_hr($record_key);
5418              
5419             Returns a hash ref of the record designated by C<$record_key>.
5420              
5421             Example:
5422              
5423             # Get content (as hash ref) of first record
5424             my @allkeys = $csv->get_keys();
5425             my $firstk = $allkeys[0];
5426             my $firstrec_hr = $csv->get_row_hr($firstk);
5427              
5428             B<IMPORTANT>
5429              
5430             If not yet done, this function causes the input to be read entirely and stored in-memory.
5431              
5432             =head2 get_cell
5433              
5434             my $val = $csv->get_cell($record_key, $field_name);
5435              
5436             Return the value of the cell designated by its record key (C<$record_key>) and field name
5437             (C<$field_name>).
5438              
5439             Example:
5440              
5441             my @allkeys = $csv->get_keys();
5442             my $midk = $allkeys[int($#allkeys / 2)];
5443             my $midname = $csv->get_cell($midk, 'NAME');
5444              
5445             Note the above example is equivalent to:
5446              
5447             my @allkeys = $csv->get_keys();
5448             my $midk = $allkeys[int($#allkeys / 2)];
5449             my $midrec_hr = $csv->get_row_hr($midk);
5450             my $midname = $midrec_hr->{'NAME'};
5451              
5452             B<IMPORTANT>
5453              
5454             If not yet done, this function causes the input to be read entirely and stored in-memory.
5455              
5456             =head2 get_values
5457              
5458             my @vals = $csv->get_values($field_name, $opt_filter_subref);
5459              
5460             Return an array made of the values of the given field name (C<$field_name>), for every records, in
5461             the order of the records.
5462              
5463             C<$opt_filter_subref> is an optional subref. If defined, it is called with every values in turn (one
5464             call per value) and only values for which C<$opt_filter_subref> returned True are included in the
5465             returned array. Call to C<$opt_filter_subref> is done with $_ to pass the value.
5466              
5467             Example:
5468              
5469             my @logins = $csv->get_values('LOGIN");
5470              
5471             This is equivalent to:
5472              
5473             my @allkeys = $csv->get_keys();
5474             my @logins;
5475             push @logins, $csv->get_cell($_, 'LOGIN') for (@allkeys);
5476              
5477             Example bis
5478              
5479             # @badlogins is the list of logins that contain non alphanumeric characters
5480             my @badlogins = Text::AutoCSV->new(in_file => 'logins.csv')
5481             ->get_values('LOGIN', sub { m/[^a-z0-9]/ });
5482              
5483             This is equivalent to:
5484              
5485             # @badlogins is the list of logins that contain non alphanumeric characters
5486             # This method leads to carrying all values of a given field across function calls...
5487             my @badlogins = grep { m/[^a-z0-9]/ } (
5488             Text::AutoCSV->new(in_file => 'logins.csv')->get_values('LOGIN')
5489             );
5490              
5491             B<IMPORTANT>
5492              
5493             If not yet done, this function causes the input to be read entirely and stored in-memory.
5494              
5495             =head2 get_recnum
5496              
5497             my $r = $csv->get_recnum();
5498              
5499             Returns the current record identifier, if a reading is in progress. If no read is in progress,
5500             return undef.
5501              
5502             =head2 in_map
5503              
5504             =head2 read_update_after
5505              
5506             C<read_update_after> is an alias of C<in_map>.
5507              
5508             $csv->in_map($field, $subref);
5509              
5510             After reading a record from input, update C<$field> by calling C<$subref>. The value is put in
5511             C<$_>. Then the field value is set to the return value of C<$subref>.
5512              
5513             This feature is originally meant to manage DateTime fields: the input and output CSVs carry text
5514             content, and in-between, the values dealt with are DateTime objects.
5515              
5516             See L</out_map> for an example.
5517              
5518             =head2 out_map
5519              
5520             =head2 write_update_before
5521              
5522             C<write_update_before> is an alias of C<out_map>.
5523              
5524             $csv->out_map($field, $subref);
5525              
5526             Before writing C<$field> field content into the output file, pass it through C<out_map>. The value
5527             is put in C<$_>. Then the return value of C<$subref> is written in the output.
5528              
5529             Example:
5530              
5531             Suppose you have a CSV file with the convention that a number surrounded by parenthesis is negative.
5532             You can register corresponding L</in_map> and L</out_map> functions. During the processing of data,
5533             the field content will be just a number (positive or negative), while in input and in output, it'll
5534             follow the "negative under parenthesis" convention.
5535              
5536             In the below example, we rely on convention above and add a new field converted from the original
5537             one, that follows the same convention.
5538              
5539             sub in_updt {
5540             return 0 if !defined($_) or $_ eq '';
5541             my $i;
5542             return -$i if ($i) = $_ =~ m/^\((.*)\)$/;
5543             $_;
5544             }
5545             sub out_updt {
5546             return '' unless defined($_);
5547             return '(' . (-$_) . ')' if $_ < 0;
5548             $_;
5549             }
5550             sub convert {
5551             return ;
5552             }
5553             Text::AutoCSV->new(in_file => 'trans-euros.csv', out_file => 'trans-devises.csv')
5554             ->in_map('EUROS', \&in_updt)
5555             ->out_map('EUROS', \&out_updt)
5556             ->out_map('DEVISE', \&out_updt)
5557             ->field_add_copy('DEVISE', 'EUROS', sub { sprintf("%.2f", $_ * 1.141593); } )
5558             ->write();
5559              
5560             =head2 search
5561              
5562             my $found_ar = $csv->search($field_name, $value, \%opts);
5563              
5564             Returns an array ref of all records keys where the field C<$field_name> has the value C<$value>.
5565              
5566             C<\%opts> is an optional hash ref of options for the search. See help of L</vlookup> about options.
5567              
5568             B<IMPORTANT>
5569              
5570             An unsuccessful search returns an empty array ref, that is, [ ]. Thus you B<cannot> check for
5571             definedness of C<search> return value to know whether or not the search found something.
5572              
5573             On the other hand, you can always examine the value C<search(...)-E<gt>[0]>, as search is always an
5574             array ref. If the search found nothing, then, C<search(...)-E<gt>[0]> is not defined.
5575              
5576             B<IMPORTANT bis>
5577              
5578             If not yet done, this function causes the input to be read entirely and stored in-memory.
5579              
5580             Example:
5581              
5582             my $linux_os_keys_ar = $csv->search('OS', 'linux');
5583              
5584             =head2 search_1hr
5585              
5586             my $found_hr = $csv->search_1hr($field_name, $value, \%opts);
5587              
5588             Returns a hash ref of the first record where the field C<$field_name> has the value C<$value>.
5589              
5590             C<\%opts> is an optional hash ref of options for the search. See help of L</vlookup> about options.
5591              
5592             Note the options L</value_if_not_found> and L</value_if_ambiguous> are ignored. If not found, return
5593             undef. If the result is ambiguous (more than one record found) and ignore_ambiguous is set to a
5594             false value, return undef.
5595              
5596             The other options are taken into account as for any search: L</ignore_ambiguous>, L</trim>,
5597             L</case>, L</ignore_empty>.
5598              
5599             B<IMPORTANT>
5600              
5601             As opposed to L</search>, an unsuccessful C<search_1hr> will return C<undef>.
5602              
5603             B<IMPORTANT bis>
5604              
5605             If not yet done, this function causes the input to be read entirely and stored in-memory.
5606              
5607             Example:
5608              
5609             my $hr = $csv->search_1hr('LOGIN', $login);
5610             my $full_name = $hr->{'FIRSTNAME'} . ' ' . $hr->{'LASTNAME'};
5611              
5612             =head2 vlookup
5613              
5614             my $val = $csv->vlookup($searched_field, $value, $target_field, \%opts);
5615              
5616             Find the first record where C<$searched_field> contains C<$value> and out of this record, returns
5617             the value of C<$target_field>.
5618              
5619             C<\%opts> is optional. It is a hash of options for C<vlookup>:
5620              
5621             =over 4
5622              
5623             =item trim
5624              
5625             If true, ignore spaces before and after the values to search.
5626              
5627             If option is not present, use L</search_trim> attribute of object (default value: 1).
5628              
5629             =item case
5630              
5631             If true, do case sensitive searches.
5632              
5633             If option is not present, use L</search_case> attribute of object (default value: 0).
5634              
5635             =item ignore_empty
5636              
5637             If true, ignore empty values in the search. The consequence is that you won't be able to find
5638             empty values by searching it.
5639              
5640             If option is not present, use L</search_ignore_empty> attribute of object (default value: 1).
5641              
5642             =item ignore_accents
5643              
5644             If true, ignore accents in searches. For exampe, if C<ignore_accents> is set, a string like
5645             "élémentaire" will match "elementaire".
5646              
5647             If option is not present, use L</search_ignore_accents> attribute of object (default value: 1).
5648              
5649             B<NOTE>
5650              
5651             This option uses the function L</remove_accents> to build its internal hash tables. See
5652             L</remove_accents> help for more details.
5653              
5654             =item value_if_not_found
5655              
5656             Return value if vlookup finds nothing.
5657              
5658             If option is not present, use L</search_value_if_not_found> attribute of object (default value:
5659             undef).
5660              
5661             =item value_if_found
5662              
5663             Return value if vlookup finds something.
5664              
5665             If option is not present, use L</search_value_if_found> attribute of object (default value: none).
5666              
5667             This option is to just check whether a value exists, regardless of the target value found.
5668              
5669             B<NOTE>
5670              
5671             Although the B<$target_field> is ignored when using this option, you must specify it any way.
5672              
5673             =item value_if_ambiguous
5674              
5675             Return value if vlookup find more than one result. Tune it only if ignore_ambiguous is unset.
5676              
5677             If option is not present, use L</search_value_if_ambiguous> attribute of object (default value:
5678             undef).
5679              
5680             =item ignore_ambiguous
5681              
5682             If true, then if more than one result is found, silently return the first one.
5683              
5684             If option is not present, use L</search_ignore_ambiguous> attribute of object (default value: 1).
5685              
5686             =back
5687              
5688             B<IMPORTANT>
5689              
5690             If not yet done, this function causes the input to be read entirely and stored in-memory.
5691              
5692             Example:
5693              
5694             my $name = $csv->vlookup('LOGIN', $id, 'NAME', { value_if_not_found => '<login not found>' });
5695              
5696             =head2 remove_accents
5697              
5698             my $t = $csv->remove_accents($s);
5699              
5700             Take the string C<$s> as argument and return the string without accents. Uses a Unicode
5701             decomposition followed by removal of every characters that have the Unicode property
5702             C<Nonspacing_Mark>.
5703              
5704             B<NOTE>
5705              
5706             Only accents are removed. It is not a C<whatever-encoding -E<gt> us-ascii> conversion. For example,
5707             the French B<Å“> character (o followed by e) or the German B<ß> (eszett) are kept as is.
5708              
5709             B<NOTE bis>
5710              
5711             Tested with some latin1 and latin2 characters.
5712              
5713             B<NOTE ter>
5714              
5715             There is no language-level transformation during accents removal. For example B<Jürgen> is returned
5716             as B<Jurgen>, not B<Juergen>.
5717              
5718             This function is not exported by default.
5719              
5720             Example:
5721              
5722             use Text::AutoCSV qw(remove_accents);
5723             my $s = remove_accents("Français: être élémentaire, Tchèque: služba dům");
5724             die "This script will never die" if $s ne 'Francais: etre elementaire, Tcheque: sluzba dum';
5725              
5726             =head1 AUTHOR
5727              
5728             Sébastien Millet <milletseb@laposte.net>
5729              
5730             =head1 COPYRIGHT AND LICENSE
5731              
5732             This software is copyright (c) 2016, 2017 by Sébastien Millet.
5733              
5734             This is free software; you can redistribute it and/or modify it under
5735             the same terms as the Perl 5 programming language system itself.
5736              
5737             =cut