File Coverage

blib/lib/Data/TableReader.pm
Criterion Covered Total %
statement 402 532 75.5
branch 178 322 55.2
condition 61 134 45.5
subroutine 46 56 82.1
pod 5 7 71.4
total 692 1051 65.8


line stmt bran cond sub pod time code
1             package Data::TableReader;
2 7     7   1983467 use Moo 2;
  7         44797  
  7         41  
3 7     7   9984 use Try::Tiny;
  7         14  
  7         495  
4 7     7   57 use Carp;
  7         13  
  7         444  
5 7     7   38 use Scalar::Util qw( blessed refaddr );
  7         17  
  7         397  
6 7     7   45 use List::Util 'max';
  7         14  
  7         476  
7 7     7   3536 use Module::Runtime 'require_module';
  7         13627  
  7         40  
8 7     7   4057 use Data::TableReader::Field;
  7         50  
  7         311  
9 7     7   3744 use Data::TableReader::Iterator;
  7         37  
  7         243  
10 7     7   41 use namespace::clean;
  7         14  
  7         51  
11              
12             # ABSTRACT: Extract records from "dirty" tabular data sources
13             our $VERSION = '0.021'; # VERSION
14              
15              
16             has input => ( is => 'rw', required => 1 );
17             has _file_handle => ( is => 'lazy' );
18             has _decoder_arg => ( is => 'rw', init_arg => 'decoder' );
19             has decoder => ( is => 'lazy', init_arg => undef );
20             has fields => ( is => 'rw', required => 1, coerce => \&_coerce_field_list );
21 44     44 1 69 sub field_list { @{ shift->fields } }
  44         898  
22             has field_by_name => ( is => 'lazy' );
23             has field_by_addr => ( is => 'lazy' );
24             has record_class => ( is => 'rw', required => 1, default => sub { 'HASH' } );
25             has static_field_order => ( is => 'rw' ); # force order of columns
26             has header_row_at => ( is => 'rw', default => sub { [1,10] } ); # row of header, or range to scan
27             has header_row_combine => ( is => 'rw', lazy => 1, builder => 1 );
28             has table_search_results=> ( is => 'rw', lazy => 1, builder => 1, clearer => 1, predicate => 1 );
29             has col_map => ( is => 'rw', lazy => 1, builder => 1, predicate => 1 );
30             has on_partial_match => ( is => 'rw', default => sub { 'next' } );
31             has on_ambiguous_columns=> ( is => 'rw', default => sub { 'error' } );
32             has on_unknown_columns => ( is => 'rw', default => sub { 'warn' } );
33             has on_blank_row => ( is => 'rw', default => sub { 'next' } );
34             has on_validation_error => ( is => 'rw', default => sub { 'die' } );
35             has log => ( is => 'rw', trigger => sub { shift->_clear_log } );
36              
37             sub BUILD {
38 23     23 0 152 my ($self, $args)= @_;
39             # If user supplied col_map, it probably contains names instead of Field objects.
40 23 100       98 if ($self->has_col_map) {
41             # Make a new array in case other parts of user code refer to current one
42 4         9 $self->col_map($self->_resolve_colmap_names([ @{ $self->col_map } ]));
  4         86  
43             }
44             # Back-compat for previous API
45 23 100       203 if (defined (my $act= $args->{on_validation_fail})) {
46             croak "on_validation_fail (back-compat alias) conflicts with on_validation_error"
47 3 50       5 if defined $args->{on_validation_error};
48 3         7 $self->on_validation_fail($act);
49             }
50             }
51              
52             sub on_validation_fail {
53 3     3 0 4 my $self= shift;
54 3 50       5 if (@_) {
55 3         6 my $act= shift;
56             #warn "on_validation_fail is deprecated (see on_validation_error)";
57             # adapt for the old API
58 3 100       7 $act= _wrap_on_validation_fail($act) if ref $act eq 'CODE';
59 3         19 return $self->on_validation_error($act);
60             }
61 0         0 return $self->on_validation_error;
62             }
63              
64             # Modifies array to replace name with field ref
65             sub _resolve_colmap_names {
66 4     4   47 my ($self, $col_map)= @_;
67 4   66     39 for (grep defined && !ref, @$col_map) {
68 7 50       142 defined(my $f= $self->field_by_name->{$_})
69             or croak("col_map specifies non-existent field '$_'");
70 7         123 $_= $f;
71             }
72 4         86 $col_map;
73             }
74              
75             # Open 'input' if it isn't already a file handle
76             sub _build__file_handle {
77 23     23   232 my $self= shift;
78 23         77 my $i= $self->input;
79 23 50 66     843 return undef if ref($i) && (
      66        
80             (blessed($i) && ($i->can('get_cell') || $i->can('worksheets')))
81             or ref($i) eq 'ARRAY'
82             );
83 0 0 0     0 return $i if ref($i) && (ref($i) eq 'GLOB' or ref($i)->can('read'));
      0        
84 0 0       0 open(my $fh, '<', $i) or croak "open($i): $!";
85 0         0 binmode $fh;
86 0         0 return $fh;
87             }
88              
89             # Create ::Decoder instance either from user-supplied args, or by detecting input format
90             sub _build_decoder {
91 23     23   254 my $self= shift;
92 23         78 my $decoder_arg= $self->_decoder_arg;
93 23         56 my $decoder_ref= ref $decoder_arg;
94 23         51 my ($class, @args);
95 23 50 0     77 if (!$decoder_arg) {
    0          
    0          
    0          
96 23         87 ($class, @args)= $self->detect_input_format;
97 23         609 $self->_log->('trace', "Detected input format as %s", $class);
98             }
99             elsif (!$decoder_ref) {
100 0         0 $class= $decoder_arg;
101             }
102             elsif ($decoder_ref eq "HASH" or $decoder_ref eq "ARRAY") {
103 0 0       0 ($class, @args)= $decoder_ref eq "ARRAY"? @$decoder_arg : do {
104 0         0 my %tmp= %$decoder_arg;
105 0         0 (delete($tmp{CLASS}), %tmp);
106             };
107 0 0       0 if(!$class) {
108 0         0 my ($input_class, @input_args)= $self->detect_input_format;
109 0 0       0 croak "decoder class not in arguments and unable to identify decoder class from input"
110             if !$input_class;
111 0         0 ($class, @args)= ($input_class, @input_args, @args);
112             }
113             }
114             elsif ($decoder_ref->can('iterator')) {
115 0         0 return $decoder_arg;
116             }
117             else {
118 0         0 croak "Can't create decoder from $decoder_ref";
119             }
120 23 50       180 $class= "Data::TableReader::Decoder::$class"
121             unless $class =~ /::/;
122 23 50       177 require_module($class) or croak "$class does not exist or is not installed";
123 23         1244 $self->_log->('trace', 'Creating decoder %s on input %s', $class, $self->input);
124 23 50 50     593 return $class->new(
125             file_name => ($self->input eq ($self->_file_handle||"") ? '' : $self->input),
126             file_handle => $self->_file_handle,
127             _log => $self->_log,
128             @args
129             );
130             }
131              
132             # User supplies any old perl data, but this field should always be an arrayref of ::Field
133             sub _coerce_field_list {
134 23     23   489657 my ($list)= @_;
135 23 50 33     200 defined $list and ref $list eq 'ARRAY' or croak "'fields' must be a non-empty arrayref";
136 23         79 my @list= @$list; # clone it, to make sure we don't unexpectedly alter the caller's data
137 23         55 for (@list) {
138 68 100       245 if (!ref $_) {
    100          
139 13         311 $_= Data::TableReader::Field->new({ name => $_ });
140             } elsif (ref $_ eq 'HASH') {
141 54         229 my %args= %$_;
142             # "isa" alias for the 'type' attribute
143 54 50 33     178 $args{type}= delete $args{isa} if defined $args{isa} && !defined $args{type};
144 54         1087 $_= Data::TableReader::Field->new(\%args)
145             } else {
146 1 50 33     31 croak "Can't coerce '$_' to a Field object"
147             unless blessed($_) && $_->isa('Data::TableReader::Field');
148             }
149             }
150 23         444 return \@list;
151             }
152              
153             sub _build_field_by_name {
154 4     4   42 my $self= shift;
155             # reverse list so first field of a name takes precedence
156 4         8 return { map +( $_->name => $_ ), reverse @{ $self->fields } }
  4         82  
157             }
158              
159             sub _build_field_by_addr {
160 0     0   0 my $self= shift;
161 0         0 return { map +( refaddr $_ => $_ ), @{ $self->fields } }
  0         0  
162             }
163              
164             sub _build_header_row_combine {
165 22     22   807 my $self= shift;
166             # If headers contain "\n", we need to collect multiple cells per column
167             # Find the maximum number of \n contained in any regex.
168 22         61 max map { 1+(()= ($_->header_regex =~ /\\n|\n/g)) } $self->field_list;
  68         1971  
169             }
170              
171             # 'log' can be a variety of things, but '_log' will always be a coderef
172             has _log => ( is => 'lazy', clearer => 1 );
173             sub _build__log {
174 23     23   623 _log_fn(shift->log);
175             }
176             sub _log_fn {
177 23     23   151 my $dest= shift;
178             !$dest? sub {
179 2     2   9 my ($level, $msg, @args)= @_;
180 2 50 33     9 return unless $level eq 'warn' or $level eq 'error';
181 0 0       0 $msg= sprintf($msg, @args) if @args;
182 0         0 warn $msg."\n";
183             }
184             : ref $dest eq 'ARRAY'? sub {
185 263     263   1953 my ($level, $msg, @args)= @_;
186 263 100 100     3203 return unless $level eq 'warn' or $level eq 'error';
187 34 100       85 $msg= sprintf($msg, @args) if @args;
188 34         253 push @$dest, [ $level, $msg ];
189             }
190             : ref($dest)->can('info')? sub {
191 34     34   612 my ($level, $msg, @args)= @_;
192 34 50       193 $dest->$level( @args? sprintf($msg, @args) : $msg )
    100          
193             if $dest->can('is_'.$level)->($dest);
194             }
195 23 50       376 : croak "Don't know how to log to $dest";
    100          
    100          
196             }
197              
198              
199             sub detect_input_format {
200 23     23 1 64 my ($self, $filename, $magic)= @_;
201              
202 23         111 my $input= $self->input;
203             # As convenience to spreadsheet users, let input be a parsed workbook/worksheet object.
204 23 100 66     384 return ('XLSX', sheet => $input)
205             if ref($input) && ref($input)->can('get_cell');
206 22 50 33     198 return ('XLSX', workbook => $input)
207             if ref($input) && ref($input)->can('worksheets');
208             # Convenience for passing already-parsed data
209 22 50       76 if (ref($input) eq 'ARRAY') {
210             # if user supplied single table of data, wrap it in an array to make an array of tables.
211             $input= [ $input ]
212             if @$input && ref($input->[0]) eq 'ARRAY'
213 22 100 33     222 && @{$input->[0]} && ref($input->[0][0]) ne 'ARRAY';
  22   33     154  
      66        
214 22         129 return ('Mock', datasets => $input);
215             }
216              
217             # Load first block of file, unless supplied
218 0         0 my $fpos;
219 0 0       0 if (!defined $magic) {
220 0         0 my $fh= $self->_file_handle;
221             # Need to be able to seek.
222 0 0       0 if (seek($fh, 0, 1)) {
    0          
223 0         0 $fpos= tell $fh;
224 0         0 read($fh, $magic, 4096);
225 0 0       0 seek($fh, $fpos, 0) or croak "seek: $!";
226             }
227             elsif ($fh->can('ungets')) {
228 0         0 $fpos= 0; # to indicate that we did try reading the file
229 0         0 read($fh, $magic, 4096);
230 0         0 $fh->ungets($magic);
231             }
232             else {
233 0         0 $self->_log->('notice',"Can't fully detect input format because handle is not seekable."
234             ." Consider fully buffering the file, or using FileHandle::Unget");
235 0         0 $magic= '';
236             }
237             }
238              
239             # Excel is obvious so check it first. This handles cases where an excel file is
240             # erroneously named ".csv" and sillyness like that.
241 0 0       0 return ( 'XLSX' ) if $magic =~ /^PK(\x03\x04|\x05\x06|\x07\x08)/;
242 0 0       0 return ( 'XLS' ) if $magic =~ /^\xD0\xCF\x11\xE0/;
243              
244             # Else trust the file extension, because TSV with commas can be very similar to CSV with
245             # tabs in the data, and some crazy person might store an HTML document as the first element
246             # of a CSV file.
247             # Detect filename if not supplied
248 0 0       0 if (!defined $filename) {
249 0         0 $filename= '';
250 0 0 0     0 $filename= "$input" if defined $input and (!ref $input || ref($input) =~ /path|file/i);
      0        
251             }
252 0 0       0 if ($filename =~ /\.([^.]+)$/) {
253 0         0 my $suffix= uc($1);
254 0 0       0 return 'HTML' if $suffix eq 'HTM';
255 0         0 return $suffix;
256             }
257              
258             # Else probe some more...
259 0         0 $self->_log->('debug',"Probing file format because no filename suffix");
260 0 0       0 length $magic or croak "Can't probe format. No filename suffix, and "
    0          
261             .(!defined $fpos? "unseekable file handle" : "no content");
262              
263             # HTML is pretty obvious
264 0 0       0 return 'HTML' if $magic =~ /^(\xEF\xBB\xBF|\xFF\xFE|\xFE\xFF)?<(!DOCTYPE )HTML/i;
265             # Else guess between CSV and TSV
266 0         0 my ($probably_csv, $probably_tsv)= (0,0);
267 0 0       0 ++$probably_csv if $magic =~ /^(\xEF\xBB\xBF|\xFF\xFE|\xFE\xFF)?["']?[\w ]+["']?,/;
268 0 0       0 ++$probably_tsv if $magic =~ /^(\xEF\xBB\xBF|\xFF\xFE|\xFE\xFF)?["']?[\w ]+["']?\t/;
269 0         0 my $comma_count= () = ($magic =~ /,/g);
270 0         0 my $tab_count= () = ($magic =~ /\t/g);
271 0         0 my $eol_count= () = ($magic =~ /\n/g);
272 0 0 0     0 ++$probably_csv if $comma_count > $eol_count and $comma_count > $tab_count;
273 0 0 0     0 ++$probably_tsv if $tab_count > $eol_count and $tab_count > $comma_count;
274 0         0 $self->_log->('debug', 'probe results: comma_count=%d tab_count=%d eol_count=%d probably_csv=%d probably_tsv=%d',
275             $comma_count, $tab_count, $eol_count, $probably_csv, $probably_tsv);
276 0 0 0     0 return 'CSV' if $probably_csv and $probably_csv > $probably_tsv;
277 0 0 0     0 return 'TSV' if $probably_tsv and $probably_tsv > $probably_csv;
278 0         0 croak "Can't determine file format";
279             }
280              
281              
282             sub _build_table_search_results {
283 6     6   49 my $self= shift;
284 6         136 my $result= $self->_find_table($self->decoder->iterator);
285             # When called during lazy-build, not finding the table is fatal
286 6 50       16 if (!$result->{found}) {
287 0   0     0 my $err= $$result->{fatal} || "Can't locate valid header";
288 0         0 $self->_log->('error', $err);
289 0         0 croak $err;
290             }
291 6         25 $result;
292             }
293              
294             sub _build_col_map {
295             shift->table_search_results->{found}{col_map}
296 1     1   544 }
297              
298             sub find_table {
299 16     16 1 9329 my $self= shift;
300 16         588 my $result= $self->_find_table($self->decoder->iterator);
301 16         376 $self->table_search_results($result);
302 16         238 return defined $result->{found};
303             }
304              
305 11     11 1 9924 sub field_map { _field_map(shift->col_map) }
306              
307             sub _field_map {
308 35     35   142 my $col_map= shift;
309 35         60 my %fmap;
310 35         127 for my $i (0 .. $#$col_map) {
311 165 100       354 next unless defined $col_map->[$i];
312 140 100       375 if ($col_map->[$i]->array) {
313 46         70 push @{ $fmap{$col_map->[$i]->name} }, $i;
  46         136  
314             } else {
315 94         284 $fmap{$col_map->[$i]->name}= $i;
316             }
317             }
318 35         156 return \%fmap;
319             }
320              
321             sub _find_table {
322 22     22   84 my ($self, $data_iter)= @_;
323             # $stash ||= {};
324             # while (1) {
325             # $success= $self->_find_table_in_dataset($data_iter, $stash);
326             # && !defined $stash->{fatal}
327             # && $data_iter->next_dataset
328             # ) {}
329             # if ($success) {
330             # # And record the stream position of the start of the table
331             # $self->col_map($stash->{col_map});
332             # $stash->{first_record_pos}= $data_iter->tell;
333             # $stash->{data_iter}= $data_iter;
334             # return $stash;
335             # }
336             # else {
337             # my $err= $stash->{fatal} || "Can't locate valid header";
338             # $self->_log->('error', $err);
339             # croak $err if $stash->{croak_on_fail};
340             # return undef;
341             # }
342 22         74 my @fields= $self->field_list;
343 22         208 my $header_at= $self->header_row_at;
344 22         77 my %result;
345              
346             # Special case for the file not having any headers in it.
347             # If header_row_at is undef, then there is no header.
348             # Ensure static_field_order, then set up columns.
349 22 50       85 if (!defined $header_at) {
350 0 0       0 unless ($self->static_field_order) {
351 0         0 $result{fatal}= "You must enable 'static_field_order' if there is no header row";
352 0         0 return;
353             }
354 0 0       0 my $col_map= [ $self->has_col_map? @{$self->col_map} : @fields ];
  0         0  
355             $result{found}= {
356 0         0 row_idx => -1,
357             row => undef,
358             dataset_idx => 0,
359             col_map => $col_map,
360             messages => [],
361             first_record_pos => $data_iter->tell,
362             _data_iter => $data_iter,
363             };
364 0         0 $result{candidates}= [ $result{found} ];
365 0         0 return \%result;
366             }
367              
368 22         38 my $dataset_idx= 0;
369 22         45 dataset: do {
370             # If headers contain "\n", we need to collect multiple cells per column
371 23         468 my $row_accum= $self->header_row_combine;
372            
373 23 100       139 my ($start, $end)= ref $header_at? @$header_at : ( $header_at, $header_at );
374 23         44 my @rows;
375            
376             # If header_row_at doesn't start at 1, seek forward
377 23 100       83 if ($start > 1) {
378 1         25 $self->_log->('trace', 'Skipping to row %s', $start);
379 1         9 push @rows, $data_iter->() for 1..$start-1;
380             }
381            
382             # Scan through the rows of the dataset up to the end of header_row_at, accumulating rows so that
383             # multi-line regexes can match.
384 23         77 for my $row ($start .. $end) {
385 35         213 my %attempt= (
386             row_idx => $row-1,
387             row => $row,
388             dataset_idx => $dataset_idx,
389             messages => []
390             );
391 35         160 my $vals= $data_iter->();
392 35 100       105 if (!$vals) { # if undef, we reached end of dataset
393 2         50 $self->_log->('trace', 'EOF');
394 2         10 last;
395             }
396 33 100       89 if ($row_accum > 1) {
397 5         11 push @rows, $vals;
398 5         17 shift @rows while @rows > $row_accum;
399 5         14 $vals= [ map { my $c= $_; join("\n", map $_->[$c], @rows) } 0 .. $#{$rows[-1]} ];
  28         39  
  28         109  
  5         14  
400 5         23 $attempt{context}= $row_accum.' rows ending at '.$data_iter->position;
401             } else {
402 28         105 $attempt{context}= $data_iter->position;
403             }
404 33         851 $self->_log->('trace', 'Checking for headers on %s', $attempt{context});
405             # Now fill-in the col_map
406 33 50       211 my $found= $self->static_field_order?
407             # If static field order, look for headers in sequence
408             $self->_match_headers_static($vals, \%attempt)
409             # else search for each header
410             : $self->_match_headers_dynamic($vals, \%attempt);
411 33         143 $attempt{first_record_pos}= $data_iter->tell;
412 33         85 $self->_log->(@$_) for @{$attempt{messages}};
  33         932  
413 33         69 push @{$result{candidates}}, \%attempt;
  33         102  
414 33 100       122 if ($found) {
415 21         54 $result{found}= \%attempt;
416 21         54 $result{found}{_data_iter}= $data_iter;
417 21         439 $self->col_map($attempt{col_map});
418 21         500 $self->_log->(info => 'Found header at '.$attempt{context});
419 21         1829 return \%result;
420             } else {
421             # Back-compat: if attempt ends with 'fatal' message, stop looking for header
422             last dataset
423 12 50       48 if delete $attempt{fatal};
424             # Was this a partial match? See if any col_map entries were added vs. what user already gave us.
425             my $initial_colmap_count= !$self->has_col_map? 0
426 12 100       48 : scalar(grep defined, @{$self->col_map});
  3         73  
427 12 100       63 if ($initial_colmap_count < scalar(grep defined, @{$attempt{col_map}})) {
  12         45  
428             # Handling of partial match determined by on_partial_match setting
429 2         6 my $act= $self->on_partial_match;
430 2 50       8 $act= $act->($self, \%attempt) if ref $act eq 'CODE';
431             last dataset
432 2 50       7 if $act eq 'last';
433             }
434             }
435 12         276 $self->_log->('debug', '%s: No match', $attempt{context});
436             }
437 2         43 $self->_log->('error','No row in dataset matched full header requirements');
438 2         13 ++$dataset_idx;
439             } while ($data_iter->next_dataset);
440 1         4 return \%result;
441             }
442              
443             # This mode assumes all headers match exactly as perscribed in the fields list or user-supplied col_map
444             sub _match_headers_static {
445 0     0   0 my ($self, $header, $attempt)= @_;
446 0 0       0 my @col_map= $self->has_col_map? @{$self->col_map} : @{$self->fields};
  0         0  
  0         0  
447 0         0 $attempt->{col_map}= \@col_map;
448 0         0 for my $i (0 .. $#col_map) {
449 0 0       0 next unless defined $col_map[$i];
450 0 0       0 next if $header->[$i] =~ $col_map[$i]->header_regex;
451             # Field header doesn't match. Start over on next row.
452 0         0 push @{$attempt->{messages}}, [ error => "Header at column $i does not look like field ".$col_map[$i]->name ];
  0         0  
453 0         0 return 0;
454             }
455             # found a match for every field!
456 0         0 $self->_log->('debug','%s: Found!', $attempt->{context});
457 0         0 return 1;
458             }
459              
460             sub _match_headers_dynamic {
461 33     33   73 my ($self, $header, $attempt)= @_;
462 33         130 my $context= $attempt->{context};
463 33         709 my $fields= $self->fields;
464             # Colmap starts empty unless user supplied one
465 33 100       362 my $user_colmap= $self->has_col_map? $self->col_map : [];
466 33 100       194 my @colmap= map +(defined $_? [ $_ ] : undef), @$user_colmap;
467 33         89 $attempt->{col_map}= \@colmap;
468             # Search every cell of the header, except ones specified by the user
469 33         172 my @col_search_idx= grep !defined $user_colmap->[$_], 0 .. $#$header;
470             # Divide remaining fields (not specified by user) into list that can only occur following
471             # another field, or the ones that can occur anywhere.
472 33         110 my %seen= map +(refaddr $_ => 1 ), grep defined, @$user_colmap;
473 33         63 my (@follows_fields, @free_fields);
474 97 100       252 defined $seen{refaddr $_} or push @{($_->follows_list? \@follows_fields : \@free_fields)}, $_
475 33   66     159 for @$fields;
476 33         76 undef %seen;
477             # Sort required fields to front, to fail faster on non-matching rows
478             # But otherwise preserve field order in case it matters for priority of matching
479 33         203 @free_fields= ( (grep $_->required, @free_fields), (grep !$_->required, @free_fields) );
480              
481             # For each freely-located field (free = lacking placement requirements) scan every un-used
482             # column for a match. Record all matches, for later analysis of ambiguity.
483             # But, stop as soon as a required column is missing; it helps speed up the search for the
484             # header.
485 33 50       126 my $ambiguous_log_level= $self->on_ambiguous_columns eq 'error'? 'error' : 'warn';
486 33         62 my %fieldname_cols;
487 33         64 for my $f (@free_fields) {
488 74         1929 my $hr= $f->header_regex;
489 74         516 push @{$attempt->{messages}}, [ trace => "looking for $hr" ];
  74         345  
490 74         920 my @found_idx= grep $header->[$_] =~ $hr, @col_search_idx;
491 74         119 push @{$attempt->{messages}}, [ debug => "found ".$f->name." header at col [".join(',', map $_+1, @found_idx).']' ];
  74         411  
492 74         167 for my $idx (@found_idx) {
493             # If another field of the same name matches a column, the first gets priority.
494             # ignore the duplicate.
495 68 50       239 if ($fieldname_cols{$f->name}{$idx}) {
496 0         0 push @{$attempt->{messages}}, [ debug => "Ignored; column $idx is already claimed by a field named ".$f->name ];
  0         0  
497             } else {
498 68         97 push @{$colmap[$idx]}, $f;
  68         155  
499             #$col_fieldnames{$idx}{$f->name}= 1;
500 68         246 $fieldname_cols{$f->name}{$idx}= $f;
501             }
502             }
503             # Flag missing required fields
504 74 100 100     377 if (!@found_idx && $f->required) {
505 9         15 push @{$attempt->{missing_required}}, $f;
  9         38  
506 9         15 push @{$attempt->{messages}}, [ error => 'No match for required field '.$f->name ];
  9         34  
507             # Missing required fields probably means this isn't he header row, or the input is
508             # garbage, so might as well stop here before genering a bunch of analysis.
509 9         27 last;
510             }
511             }
512             # Now, check for any of the 'follows' fields, some of which might also be 'required'.
513 33 100 100     105 if (@follows_fields && !$attempt->{missing_required}) {
514 2         5 my %following;
515             my %found;
516 2         9 for my $idx (0 .. $#$header) {
517 17 100 66     44 if ($colmap[$idx] && 1 == @{$colmap[$idx]}) {
  5         18  
518 5         23 %following= ( $colmap[$idx][0]->name => $colmap[$idx][0] );
519             } else {
520 12         25 my $val= $header->[$idx];
521 12         25 for my $f (@follows_fields) {
522 18 100       51 next unless grep $following{$_}, $f->follows_list;
523 9 100       219 next unless $val =~ $f->header_regex;
524             # If another field of the same name matches a column, the first gets priority.
525             # ignore the duplicate.
526 7 50       91 if ($fieldname_cols{$f->name}{$idx}) {
527 0         0 push @{$attempt->{messages}}, [ debug => "Ignored; column $idx is already claimed by a field named ".$f->name ];
  0         0  
528             } else {
529 7         12 push @{$colmap[$idx]}, $f;
  7         20  
530 7         21 $fieldname_cols{$f->name}{$idx}= $f;
531 7         21 $found{refaddr $f}= 1;
532             }
533             }
534             # If successfully matched exactly one field, add it to the 'following' set.
535 12 100 66     71 if ($colmap[$idx] && @{$colmap[$idx]} == 1) {
  7         22  
536 7         24 $following{$colmap[$idx][0]->name}= $colmap[$idx][0];
537             }
538             # Else if no matches, or ambiguous, so reset the following set
539             else {
540 5         15 %following= ();
541             }
542             }
543             }
544             # Check if any of the 'follows' fields were required
545 2 50 33     23 if (my @unfound= grep +($_->required && !$found{refaddr $_}), @follows_fields) {
546 0         0 push @{$attempt->{missing_required}}, @unfound;
  0         0  
547 0 0       0 push @{$attempt->{messages}}, [ error =>
  0         0  
548             sprintf('No match for required %s [%s]',
549             (@unfound > 1? 'fields':'field'),
550             join(', ', map $_->name, sort @unfound))
551             ];
552             }
553             }
554              
555             # Make the list of columns which didn't match anything before starting to munge things
556             # related to ambiguities.
557 33         162 my @unmatched= grep !defined $colmap[$_], 0 .. $#$header;
558 33 100       126 $attempt->{unmatched}= \@unmatched if @unmatched;
559              
560             # Ambiguity check: each field *name* may only be located in one column, unless
561             # the field(s) are flagged as being arrays.
562 33         63 my %ambiguous_fields;
563 33         171 for my $name (sort keys %fieldname_cols) {
564 57         114 my $cols= $fieldname_cols{$name};
565 57 100       202 next unless keys %$cols > 1;
566 7 50       51 next unless grep !$_->array, values %$cols;
567 0         0 push @{$attempt->{messages}}, [ $ambiguous_log_level =>
568             sprintf "Found field '%s' at multiple columns: %s",
569 0         0 $name, join(', ', map 1+$_, sort { $a <=> $b } keys %$cols)
  0         0  
570             ];
571 0         0 $ambiguous_fields{$name}= $cols;
572             }
573 33 50       94 $attempt->{ambiguous_fields}= \%ambiguous_fields
574             if keys %ambiguous_fields;
575              
576             # Ambiguity check: there must be only one field claiming each column
577             # If it's OK, resolve the arrayref down to its single member.
578 33         60 my $col_collision= 0;
579 33         119 for my $idx (0 .. $#colmap) {
580 100 100       231 next unless defined $colmap[$idx];
581 84 50       116 if (@{$colmap[$idx]} == 1) { # only claimed by one field
  84         201  
582 84         143 my $f= $colmap[$idx][0];
583             # but if that one field is ambiguous, discard it
584 84 50       287 $colmap[$idx]= $ambiguous_fields{$f->name}? undef : $f;
585             } else {
586 0         0 push @{$attempt->{messages}}, [ $ambiguous_log_level =>
587             sprintf "Column %d claimed by multiple fields: %s",
588 0         0 $idx+1, join(', ', sort map $_->name, @{$colmap[$idx]})
  0         0  
589             ];
590 0         0 $attempt->{ambiguous_columns}{$idx}= $colmap[$idx];
591 0         0 $colmap[$idx]= undef;
592             }
593             }
594              
595             # Need to have found at least one column (even if none required)
596 33 100       134 unless (grep defined, @colmap) {
597 7         43 push @{$attempt->{messages}}, [
598             error => ($attempt->{ambiguous_columns} || $attempt->{ambiguous_fields})
599 7 50 33     10 ? 'All matching headers were ambiguous'
600             : 'No field headers matched'
601             ];
602 7         34 return 0;
603             }
604              
605             return 0
606             if $attempt->{missing_required}
607             or $self->on_ambiguous_columns ne 'warn'
608 26 100 33     303 && ($attempt->{ambiguous_columns} || $attempt->{ambiguous_fields});
      33        
      66        
609              
610             # Now, if there are any un-claimed columns, handle per 'on_unknown_columns' setting.
611 24 100       62 if (@unmatched) {
612 9         34 my $act= $self->on_unknown_columns;
613 9         40 my $unknown_list= join(', ', map $self->_fmt_header_text($header->[$_]), @unmatched);
614 9 50       34 $act= $act->($self, $header, \@unmatched) if ref $act eq 'CODE';
615 9 100 66     71 if ($act eq 'warn' || $act eq 'use') { # 'use' is back-compat, 'warn' is official now.
    50 33        
616 6         12 push @{$attempt->{messages}}, [ warn => 'Ignoring unknown columns: '.$unknown_list ];
  6         29  
617             } elsif ($act eq 'error' || $act eq 'next') { # 'next' is back-compat, 'error' is official now.
618 3         5 push @{$attempt->{messages}}, [ error => 'Would match except for unknown columns: '.$unknown_list ];
  3         13  
619 3         41 return 0;
620             } else {
621 0 0       0 push @{$attempt->{messages}}, [ error =>
  0         0  
622             $act eq 'die'? "${context}Header row includes unknown columns: $unknown_list"
623             : "Invalid action '$act' for 'on_unknown_columns'"
624             ];
625 0         0 $attempt->{fatal}= 1;
626 0         0 return 0;
627             }
628             }
629 21         142 return 1;
630             }
631             # Make header string readable for log messages
632             sub _fmt_header_text {
633 25 50   25   63 shift if ref $_[0];
634 25         50 my $x= shift;
635 25         64 $x =~ s/ ( [^[:print:]] ) / sprintf("\\x%02X", ord $1 ) /gex;
  4         18  
636 25         124 qq{"$x"};
637             }
638             # format the colmap into a string
639             sub _colmap_progress_str {
640 0     0   0 my ($colmap, $headers)= @_;
641             join(' ', map {
642 0 0       0 $colmap->{$_}? $_.'='.$colmap->{$_}->name
  0         0  
643             : $_.':'._fmt_header_text($headers->[$_])
644             } 0 .. $#$headers)
645             }
646              
647              
648             sub iterator {
649 24     24 1 8987 my $self= shift;
650 24         829 my $fields= $self->fields;
651             $self->table_search_results->{found}
652 24 50       741 or croak "table_search_results does not contain 'found'";
653             # Creating the record iterator consumes the data source's iterator.
654             # The first time after detecting the table, we continue with the same iterator.
655             # Every time after that we need to create a new data iterator and seek to the
656             # first record under the header.
657 24         617 my $data_iter= delete $self->table_search_results->{found}{_data_iter};
658 24 100       195 unless ($data_iter) {
659 3         67 $data_iter= $self->decoder->iterator;
660 3         135 $data_iter->seek($self->table_search_results->{found}{first_record_pos});
661             }
662              
663             # The goal for this iterator is to perform as little work as possible on each iteration,
664             # while making all the features possible, but avoiding building custom perl code with 'eval'.
665             # To that end, most of the operations get vectorized and stored in array variables that
666             # get closed-over by the iterator function.
667             #
668             # The iterator algorithm goes like this:
669             #
670             # - Collect a slice of the next row from the Decoder, selecting only the columns we need.
671             # i.e. @row_slice= @decoder_row[@slice_idx];
672             # - Run trim functions on each value that needs trimmed.
673             # i.e. $row_slice[$i]= $trim[$i]->($row_slice[$i]) if defined $row_slice[$i]
674             # - Apply "blank" value to any value that is zero length
675             # i.e. $row_slice[$i]= $blank_value[$i] unless length $row_slice[$i]
676             # - Special handling if the entire row is blank
677             # - If user wanted validation, do type checks on each relevant value
678             # i.e. $type->validate($row_slice[$i])
679             # - Assemble array-valued fields into a single arrayref value
680             # i.e. $row_slice[$from]= [ splice(@row_slice, $from, $to, undef) ];
681             # - If user wanted an array matching the Field order, alter @row_slice to match
682             # - else if user wanted a hashref, build one,
683             # - If validations failed, run user callback
684             # - if user wanted an object, construct one
685              
686 24         435 my $col_map= $self->table_search_results->{found}{col_map};
687 24         163 my $field_map= _field_map($col_map);
688 24         196 my @input_slice; # list of column idx to retrieve from input
689             my @output_slice; # list of column idx to store in output, for record_class=>'ARRAY'
690 24         0 my @output_keys; # list of hash key names where values get stored
691 24         0 my @array_ranges; # list of value indices that get bundled into an arrayref
692 24         0 my @blank_val; # blank value per each fetched column
693 24         0 my @trim; # list of trim functions and the value indicies they should be applied to
694 24         0 my @type_check; # list of validation coderefs that should be applied
695 24         0 my $class; # optional object class to construct for the resulting rows
696 24         0 my ($n_blank, $first_blank, $eof);
697             my $sub= sub {
698 74     74   2287 my (@failed, $out, $vals);
699 79 100 50     340 again:
      66        
700             # Pull the specific slice of the next row that we need
701             $vals= !$eof && $data_iter->(\@input_slice)
702             or ++$eof && return undef;
703             # Apply 'trim' to any column whose field requested it
704 56         113 for my $t (@trim) {
705 58   66     94 defined and $t->[0]->() for @{$vals}[@{$t->[1]}];
  58         245  
  58         99  
706             }
707             # Apply 'blank value' to every column which is zero length
708 56         89 $n_blank= 0;
709             (defined $vals->[$_] and length $vals->[$_]) or (++$n_blank, $vals->[$_]= $blank_val[$_])
710 56   100     555 for 0..$#$vals;
      66        
711             # If all are blank, then handle according to $on_blank_row setting
712 56 100       201 if ($n_blank == @$vals) {
    100          
713 2   33     15 $first_blank ||= $data_iter->row;
714 2         13 goto again;
715             } elsif ($first_blank) {
716             # At the end of a series of blank rows, run the callback to decide what to do
717 2 50       7 unless ($self->_handle_blank_row($data_iter, $data_iter->row - $first_blank)) {
718 0         0 $eof= 1;
719 0         0 return undef;
720             }
721 2         5 $first_blank= undef;
722             }
723             # Check type constraints, if any
724 54         90 @failed= ();
725 54         103 push @failed, $_->($vals) for @type_check;
726             # Combine each set of array-valued fields into an arrayref
727 54         136 $vals->[$_->[0]]= [ splice @$vals, $_->[0], $_->[1], undef ] for @array_ranges;
728             # Generate the output structure
729 52         72 $out= @output_keys? do { my %out; @out{@output_keys}= @$vals; \%out }
  52         175  
  52         113  
730 54 50       100 : @output_slice? do { my @out; $#out= $#$fields; @out[@output_slice]= @$vals; \@out }
  2 100       3  
  2         5  
  2         6  
  2         4  
731             : $vals;
732             # Handle any validation errors detected above
733 54 100       112 if (@failed) {
734 11 100       36 $self->_handle_validation_error(\@failed, $out, $data_iter)
735             or goto again;
736             }
737             # Construct a class, if requested, else return hashref
738 50 50       340 return $class? $class->new($out) : $out;
739 24         203 };
740              
741             # User wants arrayref output, with one element per field?
742 24 100       134 if ($self->record_class eq 'ARRAY') {
743             # If two fields share a name, only the first one gets the value(s).
744 1         4 my %remaining= %$field_map;
745 1         3 my $need_output_slice= 0;
746 1         2 for my $field_idx (0 .. $#$fields) {
747 4         7 my $f= $fields->[$field_idx];
748 4         12 my $src= delete $remaining{$f->name};
749             # If this field has a source, add it to the input slice and output slice
750 4 100       12 if (defined $src) {
    50          
751 3 50       7 push @output_slice, $field_idx
752             if $need_output_slice;
753 3         7 push @input_slice, $src;
754             }
755             # output_slice isn't needed until the first field that doesn't have a source
756             elsif (!$need_output_slice) {
757 1         3 $need_output_slice= 1;
758 1         6 @output_slice= ( 0 .. $field_idx-1 );
759             }
760             }
761             } else {
762             # For any other record_class, we are building a hashref
763             # Only set the 'class' variable if we also need to construct an object.
764 23 50       88 $class= $self->record_class
765             unless 'HASH' eq $self->record_class;
766 23         104 @input_slice= values %$field_map;
767 23         108 @output_keys= keys %$field_map;
768             }
769              
770 24         68 my %trimmer;
771 24         127 for (my ($i, $out_ofs, $array_start, $array_lim)= (0,0); $i <= $#input_slice; $i++) {
772 86         158 my $col_idx= $input_slice[$i];
773 86 100       773 if (ref $col_idx eq 'ARRAY') {
    100          
774 7         17 $array_start= $i;
775 7         16 $array_lim= $array_start + @$col_idx;
776 7         28 splice(@input_slice, $i, 1, @$col_idx);
777 7         23 push @array_ranges, [ $i+$out_ofs, scalar @$col_idx ];
778 7         17 $col_idx= $col_idx->[0];
779             } elsif (defined $array_lim) {
780 24 100       56 if ($i >= $array_lim) {
781 6         12 $array_start= $array_lim= undef;
782             } else {
783 18         30 --$out_ofs; # each iteration within an array increases the offset
784             }
785             }
786 86         149 my $field= $col_map->[$col_idx];
787             # Handling for ->trim feature
788 86 100       2011 if (my $t= $field->trim_coderef) {
789 85   100     466 $trimmer{refaddr $t} ||= [ $t, [] ];
790 85         126 push @{ $trimmer{refaddr $t}[1] }, $i;
  85         222  
791             }
792             # Handling for ->blank feature
793 86         251 push @blank_val, $field->blank;
794             # Handling for ->type and ->coerce features
795 86 100       397 if ($field->type) {
796             # @path is needed to show the on_validation_fail callback where to find the value in
797             # the output. First element of @path is either ->{$name} or ->[$idx] depending whether
798             # the output is an array or hashref. Second element only happens if that field's value
799             # is an arrayref.
800 10         55 my $output_idx= $i + $out_ofs;
801 10 0       34 my @path= (
    50          
802             @output_keys? $output_keys[$output_idx]
803             : @output_slice? $output_slice[$output_idx]
804             : $output_idx
805             );
806 10 100       25 push @path, ($i - $array_start)
807             if defined $array_start;
808 10         31 push @type_check, $self->_make_validation_check_coderef($field, $i, \@path);
809             }
810             }
811 24         100 @trim= values %trimmer;
812              
813 24         231 return Data::TableReader::_RecIter->new(
814             $sub, { data_iter => $data_iter, reader => $self },
815             );
816             }
817              
818             sub _make_validation_check_coderef {
819 10     10   22 my ($self, $field, $vals_idx, $out_path)= @_;
820 10         19 my $t= $field->type;
821 10         21 my $c= $field->coerce;
822 10   66     33 my $t_can_validate= blessed($t) && $t->can('validate');
823             # If type object has method 'coerce' but does not have method 'has_coercions', just run it.
824             # But, if has_coercions is false, then there's no point in running it.
825 10   66     61 my $t_can_coerce= blessed($t) && ($t->can('has_coercions')? $t->has_coercions : $t->can('coerce'));
826              
827             # There are 5 possibilities for the callback:
828             # type is a coderef, and coerce is false
829             ref $t eq 'CODE'? (
830             !$c? sub {
831 30     30   62 my $e= $t->($_[0][$vals_idx]);
832 30 100       155 defined $e? ([ $field, undef, $e, $out_path ]) : ()
833             }
834             # type is a coderef, and coerce is a coderef
835             : ref $c eq 'CODE'? sub {
836 0     0   0 my $e= $t->($_[0][$vals_idx]);
837 0 0       0 if (defined $e) {
838 0         0 my $tmp= $c->($_[0][$vals_idx]);
839 0 0       0 ($_[0][$vals_idx], $e)= ($tmp) unless defined $t->($tmp);
840             }
841 0 0       0 defined $e? ([ $field, undef, $e, $out_path ]) : ()
842             }
843             : croak("Can't coerce field ".$field->name.": ->type is coderef and ->coerce is not a coderef")
844             )
845             # type is a Type::Tiny, and coerce is a coderef
846             : $t_can_validate? (
847             ref $c eq 'CODE'? sub {
848 0     0   0 my $e= $t->validate($_[0][$vals_idx]);
849 0 0       0 if (defined $e) {
850 0         0 my $tmp= $c->($_[0][$vals_idx]);
851 0 0       0 ($_[0][$vals_idx], $e)= ($tmp) unless defined $t->validate($tmp);
852             }
853 0 0       0 defined $e? ([ $field, undef, $e, $out_path ]) : ()
854             }
855             # type is a Type::Tiny, and coerce is requested and is available from the type object
856             : $c && $t_can_coerce? sub {
857 5     5   14 my $e= $t->validate($_[0][$vals_idx]);
858 5 50       340 if (defined $e) {
859 5         18 my $tmp= $t->coerce($_[0][$vals_idx]);
860 5 100       1476 ($_[0][$vals_idx], $e)= ($tmp) unless defined $t->validate($tmp);
861             }
862 5 100       198 defined $e? ([ $field, undef, $e, $out_path ]) : ()
863             }
864             # type is a Type::Tiny, and coerce is not requested or not available
865             : sub {
866 10     10   34 my $e= $t->validate($_[0][$vals_idx]);
867 10 100       155 defined $e? ([ $field, undef, $e, $out_path ]) : ()
868             }
869             )
870 10 0 66     304 : croak "Invalid type constraint $t on field ".$field->name;
    50          
    100          
    50          
    50          
    100          
871             }
872              
873             sub _handle_blank_row {
874 2     2   6 my ($self, $data_iter, $count)= @_;
875 2         6 my $last= $data_iter->row - 1;
876 2         6 my $first= $last - $count + 1;
877 2         8 my $act= $self->on_blank_row;
878 2 50       8 $act= $act->($self, $first, $last)
879             if ref $act eq 'CODE';
880 2 50       6 if ($act eq 'next') {
881 2 50       69 $self->_log->('warn', $first == $last?
882             ( 'Skipping blank row at %s', $first )
883             : ('Skipping blank rows from %s until %s', $first, $last )
884             );
885 2         8 return 1;
886             }
887 0 0       0 if ($act eq 'last') {
888 0         0 $self->_log->('warn', 'Ending at blank row %s', $first);
889 0         0 return 0;
890             }
891 0 0       0 if ($act eq 'die') {
892 0         0 my $msg= "Encountered blank rows at $first..$last";
893 0         0 $self->_log->('error', $msg);
894 0         0 croak $msg;
895             }
896 0         0 croak "Invalid value for 'on_blank_row': \"$act\"";
897             }
898              
899             sub _handle_validation_error {
900 11     11   23 my ($self, $failures, $output, $data_iter)= @_;
901 11         32 my $act= $self->on_validation_error;
902 11 100       27 if (ref $act eq 'CODE') {
903             # Fill in the second element (ref to $output) for each failure
904 6         12 for (@$failures) {
905 8         33 my $path= $_->[3];
906 8 50       27 my $ref= ref $output eq 'HASH'? \$output->{$path->[0]} : \$output->[$path->[0]];
907 8 100       23 $ref= \(${$ref}->[$path->[1]]) if @$path > 1;
  2         5  
908 8         18 $_->[1]= $ref;
909             }
910 6         22 $act= $act->($self, $failures, $output, $data_iter);
911             }
912 11         153 my $errors= join(', ', map $_->[0]->name.': '.$_->[2], @$failures);
913 11 100       26 if ($act eq 'next') {
914 3 50       7 $self->_log->('warn', "%s: Skipped for data errors: %s", $data_iter->position, $errors) if $errors;
915 3         10 return 0;
916             }
917 8 100       34 if ($act eq 'use') {
918 7 100       26 $self->_log->('warn', "%s: Possible data errors: %s", $data_iter->position, $errors) if $errors;
919 7         47 return 1;
920             }
921 1 50       13 if ($act eq 'die') {
922 1         4 my $msg= $data_iter->position.": Invalid record: $errors";
923 1         41 $self->_log->('error', $msg);
924 1         122 croak $msg;
925             }
926             }
927              
928             # This is back-compat for the previous callback API which was an attribute named 'on_validation_fail'
929             sub _wrap_on_validation_fail {
930 1     1   2 my $orig_cb= shift;
931             return sub {
932 1     1   2 my ($self, $failures, $output, $data_iter)= @_;
933             # Old API gave the user a value index rather than a ref to the value
934 1         3 my @value_refs= map $_->[1], @$failures;
935 1         2 my @values= map $$_, @value_refs;
936 1         4 $failures->[$_][1]= $_ for 0 .. $#$failures;
937 1         4 my $act= $orig_cb->($self, $failures, \@values, $data_iter->position.': ');
938             # if they changed any values, write them back to the refs.
939 1         17 ${$value_refs[$_]}= $values[$_] for 0 .. $#value_refs;
  1         2  
940 1         2 return $act;
941 1         6 };
942             }
943              
944 7     7   74708 BEGIN { @Data::TableReader::_RecIter::ISA= ( 'Data::TableReader::Iterator' ) }
945              
946             sub Data::TableReader::_RecIter::all {
947 11     11   51 my $self= shift;
948 11         20 my (@rec, $x);
949 11         34 push @rec, $x while ($x= $self->());
950 11         147 return \@rec;
951             }
952             sub Data::TableReader::_RecIter::dataset_idx {
953 1     1   9 shift->_fields->{data_iter}->dataset_idx(@_);
954             }
955             sub Data::TableReader::_RecIter::row {
956 5     5   19 shift->_fields->{data_iter}->row(@_);
957             }
958             sub Data::TableReader::_RecIter::position {
959 0     0     shift->_fields->{data_iter}->position(@_);
960             }
961             sub Data::TableReader::_RecIter::progress {
962 0     0     shift->_fields->{data_iter}->progress(@_);
963             }
964             sub Data::TableReader::_RecIter::tell {
965 0     0     shift->_fields->{data_iter}->tell(@_);
966             }
967             sub Data::TableReader::_RecIter::seek {
968 0     0     shift->_fields->{data_iter}->seek(@_);
969             }
970             sub Data::TableReader::_RecIter::next_dataset {
971             shift->_fields->{reader}->_log
972 0     0     ->('warn',"Searching for subsequent table headers is not supported yet");
973 0           return 0;
974             }
975              
976             1;
977              
978             __END__