File Coverage

blib/lib/Data/TableReader.pm
Criterion Covered Total %
statement 290 404 71.7
branch 133 244 54.5
condition 37 88 42.0
subroutine 36 44 81.8
pod 5 6 83.3
total 501 786 63.7


line stmt bran cond sub pod time code
1             package Data::TableReader;
2 4     4   3751 use Moo 2;
  4         26187  
  4         24  
3 4     4   4050 use Try::Tiny;
  4         9  
  4         203  
4 4     4   22 use Carp;
  4         8  
  4         177  
5 4     4   21 use List::Util 'max';
  4         9  
  4         361  
6 4     4   26 use Module::Runtime 'require_module';
  4         8  
  4         24  
7 4     4   1872 use Data::TableReader::Field;
  4         11  
  4         120  
8 4     4   1274 use Data::TableReader::Iterator;
  4         9  
  4         18630  
9              
10             # ABSTRACT: Extract records from "dirty" tabular data sources
11             our $VERSION = '0.011'; # VERSION
12              
13              
14             has input => ( is => 'rw', required => 1 );
15             has _file_handle => ( is => 'lazy' );
16             has _decoder_arg => ( is => 'rw', init_arg => 'decoder' );
17             has decoder => ( is => 'lazy', init_arg => undef );
18             has fields => ( is => 'rw', required => 1, coerce => \&_coerce_field_list );
19 27     27 0 34 sub field_list { @{ shift->fields } }
  27         393  
20             has field_by_name => ( is => 'lazy' );
21             has record_class => ( is => 'rw', required => 1, default => sub { 'HASH' } );
22             has static_field_order => ( is => 'rw' ); # force order of columns
23             has header_row_at => ( is => 'rw', default => sub { [1,10] } ); # row of header, or range to scan
24             has header_row_combine => ( is => 'rw', lazy => 1, builder => 1 );
25             has on_unknown_columns => ( is => 'rw', default => sub { 'use' } );
26             has on_blank_row => ( is => 'rw', default => sub { 'next' } );
27             has on_validation_fail => ( is => 'rw', default => sub { 'die' } );
28             has log => ( is => 'rw', trigger => sub { shift->_clear_log } );
29              
30             # Open 'input' if it isn't already a file handle
31             sub _build__file_handle {
32 14     14   110 my $self= shift;
33 14         36 my $i= $self->input;
34 14 100 66     112 return undef if ref($i) && (ref($i) eq "Spreadsheet::ParseExcel::Worksheet");
35 13 100 66     158 return $i if ref($i) && (ref($i) eq 'GLOB' or ref($i)->can('read'));
      33        
36 9 50   2   161 open(my $fh, '<', $i) or croak "open($i): $!";
  2         11  
  2         4  
  2         13  
37 9         1226 binmode $fh;
38 9         196 return $fh;
39             }
40              
41             # Create ::Decoder instance either from user-supplied args, or by detecting input format
42             sub _build_decoder {
43 14     14   134 my $self= shift;
44 14         39 my $decoder_arg= $self->_decoder_arg;
45 14         28 my $decoder_ref= ref $decoder_arg;
46 14         27 my ($class, @args);
47 14 100 33     73 if (!$decoder_arg) {
    100          
    50          
    0          
48 1         4 ($class, @args)= $self->detect_input_format;
49 1         22 $self->_log->('trace', "Detected input format as %s", $class);
50             }
51             elsif (!$decoder_ref) {
52 4         6 $class= $decoder_arg;
53             }
54             elsif ($decoder_ref eq "HASH" or $decoder_ref eq "ARRAY") {
55 9 50       22 ($class, @args)= $decoder_ref eq "ARRAY"? @$decoder_arg : do {
56 9         34 my %tmp= %$decoder_arg;
57 9         38 (delete($tmp{CLASS}), %tmp);
58             };
59 9 50       28 if(!$class) {
60 0         0 my ($input_class, @input_args)= $self->detect_input_format;
61 0 0       0 croak "decoder class not in arguments and unable to identify decoder class from input"
62             if !$input_class;
63 0         0 ($class, @args)= ($input_class, @input_args, @args);
64             }
65             }
66             elsif ($decoder_ref->can('iterator')) {
67 0         0 return $decoder_arg;
68             }
69             else {
70 0         0 croak "Can't create decoder from $decoder_ref";
71             }
72 14 50       70 $class= "Data::TableReader::Decoder::$class"
73             unless $class =~ /::/;
74 14 50       67 require_module($class) or croak "$class does not exist or is not installed";
75 14         568 $self->_log->('trace', 'Creating decoder %s on input %s', $class, $self->input);
76 14 100 100     339 return $class->new(
77             file_name => ($self->input eq ($self->_file_handle||"") ? '' : $self->input),
78             file_handle => $self->_file_handle,
79             _log => $self->_log,
80             @args
81             );
82             }
83              
84             # User supplies any old perl data, but this field should always be an arrayref of ::Field
85             sub _coerce_field_list {
86 14     14   35711 my ($list)= @_;
87 14 50 33     91 defined $list and ref $list eq 'ARRAY' or croak "'fields' must be a non-empty arrayref";
88 14         35 my @list= @$list; # clone it, to make sure we don't unexpectedly alter the caller's data
89 14         34 for (@list) {
90 38 100       186 if (!ref $_) {
    50          
91 5         82 $_= Data::TableReader::Field->new({ name => $_ });
92             } elsif (ref $_ eq 'HASH') {
93 33         102 my %args= %$_;
94             # "isa" alias for the 'type' attribute
95 33 50 33     82 $args{type}= delete $args{isa} if defined $args{isa} && !defined $args{type};
96 33         488 $_= Data::TableReader::Field->new(\%args)
97             } else {
98 0         0 croak "Can't coerce '$_' to a Field object"
99             }
100             }
101 14         249 return \@list;
102             }
103              
104             sub _build_field_by_name {
105 0     0   0 my $self= shift;
106             # reverse list so first field of a name takes precedence
107 0         0 { map { $_->name => $_ } reverse @{ $self->fields } }
  0         0  
  0         0  
  0         0  
108             }
109              
110             sub _build_header_row_combine {
111 13     13   464 my $self= shift;
112             # If headers contain "\n", we need to collect multiple cells per column
113 13         28 max map { 1+(()= ($_->header_regex =~ /\\n|\n/g)) } $self->field_list;
  38         688  
114             }
115              
116             # 'log' can be a variety of things, but '_log' will always be a coderef
117             has _log => ( is => 'lazy', clearer => 1 );
118             sub _build__log {
119 14     14   284 _log_fn(shift->log);
120             }
121             sub _log_fn {
122 14     14   91 my $dest= shift;
123             !$dest? sub {
124 2     2   13 my ($level, $msg, @args)= @_;
125 2 50 33     13 return unless $level eq 'warn' or $level eq 'error';
126 0 0       0 $msg= sprintf($msg, @args) if @args;
127 0         0 warn $msg."\n";
128             }
129             : ref $dest eq 'ARRAY'? sub {
130 11     11   44 my ($level, $msg, @args)= @_;
131 11 100 66     40 return unless $level eq 'warn' or $level eq 'error';
132 1 50       6 $msg= sprintf($msg, @args) if @args;
133 1         5 push @$dest, [ $level, $msg ];
134             }
135             : ref($dest)->can('info')? sub {
136 124     124   704 my ($level, $msg, @args)= @_;
137 124 100       461 $dest->$level( @args? sprintf($msg, @args) : $msg )
    100          
138             if $dest->can('is_'.$level)->($dest);
139             }
140 14 50       305 : croak "Don't know how to log to $dest";
    100          
    100          
141             }
142              
143              
144             sub detect_input_format {
145 1     1 1 4 my ($self, $filename, $magic)= @_;
146              
147 1         3 my $input= $self->input;
148             # As convenience to spreadsheet users, let input be a parsed workbook/worksheet object.
149 1 50 33     22 return ('XLSX', sheet => $input)
150             if ref($input) && ref($input)->can('get_cell');
151 0 0 0     0 return ('XLSX', workbook => $input)
152             if ref($input) && ref($input)->can('worksheets');
153              
154             # Load first block of file, unless supplied
155 0         0 my $fpos;
156 0 0       0 if (!defined $magic) {
157 0         0 my $fh= $self->_file_handle;
158             # Need to be able to seek.
159 0 0       0 if (seek($fh, 0, 1)) {
    0          
160 0         0 $fpos= tell $fh;
161 0         0 read($fh, $magic, 4096);
162 0 0       0 seek($fh, $fpos, 0) or croak "seek: $!";
163             }
164             elsif ($fh->can('ungets')) {
165 0         0 $fpos= 0; # to indicate that we did try reading the file
166 0         0 read($fh, $magic, 4096);
167 0         0 $fh->ungets($magic);
168             }
169             else {
170 0         0 $self->_log->('notice',"Can't fully detect input format because handle is not seekable."
171             ." Consider fully buffering the file, or using FileHandle::Unget");
172 0         0 $magic= '';
173             }
174             }
175              
176             # Excel is obvious so check it first. This handles cases where an excel file is
177             # erroneously named ".csv" and sillyness like that.
178 0 0       0 return ( 'XLSX' ) if $magic =~ /^PK(\x03\x04|\x05\x06|\x07\x08)/;
179 0 0       0 return ( 'XLS' ) if $magic =~ /^\xD0\xCF\x11\xE0/;
180              
181             # Else trust the file extension, because TSV with commas can be very similar to CSV with
182             # tabs in the data, and some crazy person might store an HTML document as the first element
183             # of a CSV file.
184             # Detect filename if not supplied
185 0 0       0 if (!defined $filename) {
186 0         0 $filename= '';
187 0 0 0     0 $filename= "$input" if defined $input and (!ref $input || ref($input) =~ /path|file/i);
      0        
188             }
189 0 0       0 if ($filename =~ /\.([^.]+)$/) {
190 0         0 my $suffix= uc($1);
191 0 0       0 return 'HTML' if $suffix eq 'HTM';
192 0         0 return $suffix;
193             }
194              
195             # Else probe some more...
196 0         0 $self->_log->('debug',"Probing file format because no filename suffix");
197 0 0       0 length $magic or croak "Can't probe format. No filename suffix, and "
    0          
198             .(!defined $fpos? "unseekable file handle" : "no content");
199              
200             # HTML is pretty obvious
201 0 0       0 return 'HTML' if $magic =~ /^(\xEF\xBB\xBF|\xFF\xFE|\xFE\xFF)?<(!DOCTYPE )HTML/i;
202             # Else guess between CSV and TSV
203 0         0 my ($probably_csv, $probably_tsv)= (0,0);
204 0 0       0 ++$probably_csv if $magic =~ /^(\xEF\xBB\xBF|\xFF\xFE|\xFE\xFF)?["']?[\w ]+["']?,/;
205 0 0       0 ++$probably_tsv if $magic =~ /^(\xEF\xBB\xBF|\xFF\xFE|\xFE\xFF)?["']?[\w ]+["']?\t/;
206 0         0 my $comma_count= () = ($magic =~ /,/g);
207 0         0 my $tab_count= () = ($magic =~ /\t/g);
208 0         0 my $eol_count= () = ($magic =~ /\n/g);
209 0 0 0     0 ++$probably_csv if $comma_count > $eol_count and $comma_count > $tab_count;
210 0 0 0     0 ++$probably_tsv if $tab_count > $eol_count and $tab_count > $comma_count;
211 0         0 $self->_log->('debug', 'probe results: comma_count=%d tab_count=%d eol_count=%d probably_csv=%d probably_tsv=%d',
212             $comma_count, $tab_count, $eol_count, $probably_csv, $probably_tsv);
213 0 0 0     0 return 'CSV' if $probably_csv and $probably_csv > $probably_tsv;
214 0 0 0     0 return 'TSV' if $probably_tsv and $probably_tsv > $probably_csv;
215 0         0 croak "Can't determine file format";
216             }
217              
218              
219             has _table_found => ( is => 'rw', lazy => 1, builder => 1, clearer => 1, predicate => 1 );
220             sub _build__table_found {
221 4     4   26 my $self= shift;
222 4         10 my %loc= ( croak_on_fail => 1 );
223 4         57 $self->_find_table($self->decoder->iterator, \%loc);
224 4         16 \%loc;
225             }
226              
227             sub find_table {
228 9     9 1 3078 my $self= shift;
229 9 50       38 return 1 if $self->_has_table_found;
230 9         13 my %loc;
231 9 50       233 if ($self->_find_table($self->decoder->iterator, \%loc)) {
232 9         157 $self->_table_found(\%loc);
233 9         81 return 1;
234             }
235 0         0 return 0;
236             }
237              
238 1     1 1 20 sub col_map { shift->_table_found->{col_map}; }
239 6     6 1 1247 sub field_map { shift->_table_found->{field_map}; }
240              
241             sub _find_table {
242 13     13   31 my ($self, $data_iter, $stash)= @_;
243 13   50     50 $stash ||= {};
244 13   66     79 while (!$self->_find_table_in_dataset($data_iter, $stash)
      66        
245             && !defined $stash->{fatal}
246             && $data_iter->next_dataset)
247             {}
248 13 50       41 if ($stash->{col_map}) {
249             # Calculate field map from col map
250 13         20 my $col_map= $stash->{col_map};
251 13         16 my %fmap;
252 13         30 for my $i (0 .. $#$col_map) {
253 61 100       100 next unless $col_map->[$i];
254 43 100       87 if ($col_map->[$i]->array) {
255 10         13 push @{ $fmap{$col_map->[$i]->name} }, $i;
  10         20  
256             } else {
257 33         80 $fmap{$col_map->[$i]->name}= $i;
258             }
259             }
260 13         26 $stash->{field_map}= \%fmap;
261             # And record the stream position of the start of the table
262 13         94 $stash->{first_record_pos}= $data_iter->tell;
263 13         23 $stash->{data_iter}= $data_iter;
264 13         35 return $stash;
265             }
266             else {
267 0   0     0 my $err= $stash->{fatal} || "Can't locate valid header";
268 0         0 $self->_log->('error', $err);
269 0 0       0 croak $err if $stash->{croak_on_fail};
270 0         0 return undef;
271             }
272             }
273              
274             sub _find_table_in_dataset {
275 14     14   51 my ($self, $data_iter, $stash)= @_;
276             # If header_row_at is undef, then there is no header.
277             # Ensure static_field_order, then set up columns.
278 14         38 my @fields= $self->field_list;
279 14         113 my $header_at= $self->header_row_at;
280 14 50       33 if (!defined $header_at) {
281 0 0       0 unless ($self->static_field_order) {
282 0         0 $stash->{fatal}= "You must enable 'static_field_order' if there is no header row";
283 0         0 return;
284             }
285 0         0 $stash->{col_map}= \@fields;
286 0         0 return 1;
287             }
288            
289             # If headers contain "\n", we need to collect multiple cells per column
290 14         198 my $row_accum= $self->header_row_combine;
291            
292 14 50       69 my ($start, $end)= ref $header_at? @$header_at : ( $header_at, $header_at );
293 14         18 my @rows;
294            
295             # If header_row_at doesn't start at 1, seek forward
296 14 50       33 if ($start > 1) {
297 0         0 $self->_log->('trace', 'Skipping to row %s', $start);
298 0         0 push @rows, $data_iter->() for 1..$start-1;
299             }
300            
301             # Scan through the rows of the dataset up to the end of header_row_at, accumulating rows so that
302             # multi-line regexes can match.
303 14         43 for ($start .. $end) {
304 23         124 my $vals= $data_iter->();
305 23 100       50 if (!$vals) { # if undef, we reached end of dataset
306 1         16 $self->_log->('trace', 'EOF');
307 1         8 last;
308             }
309 22 100       42 if ($row_accum > 1) {
310 5         10 push @rows, $vals;
311 5         13 shift @rows while @rows > $row_accum;
312 5         7 $vals= [ map { my $c= $_; join("\n", map $_->[$c], @rows) } 0 .. $#{$rows[-1]} ];
  28         34  
  28         67  
  5         38  
313 5         19 $stash->{context}= $row_accum.' rows ending at '.$data_iter->position;
314             } else {
315 17         45 $stash->{context}= $data_iter->position;
316             }
317 22         371 $self->_log->('trace', 'Checking for headers on %s', $stash->{context});
318 22         161 $stash->{context}.= ': ';
319 22 50       91 $stash->{col_map}= $self->static_field_order?
320             # If static field order, look for headers in sequence
321             $self->_match_headers_static($vals, $stash)
322             # else search for each header
323             : $self->_match_headers_dynamic($vals, $stash);
324 22 100       99 return 1 if $stash->{col_map};
325 9 50       20 return if $stash->{fatal};
326 9         178 $self->_log->('debug', '%sNo match', $stash->{context});
327             }
328 1         14 $self->_log->('warn','No row in dataset matched full header requirements');
329 1         376 return;
330             }
331              
332             sub _match_headers_static {
333 0     0   0 my ($self, $header, $stash)= @_;
334 0         0 my $fields= $self->fields;
335 0         0 for my $i (0 .. $#$fields) {
336 0 0       0 next if $header->[$i] =~ $fields->[$i]->header_regex;
337            
338             # Field header doesn't match. Start over on next row.
339 0   0     0 $self->_log->('debug','%sMissing field %s', $stash->{context}||'', $fields->[$i]->name);
340 0         0 return;
341             }
342             # found a match for every field!
343 0   0     0 $self->_log->('debug','%sFound!', $stash->{context}||'');
344 0         0 return $fields;
345             }
346              
347             sub _match_headers_dynamic {
348 22     22   33 my ($self, $header, $stash)= @_;
349 22   50     53 my $context= $stash->{context} || '';
350 22         28 my %col_map;
351 22         308 my $fields= $self->fields;
352             my $free_fields= $stash->{free_fields} ||= [
353             # Sort required fields to front, to fail faster on non-matching rows
354 36 100       91 sort { $a->required? -1 : $b->required? 1 : 0 }
    100          
355 22   100     135 grep { !$_->follows_list } @$fields
  38         79  
356             ];
357             my $follows_fields= $stash->{follows_fields} ||= [
358 22   100     63 grep { $_->follows_list } @$fields
  38         62  
359             ];
360 22         44 for my $f (@$free_fields) {
361 46         632 my $hr= $f->header_regex;
362 46         794 $self->_log->('debug', 'looking for %s', $hr);
363 46         313 my @found= grep { $header->[$_] =~ $hr } 0 .. $#$header;
  268         741  
364 46 100       125 if (@found == 1) {
    100          
    100          
365 34 50       78 if ($col_map{$found[0]}) {
366 0         0 $self->_log->('debug', 'search status: '._colmap_progress_str(\%col_map, $header));
367             $self->_log->('info','%sField %s and %s both match column %s',
368 0         0 $context, $f->name, $col_map{$found[0]}->name, $found[0]);
369 0         0 return;
370             }
371 34         81 $col_map{$found[0]}= $f;
372             }
373             elsif (@found > 1) {
374 1 50       5 if ($f->array) {
375             # Array columns may be found more than once
376 1         6 $col_map{$_}= $f for @found;
377             } else {
378 0         0 $self->_log->('debug', 'search status: '._colmap_progress_str(\%col_map, $header));
379 0         0 $self->_log->('info','%sField %s matches more than one column: %s',
380             $context, $f->name, join(', ', @found));
381 0         0 return;
382             }
383             }
384             elsif ($f->required) {
385 9         27 $self->_log->('debug', 'search status: '._colmap_progress_str(\%col_map, $header));
386 9         175 $self->_log->('info','%sNo match for required field %s', $context, $f->name);
387 9         3453 return;
388             }
389             # else Not required, and not found
390             }
391             # Need to have found at least one column (even if none required)
392 13 50       35 unless (keys %col_map) {
393 0         0 $self->_log->('debug', 'search status: '._colmap_progress_str(\%col_map, $header));
394 0         0 $self->_log->('debug','%sNo field headers found', $context);
395 0         0 return;
396             }
397             # Now, check for any of the 'follows' fields, some of which might also be 'required'.
398 13 100       31 if (@$follows_fields) {
399 2         3 my %following;
400             my %found;
401 2         6 for my $i (0 .. $#$header) {
402 17 100       31 if ($col_map{$i}) {
403 5         17 %following= ( $col_map{$i}->name => $col_map{$i} );
404             } else {
405 12         18 my $val= $header->[$i];
406 12         14 my @match;
407 12         16 for my $f (@$follows_fields) {
408 18 100       46 next unless grep $following{$_}, $f->follows_list;
409 9 100       129 push @match, $f if $val =~ $f->header_regex;
410             }
411 12 100       72 if (@match == 1) {
    50          
412 7 50 66     28 if ($found{$match[0]} && !$match[0]->array) {
413 0         0 $self->_log->('debug', 'search status: '._colmap_progress_str(\%col_map, $header));
414 0         0 $self->_log->('info','%sField %s matches multiple columns',
415             $context, $match[0]->name);
416 0         0 return;
417             }
418 7         12 $col_map{$i}= $match[0];
419 7         47 $found{$match[0]}= $i;
420 7         18 $following{$match[0]->name}= $match[0];
421             }
422             elsif (@match > 1) {
423 0         0 $self->_log->('debug', 'search status: '._colmap_progress_str(\%col_map, $header));
424 0         0 $self->_log->('info','%sField %s and %s both match column %d',
425             $context, $match[0]->name, $match[1]->name, $i+1);
426 0         0 return;
427             }
428             else {
429 5         20 %following= ();
430             }
431             }
432             }
433             # Check if any of the 'follows' fields were required
434 2 50       5 if (my @unfound= grep { !$found{$_} && $_->required } @$follows_fields) {
  3 50       14  
435 0         0 $self->_log->('debug', 'search status: '._colmap_progress_str(\%col_map, $header));
436             $self->_log->('info','%sNo match for required %s %s', $context,
437 0 0       0 (@unfound > 1? ('fields', join(', ', map { $_->name } sort @unfound))
  0         0  
438             : ('field', $unfound[0]->name)
439             ));
440 0         0 return;
441             }
442             }
443             # Now, if there are any un-claimed columns, handle per 'on_unknown_columns' setting.
444 13         31 my @unclaimed= grep { !$col_map{$_} } 0 .. $#$header;
  61         113  
445 13 100       33 if (@unclaimed) {
446 5         12 my $act= $self->on_unknown_columns;
447 5         16 my $unknown_list= join(', ', map $self->_fmt_header_text($header->[$_]), @unclaimed);
448 5 50       13 $act= $act->($self, $header, \@unclaimed) if ref $act eq 'CODE';
449 5 50       13 if ($act eq 'use') {
    0          
    0          
450 5         80 $self->_log->('warn','%sIgnoring unknown columns: %s', $context, $unknown_list);
451             } elsif ($act eq 'next') {
452 0         0 $self->_log->('warn','%sWould match except for unknown columns: %s',
453             $context, $unknown_list);
454             } elsif ($act eq 'die') {
455 0         0 $stash->{fatal}= "${context}Header row includes unknown columns: $unknown_list";
456             } else {
457 0         0 $stash->{fatal}= "Invalid action '$act' for 'on_unknown_columns'";
458             }
459 5         1953 $self->_log->('debug', 'search status: '._colmap_progress_str(\%col_map, $header));
460 5 50       46 return if $stash->{fatal};
461             }
462 13         80 return [ map $col_map{$_}, 0 .. $#$header ];
463             }
464             # Make header string readable for log messages
465             sub _fmt_header_text {
466 87 100   87   128 shift if ref $_[0];
467 87         106 my $x= shift;
468 87         127 $x =~ s/ ( [^[:print:]] ) / sprintf("\\x%02X", ord $1 ) /gex;
  11         36  
469 87         376 qq{"$x"};
470             }
471             # format the colmap into a string
472             sub _colmap_progress_str {
473 14     14   26 my ($colmap, $headers)= @_;
474             join(' ', map {
475 14 100       29 $colmap->{$_}? $_.'='.$colmap->{$_}->name
  95         294  
476             : $_.':'._fmt_header_text($headers->[$_])
477             } 0 .. $#$headers)
478             }
479              
480              
481             sub iterator {
482 15     15 1 5489 my $self= shift;
483 15         292 my $fields= $self->fields;
484             # Creating the record iterator consumes the data source's iterator.
485             # The first time after detecting the table, we continue with the same iterator.
486             # Every time after that we need to create a new data iterator and seek to the
487             # first record under the header.
488 15         248 my $data_iter= delete $self->_table_found->{data_iter};
489 15 100       76 unless ($data_iter) {
490 2         25 $data_iter= $self->decoder->iterator;
491 2         29 $data_iter->seek($self->_table_found->{first_record_pos});
492             }
493 15         208 my $col_map= $self->_table_found->{col_map};
494 15         234 my $field_map= $self->_table_found->{field_map};
495 15         118 my @row_slice; # one column index per field, and possibly more for array_val_map
496             my @arrayvals; # list of source index and destination index for building array values
497 15         0 my @field_names; # ordered list of field names where row slice should be assigned
498 15         0 my %trimmer; # list of trim functions and the array indicies they should be applied to
499 15         0 my @blank_val; # blank value per each fetched column
500 15         0 my @type_check;# list of
501 15         0 my $class; # optional object class for the resulting rows
502              
503             # If result is array, the slice of the row must match the position of the fields in the
504             # $self->fields array. If a field was not found it will get an undef for that slot.
505             # It also results in an undef for secondary fields of the same name as the first.
506 15 50       47 if ($self->record_class eq 'ARRAY') {
507 0         0 my %remaining= %$field_map;
508             @row_slice= map {
509 0         0 my $src= delete $remaining{$_->name};
  0         0  
510 0 0       0 defined $src? $src : 0x7FFFFFFF
511             } @$fields;
512             }
513             # If result is anything else, then only slice out the columns that are used for the fields
514             # that we located.
515             else {
516 15 50       39 $class= $self->record_class
517             unless 'HASH' eq $self->record_class;
518 15         45 @field_names= keys %$field_map;
519 15         40 @row_slice= values %$field_map;
520             }
521             # For any field whose value is an array of more that one source column,
522             # encode those details in @arrayvals, and update @row_slice and @trim_idx accordingly
523 15         45 for (0 .. $#row_slice) {
524 41 100       75 if (!ref $row_slice[$_]) {
525 39         52 my $field= $col_map->[$row_slice[$_]];
526 39 100       576 if (my $t= $field->trim_coderef) {
527 38   100     154 $trimmer{$t} ||= [ $t, [] ];
528 38         51 push @{ $trimmer{$t}[1] }, $_;
  38         98  
529             }
530 39         73 push @blank_val, $field->blank;
531 39 100       106 push @type_check, $self->_make_validation_callback($field, $_)
532             if $field->type;
533             }
534             else {
535             # This field is an array-value, so add the src columns to @row_slice
536             # and list it in @arrayvals, and update @trim_idx if needed
537 2         4 my $src= $row_slice[$_];
538 2         3 $row_slice[$_]= 0x7FFFFFFF;
539 2         4 my $from= @row_slice;
540 2         5 push @row_slice, @$src;
541 2         6 push @arrayvals, [ $_, $from, scalar @$src ];
542 2         4 for ($from .. $#row_slice) {
543 10         28 my $field= $col_map->[$row_slice[$_]];
544 10 50       130 if (my $t= $field->trim_coderef) {
545 10   100     69 $trimmer{$t} ||= [ $t, [] ];
546 10         10 push @{ $trimmer{$t}[1] }, $_;
  10         21  
547             }
548 10         17 push @blank_val, $field->blank;
549 10 50       24 push @type_check, $self->_make_validation_callback($field, $_)
550             if $field->type;
551             }
552             }
553             }
554 15         42 my @trim= values %trimmer;
555 15         19 @arrayvals= reverse @arrayvals;
556 15         32 my ($n_blank, $first_blank, $eof);
557             my $sub= sub {
558 50 100 50 50   2183 again:
      66        
559             # Pull the specific slice of the next row that we need
560             my $row= !$eof && $data_iter->(\@row_slice)
561             or ++$eof && return undef;
562             # Apply 'trim' to any column whose field requested it
563 36         70 for my $t (@trim) {
564 38         51 $t->[0]->() for grep defined, @{$row}[@{$t->[1]}];
  38         138  
  38         57  
565             }
566             # Apply 'blank value' to every column which is zero length
567 36         51 $n_blank= 0;
568             $row->[$_]= $blank_val[$_]
569 36 100 100     69 for grep { (!defined $row->[$_] || !length $row->[$_]) && ++$n_blank } 0..$#$row;
  97         348  
570             # If all are blank, then handle according to $on_blank_row setting
571 36 100       97 if ($n_blank == @$row) {
    100          
572 2   33     10 $first_blank ||= $data_iter->position;
573 2         10 goto again;
574             } elsif ($first_blank) {
575 2 50       7 unless ($self->_handle_blank_row($first_blank, $data_iter->position)) {
576 0         0 $eof= 1;
577 0         0 return undef;
578             }
579 2         4 $first_blank= undef;
580             }
581             # Check type constraints, if any
582 34 100       61 if (@type_check) {
583 18 100       35 if (my @failed= map $_->($row), @type_check) {
584 6 100       17 $self->_handle_validation_fail(\@failed, $row, $data_iter->position.': ')
585             or goto again;
586             }
587             }
588             # Collect all the array-valued fields from the tail of the row
589 30         57 $row->[$_->[0]]= [ splice @$row, $_->[1], $_->[2] ] for @arrayvals;
590             # stop here if the return class is 'ARRAY'
591 30 50       67 return $row unless @field_names;
592             # Convert the row to a hashref
593 30         37 my %rec;
594 30         79 @rec{@field_names}= @$row;
595             # Construct a class, if requested, else return hashref
596 30 50       164 return $class? $class->new(\%rec) : \%rec;
597 15         89 };
598 15         109 return Data::TableReader::_RecIter->new(
599             $sub, { data_iter => $data_iter, reader => $self },
600             );
601             }
602              
603             sub _make_validation_callback {
604 4     4   10 my ($self, $field, $index)= @_;
605 4         8 my $t= $field->type;
606             ref $t eq 'CODE'? sub {
607 18     18   41 my $e= $t->($_[0][$index]);
608 18 100       162 defined $e? ([ $field, $index, $e ]) : ()
609             }
610             : $t->can('validate')? sub {
611 0     0   0 my $e= $t->validate($_[0][$index]);
612 0 0       0 defined $e? ([ $field, $index, $e ]) : ()
613             }
614 4 0       26 : croak "Invalid type constraint $t on field ".$field->name;
    50          
615             }
616              
617             sub _handle_blank_row {
618 2     2   6 my ($self, $first, $last)= @_;
619 2         8 my $act= $self->on_blank_row;
620 2 50       8 $act= $act->($self, $first, $last)
621             if ref $act eq 'CODE';
622 2 50       7 if ($act eq 'next') {
623 2         39 $self->_log->('warn', 'Skipping blank rows from %s until %s', $first, $last);
624 2         954 return 1;
625             }
626 0 0       0 if ($act eq 'last') {
627 0         0 $self->_log->('warn', 'Ending at blank row %s', $first);
628 0         0 return 0;
629             }
630 0 0       0 if ($act eq 'die') {
631 0         0 my $msg= "Encountered blank rows at $first..$last";
632 0         0 $self->_log->('error', $msg);
633 0         0 croak $msg;
634             }
635 0         0 croak "Invalid value for 'on_blank_row': \"$act\"";
636             }
637              
638             sub _handle_validation_fail {
639 6     6   14 my ($self, $failures, $values, $context)= @_;
640 6         17 my $act= $self->on_validation_fail;
641 6 100       24 $act= $act->($self, $failures, $values, $context)
642             if ref $act eq 'CODE';
643 6         43 my $errors= join(', ', map $_->[0]->name.': '.$_->[2], @$failures);
644 6 100       14 if ($act eq 'next') {
645 3 50       51 $self->_log->('warn', "%sSkipped for data errors: %s", $context, $errors) if $errors;
646 3         1087 return 0;
647             }
648 3 100       6 if ($act eq 'use') {
649 2 100       20 $self->_log->('warn', "%sPossible data errors: %s", $context, $errors) if $errors;
650 2         6 return 1;
651             }
652 1 50       16 if ($act eq 'die') {
653 1         3 my $msg= "${context}Invalid record: $errors";
654 1         21 $self->_log->('error', $msg);
655 1         656 croak $msg;
656             }
657             }
658              
659 4     4   885 BEGIN { @Data::TableReader::_RecIter::ISA= ( 'Data::TableReader::Iterator' ) }
660             sub Data::TableReader::_RecIter::all {
661 9     9   16 my $self= shift;
662 9         22 my (@rec, $x);
663 9         19 push @rec, $x while ($x= $self->());
664 9         200 return \@rec;
665             }
666             sub Data::TableReader::_RecIter::position {
667 0     0   0 shift->_fields->{data_iter}->position(@_);
668             }
669             sub Data::TableReader::_RecIter::progress {
670 0     0   0 shift->_fields->{data_iter}->progress(@_);
671             }
672             sub Data::TableReader::_RecIter::tell {
673 0     0   0 shift->_fields->{data_iter}->tell(@_);
674             }
675             sub Data::TableReader::_RecIter::seek {
676 0     0   0 shift->_fields->{data_iter}->seek(@_);
677             }
678             sub Data::TableReader::_RecIter::next_dataset {
679             shift->_fields->{reader}->_log
680 0     0   0 ->('warn',"Searching for supsequent table headers is not supported yet");
681 0         0 return 0;
682             }
683              
684             1;
685              
686             __END__