File Coverage

blib/lib/Data/TableReader.pm

Criterion	Covered	Total	%
statement	402	532	75.5
branch	178	322	55.2
condition	61	134	45.5
subroutine	46	56	82.1
pod	5	7	71.4
total	692	1051	65.8

line	stmt	bran	cond	sub	pod	time	code
1							package Data::TableReader;
2	7			7		1983467	use Moo 2;
	7					44797
	7					41
3	7			7		9984	use Try::Tiny;
	7					14
	7					495
4	7			7		57	use Carp;
	7					13
	7					444
5	7			7		38	use Scalar::Util qw( blessed refaddr );
	7					17
	7					397
6	7			7		45	use List::Util 'max';
	7					14
	7					476
7	7			7		3536	use Module::Runtime 'require_module';
	7					13627
	7					40
8	7			7		4057	use Data::TableReader::Field;
	7					50
	7					311
9	7			7		3744	use Data::TableReader::Iterator;
	7					37
	7					243
10	7			7		41	use namespace::clean;
	7					14
	7					51
11
12							# ABSTRACT: Extract records from "dirty" tabular data sources
13							our $VERSION = '0.021'; # VERSION
14
15
16							has input => ( is => 'rw', required => 1 );
17							has _file_handle => ( is => 'lazy' );
18							has _decoder_arg => ( is => 'rw', init_arg => 'decoder' );
19							has decoder => ( is => 'lazy', init_arg => undef );
20							has fields => ( is => 'rw', required => 1, coerce => \&_coerce_field_list );
21	44			44	1	69	sub field_list { @{ shift->fields } }
	44					898
22							has field_by_name => ( is => 'lazy' );
23							has field_by_addr => ( is => 'lazy' );
24							has record_class => ( is => 'rw', required => 1, default => sub { 'HASH' } );
25							has static_field_order => ( is => 'rw' ); # force order of columns
26							has header_row_at => ( is => 'rw', default => sub { [1,10] } ); # row of header, or range to scan
27							has header_row_combine => ( is => 'rw', lazy => 1, builder => 1 );
28							has table_search_results=> ( is => 'rw', lazy => 1, builder => 1, clearer => 1, predicate => 1 );
29							has col_map => ( is => 'rw', lazy => 1, builder => 1, predicate => 1 );
30							has on_partial_match => ( is => 'rw', default => sub { 'next' } );
31							has on_ambiguous_columns=> ( is => 'rw', default => sub { 'error' } );
32							has on_unknown_columns => ( is => 'rw', default => sub { 'warn' } );
33							has on_blank_row => ( is => 'rw', default => sub { 'next' } );
34							has on_validation_error => ( is => 'rw', default => sub { 'die' } );
35							has log => ( is => 'rw', trigger => sub { shift->_clear_log } );
36
37							sub BUILD {
38	23			23	0	152	my ($self, $args)= @_;
39							# If user supplied col_map, it probably contains names instead of Field objects.
40	23	100				98	if ($self->has_col_map) {
41							# Make a new array in case other parts of user code refer to current one
42	4					9	$self->col_map($self->_resolve_colmap_names([ @{ $self->col_map } ]));
	4					86
43							}
44							# Back-compat for previous API
45	23	100				203	if (defined (my $act= $args->{on_validation_fail})) {
46							croak "on_validation_fail (back-compat alias) conflicts with on_validation_error"
47	3	50				5	if defined $args->{on_validation_error};
48	3					7	$self->on_validation_fail($act);
49							}
50							}
51
52							sub on_validation_fail {
53	3			3	0	4	my $self= shift;
54	3	50				5	if (@_) {
55	3					6	my $act= shift;
56							#warn "on_validation_fail is deprecated (see on_validation_error)";
57							# adapt for the old API
58	3	100				7	$act= _wrap_on_validation_fail($act) if ref $act eq 'CODE';
59	3					19	return $self->on_validation_error($act);
60							}
61	0					0	return $self->on_validation_error;
62							}
63
64							# Modifies array to replace name with field ref
65							sub _resolve_colmap_names {
66	4			4		47	my ($self, $col_map)= @_;
67	4		66			39	for (grep defined && !ref, @$col_map) {
68	7	50				142	defined(my $f= $self->field_by_name->{$_})
69							or croak("col_map specifies non-existent field '$_'");
70	7					123	$_= $f;
71							}
72	4					86	$col_map;
73							}
74
75							# Open 'input' if it isn't already a file handle
76							sub _build__file_handle {
77	23			23		232	my $self= shift;
78	23					77	my $i= $self->input;
79	23	50	66			843	return undef if ref($i) && (
			66
80							(blessed($i) && ($i->can('get_cell') \|\| $i->can('worksheets')))
81							or ref($i) eq 'ARRAY'
82							);
83	0	0	0			0	return $i if ref($i) && (ref($i) eq 'GLOB' or ref($i)->can('read'));
			0
84	0	0				0	open(my $fh, '<', $i) or croak "open($i): $!";
85	0					0	binmode $fh;
86	0					0	return $fh;
87							}
88
89							# Create ::Decoder instance either from user-supplied args, or by detecting input format
90							sub _build_decoder {
91	23			23		254	my $self= shift;
92	23					78	my $decoder_arg= $self->_decoder_arg;
93	23					56	my $decoder_ref= ref $decoder_arg;
94	23					51	my ($class, @args);
95	23	50	0			77	if (!$decoder_arg) {
		0
		0
		0
96	23					87	($class, @args)= $self->detect_input_format;
97	23					609	$self->_log->('trace', "Detected input format as %s", $class);
98							}
99							elsif (!$decoder_ref) {
100	0					0	$class= $decoder_arg;
101							}
102							elsif ($decoder_ref eq "HASH" or $decoder_ref eq "ARRAY") {
103	0	0				0	($class, @args)= $decoder_ref eq "ARRAY"? @$decoder_arg : do {
104	0					0	my %tmp= %$decoder_arg;
105	0					0	(delete($tmp{CLASS}), %tmp);
106							};
107	0	0				0	if(!$class) {
108	0					0	my ($input_class, @input_args)= $self->detect_input_format;
109	0	0				0	croak "decoder class not in arguments and unable to identify decoder class from input"
110							if !$input_class;
111	0					0	($class, @args)= ($input_class, @input_args, @args);
112							}
113							}
114							elsif ($decoder_ref->can('iterator')) {
115	0					0	return $decoder_arg;
116							}
117							else {
118	0					0	croak "Can't create decoder from $decoder_ref";
119							}
120	23	50				180	$class= "Data::TableReader::Decoder::$class"
121							unless $class =~ /::/;
122	23	50				177	require_module($class) or croak "$class does not exist or is not installed";
123	23					1244	$self->_log->('trace', 'Creating decoder %s on input %s', $class, $self->input);
124	23	50	50			593	return $class->new(
125							file_name => ($self->input eq ($self->_file_handle\|\|"") ? '' : $self->input),
126							file_handle => $self->_file_handle,
127							_log => $self->_log,
128							@args
129							);
130							}
131
132							# User supplies any old perl data, but this field should always be an arrayref of ::Field
133							sub _coerce_field_list {
134	23			23		489657	my ($list)= @_;
135	23	50	33			200	defined $list and ref $list eq 'ARRAY' or croak "'fields' must be a non-empty arrayref";
136	23					79	my @list= @$list; # clone it, to make sure we don't unexpectedly alter the caller's data
137	23					55	for (@list) {
138	68	100				245	if (!ref $_) {
		100
139	13					311	$_= Data::TableReader::Field->new({ name => $_ });
140							} elsif (ref $_ eq 'HASH') {
141	54					229	my %args= %$_;
142							# "isa" alias for the 'type' attribute
143	54	50	33			178	$args{type}= delete $args{isa} if defined $args{isa} && !defined $args{type};
144	54					1087	$_= Data::TableReader::Field->new(\%args)
145							} else {
146	1	50	33			31	croak "Can't coerce '$_' to a Field object"
147							unless blessed($_) && $_->isa('Data::TableReader::Field');
148							}
149							}
150	23					444	return \@list;
151							}
152
153							sub _build_field_by_name {
154	4			4		42	my $self= shift;
155							# reverse list so first field of a name takes precedence
156	4					8	return { map +( $_->name => $_ ), reverse @{ $self->fields } }
	4					82
157							}
158
159							sub _build_field_by_addr {
160	0			0		0	my $self= shift;
161	0					0	return { map +( refaddr $_ => $_ ), @{ $self->fields } }
	0					0
162							}
163
164							sub _build_header_row_combine {
165	22			22		807	my $self= shift;
166							# If headers contain "\n", we need to collect multiple cells per column
167							# Find the maximum number of \n contained in any regex.
168	22					61	max map { 1+(()= ($_->header_regex =~ /\\n\|\n/g)) } $self->field_list;
	68					1971
169							}
170
171							# 'log' can be a variety of things, but '_log' will always be a coderef
172							has _log => ( is => 'lazy', clearer => 1 );
173							sub _build__log {
174	23			23		623	_log_fn(shift->log);
175							}
176							sub _log_fn {
177	23			23		151	my $dest= shift;
178							!$dest? sub {
179	2			2		9	my ($level, $msg, @args)= @_;
180	2	50	33			9	return unless $level eq 'warn' or $level eq 'error';
181	0	0				0	$msg= sprintf($msg, @args) if @args;
182	0					0	warn $msg."\n";
183							}
184							: ref $dest eq 'ARRAY'? sub {
185	263			263		1953	my ($level, $msg, @args)= @_;
186	263	100	100			3203	return unless $level eq 'warn' or $level eq 'error';
187	34	100				85	$msg= sprintf($msg, @args) if @args;
188	34					253	push @$dest, [ $level, $msg ];
189							}
190							: ref($dest)->can('info')? sub {
191	34			34		612	my ($level, $msg, @args)= @_;
192	34	50				193	$dest->$level( @args? sprintf($msg, @args) : $msg )
		100
193							if $dest->can('is_'.$level)->($dest);
194							}
195	23	50				376	: croak "Don't know how to log to $dest";
		100
		100
196							}
197
198
199							sub detect_input_format {
200	23			23	1	64	my ($self, $filename, $magic)= @_;
201
202	23					111	my $input= $self->input;
203							# As convenience to spreadsheet users, let input be a parsed workbook/worksheet object.
204	23	100	66			384	return ('XLSX', sheet => $input)
205							if ref($input) && ref($input)->can('get_cell');
206	22	50	33			198	return ('XLSX', workbook => $input)
207							if ref($input) && ref($input)->can('worksheets');
208							# Convenience for passing already-parsed data
209	22	50				76	if (ref($input) eq 'ARRAY') {
210							# if user supplied single table of data, wrap it in an array to make an array of tables.
211							$input= [ $input ]
212							if @$input && ref($input->[0]) eq 'ARRAY'
213	22	100	33			222	&& @{$input->[0]} && ref($input->[0][0]) ne 'ARRAY';
	22		33			154
			66
214	22					129	return ('Mock', datasets => $input);
215							}
216
217							# Load first block of file, unless supplied
218	0					0	my $fpos;
219	0	0				0	if (!defined $magic) {
220	0					0	my $fh= $self->_file_handle;
221							# Need to be able to seek.
222	0	0				0	if (seek($fh, 0, 1)) {
		0
223	0					0	$fpos= tell $fh;
224	0					0	read($fh, $magic, 4096);
225	0	0				0	seek($fh, $fpos, 0) or croak "seek: $!";
226							}
227							elsif ($fh->can('ungets')) {
228	0					0	$fpos= 0; # to indicate that we did try reading the file
229	0					0	read($fh, $magic, 4096);
230	0					0	$fh->ungets($magic);
231							}
232							else {
233	0					0	$self->_log->('notice',"Can't fully detect input format because handle is not seekable."
234							." Consider fully buffering the file, or using FileHandle::Unget");
235	0					0	$magic= '';
236							}
237							}
238
239							# Excel is obvious so check it first. This handles cases where an excel file is
240							# erroneously named ".csv" and sillyness like that.
241	0	0				0	return ( 'XLSX' ) if $magic =~ /^PK(\x03\x04\|\x05\x06\|\x07\x08)/;
242	0	0				0	return ( 'XLS' ) if $magic =~ /^\xD0\xCF\x11\xE0/;
243
244							# Else trust the file extension, because TSV with commas can be very similar to CSV with
245							# tabs in the data, and some crazy person might store an HTML document as the first element
246							# of a CSV file.
247							# Detect filename if not supplied
248	0	0				0	if (!defined $filename) {
249	0					0	$filename= '';
250	0	0	0			0	$filename= "$input" if defined $input and (!ref $input \|\| ref($input) =~ /path\|file/i);
			0
251							}
252	0	0				0	if ($filename =~ /\.([^.]+)$/) {
253	0					0	my $suffix= uc($1);
254	0	0				0	return 'HTML' if $suffix eq 'HTM';
255	0					0	return $suffix;
256							}
257
258							# Else probe some more...
259	0					0	$self->_log->('debug',"Probing file format because no filename suffix");
260	0	0				0	length $magic or croak "Can't probe format. No filename suffix, and "
		0
261							.(!defined $fpos? "unseekable file handle" : "no content");
262
263							# HTML is pretty obvious
264	0	0				0	return 'HTML' if $magic =~ /^(\xEF\xBB\xBF\|\xFF\xFE\|\xFE\xFF)?<(!DOCTYPE )HTML/i;
265							# Else guess between CSV and TSV
266	0					0	my ($probably_csv, $probably_tsv)= (0,0);
267	0	0				0	++$probably_csv if $magic =~ /^(\xEF\xBB\xBF\|\xFF\xFE\|\xFE\xFF)?["']?[\w ]+["']?,/;
268	0	0				0	++$probably_tsv if $magic =~ /^(\xEF\xBB\xBF\|\xFF\xFE\|\xFE\xFF)?["']?[\w ]+["']?\t/;
269	0					0	my $comma_count= () = ($magic =~ /,/g);
270	0					0	my $tab_count= () = ($magic =~ /\t/g);
271	0					0	my $eol_count= () = ($magic =~ /\n/g);
272	0	0	0			0	++$probably_csv if $comma_count > $eol_count and $comma_count > $tab_count;
273	0	0	0			0	++$probably_tsv if $tab_count > $eol_count and $tab_count > $comma_count;
274	0					0	$self->_log->('debug', 'probe results: comma_count=%d tab_count=%d eol_count=%d probably_csv=%d probably_tsv=%d',
275							$comma_count, $tab_count, $eol_count, $probably_csv, $probably_tsv);
276	0	0	0			0	return 'CSV' if $probably_csv and $probably_csv > $probably_tsv;
277	0	0	0			0	return 'TSV' if $probably_tsv and $probably_tsv > $probably_csv;
278	0					0	croak "Can't determine file format";
279							}
280
281
282							sub _build_table_search_results {
283	6			6		49	my $self= shift;
284	6					136	my $result= $self->_find_table($self->decoder->iterator);
285							# When called during lazy-build, not finding the table is fatal
286	6	50				16	if (!$result->{found}) {
287	0		0			0	my $err= $$result->{fatal} \|\| "Can't locate valid header";
288	0					0	$self->_log->('error', $err);
289	0					0	croak $err;
290							}
291	6					25	$result;
292							}
293
294							sub _build_col_map {
295							shift->table_search_results->{found}{col_map}
296	1			1		544	}
297
298							sub find_table {
299	16			16	1	9329	my $self= shift;
300	16					588	my $result= $self->_find_table($self->decoder->iterator);
301	16					376	$self->table_search_results($result);
302	16					238	return defined $result->{found};
303							}
304
305	11			11	1	9924	sub field_map { _field_map(shift->col_map) }
306
307							sub _field_map {
308	35			35		142	my $col_map= shift;
309	35					60	my %fmap;
310	35					127	for my $i (0 .. $#$col_map) {
311	165	100				354	next unless defined $col_map->[$i];
312	140	100				375	if ($col_map->[$i]->array) {
313	46					70	push @{ $fmap{$col_map->[$i]->name} }, $i;
	46					136
314							} else {
315	94					284	$fmap{$col_map->[$i]->name}= $i;
316							}
317							}
318	35					156	return \%fmap;
319							}
320
321							sub _find_table {
322	22			22		84	my ($self, $data_iter)= @_;
323							# $stash \|\|= {};
324							# while (1) {
325							# $success= $self->_find_table_in_dataset($data_iter, $stash);
326							# && !defined $stash->{fatal}
327							# && $data_iter->next_dataset
328							# ) {}
329							# if ($success) {
330							# # And record the stream position of the start of the table
331							# $self->col_map($stash->{col_map});
332							# $stash->{first_record_pos}= $data_iter->tell;
333							# $stash->{data_iter}= $data_iter;
334							# return $stash;
335							# }
336							# else {
337							# my $err= $stash->{fatal} \|\| "Can't locate valid header";
338							# $self->_log->('error', $err);
339							# croak $err if $stash->{croak_on_fail};
340							# return undef;
341							# }
342	22					74	my @fields= $self->field_list;
343	22					208	my $header_at= $self->header_row_at;
344	22					77	my %result;
345
346							# Special case for the file not having any headers in it.
347							# If header_row_at is undef, then there is no header.
348							# Ensure static_field_order, then set up columns.
349	22	50				85	if (!defined $header_at) {
350	0	0				0	unless ($self->static_field_order) {
351	0					0	$result{fatal}= "You must enable 'static_field_order' if there is no header row";
352	0					0	return;
353							}
354	0	0				0	my $col_map= [ $self->has_col_map? @{$self->col_map} : @fields ];
	0					0
355							$result{found}= {
356	0					0	row_idx => -1,
357							row => undef,
358							dataset_idx => 0,
359							col_map => $col_map,
360							messages => [],
361							first_record_pos => $data_iter->tell,
362							_data_iter => $data_iter,
363							};
364	0					0	$result{candidates}= [ $result{found} ];
365	0					0	return \%result;
366							}
367
368	22					38	my $dataset_idx= 0;
369	22					45	dataset: do {
370							# If headers contain "\n", we need to collect multiple cells per column
371	23					468	my $row_accum= $self->header_row_combine;
372
373	23	100				139	my ($start, $end)= ref $header_at? @$header_at : ( $header_at, $header_at );
374	23					44	my @rows;
375
376							# If header_row_at doesn't start at 1, seek forward
377	23	100				83	if ($start > 1) {
378	1					25	$self->_log->('trace', 'Skipping to row %s', $start);
379	1					9	push @rows, $data_iter->() for 1..$start-1;
380							}
381
382							# Scan through the rows of the dataset up to the end of header_row_at, accumulating rows so that
383							# multi-line regexes can match.
384	23					77	for my $row ($start .. $end) {
385	35					213	my %attempt= (
386							row_idx => $row-1,
387							row => $row,
388							dataset_idx => $dataset_idx,
389							messages => []
390							);
391	35					160	my $vals= $data_iter->();
392	35	100				105	if (!$vals) { # if undef, we reached end of dataset
393	2					50	$self->_log->('trace', 'EOF');
394	2					10	last;
395							}
396	33	100				89	if ($row_accum > 1) {
397	5					11	push @rows, $vals;
398	5					17	shift @rows while @rows > $row_accum;
399	5					14	$vals= [ map { my $c= $_; join("\n", map $_->[$c], @rows) } 0 .. $#{$rows[-1]} ];
	28					39
	28					109
	5					14
400	5					23	$attempt{context}= $row_accum.' rows ending at '.$data_iter->position;
401							} else {
402	28					105	$attempt{context}= $data_iter->position;
403							}
404	33					851	$self->_log->('trace', 'Checking for headers on %s', $attempt{context});
405							# Now fill-in the col_map
406	33	50				211	my $found= $self->static_field_order?
407							# If static field order, look for headers in sequence
408							$self->_match_headers_static($vals, \%attempt)
409							# else search for each header
410							: $self->_match_headers_dynamic($vals, \%attempt);
411	33					143	$attempt{first_record_pos}= $data_iter->tell;
412	33					85	$self->_log->(@$_) for @{$attempt{messages}};
	33					932
413	33					69	push @{$result{candidates}}, \%attempt;
	33					102
414	33	100				122	if ($found) {
415	21					54	$result{found}= \%attempt;
416	21					54	$result{found}{_data_iter}= $data_iter;
417	21					439	$self->col_map($attempt{col_map});
418	21					500	$self->_log->(info => 'Found header at '.$attempt{context});
419	21					1829	return \%result;
420							} else {
421							# Back-compat: if attempt ends with 'fatal' message, stop looking for header
422							last dataset
423	12	50				48	if delete $attempt{fatal};
424							# Was this a partial match? See if any col_map entries were added vs. what user already gave us.
425							my $initial_colmap_count= !$self->has_col_map? 0
426	12	100				48	: scalar(grep defined, @{$self->col_map});
	3					73
427	12	100				63	if ($initial_colmap_count < scalar(grep defined, @{$attempt{col_map}})) {
	12					45
428							# Handling of partial match determined by on_partial_match setting
429	2					6	my $act= $self->on_partial_match;
430	2	50				8	$act= $act->($self, \%attempt) if ref $act eq 'CODE';
431							last dataset
432	2	50				7	if $act eq 'last';
433							}
434							}
435	12					276	$self->_log->('debug', '%s: No match', $attempt{context});
436							}
437	2					43	$self->_log->('error','No row in dataset matched full header requirements');
438	2					13	++$dataset_idx;
439							} while ($data_iter->next_dataset);
440	1					4	return \%result;
441							}
442
443							# This mode assumes all headers match exactly as perscribed in the fields list or user-supplied col_map
444							sub _match_headers_static {
445	0			0		0	my ($self, $header, $attempt)= @_;
446	0	0				0	my @col_map= $self->has_col_map? @{$self->col_map} : @{$self->fields};
	0					0
	0					0
447	0					0	$attempt->{col_map}= \@col_map;
448	0					0	for my $i (0 .. $#col_map) {
449	0	0				0	next unless defined $col_map[$i];
450	0	0				0	next if $header->[$i] =~ $col_map[$i]->header_regex;
451							# Field header doesn't match. Start over on next row.
452	0					0	push @{$attempt->{messages}}, [ error => "Header at column $i does not look like field ".$col_map[$i]->name ];
	0					0
453	0					0	return 0;
454							}
455							# found a match for every field!
456	0					0	$self->_log->('debug','%s: Found!', $attempt->{context});
457	0					0	return 1;
458							}
459
460							sub _match_headers_dynamic {
461	33			33		73	my ($self, $header, $attempt)= @_;
462	33					130	my $context= $attempt->{context};
463	33					709	my $fields= $self->fields;
464							# Colmap starts empty unless user supplied one
465	33	100				362	my $user_colmap= $self->has_col_map? $self->col_map : [];
466	33	100				194	my @colmap= map +(defined $_? [ $_ ] : undef), @$user_colmap;
467	33					89	$attempt->{col_map}= \@colmap;
468							# Search every cell of the header, except ones specified by the user
469	33					172	my @col_search_idx= grep !defined $user_colmap->[$_], 0 .. $#$header;
470							# Divide remaining fields (not specified by user) into list that can only occur following
471							# another field, or the ones that can occur anywhere.
472	33					110	my %seen= map +(refaddr $_ => 1 ), grep defined, @$user_colmap;
473	33					63	my (@follows_fields, @free_fields);
474	97	100				252	defined $seen{refaddr $_} or push @{($_->follows_list? \@follows_fields : \@free_fields)}, $_
475	33		66			159	for @$fields;
476	33					76	undef %seen;
477							# Sort required fields to front, to fail faster on non-matching rows
478							# But otherwise preserve field order in case it matters for priority of matching
479	33					203	@free_fields= ( (grep $_->required, @free_fields), (grep !$_->required, @free_fields) );
480
481							# For each freely-located field (free = lacking placement requirements) scan every un-used
482							# column for a match. Record all matches, for later analysis of ambiguity.
483							# But, stop as soon as a required column is missing; it helps speed up the search for the
484							# header.
485	33	50				126	my $ambiguous_log_level= $self->on_ambiguous_columns eq 'error'? 'error' : 'warn';
486	33					62	my %fieldname_cols;
487	33					64	for my $f (@free_fields) {
488	74					1929	my $hr= $f->header_regex;
489	74					516	push @{$attempt->{messages}}, [ trace => "looking for $hr" ];
	74					345
490	74					920	my @found_idx= grep $header->[$_] =~ $hr, @col_search_idx;
491	74					119	push @{$attempt->{messages}}, [ debug => "found ".$f->name." header at col [".join(',', map $_+1, @found_idx).']' ];
	74					411
492	74					167	for my $idx (@found_idx) {
493							# If another field of the same name matches a column, the first gets priority.
494							# ignore the duplicate.
495	68	50				239	if ($fieldname_cols{$f->name}{$idx}) {
496	0					0	push @{$attempt->{messages}}, [ debug => "Ignored; column $idx is already claimed by a field named ".$f->name ];
	0					0
497							} else {
498	68					97	push @{$colmap[$idx]}, $f;
	68					155
499							#$col_fieldnames{$idx}{$f->name}= 1;
500	68					246	$fieldname_cols{$f->name}{$idx}= $f;
501							}
502							}
503							# Flag missing required fields
504	74	100	100			377	if (!@found_idx && $f->required) {
505	9					15	push @{$attempt->{missing_required}}, $f;
	9					38
506	9					15	push @{$attempt->{messages}}, [ error => 'No match for required field '.$f->name ];
	9					34
507							# Missing required fields probably means this isn't he header row, or the input is
508							# garbage, so might as well stop here before genering a bunch of analysis.
509	9					27	last;
510							}
511							}
512							# Now, check for any of the 'follows' fields, some of which might also be 'required'.
513	33	100	100			105	if (@follows_fields && !$attempt->{missing_required}) {
514	2					5	my %following;
515							my %found;
516	2					9	for my $idx (0 .. $#$header) {
517	17	100	66			44	if ($colmap[$idx] && 1 == @{$colmap[$idx]}) {
	5					18
518	5					23	%following= ( $colmap[$idx][0]->name => $colmap[$idx][0] );
519							} else {
520	12					25	my $val= $header->[$idx];
521	12					25	for my $f (@follows_fields) {
522	18	100				51	next unless grep $following{$_}, $f->follows_list;
523	9	100				219	next unless $val =~ $f->header_regex;
524							# If another field of the same name matches a column, the first gets priority.
525							# ignore the duplicate.
526	7	50				91	if ($fieldname_cols{$f->name}{$idx}) {
527	0					0	push @{$attempt->{messages}}, [ debug => "Ignored; column $idx is already claimed by a field named ".$f->name ];
	0					0
528							} else {
529	7					12	push @{$colmap[$idx]}, $f;
	7					20
530	7					21	$fieldname_cols{$f->name}{$idx}= $f;
531	7					21	$found{refaddr $f}= 1;
532							}
533							}
534							# If successfully matched exactly one field, add it to the 'following' set.
535	12	100	66			71	if ($colmap[$idx] && @{$colmap[$idx]} == 1) {
	7					22
536	7					24	$following{$colmap[$idx][0]->name}= $colmap[$idx][0];
537							}
538							# Else if no matches, or ambiguous, so reset the following set
539							else {
540	5					15	%following= ();
541							}
542							}
543							}
544							# Check if any of the 'follows' fields were required
545	2	50	33			23	if (my @unfound= grep +($_->required && !$found{refaddr $_}), @follows_fields) {
546	0					0	push @{$attempt->{missing_required}}, @unfound;
	0					0
547	0	0				0	push @{$attempt->{messages}}, [ error =>
	0					0
548							sprintf('No match for required %s [%s]',
549							(@unfound > 1? 'fields':'field'),
550							join(', ', map $_->name, sort @unfound))
551							];
552							}
553							}
554
555							# Make the list of columns which didn't match anything before starting to munge things
556							# related to ambiguities.
557	33					162	my @unmatched= grep !defined $colmap[$_], 0 .. $#$header;
558	33	100				126	$attempt->{unmatched}= \@unmatched if @unmatched;
559
560							# Ambiguity check: each field name may only be located in one column, unless
561							# the field(s) are flagged as being arrays.
562	33					63	my %ambiguous_fields;
563	33					171	for my $name (sort keys %fieldname_cols) {
564	57					114	my $cols= $fieldname_cols{$name};
565	57	100				202	next unless keys %$cols > 1;
566	7	50				51	next unless grep !$_->array, values %$cols;
567	0					0	push @{$attempt->{messages}}, [ $ambiguous_log_level =>
568							sprintf "Found field '%s' at multiple columns: %s",
569	0					0	$name, join(', ', map 1+$_, sort { $a <=> $b } keys %$cols)
	0					0
570							];
571	0					0	$ambiguous_fields{$name}= $cols;
572							}
573	33	50				94	$attempt->{ambiguous_fields}= \%ambiguous_fields
574							if keys %ambiguous_fields;
575
576							# Ambiguity check: there must be only one field claiming each column
577							# If it's OK, resolve the arrayref down to its single member.
578	33					60	my $col_collision= 0;
579	33					119	for my $idx (0 .. $#colmap) {
580	100	100				231	next unless defined $colmap[$idx];
581	84	50				116	if (@{$colmap[$idx]} == 1) { # only claimed by one field
	84					201
582	84					143	my $f= $colmap[$idx][0];
583							# but if that one field is ambiguous, discard it
584	84	50				287	$colmap[$idx]= $ambiguous_fields{$f->name}? undef : $f;
585							} else {
586	0					0	push @{$attempt->{messages}}, [ $ambiguous_log_level =>
587							sprintf "Column %d claimed by multiple fields: %s",
588	0					0	$idx+1, join(', ', sort map $_->name, @{$colmap[$idx]})
	0					0
589							];
590	0					0	$attempt->{ambiguous_columns}{$idx}= $colmap[$idx];
591	0					0	$colmap[$idx]= undef;
592							}
593							}
594
595							# Need to have found at least one column (even if none required)
596	33	100				134	unless (grep defined, @colmap) {
597	7					43	push @{$attempt->{messages}}, [
598							error => ($attempt->{ambiguous_columns} \|\| $attempt->{ambiguous_fields})
599	7	50	33			10	? 'All matching headers were ambiguous'
600							: 'No field headers matched'
601							];
602	7					34	return 0;
603							}
604
605							return 0
606							if $attempt->{missing_required}
607							or $self->on_ambiguous_columns ne 'warn'
608	26	100	33			303	&& ($attempt->{ambiguous_columns} \|\| $attempt->{ambiguous_fields});
			33
			66
609
610							# Now, if there are any un-claimed columns, handle per 'on_unknown_columns' setting.
611	24	100				62	if (@unmatched) {
612	9					34	my $act= $self->on_unknown_columns;
613	9					40	my $unknown_list= join(', ', map $self->_fmt_header_text($header->[$_]), @unmatched);
614	9	50				34	$act= $act->($self, $header, \@unmatched) if ref $act eq 'CODE';
615	9	100	66			71	if ($act eq 'warn' \|\| $act eq 'use') { # 'use' is back-compat, 'warn' is official now.
		50	33
616	6					12	push @{$attempt->{messages}}, [ warn => 'Ignoring unknown columns: '.$unknown_list ];
	6					29
617							} elsif ($act eq 'error' \|\| $act eq 'next') { # 'next' is back-compat, 'error' is official now.
618	3					5	push @{$attempt->{messages}}, [ error => 'Would match except for unknown columns: '.$unknown_list ];
	3					13
619	3					41	return 0;
620							} else {
621	0	0				0	push @{$attempt->{messages}}, [ error =>
	0					0
622							$act eq 'die'? "${context}Header row includes unknown columns: $unknown_list"
623							: "Invalid action '$act' for 'on_unknown_columns'"
624							];
625	0					0	$attempt->{fatal}= 1;
626	0					0	return 0;
627							}
628							}
629	21					142	return 1;
630							}
631							# Make header string readable for log messages
632							sub _fmt_header_text {
633	25	50		25		63	shift if ref $_[0];
634	25					50	my $x= shift;
635	25					64	$x =~ s/ ( [^[:print:]] ) / sprintf("\\x%02X", ord $1 ) /gex;
	4					18
636	25					124	qq{"$x"};
637							}
638							# format the colmap into a string
639							sub _colmap_progress_str {
640	0			0		0	my ($colmap, $headers)= @_;
641							join(' ', map {
642	0	0				0	$colmap->{$_}? $_.'='.$colmap->{$_}->name
	0					0
643							: $_.':'._fmt_header_text($headers->[$_])
644							} 0 .. $#$headers)
645							}
646
647
648							sub iterator {
649	24			24	1	8987	my $self= shift;
650	24					829	my $fields= $self->fields;
651							$self->table_search_results->{found}
652	24	50				741	or croak "table_search_results does not contain 'found'";
653							# Creating the record iterator consumes the data source's iterator.
654							# The first time after detecting the table, we continue with the same iterator.
655							# Every time after that we need to create a new data iterator and seek to the
656							# first record under the header.
657	24					617	my $data_iter= delete $self->table_search_results->{found}{_data_iter};
658	24	100				195	unless ($data_iter) {
659	3					67	$data_iter= $self->decoder->iterator;
660	3					135	$data_iter->seek($self->table_search_results->{found}{first_record_pos});
661							}
662
663							# The goal for this iterator is to perform as little work as possible on each iteration,
664							# while making all the features possible, but avoiding building custom perl code with 'eval'.
665							# To that end, most of the operations get vectorized and stored in array variables that
666							# get closed-over by the iterator function.
667							#
668							# The iterator algorithm goes like this:
669							#
670							# - Collect a slice of the next row from the Decoder, selecting only the columns we need.
671							# i.e. @row_slice= @decoder_row[@slice_idx];
672							# - Run trim functions on each value that needs trimmed.
673							# i.e. $row_slice[$i]= $trim[$i]->($row_slice[$i]) if defined $row_slice[$i]
674							# - Apply "blank" value to any value that is zero length
675							# i.e. $row_slice[$i]= $blank_value[$i] unless length $row_slice[$i]
676							# - Special handling if the entire row is blank
677							# - If user wanted validation, do type checks on each relevant value
678							# i.e. $type->validate($row_slice[$i])
679							# - Assemble array-valued fields into a single arrayref value
680							# i.e. $row_slice[$from]= [ splice(@row_slice, $from, $to, undef) ];
681							# - If user wanted an array matching the Field order, alter @row_slice to match
682							# - else if user wanted a hashref, build one,
683							# - If validations failed, run user callback
684							# - if user wanted an object, construct one
685
686	24					435	my $col_map= $self->table_search_results->{found}{col_map};
687	24					163	my $field_map= _field_map($col_map);
688	24					196	my @input_slice; # list of column idx to retrieve from input
689							my @output_slice; # list of column idx to store in output, for record_class=>'ARRAY'
690	24					0	my @output_keys; # list of hash key names where values get stored
691	24					0	my @array_ranges; # list of value indices that get bundled into an arrayref
692	24					0	my @blank_val; # blank value per each fetched column
693	24					0	my @trim; # list of trim functions and the value indicies they should be applied to
694	24					0	my @type_check; # list of validation coderefs that should be applied
695	24					0	my $class; # optional object class to construct for the resulting rows
696	24					0	my ($n_blank, $first_blank, $eof);
697							my $sub= sub {
698	74			74		2287	my (@failed, $out, $vals);
699	79	100	50			340	again:
			66
700							# Pull the specific slice of the next row that we need
701							$vals= !$eof && $data_iter->(\@input_slice)
702							or ++$eof && return undef;
703							# Apply 'trim' to any column whose field requested it
704	56					113	for my $t (@trim) {
705	58		66			94	defined and $t->[0]->() for @{$vals}[@{$t->[1]}];
	58					245
	58					99
706							}
707							# Apply 'blank value' to every column which is zero length
708	56					89	$n_blank= 0;
709							(defined $vals->[$_] and length $vals->[$_]) or (++$n_blank, $vals->[$_]= $blank_val[$_])
710	56		100			555	for 0..$#$vals;
			66
711							# If all are blank, then handle according to $on_blank_row setting
712	56	100				201	if ($n_blank == @$vals) {
		100
713	2		33			15	$first_blank \|\|= $data_iter->row;
714	2					13	goto again;
715							} elsif ($first_blank) {
716							# At the end of a series of blank rows, run the callback to decide what to do
717	2	50				7	unless ($self->_handle_blank_row($data_iter, $data_iter->row - $first_blank)) {
718	0					0	$eof= 1;
719	0					0	return undef;
720							}
721	2					5	$first_blank= undef;
722							}
723							# Check type constraints, if any
724	54					90	@failed= ();
725	54					103	push @failed, $_->($vals) for @type_check;
726							# Combine each set of array-valued fields into an arrayref
727	54					136	$vals->[$_->[0]]= [ splice @$vals, $_->[0], $_->[1], undef ] for @array_ranges;
728							# Generate the output structure
729	52					72	$out= @output_keys? do { my %out; @out{@output_keys}= @$vals; \%out }
	52					175
	52					113
730	54	50				100	: @output_slice? do { my @out; $#out= $#$fields; @out[@output_slice]= @$vals; \@out }
	2	100				3
	2					5
	2					6
	2					4
731							: $vals;
732							# Handle any validation errors detected above
733	54	100				112	if (@failed) {
734	11	100				36	$self->_handle_validation_error(\@failed, $out, $data_iter)
735							or goto again;
736							}
737							# Construct a class, if requested, else return hashref
738	50	50				340	return $class? $class->new($out) : $out;
739	24					203	};
740
741							# User wants arrayref output, with one element per field?
742	24	100				134	if ($self->record_class eq 'ARRAY') {
743							# If two fields share a name, only the first one gets the value(s).
744	1					4	my %remaining= %$field_map;
745	1					3	my $need_output_slice= 0;
746	1					2	for my $field_idx (0 .. $#$fields) {
747	4					7	my $f= $fields->[$field_idx];
748	4					12	my $src= delete $remaining{$f->name};
749							# If this field has a source, add it to the input slice and output slice
750	4	100				12	if (defined $src) {
		50
751	3	50				7	push @output_slice, $field_idx
752							if $need_output_slice;
753	3					7	push @input_slice, $src;
754							}
755							# output_slice isn't needed until the first field that doesn't have a source
756							elsif (!$need_output_slice) {
757	1					3	$need_output_slice= 1;
758	1					6	@output_slice= ( 0 .. $field_idx-1 );
759							}
760							}
761							} else {
762							# For any other record_class, we are building a hashref
763							# Only set the 'class' variable if we also need to construct an object.
764	23	50				88	$class= $self->record_class
765							unless 'HASH' eq $self->record_class;
766	23					104	@input_slice= values %$field_map;
767	23					108	@output_keys= keys %$field_map;
768							}
769
770	24					68	my %trimmer;
771	24					127	for (my ($i, $out_ofs, $array_start, $array_lim)= (0,0); $i <= $#input_slice; $i++) {
772	86					158	my $col_idx= $input_slice[$i];
773	86	100				773	if (ref $col_idx eq 'ARRAY') {
		100
774	7					17	$array_start= $i;
775	7					16	$array_lim= $array_start + @$col_idx;
776	7					28	splice(@input_slice, $i, 1, @$col_idx);
777	7					23	push @array_ranges, [ $i+$out_ofs, scalar @$col_idx ];
778	7					17	$col_idx= $col_idx->[0];
779							} elsif (defined $array_lim) {
780	24	100				56	if ($i >= $array_lim) {
781	6					12	$array_start= $array_lim= undef;
782							} else {
783	18					30	--$out_ofs; # each iteration within an array increases the offset
784							}
785							}
786	86					149	my $field= $col_map->[$col_idx];
787							# Handling for ->trim feature
788	86	100				2011	if (my $t= $field->trim_coderef) {
789	85		100			466	$trimmer{refaddr $t} \|\|= [ $t, [] ];
790	85					126	push @{ $trimmer{refaddr $t}[1] }, $i;
	85					222
791							}
792							# Handling for ->blank feature
793	86					251	push @blank_val, $field->blank;
794							# Handling for ->type and ->coerce features
795	86	100				397	if ($field->type) {
796							# @path is needed to show the on_validation_fail callback where to find the value in
797							# the output. First element of @path is either ->{$name} or ->[$idx] depending whether
798							# the output is an array or hashref. Second element only happens if that field's value
799							# is an arrayref.
800	10					55	my $output_idx= $i + $out_ofs;
801	10	0				34	my @path= (
		50
802							@output_keys? $output_keys[$output_idx]
803							: @output_slice? $output_slice[$output_idx]
804							: $output_idx
805							);
806	10	100				25	push @path, ($i - $array_start)
807							if defined $array_start;
808	10					31	push @type_check, $self->_make_validation_check_coderef($field, $i, \@path);
809							}
810							}
811	24					100	@trim= values %trimmer;
812
813	24					231	return Data::TableReader::_RecIter->new(
814							$sub, { data_iter => $data_iter, reader => $self },
815							);
816							}
817
818							sub _make_validation_check_coderef {
819	10			10		22	my ($self, $field, $vals_idx, $out_path)= @_;
820	10					19	my $t= $field->type;
821	10					21	my $c= $field->coerce;
822	10		66			33	my $t_can_validate= blessed($t) && $t->can('validate');
823							# If type object has method 'coerce' but does not have method 'has_coercions', just run it.
824							# But, if has_coercions is false, then there's no point in running it.
825	10		66			61	my $t_can_coerce= blessed($t) && ($t->can('has_coercions')? $t->has_coercions : $t->can('coerce'));
826
827							# There are 5 possibilities for the callback:
828							# type is a coderef, and coerce is false
829							ref $t eq 'CODE'? (
830							!$c? sub {
831	30			30		62	my $e= $t->($_[0][$vals_idx]);
832	30	100				155	defined $e? ([ $field, undef, $e, $out_path ]) : ()
833							}
834							# type is a coderef, and coerce is a coderef
835							: ref $c eq 'CODE'? sub {
836	0			0		0	my $e= $t->($_[0][$vals_idx]);
837	0	0				0	if (defined $e) {
838	0					0	my $tmp= $c->($_[0][$vals_idx]);
839	0	0				0	($_[0][$vals_idx], $e)= ($tmp) unless defined $t->($tmp);
840							}
841	0	0				0	defined $e? ([ $field, undef, $e, $out_path ]) : ()
842							}
843							: croak("Can't coerce field ".$field->name.": ->type is coderef and ->coerce is not a coderef")
844							)
845							# type is a Type::Tiny, and coerce is a coderef
846							: $t_can_validate? (
847							ref $c eq 'CODE'? sub {
848	0			0		0	my $e= $t->validate($_[0][$vals_idx]);
849	0	0				0	if (defined $e) {
850	0					0	my $tmp= $c->($_[0][$vals_idx]);
851	0	0				0	($_[0][$vals_idx], $e)= ($tmp) unless defined $t->validate($tmp);
852							}
853	0	0				0	defined $e? ([ $field, undef, $e, $out_path ]) : ()
854							}
855							# type is a Type::Tiny, and coerce is requested and is available from the type object
856							: $c && $t_can_coerce? sub {
857	5			5		14	my $e= $t->validate($_[0][$vals_idx]);
858	5	50				340	if (defined $e) {
859	5					18	my $tmp= $t->coerce($_[0][$vals_idx]);
860	5	100				1476	($_[0][$vals_idx], $e)= ($tmp) unless defined $t->validate($tmp);
861							}
862	5	100				198	defined $e? ([ $field, undef, $e, $out_path ]) : ()
863							}
864							# type is a Type::Tiny, and coerce is not requested or not available
865							: sub {
866	10			10		34	my $e= $t->validate($_[0][$vals_idx]);
867	10	100				155	defined $e? ([ $field, undef, $e, $out_path ]) : ()
868							}
869							)
870	10	0	66			304	: croak "Invalid type constraint $t on field ".$field->name;
		50
		100
		50
		50
		100
871							}
872
873							sub _handle_blank_row {
874	2			2		6	my ($self, $data_iter, $count)= @_;
875	2					6	my $last= $data_iter->row - 1;
876	2					6	my $first= $last - $count + 1;
877	2					8	my $act= $self->on_blank_row;
878	2	50				8	$act= $act->($self, $first, $last)
879							if ref $act eq 'CODE';
880	2	50				6	if ($act eq 'next') {
881	2	50				69	$self->_log->('warn', $first == $last?
882							( 'Skipping blank row at %s', $first )
883							: ('Skipping blank rows from %s until %s', $first, $last )
884							);
885	2					8	return 1;
886							}
887	0	0				0	if ($act eq 'last') {
888	0					0	$self->_log->('warn', 'Ending at blank row %s', $first);
889	0					0	return 0;
890							}
891	0	0				0	if ($act eq 'die') {
892	0					0	my $msg= "Encountered blank rows at $first..$last";
893	0					0	$self->_log->('error', $msg);
894	0					0	croak $msg;
895							}
896	0					0	croak "Invalid value for 'on_blank_row': \"$act\"";
897							}
898
899							sub _handle_validation_error {
900	11			11		23	my ($self, $failures, $output, $data_iter)= @_;
901	11					32	my $act= $self->on_validation_error;
902	11	100				27	if (ref $act eq 'CODE') {
903							# Fill in the second element (ref to $output) for each failure
904	6					12	for (@$failures) {
905	8					33	my $path= $_->[3];
906	8	50				27	my $ref= ref $output eq 'HASH'? \$output->{$path->[0]} : \$output->[$path->[0]];
907	8	100				23	$ref= \(${$ref}->[$path->[1]]) if @$path > 1;
	2					5
908	8					18	$_->[1]= $ref;
909							}
910	6					22	$act= $act->($self, $failures, $output, $data_iter);
911							}
912	11					153	my $errors= join(', ', map $_->[0]->name.': '.$_->[2], @$failures);
913	11	100				26	if ($act eq 'next') {
914	3	50				7	$self->_log->('warn', "%s: Skipped for data errors: %s", $data_iter->position, $errors) if $errors;
915	3					10	return 0;
916							}
917	8	100				34	if ($act eq 'use') {
918	7	100				26	$self->_log->('warn', "%s: Possible data errors: %s", $data_iter->position, $errors) if $errors;
919	7					47	return 1;
920							}
921	1	50				13	if ($act eq 'die') {
922	1					4	my $msg= $data_iter->position.": Invalid record: $errors";
923	1					41	$self->_log->('error', $msg);
924	1					122	croak $msg;
925							}
926							}
927
928							# This is back-compat for the previous callback API which was an attribute named 'on_validation_fail'
929							sub _wrap_on_validation_fail {
930	1			1		2	my $orig_cb= shift;
931							return sub {
932	1			1		2	my ($self, $failures, $output, $data_iter)= @_;
933							# Old API gave the user a value index rather than a ref to the value
934	1					3	my @value_refs= map $_->[1], @$failures;
935	1					2	my @values= map $$_, @value_refs;
936	1					4	$failures->[$_][1]= $_ for 0 .. $#$failures;
937	1					4	my $act= $orig_cb->($self, $failures, \@values, $data_iter->position.': ');
938							# if they changed any values, write them back to the refs.
939	1					17	${$value_refs[$_]}= $values[$_] for 0 .. $#value_refs;
	1					2
940	1					2	return $act;
941	1					6	};
942							}
943
944	7			7		74708	BEGIN { @Data::TableReader::_RecIter::ISA= ( 'Data::TableReader::Iterator' ) }
945
946							sub Data::TableReader::_RecIter::all {
947	11			11		51	my $self= shift;
948	11					20	my (@rec, $x);
949	11					34	push @rec, $x while ($x= $self->());
950	11					147	return \@rec;
951							}
952							sub Data::TableReader::_RecIter::dataset_idx {
953	1			1		9	shift->_fields->{data_iter}->dataset_idx(@_);
954							}
955							sub Data::TableReader::_RecIter::row {
956	5			5		19	shift->_fields->{data_iter}->row(@_);
957							}
958							sub Data::TableReader::_RecIter::position {
959	0			0			shift->_fields->{data_iter}->position(@_);
960							}
961							sub Data::TableReader::_RecIter::progress {
962	0			0			shift->_fields->{data_iter}->progress(@_);
963							}
964							sub Data::TableReader::_RecIter::tell {
965	0			0			shift->_fields->{data_iter}->tell(@_);
966							}
967							sub Data::TableReader::_RecIter::seek {
968	0			0			shift->_fields->{data_iter}->seek(@_);
969							}
970							sub Data::TableReader::_RecIter::next_dataset {
971							shift->_fields->{reader}->_log
972	0			0			->('warn',"Searching for subsequent table headers is not supported yet");
973	0						return 0;
974							}
975
976							1;
977
978							__END__