File Coverage

blib/lib/Data/TableReader.pm

Criterion	Covered	Total	%
statement	290	404	71.7
branch	133	244	54.5
condition	37	88	42.0
subroutine	36	44	81.8
pod	5	6	83.3
total	501	786	63.7

line	stmt	bran	cond	sub	pod	time	code
1							package Data::TableReader;
2	4			4		3751	use Moo 2;
	4					26187
	4					24
3	4			4		4050	use Try::Tiny;
	4					9
	4					203
4	4			4		22	use Carp;
	4					8
	4					177
5	4			4		21	use List::Util 'max';
	4					9
	4					361
6	4			4		26	use Module::Runtime 'require_module';
	4					8
	4					24
7	4			4		1872	use Data::TableReader::Field;
	4					11
	4					120
8	4			4		1274	use Data::TableReader::Iterator;
	4					9
	4					18630
9
10							# ABSTRACT: Extract records from "dirty" tabular data sources
11							our $VERSION = '0.011'; # VERSION
12
13
14							has input => ( is => 'rw', required => 1 );
15							has _file_handle => ( is => 'lazy' );
16							has _decoder_arg => ( is => 'rw', init_arg => 'decoder' );
17							has decoder => ( is => 'lazy', init_arg => undef );
18							has fields => ( is => 'rw', required => 1, coerce => \&_coerce_field_list );
19	27			27	0	34	sub field_list { @{ shift->fields } }
	27					393
20							has field_by_name => ( is => 'lazy' );
21							has record_class => ( is => 'rw', required => 1, default => sub { 'HASH' } );
22							has static_field_order => ( is => 'rw' ); # force order of columns
23							has header_row_at => ( is => 'rw', default => sub { [1,10] } ); # row of header, or range to scan
24							has header_row_combine => ( is => 'rw', lazy => 1, builder => 1 );
25							has on_unknown_columns => ( is => 'rw', default => sub { 'use' } );
26							has on_blank_row => ( is => 'rw', default => sub { 'next' } );
27							has on_validation_fail => ( is => 'rw', default => sub { 'die' } );
28							has log => ( is => 'rw', trigger => sub { shift->_clear_log } );
29
30							# Open 'input' if it isn't already a file handle
31							sub _build__file_handle {
32	14			14		110	my $self= shift;
33	14					36	my $i= $self->input;
34	14	100	66			112	return undef if ref($i) && (ref($i) eq "Spreadsheet::ParseExcel::Worksheet");
35	13	100	66			158	return $i if ref($i) && (ref($i) eq 'GLOB' or ref($i)->can('read'));
			33
36	9	50		2		161	open(my $fh, '<', $i) or croak "open($i): $!";
	2					11
	2					4
	2					13
37	9					1226	binmode $fh;
38	9					196	return $fh;
39							}
40
41							# Create ::Decoder instance either from user-supplied args, or by detecting input format
42							sub _build_decoder {
43	14			14		134	my $self= shift;
44	14					39	my $decoder_arg= $self->_decoder_arg;
45	14					28	my $decoder_ref= ref $decoder_arg;
46	14					27	my ($class, @args);
47	14	100	33			73	if (!$decoder_arg) {
		100
		50
		0
48	1					4	($class, @args)= $self->detect_input_format;
49	1					22	$self->_log->('trace', "Detected input format as %s", $class);
50							}
51							elsif (!$decoder_ref) {
52	4					6	$class= $decoder_arg;
53							}
54							elsif ($decoder_ref eq "HASH" or $decoder_ref eq "ARRAY") {
55	9	50				22	($class, @args)= $decoder_ref eq "ARRAY"? @$decoder_arg : do {
56	9					34	my %tmp= %$decoder_arg;
57	9					38	(delete($tmp{CLASS}), %tmp);
58							};
59	9	50				28	if(!$class) {
60	0					0	my ($input_class, @input_args)= $self->detect_input_format;
61	0	0				0	croak "decoder class not in arguments and unable to identify decoder class from input"
62							if !$input_class;
63	0					0	($class, @args)= ($input_class, @input_args, @args);
64							}
65							}
66							elsif ($decoder_ref->can('iterator')) {
67	0					0	return $decoder_arg;
68							}
69							else {
70	0					0	croak "Can't create decoder from $decoder_ref";
71							}
72	14	50				70	$class= "Data::TableReader::Decoder::$class"
73							unless $class =~ /::/;
74	14	50				67	require_module($class) or croak "$class does not exist or is not installed";
75	14					568	$self->_log->('trace', 'Creating decoder %s on input %s', $class, $self->input);
76	14	100	100			339	return $class->new(
77							file_name => ($self->input eq ($self->_file_handle\|\|"") ? '' : $self->input),
78							file_handle => $self->_file_handle,
79							_log => $self->_log,
80							@args
81							);
82							}
83
84							# User supplies any old perl data, but this field should always be an arrayref of ::Field
85							sub _coerce_field_list {
86	14			14		35711	my ($list)= @_;
87	14	50	33			91	defined $list and ref $list eq 'ARRAY' or croak "'fields' must be a non-empty arrayref";
88	14					35	my @list= @$list; # clone it, to make sure we don't unexpectedly alter the caller's data
89	14					34	for (@list) {
90	38	100				186	if (!ref $_) {
		50
91	5					82	$_= Data::TableReader::Field->new({ name => $_ });
92							} elsif (ref $_ eq 'HASH') {
93	33					102	my %args= %$_;
94							# "isa" alias for the 'type' attribute
95	33	50	33			82	$args{type}= delete $args{isa} if defined $args{isa} && !defined $args{type};
96	33					488	$_= Data::TableReader::Field->new(\%args)
97							} else {
98	0					0	croak "Can't coerce '$_' to a Field object"
99							}
100							}
101	14					249	return \@list;
102							}
103
104							sub _build_field_by_name {
105	0			0		0	my $self= shift;
106							# reverse list so first field of a name takes precedence
107	0					0	{ map { $_->name => $_ } reverse @{ $self->fields } }
	0					0
	0					0
	0					0
108							}
109
110							sub _build_header_row_combine {
111	13			13		464	my $self= shift;
112							# If headers contain "\n", we need to collect multiple cells per column
113	13					28	max map { 1+(()= ($_->header_regex =~ /\\n\|\n/g)) } $self->field_list;
	38					688
114							}
115
116							# 'log' can be a variety of things, but '_log' will always be a coderef
117							has _log => ( is => 'lazy', clearer => 1 );
118							sub _build__log {
119	14			14		284	_log_fn(shift->log);
120							}
121							sub _log_fn {
122	14			14		91	my $dest= shift;
123							!$dest? sub {
124	2			2		13	my ($level, $msg, @args)= @_;
125	2	50	33			13	return unless $level eq 'warn' or $level eq 'error';
126	0	0				0	$msg= sprintf($msg, @args) if @args;
127	0					0	warn $msg."\n";
128							}
129							: ref $dest eq 'ARRAY'? sub {
130	11			11		44	my ($level, $msg, @args)= @_;
131	11	100	66			40	return unless $level eq 'warn' or $level eq 'error';
132	1	50				6	$msg= sprintf($msg, @args) if @args;
133	1					5	push @$dest, [ $level, $msg ];
134							}
135							: ref($dest)->can('info')? sub {
136	124			124		704	my ($level, $msg, @args)= @_;
137	124	100				461	$dest->$level( @args? sprintf($msg, @args) : $msg )
		100
138							if $dest->can('is_'.$level)->($dest);
139							}
140	14	50				305	: croak "Don't know how to log to $dest";
		100
		100
141							}
142
143
144							sub detect_input_format {
145	1			1	1	4	my ($self, $filename, $magic)= @_;
146
147	1					3	my $input= $self->input;
148							# As convenience to spreadsheet users, let input be a parsed workbook/worksheet object.
149	1	50	33			22	return ('XLSX', sheet => $input)
150							if ref($input) && ref($input)->can('get_cell');
151	0	0	0			0	return ('XLSX', workbook => $input)
152							if ref($input) && ref($input)->can('worksheets');
153
154							# Load first block of file, unless supplied
155	0					0	my $fpos;
156	0	0				0	if (!defined $magic) {
157	0					0	my $fh= $self->_file_handle;
158							# Need to be able to seek.
159	0	0				0	if (seek($fh, 0, 1)) {
		0
160	0					0	$fpos= tell $fh;
161	0					0	read($fh, $magic, 4096);
162	0	0				0	seek($fh, $fpos, 0) or croak "seek: $!";
163							}
164							elsif ($fh->can('ungets')) {
165	0					0	$fpos= 0; # to indicate that we did try reading the file
166	0					0	read($fh, $magic, 4096);
167	0					0	$fh->ungets($magic);
168							}
169							else {
170	0					0	$self->_log->('notice',"Can't fully detect input format because handle is not seekable."
171							." Consider fully buffering the file, or using FileHandle::Unget");
172	0					0	$magic= '';
173							}
174							}
175
176							# Excel is obvious so check it first. This handles cases where an excel file is
177							# erroneously named ".csv" and sillyness like that.
178	0	0				0	return ( 'XLSX' ) if $magic =~ /^PK(\x03\x04\|\x05\x06\|\x07\x08)/;
179	0	0				0	return ( 'XLS' ) if $magic =~ /^\xD0\xCF\x11\xE0/;
180
181							# Else trust the file extension, because TSV with commas can be very similar to CSV with
182							# tabs in the data, and some crazy person might store an HTML document as the first element
183							# of a CSV file.
184							# Detect filename if not supplied
185	0	0				0	if (!defined $filename) {
186	0					0	$filename= '';
187	0	0	0			0	$filename= "$input" if defined $input and (!ref $input \|\| ref($input) =~ /path\|file/i);
			0
188							}
189	0	0				0	if ($filename =~ /\.([^.]+)$/) {
190	0					0	my $suffix= uc($1);
191	0	0				0	return 'HTML' if $suffix eq 'HTM';
192	0					0	return $suffix;
193							}
194
195							# Else probe some more...
196	0					0	$self->_log->('debug',"Probing file format because no filename suffix");
197	0	0				0	length $magic or croak "Can't probe format. No filename suffix, and "
		0
198							.(!defined $fpos? "unseekable file handle" : "no content");
199
200							# HTML is pretty obvious
201	0	0				0	return 'HTML' if $magic =~ /^(\xEF\xBB\xBF\|\xFF\xFE\|\xFE\xFF)?<(!DOCTYPE )HTML/i;
202							# Else guess between CSV and TSV
203	0					0	my ($probably_csv, $probably_tsv)= (0,0);
204	0	0				0	++$probably_csv if $magic =~ /^(\xEF\xBB\xBF\|\xFF\xFE\|\xFE\xFF)?["']?[\w ]+["']?,/;
205	0	0				0	++$probably_tsv if $magic =~ /^(\xEF\xBB\xBF\|\xFF\xFE\|\xFE\xFF)?["']?[\w ]+["']?\t/;
206	0					0	my $comma_count= () = ($magic =~ /,/g);
207	0					0	my $tab_count= () = ($magic =~ /\t/g);
208	0					0	my $eol_count= () = ($magic =~ /\n/g);
209	0	0	0			0	++$probably_csv if $comma_count > $eol_count and $comma_count > $tab_count;
210	0	0	0			0	++$probably_tsv if $tab_count > $eol_count and $tab_count > $comma_count;
211	0					0	$self->_log->('debug', 'probe results: comma_count=%d tab_count=%d eol_count=%d probably_csv=%d probably_tsv=%d',
212							$comma_count, $tab_count, $eol_count, $probably_csv, $probably_tsv);
213	0	0	0			0	return 'CSV' if $probably_csv and $probably_csv > $probably_tsv;
214	0	0	0			0	return 'TSV' if $probably_tsv and $probably_tsv > $probably_csv;
215	0					0	croak "Can't determine file format";
216							}
217
218
219							has _table_found => ( is => 'rw', lazy => 1, builder => 1, clearer => 1, predicate => 1 );
220							sub _build__table_found {
221	4			4		26	my $self= shift;
222	4					10	my %loc= ( croak_on_fail => 1 );
223	4					57	$self->_find_table($self->decoder->iterator, \%loc);
224	4					16	\%loc;
225							}
226
227							sub find_table {
228	9			9	1	3078	my $self= shift;
229	9	50				38	return 1 if $self->_has_table_found;
230	9					13	my %loc;
231	9	50				233	if ($self->_find_table($self->decoder->iterator, \%loc)) {
232	9					157	$self->_table_found(\%loc);
233	9					81	return 1;
234							}
235	0					0	return 0;
236							}
237
238	1			1	1	20	sub col_map { shift->_table_found->{col_map}; }
239	6			6	1	1247	sub field_map { shift->_table_found->{field_map}; }
240
241							sub _find_table {
242	13			13		31	my ($self, $data_iter, $stash)= @_;
243	13		50			50	$stash \|\|= {};
244	13		66			79	while (!$self->_find_table_in_dataset($data_iter, $stash)
			66
245							&& !defined $stash->{fatal}
246							&& $data_iter->next_dataset)
247							{}
248	13	50				41	if ($stash->{col_map}) {
249							# Calculate field map from col map
250	13					20	my $col_map= $stash->{col_map};
251	13					16	my %fmap;
252	13					30	for my $i (0 .. $#$col_map) {
253	61	100				100	next unless $col_map->[$i];
254	43	100				87	if ($col_map->[$i]->array) {
255	10					13	push @{ $fmap{$col_map->[$i]->name} }, $i;
	10					20
256							} else {
257	33					80	$fmap{$col_map->[$i]->name}= $i;
258							}
259							}
260	13					26	$stash->{field_map}= \%fmap;
261							# And record the stream position of the start of the table
262	13					94	$stash->{first_record_pos}= $data_iter->tell;
263	13					23	$stash->{data_iter}= $data_iter;
264	13					35	return $stash;
265							}
266							else {
267	0		0			0	my $err= $stash->{fatal} \|\| "Can't locate valid header";
268	0					0	$self->_log->('error', $err);
269	0	0				0	croak $err if $stash->{croak_on_fail};
270	0					0	return undef;
271							}
272							}
273
274							sub _find_table_in_dataset {
275	14			14		51	my ($self, $data_iter, $stash)= @_;
276							# If header_row_at is undef, then there is no header.
277							# Ensure static_field_order, then set up columns.
278	14					38	my @fields= $self->field_list;
279	14					113	my $header_at= $self->header_row_at;
280	14	50				33	if (!defined $header_at) {
281	0	0				0	unless ($self->static_field_order) {
282	0					0	$stash->{fatal}= "You must enable 'static_field_order' if there is no header row";
283	0					0	return;
284							}
285	0					0	$stash->{col_map}= \@fields;
286	0					0	return 1;
287							}
288
289							# If headers contain "\n", we need to collect multiple cells per column
290	14					198	my $row_accum= $self->header_row_combine;
291
292	14	50				69	my ($start, $end)= ref $header_at? @$header_at : ( $header_at, $header_at );
293	14					18	my @rows;
294
295							# If header_row_at doesn't start at 1, seek forward
296	14	50				33	if ($start > 1) {
297	0					0	$self->_log->('trace', 'Skipping to row %s', $start);
298	0					0	push @rows, $data_iter->() for 1..$start-1;
299							}
300
301							# Scan through the rows of the dataset up to the end of header_row_at, accumulating rows so that
302							# multi-line regexes can match.
303	14					43	for ($start .. $end) {
304	23					124	my $vals= $data_iter->();
305	23	100				50	if (!$vals) { # if undef, we reached end of dataset
306	1					16	$self->_log->('trace', 'EOF');
307	1					8	last;
308							}
309	22	100				42	if ($row_accum > 1) {
310	5					10	push @rows, $vals;
311	5					13	shift @rows while @rows > $row_accum;
312	5					7	$vals= [ map { my $c= $_; join("\n", map $_->[$c], @rows) } 0 .. $#{$rows[-1]} ];
	28					34
	28					67
	5					38
313	5					19	$stash->{context}= $row_accum.' rows ending at '.$data_iter->position;
314							} else {
315	17					45	$stash->{context}= $data_iter->position;
316							}
317	22					371	$self->_log->('trace', 'Checking for headers on %s', $stash->{context});
318	22					161	$stash->{context}.= ': ';
319	22	50				91	$stash->{col_map}= $self->static_field_order?
320							# If static field order, look for headers in sequence
321							$self->_match_headers_static($vals, $stash)
322							# else search for each header
323							: $self->_match_headers_dynamic($vals, $stash);
324	22	100				99	return 1 if $stash->{col_map};
325	9	50				20	return if $stash->{fatal};
326	9					178	$self->_log->('debug', '%sNo match', $stash->{context});
327							}
328	1					14	$self->_log->('warn','No row in dataset matched full header requirements');
329	1					376	return;
330							}
331
332							sub _match_headers_static {
333	0			0		0	my ($self, $header, $stash)= @_;
334	0					0	my $fields= $self->fields;
335	0					0	for my $i (0 .. $#$fields) {
336	0	0				0	next if $header->[$i] =~ $fields->[$i]->header_regex;
337
338							# Field header doesn't match. Start over on next row.
339	0		0			0	$self->_log->('debug','%sMissing field %s', $stash->{context}\|\|'', $fields->[$i]->name);
340	0					0	return;
341							}
342							# found a match for every field!
343	0		0			0	$self->_log->('debug','%sFound!', $stash->{context}\|\|'');
344	0					0	return $fields;
345							}
346
347							sub _match_headers_dynamic {
348	22			22		33	my ($self, $header, $stash)= @_;
349	22		50			53	my $context= $stash->{context} \|\| '';
350	22					28	my %col_map;
351	22					308	my $fields= $self->fields;
352							my $free_fields= $stash->{free_fields} \|\|= [
353							# Sort required fields to front, to fail faster on non-matching rows
354	36	100				91	sort { $a->required? -1 : $b->required? 1 : 0 }
		100
355	22		100			135	grep { !$_->follows_list } @$fields
	38					79
356							];
357							my $follows_fields= $stash->{follows_fields} \|\|= [
358	22		100			63	grep { $_->follows_list } @$fields
	38					62
359							];
360	22					44	for my $f (@$free_fields) {
361	46					632	my $hr= $f->header_regex;
362	46					794	$self->_log->('debug', 'looking for %s', $hr);
363	46					313	my @found= grep { $header->[$_] =~ $hr } 0 .. $#$header;
	268					741
364	46	100				125	if (@found == 1) {
		100
		100
365	34	50				78	if ($col_map{$found[0]}) {
366	0					0	$self->_log->('debug', 'search status: '._colmap_progress_str(\%col_map, $header));
367							$self->_log->('info','%sField %s and %s both match column %s',
368	0					0	$context, $f->name, $col_map{$found[0]}->name, $found[0]);
369	0					0	return;
370							}
371	34					81	$col_map{$found[0]}= $f;
372							}
373							elsif (@found > 1) {
374	1	50				5	if ($f->array) {
375							# Array columns may be found more than once
376	1					6	$col_map{$_}= $f for @found;
377							} else {
378	0					0	$self->_log->('debug', 'search status: '._colmap_progress_str(\%col_map, $header));
379	0					0	$self->_log->('info','%sField %s matches more than one column: %s',
380							$context, $f->name, join(', ', @found));
381	0					0	return;
382							}
383							}
384							elsif ($f->required) {
385	9					27	$self->_log->('debug', 'search status: '._colmap_progress_str(\%col_map, $header));
386	9					175	$self->_log->('info','%sNo match for required field %s', $context, $f->name);
387	9					3453	return;
388							}
389							# else Not required, and not found
390							}
391							# Need to have found at least one column (even if none required)
392	13	50				35	unless (keys %col_map) {
393	0					0	$self->_log->('debug', 'search status: '._colmap_progress_str(\%col_map, $header));
394	0					0	$self->_log->('debug','%sNo field headers found', $context);
395	0					0	return;
396							}
397							# Now, check for any of the 'follows' fields, some of which might also be 'required'.
398	13	100				31	if (@$follows_fields) {
399	2					3	my %following;
400							my %found;
401	2					6	for my $i (0 .. $#$header) {
402	17	100				31	if ($col_map{$i}) {
403	5					17	%following= ( $col_map{$i}->name => $col_map{$i} );
404							} else {
405	12					18	my $val= $header->[$i];
406	12					14	my @match;
407	12					16	for my $f (@$follows_fields) {
408	18	100				46	next unless grep $following{$_}, $f->follows_list;
409	9	100				129	push @match, $f if $val =~ $f->header_regex;
410							}
411	12	100				72	if (@match == 1) {
		50
412	7	50	66			28	if ($found{$match[0]} && !$match[0]->array) {
413	0					0	$self->_log->('debug', 'search status: '._colmap_progress_str(\%col_map, $header));
414	0					0	$self->_log->('info','%sField %s matches multiple columns',
415							$context, $match[0]->name);
416	0					0	return;
417							}
418	7					12	$col_map{$i}= $match[0];
419	7					47	$found{$match[0]}= $i;
420	7					18	$following{$match[0]->name}= $match[0];
421							}
422							elsif (@match > 1) {
423	0					0	$self->_log->('debug', 'search status: '._colmap_progress_str(\%col_map, $header));
424	0					0	$self->_log->('info','%sField %s and %s both match column %d',
425							$context, $match[0]->name, $match[1]->name, $i+1);
426	0					0	return;
427							}
428							else {
429	5					20	%following= ();
430							}
431							}
432							}
433							# Check if any of the 'follows' fields were required
434	2	50				5	if (my @unfound= grep { !$found{$_} && $_->required } @$follows_fields) {
	3	50				14
435	0					0	$self->_log->('debug', 'search status: '._colmap_progress_str(\%col_map, $header));
436							$self->_log->('info','%sNo match for required %s %s', $context,
437	0	0				0	(@unfound > 1? ('fields', join(', ', map { $_->name } sort @unfound))
	0					0
438							: ('field', $unfound[0]->name)
439							));
440	0					0	return;
441							}
442							}
443							# Now, if there are any un-claimed columns, handle per 'on_unknown_columns' setting.
444	13					31	my @unclaimed= grep { !$col_map{$_} } 0 .. $#$header;
	61					113
445	13	100				33	if (@unclaimed) {
446	5					12	my $act= $self->on_unknown_columns;
447	5					16	my $unknown_list= join(', ', map $self->_fmt_header_text($header->[$_]), @unclaimed);
448	5	50				13	$act= $act->($self, $header, \@unclaimed) if ref $act eq 'CODE';
449	5	50				13	if ($act eq 'use') {
		0
		0
450	5					80	$self->_log->('warn','%sIgnoring unknown columns: %s', $context, $unknown_list);
451							} elsif ($act eq 'next') {
452	0					0	$self->_log->('warn','%sWould match except for unknown columns: %s',
453							$context, $unknown_list);
454							} elsif ($act eq 'die') {
455	0					0	$stash->{fatal}= "${context}Header row includes unknown columns: $unknown_list";
456							} else {
457	0					0	$stash->{fatal}= "Invalid action '$act' for 'on_unknown_columns'";
458							}
459	5					1953	$self->_log->('debug', 'search status: '._colmap_progress_str(\%col_map, $header));
460	5	50				46	return if $stash->{fatal};
461							}
462	13					80	return [ map $col_map{$_}, 0 .. $#$header ];
463							}
464							# Make header string readable for log messages
465							sub _fmt_header_text {
466	87	100		87		128	shift if ref $_[0];
467	87					106	my $x= shift;
468	87					127	$x =~ s/ ( [^[:print:]] ) / sprintf("\\x%02X", ord $1 ) /gex;
	11					36
469	87					376	qq{"$x"};
470							}
471							# format the colmap into a string
472							sub _colmap_progress_str {
473	14			14		26	my ($colmap, $headers)= @_;
474							join(' ', map {
475	14	100				29	$colmap->{$_}? $_.'='.$colmap->{$_}->name
	95					294
476							: $_.':'._fmt_header_text($headers->[$_])
477							} 0 .. $#$headers)
478							}
479
480
481							sub iterator {
482	15			15	1	5489	my $self= shift;
483	15					292	my $fields= $self->fields;
484							# Creating the record iterator consumes the data source's iterator.
485							# The first time after detecting the table, we continue with the same iterator.
486							# Every time after that we need to create a new data iterator and seek to the
487							# first record under the header.
488	15					248	my $data_iter= delete $self->_table_found->{data_iter};
489	15	100				76	unless ($data_iter) {
490	2					25	$data_iter= $self->decoder->iterator;
491	2					29	$data_iter->seek($self->_table_found->{first_record_pos});
492							}
493	15					208	my $col_map= $self->_table_found->{col_map};
494	15					234	my $field_map= $self->_table_found->{field_map};
495	15					118	my @row_slice; # one column index per field, and possibly more for array_val_map
496							my @arrayvals; # list of source index and destination index for building array values
497	15					0	my @field_names; # ordered list of field names where row slice should be assigned
498	15					0	my %trimmer; # list of trim functions and the array indicies they should be applied to
499	15					0	my @blank_val; # blank value per each fetched column
500	15					0	my @type_check;# list of
501	15					0	my $class; # optional object class for the resulting rows
502
503							# If result is array, the slice of the row must match the position of the fields in the
504							# $self->fields array. If a field was not found it will get an undef for that slot.
505							# It also results in an undef for secondary fields of the same name as the first.
506	15	50				47	if ($self->record_class eq 'ARRAY') {
507	0					0	my %remaining= %$field_map;
508							@row_slice= map {
509	0					0	my $src= delete $remaining{$_->name};
	0					0
510	0	0				0	defined $src? $src : 0x7FFFFFFF
511							} @$fields;
512							}
513							# If result is anything else, then only slice out the columns that are used for the fields
514							# that we located.
515							else {
516	15	50				39	$class= $self->record_class
517							unless 'HASH' eq $self->record_class;
518	15					45	@field_names= keys %$field_map;
519	15					40	@row_slice= values %$field_map;
520							}
521							# For any field whose value is an array of more that one source column,
522							# encode those details in @arrayvals, and update @row_slice and @trim_idx accordingly
523	15					45	for (0 .. $#row_slice) {
524	41	100				75	if (!ref $row_slice[$_]) {
525	39					52	my $field= $col_map->[$row_slice[$_]];
526	39	100				576	if (my $t= $field->trim_coderef) {
527	38		100			154	$trimmer{$t} \|\|= [ $t, [] ];
528	38					51	push @{ $trimmer{$t}[1] }, $_;
	38					98
529							}
530	39					73	push @blank_val, $field->blank;
531	39	100				106	push @type_check, $self->_make_validation_callback($field, $_)
532							if $field->type;
533							}
534							else {
535							# This field is an array-value, so add the src columns to @row_slice
536							# and list it in @arrayvals, and update @trim_idx if needed
537	2					4	my $src= $row_slice[$_];
538	2					3	$row_slice[$_]= 0x7FFFFFFF;
539	2					4	my $from= @row_slice;
540	2					5	push @row_slice, @$src;
541	2					6	push @arrayvals, [ $_, $from, scalar @$src ];
542	2					4	for ($from .. $#row_slice) {
543	10					28	my $field= $col_map->[$row_slice[$_]];
544	10	50				130	if (my $t= $field->trim_coderef) {
545	10		100			69	$trimmer{$t} \|\|= [ $t, [] ];
546	10					10	push @{ $trimmer{$t}[1] }, $_;
	10					21
547							}
548	10					17	push @blank_val, $field->blank;
549	10	50				24	push @type_check, $self->_make_validation_callback($field, $_)
550							if $field->type;
551							}
552							}
553							}
554	15					42	my @trim= values %trimmer;
555	15					19	@arrayvals= reverse @arrayvals;
556	15					32	my ($n_blank, $first_blank, $eof);
557							my $sub= sub {
558	50	100	50	50		2183	again:
			66
559							# Pull the specific slice of the next row that we need
560							my $row= !$eof && $data_iter->(\@row_slice)
561							or ++$eof && return undef;
562							# Apply 'trim' to any column whose field requested it
563	36					70	for my $t (@trim) {
564	38					51	$t->[0]->() for grep defined, @{$row}[@{$t->[1]}];
	38					138
	38					57
565							}
566							# Apply 'blank value' to every column which is zero length
567	36					51	$n_blank= 0;
568							$row->[$_]= $blank_val[$_]
569	36	100	100			69	for grep { (!defined $row->[$_] \|\| !length $row->[$_]) && ++$n_blank } 0..$#$row;
	97					348
570							# If all are blank, then handle according to $on_blank_row setting
571	36	100				97	if ($n_blank == @$row) {
		100
572	2		33			10	$first_blank \|\|= $data_iter->position;
573	2					10	goto again;
574							} elsif ($first_blank) {
575	2	50				7	unless ($self->_handle_blank_row($first_blank, $data_iter->position)) {
576	0					0	$eof= 1;
577	0					0	return undef;
578							}
579	2					4	$first_blank= undef;
580							}
581							# Check type constraints, if any
582	34	100				61	if (@type_check) {
583	18	100				35	if (my @failed= map $_->($row), @type_check) {
584	6	100				17	$self->_handle_validation_fail(\@failed, $row, $data_iter->position.': ')
585							or goto again;
586							}
587							}
588							# Collect all the array-valued fields from the tail of the row
589	30					57	$row->[$_->[0]]= [ splice @$row, $_->[1], $_->[2] ] for @arrayvals;
590							# stop here if the return class is 'ARRAY'
591	30	50				67	return $row unless @field_names;
592							# Convert the row to a hashref
593	30					37	my %rec;
594	30					79	@rec{@field_names}= @$row;
595							# Construct a class, if requested, else return hashref
596	30	50				164	return $class? $class->new(\%rec) : \%rec;
597	15					89	};
598	15					109	return Data::TableReader::_RecIter->new(
599							$sub, { data_iter => $data_iter, reader => $self },
600							);
601							}
602
603							sub _make_validation_callback {
604	4			4		10	my ($self, $field, $index)= @_;
605	4					8	my $t= $field->type;
606							ref $t eq 'CODE'? sub {
607	18			18		41	my $e= $t->($_[0][$index]);
608	18	100				162	defined $e? ([ $field, $index, $e ]) : ()
609							}
610							: $t->can('validate')? sub {
611	0			0		0	my $e= $t->validate($_[0][$index]);
612	0	0				0	defined $e? ([ $field, $index, $e ]) : ()
613							}
614	4	0				26	: croak "Invalid type constraint $t on field ".$field->name;
		50
615							}
616
617							sub _handle_blank_row {
618	2			2		6	my ($self, $first, $last)= @_;
619	2					8	my $act= $self->on_blank_row;
620	2	50				8	$act= $act->($self, $first, $last)
621							if ref $act eq 'CODE';
622	2	50				7	if ($act eq 'next') {
623	2					39	$self->_log->('warn', 'Skipping blank rows from %s until %s', $first, $last);
624	2					954	return 1;
625							}
626	0	0				0	if ($act eq 'last') {
627	0					0	$self->_log->('warn', 'Ending at blank row %s', $first);
628	0					0	return 0;
629							}
630	0	0				0	if ($act eq 'die') {
631	0					0	my $msg= "Encountered blank rows at $first..$last";
632	0					0	$self->_log->('error', $msg);
633	0					0	croak $msg;
634							}
635	0					0	croak "Invalid value for 'on_blank_row': \"$act\"";
636							}
637
638							sub _handle_validation_fail {
639	6			6		14	my ($self, $failures, $values, $context)= @_;
640	6					17	my $act= $self->on_validation_fail;
641	6	100				24	$act= $act->($self, $failures, $values, $context)
642							if ref $act eq 'CODE';
643	6					43	my $errors= join(', ', map $_->[0]->name.': '.$_->[2], @$failures);
644	6	100				14	if ($act eq 'next') {
645	3	50				51	$self->_log->('warn', "%sSkipped for data errors: %s", $context, $errors) if $errors;
646	3					1087	return 0;
647							}
648	3	100				6	if ($act eq 'use') {
649	2	100				20	$self->_log->('warn', "%sPossible data errors: %s", $context, $errors) if $errors;
650	2					6	return 1;
651							}
652	1	50				16	if ($act eq 'die') {
653	1					3	my $msg= "${context}Invalid record: $errors";
654	1					21	$self->_log->('error', $msg);
655	1					656	croak $msg;
656							}
657							}
658
659	4			4		885	BEGIN { @Data::TableReader::_RecIter::ISA= ( 'Data::TableReader::Iterator' ) }
660							sub Data::TableReader::_RecIter::all {
661	9			9		16	my $self= shift;
662	9					22	my (@rec, $x);
663	9					19	push @rec, $x while ($x= $self->());
664	9					200	return \@rec;
665							}
666							sub Data::TableReader::_RecIter::position {
667	0			0		0	shift->_fields->{data_iter}->position(@_);
668							}
669							sub Data::TableReader::_RecIter::progress {
670	0			0		0	shift->_fields->{data_iter}->progress(@_);
671							}
672							sub Data::TableReader::_RecIter::tell {
673	0			0		0	shift->_fields->{data_iter}->tell(@_);
674							}
675							sub Data::TableReader::_RecIter::seek {
676	0			0		0	shift->_fields->{data_iter}->seek(@_);
677							}
678							sub Data::TableReader::_RecIter::next_dataset {
679							shift->_fields->{reader}->_log
680	0			0		0	->('warn',"Searching for supsequent table headers is not supported yet");
681	0					0	return 0;
682							}
683
684							1;
685
686							__END__