line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Data::TableReader; |
2
|
4
|
|
|
4
|
|
3751
|
use Moo 2; |
|
4
|
|
|
|
|
26187
|
|
|
4
|
|
|
|
|
24
|
|
3
|
4
|
|
|
4
|
|
4050
|
use Try::Tiny; |
|
4
|
|
|
|
|
9
|
|
|
4
|
|
|
|
|
203
|
|
4
|
4
|
|
|
4
|
|
22
|
use Carp; |
|
4
|
|
|
|
|
8
|
|
|
4
|
|
|
|
|
177
|
|
5
|
4
|
|
|
4
|
|
21
|
use List::Util 'max'; |
|
4
|
|
|
|
|
9
|
|
|
4
|
|
|
|
|
361
|
|
6
|
4
|
|
|
4
|
|
26
|
use Module::Runtime 'require_module'; |
|
4
|
|
|
|
|
8
|
|
|
4
|
|
|
|
|
24
|
|
7
|
4
|
|
|
4
|
|
1872
|
use Data::TableReader::Field; |
|
4
|
|
|
|
|
11
|
|
|
4
|
|
|
|
|
120
|
|
8
|
4
|
|
|
4
|
|
1274
|
use Data::TableReader::Iterator; |
|
4
|
|
|
|
|
9
|
|
|
4
|
|
|
|
|
18630
|
|
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
# ABSTRACT: Extract records from "dirty" tabular data sources |
11
|
|
|
|
|
|
|
our $VERSION = '0.011'; # VERSION |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
has input => ( is => 'rw', required => 1 ); |
15
|
|
|
|
|
|
|
has _file_handle => ( is => 'lazy' ); |
16
|
|
|
|
|
|
|
has _decoder_arg => ( is => 'rw', init_arg => 'decoder' ); |
17
|
|
|
|
|
|
|
has decoder => ( is => 'lazy', init_arg => undef ); |
18
|
|
|
|
|
|
|
has fields => ( is => 'rw', required => 1, coerce => \&_coerce_field_list ); |
19
|
27
|
|
|
27
|
0
|
34
|
sub field_list { @{ shift->fields } } |
|
27
|
|
|
|
|
393
|
|
20
|
|
|
|
|
|
|
has field_by_name => ( is => 'lazy' ); |
21
|
|
|
|
|
|
|
has record_class => ( is => 'rw', required => 1, default => sub { 'HASH' } ); |
22
|
|
|
|
|
|
|
has static_field_order => ( is => 'rw' ); # force order of columns |
23
|
|
|
|
|
|
|
has header_row_at => ( is => 'rw', default => sub { [1,10] } ); # row of header, or range to scan |
24
|
|
|
|
|
|
|
has header_row_combine => ( is => 'rw', lazy => 1, builder => 1 ); |
25
|
|
|
|
|
|
|
has on_unknown_columns => ( is => 'rw', default => sub { 'use' } ); |
26
|
|
|
|
|
|
|
has on_blank_row => ( is => 'rw', default => sub { 'next' } ); |
27
|
|
|
|
|
|
|
has on_validation_fail => ( is => 'rw', default => sub { 'die' } ); |
28
|
|
|
|
|
|
|
has log => ( is => 'rw', trigger => sub { shift->_clear_log } ); |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
# Open 'input' if it isn't already a file handle |
31
|
|
|
|
|
|
|
sub _build__file_handle { |
32
|
14
|
|
|
14
|
|
110
|
my $self= shift; |
33
|
14
|
|
|
|
|
36
|
my $i= $self->input; |
34
|
14
|
100
|
66
|
|
|
112
|
return undef if ref($i) && (ref($i) eq "Spreadsheet::ParseExcel::Worksheet"); |
35
|
13
|
100
|
66
|
|
|
158
|
return $i if ref($i) && (ref($i) eq 'GLOB' or ref($i)->can('read')); |
|
|
|
33
|
|
|
|
|
36
|
9
|
50
|
|
2
|
|
161
|
open(my $fh, '<', $i) or croak "open($i): $!"; |
|
2
|
|
|
|
|
11
|
|
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
13
|
|
37
|
9
|
|
|
|
|
1226
|
binmode $fh; |
38
|
9
|
|
|
|
|
196
|
return $fh; |
39
|
|
|
|
|
|
|
} |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
# Create ::Decoder instance either from user-supplied args, or by detecting input format |
42
|
|
|
|
|
|
|
sub _build_decoder { |
43
|
14
|
|
|
14
|
|
134
|
my $self= shift; |
44
|
14
|
|
|
|
|
39
|
my $decoder_arg= $self->_decoder_arg; |
45
|
14
|
|
|
|
|
28
|
my $decoder_ref= ref $decoder_arg; |
46
|
14
|
|
|
|
|
27
|
my ($class, @args); |
47
|
14
|
100
|
33
|
|
|
73
|
if (!$decoder_arg) { |
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
48
|
1
|
|
|
|
|
4
|
($class, @args)= $self->detect_input_format; |
49
|
1
|
|
|
|
|
22
|
$self->_log->('trace', "Detected input format as %s", $class); |
50
|
|
|
|
|
|
|
} |
51
|
|
|
|
|
|
|
elsif (!$decoder_ref) { |
52
|
4
|
|
|
|
|
6
|
$class= $decoder_arg; |
53
|
|
|
|
|
|
|
} |
54
|
|
|
|
|
|
|
elsif ($decoder_ref eq "HASH" or $decoder_ref eq "ARRAY") { |
55
|
9
|
50
|
|
|
|
22
|
($class, @args)= $decoder_ref eq "ARRAY"? @$decoder_arg : do { |
56
|
9
|
|
|
|
|
34
|
my %tmp= %$decoder_arg; |
57
|
9
|
|
|
|
|
38
|
(delete($tmp{CLASS}), %tmp); |
58
|
|
|
|
|
|
|
}; |
59
|
9
|
50
|
|
|
|
28
|
if(!$class) { |
60
|
0
|
|
|
|
|
0
|
my ($input_class, @input_args)= $self->detect_input_format; |
61
|
0
|
0
|
|
|
|
0
|
croak "decoder class not in arguments and unable to identify decoder class from input" |
62
|
|
|
|
|
|
|
if !$input_class; |
63
|
0
|
|
|
|
|
0
|
($class, @args)= ($input_class, @input_args, @args); |
64
|
|
|
|
|
|
|
} |
65
|
|
|
|
|
|
|
} |
66
|
|
|
|
|
|
|
elsif ($decoder_ref->can('iterator')) { |
67
|
0
|
|
|
|
|
0
|
return $decoder_arg; |
68
|
|
|
|
|
|
|
} |
69
|
|
|
|
|
|
|
else { |
70
|
0
|
|
|
|
|
0
|
croak "Can't create decoder from $decoder_ref"; |
71
|
|
|
|
|
|
|
} |
72
|
14
|
50
|
|
|
|
70
|
$class= "Data::TableReader::Decoder::$class" |
73
|
|
|
|
|
|
|
unless $class =~ /::/; |
74
|
14
|
50
|
|
|
|
67
|
require_module($class) or croak "$class does not exist or is not installed"; |
75
|
14
|
|
|
|
|
568
|
$self->_log->('trace', 'Creating decoder %s on input %s', $class, $self->input); |
76
|
14
|
100
|
100
|
|
|
339
|
return $class->new( |
77
|
|
|
|
|
|
|
file_name => ($self->input eq ($self->_file_handle||"") ? '' : $self->input), |
78
|
|
|
|
|
|
|
file_handle => $self->_file_handle, |
79
|
|
|
|
|
|
|
_log => $self->_log, |
80
|
|
|
|
|
|
|
@args |
81
|
|
|
|
|
|
|
); |
82
|
|
|
|
|
|
|
} |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
# User supplies any old perl data, but this field should always be an arrayref of ::Field |
85
|
|
|
|
|
|
|
sub _coerce_field_list { |
86
|
14
|
|
|
14
|
|
35711
|
my ($list)= @_; |
87
|
14
|
50
|
33
|
|
|
91
|
defined $list and ref $list eq 'ARRAY' or croak "'fields' must be a non-empty arrayref"; |
88
|
14
|
|
|
|
|
35
|
my @list= @$list; # clone it, to make sure we don't unexpectedly alter the caller's data |
89
|
14
|
|
|
|
|
34
|
for (@list) { |
90
|
38
|
100
|
|
|
|
186
|
if (!ref $_) { |
|
|
50
|
|
|
|
|
|
91
|
5
|
|
|
|
|
82
|
$_= Data::TableReader::Field->new({ name => $_ }); |
92
|
|
|
|
|
|
|
} elsif (ref $_ eq 'HASH') { |
93
|
33
|
|
|
|
|
102
|
my %args= %$_; |
94
|
|
|
|
|
|
|
# "isa" alias for the 'type' attribute |
95
|
33
|
50
|
33
|
|
|
82
|
$args{type}= delete $args{isa} if defined $args{isa} && !defined $args{type}; |
96
|
33
|
|
|
|
|
488
|
$_= Data::TableReader::Field->new(\%args) |
97
|
|
|
|
|
|
|
} else { |
98
|
0
|
|
|
|
|
0
|
croak "Can't coerce '$_' to a Field object" |
99
|
|
|
|
|
|
|
} |
100
|
|
|
|
|
|
|
} |
101
|
14
|
|
|
|
|
249
|
return \@list; |
102
|
|
|
|
|
|
|
} |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
sub _build_field_by_name { |
105
|
0
|
|
|
0
|
|
0
|
my $self= shift; |
106
|
|
|
|
|
|
|
# reverse list so first field of a name takes precedence |
107
|
0
|
|
|
|
|
0
|
{ map { $_->name => $_ } reverse @{ $self->fields } } |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
108
|
|
|
|
|
|
|
} |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
sub _build_header_row_combine { |
111
|
13
|
|
|
13
|
|
464
|
my $self= shift; |
112
|
|
|
|
|
|
|
# If headers contain "\n", we need to collect multiple cells per column |
113
|
13
|
|
|
|
|
28
|
max map { 1+(()= ($_->header_regex =~ /\\n|\n/g)) } $self->field_list; |
|
38
|
|
|
|
|
688
|
|
114
|
|
|
|
|
|
|
} |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
# 'log' can be a variety of things, but '_log' will always be a coderef |
117
|
|
|
|
|
|
|
has _log => ( is => 'lazy', clearer => 1 ); |
118
|
|
|
|
|
|
|
sub _build__log { |
119
|
14
|
|
|
14
|
|
284
|
_log_fn(shift->log); |
120
|
|
|
|
|
|
|
} |
121
|
|
|
|
|
|
|
sub _log_fn { |
122
|
14
|
|
|
14
|
|
91
|
my $dest= shift; |
123
|
|
|
|
|
|
|
!$dest? sub { |
124
|
2
|
|
|
2
|
|
13
|
my ($level, $msg, @args)= @_; |
125
|
2
|
50
|
33
|
|
|
13
|
return unless $level eq 'warn' or $level eq 'error'; |
126
|
0
|
0
|
|
|
|
0
|
$msg= sprintf($msg, @args) if @args; |
127
|
0
|
|
|
|
|
0
|
warn $msg."\n"; |
128
|
|
|
|
|
|
|
} |
129
|
|
|
|
|
|
|
: ref $dest eq 'ARRAY'? sub { |
130
|
11
|
|
|
11
|
|
44
|
my ($level, $msg, @args)= @_; |
131
|
11
|
100
|
66
|
|
|
40
|
return unless $level eq 'warn' or $level eq 'error'; |
132
|
1
|
50
|
|
|
|
6
|
$msg= sprintf($msg, @args) if @args; |
133
|
1
|
|
|
|
|
5
|
push @$dest, [ $level, $msg ]; |
134
|
|
|
|
|
|
|
} |
135
|
|
|
|
|
|
|
: ref($dest)->can('info')? sub { |
136
|
124
|
|
|
124
|
|
704
|
my ($level, $msg, @args)= @_; |
137
|
124
|
100
|
|
|
|
461
|
$dest->$level( @args? sprintf($msg, @args) : $msg ) |
|
|
100
|
|
|
|
|
|
138
|
|
|
|
|
|
|
if $dest->can('is_'.$level)->($dest); |
139
|
|
|
|
|
|
|
} |
140
|
14
|
50
|
|
|
|
305
|
: croak "Don't know how to log to $dest"; |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
141
|
|
|
|
|
|
|
} |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
sub detect_input_format { |
145
|
1
|
|
|
1
|
1
|
4
|
my ($self, $filename, $magic)= @_; |
146
|
|
|
|
|
|
|
|
147
|
1
|
|
|
|
|
3
|
my $input= $self->input; |
148
|
|
|
|
|
|
|
# As convenience to spreadsheet users, let input be a parsed workbook/worksheet object. |
149
|
1
|
50
|
33
|
|
|
22
|
return ('XLSX', sheet => $input) |
150
|
|
|
|
|
|
|
if ref($input) && ref($input)->can('get_cell'); |
151
|
0
|
0
|
0
|
|
|
0
|
return ('XLSX', workbook => $input) |
152
|
|
|
|
|
|
|
if ref($input) && ref($input)->can('worksheets'); |
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
# Load first block of file, unless supplied |
155
|
0
|
|
|
|
|
0
|
my $fpos; |
156
|
0
|
0
|
|
|
|
0
|
if (!defined $magic) { |
157
|
0
|
|
|
|
|
0
|
my $fh= $self->_file_handle; |
158
|
|
|
|
|
|
|
# Need to be able to seek. |
159
|
0
|
0
|
|
|
|
0
|
if (seek($fh, 0, 1)) { |
|
|
0
|
|
|
|
|
|
160
|
0
|
|
|
|
|
0
|
$fpos= tell $fh; |
161
|
0
|
|
|
|
|
0
|
read($fh, $magic, 4096); |
162
|
0
|
0
|
|
|
|
0
|
seek($fh, $fpos, 0) or croak "seek: $!"; |
163
|
|
|
|
|
|
|
} |
164
|
|
|
|
|
|
|
elsif ($fh->can('ungets')) { |
165
|
0
|
|
|
|
|
0
|
$fpos= 0; # to indicate that we did try reading the file |
166
|
0
|
|
|
|
|
0
|
read($fh, $magic, 4096); |
167
|
0
|
|
|
|
|
0
|
$fh->ungets($magic); |
168
|
|
|
|
|
|
|
} |
169
|
|
|
|
|
|
|
else { |
170
|
0
|
|
|
|
|
0
|
$self->_log->('notice',"Can't fully detect input format because handle is not seekable." |
171
|
|
|
|
|
|
|
." Consider fully buffering the file, or using FileHandle::Unget"); |
172
|
0
|
|
|
|
|
0
|
$magic= ''; |
173
|
|
|
|
|
|
|
} |
174
|
|
|
|
|
|
|
} |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
# Excel is obvious so check it first. This handles cases where an excel file is |
177
|
|
|
|
|
|
|
# erroneously named ".csv" and sillyness like that. |
178
|
0
|
0
|
|
|
|
0
|
return ( 'XLSX' ) if $magic =~ /^PK(\x03\x04|\x05\x06|\x07\x08)/; |
179
|
0
|
0
|
|
|
|
0
|
return ( 'XLS' ) if $magic =~ /^\xD0\xCF\x11\xE0/; |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
# Else trust the file extension, because TSV with commas can be very similar to CSV with |
182
|
|
|
|
|
|
|
# tabs in the data, and some crazy person might store an HTML document as the first element |
183
|
|
|
|
|
|
|
# of a CSV file. |
184
|
|
|
|
|
|
|
# Detect filename if not supplied |
185
|
0
|
0
|
|
|
|
0
|
if (!defined $filename) { |
186
|
0
|
|
|
|
|
0
|
$filename= ''; |
187
|
0
|
0
|
0
|
|
|
0
|
$filename= "$input" if defined $input and (!ref $input || ref($input) =~ /path|file/i); |
|
|
|
0
|
|
|
|
|
188
|
|
|
|
|
|
|
} |
189
|
0
|
0
|
|
|
|
0
|
if ($filename =~ /\.([^.]+)$/) { |
190
|
0
|
|
|
|
|
0
|
my $suffix= uc($1); |
191
|
0
|
0
|
|
|
|
0
|
return 'HTML' if $suffix eq 'HTM'; |
192
|
0
|
|
|
|
|
0
|
return $suffix; |
193
|
|
|
|
|
|
|
} |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
# Else probe some more... |
196
|
0
|
|
|
|
|
0
|
$self->_log->('debug',"Probing file format because no filename suffix"); |
197
|
0
|
0
|
|
|
|
0
|
length $magic or croak "Can't probe format. No filename suffix, and " |
|
|
0
|
|
|
|
|
|
198
|
|
|
|
|
|
|
.(!defined $fpos? "unseekable file handle" : "no content"); |
199
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
# HTML is pretty obvious |
201
|
0
|
0
|
|
|
|
0
|
return 'HTML' if $magic =~ /^(\xEF\xBB\xBF|\xFF\xFE|\xFE\xFF)?<(!DOCTYPE )HTML/i; |
202
|
|
|
|
|
|
|
# Else guess between CSV and TSV |
203
|
0
|
|
|
|
|
0
|
my ($probably_csv, $probably_tsv)= (0,0); |
204
|
0
|
0
|
|
|
|
0
|
++$probably_csv if $magic =~ /^(\xEF\xBB\xBF|\xFF\xFE|\xFE\xFF)?["']?[\w ]+["']?,/; |
205
|
0
|
0
|
|
|
|
0
|
++$probably_tsv if $magic =~ /^(\xEF\xBB\xBF|\xFF\xFE|\xFE\xFF)?["']?[\w ]+["']?\t/; |
206
|
0
|
|
|
|
|
0
|
my $comma_count= () = ($magic =~ /,/g); |
207
|
0
|
|
|
|
|
0
|
my $tab_count= () = ($magic =~ /\t/g); |
208
|
0
|
|
|
|
|
0
|
my $eol_count= () = ($magic =~ /\n/g); |
209
|
0
|
0
|
0
|
|
|
0
|
++$probably_csv if $comma_count > $eol_count and $comma_count > $tab_count; |
210
|
0
|
0
|
0
|
|
|
0
|
++$probably_tsv if $tab_count > $eol_count and $tab_count > $comma_count; |
211
|
0
|
|
|
|
|
0
|
$self->_log->('debug', 'probe results: comma_count=%d tab_count=%d eol_count=%d probably_csv=%d probably_tsv=%d', |
212
|
|
|
|
|
|
|
$comma_count, $tab_count, $eol_count, $probably_csv, $probably_tsv); |
213
|
0
|
0
|
0
|
|
|
0
|
return 'CSV' if $probably_csv and $probably_csv > $probably_tsv; |
214
|
0
|
0
|
0
|
|
|
0
|
return 'TSV' if $probably_tsv and $probably_tsv > $probably_csv; |
215
|
0
|
|
|
|
|
0
|
croak "Can't determine file format"; |
216
|
|
|
|
|
|
|
} |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
has _table_found => ( is => 'rw', lazy => 1, builder => 1, clearer => 1, predicate => 1 ); |
220
|
|
|
|
|
|
|
sub _build__table_found { |
221
|
4
|
|
|
4
|
|
26
|
my $self= shift; |
222
|
4
|
|
|
|
|
10
|
my %loc= ( croak_on_fail => 1 ); |
223
|
4
|
|
|
|
|
57
|
$self->_find_table($self->decoder->iterator, \%loc); |
224
|
4
|
|
|
|
|
16
|
\%loc; |
225
|
|
|
|
|
|
|
} |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
sub find_table { |
228
|
9
|
|
|
9
|
1
|
3078
|
my $self= shift; |
229
|
9
|
50
|
|
|
|
38
|
return 1 if $self->_has_table_found; |
230
|
9
|
|
|
|
|
13
|
my %loc; |
231
|
9
|
50
|
|
|
|
233
|
if ($self->_find_table($self->decoder->iterator, \%loc)) { |
232
|
9
|
|
|
|
|
157
|
$self->_table_found(\%loc); |
233
|
9
|
|
|
|
|
81
|
return 1; |
234
|
|
|
|
|
|
|
} |
235
|
0
|
|
|
|
|
0
|
return 0; |
236
|
|
|
|
|
|
|
} |
237
|
|
|
|
|
|
|
|
238
|
1
|
|
|
1
|
1
|
20
|
sub col_map { shift->_table_found->{col_map}; } |
239
|
6
|
|
|
6
|
1
|
1247
|
sub field_map { shift->_table_found->{field_map}; } |
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
sub _find_table { |
242
|
13
|
|
|
13
|
|
31
|
my ($self, $data_iter, $stash)= @_; |
243
|
13
|
|
50
|
|
|
50
|
$stash ||= {}; |
244
|
13
|
|
66
|
|
|
79
|
while (!$self->_find_table_in_dataset($data_iter, $stash) |
|
|
|
66
|
|
|
|
|
245
|
|
|
|
|
|
|
&& !defined $stash->{fatal} |
246
|
|
|
|
|
|
|
&& $data_iter->next_dataset) |
247
|
|
|
|
|
|
|
{} |
248
|
13
|
50
|
|
|
|
41
|
if ($stash->{col_map}) { |
249
|
|
|
|
|
|
|
# Calculate field map from col map |
250
|
13
|
|
|
|
|
20
|
my $col_map= $stash->{col_map}; |
251
|
13
|
|
|
|
|
16
|
my %fmap; |
252
|
13
|
|
|
|
|
30
|
for my $i (0 .. $#$col_map) { |
253
|
61
|
100
|
|
|
|
100
|
next unless $col_map->[$i]; |
254
|
43
|
100
|
|
|
|
87
|
if ($col_map->[$i]->array) { |
255
|
10
|
|
|
|
|
13
|
push @{ $fmap{$col_map->[$i]->name} }, $i; |
|
10
|
|
|
|
|
20
|
|
256
|
|
|
|
|
|
|
} else { |
257
|
33
|
|
|
|
|
80
|
$fmap{$col_map->[$i]->name}= $i; |
258
|
|
|
|
|
|
|
} |
259
|
|
|
|
|
|
|
} |
260
|
13
|
|
|
|
|
26
|
$stash->{field_map}= \%fmap; |
261
|
|
|
|
|
|
|
# And record the stream position of the start of the table |
262
|
13
|
|
|
|
|
94
|
$stash->{first_record_pos}= $data_iter->tell; |
263
|
13
|
|
|
|
|
23
|
$stash->{data_iter}= $data_iter; |
264
|
13
|
|
|
|
|
35
|
return $stash; |
265
|
|
|
|
|
|
|
} |
266
|
|
|
|
|
|
|
else { |
267
|
0
|
|
0
|
|
|
0
|
my $err= $stash->{fatal} || "Can't locate valid header"; |
268
|
0
|
|
|
|
|
0
|
$self->_log->('error', $err); |
269
|
0
|
0
|
|
|
|
0
|
croak $err if $stash->{croak_on_fail}; |
270
|
0
|
|
|
|
|
0
|
return undef; |
271
|
|
|
|
|
|
|
} |
272
|
|
|
|
|
|
|
} |
273
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
sub _find_table_in_dataset { |
275
|
14
|
|
|
14
|
|
51
|
my ($self, $data_iter, $stash)= @_; |
276
|
|
|
|
|
|
|
# If header_row_at is undef, then there is no header. |
277
|
|
|
|
|
|
|
# Ensure static_field_order, then set up columns. |
278
|
14
|
|
|
|
|
38
|
my @fields= $self->field_list; |
279
|
14
|
|
|
|
|
113
|
my $header_at= $self->header_row_at; |
280
|
14
|
50
|
|
|
|
33
|
if (!defined $header_at) { |
281
|
0
|
0
|
|
|
|
0
|
unless ($self->static_field_order) { |
282
|
0
|
|
|
|
|
0
|
$stash->{fatal}= "You must enable 'static_field_order' if there is no header row"; |
283
|
0
|
|
|
|
|
0
|
return; |
284
|
|
|
|
|
|
|
} |
285
|
0
|
|
|
|
|
0
|
$stash->{col_map}= \@fields; |
286
|
0
|
|
|
|
|
0
|
return 1; |
287
|
|
|
|
|
|
|
} |
288
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
# If headers contain "\n", we need to collect multiple cells per column |
290
|
14
|
|
|
|
|
198
|
my $row_accum= $self->header_row_combine; |
291
|
|
|
|
|
|
|
|
292
|
14
|
50
|
|
|
|
69
|
my ($start, $end)= ref $header_at? @$header_at : ( $header_at, $header_at ); |
293
|
14
|
|
|
|
|
18
|
my @rows; |
294
|
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
# If header_row_at doesn't start at 1, seek forward |
296
|
14
|
50
|
|
|
|
33
|
if ($start > 1) { |
297
|
0
|
|
|
|
|
0
|
$self->_log->('trace', 'Skipping to row %s', $start); |
298
|
0
|
|
|
|
|
0
|
push @rows, $data_iter->() for 1..$start-1; |
299
|
|
|
|
|
|
|
} |
300
|
|
|
|
|
|
|
|
301
|
|
|
|
|
|
|
# Scan through the rows of the dataset up to the end of header_row_at, accumulating rows so that |
302
|
|
|
|
|
|
|
# multi-line regexes can match. |
303
|
14
|
|
|
|
|
43
|
for ($start .. $end) { |
304
|
23
|
|
|
|
|
124
|
my $vals= $data_iter->(); |
305
|
23
|
100
|
|
|
|
50
|
if (!$vals) { # if undef, we reached end of dataset |
306
|
1
|
|
|
|
|
16
|
$self->_log->('trace', 'EOF'); |
307
|
1
|
|
|
|
|
8
|
last; |
308
|
|
|
|
|
|
|
} |
309
|
22
|
100
|
|
|
|
42
|
if ($row_accum > 1) { |
310
|
5
|
|
|
|
|
10
|
push @rows, $vals; |
311
|
5
|
|
|
|
|
13
|
shift @rows while @rows > $row_accum; |
312
|
5
|
|
|
|
|
7
|
$vals= [ map { my $c= $_; join("\n", map $_->[$c], @rows) } 0 .. $#{$rows[-1]} ]; |
|
28
|
|
|
|
|
34
|
|
|
28
|
|
|
|
|
67
|
|
|
5
|
|
|
|
|
38
|
|
313
|
5
|
|
|
|
|
19
|
$stash->{context}= $row_accum.' rows ending at '.$data_iter->position; |
314
|
|
|
|
|
|
|
} else { |
315
|
17
|
|
|
|
|
45
|
$stash->{context}= $data_iter->position; |
316
|
|
|
|
|
|
|
} |
317
|
22
|
|
|
|
|
371
|
$self->_log->('trace', 'Checking for headers on %s', $stash->{context}); |
318
|
22
|
|
|
|
|
161
|
$stash->{context}.= ': '; |
319
|
22
|
50
|
|
|
|
91
|
$stash->{col_map}= $self->static_field_order? |
320
|
|
|
|
|
|
|
# If static field order, look for headers in sequence |
321
|
|
|
|
|
|
|
$self->_match_headers_static($vals, $stash) |
322
|
|
|
|
|
|
|
# else search for each header |
323
|
|
|
|
|
|
|
: $self->_match_headers_dynamic($vals, $stash); |
324
|
22
|
100
|
|
|
|
99
|
return 1 if $stash->{col_map}; |
325
|
9
|
50
|
|
|
|
20
|
return if $stash->{fatal}; |
326
|
9
|
|
|
|
|
178
|
$self->_log->('debug', '%sNo match', $stash->{context}); |
327
|
|
|
|
|
|
|
} |
328
|
1
|
|
|
|
|
14
|
$self->_log->('warn','No row in dataset matched full header requirements'); |
329
|
1
|
|
|
|
|
376
|
return; |
330
|
|
|
|
|
|
|
} |
331
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
sub _match_headers_static { |
333
|
0
|
|
|
0
|
|
0
|
my ($self, $header, $stash)= @_; |
334
|
0
|
|
|
|
|
0
|
my $fields= $self->fields; |
335
|
0
|
|
|
|
|
0
|
for my $i (0 .. $#$fields) { |
336
|
0
|
0
|
|
|
|
0
|
next if $header->[$i] =~ $fields->[$i]->header_regex; |
337
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
# Field header doesn't match. Start over on next row. |
339
|
0
|
|
0
|
|
|
0
|
$self->_log->('debug','%sMissing field %s', $stash->{context}||'', $fields->[$i]->name); |
340
|
0
|
|
|
|
|
0
|
return; |
341
|
|
|
|
|
|
|
} |
342
|
|
|
|
|
|
|
# found a match for every field! |
343
|
0
|
|
0
|
|
|
0
|
$self->_log->('debug','%sFound!', $stash->{context}||''); |
344
|
0
|
|
|
|
|
0
|
return $fields; |
345
|
|
|
|
|
|
|
} |
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
sub _match_headers_dynamic { |
348
|
22
|
|
|
22
|
|
33
|
my ($self, $header, $stash)= @_; |
349
|
22
|
|
50
|
|
|
53
|
my $context= $stash->{context} || ''; |
350
|
22
|
|
|
|
|
28
|
my %col_map; |
351
|
22
|
|
|
|
|
308
|
my $fields= $self->fields; |
352
|
|
|
|
|
|
|
my $free_fields= $stash->{free_fields} ||= [ |
353
|
|
|
|
|
|
|
# Sort required fields to front, to fail faster on non-matching rows |
354
|
36
|
100
|
|
|
|
91
|
sort { $a->required? -1 : $b->required? 1 : 0 } |
|
|
100
|
|
|
|
|
|
355
|
22
|
|
100
|
|
|
135
|
grep { !$_->follows_list } @$fields |
|
38
|
|
|
|
|
79
|
|
356
|
|
|
|
|
|
|
]; |
357
|
|
|
|
|
|
|
my $follows_fields= $stash->{follows_fields} ||= [ |
358
|
22
|
|
100
|
|
|
63
|
grep { $_->follows_list } @$fields |
|
38
|
|
|
|
|
62
|
|
359
|
|
|
|
|
|
|
]; |
360
|
22
|
|
|
|
|
44
|
for my $f (@$free_fields) { |
361
|
46
|
|
|
|
|
632
|
my $hr= $f->header_regex; |
362
|
46
|
|
|
|
|
794
|
$self->_log->('debug', 'looking for %s', $hr); |
363
|
46
|
|
|
|
|
313
|
my @found= grep { $header->[$_] =~ $hr } 0 .. $#$header; |
|
268
|
|
|
|
|
741
|
|
364
|
46
|
100
|
|
|
|
125
|
if (@found == 1) { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
365
|
34
|
50
|
|
|
|
78
|
if ($col_map{$found[0]}) { |
366
|
0
|
|
|
|
|
0
|
$self->_log->('debug', 'search status: '._colmap_progress_str(\%col_map, $header)); |
367
|
|
|
|
|
|
|
$self->_log->('info','%sField %s and %s both match column %s', |
368
|
0
|
|
|
|
|
0
|
$context, $f->name, $col_map{$found[0]}->name, $found[0]); |
369
|
0
|
|
|
|
|
0
|
return; |
370
|
|
|
|
|
|
|
} |
371
|
34
|
|
|
|
|
81
|
$col_map{$found[0]}= $f; |
372
|
|
|
|
|
|
|
} |
373
|
|
|
|
|
|
|
elsif (@found > 1) { |
374
|
1
|
50
|
|
|
|
5
|
if ($f->array) { |
375
|
|
|
|
|
|
|
# Array columns may be found more than once |
376
|
1
|
|
|
|
|
6
|
$col_map{$_}= $f for @found; |
377
|
|
|
|
|
|
|
} else { |
378
|
0
|
|
|
|
|
0
|
$self->_log->('debug', 'search status: '._colmap_progress_str(\%col_map, $header)); |
379
|
0
|
|
|
|
|
0
|
$self->_log->('info','%sField %s matches more than one column: %s', |
380
|
|
|
|
|
|
|
$context, $f->name, join(', ', @found)); |
381
|
0
|
|
|
|
|
0
|
return; |
382
|
|
|
|
|
|
|
} |
383
|
|
|
|
|
|
|
} |
384
|
|
|
|
|
|
|
elsif ($f->required) { |
385
|
9
|
|
|
|
|
27
|
$self->_log->('debug', 'search status: '._colmap_progress_str(\%col_map, $header)); |
386
|
9
|
|
|
|
|
175
|
$self->_log->('info','%sNo match for required field %s', $context, $f->name); |
387
|
9
|
|
|
|
|
3453
|
return; |
388
|
|
|
|
|
|
|
} |
389
|
|
|
|
|
|
|
# else Not required, and not found |
390
|
|
|
|
|
|
|
} |
391
|
|
|
|
|
|
|
# Need to have found at least one column (even if none required) |
392
|
13
|
50
|
|
|
|
35
|
unless (keys %col_map) { |
393
|
0
|
|
|
|
|
0
|
$self->_log->('debug', 'search status: '._colmap_progress_str(\%col_map, $header)); |
394
|
0
|
|
|
|
|
0
|
$self->_log->('debug','%sNo field headers found', $context); |
395
|
0
|
|
|
|
|
0
|
return; |
396
|
|
|
|
|
|
|
} |
397
|
|
|
|
|
|
|
# Now, check for any of the 'follows' fields, some of which might also be 'required'. |
398
|
13
|
100
|
|
|
|
31
|
if (@$follows_fields) { |
399
|
2
|
|
|
|
|
3
|
my %following; |
400
|
|
|
|
|
|
|
my %found; |
401
|
2
|
|
|
|
|
6
|
for my $i (0 .. $#$header) { |
402
|
17
|
100
|
|
|
|
31
|
if ($col_map{$i}) { |
403
|
5
|
|
|
|
|
17
|
%following= ( $col_map{$i}->name => $col_map{$i} ); |
404
|
|
|
|
|
|
|
} else { |
405
|
12
|
|
|
|
|
18
|
my $val= $header->[$i]; |
406
|
12
|
|
|
|
|
14
|
my @match; |
407
|
12
|
|
|
|
|
16
|
for my $f (@$follows_fields) { |
408
|
18
|
100
|
|
|
|
46
|
next unless grep $following{$_}, $f->follows_list; |
409
|
9
|
100
|
|
|
|
129
|
push @match, $f if $val =~ $f->header_regex; |
410
|
|
|
|
|
|
|
} |
411
|
12
|
100
|
|
|
|
72
|
if (@match == 1) { |
|
|
50
|
|
|
|
|
|
412
|
7
|
50
|
66
|
|
|
28
|
if ($found{$match[0]} && !$match[0]->array) { |
413
|
0
|
|
|
|
|
0
|
$self->_log->('debug', 'search status: '._colmap_progress_str(\%col_map, $header)); |
414
|
0
|
|
|
|
|
0
|
$self->_log->('info','%sField %s matches multiple columns', |
415
|
|
|
|
|
|
|
$context, $match[0]->name); |
416
|
0
|
|
|
|
|
0
|
return; |
417
|
|
|
|
|
|
|
} |
418
|
7
|
|
|
|
|
12
|
$col_map{$i}= $match[0]; |
419
|
7
|
|
|
|
|
47
|
$found{$match[0]}= $i; |
420
|
7
|
|
|
|
|
18
|
$following{$match[0]->name}= $match[0]; |
421
|
|
|
|
|
|
|
} |
422
|
|
|
|
|
|
|
elsif (@match > 1) { |
423
|
0
|
|
|
|
|
0
|
$self->_log->('debug', 'search status: '._colmap_progress_str(\%col_map, $header)); |
424
|
0
|
|
|
|
|
0
|
$self->_log->('info','%sField %s and %s both match column %d', |
425
|
|
|
|
|
|
|
$context, $match[0]->name, $match[1]->name, $i+1); |
426
|
0
|
|
|
|
|
0
|
return; |
427
|
|
|
|
|
|
|
} |
428
|
|
|
|
|
|
|
else { |
429
|
5
|
|
|
|
|
20
|
%following= (); |
430
|
|
|
|
|
|
|
} |
431
|
|
|
|
|
|
|
} |
432
|
|
|
|
|
|
|
} |
433
|
|
|
|
|
|
|
# Check if any of the 'follows' fields were required |
434
|
2
|
50
|
|
|
|
5
|
if (my @unfound= grep { !$found{$_} && $_->required } @$follows_fields) { |
|
3
|
50
|
|
|
|
14
|
|
435
|
0
|
|
|
|
|
0
|
$self->_log->('debug', 'search status: '._colmap_progress_str(\%col_map, $header)); |
436
|
|
|
|
|
|
|
$self->_log->('info','%sNo match for required %s %s', $context, |
437
|
0
|
0
|
|
|
|
0
|
(@unfound > 1? ('fields', join(', ', map { $_->name } sort @unfound)) |
|
0
|
|
|
|
|
0
|
|
438
|
|
|
|
|
|
|
: ('field', $unfound[0]->name) |
439
|
|
|
|
|
|
|
)); |
440
|
0
|
|
|
|
|
0
|
return; |
441
|
|
|
|
|
|
|
} |
442
|
|
|
|
|
|
|
} |
443
|
|
|
|
|
|
|
# Now, if there are any un-claimed columns, handle per 'on_unknown_columns' setting. |
444
|
13
|
|
|
|
|
31
|
my @unclaimed= grep { !$col_map{$_} } 0 .. $#$header; |
|
61
|
|
|
|
|
113
|
|
445
|
13
|
100
|
|
|
|
33
|
if (@unclaimed) { |
446
|
5
|
|
|
|
|
12
|
my $act= $self->on_unknown_columns; |
447
|
5
|
|
|
|
|
16
|
my $unknown_list= join(', ', map $self->_fmt_header_text($header->[$_]), @unclaimed); |
448
|
5
|
50
|
|
|
|
13
|
$act= $act->($self, $header, \@unclaimed) if ref $act eq 'CODE'; |
449
|
5
|
50
|
|
|
|
13
|
if ($act eq 'use') { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
450
|
5
|
|
|
|
|
80
|
$self->_log->('warn','%sIgnoring unknown columns: %s', $context, $unknown_list); |
451
|
|
|
|
|
|
|
} elsif ($act eq 'next') { |
452
|
0
|
|
|
|
|
0
|
$self->_log->('warn','%sWould match except for unknown columns: %s', |
453
|
|
|
|
|
|
|
$context, $unknown_list); |
454
|
|
|
|
|
|
|
} elsif ($act eq 'die') { |
455
|
0
|
|
|
|
|
0
|
$stash->{fatal}= "${context}Header row includes unknown columns: $unknown_list"; |
456
|
|
|
|
|
|
|
} else { |
457
|
0
|
|
|
|
|
0
|
$stash->{fatal}= "Invalid action '$act' for 'on_unknown_columns'"; |
458
|
|
|
|
|
|
|
} |
459
|
5
|
|
|
|
|
1953
|
$self->_log->('debug', 'search status: '._colmap_progress_str(\%col_map, $header)); |
460
|
5
|
50
|
|
|
|
46
|
return if $stash->{fatal}; |
461
|
|
|
|
|
|
|
} |
462
|
13
|
|
|
|
|
80
|
return [ map $col_map{$_}, 0 .. $#$header ]; |
463
|
|
|
|
|
|
|
} |
464
|
|
|
|
|
|
|
# Make header string readable for log messages |
465
|
|
|
|
|
|
|
sub _fmt_header_text { |
466
|
87
|
100
|
|
87
|
|
128
|
shift if ref $_[0]; |
467
|
87
|
|
|
|
|
106
|
my $x= shift; |
468
|
87
|
|
|
|
|
127
|
$x =~ s/ ( [^[:print:]] ) / sprintf("\\x%02X", ord $1 ) /gex; |
|
11
|
|
|
|
|
36
|
|
469
|
87
|
|
|
|
|
376
|
qq{"$x"}; |
470
|
|
|
|
|
|
|
} |
471
|
|
|
|
|
|
|
# format the colmap into a string |
472
|
|
|
|
|
|
|
sub _colmap_progress_str { |
473
|
14
|
|
|
14
|
|
26
|
my ($colmap, $headers)= @_; |
474
|
|
|
|
|
|
|
join(' ', map { |
475
|
14
|
100
|
|
|
|
29
|
$colmap->{$_}? $_.'='.$colmap->{$_}->name |
|
95
|
|
|
|
|
294
|
|
476
|
|
|
|
|
|
|
: $_.':'._fmt_header_text($headers->[$_]) |
477
|
|
|
|
|
|
|
} 0 .. $#$headers) |
478
|
|
|
|
|
|
|
} |
479
|
|
|
|
|
|
|
|
480
|
|
|
|
|
|
|
|
481
|
|
|
|
|
|
|
sub iterator { |
482
|
15
|
|
|
15
|
1
|
5489
|
my $self= shift; |
483
|
15
|
|
|
|
|
292
|
my $fields= $self->fields; |
484
|
|
|
|
|
|
|
# Creating the record iterator consumes the data source's iterator. |
485
|
|
|
|
|
|
|
# The first time after detecting the table, we continue with the same iterator. |
486
|
|
|
|
|
|
|
# Every time after that we need to create a new data iterator and seek to the |
487
|
|
|
|
|
|
|
# first record under the header. |
488
|
15
|
|
|
|
|
248
|
my $data_iter= delete $self->_table_found->{data_iter}; |
489
|
15
|
100
|
|
|
|
76
|
unless ($data_iter) { |
490
|
2
|
|
|
|
|
25
|
$data_iter= $self->decoder->iterator; |
491
|
2
|
|
|
|
|
29
|
$data_iter->seek($self->_table_found->{first_record_pos}); |
492
|
|
|
|
|
|
|
} |
493
|
15
|
|
|
|
|
208
|
my $col_map= $self->_table_found->{col_map}; |
494
|
15
|
|
|
|
|
234
|
my $field_map= $self->_table_found->{field_map}; |
495
|
15
|
|
|
|
|
118
|
my @row_slice; # one column index per field, and possibly more for array_val_map |
496
|
|
|
|
|
|
|
my @arrayvals; # list of source index and destination index for building array values |
497
|
15
|
|
|
|
|
0
|
my @field_names; # ordered list of field names where row slice should be assigned |
498
|
15
|
|
|
|
|
0
|
my %trimmer; # list of trim functions and the array indicies they should be applied to |
499
|
15
|
|
|
|
|
0
|
my @blank_val; # blank value per each fetched column |
500
|
15
|
|
|
|
|
0
|
my @type_check;# list of |
501
|
15
|
|
|
|
|
0
|
my $class; # optional object class for the resulting rows |
502
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
# If result is array, the slice of the row must match the position of the fields in the |
504
|
|
|
|
|
|
|
# $self->fields array. If a field was not found it will get an undef for that slot. |
505
|
|
|
|
|
|
|
# It also results in an undef for secondary fields of the same name as the first. |
506
|
15
|
50
|
|
|
|
47
|
if ($self->record_class eq 'ARRAY') { |
507
|
0
|
|
|
|
|
0
|
my %remaining= %$field_map; |
508
|
|
|
|
|
|
|
@row_slice= map { |
509
|
0
|
|
|
|
|
0
|
my $src= delete $remaining{$_->name}; |
|
0
|
|
|
|
|
0
|
|
510
|
0
|
0
|
|
|
|
0
|
defined $src? $src : 0x7FFFFFFF |
511
|
|
|
|
|
|
|
} @$fields; |
512
|
|
|
|
|
|
|
} |
513
|
|
|
|
|
|
|
# If result is anything else, then only slice out the columns that are used for the fields |
514
|
|
|
|
|
|
|
# that we located. |
515
|
|
|
|
|
|
|
else { |
516
|
15
|
50
|
|
|
|
39
|
$class= $self->record_class |
517
|
|
|
|
|
|
|
unless 'HASH' eq $self->record_class; |
518
|
15
|
|
|
|
|
45
|
@field_names= keys %$field_map; |
519
|
15
|
|
|
|
|
40
|
@row_slice= values %$field_map; |
520
|
|
|
|
|
|
|
} |
521
|
|
|
|
|
|
|
# For any field whose value is an array of more that one source column, |
522
|
|
|
|
|
|
|
# encode those details in @arrayvals, and update @row_slice and @trim_idx accordingly |
523
|
15
|
|
|
|
|
45
|
for (0 .. $#row_slice) { |
524
|
41
|
100
|
|
|
|
75
|
if (!ref $row_slice[$_]) { |
525
|
39
|
|
|
|
|
52
|
my $field= $col_map->[$row_slice[$_]]; |
526
|
39
|
100
|
|
|
|
576
|
if (my $t= $field->trim_coderef) { |
527
|
38
|
|
100
|
|
|
154
|
$trimmer{$t} ||= [ $t, [] ]; |
528
|
38
|
|
|
|
|
51
|
push @{ $trimmer{$t}[1] }, $_; |
|
38
|
|
|
|
|
98
|
|
529
|
|
|
|
|
|
|
} |
530
|
39
|
|
|
|
|
73
|
push @blank_val, $field->blank; |
531
|
39
|
100
|
|
|
|
106
|
push @type_check, $self->_make_validation_callback($field, $_) |
532
|
|
|
|
|
|
|
if $field->type; |
533
|
|
|
|
|
|
|
} |
534
|
|
|
|
|
|
|
else { |
535
|
|
|
|
|
|
|
# This field is an array-value, so add the src columns to @row_slice |
536
|
|
|
|
|
|
|
# and list it in @arrayvals, and update @trim_idx if needed |
537
|
2
|
|
|
|
|
4
|
my $src= $row_slice[$_]; |
538
|
2
|
|
|
|
|
3
|
$row_slice[$_]= 0x7FFFFFFF; |
539
|
2
|
|
|
|
|
4
|
my $from= @row_slice; |
540
|
2
|
|
|
|
|
5
|
push @row_slice, @$src; |
541
|
2
|
|
|
|
|
6
|
push @arrayvals, [ $_, $from, scalar @$src ]; |
542
|
2
|
|
|
|
|
4
|
for ($from .. $#row_slice) { |
543
|
10
|
|
|
|
|
28
|
my $field= $col_map->[$row_slice[$_]]; |
544
|
10
|
50
|
|
|
|
130
|
if (my $t= $field->trim_coderef) { |
545
|
10
|
|
100
|
|
|
69
|
$trimmer{$t} ||= [ $t, [] ]; |
546
|
10
|
|
|
|
|
10
|
push @{ $trimmer{$t}[1] }, $_; |
|
10
|
|
|
|
|
21
|
|
547
|
|
|
|
|
|
|
} |
548
|
10
|
|
|
|
|
17
|
push @blank_val, $field->blank; |
549
|
10
|
50
|
|
|
|
24
|
push @type_check, $self->_make_validation_callback($field, $_) |
550
|
|
|
|
|
|
|
if $field->type; |
551
|
|
|
|
|
|
|
} |
552
|
|
|
|
|
|
|
} |
553
|
|
|
|
|
|
|
} |
554
|
15
|
|
|
|
|
42
|
my @trim= values %trimmer; |
555
|
15
|
|
|
|
|
19
|
@arrayvals= reverse @arrayvals; |
556
|
15
|
|
|
|
|
32
|
my ($n_blank, $first_blank, $eof); |
557
|
|
|
|
|
|
|
my $sub= sub { |
558
|
50
|
100
|
50
|
50
|
|
2183
|
again: |
|
|
|
66
|
|
|
|
|
559
|
|
|
|
|
|
|
# Pull the specific slice of the next row that we need |
560
|
|
|
|
|
|
|
my $row= !$eof && $data_iter->(\@row_slice) |
561
|
|
|
|
|
|
|
or ++$eof && return undef; |
562
|
|
|
|
|
|
|
# Apply 'trim' to any column whose field requested it |
563
|
36
|
|
|
|
|
70
|
for my $t (@trim) { |
564
|
38
|
|
|
|
|
51
|
$t->[0]->() for grep defined, @{$row}[@{$t->[1]}]; |
|
38
|
|
|
|
|
138
|
|
|
38
|
|
|
|
|
57
|
|
565
|
|
|
|
|
|
|
} |
566
|
|
|
|
|
|
|
# Apply 'blank value' to every column which is zero length |
567
|
36
|
|
|
|
|
51
|
$n_blank= 0; |
568
|
|
|
|
|
|
|
$row->[$_]= $blank_val[$_] |
569
|
36
|
100
|
100
|
|
|
69
|
for grep { (!defined $row->[$_] || !length $row->[$_]) && ++$n_blank } 0..$#$row; |
|
97
|
|
|
|
|
348
|
|
570
|
|
|
|
|
|
|
# If all are blank, then handle according to $on_blank_row setting |
571
|
36
|
100
|
|
|
|
97
|
if ($n_blank == @$row) { |
|
|
100
|
|
|
|
|
|
572
|
2
|
|
33
|
|
|
10
|
$first_blank ||= $data_iter->position; |
573
|
2
|
|
|
|
|
10
|
goto again; |
574
|
|
|
|
|
|
|
} elsif ($first_blank) { |
575
|
2
|
50
|
|
|
|
7
|
unless ($self->_handle_blank_row($first_blank, $data_iter->position)) { |
576
|
0
|
|
|
|
|
0
|
$eof= 1; |
577
|
0
|
|
|
|
|
0
|
return undef; |
578
|
|
|
|
|
|
|
} |
579
|
2
|
|
|
|
|
4
|
$first_blank= undef; |
580
|
|
|
|
|
|
|
} |
581
|
|
|
|
|
|
|
# Check type constraints, if any |
582
|
34
|
100
|
|
|
|
61
|
if (@type_check) { |
583
|
18
|
100
|
|
|
|
35
|
if (my @failed= map $_->($row), @type_check) { |
584
|
6
|
100
|
|
|
|
17
|
$self->_handle_validation_fail(\@failed, $row, $data_iter->position.': ') |
585
|
|
|
|
|
|
|
or goto again; |
586
|
|
|
|
|
|
|
} |
587
|
|
|
|
|
|
|
} |
588
|
|
|
|
|
|
|
# Collect all the array-valued fields from the tail of the row |
589
|
30
|
|
|
|
|
57
|
$row->[$_->[0]]= [ splice @$row, $_->[1], $_->[2] ] for @arrayvals; |
590
|
|
|
|
|
|
|
# stop here if the return class is 'ARRAY' |
591
|
30
|
50
|
|
|
|
67
|
return $row unless @field_names; |
592
|
|
|
|
|
|
|
# Convert the row to a hashref |
593
|
30
|
|
|
|
|
37
|
my %rec; |
594
|
30
|
|
|
|
|
79
|
@rec{@field_names}= @$row; |
595
|
|
|
|
|
|
|
# Construct a class, if requested, else return hashref |
596
|
30
|
50
|
|
|
|
164
|
return $class? $class->new(\%rec) : \%rec; |
597
|
15
|
|
|
|
|
89
|
}; |
598
|
15
|
|
|
|
|
109
|
return Data::TableReader::_RecIter->new( |
599
|
|
|
|
|
|
|
$sub, { data_iter => $data_iter, reader => $self }, |
600
|
|
|
|
|
|
|
); |
601
|
|
|
|
|
|
|
} |
602
|
|
|
|
|
|
|
|
603
|
|
|
|
|
|
|
sub _make_validation_callback { |
604
|
4
|
|
|
4
|
|
10
|
my ($self, $field, $index)= @_; |
605
|
4
|
|
|
|
|
8
|
my $t= $field->type; |
606
|
|
|
|
|
|
|
ref $t eq 'CODE'? sub { |
607
|
18
|
|
|
18
|
|
41
|
my $e= $t->($_[0][$index]); |
608
|
18
|
100
|
|
|
|
162
|
defined $e? ([ $field, $index, $e ]) : () |
609
|
|
|
|
|
|
|
} |
610
|
|
|
|
|
|
|
: $t->can('validate')? sub { |
611
|
0
|
|
|
0
|
|
0
|
my $e= $t->validate($_[0][$index]); |
612
|
0
|
0
|
|
|
|
0
|
defined $e? ([ $field, $index, $e ]) : () |
613
|
|
|
|
|
|
|
} |
614
|
4
|
0
|
|
|
|
26
|
: croak "Invalid type constraint $t on field ".$field->name; |
|
|
50
|
|
|
|
|
|
615
|
|
|
|
|
|
|
} |
616
|
|
|
|
|
|
|
|
617
|
|
|
|
|
|
|
sub _handle_blank_row { |
618
|
2
|
|
|
2
|
|
6
|
my ($self, $first, $last)= @_; |
619
|
2
|
|
|
|
|
8
|
my $act= $self->on_blank_row; |
620
|
2
|
50
|
|
|
|
8
|
$act= $act->($self, $first, $last) |
621
|
|
|
|
|
|
|
if ref $act eq 'CODE'; |
622
|
2
|
50
|
|
|
|
7
|
if ($act eq 'next') { |
623
|
2
|
|
|
|
|
39
|
$self->_log->('warn', 'Skipping blank rows from %s until %s', $first, $last); |
624
|
2
|
|
|
|
|
954
|
return 1; |
625
|
|
|
|
|
|
|
} |
626
|
0
|
0
|
|
|
|
0
|
if ($act eq 'last') { |
627
|
0
|
|
|
|
|
0
|
$self->_log->('warn', 'Ending at blank row %s', $first); |
628
|
0
|
|
|
|
|
0
|
return 0; |
629
|
|
|
|
|
|
|
} |
630
|
0
|
0
|
|
|
|
0
|
if ($act eq 'die') { |
631
|
0
|
|
|
|
|
0
|
my $msg= "Encountered blank rows at $first..$last"; |
632
|
0
|
|
|
|
|
0
|
$self->_log->('error', $msg); |
633
|
0
|
|
|
|
|
0
|
croak $msg; |
634
|
|
|
|
|
|
|
} |
635
|
0
|
|
|
|
|
0
|
croak "Invalid value for 'on_blank_row': \"$act\""; |
636
|
|
|
|
|
|
|
} |
637
|
|
|
|
|
|
|
|
638
|
|
|
|
|
|
|
sub _handle_validation_fail { |
639
|
6
|
|
|
6
|
|
14
|
my ($self, $failures, $values, $context)= @_; |
640
|
6
|
|
|
|
|
17
|
my $act= $self->on_validation_fail; |
641
|
6
|
100
|
|
|
|
24
|
$act= $act->($self, $failures, $values, $context) |
642
|
|
|
|
|
|
|
if ref $act eq 'CODE'; |
643
|
6
|
|
|
|
|
43
|
my $errors= join(', ', map $_->[0]->name.': '.$_->[2], @$failures); |
644
|
6
|
100
|
|
|
|
14
|
if ($act eq 'next') { |
645
|
3
|
50
|
|
|
|
51
|
$self->_log->('warn', "%sSkipped for data errors: %s", $context, $errors) if $errors; |
646
|
3
|
|
|
|
|
1087
|
return 0; |
647
|
|
|
|
|
|
|
} |
648
|
3
|
100
|
|
|
|
6
|
if ($act eq 'use') { |
649
|
2
|
100
|
|
|
|
20
|
$self->_log->('warn', "%sPossible data errors: %s", $context, $errors) if $errors; |
650
|
2
|
|
|
|
|
6
|
return 1; |
651
|
|
|
|
|
|
|
} |
652
|
1
|
50
|
|
|
|
16
|
if ($act eq 'die') { |
653
|
1
|
|
|
|
|
3
|
my $msg= "${context}Invalid record: $errors"; |
654
|
1
|
|
|
|
|
21
|
$self->_log->('error', $msg); |
655
|
1
|
|
|
|
|
656
|
croak $msg; |
656
|
|
|
|
|
|
|
} |
657
|
|
|
|
|
|
|
} |
658
|
|
|
|
|
|
|
|
659
|
4
|
|
|
4
|
|
885
|
BEGIN { @Data::TableReader::_RecIter::ISA= ( 'Data::TableReader::Iterator' ) } |
660
|
|
|
|
|
|
|
sub Data::TableReader::_RecIter::all { |
661
|
9
|
|
|
9
|
|
16
|
my $self= shift; |
662
|
9
|
|
|
|
|
22
|
my (@rec, $x); |
663
|
9
|
|
|
|
|
19
|
push @rec, $x while ($x= $self->()); |
664
|
9
|
|
|
|
|
200
|
return \@rec; |
665
|
|
|
|
|
|
|
} |
666
|
|
|
|
|
|
|
sub Data::TableReader::_RecIter::position { |
667
|
0
|
|
|
0
|
|
0
|
shift->_fields->{data_iter}->position(@_); |
668
|
|
|
|
|
|
|
} |
669
|
|
|
|
|
|
|
sub Data::TableReader::_RecIter::progress { |
670
|
0
|
|
|
0
|
|
0
|
shift->_fields->{data_iter}->progress(@_); |
671
|
|
|
|
|
|
|
} |
672
|
|
|
|
|
|
|
sub Data::TableReader::_RecIter::tell { |
673
|
0
|
|
|
0
|
|
0
|
shift->_fields->{data_iter}->tell(@_); |
674
|
|
|
|
|
|
|
} |
675
|
|
|
|
|
|
|
sub Data::TableReader::_RecIter::seek { |
676
|
0
|
|
|
0
|
|
0
|
shift->_fields->{data_iter}->seek(@_); |
677
|
|
|
|
|
|
|
} |
678
|
|
|
|
|
|
|
sub Data::TableReader::_RecIter::next_dataset { |
679
|
|
|
|
|
|
|
shift->_fields->{reader}->_log |
680
|
0
|
|
|
0
|
|
0
|
->('warn',"Searching for supsequent table headers is not supported yet"); |
681
|
0
|
|
|
|
|
0
|
return 0; |
682
|
|
|
|
|
|
|
} |
683
|
|
|
|
|
|
|
|
684
|
|
|
|
|
|
|
1; |
685
|
|
|
|
|
|
|
|
686
|
|
|
|
|
|
|
__END__ |