line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Data::TableReader::Decoder::CSV; |
2
|
4
|
|
|
4
|
|
97052
|
use Moo 2; |
|
4
|
|
|
|
|
10251
|
|
|
4
|
|
|
|
|
37
|
|
3
|
4
|
|
|
4
|
|
2267
|
use Try::Tiny; |
|
4
|
|
|
|
|
10
|
|
|
4
|
|
|
|
|
236
|
|
4
|
4
|
|
|
4
|
|
24
|
use Carp; |
|
4
|
|
|
|
|
8
|
|
|
4
|
|
|
|
|
218
|
|
5
|
4
|
|
|
4
|
|
1962
|
use IO::Handle; |
|
4
|
|
|
|
|
22782
|
|
|
4
|
|
|
|
|
3700
|
|
6
|
|
|
|
|
|
|
extends 'Data::TableReader::Decoder'; |
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
# ABSTRACT: Access rows of a comma-delimited text file |
9
|
|
|
|
|
|
|
our $VERSION = '0.011'; # VERSION |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
our @csv_probe_modules= ( ['Text::CSV_XS' => 1.06], ['Text::CSV' => 1.91] ); |
12
|
|
|
|
|
|
|
our $default_csv_module; |
13
|
|
|
|
|
|
|
sub default_csv_module { |
14
|
21
|
|
66
|
21
|
0
|
588
|
$default_csv_module ||= |
15
|
|
|
|
|
|
|
Data::TableReader::Decoder::_first_sufficient_module('CSV parser', \@csv_probe_modules); |
16
|
|
|
|
|
|
|
} |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
has _parser_args => ( is => 'ro', init_arg => 'parser' ); |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
has parser => ( is => 'lazy', init_arg => undef ); |
22
|
|
|
|
|
|
|
sub _build_parser { |
23
|
16
|
|
|
16
|
|
130
|
my $self= shift; |
24
|
16
|
|
50
|
|
|
85
|
my $args= $self->_parser_args || {}; |
25
|
16
|
50
|
|
|
|
114
|
return $args if ref($args)->can('getline'); |
26
|
16
|
|
|
|
|
49
|
return $self->default_csv_module->new({ |
27
|
|
|
|
|
|
|
binary => 1, |
28
|
|
|
|
|
|
|
allow_loose_quotes => 1, |
29
|
|
|
|
|
|
|
auto_diag => 2, |
30
|
|
|
|
|
|
|
%$args |
31
|
|
|
|
|
|
|
}); |
32
|
|
|
|
|
|
|
} |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
has autodetect_encoding => ( is => 'rw', default => sub { 1 } ); |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
sub encoding { |
38
|
18
|
|
|
18
|
1
|
43
|
my ($self, $enc)= @_; |
39
|
18
|
|
|
|
|
36
|
my $fh= $self->file_handle; |
40
|
18
|
50
|
|
|
|
51
|
if (defined $enc) { |
41
|
0
|
|
|
|
|
0
|
binmode($fh, ":encoding($enc)"); |
42
|
0
|
|
|
|
|
0
|
return $enc; |
43
|
|
|
|
|
|
|
} |
44
|
|
|
|
|
|
|
|
45
|
18
|
|
|
|
|
114
|
my @layers= PerlIO::get_layers($fh); |
46
|
18
|
50
|
|
|
|
60
|
if (($enc)= grep { /^encoding|^utf/ } @layers) { |
|
23
|
|
|
|
|
124
|
|
47
|
|
|
|
|
|
|
# extract encoding name |
48
|
0
|
0
|
|
|
|
0
|
return 'UTF-8' if $enc eq 'utf8'; |
49
|
0
|
0
|
|
|
|
0
|
return uc($1) if $enc =~ /encoding\(([^)]+)\)/; |
50
|
0
|
|
|
|
|
0
|
return uc($enc); # could throw a parse error, but this is probably more useful behavior |
51
|
|
|
|
|
|
|
} |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
# fh_start_pos will be set if we have already checked for BOM |
54
|
18
|
50
|
33
|
|
|
152
|
if ($self->autodetect_encoding && !defined $self->_fh_start_pos) { |
55
|
18
|
|
50
|
|
|
112
|
$self->_fh_start_pos(tell $fh or 0); |
56
|
18
|
100
|
|
|
|
51
|
if (($enc= $self->_autodetect_bom($fh))) { |
57
|
1
|
|
|
1
|
|
7
|
binmode($fh, ":encoding($enc)"); |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
5
|
|
|
6
|
|
|
|
|
83
|
|
58
|
|
|
|
|
|
|
# re-mark the start after the BOM |
59
|
6
|
|
50
|
|
|
18425
|
$self->_fh_start_pos(tell $fh or 0); |
60
|
6
|
|
|
|
|
19
|
return $enc; |
61
|
|
|
|
|
|
|
} |
62
|
|
|
|
|
|
|
} |
63
|
11
|
|
|
|
|
33
|
return ''; |
64
|
|
|
|
|
|
|
} |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
has _fh_start_pos => ( is => 'rw' ); |
68
|
|
|
|
|
|
|
has _iterator => ( is => 'rw', weak_ref => 1 ); |
69
|
|
|
|
|
|
|
has _row_ref => ( is => 'rw' ); |
70
|
|
|
|
|
|
|
sub iterator { |
71
|
20
|
|
|
20
|
1
|
7050
|
my $self= shift; |
72
|
20
|
100
|
|
|
|
446
|
croak "Multiple iterators on CSV stream not supported yet" if $self->_iterator; |
73
|
19
|
|
|
|
|
417
|
my $parser= $self->parser; |
74
|
19
|
|
|
|
|
12426
|
my $fh= $self->file_handle; |
75
|
19
|
|
|
|
|
51
|
my $row_ref= $self->_row_ref; |
76
|
|
|
|
|
|
|
# Keeping this object is just an indication of whether an iterator has been used yet |
77
|
19
|
100
|
|
|
|
56
|
if (!$row_ref) { |
|
|
50
|
|
|
|
|
|
78
|
18
|
|
|
|
|
55
|
$self->_row_ref($row_ref= \(my $row= 0)); |
79
|
|
|
|
|
|
|
# trigger BOM detection if needed |
80
|
18
|
|
|
|
|
64
|
my $enc= $self->encoding; |
81
|
17
|
|
100
|
|
|
121
|
$self->_log->('debug', "encoding is ".($enc||'maybe utf8')); |
82
|
|
|
|
|
|
|
# ensure _fh_start_pos is set |
83
|
17
|
|
100
|
|
|
224
|
$self->_fh_start_pos(tell $fh or 0); |
84
|
|
|
|
|
|
|
} |
85
|
|
|
|
|
|
|
elsif ($$row_ref) { |
86
|
0
|
|
|
|
|
0
|
$self->_log->('debug', 'Seeking back to start of input'); |
87
|
0
|
0
|
|
|
|
0
|
seek($fh, $self->_fh_start_pos, 0) |
88
|
|
|
|
|
|
|
or die "Can't seek back to start of stream"; |
89
|
0
|
|
|
|
|
0
|
$$row_ref= 0; |
90
|
|
|
|
|
|
|
} |
91
|
|
|
|
|
|
|
my $i= Data::TableReader::Decoder::CSV::_Iter->new( |
92
|
|
|
|
|
|
|
sub { |
93
|
87
|
|
|
87
|
|
173
|
++$$row_ref; |
94
|
87
|
100
|
|
|
|
2402
|
my $r= $parser->getline($fh) or return undef; |
95
|
71
|
100
|
|
|
|
2134
|
@$r= @{$r}[ @{$_[0]} ] if $_[0]; # optional slice argument |
|
26
|
|
|
|
|
77
|
|
|
26
|
|
|
|
|
47
|
|
96
|
71
|
|
|
|
|
331
|
return $r; |
97
|
|
|
|
|
|
|
}, |
98
|
|
|
|
|
|
|
{ |
99
|
18
|
|
|
|
|
278
|
row => $row_ref, |
100
|
|
|
|
|
|
|
fh => $fh, |
101
|
|
|
|
|
|
|
origin => $self->_fh_start_pos, |
102
|
|
|
|
|
|
|
} |
103
|
|
|
|
|
|
|
); |
104
|
18
|
|
|
|
|
426
|
$self->_iterator($i); |
105
|
18
|
|
|
|
|
259
|
return $i; |
106
|
|
|
|
|
|
|
} |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
# This design is simplified from File::BOM in that it ignores UTF-32 |
109
|
|
|
|
|
|
|
# and in any "normal" case it can read from a pipe with only one |
110
|
|
|
|
|
|
|
# character to push back, avoiding the need to tie the file handle. |
111
|
|
|
|
|
|
|
# It also checks for whether layers have already been enabled. |
112
|
|
|
|
|
|
|
# It also avoids seeking to the start of the file handle, in case |
113
|
|
|
|
|
|
|
# the user deliberately seeked to a position. |
114
|
|
|
|
|
|
|
sub _autodetect_bom { |
115
|
18
|
|
|
18
|
|
47
|
my ($self, $fh)= @_; |
116
|
18
|
|
|
|
|
37
|
my $fpos= tell($fh); |
117
|
|
|
|
|
|
|
|
118
|
18
|
|
|
|
|
93
|
local $!; |
119
|
18
|
50
|
|
|
|
150
|
read($fh, my $buf, 1) || return; |
120
|
18
|
100
|
100
|
|
|
113
|
if ($buf eq "\xFF" || $buf eq "\xFE" || $buf eq "\xEF") { |
|
|
|
100
|
|
|
|
|
121
|
8
|
50
|
|
|
|
27
|
if (read($fh, $buf, 1, 1)) { |
122
|
8
|
100
|
66
|
|
|
43
|
if ($buf eq "\xFF\xFE") { |
|
|
100
|
66
|
|
|
|
|
|
|
100
|
|
|
|
|
|
123
|
2
|
|
|
|
|
11
|
return 'UTF-16LE'; |
124
|
|
|
|
|
|
|
} elsif ($buf eq "\xFE\xFF") { |
125
|
2
|
|
|
|
|
9
|
return 'UTF-16BE'; |
126
|
|
|
|
|
|
|
} elsif ($buf eq "\xEF\xBB" and read($fh, $buf, 1, 2) and $buf eq "\xEF\xBB\xBF") { |
127
|
2
|
|
|
|
|
11
|
return 'UTF-8'; |
128
|
|
|
|
|
|
|
} |
129
|
|
|
|
|
|
|
} |
130
|
|
|
|
|
|
|
} |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
# It wasn't a BOM. Try to undo our read. |
133
|
12
|
|
|
|
|
75
|
$self->_log->('debug', 'No BOM in stream, seeking back to start'); |
134
|
12
|
100
|
|
|
|
203
|
if (length $buf == 1) { |
|
|
100
|
|
|
|
|
|
135
|
10
|
|
|
|
|
197
|
$fh->ungetc(ord $buf); |
136
|
|
|
|
|
|
|
} elsif (!seek($fh, $fpos, 0)) { |
137
|
|
|
|
|
|
|
# Can't seek |
138
|
1
|
50
|
|
|
|
22
|
if ($fh->can('ungets')) { # support for FileHandle::Unget |
139
|
0
|
|
|
|
|
0
|
$fh->ungets($buf); |
140
|
|
|
|
|
|
|
} else { |
141
|
1
|
|
|
|
|
190
|
croak "Can't seek input handle after BOM detection; You should set an encoding manually, buffer the entire input, or use FileHandle::Unget"; |
142
|
|
|
|
|
|
|
} |
143
|
|
|
|
|
|
|
} |
144
|
11
|
|
|
|
|
63
|
return; |
145
|
|
|
|
|
|
|
} |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
# If you need to subclass this iterator, don't. Just implement your own. |
148
|
|
|
|
|
|
|
# i.e. I'm not declaring this implementation stable, yet. |
149
|
4
|
|
|
4
|
|
1255
|
use Data::TableReader::Iterator; |
|
4
|
|
|
|
|
8
|
|
|
4
|
|
|
|
|
142
|
|
150
|
4
|
|
|
4
|
|
1364
|
BEGIN { @Data::TableReader::Decoder::CSV::_Iter::ISA= ('Data::TableReader::Iterator'); } |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
sub Data::TableReader::Decoder::CSV::_Iter::position { |
153
|
12
|
|
|
12
|
|
34
|
my $f= shift->_fields; |
154
|
12
|
|
|
|
|
19
|
'row '.${ $f->{row} }; |
|
12
|
|
|
|
|
47
|
|
155
|
|
|
|
|
|
|
} |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
sub Data::TableReader::Decoder::CSV::_Iter::progress { |
158
|
0
|
|
|
0
|
|
0
|
my $f= shift->_fields; |
159
|
|
|
|
|
|
|
# lazy-build the file size, using seek |
160
|
0
|
0
|
|
|
|
0
|
unless (exists $f->{file_size}) { |
161
|
0
|
|
|
|
|
0
|
my $pos= tell $f->{fh}; |
162
|
0
|
0
|
0
|
|
|
0
|
if (defined $pos and $pos >= 0 and seek($f->{fh}, 0, 2)) { |
|
|
|
0
|
|
|
|
|
163
|
0
|
|
|
|
|
0
|
$f->{file_size}= tell($f->{fh}); |
164
|
0
|
0
|
|
|
|
0
|
seek($f->{fh}, $pos, 0) or die "seek: $!"; |
165
|
|
|
|
|
|
|
} else { |
166
|
0
|
|
|
|
|
0
|
$f->{file_size}= undef; |
167
|
|
|
|
|
|
|
} |
168
|
|
|
|
|
|
|
} |
169
|
0
|
0
|
|
|
|
0
|
return $f->{file_size}? (tell $f->{fh})/$f->{file_size} : undef; |
170
|
|
|
|
|
|
|
} |
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
sub Data::TableReader::Decoder::CSV::_Iter::tell { |
173
|
4
|
|
|
4
|
|
16
|
my $f= shift->_fields; |
174
|
4
|
|
|
|
|
9
|
my $pos= tell($f->{fh}); |
175
|
4
|
50
|
33
|
|
|
17
|
return undef unless defined $pos && $pos >= 0; |
176
|
4
|
|
|
|
|
5
|
return [ $pos, ${$f->{row}} ]; |
|
4
|
|
|
|
|
13
|
|
177
|
|
|
|
|
|
|
} |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
sub Data::TableReader::Decoder::CSV::_Iter::seek { |
180
|
7
|
|
|
7
|
|
24
|
my ($self, $to)= @_; |
181
|
7
|
|
|
|
|
30
|
my $f= $self->_fields; |
182
|
7
|
50
|
|
|
|
54
|
seek($f->{fh}, ($to? $to->[0] : $f->{origin}), 0) or croak("seek failed: $!"); |
|
|
50
|
|
|
|
|
|
183
|
7
|
50
|
|
|
|
18
|
${ $f->{row} }= $to? $to->[1] : 0; |
|
7
|
|
|
|
|
17
|
|
184
|
7
|
|
|
|
|
27
|
1; |
185
|
|
|
|
|
|
|
} |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
1; |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
__END__ |