File Coverage

blib/lib/Data/TableReader/Decoder/CSV.pm
Criterion Covered Total %
statement 86 103 83.5
branch 34 60 56.6
condition 21 37 56.7
subroutine 16 17 94.1
pod 2 3 66.6
total 159 220 72.2


line stmt bran cond sub pod time code
1             package Data::TableReader::Decoder::CSV;
2             $Data::TableReader::Decoder::CSV::VERSION = '0.010';
3 4     4   103808 use Moo 2;
  4         11032  
  4         28  
4 4     4   2485 use Try::Tiny;
  4         10  
  4         257  
5 4     4   32 use Carp;
  4         11  
  4         247  
6 4     4   2153 use IO::Handle;
  4         24554  
  4         3976  
7             extends 'Data::TableReader::Decoder';
8              
9             our @csv_probe_modules= ( ['Text::CSV_XS' => 1.06], ['Text::CSV' => 1.91] );
10             our $default_csv_module;
11             sub default_csv_module {
12 21   66 21 0 619 $default_csv_module ||=
13             Data::TableReader::Decoder::_first_sufficient_module('CSV parser', \@csv_probe_modules);
14             }
15              
16             # ABSTRACT: Access rows of a comma-delimited text file
17              
18              
19             has _parser_args => ( is => 'ro', init_arg => 'parser' );
20              
21             has parser => ( is => 'lazy', init_arg => undef );
22             sub _build_parser {
23 16     16   154 my $self= shift;
24 16   50     96 my $args= $self->_parser_args || {};
25 16 50       130 return $args if ref($args)->can('getline');
26 16         54 return $self->default_csv_module->new({
27             binary => 1,
28             allow_loose_quotes => 1,
29             auto_diag => 2,
30             %$args
31             });
32             }
33              
34              
35             has autodetect_encoding => ( is => 'rw', default => sub { 1 } );
36              
37             sub encoding {
38 18     18 1 52 my ($self, $enc)= @_;
39 18         43 my $fh= $self->file_handle;
40 18 50       55 if (defined $enc) {
41 0         0 binmode($fh, ":encoding($enc)");
42 0         0 return $enc;
43             }
44            
45 18         123 my @layers= PerlIO::get_layers($fh);
46 18 50       55 if (($enc)= grep { /^encoding|^utf/ } @layers) {
  23         137  
47             # extract encoding name
48 0 0       0 return 'UTF-8' if $enc eq 'utf8';
49 0 0       0 return uc($1) if $enc =~ /encoding\(([^)]+)\)/;
50 0         0 return uc($enc); # could throw a parse error, but this is probably more useful behavior
51             }
52            
53             # fh_start_pos will be set if we have already checked for BOM
54 18 50 33     175 if ($self->autodetect_encoding && !defined $self->_fh_start_pos) {
55 18   50     124 $self->_fh_start_pos(tell $fh or 0);
56 18 100       63 if (($enc= $self->_autodetect_bom($fh))) {
57 1     1   10 binmode($fh, ":encoding($enc)");
  1         3  
  1         8  
  6         107  
58             # re-mark the start after the BOM
59 6   50     21021 $self->_fh_start_pos(tell $fh or 0);
60 6         24 return $enc;
61             }
62             }
63 11         35 return '';
64             }
65              
66              
67             has _fh_start_pos => ( is => 'rw' );
68             has _iterator => ( is => 'rw', weak_ref => 1 );
69             has _row_ref => ( is => 'rw' );
70             sub iterator {
71 20     20 1 7211 my $self= shift;
72 20 100       458 croak "Multiple iterators on CSV stream not supported yet" if $self->_iterator;
73 19         449 my $parser= $self->parser;
74 19         14976 my $fh= $self->file_handle;
75 19         61 my $row_ref= $self->_row_ref;
76             # Keeping this object is just an indication of whether an iterator has been used yet
77 19 100       66 if (!$row_ref) {
    50          
78 18         55 $self->_row_ref($row_ref= \(my $row= 0));
79             # trigger BOM detection if needed
80 18         72 my $enc= $self->encoding;
81 17   100     150 $self->_log->('debug', "encoding is ".($enc||'maybe utf8'));
82             # ensure _fh_start_pos is set
83 17   100     262 $self->_fh_start_pos(tell $fh or 0);
84             }
85             elsif ($$row_ref) {
86 0         0 $self->_log->('debug', 'Seeking back to start of input');
87 0 0       0 seek($fh, $self->_fh_start_pos, 0)
88             or die "Can't seek back to start of stream";
89 0         0 $$row_ref= 0;
90             }
91             my $i= Data::TableReader::Decoder::CSV::_Iter->new(
92             sub {
93 87     87   176 ++$$row_ref;
94 87 100       2583 my $r= $parser->getline($fh) or return undef;
95 71 100       2329 @$r= @{$r}[ @{$_[0]} ] if $_[0]; # optional slice argument
  26         83  
  26         50  
96 71         378 return $r;
97             },
98             {
99 18         294 row => $row_ref,
100             fh => $fh,
101             origin => $self->_fh_start_pos,
102             }
103             );
104 18         492 $self->_iterator($i);
105 18         274 return $i;
106             }
107              
108             # This design is simplified from File::BOM in that it ignores UTF-32
109             # and in any "normal" case it can read from a pipe with only one
110             # character to push back, avoiding the need to tie the file handle.
111             # It also checks for whether layers have already been enabled.
112             # It also avoids seeking to the start of the file handle, in case
113             # the user deliberately seeked to a position.
114             sub _autodetect_bom {
115 18     18   40 my ($self, $fh)= @_;
116 18         45 my $fpos= tell($fh);
117            
118 18         109 local $!;
119 18 50       170 read($fh, my $buf, 1) || return;
120 18 100 100     128 if ($buf eq "\xFF" || $buf eq "\xFE" || $buf eq "\xEF") {
      100        
121 8 50       30 if (read($fh, $buf, 1, 1)) {
122 8 100 66     55 if ($buf eq "\xFF\xFE") {
    100 66        
    100          
123 2         12 return 'UTF-16LE';
124             } elsif ($buf eq "\xFE\xFF") {
125 2         11 return 'UTF-16BE';
126             } elsif ($buf eq "\xEF\xBB" and read($fh, $buf, 1, 2) and $buf eq "\xEF\xBB\xBF") {
127 2         15 return 'UTF-8';
128             }
129             }
130             }
131            
132             # It wasn't a BOM. Try to undo our read.
133 12         77 $self->_log->('debug', 'No BOM in stream, seeking back to start');
134 12 100       208 if (length $buf == 1) {
    100          
135 10         196 $fh->ungetc(ord $buf);
136             } elsif (!seek($fh, $fpos, 0)) {
137             # Can't seek
138 1 50       26 if ($fh->can('ungets')) { # support for FileHandle::Unget
139 0         0 $fh->ungets($buf);
140             } else {
141 1         218 croak "Can't seek input handle after BOM detection; You should set an encoding manually, buffer the entire input, or use FileHandle::Unget";
142             }
143             }
144 11         72 return;
145             }
146              
147             # If you need to subclass this iterator, don't. Just implement your own.
148             # i.e. I'm not declaring this implementation stable, yet.
149 4     4   1476 use Data::TableReader::Iterator;
  4         11  
  4         163  
150 4     4   1467 BEGIN { @Data::TableReader::Decoder::CSV::_Iter::ISA= ('Data::TableReader::Iterator'); }
151              
152             sub Data::TableReader::Decoder::CSV::_Iter::position {
153 12     12   34 my $f= shift->_fields;
154 12         20 'row '.${ $f->{row} };
  12         57  
155             }
156              
157             sub Data::TableReader::Decoder::CSV::_Iter::progress {
158 0     0   0 my $f= shift->_fields;
159             # lazy-build the file size, using seek
160 0 0       0 unless (exists $f->{file_size}) {
161 0         0 my $pos= tell $f->{fh};
162 0 0 0     0 if (defined $pos and $pos >= 0 and seek($f->{fh}, 0, 2)) {
      0        
163 0         0 $f->{file_size}= tell($f->{fh});
164 0 0       0 seek($f->{fh}, $pos, 0) or die "seek: $!";
165             } else {
166 0         0 $f->{file_size}= undef;
167             }
168             }
169 0 0       0 return $f->{file_size}? (tell $f->{fh})/$f->{file_size} : undef;
170             }
171              
172             sub Data::TableReader::Decoder::CSV::_Iter::tell {
173 4     4   11 my $f= shift->_fields;
174 4         11 my $pos= tell($f->{fh});
175 4 50 33     17 return undef unless defined $pos && $pos >= 0;
176 4         10 return [ $pos, ${$f->{row}} ];
  4         14  
177             }
178              
179             sub Data::TableReader::Decoder::CSV::_Iter::seek {
180 7     7   25 my ($self, $to)= @_;
181 7         39 my $f= $self->_fields;
182 7 50       59 seek($f->{fh}, ($to? $to->[0] : $f->{origin}), 0) or croak("seek failed: $!");
    50          
183 7 50       21 ${ $f->{row} }= $to? $to->[1] : 0;
  7         22  
184 7         26 1;
185             }
186              
187             1;
188              
189             __END__