File Coverage
| blib/lib/Data/TableReader/Decoder/HTML.pm |
|
| Criterion |
Covered |
Total |
% |
| statement |
104 |
113 |
92.0
|
| branch |
34 |
58 |
58.6
|
| condition |
9 |
13 |
69.2
|
| subroutine |
20 |
20 |
100.0
|
| pod |
2 |
2 |
100.0
|
| total |
169 |
206 |
82.0
|
| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Data::TableReader::Decoder::HTML; |
|
2
|
|
|
|
|
|
|
$Data::TableReader::Decoder::HTML::VERSION = '0.009'; |
|
3
|
1
|
|
|
1
|
|
103654
|
use Moo 2; |
|
|
1
|
|
|
|
|
10633
|
|
|
|
1
|
|
|
|
|
7
|
|
|
4
|
1
|
|
|
1
|
|
1452
|
use Try::Tiny; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
54
|
|
|
5
|
1
|
|
|
1
|
|
6
|
use Carp; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
45
|
|
|
6
|
1
|
|
|
1
|
|
546
|
use IO::Handle; |
|
|
1
|
|
|
|
|
6102
|
|
|
|
1
|
|
|
|
|
47
|
|
|
7
|
1
|
|
|
1
|
|
563
|
use HTML::Parser; |
|
|
1
|
|
|
|
|
5796
|
|
|
|
1
|
|
|
|
|
850
|
|
|
8
|
|
|
|
|
|
|
extends 'Data::TableReader::Decoder'; |
|
9
|
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
# ABSTRACT: Access the tables of an HTML document |
|
11
|
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
has _tables => ( is => 'lazy' ); |
|
14
|
|
|
|
|
|
|
sub parse { |
|
15
|
2
|
|
|
2
|
1
|
7513
|
shift->_tables; |
|
16
|
2
|
|
|
|
|
14
|
return 1; |
|
17
|
|
|
|
|
|
|
} |
|
18
|
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
sub _build__tables { |
|
20
|
3
|
|
|
3
|
|
33
|
my $self= shift; |
|
21
|
|
|
|
|
|
|
# TODO: determine encoding from BOM, or from meta-equiv while parsing... |
|
22
|
3
|
|
|
|
|
21
|
binmode $self->file_handle; |
|
23
|
3
|
|
|
|
|
27
|
return $self->_parse_html_tables($self->file_handle); |
|
24
|
|
|
|
|
|
|
} |
|
25
|
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
sub _parse_html_tables { |
|
27
|
3
|
|
|
3
|
|
9
|
my ($self, $handle)= @_; |
|
28
|
|
|
|
|
|
|
# These variables track the state of the HTML parse. |
|
29
|
|
|
|
|
|
|
# cur_row is only defined when we are in a table row, and $cur_cell |
|
30
|
|
|
|
|
|
|
# is a scalar ref only defined when we are in a cell. |
|
31
|
3
|
|
|
|
|
6
|
my (@tables, $cur_table, $cur_row, $cur_cell); |
|
32
|
3
|
|
|
|
|
7
|
my $nested_tables= 0; |
|
33
|
3
|
|
|
|
|
6
|
my $ignore_all= 0; |
|
34
|
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
my $tag_start= sub { |
|
36
|
114
|
50
|
|
114
|
|
572
|
next if $ignore_all; |
|
37
|
114
|
|
|
|
|
235
|
my ($tagname, $attr)= (uc $_[0], $_[1]); |
|
38
|
114
|
100
|
100
|
|
|
406
|
if ($tagname eq 'TABLE') { |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
39
|
6
|
50
|
|
|
|
15
|
if ($cur_table) { |
|
40
|
0
|
|
|
|
|
0
|
$self->_log->('warn','tables within tables are currently returned as a single cell value'); |
|
41
|
0
|
|
|
|
|
0
|
$nested_tables++; |
|
42
|
0
|
|
|
|
|
0
|
$ignore_all++; |
|
43
|
|
|
|
|
|
|
} |
|
44
|
|
|
|
|
|
|
else { |
|
45
|
6
|
|
|
|
|
27
|
push @tables, ($cur_table= []); |
|
46
|
|
|
|
|
|
|
} |
|
47
|
|
|
|
|
|
|
} |
|
48
|
|
|
|
|
|
|
elsif ($tagname eq 'TR') { |
|
49
|
18
|
50
|
|
|
|
40
|
$cur_table or croak "found |
outside any ";
|
50
|
18
|
50
|
|
|
|
41
|
$cur_row and $self->_log->('warn', 'found |
before end of previous row');
|
51
|
18
|
|
|
|
|
69
|
push @$cur_table, ($cur_row= []); |
|
52
|
|
|
|
|
|
|
} |
|
53
|
|
|
|
|
|
|
elsif ($tagname eq 'TD' or $tagname eq 'TH') { |
|
54
|
81
|
50
|
|
|
|
174
|
$cur_table or croak "found <$tagname> outside any ";
|
55
|
81
|
50
|
|
|
|
147
|
$cur_row or croak "found <$tagname> outside any | ";
|
56
|
81
|
50
|
|
|
|
144
|
$cur_cell and $self->_log->('warn', "found <$tagname> before previous $tagname>"); |
|
57
|
81
|
|
|
|
|
189
|
push @$cur_row, ''; |
|
58
|
81
|
|
|
|
|
292
|
$cur_cell= \$cur_row->[-1]; |
|
59
|
|
|
|
|
|
|
} |
|
60
|
3
|
|
|
|
|
22
|
}; |
|
61
|
|
|
|
|
|
|
my $content= sub { |
|
62
|
129
|
|
|
129
|
|
270
|
my ($text)= @_; |
|
63
|
129
|
100
|
33
|
|
|
352
|
if ($cur_cell) { |
|
|
|
50
|
|
|
|
|
|
|
64
|
81
|
|
|
|
|
256
|
$$cur_cell .= $text |
|
65
|
|
|
|
|
|
|
} |
|
66
|
|
|
|
|
|
|
elsif ($cur_row && $text =~ /\S/) { |
|
67
|
0
|
|
|
|
|
0
|
$self->_log->('warn', "Encountered text within a row but not in a cell: '$text'"); |
|
68
|
|
|
|
|
|
|
} |
|
69
|
3
|
|
|
|
|
15
|
}; |
|
70
|
|
|
|
|
|
|
my $tag_end= sub { |
|
71
|
114
|
|
|
114
|
|
310
|
my ($tagname)= (uc($_[0])); |
|
72
|
114
|
50
|
100
|
|
|
366
|
if ($ignore_all) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
73
|
0
|
0
|
|
|
|
0
|
if ($tagname eq 'TABLE') { |
|
74
|
0
|
|
|
|
|
0
|
--$nested_tables; |
|
75
|
0
|
0
|
|
|
|
0
|
$ignore_all= 0 if $nested_tables <= 0; |
|
76
|
|
|
|
|
|
|
} |
|
77
|
|
|
|
|
|
|
} |
|
78
|
|
|
|
|
|
|
elsif ($tagname eq 'TD' or $tagname eq 'TH') { |
|
79
|
81
|
50
|
|
|
|
150
|
$cur_cell or $self->_log->('warn', "Found $tagname> without matching <$tagname>"); |
|
80
|
81
|
|
|
|
|
245
|
$cur_cell= undef; |
|
81
|
|
|
|
|
|
|
} |
|
82
|
|
|
|
|
|
|
elsif ($tagname eq 'TR') { |
|
83
|
18
|
50
|
|
|
|
42
|
$cur_row or $self->_log->('warn', "Found | without matching ");
|
84
|
18
|
50
|
|
|
|
33
|
$cur_cell and $self->_log->('warn', "Found | while still in "); |
|
85
|
18
|
|
|
|
|
28
|
$cur_row= undef; |
|
86
|
18
|
|
|
|
|
46
|
$cur_cell= undef; |
|
87
|
|
|
|
|
|
|
} |
|
88
|
|
|
|
|
|
|
elsif ($tagname eq 'TABLE') { |
|
89
|
6
|
50
|
|
|
|
11
|
$cur_table or $self->_log->('warn', "Found | without matching ");
|
90
|
6
|
50
|
|
|
|
15
|
$cur_row and $self->_log->('warn', "Found | while still in |
");
|
91
|
6
|
50
|
|
|
|
19
|
$cur_cell and $self->_log->('warn', "Found |
while still in "); |
|
92
|
6
|
|
|
|
|
10
|
$cur_table= undef; |
|
93
|
6
|
|
|
|
|
9
|
$cur_row= undef; |
|
94
|
6
|
|
|
|
|
17
|
$cur_cell= undef; |
|
95
|
|
|
|
|
|
|
} |
|
96
|
3
|
|
|
|
|
13
|
}; |
|
97
|
|
|
|
|
|
|
|
|
98
|
3
|
|
|
|
|
34
|
HTML::Parser->new( |
|
99
|
|
|
|
|
|
|
api_version => 3, |
|
100
|
|
|
|
|
|
|
start_h => [ $tag_start, 'tagname,attr' ], |
|
101
|
|
|
|
|
|
|
text_h => [ $content, 'dtext' ], |
|
102
|
|
|
|
|
|
|
end_h => [ $tag_end, 'tagname' ] |
|
103
|
|
|
|
|
|
|
)->parse_file($handle); |
|
104
|
|
|
|
|
|
|
|
|
105
|
3
|
50
|
|
|
|
24
|
$nested_tables == 0 or $self->_log->('warn', "Found EOF while expecting |
tag");
|
106
|
3
|
|
|
|
|
56
|
return \@tables; |
|
107
|
|
|
|
|
|
|
} |
|
108
|
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
sub iterator { |
|
111
|
3
|
|
|
3
|
1
|
2963
|
my $self= shift; |
|
112
|
3
|
|
|
|
|
73
|
my ($tables, $table_i, $row_i)= ($self->_tables, 0, 0); |
|
113
|
3
|
|
50
|
|
|
26
|
my $table= $tables->[$table_i] || []; |
|
114
|
3
|
|
|
|
|
6
|
my $n_records= 0; $n_records += @$_ for @$tables; |
|
|
3
|
|
|
|
|
10
|
|
|
115
|
|
|
|
|
|
|
return Data::TableReader::Decoder::HTML::_Iter->new( |
|
116
|
|
|
|
|
|
|
sub { |
|
117
|
10
|
50
|
|
10
|
|
2904
|
my $row= $table->[$row_i] |
|
118
|
|
|
|
|
|
|
or return undef; |
|
119
|
10
|
|
|
|
|
18
|
$row_i++; |
|
120
|
10
|
50
|
|
|
|
42
|
my @r= $_[0]? @{$row}[ @{$_[0]} ] : @$row; # optional slice argument |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
121
|
10
|
|
|
|
|
44
|
return \@r; |
|
122
|
|
|
|
|
|
|
}, |
|
123
|
|
|
|
|
|
|
{ |
|
124
|
3
|
|
|
|
|
62
|
table => \$table, |
|
125
|
|
|
|
|
|
|
table_i => \$table_i, |
|
126
|
|
|
|
|
|
|
row_i => \$row_i, |
|
127
|
|
|
|
|
|
|
total_records => $n_records, |
|
128
|
|
|
|
|
|
|
table_record_ofs => 0, |
|
129
|
|
|
|
|
|
|
tables => $tables, |
|
130
|
|
|
|
|
|
|
} |
|
131
|
|
|
|
|
|
|
); |
|
132
|
|
|
|
|
|
|
} |
|
133
|
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
# If you need to subclass this iterator, don't. Just implement your own. |
|
135
|
|
|
|
|
|
|
# i.e. I'm not declaring this implementation stable, yet. |
|
136
|
1
|
|
|
1
|
|
485
|
use Data::TableReader::Iterator; |
|
|
1
|
|
|
|
|
451
|
|
|
|
1
|
|
|
|
|
40
|
|
|
137
|
1
|
|
|
1
|
|
347
|
BEGIN { @Data::TableReader::Decoder::HTML::_Iter::ISA= ('Data::TableReader::Iterator'); } |
|
138
|
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
sub Data::TableReader::Decoder::HTML::_Iter::position { |
|
140
|
6
|
|
|
6
|
|
24
|
my $f= shift->_fields; |
|
141
|
6
|
|
|
|
|
32
|
'table '.${ $f->{table_i} }.' row '.${ $f->{row_i} }; |
|
|
6
|
|
|
|
|
17
|
|
|
|
6
|
|
|
|
|
29
|
|
|
142
|
|
|
|
|
|
|
} |
|
143
|
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
sub Data::TableReader::Decoder::HTML::_Iter::progress { |
|
145
|
2
|
|
|
2
|
|
11
|
my $f= shift->_fields; |
|
146
|
|
|
|
|
|
|
return ! $f->{total_records}? 0 |
|
147
|
2
|
50
|
|
|
|
12
|
: (( $f->{table_record_ofs} + ${$f->{row_i}} ) / $f->{total_records}); |
|
|
2
|
|
|
|
|
13
|
|
|
148
|
|
|
|
|
|
|
} |
|
149
|
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
sub Data::TableReader::Decoder::HTML::_Iter::tell { |
|
151
|
2
|
|
|
2
|
|
363
|
my $f= shift->_fields; |
|
152
|
2
|
|
|
|
|
12
|
return [ ${$f->{table_i}}, ${$f->{row_i}} ]; |
|
|
2
|
|
|
|
|
6
|
|
|
|
2
|
|
|
|
|
6
|
|
|
153
|
|
|
|
|
|
|
} |
|
154
|
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
sub Data::TableReader::Decoder::HTML::_Iter::seek { |
|
156
|
3
|
|
|
3
|
|
620
|
my ($self, $to)= @_; |
|
157
|
3
|
|
|
|
|
9
|
my $f= $self->_fields; |
|
158
|
3
|
|
|
|
|
15
|
${$f->{table_i}}= $to->[0]; |
|
|
3
|
|
|
|
|
8
|
|
|
159
|
3
|
|
|
|
|
6
|
${$f->{row_i}}= $to->[1]; |
|
|
3
|
|
|
|
|
6
|
|
|
160
|
3
|
|
50
|
|
|
7
|
${$f->{table}}= $f->{tables}[${$f->{table_i}}] || []; |
|
|
3
|
|
|
|
|
6
|
|
|
161
|
|
|
|
|
|
|
# re-calculate table_record_ofs |
|
162
|
3
|
|
|
|
|
5
|
my $t= 0; $t += @$_ for @{$f->{tables}}[0 .. $to->[1]-1]; |
|
|
3
|
|
|
|
|
9
|
|
|
|
3
|
|
|
|
|
11
|
|
|
163
|
3
|
|
|
|
|
6
|
$f->{table_record_ofs}= $t; |
|
164
|
3
|
|
|
|
|
9
|
1; |
|
165
|
|
|
|
|
|
|
} |
|
166
|
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
sub Data::TableReader::Decoder::HTML::_Iter::next_dataset { |
|
168
|
1
|
|
|
1
|
|
8
|
my $f= $_[0]->_fields; |
|
169
|
1
|
50
|
|
|
|
5
|
return if ${$f->{table_i}} >= @{$f->{tables}}; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
6
|
|
|
170
|
1
|
|
|
|
|
3
|
$_[0]->seek([ ${$f->{table_i}}+1, 0 ]); |
|
|
1
|
|
|
|
|
5
|
|
|
171
|
|
|
|
|
|
|
} |
|
172
|
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
1; |
|
174
|
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
__END__ |