File Coverage
blib/lib/Data/TableReader/Decoder/HTML.pm |
|
Criterion |
Covered |
Total |
% |
statement |
104 |
113 |
92.0
|
branch |
34 |
58 |
58.6
|
condition |
9 |
13 |
69.2
|
subroutine |
20 |
20 |
100.0
|
pod |
2 |
2 |
100.0
|
total |
169 |
206 |
82.0
|
line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Data::TableReader::Decoder::HTML; |
2
|
|
|
|
|
|
|
$Data::TableReader::Decoder::HTML::VERSION = '0.012'; |
3
|
1
|
|
|
1
|
|
107821
|
use Moo 2; |
|
1
|
|
|
|
|
11080
|
|
|
1
|
|
|
|
|
7
|
|
4
|
1
|
|
|
1
|
|
1566
|
use Try::Tiny; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
55
|
|
5
|
1
|
|
|
1
|
|
6
|
use Carp; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
46
|
|
6
|
1
|
|
|
1
|
|
568
|
use IO::Handle; |
|
1
|
|
|
|
|
6525
|
|
|
1
|
|
|
|
|
45
|
|
7
|
1
|
|
|
1
|
|
631
|
use HTML::Parser; |
|
1
|
|
|
|
|
6224
|
|
|
1
|
|
|
|
|
923
|
|
8
|
|
|
|
|
|
|
extends 'Data::TableReader::Decoder'; |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
# ABSTRACT: Access the tables of an HTML document |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
has _tables => ( is => 'lazy' ); |
14
|
|
|
|
|
|
|
sub parse { |
15
|
2
|
|
|
2
|
1
|
7203
|
shift->_tables; |
16
|
2
|
|
|
|
|
11
|
return 1; |
17
|
|
|
|
|
|
|
} |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
sub _build__tables { |
20
|
3
|
|
|
3
|
|
30
|
my $self= shift; |
21
|
|
|
|
|
|
|
# TODO: determine encoding from BOM, or from meta-equiv while parsing... |
22
|
3
|
|
|
|
|
20
|
binmode $self->file_handle; |
23
|
3
|
|
|
|
|
11
|
return $self->_parse_html_tables($self->file_handle); |
24
|
|
|
|
|
|
|
} |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
sub _parse_html_tables { |
27
|
3
|
|
|
3
|
|
7
|
my ($self, $handle)= @_; |
28
|
|
|
|
|
|
|
# These variables track the state of the HTML parse. |
29
|
|
|
|
|
|
|
# cur_row is only defined when we are in a table row, and $cur_cell |
30
|
|
|
|
|
|
|
# is a scalar ref only defined when we are in a cell. |
31
|
3
|
|
|
|
|
7
|
my (@tables, $cur_table, $cur_row, $cur_cell); |
32
|
3
|
|
|
|
|
6
|
my $nested_tables= 0; |
33
|
3
|
|
|
|
|
4
|
my $ignore_all= 0; |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
my $tag_start= sub { |
36
|
114
|
50
|
|
114
|
|
561
|
next if $ignore_all; |
37
|
114
|
|
|
|
|
229
|
my ($tagname, $attr)= (uc $_[0], $_[1]); |
38
|
114
|
100
|
100
|
|
|
376
|
if ($tagname eq 'TABLE') { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
39
|
6
|
50
|
|
|
|
14
|
if ($cur_table) { |
40
|
0
|
|
|
|
|
0
|
$self->_log->('warn','tables within tables are currently returned as a single cell value'); |
41
|
0
|
|
|
|
|
0
|
$nested_tables++; |
42
|
0
|
|
|
|
|
0
|
$ignore_all++; |
43
|
|
|
|
|
|
|
} |
44
|
|
|
|
|
|
|
else { |
45
|
6
|
|
|
|
|
28
|
push @tables, ($cur_table= []); |
46
|
|
|
|
|
|
|
} |
47
|
|
|
|
|
|
|
} |
48
|
|
|
|
|
|
|
elsif ($tagname eq 'TR') { |
49
|
18
|
50
|
|
|
|
35
|
$cur_table or croak "found |
outside any ";
50
|
18
|
50
|
|
|
|
36
|
$cur_row and $self->_log->('warn', 'found |
before end of previous row');
51
|
18
|
|
|
|
|
69
|
push @$cur_table, ($cur_row= []); |
52
|
|
|
|
|
|
|
} |
53
|
|
|
|
|
|
|
elsif ($tagname eq 'TD' or $tagname eq 'TH') { |
54
|
81
|
50
|
|
|
|
162
|
$cur_table or croak "found <$tagname> outside any ";
55
|
81
|
50
|
|
|
|
161
|
$cur_row or croak "found <$tagname> outside any | ";
56
|
81
|
50
|
|
|
|
174
|
$cur_cell and $self->_log->('warn', "found <$tagname> before previous $tagname>"); |
57
|
81
|
|
|
|
|
146
|
push @$cur_row, ''; |
58
|
81
|
|
|
|
|
299
|
$cur_cell= \$cur_row->[-1]; |
59
|
|
|
|
|
|
|
} |
60
|
3
|
|
|
|
|
19
|
}; |
61
|
|
|
|
|
|
|
my $content= sub { |
62
|
129
|
|
|
129
|
|
335
|
my ($text)= @_; |
63
|
129
|
100
|
33
|
|
|
378
|
if ($cur_cell) { |
|
|
50
|
|
|
|
|
|
64
|
81
|
|
|
|
|
278
|
$$cur_cell .= $text |
65
|
|
|
|
|
|
|
} |
66
|
|
|
|
|
|
|
elsif ($cur_row && $text =~ /\S/) { |
67
|
0
|
|
|
|
|
0
|
$self->_log->('warn', "Encountered text within a row but not in a cell: '$text'"); |
68
|
|
|
|
|
|
|
} |
69
|
3
|
|
|
|
|
11
|
}; |
70
|
|
|
|
|
|
|
my $tag_end= sub { |
71
|
114
|
|
|
114
|
|
292
|
my ($tagname)= (uc($_[0])); |
72
|
114
|
50
|
100
|
|
|
410
|
if ($ignore_all) { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
73
|
0
|
0
|
|
|
|
0
|
if ($tagname eq 'TABLE') { |
74
|
0
|
|
|
|
|
0
|
--$nested_tables; |
75
|
0
|
0
|
|
|
|
0
|
$ignore_all= 0 if $nested_tables <= 0; |
76
|
|
|
|
|
|
|
} |
77
|
|
|
|
|
|
|
} |
78
|
|
|
|
|
|
|
elsif ($tagname eq 'TD' or $tagname eq 'TH') { |
79
|
81
|
50
|
|
|
|
159
|
$cur_cell or $self->_log->('warn', "Found $tagname> without matching <$tagname>"); |
80
|
81
|
|
|
|
|
254
|
$cur_cell= undef; |
81
|
|
|
|
|
|
|
} |
82
|
|
|
|
|
|
|
elsif ($tagname eq 'TR') { |
83
|
18
|
50
|
|
|
|
39
|
$cur_row or $self->_log->('warn', "Found | without matching ");
84
|
18
|
50
|
|
|
|
36
|
$cur_cell and $self->_log->('warn', "Found | while still in "); |
85
|
18
|
|
|
|
|
26
|
$cur_row= undef; |
86
|
18
|
|
|
|
|
56
|
$cur_cell= undef; |
87
|
|
|
|
|
|
|
} |
88
|
|
|
|
|
|
|
elsif ($tagname eq 'TABLE') { |
89
|
6
|
50
|
|
|
|
24
|
$cur_table or $self->_log->('warn', "Found | without matching ");
90
|
6
|
50
|
|
|
|
13
|
$cur_row and $self->_log->('warn', "Found | while still in |
");
91
|
6
|
50
|
|
|
|
12
|
$cur_cell and $self->_log->('warn', "Found |
while still in "); |
92
|
6
|
|
|
|
|
10
|
$cur_table= undef; |
93
|
6
|
|
|
|
|
10
|
$cur_row= undef; |
94
|
6
|
|
|
|
|
21
|
$cur_cell= undef; |
95
|
|
|
|
|
|
|
} |
96
|
3
|
|
|
|
|
11
|
}; |
97
|
|
|
|
|
|
|
|
98
|
3
|
|
|
|
|
37
|
HTML::Parser->new( |
99
|
|
|
|
|
|
|
api_version => 3, |
100
|
|
|
|
|
|
|
start_h => [ $tag_start, 'tagname,attr' ], |
101
|
|
|
|
|
|
|
text_h => [ $content, 'dtext' ], |
102
|
|
|
|
|
|
|
end_h => [ $tag_end, 'tagname' ] |
103
|
|
|
|
|
|
|
)->parse_file($handle); |
104
|
|
|
|
|
|
|
|
105
|
3
|
50
|
|
|
|
24
|
$nested_tables == 0 or $self->_log->('warn', "Found EOF while expecting |
tag");
106
|
3
|
|
|
|
|
55
|
return \@tables; |
107
|
|
|
|
|
|
|
} |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
sub iterator { |
111
|
3
|
|
|
3
|
1
|
2926
|
my $self= shift; |
112
|
3
|
|
|
|
|
77
|
my ($tables, $table_i, $row_i)= ($self->_tables, 0, 0); |
113
|
3
|
|
50
|
|
|
25
|
my $table= $tables->[$table_i] || []; |
114
|
3
|
|
|
|
|
6
|
my $n_records= 0; $n_records += @$_ for @$tables; |
|
3
|
|
|
|
|
10
|
|
115
|
|
|
|
|
|
|
return Data::TableReader::Decoder::HTML::_Iter->new( |
116
|
|
|
|
|
|
|
sub { |
117
|
10
|
50
|
|
10
|
|
2904
|
my $row= $table->[$row_i] |
118
|
|
|
|
|
|
|
or return undef; |
119
|
10
|
|
|
|
|
30
|
$row_i++; |
120
|
10
|
50
|
|
|
|
42
|
my @r= $_[0]? @{$row}[ @{$_[0]} ] : @$row; # optional slice argument |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
121
|
10
|
|
|
|
|
44
|
return \@r; |
122
|
|
|
|
|
|
|
}, |
123
|
|
|
|
|
|
|
{ |
124
|
3
|
|
|
|
|
52
|
table => \$table, |
125
|
|
|
|
|
|
|
table_i => \$table_i, |
126
|
|
|
|
|
|
|
row_i => \$row_i, |
127
|
|
|
|
|
|
|
total_records => $n_records, |
128
|
|
|
|
|
|
|
table_record_ofs => 0, |
129
|
|
|
|
|
|
|
tables => $tables, |
130
|
|
|
|
|
|
|
} |
131
|
|
|
|
|
|
|
); |
132
|
|
|
|
|
|
|
} |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
# If you need to subclass this iterator, don't. Just implement your own. |
135
|
|
|
|
|
|
|
# i.e. I'm not declaring this implementation stable, yet. |
136
|
1
|
|
|
1
|
|
517
|
use Data::TableReader::Iterator; |
|
1
|
|
|
|
|
482
|
|
|
1
|
|
|
|
|
40
|
|
137
|
1
|
|
|
1
|
|
371
|
BEGIN { @Data::TableReader::Decoder::HTML::_Iter::ISA= ('Data::TableReader::Iterator'); } |
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
sub Data::TableReader::Decoder::HTML::_Iter::position { |
140
|
6
|
|
|
6
|
|
24
|
my $f= shift->_fields; |
141
|
6
|
|
|
|
|
33
|
'table '.${ $f->{table_i} }.' row '.${ $f->{row_i} }; |
|
6
|
|
|
|
|
18
|
|
|
6
|
|
|
|
|
27
|
|
142
|
|
|
|
|
|
|
} |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
sub Data::TableReader::Decoder::HTML::_Iter::progress { |
145
|
2
|
|
|
2
|
|
8
|
my $f= shift->_fields; |
146
|
|
|
|
|
|
|
return ! $f->{total_records}? 0 |
147
|
2
|
50
|
|
|
|
12
|
: (( $f->{table_record_ofs} + ${$f->{row_i}} ) / $f->{total_records}); |
|
2
|
|
|
|
|
14
|
|
148
|
|
|
|
|
|
|
} |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
sub Data::TableReader::Decoder::HTML::_Iter::tell { |
151
|
2
|
|
|
2
|
|
385
|
my $f= shift->_fields; |
152
|
2
|
|
|
|
|
12
|
return [ ${$f->{table_i}}, ${$f->{row_i}} ]; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
7
|
|
153
|
|
|
|
|
|
|
} |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
sub Data::TableReader::Decoder::HTML::_Iter::seek { |
156
|
3
|
|
|
3
|
|
593
|
my ($self, $to)= @_; |
157
|
3
|
|
|
|
|
11
|
my $f= $self->_fields; |
158
|
3
|
|
|
|
|
16
|
${$f->{table_i}}= $to->[0]; |
|
3
|
|
|
|
|
8
|
|
159
|
3
|
|
|
|
|
5
|
${$f->{row_i}}= $to->[1]; |
|
3
|
|
|
|
|
6
|
|
160
|
3
|
|
50
|
|
|
7
|
${$f->{table}}= $f->{tables}[${$f->{table_i}}] || []; |
|
3
|
|
|
|
|
22
|
|
161
|
|
|
|
|
|
|
# re-calculate table_record_ofs |
162
|
3
|
|
|
|
|
6
|
my $t= 0; $t += @$_ for @{$f->{tables}}[0 .. $to->[1]-1]; |
|
3
|
|
|
|
|
10
|
|
|
3
|
|
|
|
|
9
|
|
163
|
3
|
|
|
|
|
6
|
$f->{table_record_ofs}= $t; |
164
|
3
|
|
|
|
|
9
|
1; |
165
|
|
|
|
|
|
|
} |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
sub Data::TableReader::Decoder::HTML::_Iter::next_dataset { |
168
|
1
|
|
|
1
|
|
8
|
my $f= $_[0]->_fields; |
169
|
1
|
50
|
|
|
|
5
|
return if ${$f->{table_i}} >= @{$f->{tables}}; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
5
|
|
170
|
1
|
|
|
|
|
3
|
$_[0]->seek([ ${$f->{table_i}}+1, 0 ]); |
|
1
|
|
|
|
|
5
|
|
171
|
|
|
|
|
|
|
} |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
1; |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
__END__ |