line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# This software is copyright (c) 2011 by Jeffrey Kegler |
2
|
|
|
|
|
|
|
# This is free software; you can redistribute it and/or modify it |
3
|
|
|
|
|
|
|
# under the same terms as the Perl 5 programming language system |
4
|
|
|
|
|
|
|
# itself. |
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
package Marpa::HTML; |
7
|
|
|
|
|
|
|
|
8
|
6
|
|
|
6
|
|
66524
|
use 5.010; |
|
6
|
|
|
|
|
24
|
|
|
6
|
|
|
|
|
265
|
|
9
|
6
|
|
|
6
|
|
32
|
use strict; |
|
6
|
|
|
|
|
11
|
|
|
6
|
|
|
|
|
198
|
|
10
|
6
|
|
|
6
|
|
42
|
use warnings; |
|
6
|
|
|
|
|
21
|
|
|
6
|
|
|
|
|
199
|
|
11
|
|
|
|
|
|
|
|
12
|
6
|
|
|
6
|
|
37
|
use vars qw( $VERSION $STRING_VERSION ); |
|
6
|
|
|
|
|
10
|
|
|
6
|
|
|
|
|
519
|
|
13
|
|
|
|
|
|
|
$VERSION = '0.112000'; |
14
|
|
|
|
|
|
|
$STRING_VERSION = $VERSION; |
15
|
|
|
|
|
|
|
{ |
16
|
|
|
|
|
|
|
## no critic (BuiltinFunctions::ProhibitStringyEval) |
17
|
|
|
|
|
|
|
## no critic (ValuesAndExpressions::RequireConstantVersion) |
18
|
|
|
|
|
|
|
$VERSION = eval $VERSION; |
19
|
|
|
|
|
|
|
} |
20
|
|
|
|
|
|
|
|
21
|
6
|
|
|
6
|
|
3356
|
use Marpa::HTML::Version; |
|
6
|
|
|
|
|
15
|
|
|
6
|
|
|
|
|
275
|
|
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
our @EXPORT_OK; |
24
|
6
|
|
|
6
|
|
35
|
use base qw(Exporter); |
|
6
|
|
|
|
|
22
|
|
|
6
|
|
|
|
|
655
|
|
25
|
6
|
|
|
6
|
|
154
|
BEGIN { @EXPORT_OK = qw(html); } |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
package Marpa::HTML::Internal; |
28
|
|
|
|
|
|
|
|
29
|
6
|
|
|
6
|
|
30
|
use Carp; |
|
6
|
|
|
|
|
12
|
|
|
6
|
|
|
|
|
465
|
|
30
|
6
|
|
|
6
|
|
32
|
use HTML::PullParser; |
|
6
|
|
|
|
|
11
|
|
|
6
|
|
|
|
|
139
|
|
31
|
6
|
|
|
6
|
|
36
|
use HTML::Entities qw(decode_entities); |
|
6
|
|
|
|
|
11
|
|
|
6
|
|
|
|
|
385
|
|
32
|
6
|
|
|
6
|
|
5145
|
use HTML::Tagset (); |
|
6
|
|
|
|
|
10529
|
|
|
6
|
|
|
|
|
737
|
|
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
# versions below must be coordinated with |
35
|
|
|
|
|
|
|
# those required in Build.PL |
36
|
|
|
|
|
|
|
BEGIN { |
37
|
6
|
|
|
6
|
|
12
|
my $using_xs = eval { |
38
|
6
|
|
|
|
|
2609
|
require Marpa::XS::Installed; |
39
|
0
|
0
|
|
|
|
0
|
defined $Marpa::XS::Installed::VERSION |
40
|
|
|
|
|
|
|
and $Marpa::XS::Installed::VERSION >= $Marpa::HTML::MARPA_XS_VERSION; |
41
|
|
|
|
|
|
|
}; |
42
|
6
|
50
|
|
|
|
37
|
if ($using_xs) { |
43
|
0
|
|
|
|
|
0
|
require Marpa::XS; |
44
|
0
|
|
|
|
|
0
|
Marpa::XS->VERSION($Marpa::HTML::MARPA_XS_VERSION); # double check |
45
|
0
|
|
|
|
|
0
|
$Marpa::HTML::MARPA_MODULE = 'Marpa::XS'; |
46
|
6
|
|
|
6
|
|
39
|
no strict 'refs'; |
|
6
|
|
|
|
|
12
|
|
|
6
|
|
|
|
|
788
|
|
47
|
0
|
|
|
|
|
0
|
*Marpa::HTML::Recognizer::new = \&Marpa::XS::Recognizer::new; |
48
|
0
|
|
|
|
|
0
|
*Marpa::HTML::Grammar::new = \&Marpa::XS::Grammar::new; |
49
|
|
|
|
|
|
|
} ## end if ($using_xs) |
50
|
|
|
|
|
|
|
else { |
51
|
6
|
|
|
|
|
40023
|
require Marpa::PP; |
52
|
6
|
|
|
|
|
582241
|
Marpa::PP->VERSION($Marpa::HTML::MARPA_PP_VERSION); |
53
|
6
|
|
|
|
|
27
|
$Marpa::HTML::MARPA_MODULE = 'Marpa::PP'; |
54
|
6
|
|
|
6
|
|
29
|
no strict 'refs'; |
|
6
|
|
|
|
|
11
|
|
|
6
|
|
|
|
|
414
|
|
55
|
6
|
|
|
|
|
27
|
*Marpa::HTML::Recognizer::new = \&Marpa::PP::Recognizer::new; |
56
|
6
|
|
|
|
|
227
|
*Marpa::HTML::Grammar::new = \&Marpa::PP::Grammar::new; |
57
|
|
|
|
|
|
|
} ## end else [ if ($using_xs) ] |
58
|
|
|
|
|
|
|
} ## end BEGIN |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
# use Smart::Comments '-ENV'; |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
### Using smart comments ... |
63
|
|
|
|
|
|
|
|
64
|
6
|
|
|
6
|
|
81
|
use English qw( -no_match_vars ); |
|
6
|
|
|
|
|
12
|
|
|
6
|
|
|
|
|
56
|
|
65
|
|
|
|
|
|
|
|
66
|
6
|
|
|
|
|
40
|
use Marpa::HTML::Offset qw( |
67
|
|
|
|
|
|
|
:package=Marpa::HTML::Internal::TDesc |
68
|
|
|
|
|
|
|
TYPE |
69
|
|
|
|
|
|
|
START_TOKEN |
70
|
|
|
|
|
|
|
END_TOKEN |
71
|
6
|
|
|
6
|
|
6425
|
); |
|
6
|
|
|
|
|
17
|
|
72
|
|
|
|
|
|
|
|
73
|
6
|
|
|
|
|
29
|
use Marpa::HTML::Offset qw( |
74
|
|
|
|
|
|
|
:package=Marpa::HTML::Internal::TDesc::Element |
75
|
|
|
|
|
|
|
TYPE |
76
|
|
|
|
|
|
|
START_TOKEN |
77
|
|
|
|
|
|
|
END_TOKEN |
78
|
|
|
|
|
|
|
VALUE |
79
|
|
|
|
|
|
|
NODE_DATA |
80
|
6
|
|
|
6
|
|
33
|
); |
|
6
|
|
|
|
|
11
|
|
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
%Marpa::HTML::PULL_PARSER_OPTIONS = ( |
83
|
|
|
|
|
|
|
start => q{'S',line,column,offset,offset_end,tagname,attr}, |
84
|
|
|
|
|
|
|
end => q{'E',line,column,offset,offset_end,tagname}, |
85
|
|
|
|
|
|
|
text => q{'T',line,column,offset,offset_end,is_cdata}, |
86
|
|
|
|
|
|
|
comment => q{'C',line,column,offset,offset_end}, |
87
|
|
|
|
|
|
|
declaration => q{'D',line,column,offset,offset_end}, |
88
|
|
|
|
|
|
|
process => q{'PI',line,column,offset,offset_end}, |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
# options that default on |
91
|
|
|
|
|
|
|
unbroken_text => 1, |
92
|
|
|
|
|
|
|
); |
93
|
|
|
|
|
|
|
|
94
|
6
|
|
|
|
|
28
|
use Marpa::HTML::Offset qw( |
95
|
|
|
|
|
|
|
:package=Marpa::HTML::Internal::Token |
96
|
|
|
|
|
|
|
TYPE |
97
|
|
|
|
|
|
|
LINE |
98
|
|
|
|
|
|
|
COL |
99
|
|
|
|
|
|
|
=COLUMN |
100
|
|
|
|
|
|
|
START_OFFSET |
101
|
|
|
|
|
|
|
END_OFFSET |
102
|
|
|
|
|
|
|
TAGNAME |
103
|
|
|
|
|
|
|
=IS_CDATA |
104
|
|
|
|
|
|
|
ATTR |
105
|
6
|
|
|
6
|
|
34
|
); |
|
6
|
|
|
|
|
11
|
|
106
|
|
|
|
|
|
|
|
107
|
6
|
|
|
6
|
|
3731
|
use Marpa::HTML::Callback; |
|
6
|
|
|
|
|
16
|
|
|
6
|
|
|
|
|
26508
|
|
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
sub per_element_handlers { |
110
|
1386
|
|
|
1386
|
|
2420
|
my ( $element, $user_handlers ) = @_; |
111
|
1386
|
100
|
|
|
|
3032
|
return {} if not $element; |
112
|
1374
|
100
|
|
|
|
3708
|
return {} if not $user_handlers; |
113
|
687
|
|
50
|
|
|
1758
|
my $wildcard_handlers = $user_handlers->{ANY} // {}; |
114
|
687
|
|
|
|
|
866
|
my %handlers = %{$wildcard_handlers}; |
|
687
|
|
|
|
|
2189
|
|
115
|
687
|
|
100
|
|
|
10601
|
my $per_element_handlers = $user_handlers->{$element} // {}; |
116
|
687
|
|
|
|
|
1142
|
@handlers{ keys %{$per_element_handlers} } = |
|
687
|
|
|
|
|
1501
|
|
117
|
687
|
|
|
|
|
1003
|
values %{$per_element_handlers}; |
118
|
687
|
|
|
|
|
2094
|
return \%handlers; |
119
|
|
|
|
|
|
|
} ## end sub per_element_handlers |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
sub tdesc_list_to_literal { |
122
|
843
|
|
|
843
|
|
1308
|
my ( $self, $tdesc_list ) = @_; |
123
|
|
|
|
|
|
|
|
124
|
843
|
|
|
|
|
1186
|
my $text = q{}; |
125
|
843
|
|
|
|
|
1499
|
my $document = $self->{document}; |
126
|
843
|
|
|
|
|
1174
|
my $tokens = $self->{tokens}; |
127
|
843
|
|
|
|
|
953
|
TDESC: for my $tdesc ( @{$tdesc_list} ) { |
|
843
|
|
|
|
|
1660
|
|
128
|
981
|
|
|
|
|
1500
|
given ( $tdesc->[Marpa::HTML::Internal::TDesc::TYPE] ) { |
129
|
981
|
|
|
|
|
1597
|
when ('POINT') { break; } |
|
107
|
|
|
|
|
283
|
|
130
|
874
|
|
|
|
|
1246
|
when ('VALUED_SPAN') { |
131
|
528
|
50
|
|
|
|
1455
|
if (defined( |
132
|
|
|
|
|
|
|
my $value = |
133
|
|
|
|
|
|
|
$tdesc |
134
|
|
|
|
|
|
|
->[Marpa::HTML::Internal::TDesc::Element::VALUE] |
135
|
|
|
|
|
|
|
) |
136
|
|
|
|
|
|
|
) |
137
|
|
|
|
|
|
|
{ |
138
|
528
|
|
|
|
|
1195
|
$text .= $value; |
139
|
528
|
|
|
|
|
1575
|
break; # next TDESC; |
140
|
|
|
|
|
|
|
} ## end if ( defined( my $value = $tdesc->[...])) |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
# next TDESC if no first token id |
143
|
|
|
|
|
|
|
#<<< As of 2009-11-22 perltidy cycles on this code |
144
|
|
|
|
|
|
|
break |
145
|
0
|
0
|
|
|
|
0
|
if not defined( my $first_token_id = $tdesc |
146
|
|
|
|
|
|
|
->[ Marpa::HTML::Internal::TDesc::START_TOKEN ] ); |
147
|
|
|
|
|
|
|
#>>> |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
# next TDESC if no last token id |
150
|
|
|
|
|
|
|
#<<< As of 2009-11-22 perltidy cycles on this code |
151
|
|
|
|
|
|
|
break |
152
|
0
|
0
|
|
|
|
0
|
if not defined( my $last_token_id = |
153
|
|
|
|
|
|
|
$tdesc->[Marpa::HTML::Internal::TDesc::END_TOKEN] ); |
154
|
|
|
|
|
|
|
#>>> |
155
|
|
|
|
|
|
|
|
156
|
0
|
|
|
|
|
0
|
my $offset = |
157
|
|
|
|
|
|
|
$tokens->[$first_token_id] |
158
|
|
|
|
|
|
|
->[Marpa::HTML::Internal::Token::START_OFFSET]; |
159
|
0
|
|
|
|
|
0
|
my $end_offset = |
160
|
|
|
|
|
|
|
$tokens->[$last_token_id] |
161
|
|
|
|
|
|
|
->[Marpa::HTML::Internal::Token::END_OFFSET]; |
162
|
0
|
|
|
|
|
0
|
$text .= substr ${$document}, $offset, |
|
0
|
|
|
|
|
0
|
|
163
|
|
|
|
|
|
|
( $end_offset - $offset ); |
164
|
|
|
|
|
|
|
} ## end when ('VALUED_SPAN') |
165
|
346
|
|
|
|
|
536
|
when ('UNVALUED_SPAN') { |
166
|
346
|
|
|
|
|
534
|
my $first_token_id = |
167
|
|
|
|
|
|
|
$tdesc->[Marpa::HTML::Internal::TDesc::START_TOKEN]; |
168
|
346
|
|
|
|
|
512
|
my $last_token_id = |
169
|
|
|
|
|
|
|
$tdesc->[Marpa::HTML::Internal::TDesc::END_TOKEN]; |
170
|
346
|
|
|
|
|
722
|
my $offset = |
171
|
|
|
|
|
|
|
$tokens->[$first_token_id] |
172
|
|
|
|
|
|
|
->[Marpa::HTML::Internal::Token::START_OFFSET]; |
173
|
346
|
|
|
|
|
520
|
my $end_offset = |
174
|
|
|
|
|
|
|
$tokens->[$last_token_id] |
175
|
|
|
|
|
|
|
->[Marpa::HTML::Internal::Token::END_OFFSET]; |
176
|
|
|
|
|
|
|
|
177
|
346
|
|
|
|
|
525
|
$text .= substr ${$document}, $offset, |
|
346
|
|
|
|
|
2014
|
|
178
|
|
|
|
|
|
|
( $end_offset - $offset ); |
179
|
|
|
|
|
|
|
} ## end when ('UNVALUED_SPAN') |
180
|
0
|
|
|
|
|
0
|
default { |
181
|
0
|
|
|
|
|
0
|
Carp::croak(qq{Internal error: unknown tdesc type "$_"}); |
182
|
|
|
|
|
|
|
} |
183
|
|
|
|
|
|
|
} ## end given |
184
|
|
|
|
|
|
|
} ## end for my $tdesc ( @{$tdesc_list} ) |
185
|
843
|
|
|
|
|
4122
|
return \$text; |
186
|
|
|
|
|
|
|
} ## end sub tdesc_list_to_literal |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
# Convert a list of text descriptions to text |
189
|
|
|
|
|
|
|
sub default_top_handler { |
190
|
87
|
|
|
87
|
|
23930
|
my ( $dummy, @tdesc_lists ) = @_; |
191
|
87
|
|
|
|
|
180
|
my $self = $Marpa::HTML::Internal::PARSE_INSTANCE; |
192
|
87
|
|
|
|
|
210
|
my @tdesc_list = map { @{$_} } grep {defined} @tdesc_lists; |
|
179
|
|
|
|
|
218
|
|
|
179
|
|
|
|
|
467
|
|
|
348
|
|
|
|
|
927
|
|
193
|
87
|
|
|
|
|
267
|
return tdesc_list_to_literal( $self, \@tdesc_list ); |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
} ## end sub default_top_handler |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
sub wrap_user_top_handler { |
198
|
4
|
|
|
4
|
|
9
|
my ($user_handler) = @_; |
199
|
|
|
|
|
|
|
return sub { |
200
|
4
|
|
|
4
|
|
230
|
my ( $dummy, @tdesc_lists ) = @_; |
201
|
4
|
|
|
|
|
8
|
my @tdesc_list = map { @{$_} } grep {defined} @tdesc_lists; |
|
9
|
|
|
|
|
10
|
|
|
9
|
|
|
|
|
21
|
|
|
16
|
|
|
|
|
30
|
|
202
|
4
|
|
|
|
|
7
|
local $Marpa::HTML::Internal::TDESC_LIST = \@tdesc_list; |
203
|
4
|
|
|
|
|
1027
|
local $Marpa::HTML::Internal::PER_NODE_DATA = |
204
|
|
|
|
|
|
|
{ pseudoclass => 'TOP' }; |
205
|
4
|
|
|
|
|
22
|
return scalar $user_handler->(); |
206
|
4
|
|
|
|
|
38
|
}; |
207
|
|
|
|
|
|
|
} ## end sub wrap_user_top_handler |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
# Convert a list of text descriptions to a |
210
|
|
|
|
|
|
|
# single, shortened text description |
211
|
|
|
|
|
|
|
sub create_tdesc_handler { |
212
|
693
|
|
|
693
|
|
1092
|
my ( $self, $element ) = @_; |
213
|
693
|
100
|
|
|
|
2231
|
my $handlers_by_class = |
214
|
|
|
|
|
|
|
per_element_handlers( $element, |
215
|
|
|
|
|
|
|
( $self ? $self->{user_handlers_by_class} : {} ) ); |
216
|
693
|
100
|
|
|
|
3083
|
my $handlers_by_id = |
217
|
|
|
|
|
|
|
per_element_handlers( $element, |
218
|
|
|
|
|
|
|
( $self ? $self->{user_handlers_by_id} : {} ) ); |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
return sub { |
221
|
3265
|
|
|
3265
|
|
1997221
|
my ( $dummy, @tdesc_lists ) = @_; |
222
|
|
|
|
|
|
|
|
223
|
3265
|
|
|
|
|
5114
|
my @tdesc_list = map { @{$_} } grep {defined} @tdesc_lists; |
|
5200
|
|
|
|
|
5244
|
|
|
5200
|
|
|
|
|
11479
|
|
|
5581
|
|
|
|
|
11282
|
|
224
|
3265
|
|
|
|
|
4755
|
local $Marpa::HTML::Internal::TDESC_LIST = \@tdesc_list; |
225
|
|
|
|
|
|
|
|
226
|
9248
|
|
|
|
|
14079
|
my @token_ids = sort { $a <=> $b } grep {defined} map { |
|
10852
|
|
|
|
|
21677
|
|
|
5426
|
|
|
|
|
12612
|
|
227
|
3265
|
|
|
|
|
4368
|
@{$_}[ |
|
5426
|
|
|
|
|
5634
|
|
228
|
|
|
|
|
|
|
Marpa::HTML::Internal::TDesc::START_TOKEN, |
229
|
|
|
|
|
|
|
Marpa::HTML::Internal::TDesc::END_TOKEN |
230
|
|
|
|
|
|
|
] |
231
|
|
|
|
|
|
|
} @tdesc_list; |
232
|
|
|
|
|
|
|
|
233
|
3265
|
|
|
|
|
4578
|
my $first_token_id_in_node = $token_ids[0]; |
234
|
3265
|
|
|
|
|
3451
|
my $last_token_id_in_node = $token_ids[-1]; |
235
|
3265
|
|
|
|
|
9995
|
my $per_node_data = { |
236
|
|
|
|
|
|
|
element => $element, |
237
|
|
|
|
|
|
|
first_token_id => $first_token_id_in_node, |
238
|
|
|
|
|
|
|
last_token_id => $last_token_id_in_node, |
239
|
|
|
|
|
|
|
}; |
240
|
|
|
|
|
|
|
|
241
|
3265
|
100
|
|
|
|
8945
|
if ( $tdesc_list[0]->[Marpa::HTML::Internal::TDesc::TYPE] ne 'POINT' ) |
242
|
|
|
|
|
|
|
{ |
243
|
2962
|
|
|
|
|
4850
|
$per_node_data->{start_tag_token_id} = $first_token_id_in_node; |
244
|
|
|
|
|
|
|
} |
245
|
|
|
|
|
|
|
|
246
|
3265
|
100
|
|
|
|
7173
|
if ($tdesc_list[-1]->[Marpa::HTML::Internal::TDesc::TYPE] ne 'POINT' ) |
247
|
|
|
|
|
|
|
{ |
248
|
2895
|
|
|
|
|
4481
|
$per_node_data->{end_tag_token_id} = $last_token_id_in_node; |
249
|
|
|
|
|
|
|
} |
250
|
|
|
|
|
|
|
|
251
|
3265
|
|
|
|
|
4159
|
local $Marpa::HTML::Internal::PER_NODE_DATA = $per_node_data; |
252
|
|
|
|
|
|
|
|
253
|
3265
|
|
|
|
|
3645
|
my $self = $Marpa::HTML::Internal::PARSE_INSTANCE; |
254
|
3265
|
|
|
|
|
4603
|
my $trace_fh = $self->{trace_fh}; |
255
|
3265
|
|
|
|
|
4630
|
my $trace_handlers = $self->{trace_handlers}; |
256
|
|
|
|
|
|
|
|
257
|
3265
|
|
|
|
|
5884
|
my $tokens = $self->{tokens}; |
258
|
|
|
|
|
|
|
|
259
|
3265
|
|
|
|
|
3607
|
my $user_handler; |
260
|
|
|
|
|
|
|
GET_USER_HANDLER: { |
261
|
3265
|
50
|
|
|
|
3347
|
if ( my $id = Marpa::HTML::id() ) { |
|
3265
|
|
|
|
|
7496
|
|
262
|
0
|
0
|
|
|
|
0
|
if ( $user_handler = $handlers_by_id->{$id} ) { |
263
|
0
|
0
|
|
|
|
0
|
if ($trace_handlers) { |
264
|
0
|
0
|
|
|
|
0
|
say {$trace_fh} |
|
0
|
|
|
|
|
0
|
|
265
|
|
|
|
|
|
|
"Resolved to user handler by element ($element) and id ($id)" |
266
|
|
|
|
|
|
|
or Carp::croak("Cannot print: $ERRNO"); |
267
|
|
|
|
|
|
|
} |
268
|
0
|
|
|
|
|
0
|
last GET_USER_HANDLER; |
269
|
|
|
|
|
|
|
} ## end if ( $user_handler = $handlers_by_id->{$id} ) |
270
|
|
|
|
|
|
|
} ## end if ( my $id = Marpa::HTML::id() ) |
271
|
3265
|
100
|
|
|
|
7317
|
if ( my $class = Marpa::HTML::class() ) { |
272
|
744
|
100
|
|
|
|
1725
|
if ( $user_handler = $handlers_by_class->{$class} ) { |
273
|
8
|
50
|
|
|
|
32
|
if ($trace_handlers) { |
274
|
0
|
0
|
|
|
|
0
|
say {$trace_fh} |
|
0
|
|
|
|
|
0
|
|
275
|
|
|
|
|
|
|
"Resolved to user handler by element ($element) and class ($class)" |
276
|
|
|
|
|
|
|
or Carp::croak("Cannot print: $ERRNO"); |
277
|
|
|
|
|
|
|
} |
278
|
8
|
|
|
|
|
15
|
last GET_USER_HANDLER; |
279
|
|
|
|
|
|
|
} ## end if ( $user_handler = $handlers_by_class->{$class} ) |
280
|
|
|
|
|
|
|
} ## end if ( my $class = Marpa::HTML::class() ) |
281
|
3257
|
|
|
|
|
4703
|
$user_handler = $handlers_by_class->{ANY}; |
282
|
3257
|
50
|
33
|
|
|
7934
|
if ( $trace_handlers and $user_handler ) { |
283
|
0
|
0
|
|
|
|
0
|
say {$trace_fh} +( |
|
0
|
0
|
|
|
|
0
|
|
284
|
|
|
|
|
|
|
defined $element |
285
|
|
|
|
|
|
|
? "Resolved to user handler by element ($element)" |
286
|
|
|
|
|
|
|
: 'Resolved to default user handler' |
287
|
|
|
|
|
|
|
) or Carp::croak("Cannot print: $ERRNO"); |
288
|
|
|
|
|
|
|
} ## end if ( $trace_handlers and $user_handler ) |
289
|
|
|
|
|
|
|
} ## end GET_USER_HANDLER: |
290
|
|
|
|
|
|
|
|
291
|
3265
|
100
|
|
|
|
6063
|
if ( defined $user_handler ) { |
292
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
# scalar context needed for the user handler |
294
|
|
|
|
|
|
|
# because so that a bare return returns undef |
295
|
|
|
|
|
|
|
# and not an empty list. |
296
|
|
|
|
|
|
|
return [ |
297
|
429
|
|
|
|
|
1244
|
[ VALUED_SPAN => $first_token_id_in_node, |
298
|
|
|
|
|
|
|
$last_token_id_in_node, ( scalar $user_handler->() ), |
299
|
|
|
|
|
|
|
$per_node_data |
300
|
|
|
|
|
|
|
] |
301
|
|
|
|
|
|
|
]; |
302
|
|
|
|
|
|
|
} ## end if ( defined $user_handler ) |
303
|
|
|
|
|
|
|
|
304
|
2836
|
|
|
|
|
3616
|
my $doc = $self->{doc}; |
305
|
2836
|
|
|
|
|
3797
|
my @tdesc_result = (); |
306
|
|
|
|
|
|
|
|
307
|
2836
|
|
|
|
|
2824
|
my $first_token_id_in_current_span; |
308
|
|
|
|
|
|
|
my $last_token_id_in_current_span; |
309
|
|
|
|
|
|
|
|
310
|
2836
|
|
|
|
|
6443
|
TDESC: for my $tdesc ( @tdesc_list, ['FINAL'] ) { |
311
|
|
|
|
|
|
|
|
312
|
6933
|
|
|
|
|
6703
|
my $next_tdesc; |
313
|
|
|
|
|
|
|
my $first_token_id; |
314
|
0
|
|
|
|
|
0
|
my $last_token_id; |
315
|
6933
|
|
|
|
|
9788
|
PARSE_TDESC: { |
316
|
6933
|
|
|
|
|
6991
|
my $ref_type = ref $tdesc; |
317
|
6933
|
50
|
33
|
|
|
34239
|
if ( not $ref_type or $ref_type ne 'ARRAY' ) { |
318
|
0
|
|
|
|
|
0
|
$next_tdesc = $tdesc; |
319
|
0
|
|
|
|
|
0
|
last PARSE_TDESC; |
320
|
|
|
|
|
|
|
} |
321
|
6933
|
|
|
|
|
9424
|
given ( $tdesc->[Marpa::HTML::Internal::TDesc::TYPE] ) { |
322
|
6933
|
|
|
|
|
9441
|
when ('POINT') { break; } |
|
63
|
|
|
|
|
115
|
|
323
|
6870
|
|
|
|
|
8280
|
when ('VALUED_SPAN') { |
324
|
913
|
50
|
|
|
|
2375
|
if (not defined( |
325
|
|
|
|
|
|
|
my $value = $tdesc->[ |
326
|
|
|
|
|
|
|
Marpa::HTML::Internal::TDesc::Element::VALUE |
327
|
|
|
|
|
|
|
] |
328
|
|
|
|
|
|
|
) |
329
|
|
|
|
|
|
|
) |
330
|
|
|
|
|
|
|
{ |
331
|
|
|
|
|
|
|
#<<< As of 2009-11-22 pertidy cycles on this |
332
|
0
|
|
|
|
|
0
|
$first_token_id = $tdesc->[ |
333
|
|
|
|
|
|
|
Marpa::HTML::Internal::TDesc::START_TOKEN ]; |
334
|
0
|
|
|
|
|
0
|
$last_token_id = |
335
|
|
|
|
|
|
|
$tdesc |
336
|
|
|
|
|
|
|
->[ Marpa::HTML::Internal::TDesc::END_TOKEN |
337
|
|
|
|
|
|
|
]; |
338
|
|
|
|
|
|
|
#>>> |
339
|
0
|
|
|
|
|
0
|
break; # last PARSE_TDESC; |
340
|
|
|
|
|
|
|
} ## end if ( not defined( my $value = $tdesc->[ ...])) |
341
|
913
|
|
|
|
|
2013
|
$next_tdesc = $tdesc; |
342
|
|
|
|
|
|
|
} ## end when ('VALUED_SPAN') |
343
|
5957
|
|
|
|
|
6922
|
when ('FINAL') { |
344
|
2836
|
|
|
|
|
11352
|
$next_tdesc = $tdesc; |
345
|
|
|
|
|
|
|
} |
346
|
3121
|
|
|
|
|
4019
|
when ('UNVALUED_SPAN') { |
347
|
3121
|
|
|
|
|
3474
|
$first_token_id = $tdesc |
348
|
|
|
|
|
|
|
->[Marpa::HTML::Internal::TDesc::START_TOKEN]; |
349
|
3121
|
|
|
|
|
6201
|
$last_token_id = |
350
|
|
|
|
|
|
|
$tdesc->[Marpa::HTML::Internal::TDesc::END_TOKEN]; |
351
|
|
|
|
|
|
|
} ## end when ('UNVALUED_SPAN') |
352
|
0
|
|
|
|
|
0
|
default { |
353
|
0
|
|
|
|
|
0
|
Carp::croak("Unknown text description type: $_"); |
354
|
|
|
|
|
|
|
} |
355
|
|
|
|
|
|
|
} ## end given |
356
|
|
|
|
|
|
|
} ## end PARSE_TDESC: |
357
|
|
|
|
|
|
|
|
358
|
6933
|
100
|
66
|
|
|
21000
|
if ( defined $first_token_id and defined $last_token_id ) { |
359
|
3121
|
100
|
|
|
|
5479
|
if ( defined $first_token_id_in_current_span ) { |
360
|
952
|
50
|
|
|
|
1845
|
if ( $first_token_id |
361
|
|
|
|
|
|
|
<= $last_token_id_in_current_span + 1 ) |
362
|
|
|
|
|
|
|
{ |
363
|
952
|
|
|
|
|
945
|
$last_token_id_in_current_span = $last_token_id; |
364
|
952
|
|
|
|
|
1447
|
next TDESC; |
365
|
|
|
|
|
|
|
} ## end if ( $first_token_id <= ...) |
366
|
0
|
|
|
|
|
0
|
push @tdesc_result, |
367
|
|
|
|
|
|
|
[ |
368
|
|
|
|
|
|
|
'UNVALUED_SPAN', |
369
|
|
|
|
|
|
|
$first_token_id_in_current_span, |
370
|
|
|
|
|
|
|
$last_token_id_in_current_span |
371
|
|
|
|
|
|
|
]; |
372
|
|
|
|
|
|
|
} ## end if ( defined $first_token_id_in_current_span ) |
373
|
2169
|
|
|
|
|
2085
|
$first_token_id_in_current_span = $first_token_id; |
374
|
2169
|
|
|
|
|
2052
|
$last_token_id_in_current_span = $last_token_id; |
375
|
2169
|
|
|
|
|
3414
|
next TDESC; |
376
|
|
|
|
|
|
|
} ## end if ( defined $first_token_id and defined $last_token_id) |
377
|
|
|
|
|
|
|
|
378
|
3812
|
100
|
|
|
|
11865
|
if ( defined $next_tdesc ) { |
379
|
3749
|
100
|
|
|
|
6617
|
if ( defined $first_token_id_in_current_span ) { |
380
|
2169
|
|
|
|
|
5528
|
push @tdesc_result, |
381
|
|
|
|
|
|
|
[ |
382
|
|
|
|
|
|
|
'UNVALUED_SPAN', |
383
|
|
|
|
|
|
|
$first_token_id_in_current_span, |
384
|
|
|
|
|
|
|
$last_token_id_in_current_span |
385
|
|
|
|
|
|
|
]; |
386
|
|
|
|
|
|
|
|
387
|
2169
|
|
|
|
|
3063
|
$first_token_id_in_current_span = |
388
|
|
|
|
|
|
|
$last_token_id_in_current_span = undef; |
389
|
|
|
|
|
|
|
} ## end if ( defined $first_token_id_in_current_span ) |
390
|
3749
|
|
|
|
|
5013
|
my $ref_type = ref $next_tdesc; |
391
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
last TDESC |
393
|
3749
|
100
|
66
|
|
|
18696
|
if $ref_type eq 'ARRAY' |
394
|
|
|
|
|
|
|
and $next_tdesc->[Marpa::HTML::Internal::TDesc::TYPE] |
395
|
|
|
|
|
|
|
eq 'FINAL'; |
396
|
913
|
|
|
|
|
2089
|
push @tdesc_result, $next_tdesc; |
397
|
|
|
|
|
|
|
} ## end if ( defined $next_tdesc ) |
398
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
} ## end for my $tdesc ( @tdesc_list, ['FINAL'] ) |
400
|
|
|
|
|
|
|
|
401
|
2836
|
|
|
|
|
16345
|
return \@tdesc_result; |
402
|
693
|
|
|
|
|
12344
|
}; |
403
|
|
|
|
|
|
|
} ## end sub create_tdesc_handler |
404
|
|
|
|
|
|
|
|
405
|
|
|
|
|
|
|
sub wrap_user_tdesc_handler { |
406
|
313
|
|
|
313
|
|
461
|
my ( $user_handler, $per_node_data ) = @_; |
407
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
return sub { |
409
|
109
|
|
|
109
|
|
50005
|
my ( $dummy, @tdesc_lists ) = @_; |
410
|
109
|
|
|
|
|
266
|
my @tdesc_list = map { @{$_} } grep {defined} @tdesc_lists; |
|
109
|
|
|
|
|
197
|
|
|
109
|
|
|
|
|
419
|
|
|
109
|
|
|
|
|
299
|
|
411
|
109
|
|
|
|
|
205
|
local $Marpa::HTML::Internal::TDESC_LIST = \@tdesc_list; |
412
|
112
|
|
|
|
|
319
|
my @token_ids = sort { $a <=> $b } grep {defined} map { |
|
220
|
|
|
|
|
550
|
|
|
110
|
|
|
|
|
342
|
|
413
|
109
|
|
|
|
|
183
|
@{$_}[ |
|
110
|
|
|
|
|
163
|
|
414
|
|
|
|
|
|
|
Marpa::HTML::Internal::TDesc::START_TOKEN, |
415
|
|
|
|
|
|
|
Marpa::HTML::Internal::TDesc::END_TOKEN |
416
|
|
|
|
|
|
|
] |
417
|
|
|
|
|
|
|
} @tdesc_list; |
418
|
|
|
|
|
|
|
|
419
|
109
|
|
|
|
|
194
|
my $first_token_id = $token_ids[0]; |
420
|
109
|
|
|
|
|
181
|
my $last_token_id = $token_ids[-1]; |
421
|
109
|
|
50
|
|
|
300
|
$per_node_data //= {}; |
422
|
109
|
|
|
|
|
229
|
$per_node_data->{first_token_id} = $first_token_id; |
423
|
109
|
|
|
|
|
241
|
$per_node_data->{last_token_id} = $last_token_id; |
424
|
109
|
|
|
|
|
165
|
local $Marpa::HTML::Internal::PER_NODE_DATA = $per_node_data; |
425
|
|
|
|
|
|
|
|
426
|
|
|
|
|
|
|
# scalar context needed for the user handler |
427
|
|
|
|
|
|
|
# because so that a bare return returns undef |
428
|
|
|
|
|
|
|
# and not an empty list. |
429
|
|
|
|
|
|
|
return [ |
430
|
109
|
|
|
|
|
419
|
[ VALUED_SPAN => $first_token_id, |
431
|
|
|
|
|
|
|
$last_token_id, ( scalar $user_handler->() ), |
432
|
|
|
|
|
|
|
$per_node_data |
433
|
|
|
|
|
|
|
] |
434
|
|
|
|
|
|
|
]; |
435
|
|
|
|
|
|
|
|
436
|
313
|
|
|
|
|
2913
|
}; |
437
|
|
|
|
|
|
|
} ## end sub wrap_user_tdesc_handler |
438
|
|
|
|
|
|
|
|
439
|
|
|
|
|
|
|
sub earleme_to_linecol { |
440
|
0
|
|
|
0
|
|
0
|
my ( $self, $token_offset ) = @_; |
441
|
0
|
|
|
|
|
0
|
my $html_parser_tokens = $self->{tokens}; |
442
|
|
|
|
|
|
|
|
443
|
|
|
|
|
|
|
# Special start of file for undefined offset |
444
|
0
|
0
|
|
|
|
0
|
if ( not defined $token_offset ) { |
445
|
0
|
|
|
|
|
0
|
return ( 1, 0 ); |
446
|
|
|
|
|
|
|
} |
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
# Special case needed for a token offset after the last |
449
|
|
|
|
|
|
|
# token. This happens with the EOF. |
450
|
0
|
0
|
0
|
|
|
0
|
if ( $token_offset < 0 or $token_offset > $#{$html_parser_tokens} ) { |
|
0
|
|
|
|
|
0
|
|
451
|
0
|
|
|
|
|
0
|
$token_offset = $#{$html_parser_tokens}; |
|
0
|
|
|
|
|
0
|
|
452
|
|
|
|
|
|
|
} |
453
|
|
|
|
|
|
|
|
454
|
0
|
|
|
|
|
0
|
return @{ $html_parser_tokens->[$token_offset] }[ |
|
0
|
|
|
|
|
0
|
|
455
|
|
|
|
|
|
|
Marpa::HTML::Internal::Token::LINE, |
456
|
|
|
|
|
|
|
Marpa::HTML::Internal::Token::COLUMN, |
457
|
|
|
|
|
|
|
]; |
458
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
} ## end sub earleme_to_linecol |
460
|
|
|
|
|
|
|
|
461
|
|
|
|
|
|
|
sub earleme_to_offset { |
462
|
|
|
|
|
|
|
|
463
|
0
|
|
|
0
|
|
0
|
my ( $self, $token_offset ) = @_; |
464
|
0
|
|
|
|
|
0
|
my $html_parser_tokens = $self->{tokens}; |
465
|
|
|
|
|
|
|
|
466
|
|
|
|
|
|
|
# Special start of file for undefined offset |
467
|
0
|
0
|
|
|
|
0
|
if ( not defined $token_offset ) { |
468
|
0
|
|
|
|
|
0
|
return 0; |
469
|
|
|
|
|
|
|
} |
470
|
|
|
|
|
|
|
|
471
|
|
|
|
|
|
|
# Special case needed for a token offset after the last |
472
|
|
|
|
|
|
|
# token. This happens with the EOF. |
473
|
0
|
|
|
|
|
0
|
my $offset; |
474
|
0
|
0
|
0
|
|
|
0
|
if ( $token_offset < 0 or $token_offset > $#{$html_parser_tokens} ) { |
|
0
|
|
|
|
|
0
|
|
475
|
0
|
|
|
|
|
0
|
$offset = length ${ $self->{document} }; |
|
0
|
|
|
|
|
0
|
|
476
|
|
|
|
|
|
|
} |
477
|
|
|
|
|
|
|
else { |
478
|
0
|
|
|
|
|
0
|
$offset = |
479
|
|
|
|
|
|
|
$html_parser_tokens->[$token_offset] |
480
|
|
|
|
|
|
|
->[Marpa::HTML::Internal::Token::END_OFFSET]; |
481
|
|
|
|
|
|
|
} |
482
|
0
|
|
|
|
|
0
|
return $offset; |
483
|
|
|
|
|
|
|
|
484
|
|
|
|
|
|
|
} ## end sub earleme_to_offset |
485
|
|
|
|
|
|
|
|
486
|
|
|
|
|
|
|
my %ARGS = ( |
487
|
|
|
|
|
|
|
start => q{'S',offset,offset_end,tagname,attr}, |
488
|
|
|
|
|
|
|
end => q{'E',offset,offset_end,tagname}, |
489
|
|
|
|
|
|
|
text => q{'T',offset,offset_end,is_cdata}, |
490
|
|
|
|
|
|
|
process => q{'PI',offset,offset_end}, |
491
|
|
|
|
|
|
|
comment => q{'C',offset,offset_end}, |
492
|
|
|
|
|
|
|
declaration => q{'D',offset,offset_end}, |
493
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
# options that default on |
495
|
|
|
|
|
|
|
unbroken_text => 1, |
496
|
|
|
|
|
|
|
); |
497
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
sub add_handler { |
499
|
413
|
|
|
413
|
|
674
|
my ( $self, $handler_description ) = @_; |
500
|
413
|
|
50
|
|
|
1064
|
my $ref_type = ref $handler_description || 'not a reference'; |
501
|
413
|
50
|
|
|
|
957
|
Carp::croak( |
502
|
|
|
|
|
|
|
"Long form handler description should be ref to hash, but it is $ref_type" |
503
|
|
|
|
|
|
|
) if $ref_type ne 'HASH'; |
504
|
413
|
|
|
|
|
866
|
my $element = delete $handler_description->{element}; |
505
|
413
|
|
|
|
|
674
|
my $id = delete $handler_description->{id}; |
506
|
413
|
|
|
|
|
763
|
my $class = delete $handler_description->{class}; |
507
|
413
|
|
|
|
|
705
|
my $pseudoclass = delete $handler_description->{pseudoclass}; |
508
|
413
|
|
|
|
|
644
|
my $action = delete $handler_description->{action}; |
509
|
0
|
|
|
|
|
0
|
Carp::croak( |
510
|
|
|
|
|
|
|
'Unknown option(s) in Long form handler description: ', |
511
|
413
|
|
|
|
|
1254
|
( join q{ }, keys %{$handler_description} ) |
512
|
413
|
50
|
|
|
|
549
|
) if scalar keys %{$handler_description}; |
513
|
|
|
|
|
|
|
|
514
|
413
|
50
|
|
|
|
1076
|
Carp::croak('Handler action must be CODE ref') |
515
|
|
|
|
|
|
|
if ref $action ne 'CODE'; |
516
|
|
|
|
|
|
|
|
517
|
413
|
100
|
100
|
|
|
1491
|
$element = ( not $element or $element eq q{*} ) ? 'ANY' : lc $element; |
518
|
413
|
100
|
|
|
|
957
|
if ( defined $pseudoclass ) { |
519
|
317
|
|
|
|
|
913
|
$self->{user_handlers_by_pseudoclass}->{$element}->{$pseudoclass} = |
520
|
|
|
|
|
|
|
$action; |
521
|
317
|
|
|
|
|
1302
|
return 1; |
522
|
|
|
|
|
|
|
} |
523
|
|
|
|
|
|
|
|
524
|
96
|
50
|
|
|
|
199
|
if ( defined $id ) { |
525
|
0
|
|
|
|
|
0
|
$self->{user_handlers_by_id}->{$element}->{ lc $id } = $action; |
526
|
0
|
|
|
|
|
0
|
return 1; |
527
|
|
|
|
|
|
|
} |
528
|
96
|
100
|
|
|
|
328
|
$class = defined $class ? lc $class : 'ANY'; |
529
|
96
|
|
|
|
|
441
|
$self->{user_handlers_by_class}->{$element}->{$class} = $action; |
530
|
96
|
|
|
|
|
354
|
return 1; |
531
|
|
|
|
|
|
|
} ## end sub add_handler |
532
|
|
|
|
|
|
|
|
533
|
|
|
|
|
|
|
sub add_handlers_from_hashes { |
534
|
0
|
|
|
0
|
|
0
|
my ( $self, $handler_specs ) = @_; |
535
|
0
|
|
0
|
|
|
0
|
my $ref_type = ref $handler_specs || 'not a reference'; |
536
|
0
|
0
|
|
|
|
0
|
Carp::croak("handlers arg must must be ref to ARRAY, it is $ref_type") |
537
|
|
|
|
|
|
|
if $ref_type ne 'ARRAY'; |
538
|
0
|
|
|
|
|
0
|
for my $handler_spec ( keys %{$handler_specs} ) { |
|
0
|
|
|
|
|
0
|
|
539
|
0
|
|
|
|
|
0
|
add_handler( $self, $handler_spec ); |
540
|
|
|
|
|
|
|
} |
541
|
0
|
|
|
|
|
0
|
return 1; |
542
|
|
|
|
|
|
|
} ## end sub add_handlers_from_hashes |
543
|
|
|
|
|
|
|
|
544
|
|
|
|
|
|
|
sub add_handlers { |
545
|
90
|
|
|
90
|
|
171
|
my ( $self, $handler_specs ) = @_; |
546
|
90
|
|
|
|
|
171
|
HANDLER_SPEC: for my $specifier ( keys %{$handler_specs} ) { |
|
90
|
|
|
|
|
472
|
|
547
|
413
|
|
|
|
|
552
|
my ( $element, $id, $class, $pseudoclass ); |
548
|
413
|
|
|
|
|
728
|
my $action = $handler_specs->{$specifier}; |
549
|
413
|
100
|
66
|
|
|
5167
|
( $element, $id ) = ( $specifier =~ /\A ([^#]*) [#] (.*) \z/xms ) |
|
|
|
100
|
|
|
|
|
550
|
|
|
|
|
|
|
or ( $element, $class ) = |
551
|
|
|
|
|
|
|
( $specifier =~ /\A ([^.]*) [.] (.*) \z/xms ) |
552
|
|
|
|
|
|
|
or ( $element, $pseudoclass ) = |
553
|
|
|
|
|
|
|
( $specifier =~ /\A ([^:]*) [:] (.*) \z/xms ) |
554
|
|
|
|
|
|
|
or $element = $specifier; |
555
|
413
|
50
|
66
|
|
|
2942
|
if ($pseudoclass |
556
|
|
|
|
|
|
|
and not $pseudoclass ~~ [ |
557
|
|
|
|
|
|
|
qw(TOP PI DECL COMMENT PROLOG TRAILER WHITESPACE CDATA PCDATA CRUFT) |
558
|
|
|
|
|
|
|
] |
559
|
|
|
|
|
|
|
) |
560
|
|
|
|
|
|
|
{ |
561
|
0
|
|
|
|
|
0
|
Carp::croak( qq{pseudoclass "$pseudoclass" is not known:\n}, |
562
|
|
|
|
|
|
|
"Specifier was $specifier\n" ); |
563
|
|
|
|
|
|
|
} ## end if ( $pseudoclass and not $pseudoclass ~~ [ ...]) |
564
|
413
|
50
|
66
|
|
|
1762
|
if ( $pseudoclass and $element ) { |
565
|
0
|
|
|
|
|
0
|
Carp::croak( |
566
|
|
|
|
|
|
|
qq{pseudoclass "$pseudoclass" may not have an element specified:\n}, |
567
|
|
|
|
|
|
|
"Specifier was $specifier\n" |
568
|
|
|
|
|
|
|
); |
569
|
|
|
|
|
|
|
} ## end if ( $pseudoclass and $element ) |
570
|
|
|
|
|
|
|
add_handler( |
571
|
413
|
|
|
|
|
2161
|
$self, |
572
|
|
|
|
|
|
|
{ element => $element, |
573
|
|
|
|
|
|
|
id => $id, |
574
|
|
|
|
|
|
|
class => $class, |
575
|
|
|
|
|
|
|
pseudoclass => $pseudoclass, |
576
|
|
|
|
|
|
|
action => $action |
577
|
|
|
|
|
|
|
} |
578
|
|
|
|
|
|
|
); |
579
|
|
|
|
|
|
|
} ## end for my $specifier ( keys %{$handler_specs} ) |
580
|
|
|
|
|
|
|
|
581
|
90
|
|
|
|
|
252
|
return 1; |
582
|
|
|
|
|
|
|
} ## end sub add_handlers |
583
|
|
|
|
|
|
|
|
584
|
|
|
|
|
|
|
# If we factor this package, this will be the constructor. |
585
|
|
|
|
|
|
|
## no critic (Subroutines::RequireArgUnpacking) |
586
|
|
|
|
|
|
|
sub create { |
587
|
|
|
|
|
|
|
|
588
|
|
|
|
|
|
|
## use critic |
589
|
91
|
|
|
91
|
|
203
|
my $self = {}; |
590
|
91
|
|
|
|
|
360
|
$self->{trace_fh} = \*STDERR; |
591
|
91
|
|
|
|
|
289
|
ARG: for my $arg (@_) { |
592
|
90
|
|
50
|
|
|
482
|
my $ref_type = ref $arg || 'not a reference'; |
593
|
90
|
50
|
|
|
|
358
|
if ( $ref_type eq 'HASH' ) { |
594
|
90
|
|
|
|
|
375
|
Marpa::HTML::Internal::add_handlers( $self, $arg ); |
595
|
90
|
|
|
|
|
293
|
next ARG; |
596
|
|
|
|
|
|
|
} |
597
|
0
|
0
|
|
|
|
0
|
Carp::croak("Argument must be hash or refs to hash: it is $ref_type") |
598
|
|
|
|
|
|
|
if $ref_type ne 'REF'; |
599
|
0
|
|
|
|
|
0
|
my $option_hash = ${$arg}; |
|
0
|
|
|
|
|
0
|
|
600
|
0
|
|
0
|
|
|
0
|
$ref_type = ref $option_hash || 'not a reference'; |
601
|
0
|
0
|
|
|
|
0
|
Carp::croak( |
602
|
|
|
|
|
|
|
"Argument must be hash or refs to hash: it is ref to $ref_type") |
603
|
|
|
|
|
|
|
if $ref_type ne 'HASH'; |
604
|
0
|
|
|
|
|
0
|
OPTION: for my $option ( keys %{$option_hash} ) { |
|
0
|
|
|
|
|
0
|
|
605
|
0
|
0
|
|
|
|
0
|
if ( $option eq 'handlers' ) { |
606
|
0
|
|
|
|
|
0
|
add_handlers_from_hashes( $self, $option_hash->{$option} ); |
607
|
|
|
|
|
|
|
} |
608
|
0
|
0
|
|
|
|
0
|
if (not $option ~~ [ |
609
|
|
|
|
|
|
|
qw(trace_fh trace_values trace_handlers trace_actions |
610
|
|
|
|
|
|
|
trace_conflicts trace_ambiguity trace_rules trace_QDFA |
611
|
|
|
|
|
|
|
trace_earley_sets trace_terminals trace_cruft) |
612
|
|
|
|
|
|
|
] |
613
|
|
|
|
|
|
|
) |
614
|
|
|
|
|
|
|
{ |
615
|
0
|
|
|
|
|
0
|
Carp::croak("unknown option: $option"); |
616
|
|
|
|
|
|
|
} ## end if ( not $option ~~ [ ...]) |
617
|
0
|
|
|
|
|
0
|
$self->{$option} = $option_hash->{$option}; |
618
|
|
|
|
|
|
|
} ## end for my $option ( keys %{$option_hash} ) |
619
|
|
|
|
|
|
|
} ## end for my $arg (@_) |
620
|
91
|
|
|
|
|
228
|
return $self; |
621
|
|
|
|
|
|
|
} ## end sub create |
622
|
|
|
|
|
|
|
|
623
|
|
|
|
|
|
|
# block_element is for block-level ONLY elements. |
624
|
|
|
|
|
|
|
# head is for anything legal inside the HTML header. |
625
|
|
|
|
|
|
|
# Note that isindex can be both a head element and |
626
|
|
|
|
|
|
|
# and block level element in the body. |
627
|
|
|
|
|
|
|
# ISINDEX is classified as a header_element |
628
|
|
|
|
|
|
|
%Marpa::HTML::Internal::ELEMENT_TYPE = ( |
629
|
|
|
|
|
|
|
( map { $_ => 'block_element' } |
630
|
|
|
|
|
|
|
qw( |
631
|
|
|
|
|
|
|
h1 h2 h3 h4 h5 h6 |
632
|
|
|
|
|
|
|
ul ol dir menu |
633
|
|
|
|
|
|
|
pre |
634
|
|
|
|
|
|
|
p dl div center |
635
|
|
|
|
|
|
|
noscript noframes |
636
|
|
|
|
|
|
|
blockquote form hr |
637
|
|
|
|
|
|
|
table fieldset address |
638
|
|
|
|
|
|
|
) |
639
|
|
|
|
|
|
|
), |
640
|
|
|
|
|
|
|
( map { $_ => 'header_element' } |
641
|
|
|
|
|
|
|
qw( |
642
|
|
|
|
|
|
|
script style meta link object title isindex base |
643
|
|
|
|
|
|
|
) |
644
|
|
|
|
|
|
|
), |
645
|
|
|
|
|
|
|
( map { $_ => 'list_item_element' } qw( li dd dt ) ), |
646
|
|
|
|
|
|
|
( map { $_ => 'table_cell_element' } qw( td th ) ), |
647
|
|
|
|
|
|
|
( map { $_ => 'table_row_element' } qw( tr ) ), |
648
|
|
|
|
|
|
|
); |
649
|
|
|
|
|
|
|
|
650
|
|
|
|
|
|
|
@Marpa::HTML::Internal::CORE_OPTIONAL_TERMINALS = qw( |
651
|
|
|
|
|
|
|
E_html |
652
|
|
|
|
|
|
|
E_body |
653
|
|
|
|
|
|
|
S_table |
654
|
|
|
|
|
|
|
E_head |
655
|
|
|
|
|
|
|
E_table |
656
|
|
|
|
|
|
|
E_tbody |
657
|
|
|
|
|
|
|
E_tr |
658
|
|
|
|
|
|
|
E_td |
659
|
|
|
|
|
|
|
S_td |
660
|
|
|
|
|
|
|
S_tr |
661
|
|
|
|
|
|
|
S_tbody |
662
|
|
|
|
|
|
|
S_head |
663
|
|
|
|
|
|
|
S_body |
664
|
|
|
|
|
|
|
S_html |
665
|
|
|
|
|
|
|
); |
666
|
|
|
|
|
|
|
|
667
|
|
|
|
|
|
|
%Marpa::HTML::Internal::CORE_OPTIONAL_TERMINALS = (); |
668
|
|
|
|
|
|
|
for my $rank ( 0 .. $#Marpa::HTML::Internal::CORE_OPTIONAL_TERMINALS ) { |
669
|
|
|
|
|
|
|
$Marpa::HTML::Internal::CORE_OPTIONAL_TERMINALS{ |
670
|
|
|
|
|
|
|
$Marpa::HTML::Internal::CORE_OPTIONAL_TERMINALS[$rank] } = $rank; |
671
|
|
|
|
|
|
|
} |
672
|
|
|
|
|
|
|
|
673
|
|
|
|
|
|
|
%Marpa::HTML::Internal::VIRTUAL_TOKEN_HIERARCHY = (); |
674
|
|
|
|
|
|
|
{ |
675
|
|
|
|
|
|
|
my $hierarchy = <<'END_OF_STRING'; |
676
|
|
|
|
|
|
|
th td |
677
|
|
|
|
|
|
|
tr |
678
|
|
|
|
|
|
|
col |
679
|
|
|
|
|
|
|
caption colgroup tfoot thead tbody |
680
|
|
|
|
|
|
|
table |
681
|
|
|
|
|
|
|
body head |
682
|
|
|
|
|
|
|
html |
683
|
|
|
|
|
|
|
END_OF_STRING |
684
|
|
|
|
|
|
|
|
685
|
|
|
|
|
|
|
my $iota = 0; |
686
|
|
|
|
|
|
|
my @hierarchy; |
687
|
|
|
|
|
|
|
for my $level ( split /\n/xms, $hierarchy ) { |
688
|
|
|
|
|
|
|
push @hierarchy, |
689
|
|
|
|
|
|
|
map { ( "S_$_" => $iota, "E_$_" => $iota ) } |
690
|
|
|
|
|
|
|
( split q{ }, $level ); |
691
|
|
|
|
|
|
|
$iota++; |
692
|
|
|
|
|
|
|
} ## end for my $level ( split /\n/xms, $hierarchy ) |
693
|
|
|
|
|
|
|
%Marpa::HTML::Internal::VIRTUAL_TOKEN_HIERARCHY = @hierarchy; |
694
|
|
|
|
|
|
|
$Marpa::HTML::Internal::VIRTUAL_TOKEN_HIERARCHY{EOF} = |
695
|
|
|
|
|
|
|
$Marpa::HTML::Internal::VIRTUAL_TOKEN_HIERARCHY{E_tbody}; |
696
|
|
|
|
|
|
|
} |
697
|
|
|
|
|
|
|
|
698
|
|
|
|
|
|
|
# This display set to be ignored |
699
|
|
|
|
|
|
|
# until the HTML::Implementation doc |
700
|
|
|
|
|
|
|
# is ready. |
701
|
|
|
|
|
|
|
|
702
|
|
|
|
|
|
|
# Marpa::HTML::Display |
703
|
|
|
|
|
|
|
# name: HTML BNF |
704
|
|
|
|
|
|
|
# ignore: 1 |
705
|
|
|
|
|
|
|
# start-after-line: END_OF_BNF |
706
|
|
|
|
|
|
|
# end-before-line: '^END_OF_BNF$' |
707
|
|
|
|
|
|
|
|
708
|
|
|
|
|
|
|
my $BNF = <<'END_OF_BNF'; |
709
|
|
|
|
|
|
|
cruft ::= CRUFT |
710
|
|
|
|
|
|
|
comment ::= C |
711
|
|
|
|
|
|
|
pi ::= PI |
712
|
|
|
|
|
|
|
decl ::= D |
713
|
|
|
|
|
|
|
pcdata ::= PCDATA |
714
|
|
|
|
|
|
|
cdata ::= CDATA |
715
|
|
|
|
|
|
|
whitespace ::= WHITESPACE |
716
|
|
|
|
|
|
|
SGML_item ::= comment |
717
|
|
|
|
|
|
|
SGML_item ::= pi |
718
|
|
|
|
|
|
|
SGML_item ::= decl |
719
|
|
|
|
|
|
|
SGML_flow_item ::= SGML_item |
720
|
|
|
|
|
|
|
SGML_flow_item ::= whitespace |
721
|
|
|
|
|
|
|
SGML_flow_item ::= cruft |
722
|
|
|
|
|
|
|
SGML_flow ::= SGML_flow_item* |
723
|
|
|
|
|
|
|
document ::= prolog ELE_html trailer EOF |
724
|
|
|
|
|
|
|
prolog ::= SGML_flow |
725
|
|
|
|
|
|
|
trailer ::= SGML_flow |
726
|
|
|
|
|
|
|
ELE_html ::= S_html Contents_html E_html |
727
|
|
|
|
|
|
|
Contents_html ::= SGML_flow ELE_head SGML_flow ELE_body SGML_flow |
728
|
|
|
|
|
|
|
ELE_head ::= S_head Contents_head E_head |
729
|
|
|
|
|
|
|
Contents_head ::= head_item* |
730
|
|
|
|
|
|
|
ELE_body ::= S_body flow E_body |
731
|
|
|
|
|
|
|
ELE_table ::= S_table table_flow E_table |
732
|
|
|
|
|
|
|
ELE_tbody ::= S_tbody table_section_flow E_tbody |
733
|
|
|
|
|
|
|
ELE_tr ::= S_tr table_row_flow E_tr |
734
|
|
|
|
|
|
|
ELE_td ::= S_td flow E_td |
735
|
|
|
|
|
|
|
flow ::= flow_item* |
736
|
|
|
|
|
|
|
flow_item ::= cruft |
737
|
|
|
|
|
|
|
flow_item ::= SGML_item |
738
|
|
|
|
|
|
|
flow_item ::= ELE_table |
739
|
|
|
|
|
|
|
flow_item ::= list_item_element |
740
|
|
|
|
|
|
|
flow_item ::= header_element |
741
|
|
|
|
|
|
|
flow_item ::= block_element |
742
|
|
|
|
|
|
|
flow_item ::= inline_element |
743
|
|
|
|
|
|
|
flow_item ::= whitespace |
744
|
|
|
|
|
|
|
flow_item ::= cdata |
745
|
|
|
|
|
|
|
flow_item ::= pcdata |
746
|
|
|
|
|
|
|
head_item ::= header_element |
747
|
|
|
|
|
|
|
head_item ::= cruft |
748
|
|
|
|
|
|
|
head_item ::= whitespace |
749
|
|
|
|
|
|
|
head_item ::= SGML_item |
750
|
|
|
|
|
|
|
inline_flow ::= inline_flow_item* |
751
|
|
|
|
|
|
|
inline_flow_item ::= pcdata_flow_item |
752
|
|
|
|
|
|
|
inline_flow_item ::= inline_element |
753
|
|
|
|
|
|
|
pcdata_flow ::= pcdata_flow_item* |
754
|
|
|
|
|
|
|
pcdata_flow_item ::= cdata |
755
|
|
|
|
|
|
|
pcdata_flow_item ::= pcdata |
756
|
|
|
|
|
|
|
pcdata_flow_item ::= cruft |
757
|
|
|
|
|
|
|
pcdata_flow_item ::= whitespace |
758
|
|
|
|
|
|
|
pcdata_flow_item ::= SGML_item |
759
|
|
|
|
|
|
|
Contents_select ::= select_flow_item* |
760
|
|
|
|
|
|
|
select_flow_item ::= ELE_optgroup |
761
|
|
|
|
|
|
|
select_flow_item ::= ELE_option |
762
|
|
|
|
|
|
|
select_flow_item ::= SGML_flow_item |
763
|
|
|
|
|
|
|
Contents_optgroup ::= optgroup_flow_item* |
764
|
|
|
|
|
|
|
optgroup_flow_item ::= ELE_option |
765
|
|
|
|
|
|
|
optgroup_flow_item ::= SGML_flow_item |
766
|
|
|
|
|
|
|
list_item_flow ::= list_item_flow_item* |
767
|
|
|
|
|
|
|
list_item_flow_item ::= cruft |
768
|
|
|
|
|
|
|
list_item_flow_item ::= SGML_item |
769
|
|
|
|
|
|
|
list_item_flow_item ::= header_element |
770
|
|
|
|
|
|
|
list_item_flow_item ::= block_element |
771
|
|
|
|
|
|
|
list_item_flow_item ::= inline_element |
772
|
|
|
|
|
|
|
list_item_flow_item ::= whitespace |
773
|
|
|
|
|
|
|
list_item_flow_item ::= cdata |
774
|
|
|
|
|
|
|
list_item_flow_item ::= pcdata |
775
|
|
|
|
|
|
|
Contents_colgroup ::= colgroup_flow_item* |
776
|
|
|
|
|
|
|
colgroup_flow_item ::= ELE_col |
777
|
|
|
|
|
|
|
colgroup_flow_item ::= SGML_flow_item |
778
|
|
|
|
|
|
|
table_row_flow ::= table_row_flow_item* |
779
|
|
|
|
|
|
|
table_row_flow_item ::= ELE_th |
780
|
|
|
|
|
|
|
table_row_flow_item ::= ELE_td |
781
|
|
|
|
|
|
|
table_row_flow_item ::= SGML_flow_item |
782
|
|
|
|
|
|
|
table_section_flow ::= table_section_flow_item* |
783
|
|
|
|
|
|
|
table_section_flow_item ::= table_row_element |
784
|
|
|
|
|
|
|
table_section_flow_item ::= SGML_flow_item |
785
|
|
|
|
|
|
|
table_row_element ::= ELE_tr |
786
|
|
|
|
|
|
|
table_flow ::= table_flow_item* |
787
|
|
|
|
|
|
|
table_flow_item ::= ELE_colgroup |
788
|
|
|
|
|
|
|
table_flow_item ::= ELE_thead |
789
|
|
|
|
|
|
|
table_flow_item ::= ELE_tfoot |
790
|
|
|
|
|
|
|
table_flow_item ::= ELE_tbody |
791
|
|
|
|
|
|
|
table_flow_item ::= ELE_caption |
792
|
|
|
|
|
|
|
table_flow_item ::= ELE_col |
793
|
|
|
|
|
|
|
table_flow_item ::= SGML_flow_item |
794
|
|
|
|
|
|
|
empty ::= |
795
|
|
|
|
|
|
|
END_OF_BNF |
796
|
|
|
|
|
|
|
|
797
|
|
|
|
|
|
|
@Marpa::HTML::Internal::CORE_RULES = (); |
798
|
|
|
|
|
|
|
|
799
|
|
|
|
|
|
|
my %handler = ( |
800
|
|
|
|
|
|
|
cruft => '!CRUFT_handler', |
801
|
|
|
|
|
|
|
comment => '!COMMENT_handler', |
802
|
|
|
|
|
|
|
pi => '!PI_handler', |
803
|
|
|
|
|
|
|
decl => '!DECL_handler', |
804
|
|
|
|
|
|
|
document => '!TOP_handler', |
805
|
|
|
|
|
|
|
whitespace => '!WHITESPACE_handler', |
806
|
|
|
|
|
|
|
pcdata => '!PCDATA_handler', |
807
|
|
|
|
|
|
|
cdata => '!CDATA_handler', |
808
|
|
|
|
|
|
|
prolog => '!PROLOG_handler', |
809
|
|
|
|
|
|
|
trailer => '!TRAILER_handler', |
810
|
|
|
|
|
|
|
); |
811
|
|
|
|
|
|
|
|
812
|
|
|
|
|
|
|
for my $bnf_production ( split /\n/xms, $BNF ) { |
813
|
|
|
|
|
|
|
my $sequence = ( $bnf_production =~ s/ [*] \s* $//xms ); |
814
|
|
|
|
|
|
|
$bnf_production =~ s/ \s* [:][:][=] \s* / /xms; |
815
|
|
|
|
|
|
|
my @symbols = ( split q{ }, $bnf_production ); |
816
|
|
|
|
|
|
|
my $lhs = shift @symbols; |
817
|
|
|
|
|
|
|
my %rule_descriptor = ( |
818
|
|
|
|
|
|
|
lhs => $lhs, |
819
|
|
|
|
|
|
|
rhs => \@symbols, |
820
|
|
|
|
|
|
|
); |
821
|
|
|
|
|
|
|
if ($sequence) { |
822
|
|
|
|
|
|
|
$rule_descriptor{min} = 0; |
823
|
|
|
|
|
|
|
} |
824
|
|
|
|
|
|
|
if ( my $handler = $handler{$lhs} ) { |
825
|
|
|
|
|
|
|
$rule_descriptor{action} = $handler; |
826
|
|
|
|
|
|
|
} |
827
|
|
|
|
|
|
|
elsif ( $lhs =~ /^ELE_/xms ) { |
828
|
|
|
|
|
|
|
$rule_descriptor{action} = "!$lhs"; |
829
|
|
|
|
|
|
|
} |
830
|
|
|
|
|
|
|
push @Marpa::HTML::Internal::CORE_RULES, \%rule_descriptor; |
831
|
|
|
|
|
|
|
} ## end for my $bnf_production ( split /\n/xms, $BNF ) |
832
|
|
|
|
|
|
|
|
833
|
|
|
|
|
|
|
@Marpa::HTML::Internal::CORE_TERMINALS = |
834
|
|
|
|
|
|
|
qw(C D PI CRUFT CDATA PCDATA WHITESPACE EOF ); |
835
|
|
|
|
|
|
|
|
836
|
|
|
|
|
|
|
push @Marpa::HTML::Internal::CORE_TERMINALS, |
837
|
|
|
|
|
|
|
keys %Marpa::HTML::Internal::CORE_OPTIONAL_TERMINALS; |
838
|
|
|
|
|
|
|
|
839
|
6
|
|
|
6
|
|
60
|
no strict 'refs'; |
|
6
|
|
|
|
|
14
|
|
|
6
|
|
|
|
|
304
|
|
840
|
|
|
|
|
|
|
*{'Marpa::HTML::Internal::default_action'} = create_tdesc_handler(); |
841
|
6
|
|
|
6
|
|
45
|
use strict; |
|
6
|
|
|
|
|
12
|
|
|
6
|
|
|
|
|
25537
|
|
842
|
|
|
|
|
|
|
|
843
|
|
|
|
|
|
|
%Marpa::HTML::Internal::EMPTY_ELEMENT = map { $_ => 1 } qw( |
844
|
|
|
|
|
|
|
area base basefont br col frame hr |
845
|
|
|
|
|
|
|
img input isindex link meta param); |
846
|
|
|
|
|
|
|
|
847
|
|
|
|
|
|
|
%Marpa::HTML::Internal::CONTENTS = ( |
848
|
|
|
|
|
|
|
'p' => 'inline_flow', |
849
|
|
|
|
|
|
|
'select' => 'Contents_select', |
850
|
|
|
|
|
|
|
'option' => 'pcdata_flow', |
851
|
|
|
|
|
|
|
'optgroup' => 'Contents_optgroup', |
852
|
|
|
|
|
|
|
'dt' => 'inline_flow', |
853
|
|
|
|
|
|
|
'dd' => 'list_item_flow', |
854
|
|
|
|
|
|
|
'li' => 'list_item_flow', |
855
|
|
|
|
|
|
|
'colgroup' => 'Contents_colgroup', |
856
|
|
|
|
|
|
|
'thead' => 'table_section_flow', |
857
|
|
|
|
|
|
|
'tfoot' => 'table_section_flow', |
858
|
|
|
|
|
|
|
'tbody' => 'table_section_flow', |
859
|
|
|
|
|
|
|
'table' => 'table_flow', |
860
|
|
|
|
|
|
|
( map { $_ => 'empty' } keys %Marpa::HTML::Internal::EMPTY_ELEMENT ), |
861
|
|
|
|
|
|
|
); |
862
|
|
|
|
|
|
|
|
863
|
|
|
|
|
|
|
sub parse { |
864
|
91
|
|
|
91
|
|
244
|
my ( $self, $document_ref ) = @_; |
865
|
|
|
|
|
|
|
|
866
|
91
|
|
|
|
|
221
|
my %start_tags = (); |
867
|
91
|
|
|
|
|
169
|
my %end_tags = (); |
868
|
|
|
|
|
|
|
|
869
|
91
|
50
|
|
|
|
350
|
Carp::croak( |
870
|
|
|
|
|
|
|
"parse() already run on this object\n", |
871
|
|
|
|
|
|
|
'For a new parse, create a new object' |
872
|
|
|
|
|
|
|
) if $self->{document}; |
873
|
|
|
|
|
|
|
|
874
|
91
|
|
|
|
|
193
|
my $trace_cruft = $self->{trace_cruft}; |
875
|
91
|
|
50
|
|
|
514
|
my $trace_terminals = $self->{trace_terminals} // 0; |
876
|
91
|
|
|
|
|
183
|
my $trace_conflicts = $self->{trace_conflicts}; |
877
|
91
|
|
|
|
|
164
|
my $trace_fh = $self->{trace_fh}; |
878
|
91
|
|
|
|
|
224
|
my $ref_type = ref $document_ref; |
879
|
91
|
|
|
|
|
311
|
Carp::croak('Arg to parse() must be ref to string') |
880
|
|
|
|
|
|
|
if not $ref_type |
881
|
|
|
|
|
|
|
or $ref_type ne 'SCALAR' |
882
|
91
|
50
|
33
|
|
|
578
|
or not defined ${$document_ref}; |
|
|
|
33
|
|
|
|
|
883
|
|
|
|
|
|
|
|
884
|
91
|
|
|
|
|
147
|
my %pull_parser_args; |
885
|
91
|
|
|
|
|
376
|
my $document = $pull_parser_args{doc} = $self->{document} = $document_ref; |
886
|
91
|
|
33
|
|
|
1074
|
my $pull_parser = |
887
|
|
|
|
|
|
|
HTML::PullParser->new( %pull_parser_args, |
888
|
|
|
|
|
|
|
%Marpa::HTML::PULL_PARSER_OPTIONS ) |
889
|
|
|
|
|
|
|
|| Carp::croak('Could not create pull parser'); |
890
|
|
|
|
|
|
|
|
891
|
91
|
|
|
|
|
16295
|
my @tokens = (); |
892
|
|
|
|
|
|
|
|
893
|
91
|
|
|
|
|
283
|
my %terminals = map { $_ => 1 } @Marpa::HTML::Internal::CORE_TERMINALS; |
|
2002
|
|
|
|
|
4387
|
|
894
|
91
|
|
|
|
|
1147
|
my %optional_terminals = %Marpa::HTML::Internal::CORE_OPTIONAL_TERMINALS; |
895
|
91
|
|
|
|
|
267
|
my @html_parser_tokens = (); |
896
|
91
|
|
|
|
|
203
|
my @marpa_tokens = (); |
897
|
|
|
|
|
|
|
HTML_PARSER_TOKEN: |
898
|
91
|
|
|
|
|
469
|
while ( my $html_parser_token = $pull_parser->get_token ) { |
899
|
1361
|
|
|
|
|
2485
|
my ( $token_type, $line, $column, $offset, $offset_end ) = |
900
|
1361
|
|
|
|
|
17865
|
@{$html_parser_token}; |
901
|
|
|
|
|
|
|
|
902
|
|
|
|
|
|
|
# If it's a virtual token from HTML::Parser, |
903
|
|
|
|
|
|
|
# pretend it never existed. |
904
|
|
|
|
|
|
|
# We figure out where the missing tags are, |
905
|
|
|
|
|
|
|
# and HTML::Parser's guesses are not helpful. |
906
|
1361
|
100
|
|
|
|
2975
|
next HTML_PARSER_TOKEN if $offset_end <= $offset; |
907
|
|
|
|
|
|
|
|
908
|
1359
|
|
|
|
|
2262
|
my $token_number = scalar @html_parser_tokens; |
909
|
1359
|
|
|
|
|
1893
|
push @html_parser_tokens, $html_parser_token; |
910
|
|
|
|
|
|
|
|
911
|
1359
|
|
|
|
|
1610
|
given ($token_type) { |
912
|
1359
|
|
|
|
|
2444
|
when ('T') { |
913
|
626
|
|
|
|
|
905
|
my $is_cdata = $html_parser_token |
914
|
|
|
|
|
|
|
->[Marpa::HTML::Internal::Token::IS_CDATA]; |
915
|
626
|
|
|
|
|
10661
|
push @marpa_tokens, |
916
|
|
|
|
|
|
|
[ |
917
|
|
|
|
|
|
|
( substr( |
918
|
626
|
50
|
|
|
|
737
|
${$document}, $offset, |
|
|
100
|
|
|
|
|
|
919
|
|
|
|
|
|
|
( $offset_end - $offset ) |
920
|
|
|
|
|
|
|
) =~ / \A \s* \z /xms ? 'WHITESPACE' |
921
|
|
|
|
|
|
|
: $is_cdata ? 'CDATA' |
922
|
|
|
|
|
|
|
: 'PCDATA' |
923
|
|
|
|
|
|
|
), |
924
|
|
|
|
|
|
|
[ [ 'UNVALUED_SPAN', $token_number, $token_number ] ], |
925
|
|
|
|
|
|
|
]; |
926
|
|
|
|
|
|
|
} ## end when ('T') |
927
|
733
|
|
|
|
|
1012
|
when ('S') { |
928
|
398
|
|
|
|
|
580
|
my $tag_name = $html_parser_token |
929
|
|
|
|
|
|
|
->[Marpa::HTML::Internal::Token::TAGNAME]; |
930
|
398
|
|
|
|
|
699
|
$start_tags{$tag_name}++; |
931
|
398
|
|
|
|
|
678
|
my $terminal = "S_$tag_name"; |
932
|
398
|
|
|
|
|
606
|
$terminals{$terminal}++; |
933
|
398
|
|
|
|
|
2748
|
push @marpa_tokens, |
934
|
|
|
|
|
|
|
[ |
935
|
|
|
|
|
|
|
$terminal, |
936
|
|
|
|
|
|
|
[ [ 'UNVALUED_SPAN', $token_number, $token_number ] ], |
937
|
|
|
|
|
|
|
]; |
938
|
|
|
|
|
|
|
} ## end when ('S') |
939
|
335
|
|
|
|
|
500
|
when ('E') { |
940
|
331
|
|
|
|
|
505
|
my $tag_name = $html_parser_token |
941
|
|
|
|
|
|
|
->[Marpa::HTML::Internal::Token::TAGNAME]; |
942
|
331
|
|
|
|
|
566
|
$end_tags{$tag_name}++; |
943
|
331
|
|
|
|
|
506
|
my $terminal = "E_$tag_name"; |
944
|
331
|
|
|
|
|
674
|
$terminals{$terminal}++; |
945
|
331
|
|
|
|
|
2428
|
push @marpa_tokens, |
946
|
|
|
|
|
|
|
[ |
947
|
|
|
|
|
|
|
$terminal, |
948
|
|
|
|
|
|
|
[ [ 'UNVALUED_SPAN', $token_number, $token_number ] ], |
949
|
|
|
|
|
|
|
]; |
950
|
|
|
|
|
|
|
} ## end when ('E') |
951
|
4
|
|
|
|
|
15
|
when ( [qw(C D)] ) { |
952
|
4
|
|
|
|
|
34
|
push @marpa_tokens, |
953
|
|
|
|
|
|
|
[ |
954
|
|
|
|
|
|
|
$_, [ [ 'UNVALUED_SPAN', $token_number, $token_number ] ], |
955
|
|
|
|
|
|
|
]; |
956
|
|
|
|
|
|
|
} ## end when ( [qw(C D)] ) |
957
|
0
|
|
|
|
|
0
|
when ( ['PI'] ) { |
958
|
0
|
|
|
|
|
0
|
push @marpa_tokens, |
959
|
|
|
|
|
|
|
[ |
960
|
|
|
|
|
|
|
$_, [ [ 'UNVALUED_SPAN', $token_number, $token_number ] ], |
961
|
|
|
|
|
|
|
]; |
962
|
|
|
|
|
|
|
} ## end when ( ['PI'] ) |
963
|
0
|
|
|
|
|
0
|
default { Carp::croak("Unprovided-for event: $_") } |
|
0
|
|
|
|
|
0
|
|
964
|
|
|
|
|
|
|
} ## end given |
965
|
|
|
|
|
|
|
} ## end while ( my $html_parser_token = $pull_parser->get_token) |
966
|
|
|
|
|
|
|
|
967
|
|
|
|
|
|
|
# Points AFTER the last HTML |
968
|
|
|
|
|
|
|
# Parser token. |
969
|
|
|
|
|
|
|
# The other logic needs to be ready for this. |
970
|
91
|
|
|
|
|
1379
|
push @marpa_tokens, [ 'EOF', [ ['POINT'] ] ]; |
971
|
|
|
|
|
|
|
|
972
|
91
|
|
|
|
|
184
|
$pull_parser = undef; # conserve memory |
973
|
|
|
|
|
|
|
|
974
|
91
|
|
|
|
|
1617
|
my @rules = @Marpa::HTML::Internal::CORE_RULES; |
975
|
91
|
|
|
|
|
872
|
my @terminals = keys %terminals; |
976
|
|
|
|
|
|
|
|
977
|
91
|
|
|
|
|
560
|
my %pseudoclass_element_actions = (); |
978
|
91
|
|
|
|
|
262
|
my %element_actions = (); |
979
|
|
|
|
|
|
|
|
980
|
|
|
|
|
|
|
# Special cases which are dealt with elsewhere. |
981
|
|
|
|
|
|
|
# As of now the only special cases are elements with optional |
982
|
|
|
|
|
|
|
# start and end tags |
983
|
91
|
|
|
|
|
227
|
for my $special_element (qw(html head body table tbody tr td)) { |
984
|
637
|
|
|
|
|
767
|
delete $start_tags{$special_element}; |
985
|
637
|
|
|
|
|
1600
|
$element_actions{"!ELE_$special_element"} = $special_element; |
986
|
|
|
|
|
|
|
} |
987
|
|
|
|
|
|
|
|
988
|
91
|
|
|
|
|
335
|
ELEMENT: for ( keys %start_tags ) { |
989
|
50
|
|
|
|
|
113
|
my $start_tag = "S_$_"; |
990
|
50
|
|
|
|
|
102
|
my $end_tag = "E_$_"; |
991
|
50
|
|
100
|
|
|
258
|
my $contents = $Marpa::HTML::Internal::CONTENTS{$_} // 'flow'; |
992
|
50
|
|
100
|
|
|
331
|
my $element_type = $Marpa::HTML::Internal::ELEMENT_TYPE{$_} |
993
|
|
|
|
|
|
|
// 'inline_element'; |
994
|
|
|
|
|
|
|
|
995
|
50
|
|
|
|
|
3270
|
push @rules, |
996
|
|
|
|
|
|
|
{ |
997
|
|
|
|
|
|
|
lhs => $element_type, |
998
|
|
|
|
|
|
|
rhs => ["ELE_$_"], |
999
|
|
|
|
|
|
|
}, |
1000
|
|
|
|
|
|
|
{ |
1001
|
|
|
|
|
|
|
lhs => "ELE_$_", |
1002
|
|
|
|
|
|
|
rhs => [ $start_tag, $contents, $end_tag ], |
1003
|
|
|
|
|
|
|
action => "!ELE_$_", |
1004
|
|
|
|
|
|
|
}; |
1005
|
|
|
|
|
|
|
|
1006
|
|
|
|
|
|
|
# There may be no |
1007
|
|
|
|
|
|
|
# end tag in the input. |
1008
|
|
|
|
|
|
|
# This silences the warning. |
1009
|
50
|
100
|
|
|
|
185
|
if ( not $terminals{$end_tag} ) { |
1010
|
21
|
|
|
|
|
41
|
push @terminals, $end_tag; |
1011
|
21
|
|
|
|
|
52
|
$terminals{$end_tag}++; |
1012
|
|
|
|
|
|
|
} |
1013
|
|
|
|
|
|
|
|
1014
|
|
|
|
|
|
|
# Make each new optional terminal the highest ranking |
1015
|
50
|
|
|
|
|
113
|
$optional_terminals{$end_tag} = keys %optional_terminals; |
1016
|
|
|
|
|
|
|
|
1017
|
50
|
|
|
|
|
249
|
$element_actions{"!ELE_$_"} = $_; |
1018
|
|
|
|
|
|
|
} ## end for ( keys %start_tags ) |
1019
|
|
|
|
|
|
|
|
1020
|
|
|
|
|
|
|
# The question is where to put cruft -- in the current element, |
1021
|
|
|
|
|
|
|
# or at a higher level. As a first step, we set up a system of |
1022
|
|
|
|
|
|
|
# levels for specific elements, going from the lowest, where no |
1023
|
|
|
|
|
|
|
# cruft is allowed, to the highest, where everything is |
1024
|
|
|
|
|
|
|
# acceptable as cruft, if only because it has nowhere else to go. |
1025
|
|
|
|
|
|
|
|
1026
|
|
|
|
|
|
|
# First step, set up the levels, using specific elements. |
1027
|
|
|
|
|
|
|
# Some of these elements will are stand-ins for large category. |
1028
|
|
|
|
|
|
|
# For example, the HR element stands in for those elements |
1029
|
|
|
|
|
|
|
# such as empty elements, |
1030
|
|
|
|
|
|
|
# which tolerate zero cruft, while SPAN stands in for |
1031
|
|
|
|
|
|
|
# inline elements and DIV stands in for the class of |
1032
|
|
|
|
|
|
|
# block-level elements |
1033
|
|
|
|
|
|
|
|
1034
|
91
|
|
|
|
|
228
|
my %ok_as_cruft = (); |
1035
|
91
|
|
|
|
|
221
|
DECIDE_CRUFT_TREATMENT: { |
1036
|
91
|
|
|
|
|
146
|
my %level = (); |
1037
|
91
|
|
|
|
|
1434
|
my @elements_by_level = ( |
1038
|
|
|
|
|
|
|
[qw( HR HEAD )], |
1039
|
|
|
|
|
|
|
[qw( SPAN OPTION )], |
1040
|
|
|
|
|
|
|
[qw( LI OPTGROUP DD DT )], |
1041
|
|
|
|
|
|
|
[qw( DIR MENU )], |
1042
|
|
|
|
|
|
|
[qw( DIV )], |
1043
|
|
|
|
|
|
|
[qw( UL OL DL )], |
1044
|
|
|
|
|
|
|
[qw( TH TD )], |
1045
|
|
|
|
|
|
|
[qw( TR )], |
1046
|
|
|
|
|
|
|
[qw( COL )], |
1047
|
|
|
|
|
|
|
[qw( CAPTION COLGROUP THEAD TFOOT TBODY )], |
1048
|
|
|
|
|
|
|
[qw( TABLE )], |
1049
|
|
|
|
|
|
|
[qw( BODY )], |
1050
|
|
|
|
|
|
|
[qw( HTML )], |
1051
|
|
|
|
|
|
|
); |
1052
|
|
|
|
|
|
|
|
1053
|
|
|
|
|
|
|
# EOF comes after everything -- it is |
1054
|
|
|
|
|
|
|
# the highest level of all |
1055
|
91
|
|
|
|
|
279
|
$level{EOF} = scalar @elements_by_level; |
1056
|
|
|
|
|
|
|
|
1057
|
|
|
|
|
|
|
# Assign levels to the end tags of the elements |
1058
|
|
|
|
|
|
|
# in the above table. |
1059
|
91
|
|
|
|
|
339
|
for my $level ( 0 .. $#elements_by_level ) { |
1060
|
1183
|
|
|
|
|
1307
|
for my $element ( @{ $elements_by_level[$level] } ) { |
|
1183
|
|
|
|
|
1981
|
|
1061
|
2366
|
|
|
|
|
8527
|
$level{ 'S_' . lc $element } = $level{ 'E_' . lc $element } = |
1062
|
|
|
|
|
|
|
$level; |
1063
|
|
|
|
|
|
|
} |
1064
|
|
|
|
|
|
|
} ## end for my $level ( 0 .. $#elements_by_level ) |
1065
|
|
|
|
|
|
|
|
1066
|
91
|
|
|
|
|
245
|
my $no_cruft_allowed = $level{E_hr}; |
1067
|
91
|
|
|
|
|
168
|
my $block_level = $level{E_div}; |
1068
|
91
|
|
|
|
|
165
|
my $inline_level = $level{E_span}; |
1069
|
|
|
|
|
|
|
|
1070
|
|
|
|
|
|
|
# Now that we have set out the structure of levels |
1071
|
|
|
|
|
|
|
# fill it in for all the terminals we have yet to |
1072
|
|
|
|
|
|
|
# define. |
1073
|
3426
|
|
|
|
|
5909
|
TERMINAL: |
1074
|
91
|
|
|
|
|
489
|
for my $terminal ( grep { not defined $level{$_} } |
1075
|
|
|
|
|
|
|
( @terminals, keys %optional_terminals ) ) |
1076
|
|
|
|
|
|
|
{ |
1077
|
|
|
|
|
|
|
|
1078
|
|
|
|
|
|
|
# With the exception of EOF, |
1079
|
|
|
|
|
|
|
# only tags can have levels because only they really |
1080
|
|
|
|
|
|
|
# tell us anyting about "state" -- |
1081
|
|
|
|
|
|
|
# whether we are awaiting something |
1082
|
|
|
|
|
|
|
# or are inside something. |
1083
|
721
|
100
|
|
|
|
1715
|
if ( $terminal !~ /^[SE]_/xms ) { |
1084
|
637
|
|
|
|
|
942
|
$level{$terminal} = $no_cruft_allowed; |
1085
|
637
|
|
|
|
|
927
|
next TERMINAL; |
1086
|
|
|
|
|
|
|
} |
1087
|
84
|
|
|
|
|
151
|
my $element = substr $terminal, 2; |
1088
|
84
|
100
|
|
|
|
213
|
if ( $Marpa::HTML::Internal::EMPTY_ELEMENT{$element} ) { |
1089
|
24
|
|
|
|
|
37
|
$level{$terminal} = $no_cruft_allowed; |
1090
|
24
|
|
|
|
|
49
|
next TERMINAL; |
1091
|
|
|
|
|
|
|
} |
1092
|
|
|
|
|
|
|
|
1093
|
60
|
|
|
|
|
98
|
my $element_type = $Marpa::HTML::Internal::ELEMENT_TYPE{$element}; |
1094
|
60
|
50
|
33
|
|
|
405
|
if ( defined $element_type |
1095
|
|
|
|
|
|
|
and $element_type ~~ [qw(block_element header_element)] ) |
1096
|
|
|
|
|
|
|
{ |
1097
|
60
|
|
|
|
|
112
|
$level{$terminal} = $block_level; |
1098
|
60
|
|
|
|
|
165
|
next TERMINAL; |
1099
|
|
|
|
|
|
|
} ## end if ( defined $element_type and $element_type ~~ [...]) |
1100
|
|
|
|
|
|
|
|
1101
|
0
|
|
|
|
|
0
|
$level{$terminal} = $inline_level; |
1102
|
|
|
|
|
|
|
|
1103
|
|
|
|
|
|
|
} ## end for my $terminal ( grep { not defined $level{$_} } ( ...)) |
1104
|
|
|
|
|
|
|
|
1105
|
|
|
|
|
|
|
EXPECTED_TERMINAL: |
1106
|
91
|
|
|
|
|
520
|
for my $expected_terminal ( keys %optional_terminals ) { |
1107
|
|
|
|
|
|
|
|
1108
|
|
|
|
|
|
|
# Regardless of levels, allow no cruft before a start tag. |
1109
|
|
|
|
|
|
|
# Start whatever it is, then deal with the cruft. |
1110
|
1324
|
100
|
|
|
|
4460
|
next EXPECTED_TERMINAL if $expected_terminal =~ /^S_/xms; |
1111
|
|
|
|
|
|
|
|
1112
|
|
|
|
|
|
|
# For end tags, use the levels |
1113
|
687
|
|
|
|
|
900
|
TERMINAL: for my $actual_terminal (@terminals) { |
1114
|
15970
|
|
|
|
|
38925
|
$ok_as_cruft{$expected_terminal}{$actual_terminal} = |
1115
|
|
|
|
|
|
|
$level{$actual_terminal} < $level{$expected_terminal}; |
1116
|
|
|
|
|
|
|
} |
1117
|
|
|
|
|
|
|
} ## end for my $expected_terminal ( keys %optional_terminals ) |
1118
|
|
|
|
|
|
|
|
1119
|
|
|
|
|
|
|
} ## end DECIDE_CRUFT_TREATMENT: |
1120
|
|
|
|
|
|
|
|
1121
|
91
|
|
|
|
|
1573
|
my $grammar = Marpa::HTML::Grammar->new( |
1122
|
|
|
|
|
|
|
{ rules => \@rules, |
1123
|
|
|
|
|
|
|
start => 'document', |
1124
|
|
|
|
|
|
|
terminals => \@terminals, |
1125
|
|
|
|
|
|
|
inaccessible_ok => 1, |
1126
|
|
|
|
|
|
|
unproductive_ok => 1, |
1127
|
|
|
|
|
|
|
default_action => 'Marpa::HTML::Internal::default_action', |
1128
|
|
|
|
|
|
|
strip => 0, |
1129
|
|
|
|
|
|
|
} |
1130
|
|
|
|
|
|
|
); |
1131
|
91
|
|
|
|
|
2106424
|
$grammar->precompute(); |
1132
|
|
|
|
|
|
|
|
1133
|
91
|
50
|
|
|
|
27728292
|
if ( $self->{trace_rules} ) { |
1134
|
0
|
0
|
|
|
|
0
|
say {$trace_fh} $grammar->show_rules() |
|
0
|
|
|
|
|
0
|
|
1135
|
|
|
|
|
|
|
or Carp::croak("Cannot print: $ERRNO"); |
1136
|
|
|
|
|
|
|
} |
1137
|
91
|
50
|
|
|
|
464
|
if ( $self->{trace_QDFA} ) { |
1138
|
0
|
0
|
|
|
|
0
|
say {$trace_fh} $grammar->show_QDFA() |
|
0
|
|
|
|
|
0
|
|
1139
|
|
|
|
|
|
|
or Carp::croak("Cannot print: $ERRNO"); |
1140
|
|
|
|
|
|
|
} |
1141
|
|
|
|
|
|
|
|
1142
|
91
|
|
|
|
|
1661
|
my $recce = Marpa::HTML::Recognizer->new( |
1143
|
|
|
|
|
|
|
{ grammar => $grammar, |
1144
|
|
|
|
|
|
|
trace_terminals => $self->{trace_terminals}, |
1145
|
|
|
|
|
|
|
trace_earley_sets => $self->{trace_earley_sets}, |
1146
|
|
|
|
|
|
|
mode => 'stream', |
1147
|
|
|
|
|
|
|
} |
1148
|
|
|
|
|
|
|
); |
1149
|
|
|
|
|
|
|
|
1150
|
91
|
|
|
|
|
49745
|
$self->{recce} = $recce; |
1151
|
91
|
|
|
|
|
374
|
$self->{tokens} = \@html_parser_tokens; |
1152
|
|
|
|
|
|
|
|
1153
|
|
|
|
|
|
|
# These variables track virtual start tokens as |
1154
|
|
|
|
|
|
|
# a protection against infinite loops. |
1155
|
91
|
|
|
|
|
247
|
my %start_virtuals_used = (); |
1156
|
91
|
|
|
|
|
189
|
my $earleme_of_last_start_virtual = -1; |
1157
|
|
|
|
|
|
|
|
1158
|
91
|
|
|
|
|
277
|
my $marpa_token = shift @marpa_tokens; |
1159
|
91
|
|
|
|
|
382
|
RECCE_RESPONSE: while ( defined $marpa_token ) { |
1160
|
|
|
|
|
|
|
|
1161
|
2125
|
|
|
|
|
2630
|
my $read_result = $recce->read( @{$marpa_token} ); |
|
2125
|
|
|
|
|
8355
|
|
1162
|
2125
|
100
|
|
|
|
1254697
|
if ( defined $read_result ) { |
1163
|
1450
|
|
|
|
|
2797
|
$marpa_token = shift @marpa_tokens; |
1164
|
1450
|
|
|
|
|
4796
|
next RECCE_RESPONSE; |
1165
|
|
|
|
|
|
|
} |
1166
|
|
|
|
|
|
|
|
1167
|
675
|
|
|
|
|
1246
|
my $actual_terminal = $marpa_token->[0]; |
1168
|
675
|
50
|
|
|
|
1849
|
if ($trace_terminals) { |
1169
|
0
|
0
|
|
|
|
0
|
say {$trace_fh} 'Literal Token not accepted: ', $actual_terminal |
|
0
|
|
|
|
|
0
|
|
1170
|
|
|
|
|
|
|
or Carp::croak("Cannot print: $ERRNO"); |
1171
|
|
|
|
|
|
|
} |
1172
|
|
|
|
|
|
|
|
1173
|
675
|
|
|
|
|
1024
|
my $virtual_token_to_add; |
1174
|
|
|
|
|
|
|
|
1175
|
675
|
|
|
|
|
751
|
FIND_VIRTUAL_TOKEN: { |
1176
|
675
|
|
|
|
|
1013
|
my $virtual_terminal; |
1177
|
214
|
|
|
|
|
1157
|
my @virtuals_expected = |
1178
|
4561
|
|
|
|
|
12731
|
sort { $optional_terminals{$a} <=> $optional_terminals{$b} } |
1179
|
675
|
|
|
|
|
2093
|
grep { defined $optional_terminals{$_} } |
1180
|
675
|
|
|
|
|
861
|
@{ $recce->terminals_expected() }; |
1181
|
675
|
50
|
|
|
|
1668
|
if ($trace_conflicts) { |
1182
|
0
|
0
|
|
|
|
0
|
say {$trace_fh} 'Conflict of virtual choices' |
|
0
|
|
|
|
|
0
|
|
1183
|
|
|
|
|
|
|
or Carp::croak("Cannot print: $ERRNO"); |
1184
|
0
|
0
|
|
|
|
0
|
say {$trace_fh} "Actual Token is $actual_terminal" |
|
0
|
|
|
|
|
0
|
|
1185
|
|
|
|
|
|
|
or Carp::croak("Cannot print: $ERRNO"); |
1186
|
0
|
0
|
|
|
|
0
|
say {$trace_fh} +( scalar @virtuals_expected ), |
|
0
|
|
|
|
|
0
|
|
1187
|
|
|
|
|
|
|
' virtual terminals expected: ', join q{ }, |
1188
|
|
|
|
|
|
|
@virtuals_expected |
1189
|
|
|
|
|
|
|
or Carp::croak("Cannot print: $ERRNO"); |
1190
|
|
|
|
|
|
|
} ## end if ($trace_conflicts) |
1191
|
|
|
|
|
|
|
|
1192
|
|
|
|
|
|
|
LOOKAHEAD_VIRTUAL_TERMINAL: |
1193
|
675
|
|
|
|
|
1996
|
while ( my $candidate = pop @virtuals_expected ) { |
1194
|
|
|
|
|
|
|
|
1195
|
|
|
|
|
|
|
# Start an implied table only if the next token is one which |
1196
|
|
|
|
|
|
|
# can only occur inside a table |
1197
|
817
|
100
|
|
|
|
2393
|
if ( $candidate eq 'S_table' ) { |
1198
|
90
|
100
|
|
|
|
906
|
if (not $actual_terminal ~~ [ |
1199
|
|
|
|
|
|
|
qw( |
1200
|
|
|
|
|
|
|
S_caption S_col S_colgroup S_thead S_tfoot |
1201
|
|
|
|
|
|
|
S_tbody S_tr S_th S_td |
1202
|
|
|
|
|
|
|
E_caption E_col E_colgroup E_thead E_tfoot |
1203
|
|
|
|
|
|
|
E_tbody E_tr E_th E_td |
1204
|
|
|
|
|
|
|
E_table |
1205
|
|
|
|
|
|
|
) |
1206
|
|
|
|
|
|
|
] |
1207
|
|
|
|
|
|
|
) |
1208
|
|
|
|
|
|
|
{ |
1209
|
86
|
|
|
|
|
457
|
next LOOKAHEAD_VIRTUAL_TERMINAL; |
1210
|
|
|
|
|
|
|
} ## end if ( not $actual_terminal ~~ [ qw(...)]) |
1211
|
|
|
|
|
|
|
|
1212
|
|
|
|
|
|
|
# The above test implies the others below, so |
1213
|
|
|
|
|
|
|
# this virtual table start terminal is OK. |
1214
|
4
|
|
|
|
|
23
|
$virtual_terminal = $candidate; |
1215
|
4
|
|
|
|
|
13
|
last LOOKAHEAD_VIRTUAL_TERMINAL; |
1216
|
|
|
|
|
|
|
} ## end if ( $candidate eq 'S_table' ) |
1217
|
|
|
|
|
|
|
|
1218
|
|
|
|
|
|
|
# For other than , we are permissive.
1219
|
|
|
|
|
|
|
# Unless the lookahead gives us |
1220
|
|
|
|
|
|
|
# a specific reason to |
1221
|
|
|
|
|
|
|
# reject the virtual terminal, we accept it. |
1222
|
|
|
|
|
|
|
|
1223
|
|
|
|
|
|
|
# No need to check lookahead, unless we are starting |
1224
|
|
|
|
|
|
|
# an element |
1225
|
727
|
100
|
|
|
|
3384
|
if ( $candidate !~ /^S_/xms ) { |
1226
|
372
|
|
|
|
|
826
|
$virtual_terminal = $candidate; |
1227
|
372
|
|
|
|
|
755
|
last LOOKAHEAD_VIRTUAL_TERMINAL; |
1228
|
|
|
|
|
|
|
} |
1229
|
|
|
|
|
|
|
|
1230
|
|
|
|
|
|
|
#<<< no perltidy cycles as of 12 Mar 2010 |
1231
|
|
|
|
|
|
|
|
1232
|
|
|
|
|
|
|
my $candidate_level = |
1233
|
|
|
|
|
|
|
$Marpa::HTML::Internal::VIRTUAL_TOKEN_HIERARCHY{ |
1234
|
355
|
|
|
|
|
956
|
$candidate }; |
1235
|
|
|
|
|
|
|
|
1236
|
|
|
|
|
|
|
#>>> |
1237
|
|
|
|
|
|
|
# If the candidate is not part of the hierarchy, no need to check |
1238
|
|
|
|
|
|
|
# lookahead |
1239
|
355
|
50
|
|
|
|
804
|
if ( not defined $candidate_level ) { |
1240
|
0
|
|
|
|
|
0
|
$virtual_terminal = $candidate; |
1241
|
0
|
|
|
|
|
0
|
last LOOKAHEAD_VIRTUAL_TERMINAL; |
1242
|
|
|
|
|
|
|
} |
1243
|
|
|
|
|
|
|
|
1244
|
|
|
|
|
|
|
my $actual_terminal_level = |
1245
|
|
|
|
|
|
|
$Marpa::HTML::Internal::VIRTUAL_TOKEN_HIERARCHY{ |
1246
|
355
|
|
|
|
|
708
|
$actual_terminal}; |
1247
|
|
|
|
|
|
|
|
1248
|
|
|
|
|
|
|
# If the actual terminal is not part of the hierarchy, no need to check |
1249
|
|
|
|
|
|
|
# lookahead, either |
1250
|
355
|
100
|
|
|
|
1013
|
if ( not defined $actual_terminal_level ) { |
1251
|
179
|
|
|
|
|
240
|
$virtual_terminal = $candidate; |
1252
|
179
|
|
|
|
|
359
|
last LOOKAHEAD_VIRTUAL_TERMINAL; |
1253
|
|
|
|
|
|
|
} |
1254
|
|
|
|
|
|
|
|
1255
|
|
|
|
|
|
|
# Here we are trying to deal with a higher-level element's |
1256
|
|
|
|
|
|
|
# start or end, by starting a new lower level element. |
1257
|
|
|
|
|
|
|
# This won't work, because we'll have to close it |
1258
|
|
|
|
|
|
|
# immediately with another virtual terminal. |
1259
|
|
|
|
|
|
|
# At best this means useless, empty elements. |
1260
|
|
|
|
|
|
|
# At worst, it means an infinite loop where |
1261
|
|
|
|
|
|
|
# empty lower-level elements are repeatedly added. |
1262
|
|
|
|
|
|
|
# |
1263
|
|
|
|
|
|
|
next LOOKAHEAD_VIRTUAL_TERMINAL |
1264
|
176
|
100
|
|
|
|
620
|
if $candidate_level <= $actual_terminal_level; |
1265
|
|
|
|
|
|
|
|
1266
|
120
|
|
|
|
|
195
|
$virtual_terminal = $candidate; |
1267
|
120
|
|
|
|
|
241
|
last LOOKAHEAD_VIRTUAL_TERMINAL; |
1268
|
|
|
|
|
|
|
|
1269
|
|
|
|
|
|
|
} ## end while ( my $candidate = pop @virtuals_expected ) |
1270
|
|
|
|
|
|
|
|
1271
|
675
|
50
|
|
|
|
1570
|
if ($trace_terminals) { |
1272
|
0
|
0
|
|
|
|
0
|
say {$trace_fh} 'Converting Token: ', $actual_terminal |
|
0
|
|
|
|
|
0
|
|
1273
|
|
|
|
|
|
|
or Carp::croak("Cannot print: $ERRNO"); |
1274
|
0
|
0
|
|
|
|
0
|
if ( defined $virtual_terminal ) { |
1275
|
0
|
0
|
|
|
|
0
|
say {$trace_fh} 'Candidate as Virtual Token: ', |
|
0
|
|
|
|
|
0
|
|
1276
|
|
|
|
|
|
|
$virtual_terminal |
1277
|
|
|
|
|
|
|
or Carp::croak("Cannot print: $ERRNO"); |
1278
|
|
|
|
|
|
|
} |
1279
|
|
|
|
|
|
|
} ## end if ($trace_terminals) |
1280
|
|
|
|
|
|
|
|
1281
|
|
|
|
|
|
|
# Depending on the expected (optional or virtual) |
1282
|
|
|
|
|
|
|
# terminal and the actual |
1283
|
|
|
|
|
|
|
# terminal, we either want to add the actual one as cruft, or add |
1284
|
|
|
|
|
|
|
# the virtual one to move on in the parse. |
1285
|
|
|
|
|
|
|
|
1286
|
675
|
50
|
33
|
|
|
1832
|
if ( $trace_terminals > 1 and defined $virtual_terminal ) { |
1287
|
0
|
|
|
|
|
0
|
say {$trace_fh} |
|
0
|
|
|
|
|
0
|
|
1288
|
|
|
|
|
|
|
"OK as cruft when expecting $virtual_terminal: ", |
1289
|
0
|
0
|
|
|
|
0
|
join q{ }, keys %{ $ok_as_cruft{$virtual_terminal} } |
1290
|
|
|
|
|
|
|
or Carp::croak("Cannot print: $ERRNO"); |
1291
|
|
|
|
|
|
|
} ## end if ( $trace_terminals > 1 and defined $virtual_terminal) |
1292
|
|
|
|
|
|
|
|
1293
|
675
|
50
|
|
|
|
1413
|
last FIND_VIRTUAL_TOKEN if not defined $virtual_terminal; |
1294
|
|
|
|
|
|
|
last FIND_VIRTUAL_TOKEN |
1295
|
675
|
100
|
|
|
|
2829
|
if $ok_as_cruft{$virtual_terminal}{$actual_terminal}; |
1296
|
|
|
|
|
|
|
|
1297
|
673
|
100
|
|
|
|
2040
|
CHECK_FOR_INFINITE_LOOP: { |
1298
|
|
|
|
|
|
|
|
1299
|
|
|
|
|
|
|
# It is sufficient to check for start tags. |
1300
|
|
|
|
|
|
|
# Just ending things will never cause an infinite loop. |
1301
|
673
|
|
|
|
|
744
|
last CHECK_FOR_INFINITE_LOOP if $virtual_terminal !~ /^S_/xms; |
1302
|
|
|
|
|
|
|
|
1303
|
|
|
|
|
|
|
# Are we at the same earleme as we were when the last |
1304
|
|
|
|
|
|
|
# virtual start was added? If not, no problem. |
1305
|
|
|
|
|
|
|
# But we need to reinitialize. |
1306
|
303
|
|
|
|
|
1186
|
my $current_earleme = $recce->current_earleme(); |
1307
|
303
|
50
|
|
|
|
1571
|
if ( $current_earleme != $earleme_of_last_start_virtual ) { |
1308
|
303
|
|
|
|
|
492
|
$earleme_of_last_start_virtual = $current_earleme; |
1309
|
303
|
|
|
|
|
508
|
%start_virtuals_used = (); |
1310
|
303
|
|
|
|
|
697
|
last CHECK_FOR_INFINITE_LOOP; |
1311
|
|
|
|
|
|
|
} |
1312
|
|
|
|
|
|
|
|
1313
|
|
|
|
|
|
|
# Is this the first time we've added this start |
1314
|
|
|
|
|
|
|
# terminal? If so, we're OK. |
1315
|
|
|
|
|
|
|
last CHECK_FOR_INFINITE_LOOP |
1316
|
0
|
0
|
|
|
|
0
|
if $start_virtuals_used{$virtual_terminal}++ <= 1; |
1317
|
|
|
|
|
|
|
|
1318
|
|
|
|
|
|
|
# Attempt to add duplicate. |
1319
|
|
|
|
|
|
|
# Give up on adding virtual at this location, |
1320
|
|
|
|
|
|
|
# and warn the user. |
1321
|
0
|
|
|
|
|
0
|
( my $tagname = $virtual_terminal ) =~ s/^S_//xms; |
1322
|
0
|
0
|
|
|
|
0
|
say {$trace_fh} |
|
0
|
|
|
|
|
0
|
|
1323
|
|
|
|
|
|
|
"Warning: attempt to add <$tagname> twice at the same place" |
1324
|
|
|
|
|
|
|
or Carp::croak("Cannot print: $ERRNO"); |
1325
|
0
|
|
|
|
|
0
|
last FIND_VIRTUAL_TOKEN; |
1326
|
|
|
|
|
|
|
|
1327
|
|
|
|
|
|
|
} ## end CHECK_FOR_INFINITE_LOOP: |
1328
|
|
|
|
|
|
|
|
1329
|
673
|
|
|
|
|
1085
|
my $tdesc_list = $marpa_token->[1]; |
1330
|
673
|
|
|
|
|
1421
|
my $first_tdesc_start_token = |
1331
|
|
|
|
|
|
|
$tdesc_list->[0]->[Marpa::HTML::Internal::TDesc::START_TOKEN]; |
1332
|
673
|
|
|
|
|
3325
|
$virtual_token_to_add = [ |
1333
|
|
|
|
|
|
|
$virtual_terminal, [ [ 'POINT', $first_tdesc_start_token ] ] |
1334
|
|
|
|
|
|
|
]; |
1335
|
|
|
|
|
|
|
|
1336
|
|
|
|
|
|
|
} ## end FIND_VIRTUAL_TOKEN: |
1337
|
|
|
|
|
|
|
|
1338
|
675
|
100
|
|
|
|
1830
|
if ( defined $virtual_token_to_add ) { |
1339
|
673
|
|
|
|
|
813
|
$recce->read( @{$virtual_token_to_add} ); |
|
673
|
|
|
|
|
2474
|
|
1340
|
673
|
|
|
|
|
470858
|
next RECCE_RESPONSE; |
1341
|
|
|
|
|
|
|
} |
1342
|
|
|
|
|
|
|
|
1343
|
|
|
|
|
|
|
# If we didn't find a token to add, add the |
1344
|
|
|
|
|
|
|
# current physical token as CRUFT. |
1345
|
|
|
|
|
|
|
|
1346
|
2
|
50
|
|
|
|
6
|
if ($trace_terminals) { |
1347
|
0
|
0
|
|
|
|
0
|
say {$trace_fh} 'Adding actual token as cruft: ', $actual_terminal |
|
0
|
|
|
|
|
0
|
|
1348
|
|
|
|
|
|
|
or Carp::croak("Cannot print: $ERRNO"); |
1349
|
|
|
|
|
|
|
} |
1350
|
|
|
|
|
|
|
|
1351
|
|
|
|
|
|
|
# Cruft tokens are not virtual. |
1352
|
|
|
|
|
|
|
# They are the real things, hacked up. |
1353
|
2
|
|
|
|
|
4
|
$marpa_token->[0] = 'CRUFT'; |
1354
|
2
|
50
|
|
|
|
8
|
if ($trace_cruft) { |
1355
|
0
|
|
|
|
|
0
|
my ( $line, $col ) = |
1356
|
|
|
|
|
|
|
earleme_to_linecol( $self, $recce->current_earleme() ); |
1357
|
|
|
|
|
|
|
|
1358
|
|
|
|
|
|
|
# HTML::Parser uses one-based line numbers, |
1359
|
|
|
|
|
|
|
# but zero-based column numbers |
1360
|
|
|
|
|
|
|
# The convention (in vi and cut) is that |
1361
|
|
|
|
|
|
|
# columns are also one-based. |
1362
|
0
|
|
|
|
|
0
|
$col++; |
1363
|
|
|
|
|
|
|
|
1364
|
0
|
|
|
|
|
0
|
say {$trace_fh} qq{Cruft at line $line, column $col: "}, |
|
0
|
|
|
|
|
0
|
|
1365
|
0
|
0
|
|
|
|
0
|
${ tdesc_list_to_literal( $self, $marpa_token->[1] ) }, q{"} |
1366
|
|
|
|
|
|
|
or Carp::croak("Cannot print: $ERRNO"); |
1367
|
|
|
|
|
|
|
} ## end if ($trace_cruft) |
1368
|
|
|
|
|
|
|
|
1369
|
|
|
|
|
|
|
} ## end while ( defined $marpa_token ) |
1370
|
|
|
|
|
|
|
|
1371
|
91
|
50
|
|
|
|
448
|
if ($trace_terminals) { |
1372
|
0
|
0
|
|
|
|
0
|
say {$trace_fh} 'at end of tokens' |
|
0
|
|
|
|
|
0
|
|
1373
|
|
|
|
|
|
|
or Carp::croak("Cannot print: $ERRNO"); |
1374
|
|
|
|
|
|
|
} |
1375
|
|
|
|
|
|
|
|
1376
|
91
|
|
|
|
|
465
|
$recce->end_input(); |
1377
|
|
|
|
|
|
|
|
1378
|
91
|
|
|
|
|
930
|
my %closure = (); |
1379
|
|
|
|
|
|
|
{ |
1380
|
91
|
|
|
|
|
179
|
my $user_top_handler = |
|
91
|
|
|
|
|
406
|
|
1381
|
|
|
|
|
|
|
$self->{user_handlers_by_pseudoclass}->{ANY}->{TOP}; |
1382
|
91
|
100
|
|
|
|
524
|
$closure{'!TOP_handler'} = |
1383
|
|
|
|
|
|
|
defined $user_top_handler |
1384
|
|
|
|
|
|
|
? wrap_user_top_handler($user_top_handler) |
1385
|
|
|
|
|
|
|
: \&Marpa::HTML::Internal::default_top_handler; |
1386
|
|
|
|
|
|
|
} ## end if ( defined( my $user_top_handler = $self->{...})) |
1387
|
|
|
|
|
|
|
|
1388
|
91
|
100
|
|
|
|
554
|
if ( defined $self->{user_handlers_by_class}->{ANY}->{ANY} ) { |
1389
|
83
|
|
|
|
|
287
|
$closure{'!DEFAULT_ELE_handler'} = |
1390
|
|
|
|
|
|
|
$self->{user_handlers_by_class}->{ANY}->{ANY}; |
1391
|
|
|
|
|
|
|
} |
1392
|
|
|
|
|
|
|
|
1393
|
|
|
|
|
|
|
PSEUDO_CLASS: |
1394
|
91
|
|
|
|
|
233
|
for my $pseudoclass ( |
1395
|
|
|
|
|
|
|
qw(PI DECL COMMENT PROLOG TRAILER WHITESPACE CDATA PCDATA CRUFT)) |
1396
|
|
|
|
|
|
|
{ |
1397
|
819
|
|
|
|
|
1978
|
my $pseudoclass_action = |
1398
|
|
|
|
|
|
|
$self->{user_handlers_by_pseudoclass}->{ANY}->{$pseudoclass}; |
1399
|
819
|
|
|
|
|
1503
|
my $pseudoclass_action_name = "!$pseudoclass" . '_handler'; |
1400
|
819
|
100
|
|
|
|
1562
|
if ($pseudoclass_action) { |
1401
|
313
|
|
|
|
|
1255
|
$closure{$pseudoclass_action_name} = |
1402
|
|
|
|
|
|
|
wrap_user_tdesc_handler( $pseudoclass_action, |
1403
|
|
|
|
|
|
|
{ pseudoclass => $pseudoclass } ); |
1404
|
313
|
|
|
|
|
775
|
next PSEUDO_CLASS; |
1405
|
|
|
|
|
|
|
} ## end if ($pseudoclass_action) |
1406
|
506
|
|
|
|
|
1632
|
$closure{$pseudoclass_action_name} = |
1407
|
|
|
|
|
|
|
\&Marpa::HTML::Internal::default_action; |
1408
|
|
|
|
|
|
|
} ## end for my $pseudoclass (...) |
1409
|
|
|
|
|
|
|
|
1410
|
91
|
|
|
|
|
628
|
while ( my ( $element_action, $element ) = each %element_actions ) { |
1411
|
687
|
|
|
|
|
1273
|
$closure{$element_action} = create_tdesc_handler( $self, $element ); |
1412
|
|
|
|
|
|
|
} |
1413
|
|
|
|
|
|
|
|
1414
|
|
|
|
|
|
|
ELEMENT_ACTION: |
1415
|
91
|
|
|
|
|
568
|
while ( my ( $element_action, $data ) = |
1416
|
|
|
|
|
|
|
each %pseudoclass_element_actions ) |
1417
|
|
|
|
|
|
|
{ |
1418
|
|
|
|
|
|
|
|
1419
|
|
|
|
|
|
|
# As of now, there are |
1420
|
|
|
|
|
|
|
# no per-element pseudo-classes, and since I can't regression test |
1421
|
|
|
|
|
|
|
# this logic any more, I'm commenting it out. |
1422
|
0
|
|
|
|
|
0
|
Carp::croak('per-element pseudo-classes not implemented'); |
1423
|
|
|
|
|
|
|
|
1424
|
|
|
|
|
|
|
# my ( $pseudoclass, $element ) = @{$data}; |
1425
|
|
|
|
|
|
|
# my $pseudoclass_action = |
1426
|
|
|
|
|
|
|
# $self->{user_handlers_by_pseudoclass}->{$element} |
1427
|
|
|
|
|
|
|
# ->{$pseudoclass} |
1428
|
|
|
|
|
|
|
# // $self->{user_handlers_by_pseudoclass}->{ANY}->{$pseudoclass}; |
1429
|
|
|
|
|
|
|
# if ( defined $pseudoclass_action ) { |
1430
|
|
|
|
|
|
|
# $pseudoclass_action = |
1431
|
|
|
|
|
|
|
# wrap_user_tdesc_handler($pseudoclass_action); |
1432
|
|
|
|
|
|
|
# } |
1433
|
|
|
|
|
|
|
# $pseudoclass_action //= \&Marpa::HTML::Internal::default_action; |
1434
|
|
|
|
|
|
|
# $closure{$element_action} = $pseudoclass_action; |
1435
|
|
|
|
|
|
|
} ## end while ( my ( $element_action, $data ) = each ...) |
1436
|
|
|
|
|
|
|
|
1437
|
91
|
|
|
|
|
248
|
my $value = do { |
1438
|
91
|
|
|
|
|
192
|
local $Marpa::HTML::Internal::PARSE_INSTANCE = $self; |
1439
|
91
|
|
|
|
|
176
|
local $Marpa::HTML::INSTANCE = {}; |
1440
|
91
|
|
|
|
|
1113
|
$recce->value( |
1441
|
|
|
|
|
|
|
{ trace_values => $self->{trace_values}, |
1442
|
|
|
|
|
|
|
trace_actions => $self->{trace_actions}, |
1443
|
|
|
|
|
|
|
closures => \%closure, |
1444
|
|
|
|
|
|
|
} |
1445
|
|
|
|
|
|
|
); |
1446
|
|
|
|
|
|
|
}; |
1447
|
91
|
50
|
|
|
|
7368
|
Carp::croak('No parse: evaler returned undef') if not defined $value; |
1448
|
91
|
|
|
|
|
166
|
return ${$value}; |
|
91
|
|
|
|
|
211167
|
|
1449
|
|
|
|
|
|
|
|
1450
|
|
|
|
|
|
|
} ## end sub parse |
1451
|
|
|
|
|
|
|
|
1452
|
|
|
|
|
|
|
sub Marpa::HTML::html { |
1453
|
91
|
|
|
91
|
0
|
36804
|
my ( $document_ref, @args ) = @_; |
1454
|
91
|
|
|
|
|
380
|
my $html = Marpa::HTML::Internal::create(@args); |
1455
|
91
|
|
|
|
|
316
|
return Marpa::HTML::Internal::parse( $html, $document_ref ); |
1456
|
|
|
|
|
|
|
} |
1457
|
|
|
|
|
|
|
|
1458
|
|
|
|
|
|
|
1; |
|