line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# This code can be redistributed and modified under the terms of the GNU Affero |
2
|
|
|
|
|
|
|
# General Public License as published by the Free Software Foundation, either |
3
|
|
|
|
|
|
|
# version 3 of the License, or (at your option) any later version. |
4
|
|
|
|
|
|
|
# See the "COPYING" file for details. |
5
|
|
|
|
|
|
|
package HTML::Blitz::Parser; |
6
|
11
|
|
|
11
|
|
85
|
use HTML::Blitz::pragma; |
|
11
|
|
|
|
|
29
|
|
|
11
|
|
|
|
|
77
|
|
7
|
11
|
|
|
11
|
|
9100
|
use HTML::Blitz::ParseError (); |
|
11
|
|
|
|
|
28
|
|
|
11
|
|
|
|
|
458
|
|
8
|
11
|
|
|
|
|
2584
|
use HTML::Blitz::TokenType qw( |
9
|
|
|
|
|
|
|
TT_TAG_OPEN |
10
|
|
|
|
|
|
|
TT_TAG_CLOSE |
11
|
|
|
|
|
|
|
TT_TEXT |
12
|
|
|
|
|
|
|
TT_COMMENT |
13
|
|
|
|
|
|
|
TT_DOCTYPE |
14
|
11
|
|
|
11
|
|
4548
|
); |
|
11
|
|
|
|
|
29
|
|
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
our $VERSION = '0.07'; |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
method _fail( |
19
|
|
|
|
|
|
|
$msg, |
20
|
8
|
|
|
|
|
25
|
:$pos = pos(${$self->{src_ref}}), |
21
|
|
|
|
|
|
|
:$width = 1, |
22
|
|
|
|
|
|
|
:$alt_msg = undef, |
23
|
|
|
|
|
|
|
:$alt_pos = undef, |
24
|
|
|
|
|
|
|
:$alt_width = 1, |
25
|
27
|
50
|
66
|
27
|
|
73
|
) { |
|
27
|
50
|
|
|
|
149
|
|
|
27
|
100
|
|
|
|
49
|
|
|
27
|
100
|
|
|
|
101
|
|
|
27
|
100
|
|
|
|
83
|
|
|
27
|
50
|
|
|
|
88
|
|
|
27
|
|
|
|
|
49
|
|
|
27
|
|
|
|
|
41
|
|
|
27
|
|
|
|
|
60
|
|
|
27
|
|
|
|
|
66
|
|
|
27
|
|
|
|
|
37
|
|
26
|
|
|
|
|
|
|
die HTML::Blitz::ParseError->new( |
27
|
|
|
|
|
|
|
src_name => $self->{src_name}, |
28
|
|
|
|
|
|
|
src_ref => $self->{src_ref}, |
29
|
27
|
|
|
|
|
137
|
msg => $msg, |
30
|
|
|
|
|
|
|
pos => $pos, |
31
|
|
|
|
|
|
|
width => $width, |
32
|
|
|
|
|
|
|
alt_msg => $alt_msg, |
33
|
|
|
|
|
|
|
alt_pos => $alt_pos, |
34
|
|
|
|
|
|
|
alt_width => $alt_width, |
35
|
|
|
|
|
|
|
) |
36
|
|
|
|
|
|
|
} |
37
|
|
|
|
|
|
|
|
38
|
10
|
50
|
|
10
|
0
|
26
|
method throw_for($token, $msg) { |
|
10
|
50
|
|
|
|
24
|
|
|
10
|
|
|
|
|
16
|
|
|
10
|
|
|
|
|
21
|
|
|
10
|
|
|
|
|
14
|
|
39
|
10
|
|
|
|
|
17
|
my $type = $token->{type}; |
40
|
|
|
|
|
|
|
$self->_fail( |
41
|
|
|
|
|
|
|
$msg, |
42
|
|
|
|
|
|
|
pos => $token->{pos}, |
43
|
|
|
|
|
|
|
$type eq TT_TAG_OPEN || $type eq TT_TAG_CLOSE |
44
|
|
|
|
|
|
|
? (width => 1 + ($type eq TT_TAG_CLOSE) + length $token->{name}) |
45
|
10
|
100
|
66
|
|
|
61
|
: (), |
46
|
|
|
|
|
|
|
); |
47
|
|
|
|
|
|
|
} |
48
|
|
|
|
|
|
|
|
49
|
273
|
50
|
|
273
|
0
|
624
|
method new($class: $src_name, $src) { |
|
273
|
50
|
|
|
|
558
|
|
|
273
|
|
|
|
|
438
|
|
|
273
|
|
|
|
|
557
|
|
|
273
|
|
|
|
|
357
|
|
50
|
273
|
|
|
|
|
1003
|
my $self = bless { |
51
|
|
|
|
|
|
|
src_name => $src_name, |
52
|
|
|
|
|
|
|
src_ref => \$src, |
53
|
|
|
|
|
|
|
tag_stack => [], |
54
|
|
|
|
|
|
|
in_foreign_elem => 0, |
55
|
|
|
|
|
|
|
}, $class; |
56
|
|
|
|
|
|
|
|
57
|
273
|
|
|
|
|
851
|
$src =~ s/\r\n?/\n/g; # normalize newlines |
58
|
273
|
50
|
|
|
|
1141
|
$src =~ /([\x{d800}-\x{dfff}])/ |
59
|
|
|
|
|
|
|
and $self->_fail(sprintf("surrogate codepoint U+%04X in input", ord $1), pos => $-[1]); |
60
|
273
|
50
|
|
|
|
21662
|
$src =~ /( |
61
|
|
|
|
|
|
|
[\x{fdd0}-\x{fdef}] |
62
|
|
|
|
|
|
|
| [\x{fffe}\x{ffff}] |
63
|
|
|
|
|
|
|
| [\x{1fffe}\x{1ffff}] |
64
|
|
|
|
|
|
|
| [\x{2fffe}\x{2ffff}] |
65
|
|
|
|
|
|
|
| [\x{3fffe}\x{3ffff}] |
66
|
|
|
|
|
|
|
| [\x{4fffe}\x{4ffff}] |
67
|
|
|
|
|
|
|
| [\x{5fffe}\x{5ffff}] |
68
|
|
|
|
|
|
|
| [\x{6fffe}\x{6ffff}] |
69
|
|
|
|
|
|
|
| [\x{7fffe}\x{7ffff}] |
70
|
|
|
|
|
|
|
| [\x{8fffe}\x{8ffff}] |
71
|
|
|
|
|
|
|
| [\x{9fffe}\x{9ffff}] |
72
|
|
|
|
|
|
|
| [\x{afffe}\x{affff}] |
73
|
|
|
|
|
|
|
| [\x{bfffe}\x{bffff}] |
74
|
|
|
|
|
|
|
| [\x{cfffe}\x{cffff}] |
75
|
|
|
|
|
|
|
| [\x{dfffe}\x{dffff}] |
76
|
|
|
|
|
|
|
| [\x{efffe}\x{effff}] |
77
|
|
|
|
|
|
|
| [\x{ffffe}\x{fffff}] |
78
|
|
|
|
|
|
|
| [\x{10fffe}\x{10ffff}] |
79
|
|
|
|
|
|
|
)/x and $self->_fail(sprintf("non-character codepoint U+%04X in input", ord $1), pos => $-[1]); |
80
|
273
|
50
|
|
|
|
1040
|
$src =~ /((?![ \t\n\f])[\x00-\x1f\x7f-\x9f])/ |
81
|
|
|
|
|
|
|
and $self->_fail(sprintf("control character U+%04X in input", ord $1), pos => $-[1]); |
82
|
|
|
|
|
|
|
|
83
|
273
|
|
|
|
|
896
|
pos($src) = 0; |
84
|
|
|
|
|
|
|
|
85
|
273
|
|
|
|
|
827
|
$self |
86
|
|
|
|
|
|
|
} |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
my %entities; |
89
|
|
|
|
|
|
|
{ |
90
|
|
|
|
|
|
|
while (my $line = readline DATA) { |
91
|
|
|
|
|
|
|
chomp $line; |
92
|
|
|
|
|
|
|
my ($name, $value) = $line =~ /^(\w+) (\d+(?:,\d+)*)\z/a |
93
|
|
|
|
|
|
|
or die "Internal error: malformed entitiy definition '$line'"; |
94
|
|
|
|
|
|
|
$value =~ s/(\d+),?/chr $1/aeg; |
95
|
|
|
|
|
|
|
$entities{$name} = $value; |
96
|
|
|
|
|
|
|
} |
97
|
|
|
|
|
|
|
close DATA; |
98
|
|
|
|
|
|
|
} |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
my %void_tags = map +($_ => 1), qw( |
101
|
|
|
|
|
|
|
area |
102
|
|
|
|
|
|
|
base br |
103
|
|
|
|
|
|
|
col |
104
|
|
|
|
|
|
|
embed |
105
|
|
|
|
|
|
|
hr |
106
|
|
|
|
|
|
|
img input |
107
|
|
|
|
|
|
|
link |
108
|
|
|
|
|
|
|
meta |
109
|
|
|
|
|
|
|
source |
110
|
|
|
|
|
|
|
track |
111
|
|
|
|
|
|
|
wbr |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
basefont bgsound |
114
|
|
|
|
|
|
|
frame |
115
|
|
|
|
|
|
|
keygen |
116
|
|
|
|
|
|
|
param |
117
|
|
|
|
|
|
|
); |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
my %foreign_tags = map +($_ => 1), qw( |
120
|
|
|
|
|
|
|
math |
121
|
|
|
|
|
|
|
svg |
122
|
|
|
|
|
|
|
); |
123
|
|
|
|
|
|
|
|
124
|
3599
|
50
|
|
3599
|
|
6714
|
method _consume_entity_maybe($chunk) { |
|
3599
|
50
|
|
|
|
6252
|
|
|
3599
|
|
|
|
|
4833
|
|
|
3599
|
|
|
|
|
7735
|
|
|
3599
|
|
|
|
|
4431
|
|
125
|
3599
|
100
|
|
|
|
17444
|
$chunk eq '&' or return $chunk; |
126
|
|
|
|
|
|
|
|
127
|
162
|
|
|
|
|
274
|
my $src_ref = $self->{src_ref}; |
128
|
162
|
|
|
|
|
210
|
my $char; |
129
|
|
|
|
|
|
|
|
130
|
162
|
100
|
|
|
|
403
|
if ($$src_ref =~ /\G#/gc) { |
131
|
30
|
100
|
|
|
|
71
|
if ($$src_ref =~ /\G[xX]/gc) { |
132
|
18
|
50
|
|
|
|
48
|
$$src_ref =~ /\G([[:xdigit:]]+)/gc |
133
|
|
|
|
|
|
|
or $self->_fail("missing hex digits after ''"); |
134
|
18
|
|
|
|
|
50
|
$char = chr hex $1; |
135
|
|
|
|
|
|
|
} else { |
136
|
12
|
50
|
|
|
|
43
|
$$src_ref =~ /\G(\d+)/agc |
137
|
|
|
|
|
|
|
or $self->_fail("missing digits after ''"); |
138
|
12
|
|
|
|
|
53
|
$char = chr $1; |
139
|
|
|
|
|
|
|
} |
140
|
|
|
|
|
|
|
} else { |
141
|
132
|
50
|
|
|
|
421
|
$$src_ref =~ /\G(\w+)/agc |
142
|
|
|
|
|
|
|
or $self->_fail("missing character name after '&'"); |
143
|
132
|
|
33
|
|
|
411
|
$char = $entities{$1} |
144
|
|
|
|
|
|
|
// $self->_fail("invalid character reference '$1' after '&'", pos => $-[1], width => length $1 ); |
145
|
|
|
|
|
|
|
} |
146
|
162
|
50
|
|
|
|
450
|
$$src_ref =~ /\G;/gc |
147
|
|
|
|
|
|
|
or $self->_fail("missing ';' after character reference"); |
148
|
|
|
|
|
|
|
|
149
|
162
|
|
|
|
|
602
|
$char |
150
|
|
|
|
|
|
|
} |
151
|
|
|
|
|
|
|
|
152
|
2148
|
50
|
|
2148
|
0
|
4143
|
method current_tag() { |
|
2148
|
50
|
|
|
|
3708
|
|
|
2148
|
|
|
|
|
2959
|
|
|
2148
|
|
|
|
|
3054
|
|
153
|
2148
|
|
|
|
|
3176
|
my $tag_stack = $self->{tag_stack}; |
154
|
2148
|
100
|
|
|
|
5921
|
@$tag_stack ? $tag_stack->[-1][0] : '' |
155
|
|
|
|
|
|
|
} |
156
|
|
|
|
|
|
|
|
157
|
5584
|
50
|
|
5584
|
0
|
11404
|
method parse() { |
|
5584
|
50
|
|
|
|
9836
|
|
|
5584
|
|
|
|
|
7832
|
|
|
5584
|
|
|
|
|
6601
|
|
158
|
5584
|
|
|
|
|
8304
|
my $src_ref = $self->{src_ref}; |
159
|
5584
|
|
|
|
|
7482
|
my $tag_stack = $self->{tag_stack}; |
160
|
|
|
|
|
|
|
|
161
|
5584
|
100
|
|
|
|
11069
|
my $cur_tag = @$tag_stack ? $tag_stack->[-1][0] : ''; |
162
|
|
|
|
|
|
|
|
163
|
5584
|
100
|
|
|
|
14481
|
if ($$src_ref =~ /\G\z/) { |
164
|
236
|
50
|
|
|
|
601
|
length $cur_tag |
165
|
|
|
|
|
|
|
and $self->_fail("unclosed '<$cur_tag>' tag", pos => $tag_stack->[-1][1], width => 1 + length($cur_tag)); |
166
|
236
|
|
|
|
|
797
|
return undef; |
167
|
|
|
|
|
|
|
} |
168
|
|
|
|
|
|
|
|
169
|
5348
|
|
|
|
|
8581
|
my $pos = pos $$src_ref; |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
{ |
172
|
5348
|
|
|
|
|
6938
|
my $text = ''; |
|
5348
|
|
|
|
|
7164
|
|
173
|
|
|
|
|
|
|
|
174
|
5348
|
100
|
|
|
|
13343
|
if ($cur_tag eq 'script') { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
175
|
200
|
50
|
|
1
|
|
757
|
my $err = fun () { $self->_fail("unclosed '<$cur_tag>' tag", pos => $tag_stack->[-1][1], width => 1 + length($cur_tag)) }; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
6
|
|
176
|
|
|
|
|
|
|
SCRIPT_DATA: { |
177
|
200
|
50
|
|
|
|
292
|
$$src_ref =~ m{ ( ) | < (/?) script [ \t\n\f/>] }xaaigc or $err->(); |
|
12
|
|
|
|
|
47
|
|
182
|
11
|
|
|
|
|
24
|
$match_start = $-[0]; |
183
|
11
|
100
|
|
|
|
29
|
if ($1) { |
184
|
2
|
|
|
|
|
6
|
redo SCRIPT_DATA; |
185
|
|
|
|
|
|
|
} |
186
|
9
|
100
|
|
|
|
24
|
if (!$2) { |
187
|
6
|
50
|
|
|
|
20
|
$$src_ref =~ m{ (-->) | ] }xaaigc or $err->(); |
188
|
6
|
100
|
|
|
|
17
|
if ($1) { |
189
|
3
|
|
|
|
|
7
|
redo SCRIPT_DATA; |
190
|
|
|
|
|
|
|
} |
191
|
3
|
|
|
|
|
7
|
redo SCRIPT_DATA_ESCAPED; |
192
|
|
|
|
|
|
|
} |
193
|
|
|
|
|
|
|
} |
194
|
|
|
|
|
|
|
} |
195
|
199
|
|
|
|
|
422
|
pos($$src_ref) = $match_start; |
196
|
|
|
|
|
|
|
} |
197
|
199
|
|
|
|
|
959
|
$text = substr $$src_ref, $pos, pos($$src_ref) - $pos; |
198
|
|
|
|
|
|
|
} elsif ($cur_tag eq 'style') { |
199
|
140
|
100
|
|
|
|
567
|
if ($$src_ref =~ m{\G ( (?: (?! (?aai: style ) [ \t\n\f/>] ) . )+ ) }xsgc) { |
200
|
70
|
|
|
|
|
176
|
$text = $1; |
201
|
|
|
|
|
|
|
} |
202
|
|
|
|
|
|
|
} elsif ($cur_tag eq 'title') { |
203
|
186
|
|
|
|
|
814
|
while ($$src_ref =~ m{\G ( (?: (?! (?aai: title ) [ \t\n\f/>] ) [^&] )+ | & ) }xgc) { |
204
|
93
|
|
|
|
|
248
|
$text .= $self->_consume_entity_maybe($1); |
205
|
|
|
|
|
|
|
} |
206
|
|
|
|
|
|
|
} elsif ($cur_tag eq 'textarea') { |
207
|
0
|
|
|
|
|
0
|
while ($$src_ref =~ m{\G ( (?: (?! (?aai: textarea ) [ \t\n\f/>] ) [^&] )+ | & ) }xgc) { |
208
|
0
|
|
|
|
|
0
|
$text .= $self->_consume_entity_maybe($1); |
209
|
|
|
|
|
|
|
} |
210
|
|
|
|
|
|
|
} else { |
211
|
4822
|
|
|
|
|
14232
|
while ($$src_ref =~ /\G ( [^<&]+ | & )/xgc) { |
212
|
2414
|
|
|
|
|
4956
|
$text .= $self->_consume_entity_maybe($1); |
213
|
|
|
|
|
|
|
} |
214
|
|
|
|
|
|
|
} |
215
|
|
|
|
|
|
|
|
216
|
5347
|
100
|
|
|
|
11392
|
if (length $text) { |
217
|
|
|
|
|
|
|
return { |
218
|
2388
|
|
|
|
|
11077
|
type => TT_TEXT, |
219
|
|
|
|
|
|
|
pos => $pos, |
220
|
|
|
|
|
|
|
content => $text, |
221
|
|
|
|
|
|
|
}; |
222
|
|
|
|
|
|
|
} |
223
|
|
|
|
|
|
|
} |
224
|
|
|
|
|
|
|
|
225
|
2959
|
50
|
|
|
|
8272
|
if ($$src_ref =~ /\G
|
226
|
2959
|
100
|
|
|
|
6304
|
if ($$src_ref =~ /\G!/gc) { |
227
|
54
|
100
|
|
|
|
147
|
if ($$src_ref =~ /\G--/gc) { |
228
|
43
|
100
|
|
|
|
113
|
if ($$src_ref =~ /\G(-?>)/) { |
229
|
2
|
|
|
|
|
9
|
$self->_fail("improperly closed comment", width => length($1)); |
230
|
|
|
|
|
|
|
} |
231
|
41
|
100
|
|
|
|
193
|
$$src_ref =~ /\G(.*?)(?|--!?>)/sgc |
232
|
|
|
|
|
|
|
or $self->_fail("unterminated comment", pos => $pos, width => 4); |
233
|
40
|
|
|
|
|
131
|
my ($text, $closer) = ($1, $2); |
234
|
40
|
100
|
|
|
|
88
|
if ($closer eq '') { |
245
|
1
|
|
|
|
|
21
|
$self->_fail( |
246
|
|
|
|
|
|
|
"improperly closed comment (should be '-->')", |
247
|
|
|
|
|
|
|
pos => $-[2], |
248
|
|
|
|
|
|
|
width => length($closer), |
249
|
|
|
|
|
|
|
alt_msg => "comment starting here", |
250
|
|
|
|
|
|
|
alt_pos => $pos, |
251
|
|
|
|
|
|
|
alt_width => 4, |
252
|
|
|
|
|
|
|
); |
253
|
|
|
|
|
|
|
} |
254
|
38
|
100
|
|
|
|
121
|
if ($closer eq '') { |
255
|
2
|
|
|
|
|
5
|
$text .= '
|
256
|
2
|
|
|
|
|
5
|
$closer = '-->'; |
257
|
|
|
|
|
|
|
} |
258
|
|
|
|
|
|
|
return { |
259
|
38
|
|
|
|
|
197
|
type => TT_COMMENT, |
260
|
|
|
|
|
|
|
pos => $pos, |
261
|
|
|
|
|
|
|
content => $text, |
262
|
|
|
|
|
|
|
}; |
263
|
|
|
|
|
|
|
} |
264
|
|
|
|
|
|
|
|
265
|
11
|
100
|
|
|
|
41
|
if ($$src_ref =~ /\Gdoctype/aaigc) { |
266
|
9
|
50
|
|
|
|
39
|
$$src_ref =~ /\G[ \t\n\f]+/gc |
267
|
|
|
|
|
|
|
or $self->_fail("missing whitespace after '
|
268
|
9
|
50
|
|
|
|
28
|
$$src_ref =~ /\Ghtml/aaigc |
269
|
|
|
|
|
|
|
or $self->_fail("invalid non-html doctype"); |
270
|
9
|
50
|
|
|
|
38
|
$$src_ref =~ /\G[ \t\n\f]*>/gc |
271
|
|
|
|
|
|
|
or $self->_fail("missing '>' after '
|
272
|
|
|
|
|
|
|
return { |
273
|
9
|
|
|
|
|
52
|
type => TT_DOCTYPE, |
274
|
|
|
|
|
|
|
pos => $pos, |
275
|
|
|
|
|
|
|
}; |
276
|
|
|
|
|
|
|
} |
277
|
|
|
|
|
|
|
|
278
|
2
|
100
|
66
|
|
|
39
|
if ($self->{in_foreign_elem} && $$src_ref =~ /\G\[CDATA\[/gc) { |
279
|
1
|
|
|
|
|
5
|
my $text_start = $+[0]; |
280
|
1
|
50
|
|
|
|
8
|
$$src_ref =~ /\]\]>/gc or $self->_fail("missing ']]>' after '
|
281
|
1
|
|
|
|
|
5
|
my $text_end = $-[0]; |
282
|
|
|
|
|
|
|
return { |
283
|
1
|
|
|
|
|
10
|
type => TT_TEXT, |
284
|
|
|
|
|
|
|
pos => $text_start, |
285
|
|
|
|
|
|
|
content => substr($$src_ref, $text_start, $text_end - $text_start), |
286
|
|
|
|
|
|
|
}; |
287
|
|
|
|
|
|
|
} |
288
|
|
|
|
|
|
|
|
289
|
1
|
|
|
|
|
7
|
$self->_fail("invalid declaration (should be '--' or 'DOCTYPE')"); |
290
|
|
|
|
|
|
|
} |
291
|
|
|
|
|
|
|
|
292
|
2905
|
|
|
|
|
5307
|
my $closing = $$src_ref =~ m{\G/}gc; |
293
|
|
|
|
|
|
|
|
294
|
2905
|
50
|
|
|
|
7566
|
$$src_ref =~ m{\G([a-zA-Z][^\s/>[:cntrl:]]*)}gc |
295
|
|
|
|
|
|
|
or $self->_fail("invalid tag name"); |
296
|
2905
|
|
|
|
|
7465
|
(my $name = $1) =~ tr/A-Z/a-z/; |
297
|
|
|
|
|
|
|
|
298
|
2905
|
|
|
|
|
5707
|
$$src_ref =~ /\G[ \t\n\f]+/gc; |
299
|
|
|
|
|
|
|
|
300
|
2905
|
|
|
|
|
4266
|
my (%attrs, %attr_pos); |
301
|
2905
|
|
|
|
|
6736
|
while ($$src_ref =~ m{\G([^\s/>="'<[:cntrl:]]+)}gc) { |
302
|
1097
|
|
|
|
|
3290
|
my $apos = $-[1]; |
303
|
1097
|
100
|
|
|
|
2385
|
if ($closing) { |
304
|
1
|
|
|
|
|
8
|
$self->_fail("invalid attribute in end tag '$name>'", pos => $apos, width => length $1); |
305
|
|
|
|
|
|
|
} |
306
|
1096
|
|
|
|
|
2144
|
(my $attr_name = $1) =~ tr/A-Z/a-z/; |
307
|
|
|
|
|
|
|
|
308
|
1096
|
100
|
|
|
|
2303
|
if (exists $attrs{$attr_name}) { |
309
|
1
|
|
|
|
|
13
|
$self->_fail("duplicate attribute '$attr_name' in '<$name>' tag", pos => $apos, width => length($attr_name), alt_msg => "first defined here", alt_pos => $attr_pos{$attr_name}, alt_width => length($attr_name)); |
310
|
|
|
|
|
|
|
} |
311
|
|
|
|
|
|
|
|
312
|
1095
|
|
|
|
|
2149
|
$$src_ref =~ /\G[ \t\n\f]+/gc; |
313
|
|
|
|
|
|
|
|
314
|
1095
|
|
|
|
|
1612
|
my $attr_value = ''; |
315
|
1095
|
100
|
|
|
|
2867
|
if ($$src_ref =~ /\G=[ \t\n\f]*/gc) { |
316
|
1088
|
100
|
|
|
|
3474
|
if ($$src_ref =~ /\G"/gc) { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
317
|
332
|
|
|
|
|
833
|
my $qpos = $-[0]; |
318
|
332
|
|
|
|
|
575
|
my $text = ''; |
319
|
332
|
|
|
|
|
933
|
while ($$src_ref =~ /\G ( [^"&]+ | & ) /xgc) { |
320
|
331
|
|
|
|
|
665
|
$text .= $self->_consume_entity_maybe($1); |
321
|
|
|
|
|
|
|
} |
322
|
332
|
50
|
|
|
|
928
|
$$src_ref =~ /\G"/gc |
323
|
|
|
|
|
|
|
or $self->_fail(q{missing '"' after attribute value}, alt_msg => 'starting here', alt_pos => $qpos); |
324
|
332
|
50
|
|
|
|
736
|
$$src_ref =~ m{\G[^ \t\n\f/>]} |
325
|
|
|
|
|
|
|
and $self->_fail('missing whitespace after attribute value'); |
326
|
332
|
|
|
|
|
575
|
$attr_value = $text; |
327
|
|
|
|
|
|
|
} elsif ($$src_ref =~ /\G'/gc) { |
328
|
8
|
|
|
|
|
47
|
my $qpos = $-[0]; |
329
|
8
|
|
|
|
|
19
|
my $text = ''; |
330
|
8
|
|
|
|
|
31
|
while ($$src_ref =~ /\G ( [^'&]+ | & ) /xgc) { |
331
|
14
|
|
|
|
|
56
|
$text .= $self->_consume_entity_maybe($1); |
332
|
|
|
|
|
|
|
} |
333
|
8
|
100
|
|
|
|
30
|
$$src_ref =~ /\G'/gc |
334
|
|
|
|
|
|
|
or $self->_fail(q{missing "'" after attribute value}, alt_msg => 'starting here', alt_pos => $qpos); |
335
|
7
|
100
|
|
|
|
25
|
$$src_ref =~ m{\G[^ \t\n\f/>]} |
336
|
|
|
|
|
|
|
and $self->_fail('missing whitespace after attribute value'); |
337
|
6
|
|
|
|
|
13
|
$attr_value = $text; |
338
|
|
|
|
|
|
|
} elsif ($$src_ref =~ /\G ( [^ \t\n\f&>"'<=`]+ | & )/xgc) { |
339
|
747
|
|
|
|
|
1135
|
my $text = ''; |
340
|
747
|
|
|
|
|
975
|
do { |
341
|
747
|
|
|
|
|
1915
|
$text .= $self->_consume_entity_maybe($1); |
342
|
|
|
|
|
|
|
} while $$src_ref =~ /\G ( [^ \t\n\f&>"'<=`]+ | & )/xgc; |
343
|
747
|
|
|
|
|
1443
|
$attr_value = $text; |
344
|
|
|
|
|
|
|
} else { |
345
|
1
|
|
|
|
|
3
|
$self->_fail("missing attribute value after '='"); |
346
|
|
|
|
|
|
|
} |
347
|
|
|
|
|
|
|
} |
348
|
|
|
|
|
|
|
|
349
|
1092
|
|
|
|
|
2587
|
$attrs{$attr_name} = $attr_value; |
350
|
1092
|
|
|
|
|
1866
|
$attr_pos{$attr_name} = $apos; |
351
|
|
|
|
|
|
|
|
352
|
1092
|
|
|
|
|
3348
|
$$src_ref =~ /\G[ \t\n\f]+/gc; |
353
|
|
|
|
|
|
|
} |
354
|
|
|
|
|
|
|
|
355
|
2900
|
100
|
|
|
|
8267
|
$$src_ref =~ m{\G(/?)>}gc |
356
|
|
|
|
|
|
|
or $self->_fail("missing '>' at end of tag", alt_msg => 'starting here', alt_pos => $pos, alt_width => 1 + $closing + length($name)); |
357
|
2898
|
|
|
|
|
5665
|
my $is_self_closing = length $1; |
358
|
|
|
|
|
|
|
|
359
|
2898
|
100
|
|
|
|
5337
|
if ($closing) { |
360
|
1340
|
50
|
|
|
|
2353
|
$is_self_closing and $self->_fail("invalid '/' at end of closing tag '$name>'", pos => $-[1]); |
361
|
1340
|
100
|
|
|
|
2506
|
@$tag_stack |
362
|
|
|
|
|
|
|
or $self->_fail("closing tag '$name>' has no corresponding open tag", pos => $pos, width => 1 + 1 + length($name)); |
363
|
|
|
|
|
|
|
|
364
|
1338
|
50
|
|
|
|
2525
|
$cur_tag eq $name |
365
|
|
|
|
|
|
|
or $self->_fail("closing tag '$name>' does not match current open tag '<$cur_tag>'", pos => $pos, width => 1 + 1 + length($name), alt_msg => 'starting here', alt_pos => $tag_stack->[-1][1], alt_width => 1 + length($cur_tag)); |
366
|
|
|
|
|
|
|
|
367
|
1338
|
100
|
|
|
|
2918
|
if ($foreign_tags{$cur_tag}) { |
368
|
2
|
|
|
|
|
6
|
$self->{in_foreign_elem}--; |
369
|
|
|
|
|
|
|
} |
370
|
1338
|
|
|
|
|
2100
|
pop @$tag_stack; |
371
|
|
|
|
|
|
|
return { |
372
|
1338
|
|
|
|
|
7258
|
type => TT_TAG_CLOSE, |
373
|
|
|
|
|
|
|
pos => $pos, |
374
|
|
|
|
|
|
|
name => $name, |
375
|
|
|
|
|
|
|
}; |
376
|
|
|
|
|
|
|
} |
377
|
|
|
|
|
|
|
|
378
|
1558
|
|
|
|
|
2891
|
my $is_void = $void_tags{$name}; |
379
|
1558
|
100
|
100
|
|
|
3366
|
if ($is_self_closing && !$is_void && !$foreign_tags{$name} && !$self->{in_foreign_elem}) { |
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
380
|
1
|
|
|
|
|
9
|
$self->_fail("invalid '/' at end of non-void tag '<$name>'", pos => $-[1], alt_msg => 'starting here', alt_pos => $pos, alt_width => 1 + length($name)); |
381
|
|
|
|
|
|
|
} |
382
|
1557
|
|
100
|
|
|
6419
|
$is_self_closing ||= $is_void; |
383
|
|
|
|
|
|
|
|
384
|
1557
|
100
|
|
|
|
3116
|
if (!$is_self_closing) { |
385
|
1366
|
|
|
|
|
1752
|
push @{$self->{tag_stack}}, [$name, $pos]; |
|
1366
|
|
|
|
|
3751
|
|
386
|
1366
|
100
|
|
|
|
3268
|
if ($foreign_tags{$name}) { |
387
|
2
|
|
|
|
|
9
|
$self->{in_foreign_elem}++; |
388
|
|
|
|
|
|
|
} |
389
|
|
|
|
|
|
|
} |
390
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
return { |
392
|
1557
|
|
|
|
|
10110
|
type => TT_TAG_OPEN, |
393
|
|
|
|
|
|
|
pos => $pos, |
394
|
|
|
|
|
|
|
name => $name, |
395
|
|
|
|
|
|
|
attrs => \%attrs, |
396
|
|
|
|
|
|
|
is_void => $is_void, |
397
|
|
|
|
|
|
|
is_self_closing => $is_self_closing, |
398
|
|
|
|
|
|
|
}; |
399
|
|
|
|
|
|
|
} |
400
|
|
|
|
|
|
|
|
401
|
|
|
|
|
|
|
# uncoverable statement |
402
|
0
|
|
|
|
|
|
die "Internal error: unparsable input '${\substr $$src_ref, pos($$src_ref), 10}'..."; |
|
0
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
} |
404
|
|
|
|
|
|
|
|
405
|
|
|
|
|
|
|
1 |
406
|
|
|
|
|
|
|
__DATA__ |