line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package DOM::Tiny::_HTML; |
2
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
4
|
use strict; |
|
1
|
|
|
|
|
0
|
|
|
1
|
|
|
|
|
23
|
|
4
|
1
|
|
|
1
|
|
3
|
use warnings; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
21
|
|
5
|
1
|
|
|
1
|
|
393
|
use DOM::Tiny::Entities qw(html_escape html_unescape); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
132
|
|
6
|
1
|
|
|
1
|
|
9
|
use Scalar::Util 'weaken'; |
|
1
|
|
|
|
|
0
|
|
|
1
|
|
|
|
|
1087
|
|
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
our $VERSION = '0.003'; |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
my $ATTR_RE = qr/ |
11
|
|
|
|
|
|
|
([^<>=\s\/]+|\/) # Key |
12
|
|
|
|
|
|
|
(?: |
13
|
|
|
|
|
|
|
\s*=\s* |
14
|
|
|
|
|
|
|
(?s:(?:"(.*?)")|(?:'(.*?)')|([^>\s]*)) # Value |
15
|
|
|
|
|
|
|
)? |
16
|
|
|
|
|
|
|
\s* |
17
|
|
|
|
|
|
|
/x; |
18
|
|
|
|
|
|
|
my $TOKEN_RE = qr/ |
19
|
|
|
|
|
|
|
([^<]+)? # Text |
20
|
|
|
|
|
|
|
(?: |
21
|
|
|
|
|
|
|
<(?: |
22
|
|
|
|
|
|
|
!(?: |
23
|
|
|
|
|
|
|
DOCTYPE( |
24
|
|
|
|
|
|
|
\s+\w+ # Doctype |
25
|
|
|
|
|
|
|
(?:(?:\s+\w+)?(?:\s+(?:"[^"]*"|'[^']*'))+)? # External ID |
26
|
|
|
|
|
|
|
(?:\s+\[.+?\])? # Int Subset |
27
|
|
|
|
|
|
|
\s*) |
28
|
|
|
|
|
|
|
| |
29
|
|
|
|
|
|
|
--(.*?)--\s* # Comment |
30
|
|
|
|
|
|
|
| |
31
|
|
|
|
|
|
|
\[CDATA\[(.*?)\]\] # CDATA |
32
|
|
|
|
|
|
|
) |
33
|
|
|
|
|
|
|
| |
34
|
|
|
|
|
|
|
\?(.*?)\? # Processing Instruction |
35
|
|
|
|
|
|
|
| |
36
|
|
|
|
|
|
|
\s*([^<>\s]+\s*(?>(?:$ATTR_RE){0,32766})*) # Tag |
37
|
|
|
|
|
|
|
# Workaround for perl's limit of * to {0,32767} |
38
|
|
|
|
|
|
|
)> |
39
|
|
|
|
|
|
|
| |
40
|
|
|
|
|
|
|
(<) # Runaway "<" |
41
|
|
|
|
|
|
|
)?? |
42
|
|
|
|
|
|
|
/xis; |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
# HTML elements that only contain raw text |
45
|
|
|
|
|
|
|
my %RAW = map { $_ => 1 } qw(script style); |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
# HTML elements that only contain raw text and entities |
48
|
|
|
|
|
|
|
my %RCDATA = map { $_ => 1 } qw(title textarea); |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
# HTML elements with optional end tags |
51
|
|
|
|
|
|
|
my %END = (body => 'head', optgroup => 'optgroup', option => 'option'); |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
# HTML elements that break paragraphs |
54
|
|
|
|
|
|
|
$END{$_} = 'p' for |
55
|
|
|
|
|
|
|
qw(address article aside blockquote dir div dl fieldset footer form h1 h2), |
56
|
|
|
|
|
|
|
qw(h3 h4 h5 h6 header hr main menu nav ol p pre section table ul); |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
# HTML table elements with optional end tags |
59
|
|
|
|
|
|
|
my %TABLE = map { $_ => 1 } qw(colgroup tbody td tfoot th thead tr); |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
# HTML elements with optional end tags and scoping rules |
62
|
|
|
|
|
|
|
my %CLOSE |
63
|
|
|
|
|
|
|
= (li => [{li => 1}, {ul => 1, ol => 1}], tr => [{tr => 1}, {table => 1}]); |
64
|
|
|
|
|
|
|
$CLOSE{$_} = [\%TABLE, {table => 1}] for qw(colgroup tbody tfoot thead); |
65
|
|
|
|
|
|
|
$CLOSE{$_} = [{dd => 1, dt => 1}, {dl => 1}] for qw(dd dt); |
66
|
|
|
|
|
|
|
$CLOSE{$_} = [{rp => 1, rt => 1}, {ruby => 1}] for qw(rp rt); |
67
|
|
|
|
|
|
|
$CLOSE{$_} = [{th => 1, td => 1}, {table => 1}] for qw(td th); |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
# HTML elements without end tags |
70
|
|
|
|
|
|
|
my %EMPTY = map { $_ => 1 } ( |
71
|
|
|
|
|
|
|
qw(area base br col embed hr img input keygen link menuitem meta param), |
72
|
|
|
|
|
|
|
qw(source track wbr) |
73
|
|
|
|
|
|
|
); |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
# HTML elements categorized as phrasing content (and obsolete inline elements) |
76
|
|
|
|
|
|
|
my @PHRASING = ( |
77
|
|
|
|
|
|
|
qw(a abbr area audio b bdi bdo br button canvas cite code data datalist), |
78
|
|
|
|
|
|
|
qw(del dfn em embed i iframe img input ins kbd keygen label link map mark), |
79
|
|
|
|
|
|
|
qw(math meta meter noscript object output picture progress q ruby s samp), |
80
|
|
|
|
|
|
|
qw(script select small span strong sub sup svg template textarea time u), |
81
|
|
|
|
|
|
|
qw(var video wbr) |
82
|
|
|
|
|
|
|
); |
83
|
|
|
|
|
|
|
my @OBSOLETE = qw(acronym applet basefont big font strike tt); |
84
|
|
|
|
|
|
|
my %PHRASING = map { $_ => 1 } @OBSOLETE, @PHRASING; |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
# HTML elements that don't get their self-closing flag acknowledged |
87
|
|
|
|
|
|
|
my %BLOCK = map { $_ => 1 } ( |
88
|
|
|
|
|
|
|
qw(a address applet article aside b big blockquote body button caption), |
89
|
|
|
|
|
|
|
qw(center code col colgroup dd details dialog dir div dl dt em fieldset), |
90
|
|
|
|
|
|
|
qw(figcaption figure font footer form frameset h1 h2 h3 h4 h5 h6 head), |
91
|
|
|
|
|
|
|
qw(header hgroup html i iframe li listing main marquee menu nav nobr), |
92
|
|
|
|
|
|
|
qw(noembed noframes noscript object ol optgroup option p plaintext pre rp), |
93
|
|
|
|
|
|
|
qw(rt s script section select small strike strong style summary table), |
94
|
|
|
|
|
|
|
qw(tbody td template textarea tfoot th thead title tr tt u ul xmp) |
95
|
|
|
|
|
|
|
); |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
sub new { |
98
|
1952
|
|
|
1952
|
0
|
1795
|
my $class = shift; |
99
|
1952
|
50
|
33
|
|
|
13153
|
bless @_ ? @_ > 1 ? {@_} : {%{$_[0]}} : {}, ref $class || $class; |
|
0
|
100
|
|
|
|
0
|
|
100
|
|
|
|
|
|
|
} |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
sub tree { |
103
|
6079
|
|
|
6079
|
0
|
5626
|
my $self = shift; |
104
|
6079
|
100
|
|
|
|
22667
|
return exists $self->{tree} ? $self->{tree} : ($self->{tree} = ['root']) unless @_; |
|
|
100
|
|
|
|
|
|
105
|
1965
|
|
|
|
|
3132
|
$self->{tree} = shift; |
106
|
1965
|
|
|
|
|
3167
|
return $self; |
107
|
|
|
|
|
|
|
} |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
sub xml { |
110
|
3236
|
|
|
3236
|
0
|
2638
|
my $self = shift; |
111
|
3236
|
100
|
|
|
|
7452
|
return $self->{xml} unless @_; |
112
|
1775
|
|
|
|
|
1969
|
$self->{xml} = shift; |
113
|
1775
|
|
|
|
|
2078
|
return $self; |
114
|
|
|
|
|
|
|
} |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
sub parse { |
117
|
194
|
|
|
194
|
0
|
545
|
my ($self, $html) = (shift, "$_[0]"); |
118
|
|
|
|
|
|
|
|
119
|
194
|
|
|
|
|
397
|
my $xml = $self->xml; |
120
|
194
|
|
|
|
|
399
|
my $current = my $tree = ['root']; |
121
|
194
|
|
|
|
|
1250
|
while ($html =~ /\G$TOKEN_RE/gcso) { |
122
|
3697
|
|
|
|
|
9163
|
my ($text, $doctype, $comment, $cdata, $pi, $tag, $runaway) |
123
|
|
|
|
|
|
|
= ($1, $2, $3, $4, $5, $6, $11); |
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
# Text (and runaway "<") |
126
|
3697
|
100
|
|
|
|
5021
|
$text .= '<' if defined $runaway; |
127
|
3697
|
100
|
|
|
|
6320
|
_node($current, 'text', html_unescape $text) if defined $text; |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
# Tag |
130
|
3697
|
100
|
|
|
|
88961
|
if (defined $tag) { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
# End |
133
|
1243
|
100
|
|
|
|
4873
|
if ($tag =~ /^\/\s*(\S+)/) { _end($xml ? $1 : lc $1, $xml, \$current) } |
|
513
|
100
|
|
|
|
1511
|
|
|
|
50
|
|
|
|
|
|
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
# Start |
136
|
|
|
|
|
|
|
elsif ($tag =~ m!^([^\s/]+)([\s\S]*)!) { |
137
|
730
|
100
|
|
|
|
1995
|
my ($start, $attr) = ($xml ? $1 : lc $1, $2); |
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
# Attributes |
140
|
730
|
|
|
|
|
661
|
my (%attrs, $closing); |
141
|
730
|
|
|
|
|
1967
|
while ($attr =~ /$ATTR_RE/go) { |
142
|
33051
|
100
|
|
|
|
48534
|
my $key = $xml ? $1 : lc $1; |
143
|
33051
|
100
|
|
|
|
57390
|
my $value = defined($2) ? $2 : defined($3) ? $3 : $4; |
|
|
100
|
|
|
|
|
|
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
# Empty tag |
146
|
33051
|
100
|
50
|
|
|
40793
|
++$closing and next if $key eq '/'; |
147
|
|
|
|
|
|
|
|
148
|
33012
|
100
|
|
|
|
57027
|
$attrs{$key} = defined $value ? html_unescape $value : $value; |
149
|
|
|
|
|
|
|
} |
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
# "image" is an alias for "img" |
152
|
730
|
100
|
100
|
|
|
2500
|
$start = 'img' if !$xml && $start eq 'image'; |
153
|
730
|
|
|
|
|
1435
|
_start($start, \%attrs, $xml, \$current); |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
# Element without end tag (self-closing) |
156
|
|
|
|
|
|
|
_end($start, $xml, \$current) |
157
|
730
|
100
|
66
|
|
|
4383
|
if !$xml && $EMPTY{$start} || ($xml || !$BLOCK{$start}) && $closing; |
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
66
|
|
|
|
|
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
# Raw text elements |
160
|
730
|
100
|
100
|
|
|
5672
|
next if $xml || !$RAW{$start} && !$RCDATA{$start}; |
|
|
|
100
|
|
|
|
|
161
|
28
|
100
|
|
|
|
529
|
next unless $html =~ m!\G(.*?)<\s*/\s*\Q$start\E\s*>!gcsi; |
162
|
27
|
100
|
|
|
|
126
|
_node($current, 'raw', $RCDATA{$start} ? html_unescape $1 : $1); |
163
|
27
|
|
|
|
|
72
|
_end($start, 0, \$current); |
164
|
|
|
|
|
|
|
} |
165
|
|
|
|
|
|
|
} |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
# DOCTYPE |
168
|
11
|
|
|
|
|
37
|
elsif (defined $doctype) { _node($current, 'doctype', $doctype) } |
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
# Comment |
171
|
10
|
|
|
|
|
18
|
elsif (defined $comment) { _node($current, 'comment', $comment) } |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
# CDATA |
174
|
7
|
|
|
|
|
63
|
elsif (defined $cdata) { _node($current, 'cdata', $cdata) } |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
# Processing instruction (try to detect XML) |
177
|
|
|
|
|
|
|
elsif (defined $pi) { |
178
|
17
|
100
|
100
|
|
|
163
|
$self->xml($xml = 1) if !exists $self->{xml} && $pi =~ /xml/i; |
179
|
17
|
|
|
|
|
42
|
_node($current, 'pi', $pi); |
180
|
|
|
|
|
|
|
} |
181
|
|
|
|
|
|
|
} |
182
|
|
|
|
|
|
|
|
183
|
194
|
|
|
|
|
415
|
return $self->tree($tree); |
184
|
|
|
|
|
|
|
} |
185
|
|
|
|
|
|
|
|
186
|
125
|
|
|
125
|
0
|
247
|
sub render { _render($_[0]->tree, $_[0]->xml) } |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
sub _end { |
189
|
768
|
|
|
768
|
|
923
|
my ($end, $xml, $current) = @_; |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
# Search stack for start tag |
192
|
768
|
|
|
|
|
659
|
my $next = $$current; |
193
|
768
|
|
|
|
|
618
|
do { |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
# Ignore useless end tag |
196
|
1046
|
100
|
|
|
|
1908
|
return if $next->[0] eq 'root'; |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
# Right tag |
199
|
933
|
100
|
|
|
|
3644
|
return $$current = $next->[3] if $next->[1] eq $end; |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
# Phrasing content can only cross phrasing content |
202
|
283
|
100
|
66
|
|
|
1196
|
return if !$xml && $PHRASING{$end} && !$PHRASING{$next->[1]}; |
|
|
|
66
|
|
|
|
|
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
} while $next = $next->[3]; |
205
|
|
|
|
|
|
|
} |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
sub _node { |
208
|
992
|
|
|
992
|
|
1145
|
my ($current, $type, $content) = @_; |
209
|
992
|
|
|
|
|
2341
|
push @$current, my $new = [$type, $content, $current]; |
210
|
992
|
|
|
|
|
2235
|
weaken $new->[2]; |
211
|
|
|
|
|
|
|
} |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
sub _render { |
214
|
852
|
|
|
852
|
|
747
|
my ($tree, $xml) = @_; |
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
# Text (escaped) |
217
|
852
|
|
|
|
|
783
|
my $type = $tree->[0]; |
218
|
852
|
100
|
|
|
|
1621
|
return html_escape($tree->[1]) if $type eq 'text'; |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
# Raw text |
221
|
477
|
100
|
|
|
|
638
|
return $tree->[1] if $type eq 'raw'; |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
# DOCTYPE |
224
|
473
|
100
|
|
|
|
680
|
return '[1] . '>' if $type eq 'doctype'; |
225
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
# Comment |
227
|
469
|
100
|
|
|
|
580
|
return '' if $type eq 'comment'; |
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
# CDATA |
230
|
465
|
100
|
|
|
|
609
|
return '[1] . ']]>' if $type eq 'cdata'; |
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
# Processing instruction |
233
|
462
|
100
|
|
|
|
599
|
return '' . $tree->[1] . '?>' if $type eq 'pi'; |
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
# Root |
236
|
459
|
100
|
|
|
|
891
|
return join '', map { _render($_, $xml) } @$tree[1 .. $#$tree] |
|
143
|
|
|
|
|
231
|
|
237
|
|
|
|
|
|
|
if $type eq 'root'; |
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
# Start tag |
240
|
362
|
|
|
|
|
394
|
my $tag = $tree->[1]; |
241
|
362
|
|
|
|
|
452
|
my $result = "<$tag"; |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
# Attributes |
244
|
362
|
|
|
|
|
295
|
for my $key (sort keys %{$tree->[2]}) { |
|
362
|
|
|
|
|
964
|
|
245
|
54
|
|
|
|
|
72
|
my $value = $tree->[2]{$key}; |
246
|
54
|
100
|
50
|
|
|
117
|
$result .= $xml ? qq{ $key="$key"} : " $key" and next unless defined $value; |
|
|
100
|
|
|
|
|
|
247
|
47
|
|
|
|
|
138
|
$result .= qq{ $key="} . html_escape($value) . '"'; |
248
|
|
|
|
|
|
|
} |
249
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
# No children |
251
|
362
|
100
|
|
|
|
795
|
return $xml ? "$result />" : $EMPTY{$tag} ? "$result>" : "$result>$tag>" |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
252
|
|
|
|
|
|
|
unless $tree->[4]; |
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
# Children |
255
|
1
|
|
|
1
|
|
5
|
no warnings 'recursion'; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
246
|
|
256
|
332
|
|
|
|
|
509
|
$result .= '>' . join '', map { _render($_, $xml) } @$tree[4 .. $#$tree]; |
|
584
|
|
|
|
|
1034
|
|
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
# End tag |
259
|
332
|
|
|
|
|
1518
|
return "$result$tag>"; |
260
|
|
|
|
|
|
|
} |
261
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
sub _start { |
263
|
730
|
|
|
730
|
|
898
|
my ($start, $attrs, $xml, $current) = @_; |
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
# Autoclose optional HTML elements |
266
|
730
|
100
|
100
|
|
|
2534
|
if (!$xml && $$current->[0] ne 'root') { |
267
|
478
|
100
|
|
|
|
1323
|
if (my $end = $END{$start}) { _end($end, 0, $current) } |
|
118
|
100
|
|
|
|
173
|
|
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
elsif (my $close = $CLOSE{$start}) { |
270
|
123
|
|
|
|
|
180
|
my ($allowed, $scope) = @$close; |
271
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
# Close allowed parent elements in scope |
273
|
123
|
|
|
|
|
100
|
my $parent = $$current; |
274
|
123
|
|
66
|
|
|
503
|
while ($parent->[0] ne 'root' && !$scope->{$parent->[1]}) { |
275
|
119
|
100
|
|
|
|
227
|
_end($parent->[1], 0, $current) if $allowed->{$parent->[1]}; |
276
|
119
|
|
|
|
|
417
|
$parent = $parent->[3]; |
277
|
|
|
|
|
|
|
} |
278
|
|
|
|
|
|
|
} |
279
|
|
|
|
|
|
|
} |
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
# New tag |
282
|
730
|
|
|
|
|
2022
|
push @$$current, my $new = ['tag', $start, $attrs, $$current]; |
283
|
730
|
|
|
|
|
1651
|
weaken $new->[3]; |
284
|
730
|
|
|
|
|
1039
|
$$current = $new; |
285
|
|
|
|
|
|
|
} |
286
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
1; |
288
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
=for Pod::Coverage *EVERYTHING* |
290
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
=cut |