blib/lib/Mojo/DOM/HTML.pm | |||
---|---|---|---|
Criterion | Covered | Total | % |
statement | 103 | 104 | 99.0 |
branch | 88 | 90 | 97.7 |
condition | 48 | 52 | 92.3 |
subroutine | 14 | 14 | 100.0 |
pod | 4 | 4 | 100.0 |
total | 257 | 264 | 97.3 |
line | stmt | bran | cond | sub | pod | time | code |
---|---|---|---|---|---|---|---|
1 | package Mojo::DOM::HTML; | ||||||
2 | 62 | 62 | 512 | use Mojo::Base -base; | |||
62 | 165 | ||||||
62 | 472 | ||||||
3 | |||||||
4 | 62 | 62 | 465 | use Exporter qw(import); | |||
62 | 137 | ||||||
62 | 2218 | ||||||
5 | 62 | 62 | 385 | use Mojo::Util qw(html_attr_unescape html_unescape xml_escape); | |||
62 | 180 | ||||||
62 | 4172 | ||||||
6 | 62 | 62 | 665 | use Scalar::Util qw(weaken); | |||
62 | 194 | ||||||
62 | 133915 | ||||||
7 | |||||||
8 | our @EXPORT_OK = ('tag_to_html'); | ||||||
9 | |||||||
10 | has tree => sub { ['root'] }; | ||||||
11 | has 'xml'; | ||||||
12 | |||||||
13 | my $ATTR_RE = qr/ | ||||||
14 | ([^<>=\s\/0-9.\-][^<>=\s\/]*|\/) # Key | ||||||
15 | (?: | ||||||
16 | \s*=\s* | ||||||
17 | (?s:(["'])(.*?)\g{-2}|([^>\s]*)) # Value | ||||||
18 | )? | ||||||
19 | \s* | ||||||
20 | /x; | ||||||
21 | my $TOKEN_RE = qr/ | ||||||
22 | ([^<]+)? # Text | ||||||
23 | (?: | ||||||
24 | <(?: | ||||||
25 | !(?: | ||||||
26 | DOCTYPE( | ||||||
27 | \s+\w+ # Doctype | ||||||
28 | (?:(?:\s+\w+)?(?:\s+(?:"[^"]*"|'[^']*'))+)? # External ID | ||||||
29 | (?:\s+\[.+?\])? # Int Subset | ||||||
30 | \s*) | ||||||
31 | | | ||||||
32 | --(.*?)--\s* # Comment | ||||||
33 | | | ||||||
34 | \[CDATA\[(.*?)\]\] # CDATA | ||||||
35 | ) | ||||||
36 | | | ||||||
37 | \?(.*?)\? # Processing Instruction | ||||||
38 | | | ||||||
39 | \s*((?:\/\s*)?[^<>\s\/0-9.\-][^<>\s\/]*\s*(?:(?:$ATTR_RE){0,32766})*+) # Tag | ||||||
40 | )> | ||||||
41 | | | ||||||
42 | (<) # Runaway "<" | ||||||
43 | )?? | ||||||
44 | /xis; | ||||||
45 | |||||||
46 | # HTML elements that only contain raw text | ||||||
47 | my %RAW = map { $_ => 1 } qw(script style); | ||||||
48 | |||||||
49 | # HTML elements that only contain raw text and entities | ||||||
50 | my %RCDATA = map { $_ => 1 } qw(title textarea); | ||||||
51 | |||||||
52 | # HTML elements with optional end tags | ||||||
53 | my %END = (body => 'head', optgroup => 'optgroup', option => 'option'); | ||||||
54 | |||||||
55 | # HTML elements that break paragraphs | ||||||
56 | map { $END{$_} = 'p' } ( | ||||||
57 | qw(address article aside blockquote details dialog div dl fieldset figcaption figure footer form h1 h2 h3 h4 h5 h6), | ||||||
58 | qw(header hgroup hr main menu nav ol p pre section table ul) | ||||||
59 | ); | ||||||
60 | |||||||
61 | # Container HTML elements that create their own scope | ||||||
62 | my %SCOPE = map { $_ => 1 } qw(math svg); | ||||||
63 | |||||||
64 | # HTML table elements with optional end tags | ||||||
65 | my %TABLE = map { $_ => 1 } qw(colgroup tbody td tfoot th thead tr); | ||||||
66 | |||||||
67 | # HTML elements with optional end tags and scoping rules | ||||||
68 | my %CLOSE = (li => [{li => 1}, {ul => 1, ol => 1}], tr => [{tr => 1}, {table => 1}]); | ||||||
69 | $CLOSE{$_} = [\%TABLE, {table => 1}] for qw(colgroup tbody tfoot thead); | ||||||
70 | $CLOSE{$_} = [{dd => 1, dt => 1}, {dl => 1}] for qw(dd dt); | ||||||
71 | $CLOSE{$_} = [{rp => 1, rt => 1}, {ruby => 1}] for qw(rp rt); | ||||||
72 | $CLOSE{$_} = [{th => 1, td => 1}, {table => 1}] for qw(td th); | ||||||
73 | |||||||
74 | # HTML parent elements that signal no more content when closed, but that are also phrasing content | ||||||
75 | my %NO_MORE_CONTENT = (ruby => [qw(rt rp)], select => [qw(option optgroup)]); | ||||||
76 | |||||||
77 | # HTML elements without end tags | ||||||
78 | my %EMPTY = map { $_ => 1 } qw(area base br col embed hr img input keygen link menuitem meta param source track wbr); | ||||||
79 | |||||||
80 | # HTML elements categorized as phrasing content (and obsolete inline elements) | ||||||
81 | my @PHRASING = ( | ||||||
82 | qw(a abbr area audio b bdi bdo br button canvas cite code data datalist del dfn em embed i iframe img input ins kbd), | ||||||
83 | qw(keygen label link map mark math meta meter noscript object output picture progress q ruby s samp script select), | ||||||
84 | qw(slot small span strong sub sup svg template textarea time u var video wbr) | ||||||
85 | ); | ||||||
86 | my @OBSOLETE = qw(acronym applet basefont big font strike tt); | ||||||
87 | my %PHRASING = map { $_ => 1 } @OBSOLETE, @PHRASING; | ||||||
88 | |||||||
89 | # HTML elements that don't get their self-closing flag acknowledged | ||||||
90 | my %BLOCK = map { $_ => 1 } ( | ||||||
91 | qw(a address applet article aside b big blockquote body button caption center code col colgroup dd details dialog), | ||||||
92 | qw(dir div dl dt em fieldset figcaption figure font footer form frameset h1 h2 h3 h4 h5 h6 head header hgroup html), | ||||||
93 | qw(i iframe li listing main marquee menu nav nobr noembed noframes noscript object ol optgroup option p plaintext), | ||||||
94 | qw(pre rp rt s script section select small strike strong style summary table tbody td template textarea tfoot th), | ||||||
95 | qw(thead title tr tt u ul xmp) | ||||||
96 | ); | ||||||
97 | |||||||
98 | sub parse { | ||||||
99 | 324 | 324 | 1 | 1447 | my ($self, $html) = (shift, "$_[0]"); | ||
100 | |||||||
101 | 324 | 903 | my $xml = $self->xml; | ||||
102 | 324 | 768 | my $current = my $tree = ['root']; | ||||
103 | 324 | 4299 | while ($html =~ /\G$TOKEN_RE/gcso) { | ||||
104 | 10699 | 34177 | my ($text, $doctype, $comment, $cdata, $pi, $tag, $runaway) = ($1, $2, $3, $4, $5, $6, $11); | ||||
105 | |||||||
106 | # Text (and runaway "<") | ||||||
107 | 10699 | 100 | 19010 | $text .= '<' if defined $runaway; | |||
108 | 10699 | 100 | 20177 | _node($current, 'text', html_unescape $text) if defined $text; | |||
109 | |||||||
110 | # Tag | ||||||
111 | 10699 | 100 | 193846 | if (defined $tag) { | |||
100 | |||||||
100 | |||||||
100 | |||||||
100 | |||||||
112 | |||||||
113 | # End | ||||||
114 | 3638 | 100 | 11986 | if ($tag =~ /^\/\s*(\S+)/) { | |||
50 | |||||||
115 | 1594 | 100 | 4282 | my $end = $xml ? $1 : lc $1; | |||
116 | |||||||
117 | # No more content | ||||||
118 | 1594 | 100 | 100 | 5655 | if (!$xml && (my $tags = $NO_MORE_CONTENT{$end})) { _end($_, $xml, \$current) for @$tags } | ||
15 | 55 | ||||||
119 | |||||||
120 | 1594 | 3093 | _end($end, $xml, \$current); | ||||
121 | } | ||||||
122 | |||||||
123 | # Start | ||||||
124 | elsif ($tag =~ m!^([^\s/]+)([\s\S]*)!) { | ||||||
125 | 2044 | 100 | 7032 | my ($start, $attr) = ($xml ? $1 : lc $1, $2); | |||
126 | |||||||
127 | # Attributes | ||||||
128 | 2044 | 3255 | my (%attrs, $closing); | ||||
129 | 2044 | 7123 | while ($attr =~ /$ATTR_RE/go) { | ||||
130 | 34220 | 100 | 100 | 127785 | my ($key, $value) = ($xml ? $1 : lc $1, $3 // $4); | ||
131 | |||||||
132 | # Empty tag | ||||||
133 | 34220 | 100 | 50 | 62522 | ++$closing and next if $key eq '/'; | ||
134 | |||||||
135 | 34156 | 100 | 72277 | $attrs{$key} = defined $value ? html_attr_unescape $value : $value; | |||
136 | } | ||||||
137 | |||||||
138 | # "image" is an alias for "img" | ||||||
139 | 2044 | 100 | 100 | 7052 | $start = 'img' if !$xml && $start eq 'image'; | ||
140 | 2044 | 5717 | _start($start, \%attrs, $xml, \$current); | ||||
141 | |||||||
142 | # Element without end tag (self-closing) | ||||||
143 | 2044 | 100 | 100 | 12591 | _end($start, $xml, \$current) if !$xml && $EMPTY{$start} || ($xml || !$BLOCK{$start}) && $closing; | ||
100 | |||||||
100 | |||||||
100 | |||||||
144 | |||||||
145 | # Raw text elements | ||||||
146 | 2044 | 100 | 100 | 15071 | next if $xml || !$RAW{$start} && !$RCDATA{$start}; | ||
100 | |||||||
147 | 72 | 100 | 1403 | next unless $html =~ m!\G(.*?)\Q$start\E(?:\s+|\s*>)!gcsi; | |||
148 | 71 | 100 | 44974 | _node($current, 'raw', $RCDATA{$start} ? html_unescape $1 : $1); | |||
149 | 71 | 217 | _end($start, 0, \$current); | ||||
150 | } | ||||||
151 | } | ||||||
152 | |||||||
153 | # DOCTYPE | ||||||
154 | 18 | 77 | elsif (defined $doctype) { _node($current, 'doctype', $doctype) } | ||||
155 | |||||||
156 | # Comment | ||||||
157 | 14 | 61 | elsif (defined $comment) { _node($current, 'comment', $comment) } | ||||
158 | |||||||
159 | # CDATA | ||||||
160 | 7 | 24 | elsif (defined $cdata) { _node($current, 'cdata', $cdata) } | ||||
161 | |||||||
162 | # Processing instruction (try to detect XML) | ||||||
163 | elsif (defined $pi) { | ||||||
164 | 18 | 100 | 100 | 142 | $self->xml($xml = 1) if !exists $self->{xml} && $pi =~ /xml/i; | ||
165 | 18 | 52 | _node($current, 'pi', $pi); | ||||
166 | } | ||||||
167 | } | ||||||
168 | |||||||
169 | 324 | 1109 | return $self->tree($tree); | ||||
170 | } | ||||||
171 | |||||||
172 | 164 | 164 | 1 | 446 | sub render { _render($_[0]->tree, $_[0]->xml) } | ||
173 | |||||||
174 | 11 | 11 | 1 | 31 | sub tag { shift->tree(['root', _tag(@_)]) } | ||
175 | |||||||
176 | 942 | 942 | 1 | 2121 | sub tag_to_html { _render(_tag(@_), undef) } | ||
177 | |||||||
178 | sub _end { | ||||||
179 | 2387 | 2387 | 4178 | my ($end, $xml, $current) = @_; | |||
180 | |||||||
181 | # Search stack for start tag | ||||||
182 | 2387 | 3266 | my $next = $$current; | ||||
183 | 2387 | 3235 | do { | ||||
184 | |||||||
185 | # Ignore useless end tag | ||||||
186 | 5208 | 100 | 10267 | return if $next->[0] eq 'root'; | |||
187 | |||||||
188 | # Don’t traverse a container tag | ||||||
189 | 4724 | 100 | 100 | 10086 | return if $SCOPE{$next->[1]} && $next->[1] ne $end; | ||
190 | |||||||
191 | # Right tag | ||||||
192 | 4721 | 100 | 14245 | return $$current = $next->[3] if $next->[1] eq $end; | |||
193 | |||||||
194 | # Phrasing content can only cross phrasing content | ||||||
195 | 2827 | 100 | 100 | 9258 | return if !$xml && $PHRASING{$end} && !$PHRASING{$next->[1]}; | ||
66 | |||||||
196 | |||||||
197 | } while $next = $next->[3]; | ||||||
198 | } | ||||||
199 | |||||||
200 | sub _node { | ||||||
201 | 3092 | 3092 | 5609 | my ($current, $type, $content) = @_; | |||
202 | 3092 | 9215 | push @$current, my $new = [$type, $content, $current]; | ||||
203 | 3092 | 8063 | weaken $new->[2]; | ||||
204 | } | ||||||
205 | |||||||
206 | sub _render { | ||||||
207 | 2478 | 2478 | 4415 | my ($tree, $xml) = @_; | |||
208 | |||||||
209 | # Tag | ||||||
210 | 2478 | 3938 | my $type = $tree->[0]; | ||||
211 | 2478 | 100 | 5356 | if ($type eq 'tag') { | |||
212 | |||||||
213 | # Start tag | ||||||
214 | 1395 | 2315 | my $tag = $tree->[1]; | ||||
215 | 1395 | 2759 | my $result = "<$tag"; | ||||
216 | |||||||
217 | # Attributes | ||||||
218 | 1395 | 1933 | for my $key (sort keys %{$tree->[2]}) { | ||||
1395 | 5263 | ||||||
219 | 1650 | 2955 | my $value = $tree->[2]{$key}; | ||||
220 | 1650 | 100 | 50 | 3230 | $result .= $xml ? qq{ $key="$key"} : " $key" and next unless defined $value; | ||
100 | |||||||
221 | 1613 | 4347 | $result .= qq{ $key="} . xml_escape($value) . '"'; | ||||
222 | } | ||||||
223 | |||||||
224 | # No children | ||||||
225 | 1395 | 100 | 6401 | return $xml ? "$result />" : $EMPTY{$tag} ? "$result>" : "$result>$tag>" unless $tree->[4]; | |||
100 | |||||||
100 | |||||||
226 | |||||||
227 | # Children | ||||||
228 | 62 | 62 | 603 | no warnings 'recursion'; | |||
62 | 201 | ||||||
62 | 52789 | ||||||
229 | 912 | 2323 | $result .= '>' . join '', map { _render($_, $xml) } @$tree[4 .. $#$tree]; | ||||
1184 | 3257 | ||||||
230 | |||||||
231 | # End tag | ||||||
232 | 912 | 5509 | return "$result$tag>"; | ||||
233 | } | ||||||
234 | |||||||
235 | # Text (escaped) | ||||||
236 | 1083 | 100 | 2719 | return xml_escape $tree->[1] if $type eq 'text'; | |||
237 | |||||||
238 | # Raw text | ||||||
239 | 538 | 100 | 2017 | return $tree->[1] if $type eq 'raw'; | |||
240 | |||||||
241 | # Root | ||||||
242 | 150 | 100 | 717 | return join '', map { _render($_, $xml) } @$tree[1 .. $#$tree] if $type eq 'root'; | |||
188 | 392 | ||||||
243 | |||||||
244 | # DOCTYPE | ||||||
245 | 14 | 100 | 83 | return '[1] . '>' if $type eq 'doctype'; | |||
246 | |||||||
247 | # Comment | ||||||
248 | 10 | 100 | 47 | return '' if $type eq 'comment'; | |||
249 | |||||||
250 | # CDATA | ||||||
251 | 6 | 100 | 59 | return '[1] . ']]>' if $type eq 'cdata'; | |||
252 | |||||||
253 | # Processing instruction | ||||||
254 | 3 | 50 | 44 | return '' . $tree->[1] . '?>' if $type eq 'pi'; | |||
255 | |||||||
256 | # Everything else | ||||||
257 | 0 | 0 | return ''; | ||||
258 | } | ||||||
259 | |||||||
260 | sub _start { | ||||||
261 | 2044 | 2044 | 3825 | my ($start, $attrs, $xml, $current) = @_; | |||
262 | |||||||
263 | # Autoclose optional HTML elements | ||||||
264 | 2044 | 100 | 100 | 6963 | if (!$xml && $$current->[0] ne 'root') { | ||
265 | 1627 | 100 | 5003 | if (my $end = $END{$start}) { _end($end, 0, $current) } | |||
469 | 100 | 936 | |||||
266 | |||||||
267 | elsif (my $close = $CLOSE{$start}) { | ||||||
268 | 595 | 1100 | my ($allowed, $scope) = @$close; | ||||
269 | |||||||
270 | # Close allowed parent elements in scope | ||||||
271 | 595 | 855 | my $parent = $$current; | ||||
272 | 595 | 66 | 2277 | while ($parent->[0] ne 'root' && !$scope->{$parent->[1]}) { | |||
273 | 440 | 100 | 1028 | _end($parent->[1], 0, $current) if $allowed->{$parent->[1]}; | |||
274 | 440 | 1535 | $parent = $parent->[3]; | ||||
275 | } | ||||||
276 | } | ||||||
277 | } | ||||||
278 | |||||||
279 | # New tag | ||||||
280 | 2044 | 6642 | push @$$current, my $new = ['tag', $start, $attrs, $$current]; | ||||
281 | 2044 | 5939 | weaken $new->[3]; | ||||
282 | 2044 | 3290 | $$current = $new; | ||||
283 | } | ||||||
284 | |||||||
285 | sub _tag { | ||||||
286 | 953 | 953 | 2202 | my $tree = ['tag', shift, undef, undef]; | |||
287 | |||||||
288 | # Content | ||||||
289 | 953 | 100 | 3713 | push @$tree, ref $_[-1] eq 'CODE' ? ['raw', pop->()] : ['text', pop] if @_ % 2; | |||
100 | |||||||
290 | |||||||
291 | # Attributes | ||||||
292 | 953 | 3282 | my $attrs = $tree->[2] = {@_}; | ||||
293 | 953 | 100 | 100 | 3513 | return $tree unless exists $attrs->{data} && ref $attrs->{data} eq 'HASH'; | ||
294 | 5 | 15 | my $data = delete $attrs->{data}; | ||||
295 | 5 | 21 | @$attrs{map { y/_/-/; lc "data-$_" } keys %$data} = values %$data; | ||||
9 | 24 | ||||||
9 | 37 | ||||||
296 | 5 | 23 | return $tree; | ||||
297 | } | ||||||
298 | |||||||
299 | 1; | ||||||
300 | |||||||
301 | =encoding utf8 | ||||||
302 | |||||||
303 | =head1 NAME | ||||||
304 | |||||||
305 | Mojo::DOM::HTML - HTML/XML engine | ||||||
306 | |||||||
307 | =head1 SYNOPSIS | ||||||
308 | |||||||
309 | use Mojo::DOM::HTML; | ||||||
310 | |||||||
311 | # Turn HTML into DOM tree | ||||||
312 | my $html = Mojo::DOM::HTML->new; | ||||||
313 | $html->parse(' Test 123 |
||||||
314 | my $tree = $html->tree; | ||||||
315 | |||||||
316 | =head1 DESCRIPTION | ||||||
317 | |||||||
318 | L | ||||||
319 | Standard|https://html.spec.whatwg.org> and the L |
||||||
320 | |||||||
321 | =head1 FUNCTIONS | ||||||
322 | |||||||
323 | L |
||||||
324 | |||||||
325 | =head2 tag_to_html | ||||||
326 | |||||||
327 | my $str = tag_to_html 'div', id => 'foo', 'safe content'; | ||||||
328 | |||||||
329 | Generate HTML/XML tag and render it right away. This is a significantly faster alternative to L"tag"> for template | ||||||
330 | systems that have to generate a lot of tags. | ||||||
331 | |||||||
332 | =head1 ATTRIBUTES | ||||||
333 | |||||||
334 | L |
||||||
335 | |||||||
336 | =head2 tree | ||||||
337 | |||||||
338 | my $tree = $html->tree; | ||||||
339 | $html = $html->tree(['root']); | ||||||
340 | |||||||
341 | Document Object Model. Note that this structure should only be used very carefully since it is very dynamic. | ||||||
342 | |||||||
343 | =head2 xml | ||||||
344 | |||||||
345 | my $bool = $html->xml; | ||||||
346 | $html = $html->xml($bool); | ||||||
347 | |||||||
348 | Disable HTML semantics in parser and activate case-sensitivity, defaults to auto-detection based on XML declarations. | ||||||
349 | |||||||
350 | =head1 METHODS | ||||||
351 | |||||||
352 | L |
||||||
353 | |||||||
354 | =head2 parse | ||||||
355 | |||||||
356 | $html = $html->parse(' |
||||||
357 | |||||||
358 | Parse HTML/XML fragment. | ||||||
359 | |||||||
360 | =head2 render | ||||||
361 | |||||||
362 | my $str = $html->render; | ||||||
363 | |||||||
364 | Render DOM to HTML/XML. | ||||||
365 | |||||||
366 | =head2 tag | ||||||
367 | |||||||
368 | $html = $html->tag('div', id => 'foo', 'safe content'); | ||||||
369 | |||||||
370 | Generate HTML/XML tag. | ||||||
371 | |||||||
372 | =head1 SEE ALSO | ||||||
373 | |||||||
374 | L |
||||||
375 | |||||||
376 | =cut |