File Coverage

blib/lib/HTML/Tagset.pm
Criterion Covered Total %
statement 3 3 100.0
branch n/a
condition n/a
subroutine 1 1 100.0
pod n/a
total 4 4 100.0


line stmt bran cond sub pod time code
1             package HTML::Tagset;
2              
3 2     2   276370 use strict;
  2         4  
  2         2094  
4              
5             =head1 NAME
6              
7             HTML::Tagset - data tables useful in parsing HTML
8              
9             =head1 VERSION
10              
11             Version 3.24
12              
13             =cut
14              
15             our $VERSION = '3.24';
16              
17             =head1 SYNOPSIS
18              
19             use HTML::Tagset;
20             # Then use any of the items in the HTML::Tagset package
21             # as need arises
22              
23             =head1 DESCRIPTION
24              
25             This module contains several data tables useful in various kinds of
26             HTML parsing operations.
27              
28             Note that all tag names used are lowercase.
29              
30             In the following documentation, a "hashset" is a hash being used as a
31             set -- the hash conveys that its keys are there, and the actual values
32             associated with the keys are not significant. (But what values are
33             there, are always true.)
34              
35             =head1 VARIABLES
36              
37             Note that none of these variables are exported.
38              
39             =head2 hashset %HTML::Tagset::emptyElement
40              
41             This hashset has as keys the tag-names (GIs) of elements that cannot
42             have content. (For example, "base", "br", "hr".) So
43             C<$HTML::Tagset::emptyElement{'hr'}> exists and is true.
44             C<$HTML::Tagset::emptyElement{'dl'}> does not exist, and so is not true.
45              
46             =cut
47              
48             our %emptyElement = map { $_ => 1 } qw(
49             base link meta isindex
50             img br hr wbr
51             input area param
52             embed bgsound spacer
53             basefont col frame
54             ~comment ~literal
55             ~declaration ~pi
56             );
57             # The "~"-initial names are for pseudo-elements used by HTML::Entities
58             # and TreeBuilder
59              
60             =head2 hashset %HTML::Tagset::optionalEndTag
61              
62             This hashset lists tag-names for elements that can have content, but whose
63             end-tags are generally, "safely", omissible. Example:
64             C<$HTML::Tagset::optionalEndTag{'li'}> exists and is true.
65              
66             =cut
67              
68             our %optionalEndTag = map { $_ => 1 } qw(
69             p li dt dd
70             ); # option th tr td);
71              
72             =head2 hash %HTML::Tagset::linkElements
73              
74             Keys in this hash are tagnames for elements that might contain
75             links, and the value for each is a reference to an array of the names
76             of attributes whose values can be links.
77              
78             =cut
79              
80             our %linkElements =
81             (
82             'a' => ['href'],
83             'applet' => ['archive', 'codebase', 'code'],
84             'area' => ['href'],
85             'base' => ['href'],
86             'bgsound' => ['src'],
87             'blockquote' => ['cite'],
88             'body' => ['background'],
89             'del' => ['cite'],
90             'embed' => ['pluginspage', 'src'],
91             'form' => ['action'],
92             'frame' => ['src', 'longdesc'],
93             'iframe' => ['src', 'longdesc'],
94             'ilayer' => ['background'],
95             'img' => ['src', 'lowsrc', 'longdesc', 'usemap'],
96             'input' => ['src', 'usemap'],
97             'ins' => ['cite'],
98             'isindex' => ['action'],
99             'head' => ['profile'],
100             'layer' => ['background', 'src'],
101             'link' => ['href'],
102             'object' => ['classid', 'codebase', 'data', 'archive', 'usemap'],
103             'q' => ['cite'],
104             'script' => ['src', 'for'],
105             'table' => ['background'],
106             'td' => ['background'],
107             'th' => ['background'],
108             'tr' => ['background'],
109             'xmp' => ['href'],
110             );
111              
112             =head2 hash %HTML::Tagset::boolean_attr
113              
114             This hash (not hashset) lists what attributes of what elements can be
115             printed without showing the value (for example, the "noshade" attribute
116             of "hr" elements). For elements with only one such attribute, its value
117             is simply that attribute name. For elements with many such attributes,
118             the value is a reference to a hashset containing all such attributes.
119              
120             =cut
121              
122             our %boolean_attr = (
123             # TODO: make these all hashes
124             'area' => 'nohref',
125             'dir' => 'compact',
126             'dl' => 'compact',
127             'hr' => 'noshade',
128             'img' => 'ismap',
129             'input' => { 'checked' => 1, 'readonly' => 1, 'disabled' => 1 },
130             'menu' => 'compact',
131             'ol' => 'compact',
132             'option' => 'selected',
133             'select' => 'multiple',
134             'td' => 'nowrap',
135             'th' => 'nowrap',
136             'ul' => 'compact',
137             );
138              
139             #==========================================================================
140             # List of all elements from Extensible HTML version 1.0 Transitional DTD:
141             #
142             # a abbr acronym address applet area b base basefont bdo big
143             # blockquote body br button caption center cite code col colgroup
144             # dd del dfn dir div dl dt em fieldset font form h1 h2 h3 h4 h5 h6
145             # head hr html i iframe img input ins isindex kbd label legend li
146             # link map menu meta noframes noscript object ol optgroup option p
147             # param pre q s samp script select small span strike strong style
148             # sub sup table tbody td textarea tfoot th thead title tr tt u ul
149             # var
150             #
151             # Varia from Mozilla source internal table of tags:
152             # Implemented:
153             # xmp listing wbr nobr frame frameset noframes ilayer
154             # layer nolayer spacer embed multicol
155             # But these are unimplemented:
156             # sound?? keygen?? server??
157             # Also seen here and there:
158             # marquee?? app?? (both unimplemented)
159             #==========================================================================
160              
161             =head2 hashset %HTML::Tagset::isPhraseMarkup
162              
163             This hashset contains all phrasal-level elements.
164              
165             =cut
166              
167             our %isPhraseMarkup = map { $_ => 1 } qw(
168             span abbr acronym q sub sup
169             cite code em kbd samp strong var dfn strike
170             b i u s tt small big
171             ins del
172             a img br
173             wbr nobr blink
174             font basefont bdo
175             spacer embed noembed
176             ); # had: center, hr, table
177              
178              
179             =head2 hashset %HTML::Tagset::is_Possible_Strict_P_Content
180              
181             This hashset contains all phrasal-level elements that can be content of a
182             P element, for a strict model of HTML.
183              
184             =cut
185              
186             our %isFormElement; # Forward declaration
187             our %is_Possible_Strict_P_Content = (
188             %isPhraseMarkup,
189             %isFormElement,
190             map {; $_ => 1} qw( object script map )
191             # I've no idea why there's these latter exceptions.
192             # I'm just following the HTML4.01 DTD.
193             );
194              
195             #from html4 strict:
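196             #<!ENTITY % fontstyle "TT | I | B | BIG | SMALL">
197             #
198             #<!ENTITY % phrase "EM | STRONG | DFN | CODE |
199             # SAMP | KBD | VAR | CITE | ABBR | ACRONYM" >
200             #
201             #<!ENTITY % special
202             # "A | IMG | OBJECT | BR | SCRIPT | MAP | Q | SUB | SUP | SPAN | BDO">
203             #
204             #<!ENTITY % formctrl "INPUT | SELECT | TEXTAREA | LABEL | BUTTON">
205             #
206             #<!-- %inline; covers inline or "text-level" elements -->
207             #<!ENTITY % inline "#PCDATA | %fontstyle; | %phrase; | %special; | %formctrl;">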
208              
209             =head2 hashset %HTML::Tagset::isHeadElement
210              
211             This hashset contains all elements that should be
212             present only in the 'head' element of an HTML document.
213              
214             =cut
215              
216             our %isHeadElement = map { $_ => 1 }
217             qw(title base link meta isindex script style object bgsound);
218              
219             =head2 hashset %HTML::Tagset::isList
220              
221             This hashset contains all elements that can contain "li" elements.
222              
223             =cut
224              
225             our %isList = map { $_ => 1 } qw(
226             ul ol dir menu
227             );
228              
229             =head2 hashset %HTML::Tagset::isTableElement
230              
231             This hashset contains all elements that are to be found only in/under
232             a "table" element.
233              
234             =cut
235              
236             our %isTableElement = map { $_ => 1 }
237             qw(tr td th thead tbody tfoot caption col colgroup);
238              
239             =head2 hashset %HTML::Tagset::isFormElement
240              
241             This hashset contains all elements that are to be found only in/under
242             a "form" element.
243              
244             =cut
245              
246             # Declared earlier in the file
247             %isFormElement = map { $_ => 1 }
248             qw(input select option optgroup textarea button label);
249              
250             =head2 hashset %HTML::Tagset::isBodyElement
251              
252             This hashset contains all elements that are to be found only in/under
253             the "body" element of an HTML document.
254              
255             =cut
256              
257             our %isBodyElement = map { $_ => 1 } qw(
258             h1 h2 h3 h4 h5 h6
259             p div pre plaintext address blockquote
260             xmp listing
261             center
262              
263             multicol
264             iframe ilayer nolayer
265             bgsound
266              
267             hr
268             ol ul dir menu li
269             dl dt dd
270             ins del
271              
272             fieldset legend
273              
274             map area
275             applet param object
276             isindex script noscript
277             table
278             center
279             form
280             ),
281             keys %isFormElement,
282             keys %isPhraseMarkup, # And everything phrasal
283             keys %isTableElement,
284             ;
285              
286              
287             =head2 hashset %HTML::Tagset::isHeadOrBodyElement
288              
289             This hashset includes all elements that I notice can fall either in
290             the head or in the body.
291              
292             =cut
293              
294             our %isHeadOrBodyElement = map { $_ => 1 }
295             qw(script isindex style object map area param noscript bgsound);
296             # i.e., if we find 'script' in the 'body' or the 'head', don't freak out.
297              
298              
299             =head2 hashset %HTML::Tagset::isKnown
300              
301             This hashset lists all known HTML elements.
302              
303             =cut
304              
305             our %isKnown = (%isHeadElement, %isBodyElement,
306             map{ $_ => 1 }
307             qw( head body html
308             frame frameset noframes
309             ~comment ~pi ~directive ~literal
310             ));
311             # that should be all known tags ever ever
312              
313              
314             =head2 hashset %HTML::Tagset::canTighten
315              
316             This hashset lists elements that might have ignorable whitespace as
317             children or siblings.
318              
319             =cut
320              
321             our %canTighten = %isKnown;
322             delete @canTighten{
323             keys(%isPhraseMarkup), 'input', 'select',
324             'xmp', 'listing', 'plaintext', 'pre',
325             };
326             # xmp, listing, plaintext, and pre are untightenable, and
327             # in a really special way.
328             @canTighten{'hr','br'} = (1,1);
329             # exceptional 'phrasal' things that ARE subject to tightening.
330              
331             # The one case where I can think of my tightening rules failing is:
332             # <p>foo bar<center> <em>baz quux</em> ...
333             # ^-- that would get deleted.
334             # But that's pretty gruesome code anyhow. You gets what you pays for.
335              
336             #==========================================================================
337              
338             =head2 array @HTML::Tagset::p_closure_barriers
339              
340             This array has a meaning that I have only seen a need for in
341             C<HTML::TreeBuilder>, but I include it here on the off chance that someone
342             might find it of use:
343              
344             When we see a "E<lt>pE<gt>" token, we go look up the lineage for a p
345             element we might have to minimize. At first sight, we might say that
346             if there's a p anywhere in the lineage of this new p, it should be
347             closed. But that's wrong. Consider this document:
348              
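349              <html>
350                <head>
351                  <title>foo</title>
352                </head>
353                <body>
354                  <p>foo
355                    <table>
356                      <tr>
357                        <td>
358                           foo
359                           <p>bar
360                        </td>
361                      </tr>
362                    </table>
363                  </p>
364                </body>
365              </html>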
366              
367             The second p is quite legally inside a much higher p.
368              
369             My formalization of the reason why this is legal, but this:
370              
371              <p>foo<p>bar</p></p>
372              
373             isn't, is that something about the table constitutes a "barrier" to
374             the application of the rule about what p must minimize.
375              
376             So C<@HTML::Tagset::p_closure_barriers> is the list of all such
377             barrier-tags.
378              
379             =cut
380              
381             our @p_closure_barriers = qw(
382             li blockquote
383             ul ol menu dir
384             dl dt dd
385             td th tr table caption
386             div
387             );
388              
389             # In an ideal world (i.e., XHTML) we wouldn't have to bother with any of this
390             # monkey business of barriers to minimization!
391              
392             =head2 hashset %isCDATA_Parent
393              
394             This hashset includes all elements whose content is CDATA.
395              
396             =cut
397              
398             our %isCDATA_Parent = map { $_ => 1 }
399             qw(script style xmp listing plaintext);
400              
401             # TODO: there's nothing else that takes CDATA children, right?
402              
403             # As the HTML3 DTD (Raggett 1995-04-24) noted:
404             # The XMP, LISTING and PLAINTEXT tags are incompatible with SGML
405             # and derive from very early versions of HTML. They require non-
406             # standard parsers and will cause problems for processing
407             # documents with standard SGML tools.
408              
409              
410             =head1 CAVEATS
411              
412             You may find it useful to alter the behavior of modules (like
413             C<HTML::Element> or C<HTML::TreeBuilder>) that use C<HTML::Tagset>'s
414             data tables by altering the data tables themselves. You are welcome
415             to try, but be careful; and be aware that different modules may or may
416             not react differently to the data tables being changed.
417              
418             Note that it may be inappropriate to use these tables for I<producing>
419             HTML -- for example, C<%isHeadOrBodyElement> lists the tagnames
420             for all elements that can appear either in the head or in the body,
421             such as "script". That doesn't mean that I am saying your code that
422             produces HTML should feel free to put script elements in either place!
423             If you are producing programs that spit out HTML, you should be
424             I<very> familiar with the DTDs for HTML or XHTML (available at
425             C<http://www.w3.org/>), and you should slavishly obey them, not
426             the data tables in this document.
427              
428             =head1 SEE ALSO
429              
430             L<HTML::Element>, L<HTML::TreeBuilder>, L<HTML::LinkExtor>
431              
432             =head1 COPYRIGHT & LICENSE
433              
434             Copyright 1995-2000 Gisle Aas.
435              
436             Copyright 2000-2005 Sean M. Burke.
437              
438             Copyright 2005-2024 Andy Lester.
439              
440             This library is free software; you can redistribute it and/or modify it
441             under the terms of the Artistic License version 2.0.
442              
443             =head1 ACKNOWLEDGEMENTS
444              
445             Most of the code/data in this module was adapted from code written
446             by Gisle Aas for C<HTML::Element>, C<HTML::TreeBuilder>, and
447             C<HTML::LinkExtor>. Then it was maintained by Sean M. Burke.
448              
449             =head1 AUTHOR
450              
451             Current maintainer: Andy Lester, C<< >>
452              
453             =head1 BUGS
454              
455             Please report any bugs or feature requests to
456             C, or through the web interface at
457             L. I will
458             be notified, and then you'll automatically be notified of progress on
459             your bug as I make changes.
460              
461             =cut
462              
463             1;