File Coverage

blib/lib/HTML/DOM/_TreeBuilder.pm
Criterion Covered Total %
statement 284 466 60.9
branch 163 298 54.7
condition 96 243 39.5
subroutine 23 44 52.2
pod 8 33 24.2
total 574 1084 52.9


line stmt bran cond sub pod time code
1             # This is a fork of HTML::Element. Eventually the code may be merged.
2              
3             package HTML::DOM::_TreeBuilder;
4              
5 24     24   76 use warnings;
  24         26  
  24         559  
6 24     24   66 use strict;
  24         24  
  24         316  
7 24     24   63 use integer; # vroom vroom!
  24         23  
  24         115  
8 24     24   333 use Carp ();
  24         24  
  24         346  
9 24     24   65 use vars qw(@ISA $VERSION $DEBUG);
  24         31  
  24         2941  
10              
11             #---------------------------------------------------------------------------
12             # Make a 'DEBUG' constant...
13              
14             BEGIN {
15              
16             # We used to have things like
17             # print $indent, "lalala" if $Debug;
18             # But there were an awful lot of having to evaluate $Debug's value.
19             # If we make that depend on a constant, like so:
20             # sub DEBUG () { 1 } # or whatever value.
21             # ...
22             # print $indent, "lalala" if DEBUG;
23             # Which at compile-time (thru the miracle of constant folding) turns into:
24             # print $indent, "lalala";
25             # or, if DEBUG is a constant with a true value, then that print statement
26             # is simply optimized away, and doesn't appear in the target code at all.
27             # If you don't believe me, run:
28             # perl -MO=Deparse,-uHTML::DOM::_TreeBuilder -e 'BEGIN { \
29             # $HTML::DOM::_TreeBuilder::DEBUG = 4} use HTML::DOM::_TreeBuilder'
30             # and see for yourself (substituting whatever value you want for $DEBUG
31             # there).
32             ## no critic
33 24 50   24   194 if ( defined &DEBUG ) {
    50          
    50          
    0          
34              
35             # Already been defined! Do nothing.
36             }
37             elsif ( $] < 5.00404 ) {
38              
39             # Grudgingly accomodate ancient (pre-constant) versions.
40 0         0 eval 'sub DEBUG { $Debug } ';
41             }
42             elsif ( !$DEBUG ) {
43 24         1100 eval 'sub DEBUG () {0}'; # Make it a constant.
44             }
45             elsif ( $DEBUG =~ m<^\d+$>s ) {
46 0         0 eval 'sub DEBUG () { ' . $DEBUG . ' }'; # Make THAT a constant.
47             }
48             else { # WTF?
49 0         0 warn "Non-numeric value \"$DEBUG\" in \$HTML::DOM::_Element::DEBUG";
50 0         0 eval 'sub DEBUG () { $DEBUG }'; # I guess.
51             }
52             ## use critic
53             }
54              
55             #---------------------------------------------------------------------------
56              
57 24     24   90 use HTML::Entities ();
  24         27  
  24         315  
58 24     24   67 use HTML::Tagset 3.02 ();
  24         436  
  24         312  
59              
60 24     24   75 use HTML::DOM::_Element ();
  24         20  
  24         239  
61 24     24   77 use HTML::Parser ();
  24         40  
  24         109530  
62             @ISA = qw(HTML::DOM::_Element HTML::Parser);
63             $VERSION = 4.2001;
64              
65             # This looks schizoid, I know.
66             # It's not that we ARE an element AND a parser.
67             # We ARE an element, but one that knows how to handle signals
68             # (method calls) from Parser in order to elaborate its subtree.
69              
70             # Legacy aliases:
71             *HTML::DOM::_TreeBuilder::isKnown = \%HTML::Tagset::isKnown;
72             *HTML::DOM::_TreeBuilder::canTighten = \%HTML::Tagset::canTighten;
73             *HTML::DOM::_TreeBuilder::isHeadElement = \%HTML::Tagset::isHeadElement;
74             *HTML::DOM::_TreeBuilder::isBodyElement = \%HTML::Tagset::isBodyElement;
75             *HTML::DOM::_TreeBuilder::isPhraseMarkup = \%HTML::Tagset::isPhraseMarkup;
76             *HTML::DOM::_TreeBuilder::isHeadOrBodyElement = \%HTML::Tagset::isHeadOrBodyElement;
77             *HTML::DOM::_TreeBuilder::isList = \%HTML::Tagset::isList;
78             *HTML::DOM::_TreeBuilder::isTableElement = \%HTML::Tagset::isTableElement;
79             *HTML::DOM::_TreeBuilder::isFormElement = \%HTML::Tagset::isFormElement;
80             *HTML::DOM::_TreeBuilder::p_closure_barriers = \@HTML::Tagset::p_closure_barriers;
81              
82             #==========================================================================
83             # Two little shortcut constructors:
84              
85             sub new_from_file { # or from a FH
86 0     0 0 0 my $class = shift;
87 0 0       0 Carp::croak("new_from_file takes only one argument")
88             unless @_ == 1;
89 0 0       0 Carp::croak("new_from_file is a class method only")
90             if ref $class;
91 0         0 my $new = $class->new();
92 0         0 $new->parse_file( $_[0] );
93 0         0 return $new;
94             }
95              
96             sub new_from_content { # from any number of scalars
97 0     0 0 0 my $class = shift;
98 0 0       0 Carp::croak("new_from_content is a class method only")
99             if ref $class;
100 0         0 my $new = $class->new();
101 0         0 foreach my $whunk (@_) {
102 0 0       0 if ( ref($whunk) eq 'SCALAR' ) {
103 0         0 $new->parse($$whunk);
104             }
105             else {
106 0         0 $new->parse($whunk);
107             }
108 0 0       0 last if $new->{'_stunted'}; # might as well check that.
109             }
110 0         0 $new->eof();
111 0         0 return $new;
112             }
113              
114             # TODO: document more fully?
115             sub parse_content { # from any number of scalars
116 0     0 0 0 my $tree = shift;
117 0         0 my $retval;
118 0         0 foreach my $whunk (@_) {
119 0 0       0 if ( ref($whunk) eq 'SCALAR' ) {
120 0         0 $retval = $tree->parse($$whunk);
121             }
122             else {
123 0         0 $retval = $tree->parse($whunk);
124             }
125 0 0       0 last if $tree->{'_stunted'}; # might as well check that.
126             }
127 0         0 $tree->eof();
128 0         0 return $retval;
129             }
130              
131             #---------------------------------------------------------------------------
132              
133             sub new { # constructor!
134 147     147 1 276 my $class = shift;
135 147   33     453 $class = ref($class) || $class;
136              
137             # Initialize HTML::DOM::_Element part
138 147         349 my $self = $class->element_class->new('html');
139              
140             {
141              
142             # A hack for certain strange versions of Parser:
143 147         141 my $other_self = HTML::Parser->new();
  147         436  
144 147         5407 %$self = ( %$self, %$other_self ); # copy fields
145             # Yes, multiple inheritance is messy. Kids, don't try this at home.
146 147         514 bless $other_self, "HTML::DOM::_TreeBuilder::_hideyhole";
147              
148             # whack it out of the HTML::Parser class, to avoid the destructor
149             }
150              
151             # The root of the tree is special, as it has these funny attributes,
152             # and gets reblessed into this class.
153              
154             # Initialize parser settings
155 147         186 $self->{'_implicit_tags'} = 1;
156 147         150 $self->{'_implicit_body_p_tag'} = 0;
157              
158             # If true, trying to insert text, or any of %isPhraseMarkup right
159             # under 'body' will implicate a 'p'. If false, will just go there.
160              
161 147         138 $self->{'_tighten'} = 1;
162              
163             # whether ignorable WS in this tree should be deleted
164              
165 147         153 $self->{'_implicit'} = 1; # to delete, once we find a real open-"html" tag
166              
167 147         249 $self->{'_ignore_unknown'} = 1;
168 147         139 $self->{'_ignore_text'} = 0;
169 147         153 $self->{'_warn'} = 0;
170 147         137 $self->{'_no_space_compacting'} = 0;
171 147         142 $self->{'_store_comments'} = 0;
172 147         173 $self->{'_store_declarations'} = 1;
173 147         128 $self->{'_store_pis'} = 0;
174 147         123 $self->{'_p_strict'} = 0;
175 147         225 $self->{'_no_expand_entities'} = 0;
176              
177             # Parse attributes passed in as arguments
178 147 50       293 if (@_) {
179 147         369 my %attr = @_;
180 147         310 for ( keys %attr ) {
181 441         813 $self->{"_$_"} = $attr{$_};
182             }
183             }
184              
185 147         217 $HTML::DOM::_Element::encoded_content = $self->{'_no_expand_entities'};
186              
187             # rebless to our class
188 147         155 bless $self, $class;
189              
190 147         155 $self->{'_element_count'} = 1;
191              
192             # undocumented, informal, and maybe not exactly correct
193              
194 147         356 $self->{'_head'} = $self->insert_element( 'head', 1 );
195 147         156 $self->{'_pos'} = undef; # pull it back up
196 147         259 $self->{'_body'} = $self->insert_element( 'body', 1 );
197 147         154 $self->{'_pos'} = undef; # pull it back up again
198              
199 147         384 return $self;
200             }
201              
202             #==========================================================================
203              
204             sub _elem # universal accessor...
205             {
206 441     441   381 my ( $self, $elem, $val ) = @_;
207 441         398 my $old = $self->{$elem};
208 441 50       644 $self->{$elem} = $val if defined $val;
209 441         568 return $old;
210             }
211              
212             # accessors....
213 0     0 0 0 sub implicit_tags { shift->_elem( '_implicit_tags', @_ ); }
214 0     0 0 0 sub implicit_body_p_tag { shift->_elem( '_implicit_body_p_tag', @_ ); }
215 0     0 0 0 sub p_strict { shift->_elem( '_p_strict', @_ ); }
216 147     147 0 194 sub no_space_compacting { shift->_elem( '_no_space_compacting', @_ ); }
217 0     0 0 0 sub ignore_unknown { shift->_elem( '_ignore_unknown', @_ ); }
218 0     0 0 0 sub ignore_text { shift->_elem( '_ignore_text', @_ ); }
219 147     147 0 272 sub ignore_ignorable_whitespace { shift->_elem( '_tighten', @_ ); }
220 147     147 0 187 sub store_comments { shift->_elem( '_store_comments', @_ ); }
221 0     0 0 0 sub store_declarations { shift->_elem( '_store_declarations', @_ ); }
222 0     0 0 0 sub store_pis { shift->_elem( '_store_pis', @_ ); }
223 0     0 0 0 sub warn { shift->_elem( '_warn', @_ ); }
224              
225             sub no_expand_entities {
226 0     0 0 0 shift->_elem( '_no_expand_entities', @_ );
227 0         0 $HTML::DOM::_Element::encoded_content = @_;
228             }
229              
230             #==========================================================================
231              
232             sub warning {
233 2     2 0 3 my $self = shift;
234 2 50       4 CORE::warn("HTML::Parse: $_[0]\n") if $self->{'_warn'};
235              
236             # should maybe say HTML::DOM::_TreeBuilder instead
237             }
238              
239             #==========================================================================
240              
241             {
242              
243             # To avoid having to rebuild these lists constantly...
244             my $_Closed_by_structurals = [qw(p h1 h2 h3 h4 h5 h6 pre textarea)];
245             my $indent;
246              
247             sub start {
248 594 50   594 1 1036 return if $_[0]{'_stunted'};
249              
250             # Accept a signal from HTML::Parser for start-tags.
251 594         724 my ( $self, $tag, $attr ) = @_;
252              
253             # Parser passes more, actually:
254             # $self->start($tag, $attr, $attrseq, $origtext)
255             # But we can merrily ignore $attrseq and $origtext.
256              
257 594 50       865 if ( $tag eq 'x-html' ) {
258 0         0 print "Ignoring open-x-html tag.\n" if DEBUG;
259              
260             # inserted by some lame code-generators.
261 0         0 return; # bypass tweaking.
262             }
263              
264 594         737 $tag =~ s{/$}{}s; # So turns into . Silently forgive.
265              
266 594 50       1756 unless ( $tag =~ m/^[-_a-zA-Z0-9:%]+$/s ) {
267 0         0 DEBUG and print "Start-tag name $tag is no good. Skipping.\n";
268 0         0 return;
269              
270             # This avoids having Element's new() throw an exception.
271             }
272              
273 594   66     1395 my $ptag = ( my $pos = $self->{'_pos'} || $self )->{'_tag'};
274 594         427 my $already_inserted;
275              
276             #my($indent);
277 594         370 if (DEBUG) {
278              
279             # optimization -- don't figure out indenting unless we're in debug mode
280             my @lineage = $pos->lineage;
281             $indent = ' ' x ( 1 + @lineage );
282             print $indent, "Proposing a new \U$tag\E under ",
283             join( '/', map $_->{'_tag'}, reverse( $pos, @lineage ) )
284             || 'Root',
285             ".\n";
286              
287             #} else {
288             # $indent = ' ';
289             }
290              
291             #print $indent, "POS: $pos ($ptag)\n" if DEBUG > 2;
292             # $attr = {%$attr};
293              
294 594         1183 foreach my $k ( keys %$attr ) {
295              
296             # Make sure some stooge doesn't have "".
297             # That happens every few million Web pages.
298 538 50 33     1879 $attr->{ ' ' . $k } = delete $attr->{$k}
299             if length $k and substr( $k, 0, 1 ) eq '_';
300              
301             # Looks bad, but is fine for round-tripping.
302             }
303              
304 594         1203 my $e = $self->element_class->new( $tag, %$attr );
305              
306             # Make a new element object.
307             # (Only rarely do we end up just throwing it away later in this call.)
308              
309             # Some prep -- custom messiness for those damned tables, and strict P's.
310 594 50       1083 if ( $self->{'_implicit_tags'} ) { # wallawallawalla!
311              
312 594 100       1001 unless ( $HTML::DOM::_TreeBuilder::isTableElement{$tag} ) {
313 549 50       1043 if ( $ptag eq 'table' ) {
    50          
314 0         0 print $indent,
315             " * Phrasal \U$tag\E right under TABLE makes implicit TR and TD\n"
316             if DEBUG > 1;
317 0         0 $self->insert_element( 'tr', 1 );
318 0         0 $pos = $self->insert_element( 'td', 1 )
319             ; # yes, needs updating
320             }
321             elsif ( $ptag eq 'tr' ) {
322 0         0 print $indent,
323             " * Phrasal \U$tag\E right under TR makes an implicit TD\n"
324             if DEBUG > 1;
325 0         0 $pos = $self->insert_element( 'td', 1 )
326             ; # yes, needs updating
327             }
328 549         653 $ptag = $pos->{'_tag'}; # yes, needs updating
329             }
330              
331             # end of table-implication block.
332              
333             # Now maybe do a little dance to enforce P-strictness.
334             # This seems like it should be integrated with the big
335             # "ALL HOPE..." block, further below, but that doesn't
336             # seem feasable.
337 594 0 33     926 if ( $self->{'_p_strict'}
      0        
338             and $HTML::DOM::_TreeBuilder::isKnown{$tag}
339             and not $HTML::Tagset::is_Possible_Strict_P_Content{$tag} )
340             {
341 0         0 my $here = $pos;
342 0         0 my $here_tag = $ptag;
343 0         0 while (1) {
344 0 0       0 if ( $here_tag eq 'p' ) {
345 0         0 print $indent, " * Inserting $tag closes strict P.\n"
346             if DEBUG > 1;
347 0         0 $self->end( \q{p} );
348              
349             # NB: same as \'q', but less confusing to emacs cperl-mode
350 0         0 last;
351             }
352              
353             #print("Lasting from $here_tag\n"),
354             last
355             if $HTML::DOM::_TreeBuilder::isKnown{$here_tag}
356             and
357             not $HTML::Tagset::is_Possible_Strict_P_Content{
358 0 0 0     0 $here_tag};
359              
360             # Don't keep looking up the tree if we see something that can't
361             # be strict-P content.
362              
363             $here_tag
364 0   0     0 = ( $here = $here->{'_parent'} || last )->{'_tag'};
365             } # end while
366             $ptag = ( $pos = $self->{'_pos'} || $self )
367 0   0     0 ->{'_tag'}; # better update!
368             }
369              
370             # end of strict-p block.
371             }
372              
373             # And now, get busy...
374             #----------------------------------------------------------------------
375 594 50       1738 if ( !$self->{'_implicit_tags'} ) { # bimskalabim
    100          
    100          
    100          
    100          
    100          
    100          
    50          
    50          
    50          
376             # do nothing
377 0         0 print $indent, " * _implicit_tags is off. doing nothing\n"
378             if DEBUG > 1;
379              
380             #----------------------------------------------------------------------
381             }
382             elsif ( $HTML::DOM::_TreeBuilder::isHeadOrBodyElement{$tag} ) {
383 19 100       56 if ( $pos->is_inside('body') ) { # all is well
    100          
384 9         12 print $indent,
385             " * ambilocal element \U$tag\E is fine under BODY.\n"
386             if DEBUG > 1;
387             }
388             elsif ( $pos->is_inside('head') ) {
389 4         9 print $indent,
390             " * ambilocal element \U$tag\E is fine under HEAD.\n"
391             if DEBUG > 1;
392             }
393             else {
394              
395             # In neither head nor body! mmmmm... put under head?
396              
397 6 50       13 if ( $ptag eq 'html' ) { # expected case
398             # TODO?? : would there ever be a case where _head would be
399             # absent from a tree that would ever be accessed at this
400             # point?
401 6 50       15 die "Where'd my head go?" unless ref $self->{'_head'};
402 6 50       13 if ( $self->{'_head'}{'_implicit'} ) {
403 6         4 print $indent,
404             " * ambilocal element \U$tag\E makes an implicit HEAD.\n"
405             if DEBUG > 1;
406              
407             # or rather, points us at it.
408             $self->{'_pos'}
409 6         12 = $self->{'_head'}; # to insert under...
410             }
411             else {
412 0         0 $self->warning(
413             "Ambilocal element <$tag> not under HEAD or BODY!?"
414             );
415              
416             # Put it under HEAD by default, I guess
417             $self->{'_pos'}
418 0         0 = $self->{'_head'}; # to insert under...
419             }
420              
421             }
422             else {
423              
424             # Neither under head nor body, nor right under html... pass thru?
425 0         0 $self->warning(
426             "Ambilocal element <$tag> neither under head nor body, nor right under html!?"
427             );
428             }
429             }
430              
431             #----------------------------------------------------------------------
432             }
433             elsif ( $HTML::DOM::_TreeBuilder::isBodyElement{$tag} ) {
434              
435             # Ensure that we are within
436 493 100 66     1764 if ( $ptag eq 'body' ) {
    100          
    100          
    50          
437              
438             # We're good.
439             }
440             elsif (
441             $HTML::DOM::_TreeBuilder::isBodyElement{$ptag} # glarg
442             and not $HTML::DOM::_TreeBuilder::isHeadOrBodyElement{$ptag}
443             )
444             {
445              
446             # Special case: Save ourselves a call to is_inside further down.
447             # If our $ptag is an isBodyElement element (but not an
448             # isHeadOrBodyElement element), then we must be under body!
449 345         259 print $indent, " * Inferring that $ptag is under BODY.\n",
450             if DEBUG > 3;
451              
452             # I think this and the test for 'body' trap everything
453             # bodyworthy, except the case where the parent element is
454             # under an unknown element that's a descendant of body.
455             }
456             elsif ( $pos->is_inside('head') ) {
457 12         13 print $indent,
458             " * body-element \U$tag\E minimizes HEAD, makes implicit BODY.\n"
459             if DEBUG > 1;
460             $ptag = (
461             $pos = $self->{'_pos'}
462             = $self->{'_body'} # yes, needs updating
463             || die "Where'd my body go?"
464 12   50     37 )->{'_tag'}; # yes, needs updating
465             }
466             elsif ( !$pos->is_inside('body') ) {
467 73         57 print $indent,
468             " * body-element \U$tag\E makes implicit BODY.\n"
469             if DEBUG > 1;
470             $ptag = (
471             $pos = $self->{'_pos'}
472             = $self->{'_body'} # yes, needs updating
473             || die "Where'd my body go?"
474 73   50     206 )->{'_tag'}; # yes, needs updating
475             }
476              
477             # else we ARE under body, so okay.
478              
479             # Handle implicit endings and insert based on and position
480             # ... ALL HOPE ABANDON ALL YE WHO ENTER HERE ...
481 493 100 100     6585 if ( $tag eq 'p'
    100 66        
    100 66        
    50 33        
    100 33        
    100 33        
    100 66        
      66        
      66        
      33        
482             or $tag eq 'h1'
483             or $tag eq 'h2'
484             or $tag eq 'h3'
485             or $tag eq 'h4'
486             or $tag eq 'h5'
487             or $tag eq 'h6'
488             or $tag eq 'form'
489              
490             # Hm, should
really be here?!
491             )
492             {
493              
494             # Can't have

, or inside these

495 162         368 $self->end(
496             $_Closed_by_structurals,
497             @HTML::DOM::_TreeBuilder::p_closure_barriers
498              
499             # used to be just li!
500             );
501              
502             }
503             elsif ( $tag eq 'ol' or $tag eq 'ul' or $tag eq 'dl' ) {
504              
505             # Can't have lists inside -- in the unlikely
506             # event anyone tries to put them there!
507 2 50 33     37 if ( $ptag eq 'h1'
      33        
      33        
      33        
      33        
508             or $ptag eq 'h2'
509             or $ptag eq 'h3'
510             or $ptag eq 'h4'
511             or $ptag eq 'h5'
512             or $ptag eq 'h6' )
513             {
514 0         0 $self->end( \$ptag );
515             }
516              
517             # TODO: Maybe keep closing up the tree until
518             # the ptag isn't any of the above?
519             # But anyone that says

    ...

520             # deserves what they get anyway.
521              
522             }
523             elsif ( $tag eq 'li' ) { # list item
524             # Get under a list tag, one way or another
525 8 50 33     24 unless (
526             exists $HTML::DOM::_TreeBuilder::isList{$ptag}
527             or $self->end( \q{*}, keys %HTML::DOM::_TreeBuilder::isList ) #'
528             )
529             {
530 0         0 print $indent,
531             " * inserting implicit UL for lack of containing ",
532             join( '|', keys %HTML::DOM::_TreeBuilder::isList ), ".\n"
533             if DEBUG > 1;
534 0         0 $self->insert_element( 'ul', 1 );
535             }
536              
537             }
538             elsif ( $tag eq 'dt' or $tag eq 'dd' ) {
539              
540             # Get under a DL, one way or another
541 0 0 0     0 unless ( $ptag eq 'dl' or $self->end( \q{*}, 'dl' ) ) { #'
542 0         0 print $indent,
543             " * inserting implicit DL for lack of containing DL.\n"
544             if DEBUG > 1;
545 0         0 $self->insert_element( 'dl', 1 );
546             }
547              
548             }
549             elsif ( $HTML::DOM::_TreeBuilder::isFormElement{$tag} ) {
550 141 50 33     271 if ($self->{
551             '_ignore_formies_outside_form'} # TODO: document this
552             and not $pos->is_inside('form')
553             )
554             {
555 0         0 print $indent,
556             " * ignoring \U$tag\E because not in a FORM.\n"
557             if DEBUG > 1;
558 0         0 return; # bypass tweaking.
559             }
560 141 100       270 if ( $tag eq 'option' ) {
561              
562             # return unless $ptag eq 'select';
563 45         127 $self->end( \q{option} );
564 45   33     93 $ptag = ( $self->{'_pos'} || $self )->{'_tag'};
565 45 50 33     113 unless ( $ptag eq 'select' or $ptag eq 'optgroup' ) {
566 0         0 print $indent,
567             " * \U$tag\E makes an implicit SELECT.\n"
568             if DEBUG > 1;
569 0         0 $pos = $self->insert_element( 'select', 1 );
570              
571             # but not a very useful select -- has no 'name' attribute!
572             # is $pos's value used after this?
573             }
574             }
575             }
576             elsif ( $HTML::DOM::_TreeBuilder::isTableElement{$tag} ) {
577 45 50       133 if ( !$pos->is_inside('table') ) {
578 0         0 print $indent, " * \U$tag\E makes an implicit TABLE\n"
579             if DEBUG > 1;
580 0         0 $self->insert_element( 'table', 1 );
581             }
582              
583 45 100 100     128 if ( $tag eq 'td' or $tag eq 'th' ) {
584              
585             # Get under a tr one way or another
586 22 50 66     68 unless (
587             $ptag eq 'tr' # either under a tr
588             or $self->end( \q{*}, 'tr',
589             'table' ) #or we can get under one
590             )
591             {
592 0         0 print $indent,
593             " * \U$tag\E under \U$ptag\E makes an implicit TR\n"
594             if DEBUG > 1;
595 0         0 $self->insert_element( 'tr', 1 );
596              
597             # presumably pos's value isn't used after this.
598             }
599             }
600             else {
601 23         55 $self->end( \$tag, 'table' ); #'
602             }
603              
604             # Hmm, I guess this is right. To work it out:
605             # tr closes any open tr (limited at a table)
606             # thead closes any open thead (limited at a table)
607             # tbody closes any open tbody (limited at a table)
608             # tfoot closes any open tfoot (limited at a table)
609             # colgroup closes any open colgroup (limited at a table)
610             # col can try, but will always fail, at the enclosing table,
611             # as col is empty, and therefore never open!
612             # But!
613             # td closes any open td OR th (limited at a table)
614             # th closes any open th OR td (limited at a table)
615             # ...implementable as "close to a tr, or make a tr"
616              
617             }
618             elsif ( $HTML::DOM::_TreeBuilder::isPhraseMarkup{$tag} ) {
619 61 50 66     149 if ( $ptag eq 'body' and $self->{'_implicit_body_p_tag'} ) {
620 0         0 print
621             " * Phrasal \U$tag\E right under BODY makes an implicit P\n"
622             if DEBUG > 1;
623 0         0 $pos = $self->insert_element( 'p', 1 );
624              
625             # is $pos's value used after this?
626             }
627             }
628              
629             # End of implicit endings logic
630              
631             # End of "elsif ($HTML::DOM::_TreeBuilder::isBodyElement{$tag}"
632             #----------------------------------------------------------------------
633              
634             }
635             elsif ( $HTML::DOM::_TreeBuilder::isHeadElement{$tag} ) {
636 42 50       124 if ( $pos->is_inside('body') ) {
    100          
637 0         0 print $indent, " * head element \U$tag\E found inside BODY!\n"
638             if DEBUG;
639 0         0 $self->warning("Header element <$tag> in body"); # [sic]
640             }
641             elsif ( !$pos->is_inside('head') ) {
642 21         23 print $indent,
643             " * head element \U$tag\E makes an implicit HEAD.\n"
644             if DEBUG > 1;
645             }
646             else {
647 21         13 print $indent,
648             " * head element \U$tag\E goes inside existing HEAD.\n"
649             if DEBUG > 1;
650             }
651 42   50     102 $self->{'_pos'} = $self->{'_head'} || die "Where'd my head go?";
652              
653             #----------------------------------------------------------------------
654             }
655             elsif ( $tag eq 'html' ) {
656 11 50       22 if ( delete $self->{'_implicit'} ) { # first time here
657 11         11 print $indent, " * good! found the real HTML element!\n"
658             if DEBUG > 1;
659             }
660             else {
661 0         0 print $indent, " * Found a second HTML element\n"
662             if DEBUG;
663 0         0 $self->warning("Found a nested element");
664             }
665              
666             # in either case, migrate attributes to the real element
667 11         22 for ( keys %$attr ) {
668 7         16 $self->attr( $_, $attr->{$_} );
669             }
670 11         24 $self->{'_pos'} = undef;
671 11         39 return $self; # bypass tweaking.
672              
673             #----------------------------------------------------------------------
674             }
675             elsif ( $tag eq 'head' ) {
676 5   50     16 my $head = $self->{'_head'} || die "Where'd my head go?";
677 5 50       18 if ( delete $head->{'_implicit'} ) { # first time here
678 5         3 print $indent, " * good! found the real HEAD element!\n"
679             if DEBUG > 1;
680             }
681             else { # been here before
682 0         0 print $indent, " * Found a second HEAD element\n"
683             if DEBUG;
684 0         0 $self->warning("Found a second element");
685             }
686              
687             # in either case, migrate attributes to the real element
688 5         15 for ( keys %$attr ) {
689 0         0 $head->attr( $_, $attr->{$_} );
690             }
691 5         18 return $self->{'_pos'} = $head; # bypass tweaking.
692              
693             #----------------------------------------------------------------------
694             }
695             elsif ( $tag eq 'body' ) {
696 22   50     62 my $body = $self->{'_body'} || die "Where'd my body go?";
697 22 50       40 if ( delete $body->{'_implicit'} ) { # first time here
698 22         16 print $indent, " * good! found the real BODY element!\n"
699             if DEBUG > 1;
700             }
701             else { # been here before
702 0         0 print $indent, " * Found a second BODY element\n"
703             if DEBUG;
704 0         0 $self->warning("Found a second element");
705             }
706              
707             # in either case, migrate attributes to the real element
708 22         45 for ( keys %$attr ) {
709 21         43 $body->attr( $_, $attr->{$_} );
710             }
711 22 100       58 $self->{'_pos'} = $body unless $pos->is_inside('body');
712 22         81 return $body; # bypass tweaking.
713              
714             #----------------------------------------------------------------------
715             }
716             elsif ( $tag eq 'frameset' ) {
717 0 0 0     0 if (!( $self->{'_frameset_seen'}++ ) # first frameset seen
      0        
718             and !$self->{'_noframes_seen'}
719              
720             # otherwise it'll be under the noframes already
721             and !$self->is_inside('body')
722             )
723             {
724              
725             # The following is a bit of a hack. We don't use the normal
726             # insert_element because 1) we don't want it as _pos, but instead
727             # right under $self, and 2), more importantly, that we don't want
728             # this inserted at the /end/ of $self's content_list, but instead
729             # in the middle of it, specifiaclly right before the body element.
730             #
731 0   0     0 my $c = $self->{'_content'} || die "Contentless root?";
732 0   0     0 my $body = $self->{'_body'} || die "Where'd my BODY go?";
733 0         0 for ( my $i = 0; $i < @$c; ++$i ) {
734 0 0       0 if ( $c->[$i] eq $body ) {
735 0         0 splice( @$c, $i, 0, $self->{'_pos'} = $pos = $e );
736 0         0 $e->{'_parent'} = $self;
737 0         0 $already_inserted = 1;
738 0         0 print $indent,
739             " * inserting 'frameset' right before BODY.\n"
740             if DEBUG > 1;
741 0         0 last;
742             }
743             }
744 0 0       0 die "BODY not found in children of root?"
745             unless $already_inserted;
746             }
747              
748             }
749             elsif ( $tag eq 'frame' ) {
750              
751             # Okay, fine, pass thru.
752             # Should probably enforce that these should be under a frameset.
753             # But hey. Ditto for enforcing that 'noframes' should be under
754             # a 'frameset', as the DTDs say.
755              
756             }
757             elsif ( $tag eq 'noframes' ) {
758              
759             # This basically assumes there'll be exactly one 'noframes' element
760             # per document. At least, only the first one gets to have the
761             # body under it. And if there are no noframes elements, then
762             # the body pretty much stays where it is. Is that ever a problem?
763 0 0       0 if ( $self->{'_noframes_seen'}++ ) {
764 0         0 print $indent, " * ANOTHER noframes element?\n" if DEBUG;
765             }
766             else {
767 0 0       0 if ( $pos->is_inside('body') ) {
768 0         0 print $indent, " * 'noframes' inside 'body'. Odd!\n"
769             if DEBUG;
770              
771             # In that odd case, we /can't/ make body a child of 'noframes',
772             # because it's an ancestor of the 'noframes'!
773             }
774             else {
775 0   0     0 $e->push_content( $self->{'_body'}
776             || die "Where'd my body go?" );
777 0         0 print $indent, " * Moving body to be under noframes.\n"
778             if DEBUG;
779             }
780             }
781              
782             #----------------------------------------------------------------------
783             }
784             else {
785              
786             # unknown tag
787 2 50       4 if ( $self->{'_ignore_unknown'} ) {
788 2         10 print $indent, " * Ignoring unknown tag \U$tag\E\n" if DEBUG;
789 2         13 $self->warning("Skipping unknown tag $tag");
790 2         6 return;
791             }
792             else {
793 0         0 print $indent, " * Accepting unknown tag \U$tag\E\n"
794             if DEBUG;
795             }
796             }
797              
798             #----------------------------------------------------------------------
799             # End of mumbo-jumbo
800              
801             print $indent, "(Attaching ", $e->{'_tag'}, " under ",
802 554         383 ( $self->{'_pos'} || $self )->{'_tag'}, ")\n"
803              
804             # because if _pos isn't defined, it goes under self
805             if DEBUG;
806              
807             # The following if-clause is to delete /some/ ignorable whitespace
808             # nodes, as we're making the tree.
809             # This'd be a node we'd catch later anyway, but we might as well
810             # nip it in the bud now.
811             # This doesn't catch /all/ deletable WS-nodes, so we do have to call
812             # the tightener later to catch the rest.
813              
814 554 50 33     970 if ( $self->{'_tighten'} and !$self->{'_ignore_text'} )
815             { # if tightenable
816 0         0 my ( $sibs, $par );
817 0 0 0     0 if (( $sibs = ( $par = $self->{'_pos'} || $self )->{'_content'} )
      0        
      0        
      0        
      0        
      0        
      0        
818             and @$sibs # parent already has content
819             and !
820             ref( $sibs->[-1] ) # and the last one there is a text node
821             and $sibs->[-1] !~ m<[^\n\r\f\t ]>s # and it's all whitespace
822              
823             and ( # one of these has to be eligible...
824             $HTML::DOM::_TreeBuilder::canTighten{$tag}
825             or (( @$sibs == 1 )
826             ? # WS is leftmost -- so parent matters
827             $HTML::DOM::_TreeBuilder::canTighten{ $par->{'_tag'} }
828             : # WS is after another node -- it matters
829             ( ref $sibs->[-2]
830             and
831             $HTML::DOM::_TreeBuilder::canTighten{ $sibs->[-2]
832             {'_tag'} }
833             )
834             )
835             )
836              
837             and !$par->is_inside( 'pre', 'xmp', 'textarea', 'plaintext' )
838              
839             # we're clear
840             )
841             {
842 0         0 pop @$sibs;
843 0         0 print $indent, "Popping a preceding all-WS node\n" if DEBUG;
844             }
845             }
846              
847 554 50       1419 $self->insert_element($e) unless $already_inserted;
848              
849 554         394 if (DEBUG) {
850             if ( $self->{'_pos'} ) {
851             print $indent, "(Current lineage of pos: \U$tag\E under ",
852             join(
853             '/',
854             reverse(
855              
856             # $self->{'_pos'}{'_tag'}, # don't list myself!
857             $self->{'_pos'}->lineage_tag_names
858             )
859             ),
860             ".)\n";
861             }
862             else {
863             print $indent, "(Pos points nowhere!?)\n";
864             }
865             }
866              
867 554 100 50     1880 unless ( ( $self->{'_pos'} || '' ) eq $e ) {
868              
869             # if it's an empty element -- i.e., if it didn't change the _pos
870 107         250 &{ $self->{"_tweak_$tag"}
871 107 50 33     532 || $self->{'_tweak_*'}
872             || return $e }( map $_, $e, $tag, $self )
873             ; # make a list so the user can't clobber
874             }
875              
876 554         1105 return $e;
877             }
878             }
879              
880             #==========================================================================
881              
882             {
883             my $indent;
884              
885             sub end {
886 601 50   601 1 873 return if $_[0]{'_stunted'};
887              
888             # Either: Acccept an end-tag signal from HTML::Parser
889             # Or: Method for closing currently open elements in some fairly complex
890             # way, as used by other methods in this class.
891 601         999 my ( $self, $tag, @stop ) = @_;
892 601 50       912 if ( $tag eq 'x-html' ) {
893 0         0 print "Ignoring close-x-html tag.\n" if DEBUG;
894              
895             # inserted by some lame code-generators.
896 0         0 return;
897             }
898              
899 601 50 66     2128 unless ( ref($tag) or $tag =~ m/^[-_a-zA-Z0-9:%]+$/s ) {
900 0         0 DEBUG and print "End-tag name $tag is no good. Skipping.\n";
901 0         0 return;
902              
903             # This avoids having Element's new() throw an exception.
904             }
905              
906             # This method accepts two calling formats:
907             # 1) from Parser: $self->end('tag_name', 'origtext')
908             # in which case we shouldn't mistake origtext as a blocker tag
909             # 2) from myself: $self->end(\q{tagname1}, 'blk1', ... )
910             # from myself: $self->end(['tagname1', 'tagname2'], 'blk1', ... )
911              
912             # End the specified tag, but don't move above any of the blocker tags.
913             # The tag can also be a reference to an array. Terminate the first
914             # tag found.
915              
916 601   66     1164 my $ptag = ( my $p = $self->{'_pos'} || $self )->{'_tag'};
917              
918             # $p and $ptag are sort-of stratch
919              
920 601 100       740 if ( ref($tag) ) {
921              
922             # First param is a ref of one sort or another --
923             # THE CALL IS COMING FROM INSIDE THE HOUSE!
924 244 100       402 $tag = $$tag if ref($tag) eq 'SCALAR';
925              
926             # otherwise it's an arrayref.
927             }
928             else {
929              
930             # the call came from Parser -- just ignore origtext
931             # except in a table ignore unmatched table tags RT #59980
932 357 100       619 @stop = $tag =~ /^t[hdr]\z/ ? 'table' : ();
933             }
934              
935             #my($indent);
936 601         399 if (DEBUG) {
937              
938             # optimization -- don't figure out depth unless we're in debug mode
939             my @lineage_tags = $p->lineage_tag_names;
940             $indent = ' ' x ( 1 + @lineage_tags );
941              
942             # now announce ourselves
943             print $indent, "Ending ",
944             ref($tag) ? ( '[', join( ' ', @$tag ), ']' ) : "\U$tag\E",
945             scalar(@stop)
946             ? ( " no higher than [", join( ' ', @stop ), "]" )
947             : (), ".\n";
948              
949             print $indent, " (Current lineage: ", join( '/', @lineage_tags ),
950             ".)\n"
951             if DEBUG > 1;
952              
953             if ( DEBUG > 3 ) {
954              
955             #my(
956             # $package, $filename, $line, $subroutine,
957             # $hasargs, $wantarray, $evaltext, $is_require) = caller;
958             print $indent,
959             " (Called from ", ( caller(1) )[3], ' line ',
960             ( caller(1) )[2],
961             ")\n";
962             }
963              
964             #} else {
965             # $indent = ' ';
966             }
967              
968             # End of if DEBUG
969              
970             # Now actually do it
971 601         434 my @to_close;
972 601 100       1019 if ( $tag eq '*' ) {
    100          
973              
974             # Special -- close everything up to (but not including) the first
975             # limiting tag, or return if none found. Somewhat of a special case.
976             PARENT:
977 8         14 while ( defined $p ) {
978 21         25 $ptag = $p->{'_tag'};
979 21         15 print $indent, " (Looking at $ptag.)\n" if DEBUG > 2;
980 21         23 for (@stop) {
981 34 100       52 if ( $ptag eq $_ ) {
982 8         9 print $indent,
983             " (Hit a $_; closing everything up to here.)\n"
984             if DEBUG > 2;
985 8         13 last PARENT;
986             }
987             }
988 13         14 push @to_close, $p;
989 13         16 $p = $p->{'_parent'}; # no match so far? keep moving up
990             print $indent,
991 13         18 " (Moving on up to ", $p ? $p->{'_tag'} : 'nil', ")\n"
992             if DEBUG > 1;
993             }
994 8 50       13 unless ( defined $p ) { # We never found what we were looking for.
995 0         0 print $indent, " (We never found a limit.)\n" if DEBUG > 1;
996 0         0 return;
997             }
998              
999             #print
1000             # $indent,
1001             # " (To close: ", join('/', map $_->tag, @to_close), ".)\n"
1002             # if DEBUG > 4;
1003              
1004             # Otherwise update pos and fall thru.
1005 8         12 $self->{'_pos'} = $p;
1006             }
1007             elsif ( ref $tag ) {
1008              
1009             # Close the first of any of the matching tags, giving up if you hit
1010             # any of the stop-tags.
1011             PARENT:
1012 162         253 while ( defined $p ) {
1013 285         291 $ptag = $p->{'_tag'};
1014 285         252 print $indent, " (Looking at $ptag.)\n" if DEBUG > 2;
1015 285         327 for (@$tag) {
1016 2045 100       2485 if ( $ptag eq $_ ) {
1017 65         47 print $indent, " (Closing $_.)\n" if DEBUG > 2;
1018 65         83 last PARENT;
1019             }
1020             }
1021 220         206 for (@stop) {
1022 3270 100       3898 if ( $ptag eq $_ ) {
1023 33         20 print $indent,
1024             " (Hit a limiting $_ -- bailing out.)\n"
1025             if DEBUG > 1;
1026 33         92 return; # so it was all for naught
1027             }
1028             }
1029 187         158 push @to_close, $p;
1030 187         265 $p = $p->{'_parent'};
1031             }
1032 129 100       362 return unless defined $p; # We went off the top of the tree.
1033             # Otherwise specified element was found; set pos to its parent.
1034 65         58 push @to_close, $p;
1035 65         71 $self->{'_pos'} = $p->{'_parent'};
1036             }
1037             else {
1038              
1039             # Close the first of the specified tag, giving up if you hit
1040             # any of the stop-tags.
1041 431         640 while ( defined $p ) {
1042 745         693 $ptag = $p->{'_tag'};
1043 745         468 print $indent, " (Looking at $ptag.)\n" if DEBUG > 2;
1044 745 100       975 if ( $ptag eq $tag ) {
1045 388         281 print $indent, " (Closing $tag.)\n" if DEBUG > 2;
1046 388         388 last;
1047             }
1048 357         441 for (@stop) {
1049 35 100       57 if ( $ptag eq $_ ) {
1050 23         17 print $indent,
1051             " (Hit a limiting $_ -- bailing out.)\n"
1052             if DEBUG > 1;
1053 23         53 return; # so it was all for naught
1054             }
1055             }
1056 334         334 push @to_close, $p;
1057 334         479 $p = $p->{'_parent'};
1058             }
1059 408 100       598 return unless defined $p; # We went off the top of the tree.
1060             # Otherwise specified element was found; set pos to its parent.
1061 388         341 push @to_close, $p;
1062 388         416 $self->{'_pos'} = $p->{'_parent'};
1063             }
1064              
1065 461 100 100     1390 $self->{'_pos'} = undef if $self eq ( $self->{'_pos'} || '' );
1066             print $indent, "(Pos now points to ",
1067 461         301 $self->{'_pos'} ? $self->{'_pos'}{'_tag'} : '???', ".)\n"
1068             if DEBUG > 1;
1069              
1070             ### EXPENSIVE, because has to check that it's not under a pre
1071             ### or a CDATA-parent. That's one more method call per end()!
1072             ### Might as well just do this at the end of the tree-parse, I guess,
1073             ### at which point we'd be parsing top-down, and just not traversing
1074             ### under pre's or CDATA-parents.
1075             ##
1076             ## Take this opportunity to nix any terminal whitespace nodes.
1077             ## TODO: consider whether this (plus the logic in start(), above)
1078             ## would ever leave any WS nodes in the tree.
1079             ## If not, then there's no reason to have eof() call
1080             ## delete_ignorable_whitespace on the tree, is there?
1081             ##
1082             #if(@to_close and $self->{'_tighten'} and !$self->{'_ignore_text'} and
1083             # ! $to_close[-1]->is_inside('pre', keys %HTML::Tagset::isCDATA_Parent)
1084             #) { # if tightenable
1085             # my($children, $e_tag);
1086             # foreach my $e (reverse @to_close) { # going top-down
1087             # last if 'pre' eq ($e_tag = $e->{'_tag'}) or
1088             # $HTML::Tagset::isCDATA_Parent{$e_tag};
1089             #
1090             # if(
1091             # $children = $e->{'_content'}
1092             # and @$children # has children
1093             # and !ref($children->[-1])
1094             # and $children->[-1] =~ m<^\s+$>s # last node is all-WS
1095             # and
1096             # (
1097             # # has a tightable parent:
1098             # $HTML::DOM::_TreeBuilder::canTighten{ $e_tag }
1099             # or
1100             # ( # has a tightenable left sibling:
1101             # @$children > 1 and
1102             # ref($children->[-2])
1103             # and $HTML::DOM::_TreeBuilder::canTighten{ $children->[-2]{'_tag'} }
1104             # )
1105             # )
1106             # ) {
1107             # pop @$children;
1108             # #print $indent, "Popping a terminal WS node from ", $e->{'_tag'},
1109             # # " (", $e->address, ") while exiting.\n" if DEBUG;
1110             # }
1111             # }
1112             #}
1113              
1114 461         474 foreach my $e (@to_close) {
1115              
1116             # Call the applicable callback, if any
1117 693         610 $ptag = $e->{'_tag'};
1118 693         1231 &{ $self->{"_tweak_$ptag"}
1119 693 50 66     2767 || $self->{'_tweak_*'}
1120             || next }( map $_, $e, $ptag, $self );
1121 692         616 print $indent, "Back from tweaking.\n" if DEBUG;
1122             last
1123 692 50       1402 if $self->{ '_stunted'
1124             }; # in case one of the handlers called stunt
1125             }
1126 460         1166 return @to_close;
1127             }
1128             }
1129              
1130             #==========================================================================
1131             {
1132             my ( $indent, $nugget );
1133              
1134             sub text {
1135 547 50   547 1 907 return if $_[0]{'_stunted'};
1136              
1137             # Accept a "here's a text token" signal from HTML::Parser.
1138 547         637 my ( $self, $text, $is_cdata ) = @_;
1139              
1140             # the >3.0 versions of Parser may pass a cdata node.
1141             # Thanks to Gisle Aas for pointing this out.
1142              
1143 547 50       862 return unless length $text; # I guess that's always right
1144              
1145 547         513 my $ignore_text = $self->{'_ignore_text'};
1146 547         411 my $no_space_compacting = $self->{'_no_space_compacting'};
1147 547         443 my $no_expand_entities = $self->{'_no_expand_entities'};
1148 547   66     886 my $pos = $self->{'_pos'} || $self;
1149              
1150             HTML::Entities::decode($text)
1151             unless $ignore_text
1152             || $is_cdata
1153 547 100 66     3979 || $HTML::Tagset::isCDATA_Parent{ $pos->{'_tag'} }
      100        
      66        
1154             || $no_expand_entities;
1155              
1156             #my($indent, $nugget);
1157 547         385 if (DEBUG) {
1158              
1159             # optimization -- don't figure out depth unless we're in debug mode
1160             my @lineage_tags = $pos->lineage_tag_names;
1161             $indent = ' ' x ( 1 + @lineage_tags );
1162              
1163             $nugget
1164             = ( length($text) <= 25 )
1165             ? $text
1166             : ( substr( $text, 0, 25 ) . '...' );
1167             $nugget =~ s<([\x00-\x1F])>
1168             <'\\x'.(unpack("H2",$1))>eg;
1169             print $indent, "Proposing a new text node ($nugget) under ",
1170             join( '/', reverse( $pos->{'_tag'}, @lineage_tags ) )
1171             || 'Root',
1172             ".\n";
1173              
1174             #} else {
1175             # $indent = ' ';
1176             }
1177              
1178 547         386 my $ptag;
1179 547 100 100     1775 if ($HTML::Tagset::isCDATA_Parent{ $ptag = $pos->{'_tag'} }
1180              
1181             #or $pos->is_inside('pre')
1182             or $pos->is_inside( 'pre', 'textarea' )
1183             )
1184             {
1185 15 50       37 return if $ignore_text;
1186 15         45 $pos->push_content($text);
1187             }
1188             else {
1189              
1190             # return unless $text =~ /\S/; # This is sometimes wrong
1191              
1192 532 100 66     3169 if ( !$self->{'_implicit_tags'} || $text !~ /[^\n\r\f\t ]/ ) {
    100 66        
    100          
    100          
    50          
    50          
1193              
1194             # don't change anything
1195             }
1196             elsif ( $ptag eq 'head' or $ptag eq 'noframes' ) {
1197 2 50       4 if ( $self->{'_implicit_body_p_tag'} ) {
1198 0         0 print $indent,
1199             " * Text node under \U$ptag\E closes \U$ptag\E, implicates BODY and P.\n"
1200             if DEBUG > 1;
1201 0         0 $self->end( \$ptag );
1202             $pos = $self->{'_body'}
1203             ? ( $self->{'_pos'}
1204 0 0       0 = $self->{'_body'} ) # expected case
1205             : $self->insert_element( 'body', 1 );
1206 0         0 $pos = $self->insert_element( 'p', 1 );
1207             }
1208             else {
1209 2         1 print $indent,
1210             " * Text node under \U$ptag\E closes, implicates BODY.\n"
1211             if DEBUG > 1;
1212 2         7 $self->end( \$ptag );
1213             $pos = $self->{'_body'}
1214             ? ( $self->{'_pos'}
1215 2 50       7 = $self->{'_body'} ) # expected case
1216             : $self->insert_element( 'body', 1 );
1217             }
1218             }
1219             elsif ( $ptag eq 'html' ) {
1220 4 50       12 if ( $self->{'_implicit_body_p_tag'} ) {
1221 0         0 print $indent,
1222             " * Text node under HTML implicates BODY and P.\n"
1223             if DEBUG > 1;
1224             $pos = $self->{'_body'}
1225             ? ( $self->{'_pos'}
1226 0 0       0 = $self->{'_body'} ) # expected case
1227             : $self->insert_element( 'body', 1 );
1228 0         0 $pos = $self->insert_element( 'p', 1 );
1229             }
1230             else {
1231 4         3 print $indent,
1232             " * Text node under HTML implicates BODY.\n"
1233             if DEBUG > 1;
1234             $pos = $self->{'_body'}
1235             ? ( $self->{'_pos'}
1236 4 50       12 = $self->{'_body'} ) # expected case
1237             : $self->insert_element( 'body', 1 );
1238              
1239             #print "POS is $pos, ", $pos->{'_tag'}, "\n";
1240             }
1241             }
1242             elsif ( $ptag eq 'body' ) {
1243 4 50       14 if ( $self->{'_implicit_body_p_tag'} ) {
1244 0         0 print $indent, " * Text node under BODY implicates P.\n"
1245             if DEBUG > 1;
1246 0         0 $pos = $self->insert_element( 'p', 1 );
1247             }
1248             }
1249             elsif ( $ptag eq 'table' ) {
1250 0         0 print $indent,
1251             " * Text node under TABLE implicates TR and TD.\n"
1252             if DEBUG > 1;
1253 0         0 $self->insert_element( 'tr', 1 );
1254 0         0 $pos = $self->insert_element( 'td', 1 );
1255              
1256             # double whammy!
1257             }
1258             elsif ( $ptag eq 'tr' ) {
1259 0         0 print $indent, " * Text node under TR implicates TD.\n"
1260             if DEBUG > 1;
1261 0         0 $pos = $self->insert_element( 'td', 1 );
1262             }
1263              
1264             # elsif (
1265             # # $ptag eq 'li' ||
1266             # # $ptag eq 'dd' ||
1267             # $ptag eq 'form') {
1268             # $pos = $self->insert_element('p', 1);
1269             #}
1270              
1271             # Whatever we've done above should have had the side
1272             # effect of updating $self->{'_pos'}
1273              
1274             #print "POS is now $pos, ", $pos->{'_tag'}, "\n";
1275              
1276 532 50       744 return if $ignore_text;
1277 532 50       705 $text =~ s/[\n\r\f\t ]+/ /g # canonical space
1278             unless $no_space_compacting;
1279              
1280             print $indent, " (Attaching text node ($nugget) under ",
1281              
1282             # was: $self->{'_pos'} ? $self->{'_pos'}{'_tag'} : $self->{'_tag'},
1283 532         340 $pos->{'_tag'}, ").\n"
1284             if DEBUG > 1;
1285              
1286 532         1098 $pos->push_content($text);
1287             }
1288              
1289 547 50       1552 &{ $self->{'_tweak_~text'} || return }( $text, $pos,
1290 547         777 $pos->{'_tag'} . '' );
1291              
1292             # Note that this is very exceptional -- it doesn't fall back to
1293             # _tweak_*, and it gives its tweak different arguments.
1294 547         3351 return;
1295             }
1296             }
1297              
1298             #==========================================================================
1299              
1300             # TODO: test whether comment(), declaration(), and process(), do the right
1301             # thing as far as tightening and whatnot.
1302             # Also, currently, doctypes and comments that appear before head or body
1303             # show up in the tree in the wrong place. Something should be done about
1304             # this. Tricky. Maybe this whole business of pre-making the body and
1305             # whatnot is wrong.
1306              
1307             sub comment {
1308 7 50   7 1 93 return if $_[0]{'_stunted'};
1309              
1310             # Accept a "here's a comment" signal from HTML::Parser.
1311              
1312 7         11 my ( $self, $text ) = @_;
1313 7   66     22 my $pos = $self->{'_pos'} || $self;
1314             return
1315             unless $self->{'_store_comments'}
1316 7 50 33     23 || $HTML::Tagset::isCDATA_Parent{ $pos->{'_tag'} };
1317              
1318 7         5 if (DEBUG) {
1319             my @lineage_tags = $pos->lineage_tag_names;
1320             my $indent = ' ' x ( 1 + @lineage_tags );
1321              
1322             my $nugget
1323             = ( length($text) <= 25 )
1324             ? $text
1325             : ( substr( $text, 0, 25 ) . '...' );
1326             $nugget =~ s<([\x00-\x1F])>
1327             <'\\x'.(unpack("H2",$1))>eg;
1328             print $indent, "Proposing a Comment ($nugget) under ",
1329             join( '/', reverse( $pos->{'_tag'}, @lineage_tags ) ) || 'Root',
1330             ".\n";
1331             }
1332              
1333 7         23 ( my $e = $self->element_class->new('~comment') )->{'text'} = $text;
1334 7         23 $pos->push_content($e);
1335 7         8 ++( $self->{'_element_count'} );
1336              
1337 7         24 &{ $self->{'_tweak_~comment'}
1338 7 50 33     44 || $self->{'_tweak_*'}
1339             || return $e }( map $_, $e, '~comment', $self );
1340              
1341 7         14 return $e;
1342             }
1343              
1344             sub declaration {
1345 0 0   0 1 0 return if $_[0]{'_stunted'};
1346              
1347             # Accept a "here's a markup declaration" signal from HTML::Parser.
1348              
1349 0         0 my ( $self, $text ) = @_;
1350 0   0     0 my $pos = $self->{'_pos'} || $self;
1351              
1352 0         0 if (DEBUG) {
1353             my @lineage_tags = $pos->lineage_tag_names;
1354             my $indent = ' ' x ( 1 + @lineage_tags );
1355              
1356             my $nugget
1357             = ( length($text) <= 25 )
1358             ? $text
1359             : ( substr( $text, 0, 25 ) . '...' );
1360             $nugget =~ s<([\x00-\x1F])>
1361             <'\\x'.(unpack("H2",$1))>eg;
1362             print $indent, "Proposing a Declaration ($nugget) under ",
1363             join( '/', reverse( $pos->{'_tag'}, @lineage_tags ) ) || 'Root',
1364             ".\n";
1365             }
1366 0         0 ( my $e = $self->element_class->new('~declaration') )->{'text'} = $text;
1367              
1368 0         0 $self->{_decl} = $e;
1369 0         0 return $e;
1370             }
1371              
1372             #==========================================================================
1373              
1374             sub process {
1375 0 0   0 1 0 return if $_[0]{'_stunted'};
1376              
1377             # Accept a "here's a PI" signal from HTML::Parser.
1378              
1379 0 0       0 return unless $_[0]->{'_store_pis'};
1380 0         0 my ( $self, $text ) = @_;
1381 0   0     0 my $pos = $self->{'_pos'} || $self;
1382              
1383 0         0 if (DEBUG) {
1384             my @lineage_tags = $pos->lineage_tag_names;
1385             my $indent = ' ' x ( 1 + @lineage_tags );
1386              
1387             my $nugget
1388             = ( length($text) <= 25 )
1389             ? $text
1390             : ( substr( $text, 0, 25 ) . '...' );
1391             $nugget =~ s<([\x00-\x1F])>
1392             <'\\x'.(unpack("H2",$1))>eg;
1393             print $indent, "Proposing a PI ($nugget) under ",
1394             join( '/', reverse( $pos->{'_tag'}, @lineage_tags ) ) || 'Root',
1395             ".\n";
1396             }
1397 0         0 ( my $e = $self->element_class->new('~pi') )->{'text'} = $text;
1398 0         0 $pos->push_content($e);
1399 0         0 ++( $self->{'_element_count'} );
1400              
1401 0 0 0     0 &{ $self->{'_tweak_~pi'} || $self->{'_tweak_*'} || return $e }( map $_,
  0         0  
1402             $e, '~pi', $self );
1403              
1404 0         0 return $e;
1405             }
1406              
1407             #==========================================================================
1408              
1409             #When you call $tree->parse_file($filename), and the
1410             #tree's ignore_ignorable_whitespace attribute is on (as it is
1411             #by default), HTML::DOM::_TreeBuilder's logic will manage to avoid
1412             #creating some, but not all, nodes that represent ignorable
1413             #whitespace. However, at the end of its parse, it traverses the
1414             #tree and deletes any that it missed. (It does this with an
1415             #around-method around HTML::Parser's eof method.)
1416             #
1417             #However, with $tree->parse($content), the cleanup-traversal step
1418             #doesn't happen automatically -- so when you're done parsing all
1419             #content for a document (regardless of whether $content is the only
1420             #bit, or whether it's just another chunk of content you're parsing into
1421             #the tree), call $tree->eof() to signal that you're at the end of the
1422             #text you're inputting to the tree. Besides properly cleaning any bits
1423             #of ignorable whitespace from the tree, this will also ensure that
1424             #HTML::Parser's internal buffer is flushed.
1425              
1426             sub eof {
1427              
1428             # Accept an "end-of-file" signal from HTML::Parser, or thrown by the user.
1429              
1430 128 100   128 1 274 return if $_[0]->{'_done'}; # we've already been here
1431              
1432 125 50       221 return $_[0]->SUPER::eof() if $_[0]->{'_stunted'};
1433              
1434 125         110 my $x = $_[0];
1435 125         81 print "EOF received.\n" if DEBUG;
1436 125         111 my (@rv);
1437 125 50       175 if (wantarray) {
1438              
1439             # I don't think this makes any difference for this particular
1440             # method, but let's be scrupulous, for once.
1441 0         0 @rv = $x->SUPER::eof();
1442             }
1443             else {
1444 125         464 $rv[0] = $x->SUPER::eof();
1445             }
1446              
1447 125 100 66     592 $x->end('html') unless $x eq ( $x->{'_pos'} || $x );
1448              
1449             # That SHOULD close everything, and will run the appropriate tweaks.
1450             # We /could/ be running under some insane mode such that there's more
1451             # than one HTML element, but really, that's just insane to do anyhow.
1452              
1453 124 50       203 unless ( $x->{'_implicit_tags'} ) {
1454              
1455             # delete those silly implicit head and body in case we put
1456             # them there in implicit tags mode
1457 0         0 foreach my $node ( $x->{'_head'}, $x->{'_body'} ) {
1458             $node->replace_with_content
1459             if defined $node
1460             and ref $node
1461             and $node->{'_implicit'}
1462 0 0 0     0 and $node->{'_parent'};
      0        
      0        
1463              
1464             # I think they should be empty anyhow, since the only
1465             # logic that'd insert under them can apply only, I think,
1466             # in the case where _implicit_tags is on
1467             }
1468              
1469             # this may still leave an implicit 'html' at the top, but there's
1470             # nothing we can do about that, is there?
1471             }
1472              
1473             $x->delete_ignorable_whitespace()
1474              
1475             # this's why we trap this -- an after-method
1476 124 50 33     258 if $x->{'_tighten'} and !$x->{'_ignore_text'};
1477 124         157 $x->{'_done'} = 1;
1478              
1479 124 50       186 return @rv if wantarray;
1480 124         218 return $rv[0];
1481             }
1482              
1483             #==========================================================================
1484              
1485             # TODO: document
1486              
1487             sub stunt {
1488 0     0 0 0 my $self = $_[0];
1489 0         0 print "Stunting the tree.\n" if DEBUG;
1490 0         0 $self->{'_done'} = 1;
1491              
1492 0 0       0 if ( $HTML::Parser::VERSION < 3 ) {
1493              
1494             #This is a MEAN MEAN HACK. And it works most of the time!
1495 0         0 $self->{'_buf'} = '';
1496 0         0 my $fh = *HTML::Parser::F{IO};
1497              
1498             # the local'd FH used by parse_file loop
1499 0 0       0 if ( defined $fh ) {
1500 0         0 print "Closing Parser's filehandle $fh\n" if DEBUG;
1501 0         0 close($fh);
1502             }
1503              
1504             # But if they called $tree->parse_file($filehandle)
1505             # or $tree->parse_file(*IO), then there will be no *HTML::Parser::F{IO}
1506             # to close. Ahwell. Not a problem for most users these days.
1507              
1508             }
1509             else {
1510 0         0 $self->SUPER::eof();
1511              
1512             # Under 3+ versions, calling eof from inside a parse will abort the
1513             # parse / parse_file
1514             }
1515              
1516             # In the off chance that the above didn't work, we'll throw
1517             # this flag to make any future events be no-ops.
1518 0         0 $self->stunted(1);
1519 0         0 return;
1520             }
1521              
1522             # TODO: document
1523 0     0 0 0 sub stunted { shift->_elem( '_stunted', @_ ); }
1524 0     0 0 0 sub done { shift->_elem( '_done', @_ ); }
1525              
1526             #==========================================================================
1527              
1528             sub delete {
1529              
1530             # Override Element's delete method.
1531             # This does most, if not all, of what Element's delete does anyway.
1532             # Deletes content, including content in some special attributes.
1533             # But doesn't empty out the hash.
1534              
1535 0     0 0 0 $_[0]->{'_element_count'} = 1; # never hurts to be scrupulously correct
1536              
1537 0         0 delete @{ $_[0] }{ '_body', '_head', '_pos' };
  0         0  
1538 0         0 for (
1539 0 0       0 @{ delete( $_[0]->{'_content'} ) || [] }, # all/any content
1540              
1541             # delete @{$_[0]}{'_body', '_head', '_pos'}
1542             # ...and these, in case these elements don't appear in the
1543             # content, which is possible. If they did appear (as they
1544             # usually do), then calling $_->delete on them again is harmless.
1545             # I don't think that's such a hot idea now. Thru creative reattachment,
1546             # those could actually now point to elements in OTHER trees (which we do
1547             # NOT want to delete!).
1548             ## Reasoned out:
1549             # If these point to elements not in the content list of any element in this
1550             # tree, but not in the content list of any element in any OTHER tree, then
1551             # just deleting these will make their refcounts hit zero.
1552             # If these point to elements in the content lists of elements in THIS tree,
1553             # then we'll get to deleting them when we delete from the top.
1554             # If these point to elements in the content lists of elements in SOME OTHER
1555             # tree, then they're not to be deleted.
1556             )
1557             {
1558 0 0 0     0 $_->delete
      0        
1559             if defined $_ and ref $_ # Make sure it's an object.
1560             and $_ ne $_[0]; # And avoid hitting myself, just in case!
1561             }
1562              
1563 0 0 0     0 $_[0]->detach if $_[0]->{'_parent'} and $_[0]->{'_parent'}{'_content'};
1564              
1565             # An 'html' element having a parent is quite unlikely.
1566              
1567 0         0 return;
1568             }
1569              
1570             sub tighten_up { # legacy
1571 0     0 0 0 shift->delete_ignorable_whitespace(@_);
1572             }
1573              
1574             sub elementify {
1575              
1576             # Rebless this object down into the normal element class.
1577 93     93 0 90 my $self = $_[0];
1578 93         174 my $to_class = $self->element_class;
1579 93         721 delete @{$self}{
1580             grep {
1581 93         249 ;
1582 2679 100 66     24380 length $_ and substr( $_, 0, 1 ) eq '_'
      100        
      100        
      100        
      100        
      100        
1583              
1584             # The private attributes that we'll retain:
1585             and $_ ne '_tag'
1586             and $_ ne '_parent'
1587             and $_ ne '_content'
1588             and $_ ne '_implicit'
1589             and $_ ne '_pos'
1590             and $_ ne '_element_class'
1591             } keys %$self
1592             };
1593 93         284 bless $self, $to_class; # Returns the same object we were fed
1594             }
1595              
1596             sub element_class {
1597 0 0   0 0 0 return 'HTML::DOM::_Element' if not ref $_[0];
1598 0   0     0 return $_[0]->{_element_class} || 'HTML::DOM::_Element';
1599             }
1600              
1601             #--------------------------------------------------------------------------
1602              
1603             sub guts {
1604 31     31 0 28 my @out;
1605 31         45 my @stack = ( $_[0] );
1606 31         24 my $destructive = $_[1];
1607 31         25 my $this;
1608 31         59 while (@stack) {
1609 135         93 $this = shift @stack;
1610 135 50       239 if ( !ref $this ) {
    100          
1611 0         0 push @out, $this; # yes, it can include text nodes
1612             }
1613             elsif ( !$this->{'_implicit'} ) {
1614 38         33 push @out, $this;
1615 38 50       77 delete $this->{'_parent'} if $destructive;
1616             }
1617             else {
1618              
1619             # it's an implicit node. Delete it and recurse
1620 97 50       114 delete $this->{'_parent'} if $destructive;
1621             unshift @stack,
1622             @{
1623 97         65 ( $destructive
1624             ? delete( $this->{'_content'} )
1625 97 50       347 : $this->{'_content'}
    100          
1626             )
1627             || []
1628             };
1629             }
1630             }
1631              
1632             # Doesn't call a real $root->delete on the (when implicit) root,
1633             # but I don't think it needs to.
1634              
1635 31 50       120 return @out if wantarray; # one simple normal case.
1636 0 0         return unless @out;
1637 0 0 0       return $out[0] if @out == 1 and ref( $out[0] );
1638 0           my $x = HTML::DOM::_Element->new( 'div', '_implicit' => 1 );
1639 0           $x->push_content(@out);
1640 0           return $x;
1641             }
1642              
1643 0     0 0   sub disembowel { $_[0]->guts(1) }
1644              
1645             #--------------------------------------------------------------------------
1646             1;
1647              
1648             __END__