line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package HTML::TreeBuilder; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
# ABSTRACT: Parser that builds a HTML syntax tree |
4
|
|
|
|
|
|
|
|
5
|
16
|
|
|
16
|
|
659682
|
use warnings; |
|
16
|
|
|
|
|
34
|
|
|
16
|
|
|
|
|
508
|
|
6
|
16
|
|
|
16
|
|
74
|
use strict; |
|
16
|
|
|
|
|
29
|
|
|
16
|
|
|
|
|
263
|
|
7
|
16
|
|
|
16
|
|
4434
|
use integer; # vroom vroom! |
|
16
|
|
|
|
|
191
|
|
|
16
|
|
|
|
|
66
|
|
8
|
16
|
|
|
16
|
|
404
|
use Carp (); |
|
16
|
|
|
|
|
32
|
|
|
16
|
|
|
|
|
1879
|
|
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
our $VERSION = '5.07'; # VERSION from OurPkgVersion |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
#--------------------------------------------------------------------------- |
13
|
|
|
|
|
|
|
# Make a 'DEBUG' constant... |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
our $DEBUG; # Must be set BEFORE loading this file |
16
|
|
|
|
|
|
|
BEGIN { |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
# We used to have things like |
19
|
|
|
|
|
|
|
# print $indent, "lalala" if $Debug; |
20
|
|
|
|
|
|
|
# But there were an awful lot of having to evaluate $Debug's value. |
21
|
|
|
|
|
|
|
# If we make that depend on a constant, like so: |
22
|
|
|
|
|
|
|
# sub DEBUG () { 1 } # or whatever value. |
23
|
|
|
|
|
|
|
# ... |
24
|
|
|
|
|
|
|
# print $indent, "lalala" if DEBUG; |
25
|
|
|
|
|
|
|
# Which at compile-time (thru the miracle of constant folding) turns into: |
26
|
|
|
|
|
|
|
# print $indent, "lalala"; |
27
|
|
|
|
|
|
|
# or, if DEBUG is a constant with a true value, then that print statement |
28
|
|
|
|
|
|
|
# is simply optimized away, and doesn't appear in the target code at all. |
29
|
|
|
|
|
|
|
# If you don't believe me, run: |
30
|
|
|
|
|
|
|
# perl -MO=Deparse,-uHTML::TreeBuilder -e 'BEGIN { \ |
31
|
|
|
|
|
|
|
# $HTML::TreeBuilder::DEBUG = 4} use HTML::TreeBuilder' |
32
|
|
|
|
|
|
|
# and see for yourself (substituting whatever value you want for $DEBUG |
33
|
|
|
|
|
|
|
# there). |
34
|
|
|
|
|
|
|
## no critic |
35
|
16
|
50
|
|
16
|
|
173
|
if ( defined &DEBUG ) { |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
# Already been defined! Do nothing. |
38
|
|
|
|
|
|
|
} |
39
|
|
|
|
|
|
|
elsif ( $] < 5.00404 ) { |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
# Grudgingly accomodate ancient (pre-constant) versions. |
42
|
0
|
|
|
|
|
0
|
eval 'sub DEBUG { $Debug } '; |
43
|
|
|
|
|
|
|
} |
44
|
|
|
|
|
|
|
elsif ( !$DEBUG ) { |
45
|
16
|
|
|
|
|
792
|
eval 'sub DEBUG () {0}'; # Make it a constant. |
46
|
|
|
|
|
|
|
} |
47
|
|
|
|
|
|
|
elsif ( $DEBUG =~ m<^\d+$>s ) { |
48
|
0
|
|
|
|
|
0
|
eval 'sub DEBUG () { ' . $DEBUG . ' }'; # Make THAT a constant. |
49
|
|
|
|
|
|
|
} |
50
|
|
|
|
|
|
|
else { # WTF? |
51
|
0
|
|
|
|
|
0
|
warn "Non-numeric value \"$DEBUG\" in \$HTML::Element::DEBUG"; |
52
|
0
|
|
|
|
|
0
|
eval 'sub DEBUG () { $DEBUG }'; # I guess. |
53
|
|
|
|
|
|
|
} |
54
|
|
|
|
|
|
|
## use critic |
55
|
|
|
|
|
|
|
} |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
#--------------------------------------------------------------------------- |
58
|
|
|
|
|
|
|
|
59
|
16
|
|
|
16
|
|
4434
|
use HTML::Entities (); |
|
16
|
|
|
|
|
70682
|
|
|
16
|
|
|
|
|
564
|
|
60
|
16
|
|
|
16
|
|
4408
|
use HTML::Tagset 3.02 (); |
|
16
|
|
|
|
|
16051
|
|
|
16
|
|
|
|
|
421
|
|
61
|
|
|
|
|
|
|
|
62
|
16
|
|
|
16
|
|
9353
|
use HTML::Element (); |
|
16
|
|
|
|
|
46
|
|
|
16
|
|
|
|
|
495
|
|
63
|
16
|
|
|
16
|
|
103
|
use HTML::Parser 3.46 (); |
|
16
|
|
|
|
|
374
|
|
|
16
|
|
|
|
|
86598
|
|
64
|
|
|
|
|
|
|
our @ISA = qw(HTML::Element HTML::Parser); |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
# This looks schizoid, I know. |
67
|
|
|
|
|
|
|
# It's not that we ARE an element AND a parser. |
68
|
|
|
|
|
|
|
# We ARE an element, but one that knows how to handle signals |
69
|
|
|
|
|
|
|
# (method calls) from Parser in order to elaborate its subtree. |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
# Legacy aliases: |
72
|
|
|
|
|
|
|
*HTML::TreeBuilder::isKnown = \%HTML::Tagset::isKnown; |
73
|
|
|
|
|
|
|
*HTML::TreeBuilder::canTighten = \%HTML::Tagset::canTighten; |
74
|
|
|
|
|
|
|
*HTML::TreeBuilder::isHeadElement = \%HTML::Tagset::isHeadElement; |
75
|
|
|
|
|
|
|
*HTML::TreeBuilder::isBodyElement = \%HTML::Tagset::isBodyElement; |
76
|
|
|
|
|
|
|
*HTML::TreeBuilder::isPhraseMarkup = \%HTML::Tagset::isPhraseMarkup; |
77
|
|
|
|
|
|
|
*HTML::TreeBuilder::isHeadOrBodyElement = \%HTML::Tagset::isHeadOrBodyElement; |
78
|
|
|
|
|
|
|
*HTML::TreeBuilder::isList = \%HTML::Tagset::isList; |
79
|
|
|
|
|
|
|
*HTML::TreeBuilder::isTableElement = \%HTML::Tagset::isTableElement; |
80
|
|
|
|
|
|
|
*HTML::TreeBuilder::isFormElement = \%HTML::Tagset::isFormElement; |
81
|
|
|
|
|
|
|
*HTML::TreeBuilder::p_closure_barriers = \@HTML::Tagset::p_closure_barriers; |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
#========================================================================== |
84
|
|
|
|
|
|
|
# Two little shortcut constructors: |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
sub new_from_file { # or from a FH |
87
|
3
|
|
|
3
|
1
|
8
|
my $class = shift; |
88
|
3
|
50
|
|
|
|
12
|
Carp::croak("new_from_file takes only one argument") |
89
|
|
|
|
|
|
|
unless @_ == 1; |
90
|
3
|
50
|
|
|
|
10
|
Carp::croak("new_from_file is a class method only") |
91
|
|
|
|
|
|
|
if ref $class; |
92
|
3
|
|
|
|
|
13
|
my $new = $class->new(); |
93
|
3
|
100
|
|
|
|
23
|
defined $new->parse_file( $_[0] ) |
94
|
|
|
|
|
|
|
or Carp::croak("unable to parse file: $!"); |
95
|
2
|
|
|
|
|
7
|
return $new; |
96
|
|
|
|
|
|
|
} |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
sub new_from_content { # from any number of scalars |
99
|
8
|
|
|
8
|
1
|
2005
|
my $class = shift; |
100
|
8
|
50
|
|
|
|
30
|
Carp::croak("new_from_content is a class method only") |
101
|
|
|
|
|
|
|
if ref $class; |
102
|
8
|
|
|
|
|
27
|
my $new = $class->new(); |
103
|
8
|
|
|
|
|
21
|
foreach my $whunk (@_) { |
104
|
9
|
100
|
|
|
|
28
|
if ( ref($whunk) eq 'SCALAR' ) { |
105
|
1
|
|
|
|
|
8
|
$new->parse($$whunk); |
106
|
|
|
|
|
|
|
} |
107
|
|
|
|
|
|
|
else { |
108
|
8
|
|
|
|
|
71
|
$new->parse($whunk); |
109
|
|
|
|
|
|
|
} |
110
|
9
|
50
|
|
|
|
60
|
last if $new->{'_stunted'}; # might as well check that. |
111
|
|
|
|
|
|
|
} |
112
|
8
|
|
|
|
|
32
|
$new->eof(); |
113
|
8
|
|
|
|
|
25
|
return $new; |
114
|
|
|
|
|
|
|
} |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
sub new_from_url { # should accept anything that LWP does. |
117
|
4
|
|
|
4
|
1
|
14
|
undef our $lwp_response; |
118
|
4
|
|
|
|
|
6
|
my $class = shift; |
119
|
4
|
50
|
|
|
|
17
|
Carp::croak("new_from_url takes only one argument") |
120
|
|
|
|
|
|
|
unless @_ == 1; |
121
|
4
|
50
|
|
|
|
12
|
Carp::croak("new_from_url is a class method only") |
122
|
|
|
|
|
|
|
if ref $class; |
123
|
4
|
|
|
|
|
6
|
my $url = shift; |
124
|
4
|
|
|
|
|
20
|
my $new = $class->new(); |
125
|
|
|
|
|
|
|
|
126
|
4
|
|
|
|
|
33
|
require LWP::UserAgent; |
127
|
|
|
|
|
|
|
# RECOMMEND PREREQ: LWP::UserAgent 5.815 |
128
|
4
|
|
|
|
|
79
|
LWP::UserAgent->VERSION( 5.815 ); # HTTP::Headers content_is_html method |
129
|
4
|
|
|
|
|
54
|
$lwp_response = LWP::UserAgent->new->get( $url ); |
130
|
|
|
|
|
|
|
|
131
|
4
|
100
|
|
|
|
33516
|
Carp::croak("GET failed on $url: " . $lwp_response->status_line) |
132
|
|
|
|
|
|
|
unless $lwp_response->is_success; |
133
|
3
|
100
|
|
|
|
34
|
Carp::croak("$url returned " . $lwp_response->content_type . " not HTML") |
134
|
|
|
|
|
|
|
unless $lwp_response->content_is_html; |
135
|
|
|
|
|
|
|
|
136
|
2
|
|
|
|
|
61
|
$new->parse( $lwp_response->decoded_content ); |
137
|
2
|
|
|
|
|
9
|
$new->eof; |
138
|
2
|
|
|
|
|
44
|
undef $lwp_response; # Processed successfully |
139
|
2
|
|
|
|
|
11
|
return $new; |
140
|
|
|
|
|
|
|
} |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
# TODO: document more fully? |
143
|
|
|
|
|
|
|
sub parse_content { # from any number of scalars |
144
|
9
|
|
|
9
|
1
|
34
|
my $tree = shift; |
145
|
9
|
|
|
|
|
13
|
my $retval; |
146
|
9
|
|
|
|
|
18
|
foreach my $whunk (@_) { |
147
|
10
|
100
|
|
|
|
25
|
if ( ref($whunk) eq 'SCALAR' ) { |
148
|
1
|
|
|
|
|
7
|
$retval = $tree->parse($$whunk); |
149
|
|
|
|
|
|
|
} |
150
|
|
|
|
|
|
|
else { |
151
|
9
|
|
|
|
|
97
|
$retval = $tree->parse($whunk); |
152
|
|
|
|
|
|
|
} |
153
|
10
|
50
|
|
|
|
30
|
last if $tree->{'_stunted'}; # might as well check that. |
154
|
|
|
|
|
|
|
} |
155
|
9
|
|
|
|
|
25
|
$tree->eof(); |
156
|
9
|
|
|
|
|
18
|
return $retval; |
157
|
|
|
|
|
|
|
} |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
#--------------------------------------------------------------------------- |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
sub new { # constructor! |
162
|
291
|
|
|
291
|
1
|
91675
|
my $class = shift; |
163
|
291
|
|
33
|
|
|
1028
|
$class = ref($class) || $class; |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
# Initialize HTML::Element part |
166
|
291
|
|
|
|
|
641
|
my $self = $class->element_class->new('html'); |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
{ |
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
# A hack for certain strange versions of Parser: |
171
|
291
|
|
|
|
|
390
|
my $other_self = HTML::Parser->new(); |
|
291
|
|
|
|
|
801
|
|
172
|
291
|
|
|
|
|
12538
|
%$self = ( %$self, %$other_self ); # copy fields |
173
|
|
|
|
|
|
|
# Yes, multiple inheritance is messy. Kids, don't try this at home. |
174
|
291
|
|
|
|
|
904
|
bless $other_self, "HTML::TreeBuilder::_hideyhole"; |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
# whack it out of the HTML::Parser class, to avoid the destructor |
177
|
|
|
|
|
|
|
} |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
# The root of the tree is special, as it has these funny attributes, |
180
|
|
|
|
|
|
|
# and gets reblessed into this class. |
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
# Initialize parser settings |
183
|
291
|
|
|
|
|
518
|
$self->{'_implicit_tags'} = 1; |
184
|
291
|
|
|
|
|
414
|
$self->{'_implicit_body_p_tag'} = 0; |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
# If true, trying to insert text, or any of %isPhraseMarkup right |
187
|
|
|
|
|
|
|
# under 'body' will implicate a 'p'. If false, will just go there. |
188
|
|
|
|
|
|
|
|
189
|
291
|
|
|
|
|
390
|
$self->{'_tighten'} = 1; |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
# whether ignorable WS in this tree should be deleted |
192
|
|
|
|
|
|
|
|
193
|
291
|
|
|
|
|
384
|
$self->{'_implicit'} = 1; # to delete, once we find a real open-"html" tag |
194
|
|
|
|
|
|
|
|
195
|
291
|
|
|
|
|
660
|
$self->{'_ignore_unknown'} = 1; |
196
|
291
|
|
|
|
|
404
|
$self->{'_ignore_text'} = 0; |
197
|
291
|
|
|
|
|
377
|
$self->{'_warn'} = 0; |
198
|
291
|
|
|
|
|
441
|
$self->{'_no_space_compacting'} = 0; |
199
|
291
|
|
|
|
|
404
|
$self->{'_store_comments'} = 0; |
200
|
291
|
|
|
|
|
394
|
$self->{'_store_declarations'} = 1; |
201
|
291
|
|
|
|
|
408
|
$self->{'_store_pis'} = 0; |
202
|
291
|
|
|
|
|
407
|
$self->{'_p_strict'} = 0; |
203
|
291
|
|
|
|
|
585
|
$self->{'_no_expand_entities'} = 0; |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
# Parse attributes passed in as arguments |
206
|
291
|
100
|
|
|
|
555
|
if (@_) { |
207
|
4
|
|
|
|
|
13
|
my %attr = @_; |
208
|
4
|
|
|
|
|
12
|
for ( keys %attr ) { |
209
|
10
|
|
|
|
|
26
|
$self->{"_$_"} = $attr{$_}; |
210
|
|
|
|
|
|
|
} |
211
|
|
|
|
|
|
|
} |
212
|
|
|
|
|
|
|
|
213
|
291
|
|
|
|
|
418
|
$HTML::Element::encoded_content = $self->{'_no_expand_entities'}; |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
# rebless to our class |
216
|
291
|
|
|
|
|
408
|
bless $self, $class; |
217
|
|
|
|
|
|
|
|
218
|
291
|
|
|
|
|
424
|
$self->{'_element_count'} = 1; |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
# undocumented, informal, and maybe not exactly correct |
221
|
|
|
|
|
|
|
|
222
|
291
|
|
|
|
|
706
|
$self->{'_head'} = $self->insert_element( 'head', 1 ); |
223
|
291
|
|
|
|
|
460
|
$self->{'_pos'} = undef; # pull it back up |
224
|
291
|
|
|
|
|
526
|
$self->{'_body'} = $self->insert_element( 'body', 1 ); |
225
|
291
|
|
|
|
|
462
|
$self->{'_pos'} = undef; # pull it back up again |
226
|
|
|
|
|
|
|
|
227
|
291
|
|
|
|
|
547
|
return $self; |
228
|
|
|
|
|
|
|
} |
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
#========================================================================== |
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
sub _elem # universal accessor... |
233
|
|
|
|
|
|
|
{ |
234
|
21
|
|
|
21
|
|
40
|
my ( $self, $elem, $val ) = @_; |
235
|
21
|
|
|
|
|
38
|
my $old = $self->{$elem}; |
236
|
21
|
50
|
|
|
|
49
|
$self->{$elem} = $val if defined $val; |
237
|
21
|
|
|
|
|
33
|
return $old; |
238
|
|
|
|
|
|
|
} |
239
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
# accessors.... |
241
|
5
|
|
|
5
|
1
|
36
|
sub implicit_tags { shift->_elem( '_implicit_tags', @_ ); } |
242
|
4
|
|
|
4
|
1
|
20
|
sub implicit_body_p_tag { shift->_elem( '_implicit_body_p_tag', @_ ); } |
243
|
0
|
|
|
0
|
1
|
0
|
sub p_strict { shift->_elem( '_p_strict', @_ ); } |
244
|
5
|
|
|
5
|
1
|
17
|
sub no_space_compacting { shift->_elem( '_no_space_compacting', @_ ); } |
245
|
0
|
|
|
0
|
1
|
0
|
sub ignore_unknown { shift->_elem( '_ignore_unknown', @_ ); } |
246
|
0
|
|
|
0
|
1
|
0
|
sub ignore_text { shift->_elem( '_ignore_text', @_ ); } |
247
|
5
|
|
|
5
|
1
|
19
|
sub ignore_ignorable_whitespace { shift->_elem( '_tighten', @_ ); } |
248
|
1
|
|
|
1
|
1
|
6
|
sub store_comments { shift->_elem( '_store_comments', @_ ); } |
249
|
1
|
|
|
1
|
1
|
373
|
sub store_declarations { shift->_elem( '_store_declarations', @_ ); } |
250
|
0
|
|
|
0
|
1
|
0
|
sub store_pis { shift->_elem( '_store_pis', @_ ); } |
251
|
0
|
|
|
0
|
1
|
0
|
sub warn { shift->_elem( '_warn', @_ ); } |
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
sub no_expand_entities { |
254
|
0
|
|
|
0
|
1
|
0
|
shift->_elem( '_no_expand_entities', @_ ); |
255
|
0
|
|
|
|
|
0
|
$HTML::Element::encoded_content = @_; |
256
|
|
|
|
|
|
|
} |
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
#========================================================================== |
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
sub warning { |
261
|
1
|
|
|
1
|
1
|
1
|
my $self = shift; |
262
|
1
|
50
|
|
|
|
4
|
CORE::warn("HTML::Parse: $_[0]\n") if $self->{'_warn'}; |
263
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
# should maybe say HTML::TreeBuilder instead |
265
|
|
|
|
|
|
|
} |
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
#========================================================================== |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
{ |
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
# To avoid having to rebuild these lists constantly... |
272
|
|
|
|
|
|
|
my $_Closed_by_structurals = [qw(p h1 h2 h3 h4 h5 h6 pre textarea)]; |
273
|
|
|
|
|
|
|
my $indent; |
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
sub start { |
276
|
1005
|
50
|
|
1005
|
1
|
3190
|
return if $_[0]{'_stunted'}; |
277
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
# Accept a signal from HTML::Parser for start-tags. |
279
|
1005
|
|
|
|
|
1859
|
my ( $self, $tag, $attr ) = @_; |
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
# Parser passes more, actually: |
282
|
|
|
|
|
|
|
# $self->start($tag, $attr, $attrseq, $origtext) |
283
|
|
|
|
|
|
|
# But we can merrily ignore $attrseq and $origtext. |
284
|
|
|
|
|
|
|
|
285
|
1005
|
50
|
|
|
|
1803
|
if ( $tag eq 'x-html' ) { |
286
|
0
|
|
|
|
|
0
|
print "Ignoring open-x-html tag.\n" if DEBUG; |
287
|
|
|
|
|
|
|
|
288
|
|
|
|
|
|
|
# inserted by some lame code-generators. |
289
|
0
|
|
|
|
|
0
|
return; # bypass tweaking. |
290
|
|
|
|
|
|
|
} |
291
|
|
|
|
|
|
|
|
292
|
1005
|
|
|
|
|
1628
|
$tag =~ s{/$}{}s; # So turns into . Silently forgive. |
293
|
|
|
|
|
|
|
|
294
|
1005
|
50
|
|
|
|
2913
|
unless ( $tag =~ m/^[-_a-zA-Z0-9:%]+$/s ) { |
295
|
0
|
|
|
|
|
0
|
DEBUG and print "Start-tag name $tag is no good. Skipping.\n"; |
296
|
0
|
|
|
|
|
0
|
return; |
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
# This avoids having Element's new() throw an exception. |
299
|
|
|
|
|
|
|
} |
300
|
|
|
|
|
|
|
|
301
|
1005
|
|
66
|
|
|
2342
|
my $ptag = ( my $pos = $self->{'_pos'} || $self )->{'_tag'}; |
302
|
1005
|
|
|
|
|
1244
|
my $already_inserted; |
303
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
#my($indent); |
305
|
1005
|
|
|
|
|
1087
|
if (DEBUG) { |
306
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
# optimization -- don't figure out indenting unless we're in debug mode |
308
|
|
|
|
|
|
|
my @lineage = $pos->lineage; |
309
|
|
|
|
|
|
|
$indent = ' ' x ( 1 + @lineage ); |
310
|
|
|
|
|
|
|
print $indent, "Proposing a new \U$tag\E under ", |
311
|
|
|
|
|
|
|
join( '/', map $_->{'_tag'}, reverse( $pos, @lineage ) ) |
312
|
|
|
|
|
|
|
|| 'Root', |
313
|
|
|
|
|
|
|
".\n"; |
314
|
|
|
|
|
|
|
|
315
|
|
|
|
|
|
|
#} else { |
316
|
|
|
|
|
|
|
# $indent = ' '; |
317
|
|
|
|
|
|
|
} |
318
|
|
|
|
|
|
|
|
319
|
|
|
|
|
|
|
#print $indent, "POS: $pos ($ptag)\n" if DEBUG > 2; |
320
|
|
|
|
|
|
|
# $attr = {%$attr}; |
321
|
|
|
|
|
|
|
|
322
|
1005
|
|
|
|
|
2362
|
foreach my $k ( keys %$attr ) { |
323
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
# Make sure some stooge doesn't have "". |
325
|
|
|
|
|
|
|
# That happens every few million Web pages. |
326
|
503
|
50
|
33
|
|
|
1531
|
$attr->{ ' ' . $k } = delete $attr->{$k} |
327
|
|
|
|
|
|
|
if length $k and substr( $k, 0, 1 ) eq '_'; |
328
|
|
|
|
|
|
|
|
329
|
|
|
|
|
|
|
# Looks bad, but is fine for round-tripping. |
330
|
|
|
|
|
|
|
} |
331
|
|
|
|
|
|
|
|
332
|
1005
|
|
|
|
|
1836
|
my $e = $self->element_class->new( $tag, %$attr ); |
333
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
# Make a new element object. |
335
|
|
|
|
|
|
|
# (Only rarely do we end up just throwing it away later in this call.) |
336
|
|
|
|
|
|
|
|
337
|
|
|
|
|
|
|
# Some prep -- custom messiness for those damned tables, and strict P's. |
338
|
1005
|
100
|
|
|
|
2006
|
if ( $self->{'_implicit_tags'} ) { # wallawallawalla! |
339
|
|
|
|
|
|
|
|
340
|
983
|
100
|
|
|
|
1880
|
unless ( $HTML::TreeBuilder::isTableElement{$tag} ) { |
341
|
829
|
50
|
|
|
|
1673
|
if ( $ptag eq 'table' ) { |
|
|
50
|
|
|
|
|
|
342
|
0
|
|
|
|
|
0
|
print $indent, |
343
|
|
|
|
|
|
|
" * Phrasal \U$tag\E right under TABLE makes implicit TR and TD\n" |
344
|
|
|
|
|
|
|
if DEBUG > 1; |
345
|
0
|
|
|
|
|
0
|
$self->insert_element( 'tr', 1 ); |
346
|
0
|
|
|
|
|
0
|
$pos = $self->insert_element( 'td', 1 ) |
347
|
|
|
|
|
|
|
; # yes, needs updating |
348
|
|
|
|
|
|
|
} |
349
|
|
|
|
|
|
|
elsif ( $ptag eq 'tr' ) { |
350
|
0
|
|
|
|
|
0
|
print $indent, |
351
|
|
|
|
|
|
|
" * Phrasal \U$tag\E right under TR makes an implicit TD\n" |
352
|
|
|
|
|
|
|
if DEBUG > 1; |
353
|
0
|
|
|
|
|
0
|
$pos = $self->insert_element( 'td', 1 ) |
354
|
|
|
|
|
|
|
; # yes, needs updating |
355
|
|
|
|
|
|
|
} |
356
|
829
|
|
|
|
|
1151
|
$ptag = $pos->{'_tag'}; # yes, needs updating |
357
|
|
|
|
|
|
|
} |
358
|
|
|
|
|
|
|
|
359
|
|
|
|
|
|
|
# end of table-implication block. |
360
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
# Now maybe do a little dance to enforce P-strictness. |
362
|
|
|
|
|
|
|
# This seems like it should be integrated with the big |
363
|
|
|
|
|
|
|
# "ALL HOPE..." block, further below, but that doesn't |
364
|
|
|
|
|
|
|
# seem feasable. |
365
|
983
|
0
|
33
|
|
|
1793
|
if ( $self->{'_p_strict'} |
|
|
|
0
|
|
|
|
|
366
|
|
|
|
|
|
|
and $HTML::TreeBuilder::isKnown{$tag} |
367
|
|
|
|
|
|
|
and not $HTML::Tagset::is_Possible_Strict_P_Content{$tag} ) |
368
|
|
|
|
|
|
|
{ |
369
|
0
|
|
|
|
|
0
|
my $here = $pos; |
370
|
0
|
|
|
|
|
0
|
my $here_tag = $ptag; |
371
|
0
|
|
|
|
|
0
|
while (1) { |
372
|
0
|
0
|
|
|
|
0
|
if ( $here_tag eq 'p' ) { |
373
|
0
|
|
|
|
|
0
|
print $indent, " * Inserting $tag closes strict P.\n" |
374
|
|
|
|
|
|
|
if DEBUG > 1; |
375
|
0
|
|
|
|
|
0
|
$self->end( \q{p} ); |
376
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
# NB: same as \'q', but less confusing to emacs cperl-mode |
378
|
0
|
|
|
|
|
0
|
last; |
379
|
|
|
|
|
|
|
} |
380
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
#print("Lasting from $here_tag\n"), |
382
|
|
|
|
|
|
|
last |
383
|
|
|
|
|
|
|
if $HTML::TreeBuilder::isKnown{$here_tag} |
384
|
|
|
|
|
|
|
and |
385
|
|
|
|
|
|
|
not $HTML::Tagset::is_Possible_Strict_P_Content{ |
386
|
0
|
0
|
0
|
|
|
0
|
$here_tag}; |
387
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
# Don't keep looking up the tree if we see something that can't |
389
|
|
|
|
|
|
|
# be strict-P content. |
390
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
$here_tag |
392
|
0
|
|
0
|
|
|
0
|
= ( $here = $here->{'_parent'} || last )->{'_tag'}; |
393
|
|
|
|
|
|
|
} # end while |
394
|
|
|
|
|
|
|
$ptag = ( $pos = $self->{'_pos'} || $self ) |
395
|
0
|
|
0
|
|
|
0
|
->{'_tag'}; # better update! |
396
|
|
|
|
|
|
|
} |
397
|
|
|
|
|
|
|
|
398
|
|
|
|
|
|
|
# end of strict-p block. |
399
|
|
|
|
|
|
|
} |
400
|
|
|
|
|
|
|
|
401
|
|
|
|
|
|
|
# And now, get busy... |
402
|
|
|
|
|
|
|
#---------------------------------------------------------------------- |
403
|
1005
|
100
|
|
|
|
2778
|
if ( !$self->{'_implicit_tags'} ) { # bimskalabim |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
404
|
|
|
|
|
|
|
# do nothing |
405
|
22
|
|
|
|
|
24
|
print $indent, " * _implicit_tags is off. doing nothing\n" |
406
|
|
|
|
|
|
|
if DEBUG > 1; |
407
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
#---------------------------------------------------------------------- |
409
|
|
|
|
|
|
|
} |
410
|
|
|
|
|
|
|
elsif ( $HTML::TreeBuilder::isHeadOrBodyElement{$tag} ) { |
411
|
1
|
50
|
|
|
|
3
|
if ( $pos->is_inside('body') ) { # all is well |
|
|
50
|
|
|
|
|
|
412
|
0
|
|
|
|
|
0
|
print $indent, |
413
|
|
|
|
|
|
|
" * ambilocal element \U$tag\E is fine under BODY.\n" |
414
|
|
|
|
|
|
|
if DEBUG > 1; |
415
|
|
|
|
|
|
|
} |
416
|
|
|
|
|
|
|
elsif ( $pos->is_inside('head') ) { |
417
|
1
|
|
|
|
|
2
|
print $indent, |
418
|
|
|
|
|
|
|
" * ambilocal element \U$tag\E is fine under HEAD.\n" |
419
|
|
|
|
|
|
|
if DEBUG > 1; |
420
|
|
|
|
|
|
|
} |
421
|
|
|
|
|
|
|
else { |
422
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
# In neither head nor body! mmmmm... put under head? |
424
|
|
|
|
|
|
|
|
425
|
0
|
0
|
|
|
|
0
|
if ( $ptag eq 'html' ) { # expected case |
426
|
|
|
|
|
|
|
# TODO?? : would there ever be a case where _head would be |
427
|
|
|
|
|
|
|
# absent from a tree that would ever be accessed at this |
428
|
|
|
|
|
|
|
# point? |
429
|
0
|
0
|
|
|
|
0
|
die "Where'd my head go?" unless ref $self->{'_head'}; |
430
|
0
|
0
|
|
|
|
0
|
if ( $self->{'_head'}{'_implicit'} ) { |
431
|
0
|
|
|
|
|
0
|
print $indent, |
432
|
|
|
|
|
|
|
" * ambilocal element \U$tag\E makes an implicit HEAD.\n" |
433
|
|
|
|
|
|
|
if DEBUG > 1; |
434
|
|
|
|
|
|
|
|
435
|
|
|
|
|
|
|
# or rather, points us at it. |
436
|
|
|
|
|
|
|
$self->{'_pos'} |
437
|
0
|
|
|
|
|
0
|
= $self->{'_head'}; # to insert under... |
438
|
|
|
|
|
|
|
} |
439
|
|
|
|
|
|
|
else { |
440
|
0
|
|
|
|
|
0
|
$self->warning( |
441
|
|
|
|
|
|
|
"Ambilocal element <$tag> not under HEAD or BODY!?" |
442
|
|
|
|
|
|
|
); |
443
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
# Put it under HEAD by default, I guess |
445
|
|
|
|
|
|
|
$self->{'_pos'} |
446
|
0
|
|
|
|
|
0
|
= $self->{'_head'}; # to insert under... |
447
|
|
|
|
|
|
|
} |
448
|
|
|
|
|
|
|
|
449
|
|
|
|
|
|
|
} |
450
|
|
|
|
|
|
|
else { |
451
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
# Neither under head nor body, nor right under html... pass thru? |
453
|
0
|
|
|
|
|
0
|
$self->warning( |
454
|
|
|
|
|
|
|
"Ambilocal element <$tag> neither under head nor body, nor right under html!?" |
455
|
|
|
|
|
|
|
); |
456
|
|
|
|
|
|
|
} |
457
|
|
|
|
|
|
|
} |
458
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
#---------------------------------------------------------------------- |
460
|
|
|
|
|
|
|
} |
461
|
|
|
|
|
|
|
elsif ( $HTML::TreeBuilder::isBodyElement{$tag} ) { |
462
|
|
|
|
|
|
|
|
463
|
|
|
|
|
|
|
# Ensure that we are within |
464
|
799
|
100
|
66
|
|
|
2113
|
if ( $ptag eq 'body' ) { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
465
|
|
|
|
|
|
|
|
466
|
|
|
|
|
|
|
# We're good. |
467
|
|
|
|
|
|
|
} |
468
|
|
|
|
|
|
|
elsif ( |
469
|
|
|
|
|
|
|
$HTML::TreeBuilder::isBodyElement{$ptag} # glarg |
470
|
|
|
|
|
|
|
and not $HTML::TreeBuilder::isHeadOrBodyElement{$ptag} |
471
|
|
|
|
|
|
|
) |
472
|
|
|
|
|
|
|
{ |
473
|
|
|
|
|
|
|
|
474
|
|
|
|
|
|
|
# Special case: Save ourselves a call to is_inside further down. |
475
|
|
|
|
|
|
|
# If our $ptag is an isBodyElement element (but not an |
476
|
|
|
|
|
|
|
# isHeadOrBodyElement element), then we must be under body! |
477
|
206
|
|
|
|
|
251
|
print $indent, " * Inferring that $ptag is under BODY.\n", |
478
|
|
|
|
|
|
|
if DEBUG > 3; |
479
|
|
|
|
|
|
|
|
480
|
|
|
|
|
|
|
# I think this and the test for 'body' trap everything |
481
|
|
|
|
|
|
|
# bodyworthy, except the case where the parent element is |
482
|
|
|
|
|
|
|
# under an unknown element that's a descendant of body. |
483
|
|
|
|
|
|
|
} |
484
|
|
|
|
|
|
|
elsif ( $pos->is_inside('head') ) { |
485
|
164
|
|
|
|
|
204
|
print $indent, |
486
|
|
|
|
|
|
|
" * body-element \U$tag\E minimizes HEAD, makes implicit BODY.\n" |
487
|
|
|
|
|
|
|
if DEBUG > 1; |
488
|
|
|
|
|
|
|
$ptag = ( |
489
|
|
|
|
|
|
|
$pos = $self->{'_pos'} |
490
|
|
|
|
|
|
|
= $self->{'_body'} # yes, needs updating |
491
|
|
|
|
|
|
|
|| die "Where'd my body go?" |
492
|
164
|
|
50
|
|
|
394
|
)->{'_tag'}; # yes, needs updating |
493
|
|
|
|
|
|
|
} |
494
|
|
|
|
|
|
|
elsif ( !$pos->is_inside('body') ) { |
495
|
77
|
|
|
|
|
90
|
print $indent, |
496
|
|
|
|
|
|
|
" * body-element \U$tag\E makes implicit BODY.\n" |
497
|
|
|
|
|
|
|
if DEBUG > 1; |
498
|
|
|
|
|
|
|
$ptag = ( |
499
|
|
|
|
|
|
|
$pos = $self->{'_pos'} |
500
|
|
|
|
|
|
|
= $self->{'_body'} # yes, needs updating |
501
|
|
|
|
|
|
|
|| die "Where'd my body go?" |
502
|
77
|
|
50
|
|
|
197
|
)->{'_tag'}; # yes, needs updating |
503
|
|
|
|
|
|
|
} |
504
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
# else we ARE under body, so okay. |
506
|
|
|
|
|
|
|
|
507
|
|
|
|
|
|
|
# Handle implicit endings and insert based on and position |
508
|
|
|
|
|
|
|
# ... ALL HOPE ABANDON ALL YE WHO ENTER HERE ... |
509
|
799
|
100
|
100
|
|
|
7840
|
if ( $tag eq 'p' |
|
|
100
|
66
|
|
|
|
|
|
|
100
|
66
|
|
|
|
|
|
|
50
|
33
|
|
|
|
|
|
|
100
|
33
|
|
|
|
|
|
|
100
|
33
|
|
|
|
|
|
|
100
|
33
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
|
33
|
|
|
|
|
510
|
|
|
|
|
|
|
or $tag eq 'h1' |
511
|
|
|
|
|
|
|
or $tag eq 'h2' |
512
|
|
|
|
|
|
|
or $tag eq 'h3' |
513
|
|
|
|
|
|
|
or $tag eq 'h4' |
514
|
|
|
|
|
|
|
or $tag eq 'h5' |
515
|
|
|
|
|
|
|
or $tag eq 'h6' |
516
|
|
|
|
|
|
|
or $tag eq 'form' |
517
|
|
|
|
|
|
|
|
518
|
|
|
|
|
|
|
# Hm, should |
519
|
|
|
|
|
|
|
) |
520
|
|
|
|
|
|
|
{ |
521
|
|
|
|
|
|
|
|
522
|
|
|
|
|
|
|
# Can't have , or |
523
|
243
|
|
|
|
|
483
|
$self->end( |
524
|
|
|
|
|
|
|
$_Closed_by_structurals, |
525
|
|
|
|
|
|
|
@HTML::TreeBuilder::p_closure_barriers |
526
|
|
|
|
|
|
|
|
527
|
|
|
|
|
|
|
# used to be just li! |
528
|
|
|
|
|
|
|
); |
529
|
|
|
|
|
|
|
|
530
|
|
|
|
|
|
|
} |
531
|
|
|
|
|
|
|
elsif ( $tag eq 'ol' or $tag eq 'ul' or $tag eq 'dl' ) { |
532
|
|
|
|
|
|
|
|
533
|
|
|
|
|
|
|
# Can't have lists inside -- in the unlikely |
534
|
|
|
|
|
|
|
# event anyone tries to put them there! |
535
|
9
|
50
|
33
|
|
|
79
|
if ( $ptag eq 'h1' |
|
|
|
33
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
|
33
|
|
|
|
|
536
|
|
|
|
|
|
|
or $ptag eq 'h2' |
537
|
|
|
|
|
|
|
or $ptag eq 'h3' |
538
|
|
|
|
|
|
|
or $ptag eq 'h4' |
539
|
|
|
|
|
|
|
or $ptag eq 'h5' |
540
|
|
|
|
|
|
|
or $ptag eq 'h6' ) |
541
|
|
|
|
|
|
|
{ |
542
|
0
|
|
|
|
|
0
|
$self->end( \$ptag ); |
543
|
|
|
|
|
|
|
} |
544
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
# TODO: Maybe keep closing up the tree until |
546
|
|
|
|
|
|
|
# the ptag isn't any of the above? |
547
|
|
|
|
|
|
|
# But anyone that says
|
548
|
|
|
|
|
|
|
# deserves what they get anyway. |
549
|
|
|
|
|
|
|
|
550
|
|
|
|
|
|
|
} |
551
|
|
|
|
|
|
|
elsif ( $tag eq 'li' ) { # list item |
552
|
|
|
|
|
|
|
# Get under a list tag, one way or another |
553
|
12
|
50
|
66
|
|
|
32
|
unless ( |
554
|
|
|
|
|
|
|
exists $HTML::TreeBuilder::isList{$ptag} |
555
|
|
|
|
|
|
|
or $self->end( \q{*}, keys %HTML::TreeBuilder::isList ) #' |
556
|
|
|
|
|
|
|
) |
557
|
|
|
|
|
|
|
{ |
558
|
0
|
|
|
|
|
0
|
print $indent, |
559
|
|
|
|
|
|
|
" * inserting implicit UL for lack of containing ", |
560
|
|
|
|
|
|
|
join( '|', keys %HTML::TreeBuilder::isList ), ".\n" |
561
|
|
|
|
|
|
|
if DEBUG > 1; |
562
|
0
|
|
|
|
|
0
|
$self->insert_element( 'ul', 1 ); |
563
|
|
|
|
|
|
|
} |
564
|
|
|
|
|
|
|
|
565
|
|
|
|
|
|
|
} |
566
|
|
|
|
|
|
|
elsif ( $tag eq 'dt' or $tag eq 'dd' ) { |
567
|
|
|
|
|
|
|
|
568
|
|
|
|
|
|
|
# Get under a DL, one way or another |
569
|
0
|
0
|
0
|
|
|
0
|
unless ( $ptag eq 'dl' or $self->end( \q{*}, 'dl' ) ) { #' |
570
|
0
|
|
|
|
|
0
|
print $indent, |
571
|
|
|
|
|
|
|
" * inserting implicit DL for lack of containing DL.\n" |
572
|
|
|
|
|
|
|
if DEBUG > 1; |
573
|
0
|
|
|
|
|
0
|
$self->insert_element( 'dl', 1 ); |
574
|
|
|
|
|
|
|
} |
575
|
|
|
|
|
|
|
|
576
|
|
|
|
|
|
|
} |
577
|
|
|
|
|
|
|
elsif ( $HTML::TreeBuilder::isFormElement{$tag} ) { |
578
|
1
|
50
|
33
|
|
|
6
|
if ($self->{ |
579
|
|
|
|
|
|
|
'_ignore_formies_outside_form'} # TODO: document this |
580
|
|
|
|
|
|
|
and not $pos->is_inside('form') |
581
|
|
|
|
|
|
|
) |
582
|
|
|
|
|
|
|
{ |
583
|
0
|
|
|
|
|
0
|
print $indent, |
584
|
|
|
|
|
|
|
" * ignoring \U$tag\E because not in a FORM.\n" |
585
|
|
|
|
|
|
|
if DEBUG > 1; |
586
|
0
|
|
|
|
|
0
|
return; # bypass tweaking. |
587
|
|
|
|
|
|
|
} |
588
|
1
|
50
|
|
|
|
3
|
if ( $tag eq 'option' ) { |
589
|
|
|
|
|
|
|
|
590
|
|
|
|
|
|
|
# return unless $ptag eq 'select'; |
591
|
0
|
|
|
|
|
0
|
$self->end( \q{option} ); |
592
|
0
|
|
0
|
|
|
0
|
$ptag = ( $self->{'_pos'} || $self )->{'_tag'}; |
593
|
0
|
0
|
0
|
|
|
0
|
unless ( $ptag eq 'select' or $ptag eq 'optgroup' ) { |
594
|
0
|
|
|
|
|
0
|
print $indent, |
595
|
|
|
|
|
|
|
" * \U$tag\E makes an implicit SELECT.\n" |
596
|
|
|
|
|
|
|
if DEBUG > 1; |
597
|
0
|
|
|
|
|
0
|
$pos = $self->insert_element( 'select', 1 ); |
598
|
|
|
|
|
|
|
|
599
|
|
|
|
|
|
|
# but not a very useful select -- has no 'name' attribute! |
600
|
|
|
|
|
|
|
# is $pos's value used after this? |
601
|
|
|
|
|
|
|
} |
602
|
|
|
|
|
|
|
} |
603
|
|
|
|
|
|
|
} |
604
|
|
|
|
|
|
|
elsif ( $HTML::TreeBuilder::isTableElement{$tag} ) { |
605
|
154
|
100
|
|
|
|
340
|
if ( !$pos->is_inside('table') ) { |
606
|
3
|
|
|
|
|
4
|
print $indent, " * \U$tag\E makes an implicit TABLE\n" |
607
|
|
|
|
|
|
|
if DEBUG > 1; |
608
|
3
|
|
|
|
|
6
|
$self->insert_element( 'table', 1 ); |
609
|
|
|
|
|
|
|
} |
610
|
|
|
|
|
|
|
|
611
|
154
|
100
|
100
|
|
|
394
|
if ( $tag eq 'td' or $tag eq 'th' ) { |
612
|
|
|
|
|
|
|
|
613
|
|
|
|
|
|
|
# Get under a tr one way or another |
614
|
96
|
100
|
100
|
|
|
238
|
unless ( |
615
|
|
|
|
|
|
|
$ptag eq 'tr' # either under a tr |
616
|
|
|
|
|
|
|
or $self->end( \q{*}, 'tr', |
617
|
|
|
|
|
|
|
'table' ) #or we can get under one |
618
|
|
|
|
|
|
|
) |
619
|
|
|
|
|
|
|
{ |
620
|
2
|
|
|
|
|
3
|
print $indent, |
621
|
|
|
|
|
|
|
" * \U$tag\E under \U$ptag\E makes an implicit TR\n" |
622
|
|
|
|
|
|
|
if DEBUG > 1; |
623
|
2
|
|
|
|
|
5
|
$self->insert_element( 'tr', 1 ); |
624
|
|
|
|
|
|
|
|
625
|
|
|
|
|
|
|
# presumably pos's value isn't used after this. |
626
|
|
|
|
|
|
|
} |
627
|
|
|
|
|
|
|
} |
628
|
|
|
|
|
|
|
else { |
629
|
58
|
|
|
|
|
137
|
$self->end( \$tag, 'table' ); #' |
630
|
|
|
|
|
|
|
} |
631
|
|
|
|
|
|
|
|
632
|
|
|
|
|
|
|
# Hmm, I guess this is right. To work it out: |
633
|
|
|
|
|
|
|
# tr closes any open tr (limited at a table) |
634
|
|
|
|
|
|
|
# thead closes any open thead (limited at a table) |
635
|
|
|
|
|
|
|
# tbody closes any open tbody (limited at a table) |
636
|
|
|
|
|
|
|
# tfoot closes any open tfoot (limited at a table) |
637
|
|
|
|
|
|
|
# colgroup closes any open colgroup (limited at a table) |
638
|
|
|
|
|
|
|
# col can try, but will always fail, at the enclosing table, |
639
|
|
|
|
|
|
|
# as col is empty, and therefore never open! |
640
|
|
|
|
|
|
|
# But! |
641
|
|
|
|
|
|
|
# td closes any open td OR th (limited at a table) |
642
|
|
|
|
|
|
|
# th closes any open th OR td (limited at a table) |
643
|
|
|
|
|
|
|
# ...implementable as "close to a tr, or make a tr" |
644
|
|
|
|
|
|
|
|
645
|
|
|
|
|
|
|
} |
646
|
|
|
|
|
|
|
elsif ( $HTML::TreeBuilder::isPhraseMarkup{$tag} ) { |
647
|
347
|
100
|
100
|
|
|
1012
|
if ( $ptag eq 'body' and $self->{'_implicit_body_p_tag'} ) { |
648
|
1
|
|
|
|
|
1
|
print |
649
|
|
|
|
|
|
|
" * Phrasal \U$tag\E right under BODY makes an implicit P\n" |
650
|
|
|
|
|
|
|
if DEBUG > 1; |
651
|
1
|
|
|
|
|
4
|
$pos = $self->insert_element( 'p', 1 ); |
652
|
|
|
|
|
|
|
|
653
|
|
|
|
|
|
|
# is $pos's value used after this? |
654
|
|
|
|
|
|
|
} |
655
|
|
|
|
|
|
|
} |
656
|
|
|
|
|
|
|
|
657
|
|
|
|
|
|
|
# End of implicit endings logic |
658
|
|
|
|
|
|
|
|
659
|
|
|
|
|
|
|
# End of "elsif ($HTML::TreeBuilder::isBodyElement{$tag}" |
660
|
|
|
|
|
|
|
#---------------------------------------------------------------------- |
661
|
|
|
|
|
|
|
|
662
|
|
|
|
|
|
|
} |
663
|
|
|
|
|
|
|
elsif ( $HTML::TreeBuilder::isHeadElement{$tag} ) { |
664
|
167
|
100
|
|
|
|
328
|
if ( $pos->is_inside('body') ) { |
|
|
100
|
|
|
|
|
|
665
|
1
|
|
|
|
|
1
|
print $indent, " * head element \U$tag\E found inside BODY!\n" |
666
|
|
|
|
|
|
|
if DEBUG; |
667
|
1
|
|
|
|
|
6
|
$self->warning("Header element <$tag> in body"); # [sic] |
668
|
|
|
|
|
|
|
} |
669
|
|
|
|
|
|
|
elsif ( !$pos->is_inside('head') ) { |
670
|
164
|
|
|
|
|
176
|
print $indent, |
671
|
|
|
|
|
|
|
" * head element \U$tag\E makes an implicit HEAD.\n" |
672
|
|
|
|
|
|
|
if DEBUG > 1; |
673
|
|
|
|
|
|
|
} |
674
|
|
|
|
|
|
|
else { |
675
|
2
|
|
|
|
|
3
|
print $indent, |
676
|
|
|
|
|
|
|
" * head element \U$tag\E goes inside existing HEAD.\n" |
677
|
|
|
|
|
|
|
if DEBUG > 1; |
678
|
|
|
|
|
|
|
} |
679
|
167
|
|
50
|
|
|
381
|
$self->{'_pos'} = $self->{'_head'} || die "Where'd my head go?"; |
680
|
|
|
|
|
|
|
|
681
|
|
|
|
|
|
|
#---------------------------------------------------------------------- |
682
|
|
|
|
|
|
|
} |
683
|
|
|
|
|
|
|
elsif ( $tag eq 'html' ) { |
684
|
4
|
50
|
|
|
|
12
|
if ( delete $self->{'_implicit'} ) { # first time here |
685
|
4
|
|
|
|
|
4
|
print $indent, " * good! found the real HTML element!\n" |
686
|
|
|
|
|
|
|
if DEBUG > 1; |
687
|
|
|
|
|
|
|
} |
688
|
|
|
|
|
|
|
else { |
689
|
0
|
|
|
|
|
0
|
print $indent, " * Found a second HTML element\n" |
690
|
|
|
|
|
|
|
if DEBUG; |
691
|
0
|
|
|
|
|
0
|
$self->warning("Found a nested element"); |
692
|
|
|
|
|
|
|
} |
693
|
|
|
|
|
|
|
|
694
|
|
|
|
|
|
|
# in either case, migrate attributes to the real element |
695
|
4
|
|
|
|
|
11
|
for ( keys %$attr ) { |
696
|
0
|
|
|
|
|
0
|
$self->attr( $_, $attr->{$_} ); |
697
|
|
|
|
|
|
|
} |
698
|
4
|
|
|
|
|
6
|
$self->{'_pos'} = undef; |
699
|
4
|
|
|
|
|
31
|
return $self; # bypass tweaking. |
700
|
|
|
|
|
|
|
|
701
|
|
|
|
|
|
|
#---------------------------------------------------------------------- |
702
|
|
|
|
|
|
|
} |
703
|
|
|
|
|
|
|
elsif ( $tag eq 'head' ) { |
704
|
6
|
|
50
|
|
|
60
|
my $head = $self->{'_head'} || die "Where'd my head go?"; |
705
|
6
|
50
|
|
|
|
16
|
if ( delete $head->{'_implicit'} ) { # first time here |
706
|
6
|
|
|
|
|
9
|
print $indent, " * good! found the real HEAD element!\n" |
707
|
|
|
|
|
|
|
if DEBUG > 1; |
708
|
|
|
|
|
|
|
} |
709
|
|
|
|
|
|
|
else { # been here before |
710
|
0
|
|
|
|
|
0
|
print $indent, " * Found a second HEAD element\n" |
711
|
|
|
|
|
|
|
if DEBUG; |
712
|
0
|
|
|
|
|
0
|
$self->warning("Found a second element"); |
713
|
|
|
|
|
|
|
} |
714
|
|
|
|
|
|
|
|
715
|
|
|
|
|
|
|
# in either case, migrate attributes to the real element |
716
|
6
|
|
|
|
|
15
|
for ( keys %$attr ) { |
717
|
0
|
|
|
|
|
0
|
$head->attr( $_, $attr->{$_} ); |
718
|
|
|
|
|
|
|
} |
719
|
6
|
|
|
|
|
37
|
return $self->{'_pos'} = $head; # bypass tweaking. |
720
|
|
|
|
|
|
|
|
721
|
|
|
|
|
|
|
#---------------------------------------------------------------------- |
722
|
|
|
|
|
|
|
} |
723
|
|
|
|
|
|
|
elsif ( $tag eq 'body' ) { |
724
|
6
|
|
50
|
|
|
20
|
my $body = $self->{'_body'} || die "Where'd my body go?"; |
725
|
6
|
50
|
|
|
|
15
|
if ( delete $body->{'_implicit'} ) { # first time here |
726
|
6
|
|
|
|
|
10
|
print $indent, " * good! found the real BODY element!\n" |
727
|
|
|
|
|
|
|
if DEBUG > 1; |
728
|
|
|
|
|
|
|
} |
729
|
|
|
|
|
|
|
else { # been here before |
730
|
0
|
|
|
|
|
0
|
print $indent, " * Found a second BODY element\n" |
731
|
|
|
|
|
|
|
if DEBUG; |
732
|
0
|
|
|
|
|
0
|
$self->warning("Found a second element"); |
733
|
|
|
|
|
|
|
} |
734
|
|
|
|
|
|
|
|
735
|
|
|
|
|
|
|
# in either case, migrate attributes to the real element |
736
|
6
|
|
|
|
|
18
|
for ( keys %$attr ) { |
737
|
0
|
|
|
|
|
0
|
$body->attr( $_, $attr->{$_} ); |
738
|
|
|
|
|
|
|
} |
739
|
6
|
|
|
|
|
43
|
return $self->{'_pos'} = $body; # bypass tweaking. |
740
|
|
|
|
|
|
|
|
741
|
|
|
|
|
|
|
#---------------------------------------------------------------------- |
742
|
|
|
|
|
|
|
} |
743
|
|
|
|
|
|
|
elsif ( $tag eq 'frameset' ) { |
744
|
0
|
0
|
0
|
|
|
0
|
if (!( $self->{'_frameset_seen'}++ ) # first frameset seen |
|
|
|
0
|
|
|
|
|
745
|
|
|
|
|
|
|
and !$self->{'_noframes_seen'} |
746
|
|
|
|
|
|
|
|
747
|
|
|
|
|
|
|
# otherwise it'll be under the noframes already |
748
|
|
|
|
|
|
|
and !$self->is_inside('body') |
749
|
|
|
|
|
|
|
) |
750
|
|
|
|
|
|
|
{ |
751
|
|
|
|
|
|
|
|
752
|
|
|
|
|
|
|
# The following is a bit of a hack. We don't use the normal |
753
|
|
|
|
|
|
|
# insert_element because 1) we don't want it as _pos, but instead |
754
|
|
|
|
|
|
|
# right under $self, and 2), more importantly, that we don't want |
755
|
|
|
|
|
|
|
# this inserted at the /end/ of $self's content_list, but instead |
756
|
|
|
|
|
|
|
# in the middle of it, specifically right before the body element. |
757
|
|
|
|
|
|
|
# |
758
|
0
|
|
0
|
|
|
0
|
my $c = $self->{'_content'} || die "Contentless root?"; |
759
|
0
|
|
0
|
|
|
0
|
my $body = $self->{'_body'} || die "Where'd my BODY go?"; |
760
|
0
|
|
|
|
|
0
|
for ( my $i = 0; $i < @$c; ++$i ) { |
761
|
0
|
0
|
|
|
|
0
|
if ( $c->[$i] eq $body ) { |
762
|
0
|
|
|
|
|
0
|
splice( @$c, $i, 0, $self->{'_pos'} = $pos = $e ); |
763
|
0
|
|
|
|
|
0
|
HTML::Element::_weaken($e->{'_parent'} = $self); |
764
|
0
|
|
|
|
|
0
|
$already_inserted = 1; |
765
|
0
|
|
|
|
|
0
|
print $indent, |
766
|
|
|
|
|
|
|
" * inserting 'frameset' right before BODY.\n" |
767
|
|
|
|
|
|
|
if DEBUG > 1; |
768
|
0
|
|
|
|
|
0
|
last; |
769
|
|
|
|
|
|
|
} |
770
|
|
|
|
|
|
|
} |
771
|
0
|
0
|
|
|
|
0
|
die "BODY not found in children of root?" |
772
|
|
|
|
|
|
|
unless $already_inserted; |
773
|
|
|
|
|
|
|
} |
774
|
|
|
|
|
|
|
|
775
|
|
|
|
|
|
|
} |
776
|
|
|
|
|
|
|
elsif ( $tag eq 'frame' ) { |
777
|
|
|
|
|
|
|
|
778
|
|
|
|
|
|
|
# Okay, fine, pass thru. |
779
|
|
|
|
|
|
|
# Should probably enforce that these should be under a frameset. |
780
|
|
|
|
|
|
|
# But hey. Ditto for enforcing that 'noframes' should be under |
781
|
|
|
|
|
|
|
# a 'frameset', as the DTDs say. |
782
|
|
|
|
|
|
|
|
783
|
|
|
|
|
|
|
} |
784
|
|
|
|
|
|
|
elsif ( $tag eq 'noframes' ) { |
785
|
|
|
|
|
|
|
|
786
|
|
|
|
|
|
|
# This basically assumes there'll be exactly one 'noframes' element |
787
|
|
|
|
|
|
|
# per document. At least, only the first one gets to have the |
788
|
|
|
|
|
|
|
# body under it. And if there are no noframes elements, then |
789
|
|
|
|
|
|
|
# the body pretty much stays where it is. Is that ever a problem? |
790
|
0
|
0
|
|
|
|
0
|
if ( $self->{'_noframes_seen'}++ ) { |
791
|
0
|
|
|
|
|
0
|
print $indent, " * ANOTHER noframes element?\n" if DEBUG; |
792
|
|
|
|
|
|
|
} |
793
|
|
|
|
|
|
|
else { |
794
|
0
|
0
|
|
|
|
0
|
if ( $pos->is_inside('body') ) { |
795
|
0
|
|
|
|
|
0
|
print $indent, " * 'noframes' inside 'body'. Odd!\n" |
796
|
|
|
|
|
|
|
if DEBUG; |
797
|
|
|
|
|
|
|
|
798
|
|
|
|
|
|
|
# In that odd case, we /can't/ make body a child of 'noframes', |
799
|
|
|
|
|
|
|
# because it's an ancestor of the 'noframes'! |
800
|
|
|
|
|
|
|
} |
801
|
|
|
|
|
|
|
else { |
802
|
0
|
|
0
|
|
|
0
|
$e->push_content( $self->{'_body'} |
803
|
|
|
|
|
|
|
|| die "Where'd my body go?" ); |
804
|
0
|
|
|
|
|
0
|
print $indent, " * Moving body to be under noframes.\n" |
805
|
|
|
|
|
|
|
if DEBUG; |
806
|
|
|
|
|
|
|
} |
807
|
|
|
|
|
|
|
} |
808
|
|
|
|
|
|
|
|
809
|
|
|
|
|
|
|
#---------------------------------------------------------------------- |
810
|
|
|
|
|
|
|
} |
811
|
|
|
|
|
|
|
else { |
812
|
|
|
|
|
|
|
|
813
|
|
|
|
|
|
|
# unknown tag |
814
|
0
|
0
|
|
|
|
0
|
if ( $self->{'_ignore_unknown'} ) { |
815
|
0
|
|
|
|
|
0
|
print $indent, " * Ignoring unknown tag \U$tag\E\n" if DEBUG; |
816
|
0
|
|
|
|
|
0
|
$self->warning("Skipping unknown tag $tag"); |
817
|
0
|
|
|
|
|
0
|
return; |
818
|
|
|
|
|
|
|
} |
819
|
|
|
|
|
|
|
else { |
820
|
0
|
|
|
|
|
0
|
print $indent, " * Accepting unknown tag \U$tag\E\n" |
821
|
|
|
|
|
|
|
if DEBUG; |
822
|
|
|
|
|
|
|
} |
823
|
|
|
|
|
|
|
} |
824
|
|
|
|
|
|
|
|
825
|
|
|
|
|
|
|
#---------------------------------------------------------------------- |
826
|
|
|
|
|
|
|
# End of mumbo-jumbo |
827
|
|
|
|
|
|
|
|
828
|
|
|
|
|
|
|
print $indent, "(Attaching ", $e->{'_tag'}, " under ", |
829
|
989
|
|
|
|
|
1178
|
( $self->{'_pos'} || $self )->{'_tag'}, ")\n" |
830
|
|
|
|
|
|
|
|
831
|
|
|
|
|
|
|
# because if _pos isn't defined, it goes under self |
832
|
|
|
|
|
|
|
if DEBUG; |
833
|
|
|
|
|
|
|
|
834
|
|
|
|
|
|
|
# The following if-clause is to delete /some/ ignorable whitespace |
835
|
|
|
|
|
|
|
# nodes, as we're making the tree. |
836
|
|
|
|
|
|
|
# This'd be a node we'd catch later anyway, but we might as well |
837
|
|
|
|
|
|
|
# nip it in the bud now. |
838
|
|
|
|
|
|
|
# This doesn't catch /all/ deletable WS-nodes, so we do have to call |
839
|
|
|
|
|
|
|
# the tightener later to catch the rest. |
840
|
|
|
|
|
|
|
|
841
|
989
|
100
|
66
|
|
|
2862
|
if ( $self->{'_tighten'} and !$self->{'_ignore_text'} ) |
842
|
|
|
|
|
|
|
{ # if tightenable |
843
|
980
|
|
|
|
|
1221
|
my ( $sibs, $par ); |
844
|
980
|
100
|
66
|
|
|
5251
|
if (( $sibs = ( $par = $self->{'_pos'} || $self )->{'_content'} ) |
|
|
|
66
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
66
|
|
|
|
|
845
|
|
|
|
|
|
|
and @$sibs # parent already has content |
846
|
|
|
|
|
|
|
and ! |
847
|
|
|
|
|
|
|
ref( $sibs->[-1] ) # and the last one there is a text node |
848
|
|
|
|
|
|
|
and $sibs->[-1] !~ m<[^\n\r\f\t ]>s # and it's all whitespace |
849
|
|
|
|
|
|
|
|
850
|
|
|
|
|
|
|
and ( # one of these has to be eligible... |
851
|
|
|
|
|
|
|
$HTML::TreeBuilder::canTighten{$tag} |
852
|
|
|
|
|
|
|
or (( @$sibs == 1 ) |
853
|
|
|
|
|
|
|
? # WS is leftmost -- so parent matters |
854
|
|
|
|
|
|
|
$HTML::TreeBuilder::canTighten{ $par->{'_tag'} } |
855
|
|
|
|
|
|
|
: # WS is after another node -- it matters |
856
|
|
|
|
|
|
|
( ref $sibs->[-2] |
857
|
|
|
|
|
|
|
and |
858
|
|
|
|
|
|
|
$HTML::TreeBuilder::canTighten{ $sibs->[-2] |
859
|
|
|
|
|
|
|
{'_tag'} } |
860
|
|
|
|
|
|
|
) |
861
|
|
|
|
|
|
|
) |
862
|
|
|
|
|
|
|
) |
863
|
|
|
|
|
|
|
|
864
|
|
|
|
|
|
|
and !$par->is_inside( 'pre', 'xmp', 'textarea', 'plaintext' ) |
865
|
|
|
|
|
|
|
|
866
|
|
|
|
|
|
|
# we're clear |
867
|
|
|
|
|
|
|
) |
868
|
|
|
|
|
|
|
{ |
869
|
34
|
|
|
|
|
48
|
pop @$sibs; |
870
|
34
|
|
|
|
|
51
|
print $indent, "Popping a preceding all-WS node\n" if DEBUG; |
871
|
|
|
|
|
|
|
} |
872
|
|
|
|
|
|
|
} |
873
|
|
|
|
|
|
|
|
874
|
989
|
50
|
|
|
|
2927
|
$self->insert_element($e) unless $already_inserted; |
875
|
|
|
|
|
|
|
|
876
|
989
|
|
|
|
|
1048
|
if (DEBUG) { |
877
|
|
|
|
|
|
|
if ( $self->{'_pos'} ) { |
878
|
|
|
|
|
|
|
print $indent, "(Current lineage of pos: \U$tag\E under ", |
879
|
|
|
|
|
|
|
join( |
880
|
|
|
|
|
|
|
'/', |
881
|
|
|
|
|
|
|
reverse( |
882
|
|
|
|
|
|
|
|
883
|
|
|
|
|
|
|
# $self->{'_pos'}{'_tag'}, # don't list myself! |
884
|
|
|
|
|
|
|
$self->{'_pos'}->lineage_tag_names |
885
|
|
|
|
|
|
|
) |
886
|
|
|
|
|
|
|
), |
887
|
|
|
|
|
|
|
".)\n"; |
888
|
|
|
|
|
|
|
} |
889
|
|
|
|
|
|
|
else { |
890
|
|
|
|
|
|
|
print $indent, "(Pos points nowhere!?)\n"; |
891
|
|
|
|
|
|
|
} |
892
|
|
|
|
|
|
|
} |
893
|
|
|
|
|
|
|
|
894
|
989
|
100
|
50
|
|
|
2694
|
unless ( ( $self->{'_pos'} || '' ) eq $e ) { |
895
|
|
|
|
|
|
|
|
896
|
|
|
|
|
|
|
# if it's an empty element -- i.e., if it didn't change the _pos |
897
|
15
|
|
|
|
|
41
|
&{ $self->{"_tweak_$tag"} |
898
|
15
|
50
|
33
|
|
|
143
|
|| $self->{'_tweak_*'} |
899
|
|
|
|
|
|
|
|| return $e }( map $_, $e, $tag, $self ) |
900
|
|
|
|
|
|
|
; # make a list so the user can't clobber |
901
|
|
|
|
|
|
|
} |
902
|
|
|
|
|
|
|
|
903
|
974
|
|
|
|
|
5540
|
return $e; |
904
|
|
|
|
|
|
|
} |
905
|
|
|
|
|
|
|
} |
906
|
|
|
|
|
|
|
|
907
|
|
|
|
|
|
|
#========================================================================== |
908
|
|
|
|
|
|
|
|
909
|
|
|
|
|
|
|
{ |
910
|
|
|
|
|
|
|
my $indent; |
911
|
|
|
|
|
|
|
|
912
|
|
|
|
|
|
|
sub end { |
913
|
1400
|
50
|
|
1400
|
1
|
2838
|
return if $_[0]{'_stunted'}; |
914
|
|
|
|
|
|
|
|
915
|
|
|
|
|
|
|
# Either: Acccept an end-tag signal from HTML::Parser |
916
|
|
|
|
|
|
|
# Or: Method for closing currently open elements in some fairly complex |
917
|
|
|
|
|
|
|
# way, as used by other methods in this class. |
918
|
1400
|
|
|
|
|
3004
|
my ( $self, $tag, @stop ) = @_; |
919
|
1400
|
50
|
|
|
|
2416
|
if ( $tag eq 'x-html' ) { |
920
|
0
|
|
|
|
|
0
|
print "Ignoring close-x-html tag.\n" if DEBUG; |
921
|
|
|
|
|
|
|
|
922
|
|
|
|
|
|
|
# inserted by some lame code-generators. |
923
|
0
|
|
|
|
|
0
|
return; |
924
|
|
|
|
|
|
|
} |
925
|
|
|
|
|
|
|
|
926
|
1400
|
50
|
66
|
|
|
4818
|
unless ( ref($tag) or $tag =~ m/^[-_a-zA-Z0-9:%]+$/s ) { |
927
|
0
|
|
|
|
|
0
|
DEBUG and print "End-tag name $tag is no good. Skipping.\n"; |
928
|
0
|
|
|
|
|
0
|
return; |
929
|
|
|
|
|
|
|
|
930
|
|
|
|
|
|
|
# This avoids having Element's new() throw an exception. |
931
|
|
|
|
|
|
|
} |
932
|
|
|
|
|
|
|
|
933
|
|
|
|
|
|
|
# This method accepts two calling formats: |
934
|
|
|
|
|
|
|
# 1) from Parser: $self->end('tag_name', 'origtext') |
935
|
|
|
|
|
|
|
# in which case we shouldn't mistake origtext as a blocker tag |
936
|
|
|
|
|
|
|
# 2) from myself: $self->end(\q{tagname1}, 'blk1', ... ) |
937
|
|
|
|
|
|
|
# from myself: $self->end(['tagname1', 'tagname2'], 'blk1', ... ) |
938
|
|
|
|
|
|
|
|
939
|
|
|
|
|
|
|
# End the specified tag, but don't move above any of the blocker tags. |
940
|
|
|
|
|
|
|
# The tag can also be a reference to an array. Terminate the first |
941
|
|
|
|
|
|
|
# tag found. |
942
|
|
|
|
|
|
|
|
943
|
1400
|
|
66
|
|
|
2886
|
my $ptag = ( my $p = $self->{'_pos'} || $self )->{'_tag'}; |
944
|
|
|
|
|
|
|
|
945
|
|
|
|
|
|
|
# $p and $ptag are sort-of stratch |
946
|
|
|
|
|
|
|
|
947
|
1400
|
100
|
|
|
|
2055
|
if ( ref($tag) ) { |
948
|
|
|
|
|
|
|
|
949
|
|
|
|
|
|
|
# First param is a ref of one sort or another -- |
950
|
|
|
|
|
|
|
# THE CALL IS COMING FROM INSIDE THE HOUSE! |
951
|
336
|
100
|
|
|
|
606
|
$tag = $$tag if ref($tag) eq 'SCALAR'; |
952
|
|
|
|
|
|
|
|
953
|
|
|
|
|
|
|
# otherwise it's an arrayref. |
954
|
|
|
|
|
|
|
} |
955
|
|
|
|
|
|
|
else { |
956
|
|
|
|
|
|
|
|
957
|
|
|
|
|
|
|
# the call came from Parser -- just ignore origtext |
958
|
|
|
|
|
|
|
# except in a table ignore unmatched table tags RT #59980 |
959
|
1064
|
100
|
|
|
|
2274
|
@stop = $tag =~ /^t[hdr]\z/ ? 'table' : (); |
960
|
|
|
|
|
|
|
} |
961
|
|
|
|
|
|
|
|
962
|
|
|
|
|
|
|
#my($indent); |
963
|
1400
|
|
|
|
|
1518
|
if (DEBUG) { |
964
|
|
|
|
|
|
|
|
965
|
|
|
|
|
|
|
# optimization -- don't figure out depth unless we're in debug mode |
966
|
|
|
|
|
|
|
my @lineage_tags = $p->lineage_tag_names; |
967
|
|
|
|
|
|
|
$indent = ' ' x ( 1 + @lineage_tags ); |
968
|
|
|
|
|
|
|
|
969
|
|
|
|
|
|
|
# now announce ourselves |
970
|
|
|
|
|
|
|
print $indent, "Ending ", |
971
|
|
|
|
|
|
|
ref($tag) ? ( '[', join( ' ', @$tag ), ']' ) : "\U$tag\E", |
972
|
|
|
|
|
|
|
scalar(@stop) |
973
|
|
|
|
|
|
|
? ( " no higher than [", join( ' ', @stop ), "]" ) |
974
|
|
|
|
|
|
|
: (), ".\n"; |
975
|
|
|
|
|
|
|
|
976
|
|
|
|
|
|
|
print $indent, " (Current lineage: ", join( '/', @lineage_tags ), |
977
|
|
|
|
|
|
|
".)\n" |
978
|
|
|
|
|
|
|
if DEBUG > 1; |
979
|
|
|
|
|
|
|
|
980
|
|
|
|
|
|
|
if ( DEBUG > 3 ) { |
981
|
|
|
|
|
|
|
|
982
|
|
|
|
|
|
|
#my( |
983
|
|
|
|
|
|
|
# $package, $filename, $line, $subroutine, |
984
|
|
|
|
|
|
|
# $hasargs, $wantarray, $evaltext, $is_require) = caller; |
985
|
|
|
|
|
|
|
print $indent, |
986
|
|
|
|
|
|
|
" (Called from ", ( caller(1) )[3], ' line ', |
987
|
|
|
|
|
|
|
( caller(1) )[2], |
988
|
|
|
|
|
|
|
")\n"; |
989
|
|
|
|
|
|
|
} |
990
|
|
|
|
|
|
|
|
991
|
|
|
|
|
|
|
#} else { |
992
|
|
|
|
|
|
|
# $indent = ' '; |
993
|
|
|
|
|
|
|
} |
994
|
|
|
|
|
|
|
|
995
|
|
|
|
|
|
|
# End of if DEBUG |
996
|
|
|
|
|
|
|
|
997
|
|
|
|
|
|
|
# Now actually do it |
998
|
1400
|
|
|
|
|
1533
|
my @to_close; |
999
|
1400
|
100
|
|
|
|
2428
|
if ( $tag eq '*' ) { |
|
|
100
|
|
|
|
|
|
1000
|
|
|
|
|
|
|
|
1001
|
|
|
|
|
|
|
# Special -- close everything up to (but not including) the first |
1002
|
|
|
|
|
|
|
# limiting tag, or return if none found. Somewhat of a special case. |
1003
|
|
|
|
|
|
|
PARENT: |
1004
|
35
|
|
|
|
|
62
|
while ( defined $p ) { |
1005
|
68
|
|
|
|
|
102
|
$ptag = $p->{'_tag'}; |
1006
|
68
|
|
|
|
|
70
|
print $indent, " (Looking at $ptag.)\n" if DEBUG > 2; |
1007
|
68
|
|
|
|
|
95
|
for (@stop) { |
1008
|
113
|
100
|
|
|
|
223
|
if ( $ptag eq $_ ) { |
1009
|
35
|
|
|
|
|
42
|
print $indent, |
1010
|
|
|
|
|
|
|
" (Hit a $_; closing everything up to here.)\n" |
1011
|
|
|
|
|
|
|
if DEBUG > 2; |
1012
|
35
|
|
|
|
|
59
|
last PARENT; |
1013
|
|
|
|
|
|
|
} |
1014
|
|
|
|
|
|
|
} |
1015
|
33
|
|
|
|
|
46
|
push @to_close, $p; |
1016
|
33
|
|
|
|
|
43
|
$p = $p->{'_parent'}; # no match so far? keep moving up |
1017
|
|
|
|
|
|
|
print $indent, |
1018
|
33
|
|
|
|
|
53
|
" (Moving on up to ", $p ? $p->{'_tag'} : 'nil', ")\n" |
1019
|
|
|
|
|
|
|
if DEBUG > 1; |
1020
|
|
|
|
|
|
|
} |
1021
|
35
|
50
|
|
|
|
57
|
unless ( defined $p ) { # We never found what we were looking for. |
1022
|
0
|
|
|
|
|
0
|
print $indent, " (We never found a limit.)\n" if DEBUG > 1; |
1023
|
0
|
|
|
|
|
0
|
return; |
1024
|
|
|
|
|
|
|
} |
1025
|
|
|
|
|
|
|
|
1026
|
|
|
|
|
|
|
#print |
1027
|
|
|
|
|
|
|
# $indent, |
1028
|
|
|
|
|
|
|
# " (To close: ", join('/', map $_->tag, @to_close), ".)\n" |
1029
|
|
|
|
|
|
|
# if DEBUG > 4; |
1030
|
|
|
|
|
|
|
|
1031
|
|
|
|
|
|
|
# Otherwise update pos and fall thru. |
1032
|
35
|
|
|
|
|
48
|
$self->{'_pos'} = $p; |
1033
|
|
|
|
|
|
|
} |
1034
|
|
|
|
|
|
|
elsif ( ref $tag ) { |
1035
|
|
|
|
|
|
|
|
1036
|
|
|
|
|
|
|
# Close the first of any of the matching tags, giving up if you hit |
1037
|
|
|
|
|
|
|
# any of the stop-tags. |
1038
|
|
|
|
|
|
|
PARENT: |
1039
|
243
|
|
|
|
|
405
|
while ( defined $p ) { |
1040
|
463
|
|
|
|
|
579
|
$ptag = $p->{'_tag'}; |
1041
|
463
|
|
|
|
|
765
|
print $indent, " (Looking at $ptag.)\n" if DEBUG > 2; |
1042
|
463
|
|
|
|
|
678
|
for (@$tag) { |
1043
|
3983
|
100
|
|
|
|
5464
|
if ( $ptag eq $_ ) { |
1044
|
23
|
|
|
|
|
31
|
print $indent, " (Closing $_.)\n" if DEBUG > 2; |
1045
|
23
|
|
|
|
|
37
|
last PARENT; |
1046
|
|
|
|
|
|
|
} |
1047
|
|
|
|
|
|
|
} |
1048
|
440
|
|
|
|
|
543
|
for (@stop) { |
1049
|
6600
|
50
|
|
|
|
8987
|
if ( $ptag eq $_ ) { |
1050
|
0
|
|
|
|
|
0
|
print $indent, |
1051
|
|
|
|
|
|
|
" (Hit a limiting $_ -- bailing out.)\n" |
1052
|
|
|
|
|
|
|
if DEBUG > 1; |
1053
|
0
|
|
|
|
|
0
|
return; # so it was all for naught |
1054
|
|
|
|
|
|
|
} |
1055
|
|
|
|
|
|
|
} |
1056
|
440
|
|
|
|
|
561
|
push @to_close, $p; |
1057
|
440
|
|
|
|
|
742
|
$p = $p->{'_parent'}; |
1058
|
|
|
|
|
|
|
} |
1059
|
243
|
100
|
|
|
|
698
|
return unless defined $p; # We went off the top of the tree. |
1060
|
|
|
|
|
|
|
# Otherwise specified element was found; set pos to its parent. |
1061
|
23
|
|
|
|
|
30
|
push @to_close, $p; |
1062
|
23
|
|
|
|
|
38
|
$self->{'_pos'} = $p->{'_parent'}; |
1063
|
|
|
|
|
|
|
} |
1064
|
|
|
|
|
|
|
else { |
1065
|
|
|
|
|
|
|
|
1066
|
|
|
|
|
|
|
# Close the first of the specified tag, giving up if you hit |
1067
|
|
|
|
|
|
|
# any of the stop-tags. |
1068
|
1122
|
|
|
|
|
1883
|
while ( defined $p ) { |
1069
|
1511
|
|
|
|
|
1962
|
$ptag = $p->{'_tag'}; |
1070
|
1511
|
|
|
|
|
1561
|
print $indent, " (Looking at $ptag.)\n" if DEBUG > 2; |
1071
|
1511
|
100
|
|
|
|
2661
|
if ( $ptag eq $tag ) { |
1072
|
1092
|
|
|
|
|
1109
|
print $indent, " (Closing $tag.)\n" if DEBUG > 2; |
1073
|
1092
|
|
|
|
|
1340
|
last; |
1074
|
|
|
|
|
|
|
} |
1075
|
419
|
|
|
|
|
583
|
for (@stop) { |
1076
|
58
|
100
|
|
|
|
121
|
if ( $ptag eq $_ ) { |
1077
|
29
|
|
|
|
|
33
|
print $indent, |
1078
|
|
|
|
|
|
|
" (Hit a limiting $_ -- bailing out.)\n" |
1079
|
|
|
|
|
|
|
if DEBUG > 1; |
1080
|
29
|
|
|
|
|
69
|
return; # so it was all for naught |
1081
|
|
|
|
|
|
|
} |
1082
|
|
|
|
|
|
|
} |
1083
|
390
|
|
|
|
|
511
|
push @to_close, $p; |
1084
|
390
|
|
|
|
|
640
|
$p = $p->{'_parent'}; |
1085
|
|
|
|
|
|
|
} |
1086
|
1093
|
100
|
|
|
|
1610
|
return unless defined $p; # We went off the top of the tree. |
1087
|
|
|
|
|
|
|
# Otherwise specified element was found; set pos to its parent. |
1088
|
1092
|
|
|
|
|
1384
|
push @to_close, $p; |
1089
|
1092
|
|
|
|
|
1527
|
$self->{'_pos'} = $p->{'_parent'}; |
1090
|
|
|
|
|
|
|
} |
1091
|
|
|
|
|
|
|
|
1092
|
1150
|
100
|
100
|
|
|
3218
|
$self->{'_pos'} = undef if $self eq ( $self->{'_pos'} || '' ); |
1093
|
|
|
|
|
|
|
print $indent, "(Pos now points to ", |
1094
|
1150
|
|
|
|
|
1244
|
$self->{'_pos'} ? $self->{'_pos'}{'_tag'} : '???', ".)\n" |
1095
|
|
|
|
|
|
|
if DEBUG > 1; |
1096
|
|
|
|
|
|
|
|
1097
|
|
|
|
|
|
|
### EXPENSIVE, because has to check that it's not under a pre |
1098
|
|
|
|
|
|
|
### or a CDATA-parent. That's one more method call per end()! |
1099
|
|
|
|
|
|
|
### Might as well just do this at the end of the tree-parse, I guess, |
1100
|
|
|
|
|
|
|
### at which point we'd be parsing top-down, and just not traversing |
1101
|
|
|
|
|
|
|
### under pre's or CDATA-parents. |
1102
|
|
|
|
|
|
|
## |
1103
|
|
|
|
|
|
|
## Take this opportunity to nix any terminal whitespace nodes. |
1104
|
|
|
|
|
|
|
## TODO: consider whether this (plus the logic in start(), above) |
1105
|
|
|
|
|
|
|
## would ever leave any WS nodes in the tree. |
1106
|
|
|
|
|
|
|
## If not, then there's no reason to have eof() call |
1107
|
|
|
|
|
|
|
## delete_ignorable_whitespace on the tree, is there? |
1108
|
|
|
|
|
|
|
## |
1109
|
|
|
|
|
|
|
#if(@to_close and $self->{'_tighten'} and !$self->{'_ignore_text'} and |
1110
|
|
|
|
|
|
|
# ! $to_close[-1]->is_inside('pre', keys %HTML::Tagset::isCDATA_Parent) |
1111
|
|
|
|
|
|
|
#) { # if tightenable |
1112
|
|
|
|
|
|
|
# my($children, $e_tag); |
1113
|
|
|
|
|
|
|
# foreach my $e (reverse @to_close) { # going top-down |
1114
|
|
|
|
|
|
|
# last if 'pre' eq ($e_tag = $e->{'_tag'}) or |
1115
|
|
|
|
|
|
|
# $HTML::Tagset::isCDATA_Parent{$e_tag}; |
1116
|
|
|
|
|
|
|
# |
1117
|
|
|
|
|
|
|
# if( |
1118
|
|
|
|
|
|
|
# $children = $e->{'_content'} |
1119
|
|
|
|
|
|
|
# and @$children # has children |
1120
|
|
|
|
|
|
|
# and !ref($children->[-1]) |
1121
|
|
|
|
|
|
|
# and $children->[-1] =~ m<^\s+$>s # last node is all-WS |
1122
|
|
|
|
|
|
|
# and |
1123
|
|
|
|
|
|
|
# ( |
1124
|
|
|
|
|
|
|
# # has a tightable parent: |
1125
|
|
|
|
|
|
|
# $HTML::TreeBuilder::canTighten{ $e_tag } |
1126
|
|
|
|
|
|
|
# or |
1127
|
|
|
|
|
|
|
# ( # has a tightenable left sibling: |
1128
|
|
|
|
|
|
|
# @$children > 1 and |
1129
|
|
|
|
|
|
|
# ref($children->[-2]) |
1130
|
|
|
|
|
|
|
# and $HTML::TreeBuilder::canTighten{ $children->[-2]{'_tag'} } |
1131
|
|
|
|
|
|
|
# ) |
1132
|
|
|
|
|
|
|
# ) |
1133
|
|
|
|
|
|
|
# ) { |
1134
|
|
|
|
|
|
|
# pop @$children; |
1135
|
|
|
|
|
|
|
# #print $indent, "Popping a terminal WS node from ", $e->{'_tag'}, |
1136
|
|
|
|
|
|
|
# # " (", $e->address, ") while exiting.\n" if DEBUG; |
1137
|
|
|
|
|
|
|
# } |
1138
|
|
|
|
|
|
|
# } |
1139
|
|
|
|
|
|
|
#} |
1140
|
|
|
|
|
|
|
|
1141
|
1150
|
|
|
|
|
1469
|
foreach my $e (@to_close) { |
1142
|
|
|
|
|
|
|
|
1143
|
|
|
|
|
|
|
# Call the applicable callback, if any |
1144
|
1535
|
|
|
|
|
1920
|
$ptag = $e->{'_tag'}; |
1145
|
1535
|
|
|
|
|
3507
|
&{ $self->{"_tweak_$ptag"} |
1146
|
1535
|
50
|
33
|
|
|
5731
|
|| $self->{'_tweak_*'} |
1147
|
|
|
|
|
|
|
|| next }( map $_, $e, $ptag, $self ); |
1148
|
0
|
|
|
|
|
0
|
print $indent, "Back from tweaking.\n" if DEBUG; |
1149
|
|
|
|
|
|
|
last |
1150
|
0
|
0
|
|
|
|
0
|
if $self->{ '_stunted' |
1151
|
|
|
|
|
|
|
}; # in case one of the handlers called stunt |
1152
|
|
|
|
|
|
|
} |
1153
|
1150
|
|
|
|
|
3992
|
return @to_close; |
1154
|
|
|
|
|
|
|
} |
1155
|
|
|
|
|
|
|
} |
1156
|
|
|
|
|
|
|
|
1157
|
|
|
|
|
|
|
#========================================================================== |
1158
|
|
|
|
|
|
|
{ |
1159
|
|
|
|
|
|
|
my ( $indent, $nugget ); |
1160
|
|
|
|
|
|
|
|
1161
|
|
|
|
|
|
|
sub text { |
1162
|
2152
|
50
|
|
2152
|
1
|
61184
|
return if $_[0]{'_stunted'}; |
1163
|
|
|
|
|
|
|
|
1164
|
|
|
|
|
|
|
# Accept a "here's a text token" signal from HTML::Parser. |
1165
|
2152
|
|
|
|
|
3839
|
my ( $self, $text, $is_cdata ) = @_; |
1166
|
|
|
|
|
|
|
|
1167
|
|
|
|
|
|
|
# the >3.0 versions of Parser may pass a cdata node. |
1168
|
|
|
|
|
|
|
# Thanks to Gisle Aas for pointing this out. |
1169
|
|
|
|
|
|
|
|
1170
|
2152
|
50
|
|
|
|
3648
|
return unless length $text; # I guess that's always right |
1171
|
|
|
|
|
|
|
|
1172
|
2152
|
|
|
|
|
2710
|
my $ignore_text = $self->{'_ignore_text'}; |
1173
|
2152
|
|
|
|
|
2568
|
my $no_space_compacting = $self->{'_no_space_compacting'}; |
1174
|
2152
|
|
|
|
|
2507
|
my $no_expand_entities = $self->{'_no_expand_entities'}; |
1175
|
2152
|
|
66
|
|
|
3752
|
my $pos = $self->{'_pos'} || $self; |
1176
|
|
|
|
|
|
|
|
1177
|
|
|
|
|
|
|
HTML::Entities::decode($text) |
1178
|
|
|
|
|
|
|
unless $ignore_text |
1179
|
|
|
|
|
|
|
|| $is_cdata |
1180
|
2152
|
100
|
33
|
|
|
13648
|
|| $HTML::Tagset::isCDATA_Parent{ $pos->{'_tag'} } |
|
|
|
33
|
|
|
|
|
|
|
|
66
|
|
|
|
|
1181
|
|
|
|
|
|
|
|| $no_expand_entities; |
1182
|
|
|
|
|
|
|
|
1183
|
|
|
|
|
|
|
#my($indent, $nugget); |
1184
|
2152
|
|
|
|
|
2538
|
if (DEBUG) { |
1185
|
|
|
|
|
|
|
|
1186
|
|
|
|
|
|
|
# optimization -- don't figure out depth unless we're in debug mode |
1187
|
|
|
|
|
|
|
my @lineage_tags = $pos->lineage_tag_names; |
1188
|
|
|
|
|
|
|
$indent = ' ' x ( 1 + @lineage_tags ); |
1189
|
|
|
|
|
|
|
|
1190
|
|
|
|
|
|
|
$nugget |
1191
|
|
|
|
|
|
|
= ( length($text) <= 25 ) |
1192
|
|
|
|
|
|
|
? $text |
1193
|
|
|
|
|
|
|
: ( substr( $text, 0, 25 ) . '...' ); |
1194
|
|
|
|
|
|
|
$nugget =~ s<([\x00-\x1F])> |
1195
|
|
|
|
|
|
|
<'\\x'.(unpack("H2",$1))>eg; |
1196
|
|
|
|
|
|
|
print $indent, "Proposing a new text node ($nugget) under ", |
1197
|
|
|
|
|
|
|
join( '/', reverse( $pos->{'_tag'}, @lineage_tags ) ) |
1198
|
|
|
|
|
|
|
|| 'Root', |
1199
|
|
|
|
|
|
|
".\n"; |
1200
|
|
|
|
|
|
|
|
1201
|
|
|
|
|
|
|
#} else { |
1202
|
|
|
|
|
|
|
# $indent = ' '; |
1203
|
|
|
|
|
|
|
} |
1204
|
2152
|
|
|
|
|
2294
|
|
1205
|
2152
|
50
|
33
|
|
|
6325
|
my $ptag; |
1206
|
|
|
|
|
|
|
if ($HTML::Tagset::isCDATA_Parent{ $ptag = $pos->{'_tag'} } |
1207
|
|
|
|
|
|
|
|
1208
|
|
|
|
|
|
|
#or $pos->is_inside('pre') |
1209
|
|
|
|
|
|
|
or $pos->is_inside( 'pre', 'textarea' ) |
1210
|
|
|
|
|
|
|
) |
1211
|
0
|
0
|
|
|
|
0
|
{ |
1212
|
0
|
|
|
|
|
0
|
return if $ignore_text; |
1213
|
|
|
|
|
|
|
$pos->push_content($text); |
1214
|
|
|
|
|
|
|
} |
1215
|
|
|
|
|
|
|
else { |
1216
|
|
|
|
|
|
|
|
1217
|
|
|
|
|
|
|
# return unless $text =~ /\S/; # This is sometimes wrong |
1218
|
2152
|
100
|
100
|
|
|
11238
|
|
|
|
50
|
33
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
1219
|
|
|
|
|
|
|
if ( !$self->{'_implicit_tags'} || $text !~ /[^\n\r\f\t ]/ ) { |
1220
|
|
|
|
|
|
|
|
1221
|
|
|
|
|
|
|
# don't change anything |
1222
|
|
|
|
|
|
|
} |
1223
|
0
|
0
|
|
|
|
0
|
elsif ( $ptag eq 'head' or $ptag eq 'noframes' ) { |
1224
|
0
|
|
|
|
|
0
|
if ( $self->{'_implicit_body_p_tag'} ) { |
1225
|
|
|
|
|
|
|
print $indent, |
1226
|
|
|
|
|
|
|
" * Text node under \U$ptag\E closes \U$ptag\E, implicates BODY and P.\n" |
1227
|
0
|
|
|
|
|
0
|
if DEBUG > 1; |
1228
|
|
|
|
|
|
|
$self->end( \$ptag ); |
1229
|
|
|
|
|
|
|
$pos = $self->{'_body'} |
1230
|
0
|
0
|
|
|
|
0
|
? ( $self->{'_pos'} |
1231
|
|
|
|
|
|
|
= $self->{'_body'} ) # expected case |
1232
|
0
|
|
|
|
|
0
|
: $self->insert_element( 'body', 1 ); |
1233
|
|
|
|
|
|
|
$pos = $self->insert_element( 'p', 1 ); |
1234
|
|
|
|
|
|
|
} |
1235
|
0
|
|
|
|
|
0
|
else { |
1236
|
|
|
|
|
|
|
print $indent, |
1237
|
|
|
|
|
|
|
" * Text node under \U$ptag\E closes, implicates BODY.\n" |
1238
|
0
|
|
|
|
|
0
|
if DEBUG > 1; |
1239
|
|
|
|
|
|
|
$self->end( \$ptag ); |
1240
|
|
|
|
|
|
|
$pos = $self->{'_body'} |
1241
|
0
|
0
|
|
|
|
0
|
? ( $self->{'_pos'} |
1242
|
|
|
|
|
|
|
= $self->{'_body'} ) # expected case |
1243
|
|
|
|
|
|
|
: $self->insert_element( 'body', 1 ); |
1244
|
|
|
|
|
|
|
} |
1245
|
|
|
|
|
|
|
} |
1246
|
22
|
100
|
|
|
|
48
|
elsif ( $ptag eq 'html' ) { |
1247
|
1
|
|
|
|
|
2
|
if ( $self->{'_implicit_body_p_tag'} ) { |
1248
|
|
|
|
|
|
|
print $indent, |
1249
|
|
|
|
|
|
|
" * Text node under HTML implicates BODY and P.\n" |
1250
|
|
|
|
|
|
|
if DEBUG > 1; |
1251
|
|
|
|
|
|
|
$pos = $self->{'_body'} |
1252
|
1
|
50
|
|
|
|
4
|
? ( $self->{'_pos'} |
1253
|
|
|
|
|
|
|
= $self->{'_body'} ) # expected case |
1254
|
1
|
|
|
|
|
3
|
: $self->insert_element( 'body', 1 ); |
1255
|
|
|
|
|
|
|
$pos = $self->insert_element( 'p', 1 ); |
1256
|
|
|
|
|
|
|
} |
1257
|
21
|
|
|
|
|
29
|
else { |
1258
|
|
|
|
|
|
|
print $indent, |
1259
|
|
|
|
|
|
|
" * Text node under HTML implicates BODY.\n" |
1260
|
|
|
|
|
|
|
if DEBUG > 1; |
1261
|
|
|
|
|
|
|
$pos = $self->{'_body'} |
1262
|
21
|
50
|
|
|
|
47
|
? ( $self->{'_pos'} |
1263
|
|
|
|
|
|
|
= $self->{'_body'} ) # expected case |
1264
|
|
|
|
|
|
|
: $self->insert_element( 'body', 1 ); |
1265
|
|
|
|
|
|
|
|
1266
|
|
|
|
|
|
|
#print "POS is $pos, ", $pos->{'_tag'}, "\n"; |
1267
|
|
|
|
|
|
|
} |
1268
|
|
|
|
|
|
|
} |
1269
|
370
|
50
|
|
|
|
687
|
elsif ( $ptag eq 'body' ) { |
1270
|
0
|
|
|
|
|
0
|
if ( $self->{'_implicit_body_p_tag'} ) { |
1271
|
|
|
|
|
|
|
print $indent, " * Text node under BODY implicates P.\n" |
1272
|
0
|
|
|
|
|
0
|
if DEBUG > 1; |
1273
|
|
|
|
|
|
|
$pos = $self->insert_element( 'p', 1 ); |
1274
|
|
|
|
|
|
|
} |
1275
|
|
|
|
|
|
|
} |
1276
|
8
|
|
|
|
|
12
|
elsif ( $ptag eq 'table' ) { |
1277
|
|
|
|
|
|
|
print $indent, |
1278
|
|
|
|
|
|
|
" * Text node under TABLE implicates TR and TD.\n" |
1279
|
8
|
|
|
|
|
20
|
if DEBUG > 1; |
1280
|
8
|
|
|
|
|
14
|
$self->insert_element( 'tr', 1 ); |
1281
|
|
|
|
|
|
|
$pos = $self->insert_element( 'td', 1 ); |
1282
|
|
|
|
|
|
|
|
1283
|
|
|
|
|
|
|
# double whammy! |
1284
|
|
|
|
|
|
|
} |
1285
|
3
|
|
|
|
|
4
|
elsif ( $ptag eq 'tr' ) { |
1286
|
|
|
|
|
|
|
print $indent, " * Text node under TR implicates TD.\n" |
1287
|
3
|
|
|
|
|
7
|
if DEBUG > 1; |
1288
|
|
|
|
|
|
|
$pos = $self->insert_element( 'td', 1 ); |
1289
|
|
|
|
|
|
|
} |
1290
|
|
|
|
|
|
|
|
1291
|
|
|
|
|
|
|
# elsif ( |
1292
|
|
|
|
|
|
|
# # $ptag eq 'li' || |
1293
|
|
|
|
|
|
|
# # $ptag eq 'dd' || |
1294
|
|
|
|
|
|
|
# $ptag eq 'form') { |
1295
|
|
|
|
|
|
|
# $pos = $self->insert_element('p', 1); |
1296
|
|
|
|
|
|
|
#} |
1297
|
|
|
|
|
|
|
|
1298
|
|
|
|
|
|
|
# Whatever we've done above should have had the side |
1299
|
|
|
|
|
|
|
# effect of updating $self->{'_pos'} |
1300
|
|
|
|
|
|
|
|
1301
|
|
|
|
|
|
|
#print "POS is now $pos, ", $pos->{'_tag'}, "\n"; |
1302
|
2152
|
50
|
|
|
|
3344
|
|
1303
|
2152
|
100
|
|
|
|
7350
|
return if $ignore_text; |
1304
|
|
|
|
|
|
|
$text =~ s/[\n\r\f\t ]+/ /g # canonical space |
1305
|
|
|
|
|
|
|
unless $no_space_compacting; |
1306
|
|
|
|
|
|
|
|
1307
|
|
|
|
|
|
|
print $indent, " (Attaching text node ($nugget) under ", |
1308
|
|
|
|
|
|
|
|
1309
|
2152
|
|
|
|
|
2684
|
# was: $self->{'_pos'} ? $self->{'_pos'}{'_tag'} : $self->{'_tag'}, |
1310
|
|
|
|
|
|
|
$pos->{'_tag'}, ").\n" |
1311
|
|
|
|
|
|
|
if DEBUG > 1; |
1312
|
2152
|
|
|
|
|
4067
|
|
1313
|
|
|
|
|
|
|
$pos->push_content($text); |
1314
|
|
|
|
|
|
|
} |
1315
|
2152
|
50
|
|
|
|
11454
|
|
1316
|
2152
|
|
|
|
|
3781
|
&{ $self->{'_tweak_~text'} || return }( $text, $pos, |
1317
|
|
|
|
|
|
|
$pos->{'_tag'} . '' ); |
1318
|
|
|
|
|
|
|
|
1319
|
|
|
|
|
|
|
# Note that this is very exceptional -- it doesn't fall back to |
1320
|
0
|
|
|
|
|
0
|
# _tweak_*, and it gives its tweak different arguments. |
1321
|
|
|
|
|
|
|
return; |
1322
|
|
|
|
|
|
|
} |
1323
|
|
|
|
|
|
|
} |
1324
|
|
|
|
|
|
|
|
1325
|
|
|
|
|
|
|
#========================================================================== |
1326
|
|
|
|
|
|
|
|
1327
|
|
|
|
|
|
|
# TODO: test whether comment(), declaration(), and process(), do the right |
1328
|
|
|
|
|
|
|
# thing as far as tightening and whatnot. |
1329
|
|
|
|
|
|
|
# Also, currently, doctypes and comments that appear before head or body |
1330
|
|
|
|
|
|
|
# show up in the tree in the wrong place. Something should be done about |
1331
|
|
|
|
|
|
|
# this. Tricky. Maybe this whole business of pre-making the body and |
1332
|
|
|
|
|
|
|
# whatnot is wrong. |
1333
|
|
|
|
|
|
|
|
1334
|
188
|
50
|
|
188
|
1
|
1468
|
sub comment { |
1335
|
|
|
|
|
|
|
return if $_[0]{'_stunted'}; |
1336
|
|
|
|
|
|
|
|
1337
|
|
|
|
|
|
|
# Accept a "here's a comment" signal from HTML::Parser. |
1338
|
188
|
|
|
|
|
282
|
|
1339
|
188
|
|
66
|
|
|
349
|
my ( $self, $text ) = @_; |
1340
|
|
|
|
|
|
|
my $pos = $self->{'_pos'} || $self; |
1341
|
|
|
|
|
|
|
return |
1342
|
188
|
100
|
66
|
|
|
715
|
unless $self->{'_store_comments'} |
1343
|
|
|
|
|
|
|
|| $HTML::Tagset::isCDATA_Parent{ $pos->{'_tag'} }; |
1344
|
1
|
|
|
|
|
1
|
|
1345
|
|
|
|
|
|
|
if (DEBUG) { |
1346
|
|
|
|
|
|
|
my @lineage_tags = $pos->lineage_tag_names; |
1347
|
|
|
|
|
|
|
my $indent = ' ' x ( 1 + @lineage_tags ); |
1348
|
|
|
|
|
|
|
|
1349
|
|
|
|
|
|
|
my $nugget |
1350
|
|
|
|
|
|
|
= ( length($text) <= 25 ) |
1351
|
|
|
|
|
|
|
? $text |
1352
|
|
|
|
|
|
|
: ( substr( $text, 0, 25 ) . '...' ); |
1353
|
|
|
|
|
|
|
$nugget =~ s<([\x00-\x1F])> |
1354
|
|
|
|
|
|
|
<'\\x'.(unpack("H2",$1))>eg; |
1355
|
|
|
|
|
|
|
print $indent, "Proposing a Comment ($nugget) under ", |
1356
|
|
|
|
|
|
|
join( '/', reverse( $pos->{'_tag'}, @lineage_tags ) ) || 'Root', |
1357
|
|
|
|
|
|
|
".\n"; |
1358
|
1
|
|
|
|
|
2
|
} |
1359
|
1
|
|
|
|
|
3
|
|
1360
|
1
|
|
|
|
|
2
|
( my $e = $self->element_class->new('~comment') )->{'text'} = $text; |
1361
|
|
|
|
|
|
|
$pos->push_content($e); |
1362
|
1
|
|
|
|
|
4
|
++( $self->{'_element_count'} ); |
1363
|
1
|
50
|
33
|
|
|
8
|
|
1364
|
|
|
|
|
|
|
&{ $self->{'_tweak_~comment'} |
1365
|
|
|
|
|
|
|
|| $self->{'_tweak_*'} |
1366
|
0
|
|
|
|
|
0
|
|| return $e }( map $_, $e, '~comment', $self ); |
1367
|
|
|
|
|
|
|
|
1368
|
|
|
|
|
|
|
return $e; |
1369
|
|
|
|
|
|
|
} |
1370
|
2
|
50
|
|
2
|
1
|
532
|
|
1371
|
|
|
|
|
|
|
sub declaration { |
1372
|
|
|
|
|
|
|
return if $_[0]{'_stunted'}; |
1373
|
|
|
|
|
|
|
|
1374
|
2
|
|
|
|
|
11
|
# Accept a "here's a markup declaration" signal from HTML::Parser. |
1375
|
2
|
|
33
|
|
|
11
|
|
1376
|
|
|
|
|
|
|
my ( $self, $text ) = @_; |
1377
|
2
|
|
|
|
|
5
|
my $pos = $self->{'_pos'} || $self; |
1378
|
|
|
|
|
|
|
|
1379
|
|
|
|
|
|
|
if (DEBUG) { |
1380
|
|
|
|
|
|
|
my @lineage_tags = $pos->lineage_tag_names; |
1381
|
|
|
|
|
|
|
my $indent = ' ' x ( 1 + @lineage_tags ); |
1382
|
|
|
|
|
|
|
|
1383
|
|
|
|
|
|
|
my $nugget |
1384
|
|
|
|
|
|
|
= ( length($text) <= 25 ) |
1385
|
|
|
|
|
|
|
? $text |
1386
|
|
|
|
|
|
|
: ( substr( $text, 0, 25 ) . '...' ); |
1387
|
|
|
|
|
|
|
$nugget =~ s<([\x00-\x1F])> |
1388
|
|
|
|
|
|
|
<'\\x'.(unpack("H2",$1))>eg; |
1389
|
|
|
|
|
|
|
print $indent, "Proposing a Declaration ($nugget) under ", |
1390
|
2
|
|
|
|
|
7
|
join( '/', reverse( $pos->{'_tag'}, @lineage_tags ) ) || 'Root', |
1391
|
|
|
|
|
|
|
".\n"; |
1392
|
2
|
|
|
|
|
6
|
} |
1393
|
2
|
|
|
|
|
19
|
( my $e = $self->element_class->new('~declaration') )->{'text'} = $text; |
1394
|
|
|
|
|
|
|
|
1395
|
|
|
|
|
|
|
$self->{_decl} = $e; |
1396
|
|
|
|
|
|
|
return $e; |
1397
|
|
|
|
|
|
|
} |
1398
|
|
|
|
|
|
|
|
1399
|
0
|
0
|
|
0
|
1
|
0
|
#========================================================================== |
1400
|
|
|
|
|
|
|
|
1401
|
|
|
|
|
|
|
sub process { |
1402
|
|
|
|
|
|
|
return if $_[0]{'_stunted'}; |
1403
|
0
|
0
|
|
|
|
0
|
|
1404
|
0
|
|
|
|
|
0
|
# Accept a "here's a PI" signal from HTML::Parser. |
1405
|
0
|
|
0
|
|
|
0
|
|
1406
|
|
|
|
|
|
|
return unless $_[0]->{'_store_pis'}; |
1407
|
0
|
|
|
|
|
0
|
my ( $self, $text ) = @_; |
1408
|
|
|
|
|
|
|
my $pos = $self->{'_pos'} || $self; |
1409
|
|
|
|
|
|
|
|
1410
|
|
|
|
|
|
|
if (DEBUG) { |
1411
|
|
|
|
|
|
|
my @lineage_tags = $pos->lineage_tag_names; |
1412
|
|
|
|
|
|
|
my $indent = ' ' x ( 1 + @lineage_tags ); |
1413
|
|
|
|
|
|
|
|
1414
|
|
|
|
|
|
|
my $nugget |
1415
|
|
|
|
|
|
|
= ( length($text) <= 25 ) |
1416
|
|
|
|
|
|
|
? $text |
1417
|
|
|
|
|
|
|
: ( substr( $text, 0, 25 ) . '...' ); |
1418
|
|
|
|
|
|
|
$nugget =~ s<([\x00-\x1F])> |
1419
|
|
|
|
|
|
|
<'\\x'.(unpack("H2",$1))>eg; |
1420
|
0
|
|
|
|
|
0
|
print $indent, "Proposing a PI ($nugget) under ", |
1421
|
0
|
|
|
|
|
0
|
join( '/', reverse( $pos->{'_tag'}, @lineage_tags ) ) || 'Root', |
1422
|
0
|
|
|
|
|
0
|
".\n"; |
1423
|
|
|
|
|
|
|
} |
1424
|
0
|
0
|
0
|
|
|
0
|
( my $e = $self->element_class->new('~pi') )->{'text'} = $text; |
|
0
|
|
|
|
|
0
|
|
1425
|
|
|
|
|
|
|
$pos->push_content($e); |
1426
|
|
|
|
|
|
|
++( $self->{'_element_count'} ); |
1427
|
0
|
|
|
|
|
0
|
|
1428
|
|
|
|
|
|
|
&{ $self->{'_tweak_~pi'} || $self->{'_tweak_*'} || return $e }( map $_, |
1429
|
|
|
|
|
|
|
$e, '~pi', $self ); |
1430
|
|
|
|
|
|
|
|
1431
|
|
|
|
|
|
|
return $e; |
1432
|
|
|
|
|
|
|
} |
1433
|
|
|
|
|
|
|
|
1434
|
|
|
|
|
|
|
#========================================================================== |
1435
|
|
|
|
|
|
|
|
1436
|
|
|
|
|
|
|
#When you call $tree->parse_file($filename), and the |
1437
|
|
|
|
|
|
|
#tree's ignore_ignorable_whitespace attribute is on (as it is |
1438
|
|
|
|
|
|
|
#by default), HTML::TreeBuilder's logic will manage to avoid |
1439
|
|
|
|
|
|
|
#creating some, but not all, nodes that represent ignorable |
1440
|
|
|
|
|
|
|
#whitespace. However, at the end of its parse, it traverses the |
1441
|
|
|
|
|
|
|
#tree and deletes any that it missed. (It does this with an |
1442
|
|
|
|
|
|
|
#around-method around HTML::Parser's eof method.) |
1443
|
|
|
|
|
|
|
# |
1444
|
|
|
|
|
|
|
#However, with $tree->parse($content), the cleanup-traversal step |
1445
|
|
|
|
|
|
|
#doesn't happen automatically -- so when you're done parsing all |
1446
|
|
|
|
|
|
|
#content for a document (regardless of whether $content is the only |
1447
|
|
|
|
|
|
|
#bit, or whether it's just another chunk of content you're parsing into |
1448
|
|
|
|
|
|
|
#the tree), call $tree->eof() to signal that you're at the end of the |
1449
|
|
|
|
|
|
|
#text you're inputting to the tree. Besides properly cleaning any bits |
1450
|
|
|
|
|
|
|
#of ignorable whitespace from the tree, this will also ensure that |
1451
|
|
|
|
|
|
|
#HTML::Parser's internal buffer is flushed. |
1452
|
|
|
|
|
|
|
|
1453
|
283
|
50
|
|
283
|
1
|
1166
|
sub eof { |
1454
|
|
|
|
|
|
|
|
1455
|
283
|
50
|
|
|
|
491
|
# Accept an "end-of-file" signal from HTML::Parser, or thrown by the user. |
1456
|
|
|
|
|
|
|
|
1457
|
283
|
|
|
|
|
341
|
return if $_[0]->{'_done'}; # we've already been here |
1458
|
283
|
|
|
|
|
296
|
|
1459
|
283
|
|
|
|
|
308
|
return $_[0]->SUPER::eof() if $_[0]->{'_stunted'}; |
1460
|
283
|
50
|
|
|
|
468
|
|
1461
|
|
|
|
|
|
|
my $x = $_[0]; |
1462
|
|
|
|
|
|
|
print "EOF received.\n" if DEBUG; |
1463
|
|
|
|
|
|
|
my (@rv); |
1464
|
0
|
|
|
|
|
0
|
if (wantarray) { |
1465
|
|
|
|
|
|
|
|
1466
|
|
|
|
|
|
|
# I don't think this makes any difference for this particular |
1467
|
283
|
|
|
|
|
1079
|
# method, but let's be scrupulous, for once. |
1468
|
|
|
|
|
|
|
@rv = $x->SUPER::eof(); |
1469
|
|
|
|
|
|
|
} |
1470
|
283
|
100
|
66
|
|
|
1152
|
else { |
1471
|
|
|
|
|
|
|
$rv[0] = $x->SUPER::eof(); |
1472
|
|
|
|
|
|
|
} |
1473
|
|
|
|
|
|
|
|
1474
|
|
|
|
|
|
|
$x->end('html') unless $x eq ( $x->{'_pos'} || $x ); |
1475
|
|
|
|
|
|
|
|
1476
|
283
|
100
|
|
|
|
480
|
# That SHOULD close everything, and will run the appropriate tweaks. |
1477
|
|
|
|
|
|
|
# We /could/ be running under some insane mode such that there's more |
1478
|
|
|
|
|
|
|
# than one HTML element, but really, that's just insane to do anyhow. |
1479
|
|
|
|
|
|
|
|
1480
|
5
|
|
|
|
|
9
|
unless ( $x->{'_implicit_tags'} ) { |
1481
|
|
|
|
|
|
|
|
1482
|
|
|
|
|
|
|
# delete those silly implicit head and body in case we put |
1483
|
|
|
|
|
|
|
# them there in implicit tags mode |
1484
|
|
|
|
|
|
|
foreach my $node ( $x->{'_head'}, $x->{'_body'} ) { |
1485
|
10
|
50
|
33
|
|
|
65
|
$node->replace_with_content |
|
|
|
33
|
|
|
|
|
|
|
|
33
|
|
|
|
|
1486
|
|
|
|
|
|
|
if defined $node |
1487
|
|
|
|
|
|
|
and ref $node |
1488
|
|
|
|
|
|
|
and $node->{'_implicit'} |
1489
|
|
|
|
|
|
|
and $node->{'_parent'}; |
1490
|
|
|
|
|
|
|
|
1491
|
|
|
|
|
|
|
# I think they should be empty anyhow, since the only |
1492
|
|
|
|
|
|
|
# logic that'd insert under them can apply only, I think, |
1493
|
|
|
|
|
|
|
# in the case where _implicit_tags is on |
1494
|
|
|
|
|
|
|
} |
1495
|
|
|
|
|
|
|
|
1496
|
|
|
|
|
|
|
# this may still leave an implicit 'html' at the top, but there's |
1497
|
|
|
|
|
|
|
# nothing we can do about that, is there? |
1498
|
|
|
|
|
|
|
} |
1499
|
283
|
100
|
66
|
|
|
1327
|
|
1500
|
283
|
|
|
|
|
447
|
$x->delete_ignorable_whitespace() |
1501
|
|
|
|
|
|
|
|
1502
|
283
|
50
|
|
|
|
459
|
# this's why we trap this -- an after-method |
1503
|
283
|
|
|
|
|
620
|
if $x->{'_tighten'} and !$x->{'_ignore_text'}; |
1504
|
|
|
|
|
|
|
$x->{'_done'} = 1; |
1505
|
|
|
|
|
|
|
|
1506
|
|
|
|
|
|
|
return @rv if wantarray; |
1507
|
|
|
|
|
|
|
return $rv[0]; |
1508
|
|
|
|
|
|
|
} |
1509
|
|
|
|
|
|
|
|
1510
|
|
|
|
|
|
|
#========================================================================== |
1511
|
0
|
|
|
0
|
1
|
0
|
|
1512
|
0
|
|
|
|
|
0
|
# TODO: document |
1513
|
0
|
|
|
|
|
0
|
|
1514
|
|
|
|
|
|
|
sub stunt { |
1515
|
0
|
0
|
|
|
|
0
|
my $self = $_[0]; |
1516
|
|
|
|
|
|
|
print "Stunting the tree.\n" if DEBUG; |
1517
|
|
|
|
|
|
|
$self->{'_done'} = 1; |
1518
|
0
|
|
|
|
|
0
|
|
1519
|
0
|
|
|
|
|
0
|
if ( $HTML::Parser::VERSION < 3 ) { |
1520
|
|
|
|
|
|
|
|
1521
|
|
|
|
|
|
|
#This is a MEAN MEAN HACK. And it works most of the time! |
1522
|
0
|
0
|
|
|
|
0
|
$self->{'_buf'} = ''; |
1523
|
0
|
|
|
|
|
0
|
my $fh = *HTML::Parser::F{IO}; |
1524
|
0
|
|
|
|
|
0
|
|
1525
|
|
|
|
|
|
|
# the local'd FH used by parse_file loop |
1526
|
|
|
|
|
|
|
if ( defined $fh ) { |
1527
|
|
|
|
|
|
|
print "Closing Parser's filehandle $fh\n" if DEBUG; |
1528
|
|
|
|
|
|
|
close($fh); |
1529
|
|
|
|
|
|
|
} |
1530
|
|
|
|
|
|
|
|
1531
|
|
|
|
|
|
|
# But if they called $tree->parse_file($filehandle) |
1532
|
|
|
|
|
|
|
# or $tree->parse_file(*IO), then there will be no *HTML::Parser::F{IO} |
1533
|
0
|
|
|
|
|
0
|
# to close. Ahwell. Not a problem for most users these days. |
1534
|
|
|
|
|
|
|
|
1535
|
|
|
|
|
|
|
} |
1536
|
|
|
|
|
|
|
else { |
1537
|
|
|
|
|
|
|
$self->SUPER::eof(); |
1538
|
|
|
|
|
|
|
|
1539
|
|
|
|
|
|
|
# Under 3+ versions, calling eof from inside a parse will abort the |
1540
|
|
|
|
|
|
|
# parse / parse_file |
1541
|
0
|
|
|
|
|
0
|
} |
1542
|
0
|
|
|
|
|
0
|
|
1543
|
|
|
|
|
|
|
# In the off chance that the above didn't work, we'll throw |
1544
|
|
|
|
|
|
|
# this flag to make any future events be no-ops. |
1545
|
|
|
|
|
|
|
$self->stunted(1); |
1546
|
0
|
|
|
0
|
1
|
0
|
return; |
1547
|
0
|
|
|
0
|
1
|
0
|
} |
1548
|
|
|
|
|
|
|
|
1549
|
|
|
|
|
|
|
# TODO: document |
1550
|
|
|
|
|
|
|
sub stunted { shift->_elem( '_stunted', @_ ); } |
1551
|
|
|
|
|
|
|
sub done { shift->_elem( '_done', @_ ); } |
1552
|
|
|
|
|
|
|
|
1553
|
|
|
|
|
|
|
#========================================================================== |
1554
|
|
|
|
|
|
|
|
1555
|
|
|
|
|
|
|
sub delete { |
1556
|
|
|
|
|
|
|
|
1557
|
|
|
|
|
|
|
# Override Element's delete method. |
1558
|
250
|
|
|
250
|
1
|
70008
|
# This does most, if not all, of what Element's delete does anyway. |
1559
|
|
|
|
|
|
|
# Deletes content, including content in some special attributes. |
1560
|
250
|
|
|
|
|
342
|
# But doesn't empty out the hash. |
|
250
|
|
|
|
|
641
|
|
1561
|
250
|
|
|
|
|
333
|
|
1562
|
250
|
50
|
|
|
|
668
|
$_[0]->{'_element_count'} = 1; # never hurts to be scrupulously correct |
1563
|
|
|
|
|
|
|
|
1564
|
|
|
|
|
|
|
delete @{ $_[0] }{ '_body', '_head', '_pos' }; |
1565
|
|
|
|
|
|
|
for ( |
1566
|
|
|
|
|
|
|
@{ delete( $_[0]->{'_content'} ) || [] }, # all/any content |
1567
|
|
|
|
|
|
|
|
1568
|
|
|
|
|
|
|
# delete @{$_[0]}{'_body', '_head', '_pos'} |
1569
|
|
|
|
|
|
|
# ...and these, in case these elements don't appear in the |
1570
|
|
|
|
|
|
|
# content, which is possible. If they did appear (as they |
1571
|
|
|
|
|
|
|
# usually do), then calling $_->delete on them again is harmless. |
1572
|
|
|
|
|
|
|
# I don't think that's such a hot idea now. Thru creative reattachment, |
1573
|
|
|
|
|
|
|
# those could actually now point to elements in OTHER trees (which we do |
1574
|
|
|
|
|
|
|
# NOT want to delete!). |
1575
|
|
|
|
|
|
|
## Reasoned out: |
1576
|
|
|
|
|
|
|
# If these point to elements not in the content list of any element in this |
1577
|
|
|
|
|
|
|
# tree, but not in the content list of any element in any OTHER tree, then |
1578
|
|
|
|
|
|
|
# just deleting these will make their refcounts hit zero. |
1579
|
|
|
|
|
|
|
# If these point to elements in the content lists of elements in THIS tree, |
1580
|
|
|
|
|
|
|
# then we'll get to deleting them when we delete from the top. |
1581
|
500
|
50
|
33
|
|
|
2506
|
# If these point to elements in the content lists of elements in SOME OTHER |
|
|
|
33
|
|
|
|
|
1582
|
|
|
|
|
|
|
# tree, then they're not to be deleted. |
1583
|
|
|
|
|
|
|
) |
1584
|
|
|
|
|
|
|
{ |
1585
|
|
|
|
|
|
|
$_->delete |
1586
|
250
|
0
|
33
|
|
|
700
|
if defined $_ and ref $_ # Make sure it's an object. |
1587
|
|
|
|
|
|
|
and $_ ne $_[0]; # And avoid hitting myself, just in case! |
1588
|
|
|
|
|
|
|
} |
1589
|
|
|
|
|
|
|
|
1590
|
250
|
|
|
|
|
1606
|
$_[0]->detach if $_[0]->{'_parent'} and $_[0]->{'_parent'}{'_content'}; |
1591
|
|
|
|
|
|
|
|
1592
|
|
|
|
|
|
|
# An 'html' element having a parent is quite unlikely. |
1593
|
|
|
|
|
|
|
|
1594
|
0
|
|
|
0
|
1
|
0
|
return; |
1595
|
|
|
|
|
|
|
} |
1596
|
|
|
|
|
|
|
|
1597
|
|
|
|
|
|
|
sub tighten_up { # legacy |
1598
|
|
|
|
|
|
|
shift->delete_ignorable_whitespace(@_); |
1599
|
|
|
|
|
|
|
} |
1600
|
3
|
|
|
3
|
1
|
4
|
|
1601
|
3
|
|
|
|
|
6
|
sub elementify { |
1602
|
3
|
|
|
|
|
27
|
|
1603
|
|
|
|
|
|
|
# Rebless this object down into the normal element class. |
1604
|
3
|
|
|
|
|
14
|
my $self = $_[0]; |
1605
|
63
|
100
|
33
|
|
|
502
|
my $to_class = $self->element_class; |
|
|
|
66
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
1606
|
|
|
|
|
|
|
delete @{$self}{ |
1607
|
|
|
|
|
|
|
grep { |
1608
|
|
|
|
|
|
|
; |
1609
|
|
|
|
|
|
|
length $_ and substr( $_, 0, 1 ) eq '_' |
1610
|
|
|
|
|
|
|
|
1611
|
|
|
|
|
|
|
# The private attributes that we'll retain: |
1612
|
|
|
|
|
|
|
and $_ ne '_tag' |
1613
|
|
|
|
|
|
|
and $_ ne '_parent' |
1614
|
|
|
|
|
|
|
and $_ ne '_content' |
1615
|
|
|
|
|
|
|
and $_ ne '_implicit' |
1616
|
3
|
|
|
|
|
12
|
and $_ ne '_pos' |
1617
|
|
|
|
|
|
|
and $_ ne '_element_class' |
1618
|
|
|
|
|
|
|
} keys %$self |
1619
|
|
|
|
|
|
|
}; |
1620
|
4071
|
100
|
|
4071
|
1
|
7617
|
bless $self, $to_class; # Returns the same object we were fed |
1621
|
3780
|
|
50
|
|
|
14145
|
} |
1622
|
|
|
|
|
|
|
|
1623
|
|
|
|
|
|
|
sub element_class { |
1624
|
|
|
|
|
|
|
return 'HTML::Element' if not ref $_[0]; |
1625
|
|
|
|
|
|
|
return $_[0]->{_element_class} || 'HTML::Element'; |
1626
|
|
|
|
|
|
|
} |
1627
|
0
|
|
|
0
|
1
|
|
|
1628
|
0
|
|
|
|
|
|
#-------------------------------------------------------------------------- |
1629
|
0
|
|
|
|
|
|
|
1630
|
0
|
|
|
|
|
|
sub guts { |
1631
|
0
|
|
|
|
|
|
my @out; |
1632
|
0
|
|
|
|
|
|
my @stack = ( $_[0] ); |
1633
|
0
|
0
|
|
|
|
|
my $destructive = $_[1]; |
|
|
0
|
|
|
|
|
|
1634
|
0
|
|
|
|
|
|
my $this; |
1635
|
|
|
|
|
|
|
while (@stack) { |
1636
|
|
|
|
|
|
|
$this = shift @stack; |
1637
|
0
|
|
|
|
|
|
if ( !ref $this ) { |
1638
|
0
|
0
|
|
|
|
|
push @out, $this; # yes, it can include text nodes |
1639
|
|
|
|
|
|
|
} |
1640
|
|
|
|
|
|
|
elsif ( !$this->{'_implicit'} ) { |
1641
|
|
|
|
|
|
|
push @out, $this; |
1642
|
|
|
|
|
|
|
delete $this->{'_parent'} if $destructive; |
1643
|
0
|
0
|
|
|
|
|
} |
1644
|
|
|
|
|
|
|
else { |
1645
|
|
|
|
|
|
|
|
1646
|
0
|
|
|
|
|
|
# it's an implicit node. Delete it and recurse |
1647
|
|
|
|
|
|
|
delete $this->{'_parent'} if $destructive; |
1648
|
0
|
0
|
|
|
|
|
unshift @stack, |
|
|
0
|
|
|
|
|
|
1649
|
|
|
|
|
|
|
@{ |
1650
|
|
|
|
|
|
|
( $destructive |
1651
|
|
|
|
|
|
|
? delete( $this->{'_content'} ) |
1652
|
|
|
|
|
|
|
: $this->{'_content'} |
1653
|
|
|
|
|
|
|
) |
1654
|
|
|
|
|
|
|
|| [] |
1655
|
|
|
|
|
|
|
}; |
1656
|
|
|
|
|
|
|
} |
1657
|
|
|
|
|
|
|
} |
1658
|
0
|
0
|
|
|
|
|
|
1659
|
0
|
0
|
|
|
|
|
# Doesn't call a real $root->delete on the (when implicit) root, |
1660
|
0
|
0
|
0
|
|
|
|
# but I don't think it needs to. |
1661
|
0
|
|
|
|
|
|
|
1662
|
0
|
|
|
|
|
|
return @out if wantarray; # one simple normal case. |
1663
|
0
|
|
|
|
|
|
return unless @out; |
1664
|
|
|
|
|
|
|
return $out[0] if @out == 1 and ref( $out[0] ); |
1665
|
|
|
|
|
|
|
my $x = HTML::Element->new( 'div', '_implicit' => 1 ); |
1666
|
0
|
|
|
0
|
1
|
|
$x->push_content(@out); |
1667
|
|
|
|
|
|
|
return $x; |
1668
|
|
|
|
|
|
|
} |
1669
|
|
|
|
|
|
|
|
1670
|
|
|
|
|
|
|
sub disembowel { $_[0]->guts(1) } |
1671
|
|
|
|
|
|
|
|
1672
|
|
|
|
|
|
|
#-------------------------------------------------------------------------- |
1673
|
|
|
|
|
|
|
1; |
1674
|
|
|
|
|
|
|
|
1675
|
|
|
|
|
|
|
__END__ |