line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# This is a fork of HTML::Element. Eventually the code may be merged. |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
package HTML::DOM::_TreeBuilder; |
4
|
|
|
|
|
|
|
|
5
|
24
|
|
|
24
|
|
129
|
use warnings; |
|
24
|
|
|
|
|
37
|
|
|
24
|
|
|
|
|
640
|
|
6
|
24
|
|
|
24
|
|
118
|
use strict; |
|
24
|
|
|
|
|
37
|
|
|
24
|
|
|
|
|
400
|
|
7
|
24
|
|
|
24
|
|
93
|
use integer; # vroom vroom! |
|
24
|
|
|
|
|
41
|
|
|
24
|
|
|
|
|
143
|
|
8
|
24
|
|
|
24
|
|
422
|
use Carp (); |
|
24
|
|
|
|
|
45
|
|
|
24
|
|
|
|
|
452
|
|
9
|
24
|
|
|
24
|
|
127
|
use vars qw(@ISA $VERSION $DEBUG); |
|
24
|
|
|
|
|
29
|
|
|
24
|
|
|
|
|
3578
|
|
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
#--------------------------------------------------------------------------- |
12
|
|
|
|
|
|
|
# Make a 'DEBUG' constant... |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
BEGIN { |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
# We used to have things like |
17
|
|
|
|
|
|
|
# print $indent, "lalala" if $Debug; |
18
|
|
|
|
|
|
|
# But there were an awful lot of having to evaluate $Debug's value. |
19
|
|
|
|
|
|
|
# If we make that depend on a constant, like so: |
20
|
|
|
|
|
|
|
# sub DEBUG () { 1 } # or whatever value. |
21
|
|
|
|
|
|
|
# ... |
22
|
|
|
|
|
|
|
# print $indent, "lalala" if DEBUG; |
23
|
|
|
|
|
|
|
# Which at compile-time (thru the miracle of constant folding) turns into: |
24
|
|
|
|
|
|
|
# print $indent, "lalala"; |
25
|
|
|
|
|
|
|
# or, if DEBUG is a constant with a true value, then that print statement |
26
|
|
|
|
|
|
|
# is simply optimized away, and doesn't appear in the target code at all. |
27
|
|
|
|
|
|
|
# If you don't believe me, run: |
28
|
|
|
|
|
|
|
# perl -MO=Deparse,-uHTML::DOM::_TreeBuilder -e 'BEGIN { \ |
29
|
|
|
|
|
|
|
# $HTML::DOM::_TreeBuilder::DEBUG = 4} use HTML::DOM::_TreeBuilder' |
30
|
|
|
|
|
|
|
# and see for yourself (substituting whatever value you want for $DEBUG |
31
|
|
|
|
|
|
|
# there). |
32
|
|
|
|
|
|
|
## no critic |
33
|
24
|
50
|
|
24
|
|
246
|
if ( defined &DEBUG ) { |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
# Already been defined! Do nothing. |
36
|
|
|
|
|
|
|
} |
37
|
|
|
|
|
|
|
elsif ( $] < 5.00404 ) { |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
# Grudgingly accomodate ancient (pre-constant) versions. |
40
|
0
|
|
|
|
|
0
|
eval 'sub DEBUG { $Debug } '; |
41
|
|
|
|
|
|
|
} |
42
|
|
|
|
|
|
|
elsif ( !$DEBUG ) { |
43
|
24
|
|
|
|
|
1324
|
eval 'sub DEBUG () {0}'; # Make it a constant. |
44
|
|
|
|
|
|
|
} |
45
|
|
|
|
|
|
|
elsif ( $DEBUG =~ m<^\d+$>s ) { |
46
|
0
|
|
|
|
|
0
|
eval 'sub DEBUG () { ' . $DEBUG . ' }'; # Make THAT a constant. |
47
|
|
|
|
|
|
|
} |
48
|
|
|
|
|
|
|
else { # WTF? |
49
|
0
|
|
|
|
|
0
|
warn "Non-numeric value \"$DEBUG\" in \$HTML::DOM::_Element::DEBUG"; |
50
|
0
|
|
|
|
|
0
|
eval 'sub DEBUG () { $DEBUG }'; # I guess. |
51
|
|
|
|
|
|
|
} |
52
|
|
|
|
|
|
|
## use critic |
53
|
|
|
|
|
|
|
} |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
#--------------------------------------------------------------------------- |
56
|
|
|
|
|
|
|
|
57
|
24
|
|
|
24
|
|
133
|
use HTML::Entities (); |
|
24
|
|
|
|
|
43
|
|
|
24
|
|
|
|
|
387
|
|
58
|
24
|
|
|
24
|
|
99
|
use HTML::Tagset 3.02 (); |
|
24
|
|
|
|
|
510
|
|
|
24
|
|
|
|
|
368
|
|
59
|
|
|
|
|
|
|
|
60
|
24
|
|
|
24
|
|
96
|
use HTML::DOM::_Element (); |
|
24
|
|
|
|
|
41
|
|
|
24
|
|
|
|
|
308
|
|
61
|
24
|
|
|
24
|
|
110
|
use HTML::Parser (); |
|
24
|
|
|
|
|
46
|
|
|
24
|
|
|
|
|
128015
|
|
62
|
|
|
|
|
|
|
@ISA = qw(HTML::DOM::_Element HTML::Parser); |
63
|
|
|
|
|
|
|
$VERSION = 4.2001; |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
# This looks schizoid, I know. |
66
|
|
|
|
|
|
|
# It's not that we ARE an element AND a parser. |
67
|
|
|
|
|
|
|
# We ARE an element, but one that knows how to handle signals |
68
|
|
|
|
|
|
|
# (method calls) from Parser in order to elaborate its subtree. |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
# Legacy aliases: |
71
|
|
|
|
|
|
|
*HTML::DOM::_TreeBuilder::isKnown = \%HTML::Tagset::isKnown; |
72
|
|
|
|
|
|
|
*HTML::DOM::_TreeBuilder::canTighten = \%HTML::Tagset::canTighten; |
73
|
|
|
|
|
|
|
*HTML::DOM::_TreeBuilder::isHeadElement = \%HTML::Tagset::isHeadElement; |
74
|
|
|
|
|
|
|
*HTML::DOM::_TreeBuilder::isBodyElement = \%HTML::Tagset::isBodyElement; |
75
|
|
|
|
|
|
|
*HTML::DOM::_TreeBuilder::isPhraseMarkup = \%HTML::Tagset::isPhraseMarkup; |
76
|
|
|
|
|
|
|
*HTML::DOM::_TreeBuilder::isHeadOrBodyElement = \%HTML::Tagset::isHeadOrBodyElement; |
77
|
|
|
|
|
|
|
*HTML::DOM::_TreeBuilder::isList = \%HTML::Tagset::isList; |
78
|
|
|
|
|
|
|
*HTML::DOM::_TreeBuilder::isTableElement = \%HTML::Tagset::isTableElement; |
79
|
|
|
|
|
|
|
*HTML::DOM::_TreeBuilder::isFormElement = \%HTML::Tagset::isFormElement; |
80
|
|
|
|
|
|
|
*HTML::DOM::_TreeBuilder::p_closure_barriers = \@HTML::Tagset::p_closure_barriers; |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
#========================================================================== |
83
|
|
|
|
|
|
|
# Two little shortcut constructors: |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
sub new_from_file { # or from a FH |
86
|
0
|
|
|
0
|
0
|
0
|
my $class = shift; |
87
|
0
|
0
|
|
|
|
0
|
Carp::croak("new_from_file takes only one argument") |
88
|
|
|
|
|
|
|
unless @_ == 1; |
89
|
0
|
0
|
|
|
|
0
|
Carp::croak("new_from_file is a class method only") |
90
|
|
|
|
|
|
|
if ref $class; |
91
|
0
|
|
|
|
|
0
|
my $new = $class->new(); |
92
|
0
|
|
|
|
|
0
|
$new->parse_file( $_[0] ); |
93
|
0
|
|
|
|
|
0
|
return $new; |
94
|
|
|
|
|
|
|
} |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
sub new_from_content { # from any number of scalars |
97
|
0
|
|
|
0
|
0
|
0
|
my $class = shift; |
98
|
0
|
0
|
|
|
|
0
|
Carp::croak("new_from_content is a class method only") |
99
|
|
|
|
|
|
|
if ref $class; |
100
|
0
|
|
|
|
|
0
|
my $new = $class->new(); |
101
|
0
|
|
|
|
|
0
|
foreach my $whunk (@_) { |
102
|
0
|
0
|
|
|
|
0
|
if ( ref($whunk) eq 'SCALAR' ) { |
103
|
0
|
|
|
|
|
0
|
$new->parse($$whunk); |
104
|
|
|
|
|
|
|
} |
105
|
|
|
|
|
|
|
else { |
106
|
0
|
|
|
|
|
0
|
$new->parse($whunk); |
107
|
|
|
|
|
|
|
} |
108
|
0
|
0
|
|
|
|
0
|
last if $new->{'_stunted'}; # might as well check that. |
109
|
|
|
|
|
|
|
} |
110
|
0
|
|
|
|
|
0
|
$new->eof(); |
111
|
0
|
|
|
|
|
0
|
return $new; |
112
|
|
|
|
|
|
|
} |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
# TODO: document more fully? |
115
|
|
|
|
|
|
|
sub parse_content { # from any number of scalars |
116
|
0
|
|
|
0
|
0
|
0
|
my $tree = shift; |
117
|
0
|
|
|
|
|
0
|
my $retval; |
118
|
0
|
|
|
|
|
0
|
foreach my $whunk (@_) { |
119
|
0
|
0
|
|
|
|
0
|
if ( ref($whunk) eq 'SCALAR' ) { |
120
|
0
|
|
|
|
|
0
|
$retval = $tree->parse($$whunk); |
121
|
|
|
|
|
|
|
} |
122
|
|
|
|
|
|
|
else { |
123
|
0
|
|
|
|
|
0
|
$retval = $tree->parse($whunk); |
124
|
|
|
|
|
|
|
} |
125
|
0
|
0
|
|
|
|
0
|
last if $tree->{'_stunted'}; # might as well check that. |
126
|
|
|
|
|
|
|
} |
127
|
0
|
|
|
|
|
0
|
$tree->eof(); |
128
|
0
|
|
|
|
|
0
|
return $retval; |
129
|
|
|
|
|
|
|
} |
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
#--------------------------------------------------------------------------- |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
sub new { # constructor! |
134
|
147
|
|
|
147
|
1
|
243
|
my $class = shift; |
135
|
147
|
|
33
|
|
|
516
|
$class = ref($class) || $class; |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
# Initialize HTML::DOM::_Element part |
138
|
147
|
|
|
|
|
387
|
my $self = $class->element_class->new('html'); |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
{ |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
# A hack for certain strange versions of Parser: |
143
|
147
|
|
|
|
|
216
|
my $other_self = HTML::Parser->new(); |
|
147
|
|
|
|
|
540
|
|
144
|
147
|
|
|
|
|
6953
|
%$self = ( %$self, %$other_self ); # copy fields |
145
|
|
|
|
|
|
|
# Yes, multiple inheritance is messy. Kids, don't try this at home. |
146
|
147
|
|
|
|
|
572
|
bless $other_self, "HTML::DOM::_TreeBuilder::_hideyhole"; |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
# whack it out of the HTML::Parser class, to avoid the destructor |
149
|
|
|
|
|
|
|
} |
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
# The root of the tree is special, as it has these funny attributes, |
152
|
|
|
|
|
|
|
# and gets reblessed into this class. |
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
# Initialize parser settings |
155
|
147
|
|
|
|
|
308
|
$self->{'_implicit_tags'} = 1; |
156
|
147
|
|
|
|
|
231
|
$self->{'_implicit_body_p_tag'} = 0; |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
# If true, trying to insert text, or any of %isPhraseMarkup right |
159
|
|
|
|
|
|
|
# under 'body' will implicate a 'p'. If false, will just go there. |
160
|
|
|
|
|
|
|
|
161
|
147
|
|
|
|
|
245
|
$self->{'_tighten'} = 1; |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
# whether ignorable WS in this tree should be deleted |
164
|
|
|
|
|
|
|
|
165
|
147
|
|
|
|
|
234
|
$self->{'_implicit'} = 1; # to delete, once we find a real open-"html" tag |
166
|
|
|
|
|
|
|
|
167
|
147
|
|
|
|
|
320
|
$self->{'_ignore_unknown'} = 1; |
168
|
147
|
|
|
|
|
228
|
$self->{'_ignore_text'} = 0; |
169
|
147
|
|
|
|
|
221
|
$self->{'_warn'} = 0; |
170
|
147
|
|
|
|
|
238
|
$self->{'_no_space_compacting'} = 0; |
171
|
147
|
|
|
|
|
214
|
$self->{'_store_comments'} = 0; |
172
|
147
|
|
|
|
|
237
|
$self->{'_store_declarations'} = 1; |
173
|
147
|
|
|
|
|
241
|
$self->{'_store_pis'} = 0; |
174
|
147
|
|
|
|
|
210
|
$self->{'_p_strict'} = 0; |
175
|
147
|
|
|
|
|
330
|
$self->{'_no_expand_entities'} = 0; |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
# Parse attributes passed in as arguments |
178
|
147
|
50
|
|
|
|
328
|
if (@_) { |
179
|
147
|
|
|
|
|
536
|
my %attr = @_; |
180
|
147
|
|
|
|
|
410
|
for ( keys %attr ) { |
181
|
441
|
|
|
|
|
1023
|
$self->{"_$_"} = $attr{$_}; |
182
|
|
|
|
|
|
|
} |
183
|
|
|
|
|
|
|
} |
184
|
|
|
|
|
|
|
|
185
|
147
|
|
|
|
|
270
|
$HTML::DOM::_Element::encoded_content = $self->{'_no_expand_entities'}; |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
# rebless to our class |
188
|
147
|
|
|
|
|
231
|
bless $self, $class; |
189
|
|
|
|
|
|
|
|
190
|
147
|
|
|
|
|
262
|
$self->{'_element_count'} = 1; |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
# undocumented, informal, and maybe not exactly correct |
193
|
|
|
|
|
|
|
|
194
|
147
|
|
|
|
|
440
|
$self->{'_head'} = $self->insert_element( 'head', 1 ); |
195
|
147
|
|
|
|
|
239
|
$self->{'_pos'} = undef; # pull it back up |
196
|
147
|
|
|
|
|
326
|
$self->{'_body'} = $self->insert_element( 'body', 1 ); |
197
|
147
|
|
|
|
|
229
|
$self->{'_pos'} = undef; # pull it back up again |
198
|
|
|
|
|
|
|
|
199
|
147
|
|
|
|
|
549
|
return $self; |
200
|
|
|
|
|
|
|
} |
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
#========================================================================== |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
sub _elem # universal accessor... |
205
|
|
|
|
|
|
|
{ |
206
|
441
|
|
|
441
|
|
675
|
my ( $self, $elem, $val ) = @_; |
207
|
441
|
|
|
|
|
614
|
my $old = $self->{$elem}; |
208
|
441
|
50
|
|
|
|
782
|
$self->{$elem} = $val if defined $val; |
209
|
441
|
|
|
|
|
806
|
return $old; |
210
|
|
|
|
|
|
|
} |
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
# accessors.... |
213
|
0
|
|
|
0
|
0
|
0
|
sub implicit_tags { shift->_elem( '_implicit_tags', @_ ); } |
214
|
0
|
|
|
0
|
0
|
0
|
sub implicit_body_p_tag { shift->_elem( '_implicit_body_p_tag', @_ ); } |
215
|
0
|
|
|
0
|
0
|
0
|
sub p_strict { shift->_elem( '_p_strict', @_ ); } |
216
|
147
|
|
|
147
|
0
|
332
|
sub no_space_compacting { shift->_elem( '_no_space_compacting', @_ ); } |
217
|
0
|
|
|
0
|
0
|
0
|
sub ignore_unknown { shift->_elem( '_ignore_unknown', @_ ); } |
218
|
0
|
|
|
0
|
0
|
0
|
sub ignore_text { shift->_elem( '_ignore_text', @_ ); } |
219
|
147
|
|
|
147
|
0
|
396
|
sub ignore_ignorable_whitespace { shift->_elem( '_tighten', @_ ); } |
220
|
147
|
|
|
147
|
0
|
270
|
sub store_comments { shift->_elem( '_store_comments', @_ ); } |
221
|
0
|
|
|
0
|
0
|
0
|
sub store_declarations { shift->_elem( '_store_declarations', @_ ); } |
222
|
0
|
|
|
0
|
0
|
0
|
sub store_pis { shift->_elem( '_store_pis', @_ ); } |
223
|
0
|
|
|
0
|
0
|
0
|
sub warn { shift->_elem( '_warn', @_ ); } |
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
sub no_expand_entities { |
226
|
0
|
|
|
0
|
0
|
0
|
shift->_elem( '_no_expand_entities', @_ ); |
227
|
0
|
|
|
|
|
0
|
$HTML::DOM::_Element::encoded_content = @_; |
228
|
|
|
|
|
|
|
} |
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
#========================================================================== |
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
sub warning { |
233
|
2
|
|
|
2
|
0
|
4
|
my $self = shift; |
234
|
2
|
50
|
|
|
|
5
|
CORE::warn("HTML::Parse: $_[0]\n") if $self->{'_warn'}; |
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
# should maybe say HTML::DOM::_TreeBuilder instead |
237
|
|
|
|
|
|
|
} |
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
#========================================================================== |
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
{ |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
# To avoid having to rebuild these lists constantly... |
244
|
|
|
|
|
|
|
my $_Closed_by_structurals = [qw(p h1 h2 h3 h4 h5 h6 pre textarea)]; |
245
|
|
|
|
|
|
|
my $indent; |
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
sub start { |
248
|
595
|
50
|
|
595
|
1
|
1244
|
return if $_[0]{'_stunted'}; |
249
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
# Accept a signal from HTML::Parser for start-tags. |
251
|
595
|
|
|
|
|
1114
|
my ( $self, $tag, $attr ) = @_; |
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
# Parser passes more, actually: |
254
|
|
|
|
|
|
|
# $self->start($tag, $attr, $attrseq, $origtext) |
255
|
|
|
|
|
|
|
# But we can merrily ignore $attrseq and $origtext. |
256
|
|
|
|
|
|
|
|
257
|
595
|
50
|
|
|
|
1087
|
if ( $tag eq 'x-html' ) { |
258
|
0
|
|
|
|
|
0
|
print "Ignoring open-x-html tag.\n" if DEBUG; |
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
# inserted by some lame code-generators. |
261
|
0
|
|
|
|
|
0
|
return; # bypass tweaking. |
262
|
|
|
|
|
|
|
} |
263
|
|
|
|
|
|
|
|
264
|
595
|
|
|
|
|
1028
|
$tag =~ s{/$}{}s; # So turns into . Silently forgive. |
265
|
|
|
|
|
|
|
|
266
|
595
|
50
|
|
|
|
1963
|
unless ( $tag =~ m/^[-_a-zA-Z0-9:%]+$/s ) { |
267
|
0
|
|
|
|
|
0
|
DEBUG and print "Start-tag name $tag is no good. Skipping.\n"; |
268
|
0
|
|
|
|
|
0
|
return; |
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
# This avoids having Element's new() throw an exception. |
271
|
|
|
|
|
|
|
} |
272
|
|
|
|
|
|
|
|
273
|
595
|
|
66
|
|
|
1584
|
my $ptag = ( my $pos = $self->{'_pos'} || $self )->{'_tag'}; |
274
|
595
|
|
|
|
|
737
|
my $already_inserted; |
275
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
#my($indent); |
277
|
595
|
|
|
|
|
628
|
if (DEBUG) { |
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
# optimization -- don't figure out indenting unless we're in debug mode |
280
|
|
|
|
|
|
|
my @lineage = $pos->lineage; |
281
|
|
|
|
|
|
|
$indent = ' ' x ( 1 + @lineage ); |
282
|
|
|
|
|
|
|
print $indent, "Proposing a new \U$tag\E under ", |
283
|
|
|
|
|
|
|
join( '/', map $_->{'_tag'}, reverse( $pos, @lineage ) ) |
284
|
|
|
|
|
|
|
|| 'Root', |
285
|
|
|
|
|
|
|
".\n"; |
286
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
#} else { |
288
|
|
|
|
|
|
|
# $indent = ' '; |
289
|
|
|
|
|
|
|
} |
290
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
#print $indent, "POS: $pos ($ptag)\n" if DEBUG > 2; |
292
|
|
|
|
|
|
|
# $attr = {%$attr}; |
293
|
|
|
|
|
|
|
|
294
|
595
|
|
|
|
|
1454
|
foreach my $k ( keys %$attr ) { |
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
# Make sure some stooge doesn't have "". |
297
|
|
|
|
|
|
|
# That happens every few million Web pages. |
298
|
539
|
50
|
33
|
|
|
1822
|
$attr->{ ' ' . $k } = delete $attr->{$k} |
299
|
|
|
|
|
|
|
if length $k and substr( $k, 0, 1 ) eq '_'; |
300
|
|
|
|
|
|
|
|
301
|
|
|
|
|
|
|
# Looks bad, but is fine for round-tripping. |
302
|
|
|
|
|
|
|
} |
303
|
|
|
|
|
|
|
|
304
|
595
|
|
|
|
|
1371
|
my $e = $self->element_class->new( $tag, %$attr ); |
305
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
# Make a new element object. |
307
|
|
|
|
|
|
|
# (Only rarely do we end up just throwing it away later in this call.) |
308
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
# Some prep -- custom messiness for those damned tables, and strict P's. |
310
|
595
|
50
|
|
|
|
1290
|
if ( $self->{'_implicit_tags'} ) { # wallawallawalla! |
311
|
|
|
|
|
|
|
|
312
|
595
|
100
|
|
|
|
1193
|
unless ( $HTML::DOM::_TreeBuilder::isTableElement{$tag} ) { |
313
|
550
|
50
|
|
|
|
1189
|
if ( $ptag eq 'table' ) { |
|
|
50
|
|
|
|
|
|
314
|
0
|
|
|
|
|
0
|
print $indent, |
315
|
|
|
|
|
|
|
" * Phrasal \U$tag\E right under TABLE makes implicit TR and TD\n" |
316
|
|
|
|
|
|
|
if DEBUG > 1; |
317
|
0
|
|
|
|
|
0
|
$self->insert_element( 'tr', 1 ); |
318
|
0
|
|
|
|
|
0
|
$pos = $self->insert_element( 'td', 1 ) |
319
|
|
|
|
|
|
|
; # yes, needs updating |
320
|
|
|
|
|
|
|
} |
321
|
|
|
|
|
|
|
elsif ( $ptag eq 'tr' ) { |
322
|
0
|
|
|
|
|
0
|
print $indent, |
323
|
|
|
|
|
|
|
" * Phrasal \U$tag\E right under TR makes an implicit TD\n" |
324
|
|
|
|
|
|
|
if DEBUG > 1; |
325
|
0
|
|
|
|
|
0
|
$pos = $self->insert_element( 'td', 1 ) |
326
|
|
|
|
|
|
|
; # yes, needs updating |
327
|
|
|
|
|
|
|
} |
328
|
550
|
|
|
|
|
879
|
$ptag = $pos->{'_tag'}; # yes, needs updating |
329
|
|
|
|
|
|
|
} |
330
|
|
|
|
|
|
|
|
331
|
|
|
|
|
|
|
# end of table-implication block. |
332
|
|
|
|
|
|
|
|
333
|
|
|
|
|
|
|
# Now maybe do a little dance to enforce P-strictness. |
334
|
|
|
|
|
|
|
# This seems like it should be integrated with the big |
335
|
|
|
|
|
|
|
# "ALL HOPE..." block, further below, but that doesn't |
336
|
|
|
|
|
|
|
# seem feasable. |
337
|
595
|
0
|
33
|
|
|
1100
|
if ( $self->{'_p_strict'} |
|
|
|
0
|
|
|
|
|
338
|
|
|
|
|
|
|
and $HTML::DOM::_TreeBuilder::isKnown{$tag} |
339
|
|
|
|
|
|
|
and not $HTML::Tagset::is_Possible_Strict_P_Content{$tag} ) |
340
|
|
|
|
|
|
|
{ |
341
|
0
|
|
|
|
|
0
|
my $here = $pos; |
342
|
0
|
|
|
|
|
0
|
my $here_tag = $ptag; |
343
|
0
|
|
|
|
|
0
|
while (1) { |
344
|
0
|
0
|
|
|
|
0
|
if ( $here_tag eq 'p' ) { |
345
|
0
|
|
|
|
|
0
|
print $indent, " * Inserting $tag closes strict P.\n" |
346
|
|
|
|
|
|
|
if DEBUG > 1; |
347
|
0
|
|
|
|
|
0
|
$self->end( \q{p} ); |
348
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
# NB: same as \'q', but less confusing to emacs cperl-mode |
350
|
0
|
|
|
|
|
0
|
last; |
351
|
|
|
|
|
|
|
} |
352
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
#print("Lasting from $here_tag\n"), |
354
|
|
|
|
|
|
|
last |
355
|
|
|
|
|
|
|
if $HTML::DOM::_TreeBuilder::isKnown{$here_tag} |
356
|
|
|
|
|
|
|
and |
357
|
|
|
|
|
|
|
not $HTML::Tagset::is_Possible_Strict_P_Content{ |
358
|
0
|
0
|
0
|
|
|
0
|
$here_tag}; |
359
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
# Don't keep looking up the tree if we see something that can't |
361
|
|
|
|
|
|
|
# be strict-P content. |
362
|
|
|
|
|
|
|
|
363
|
|
|
|
|
|
|
$here_tag |
364
|
0
|
|
0
|
|
|
0
|
= ( $here = $here->{'_parent'} || last )->{'_tag'}; |
365
|
|
|
|
|
|
|
} # end while |
366
|
|
|
|
|
|
|
$ptag = ( $pos = $self->{'_pos'} || $self ) |
367
|
0
|
|
0
|
|
|
0
|
->{'_tag'}; # better update! |
368
|
|
|
|
|
|
|
} |
369
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
# end of strict-p block. |
371
|
|
|
|
|
|
|
} |
372
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
# And now, get busy... |
374
|
|
|
|
|
|
|
#---------------------------------------------------------------------- |
375
|
595
|
50
|
|
|
|
2041
|
if ( !$self->{'_implicit_tags'} ) { # bimskalabim |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
376
|
|
|
|
|
|
|
# do nothing |
377
|
0
|
|
|
|
|
0
|
print $indent, " * _implicit_tags is off. doing nothing\n" |
378
|
|
|
|
|
|
|
if DEBUG > 1; |
379
|
|
|
|
|
|
|
|
380
|
|
|
|
|
|
|
#---------------------------------------------------------------------- |
381
|
|
|
|
|
|
|
} |
382
|
|
|
|
|
|
|
elsif ( $HTML::DOM::_TreeBuilder::isHeadOrBodyElement{$tag} ) { |
383
|
19
|
100
|
|
|
|
68
|
if ( $pos->is_inside('body') ) { # all is well |
|
|
100
|
|
|
|
|
|
384
|
9
|
|
|
|
|
15
|
print $indent, |
385
|
|
|
|
|
|
|
" * ambilocal element \U$tag\E is fine under BODY.\n" |
386
|
|
|
|
|
|
|
if DEBUG > 1; |
387
|
|
|
|
|
|
|
} |
388
|
|
|
|
|
|
|
elsif ( $pos->is_inside('head') ) { |
389
|
4
|
|
|
|
|
14
|
print $indent, |
390
|
|
|
|
|
|
|
" * ambilocal element \U$tag\E is fine under HEAD.\n" |
391
|
|
|
|
|
|
|
if DEBUG > 1; |
392
|
|
|
|
|
|
|
} |
393
|
|
|
|
|
|
|
else { |
394
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
# In neither head nor body! mmmmm... put under head? |
396
|
|
|
|
|
|
|
|
397
|
6
|
50
|
|
|
|
15
|
if ( $ptag eq 'html' ) { # expected case |
398
|
|
|
|
|
|
|
# TODO?? : would there ever be a case where _head would be |
399
|
|
|
|
|
|
|
# absent from a tree that would ever be accessed at this |
400
|
|
|
|
|
|
|
# point? |
401
|
6
|
50
|
|
|
|
18
|
die "Where'd my head go?" unless ref $self->{'_head'}; |
402
|
6
|
50
|
|
|
|
14
|
if ( $self->{'_head'}{'_implicit'} ) { |
403
|
6
|
|
|
|
|
9
|
print $indent, |
404
|
|
|
|
|
|
|
" * ambilocal element \U$tag\E makes an implicit HEAD.\n" |
405
|
|
|
|
|
|
|
if DEBUG > 1; |
406
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
# or rather, points us at it. |
408
|
|
|
|
|
|
|
$self->{'_pos'} |
409
|
6
|
|
|
|
|
15
|
= $self->{'_head'}; # to insert under... |
410
|
|
|
|
|
|
|
} |
411
|
|
|
|
|
|
|
else { |
412
|
0
|
|
|
|
|
0
|
$self->warning( |
413
|
|
|
|
|
|
|
"Ambilocal element <$tag> not under HEAD or BODY!?" |
414
|
|
|
|
|
|
|
); |
415
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
# Put it under HEAD by default, I guess |
417
|
|
|
|
|
|
|
$self->{'_pos'} |
418
|
0
|
|
|
|
|
0
|
= $self->{'_head'}; # to insert under... |
419
|
|
|
|
|
|
|
} |
420
|
|
|
|
|
|
|
|
421
|
|
|
|
|
|
|
} |
422
|
|
|
|
|
|
|
else { |
423
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
# Neither under head nor body, nor right under html... pass thru? |
425
|
0
|
|
|
|
|
0
|
$self->warning( |
426
|
|
|
|
|
|
|
"Ambilocal element <$tag> neither under head nor body, nor right under html!?" |
427
|
|
|
|
|
|
|
); |
428
|
|
|
|
|
|
|
} |
429
|
|
|
|
|
|
|
} |
430
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
#---------------------------------------------------------------------- |
432
|
|
|
|
|
|
|
} |
433
|
|
|
|
|
|
|
elsif ( $HTML::DOM::_TreeBuilder::isBodyElement{$tag} ) { |
434
|
|
|
|
|
|
|
|
435
|
|
|
|
|
|
|
# Ensure that we are within |
436
|
494
|
100
|
66
|
|
|
1800
|
if ( $ptag eq 'body' ) { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
437
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
# We're good. |
439
|
|
|
|
|
|
|
} |
440
|
|
|
|
|
|
|
elsif ( |
441
|
|
|
|
|
|
|
$HTML::DOM::_TreeBuilder::isBodyElement{$ptag} # glarg |
442
|
|
|
|
|
|
|
and not $HTML::DOM::_TreeBuilder::isHeadOrBodyElement{$ptag} |
443
|
|
|
|
|
|
|
) |
444
|
|
|
|
|
|
|
{ |
445
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
# Special case: Save ourselves a call to is_inside further down. |
447
|
|
|
|
|
|
|
# If our $ptag is an isBodyElement element (but not an |
448
|
|
|
|
|
|
|
# isHeadOrBodyElement element), then we must be under body! |
449
|
346
|
|
|
|
|
442
|
print $indent, " * Inferring that $ptag is under BODY.\n", |
450
|
|
|
|
|
|
|
if DEBUG > 3; |
451
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
# I think this and the test for 'body' trap everything |
453
|
|
|
|
|
|
|
# bodyworthy, except the case where the parent element is |
454
|
|
|
|
|
|
|
# under an unknown element that's a descendant of body. |
455
|
|
|
|
|
|
|
} |
456
|
|
|
|
|
|
|
elsif ( $pos->is_inside('head') ) { |
457
|
12
|
|
|
|
|
18
|
print $indent, |
458
|
|
|
|
|
|
|
" * body-element \U$tag\E minimizes HEAD, makes implicit BODY.\n" |
459
|
|
|
|
|
|
|
if DEBUG > 1; |
460
|
|
|
|
|
|
|
$ptag = ( |
461
|
|
|
|
|
|
|
$pos = $self->{'_pos'} |
462
|
|
|
|
|
|
|
= $self->{'_body'} # yes, needs updating |
463
|
|
|
|
|
|
|
|| die "Where'd my body go?" |
464
|
12
|
|
50
|
|
|
59
|
)->{'_tag'}; # yes, needs updating |
465
|
|
|
|
|
|
|
} |
466
|
|
|
|
|
|
|
elsif ( !$pos->is_inside('body') ) { |
467
|
73
|
|
|
|
|
112
|
print $indent, |
468
|
|
|
|
|
|
|
" * body-element \U$tag\E makes implicit BODY.\n" |
469
|
|
|
|
|
|
|
if DEBUG > 1; |
470
|
|
|
|
|
|
|
$ptag = ( |
471
|
|
|
|
|
|
|
$pos = $self->{'_pos'} |
472
|
|
|
|
|
|
|
= $self->{'_body'} # yes, needs updating |
473
|
|
|
|
|
|
|
|| die "Where'd my body go?" |
474
|
73
|
|
50
|
|
|
234
|
)->{'_tag'}; # yes, needs updating |
475
|
|
|
|
|
|
|
} |
476
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
# else we ARE under body, so okay. |
478
|
|
|
|
|
|
|
|
479
|
|
|
|
|
|
|
# Handle implicit endings and insert based on and position |
480
|
|
|
|
|
|
|
# ... ALL HOPE ABANDON ALL YE WHO ENTER HERE ... |
481
|
494
|
100
|
100
|
|
|
5300
|
if ( $tag eq 'p' |
|
|
100
|
66
|
|
|
|
|
|
|
100
|
66
|
|
|
|
|
|
|
50
|
33
|
|
|
|
|
|
|
100
|
33
|
|
|
|
|
|
|
100
|
33
|
|
|
|
|
|
|
100
|
66
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
|
33
|
|
|
|
|
482
|
|
|
|
|
|
|
or $tag eq 'h1' |
483
|
|
|
|
|
|
|
or $tag eq 'h2' |
484
|
|
|
|
|
|
|
or $tag eq 'h3' |
485
|
|
|
|
|
|
|
or $tag eq 'h4' |
486
|
|
|
|
|
|
|
or $tag eq 'h5' |
487
|
|
|
|
|
|
|
or $tag eq 'h6' |
488
|
|
|
|
|
|
|
or $tag eq 'form' |
489
|
|
|
|
|
|
|
|
490
|
|
|
|
|
|
|
# Hm, should |
491
|
|
|
|
|
|
|
) |
492
|
|
|
|
|
|
|
{ |
493
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
# Can't have , or inside these |
495
|
162
|
|
|
|
|
448
|
$self->end( |
496
|
|
|
|
|
|
|
$_Closed_by_structurals, |
497
|
|
|
|
|
|
|
@HTML::DOM::_TreeBuilder::p_closure_barriers |
498
|
|
|
|
|
|
|
|
499
|
|
|
|
|
|
|
# used to be just li! |
500
|
|
|
|
|
|
|
); |
501
|
|
|
|
|
|
|
|
502
|
|
|
|
|
|
|
} |
503
|
|
|
|
|
|
|
elsif ( $tag eq 'ol' or $tag eq 'ul' or $tag eq 'dl' ) { |
504
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
# Can't have lists inside -- in the unlikely |
506
|
|
|
|
|
|
|
# event anyone tries to put them there! |
507
|
2
|
50
|
33
|
|
|
33
|
if ( $ptag eq 'h1' |
|
|
|
33
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
|
33
|
|
|
|
|
508
|
|
|
|
|
|
|
or $ptag eq 'h2' |
509
|
|
|
|
|
|
|
or $ptag eq 'h3' |
510
|
|
|
|
|
|
|
or $ptag eq 'h4' |
511
|
|
|
|
|
|
|
or $ptag eq 'h5' |
512
|
|
|
|
|
|
|
or $ptag eq 'h6' ) |
513
|
|
|
|
|
|
|
{ |
514
|
0
|
|
|
|
|
0
|
$self->end( \$ptag ); |
515
|
|
|
|
|
|
|
} |
516
|
|
|
|
|
|
|
|
517
|
|
|
|
|
|
|
# TODO: Maybe keep closing up the tree until |
518
|
|
|
|
|
|
|
# the ptag isn't any of the above? |
519
|
|
|
|
|
|
|
# But anyone that says |
520
|
|
|
|
|
|
|
# deserves what they get anyway. |
521
|
|
|
|
|
|
|
|
522
|
|
|
|
|
|
|
} |
523
|
|
|
|
|
|
|
elsif ( $tag eq 'li' ) { # list item |
524
|
|
|
|
|
|
|
# Get under a list tag, one way or another |
525
|
8
|
50
|
33
|
|
|
14
|
unless ( |
526
|
|
|
|
|
|
|
exists $HTML::DOM::_TreeBuilder::isList{$ptag} |
527
|
|
|
|
|
|
|
or $self->end( \q{*}, keys %HTML::DOM::_TreeBuilder::isList ) #' |
528
|
|
|
|
|
|
|
) |
529
|
|
|
|
|
|
|
{ |
530
|
0
|
|
|
|
|
0
|
print $indent, |
531
|
|
|
|
|
|
|
" * inserting implicit UL for lack of containing ", |
532
|
|
|
|
|
|
|
join( '|', keys %HTML::DOM::_TreeBuilder::isList ), ".\n" |
533
|
|
|
|
|
|
|
if DEBUG > 1; |
534
|
0
|
|
|
|
|
0
|
$self->insert_element( 'ul', 1 ); |
535
|
|
|
|
|
|
|
} |
536
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
} |
538
|
|
|
|
|
|
|
elsif ( $tag eq 'dt' or $tag eq 'dd' ) { |
539
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
# Get under a DL, one way or another |
541
|
0
|
0
|
0
|
|
|
0
|
unless ( $ptag eq 'dl' or $self->end( \q{*}, 'dl' ) ) { #' |
542
|
0
|
|
|
|
|
0
|
print $indent, |
543
|
|
|
|
|
|
|
" * inserting implicit DL for lack of containing DL.\n" |
544
|
|
|
|
|
|
|
if DEBUG > 1; |
545
|
0
|
|
|
|
|
0
|
$self->insert_element( 'dl', 1 ); |
546
|
|
|
|
|
|
|
} |
547
|
|
|
|
|
|
|
|
548
|
|
|
|
|
|
|
} |
549
|
|
|
|
|
|
|
elsif ( $HTML::DOM::_TreeBuilder::isFormElement{$tag} ) { |
550
|
142
|
50
|
33
|
|
|
321
|
if ($self->{ |
551
|
|
|
|
|
|
|
'_ignore_formies_outside_form'} # TODO: document this |
552
|
|
|
|
|
|
|
and not $pos->is_inside('form') |
553
|
|
|
|
|
|
|
) |
554
|
|
|
|
|
|
|
{ |
555
|
0
|
|
|
|
|
0
|
print $indent, |
556
|
|
|
|
|
|
|
" * ignoring \U$tag\E because not in a FORM.\n" |
557
|
|
|
|
|
|
|
if DEBUG > 1; |
558
|
0
|
|
|
|
|
0
|
return; # bypass tweaking. |
559
|
|
|
|
|
|
|
} |
560
|
142
|
100
|
|
|
|
277
|
if ( $tag eq 'option' ) { |
561
|
|
|
|
|
|
|
|
562
|
|
|
|
|
|
|
# return unless $ptag eq 'select'; |
563
|
45
|
|
|
|
|
130
|
$self->end( \q{option} ); |
564
|
45
|
|
33
|
|
|
103
|
$ptag = ( $self->{'_pos'} || $self )->{'_tag'}; |
565
|
45
|
50
|
33
|
|
|
105
|
unless ( $ptag eq 'select' or $ptag eq 'optgroup' ) { |
566
|
0
|
|
|
|
|
0
|
print $indent, |
567
|
|
|
|
|
|
|
" * \U$tag\E makes an implicit SELECT.\n" |
568
|
|
|
|
|
|
|
if DEBUG > 1; |
569
|
0
|
|
|
|
|
0
|
$pos = $self->insert_element( 'select', 1 ); |
570
|
|
|
|
|
|
|
|
571
|
|
|
|
|
|
|
# but not a very useful select -- has no 'name' attribute! |
572
|
|
|
|
|
|
|
# is $pos's value used after this? |
573
|
|
|
|
|
|
|
} |
574
|
|
|
|
|
|
|
} |
575
|
|
|
|
|
|
|
} |
576
|
|
|
|
|
|
|
elsif ( $HTML::DOM::_TreeBuilder::isTableElement{$tag} ) { |
577
|
45
|
50
|
|
|
|
143
|
if ( !$pos->is_inside('table') ) { |
578
|
0
|
|
|
|
|
0
|
print $indent, " * \U$tag\E makes an implicit TABLE\n" |
579
|
|
|
|
|
|
|
if DEBUG > 1; |
580
|
0
|
|
|
|
|
0
|
$self->insert_element( 'table', 1 ); |
581
|
|
|
|
|
|
|
} |
582
|
|
|
|
|
|
|
|
583
|
45
|
100
|
100
|
|
|
151
|
if ( $tag eq 'td' or $tag eq 'th' ) { |
584
|
|
|
|
|
|
|
|
585
|
|
|
|
|
|
|
# Get under a tr one way or another |
586
|
22
|
50
|
66
|
|
|
69
|
unless ( |
587
|
|
|
|
|
|
|
$ptag eq 'tr' # either under a tr |
588
|
|
|
|
|
|
|
or $self->end( \q{*}, 'tr', |
589
|
|
|
|
|
|
|
'table' ) #or we can get under one |
590
|
|
|
|
|
|
|
) |
591
|
|
|
|
|
|
|
{ |
592
|
0
|
|
|
|
|
0
|
print $indent, |
593
|
|
|
|
|
|
|
" * \U$tag\E under \U$ptag\E makes an implicit TR\n" |
594
|
|
|
|
|
|
|
if DEBUG > 1; |
595
|
0
|
|
|
|
|
0
|
$self->insert_element( 'tr', 1 ); |
596
|
|
|
|
|
|
|
|
597
|
|
|
|
|
|
|
# presumably pos's value isn't used after this. |
598
|
|
|
|
|
|
|
} |
599
|
|
|
|
|
|
|
} |
600
|
|
|
|
|
|
|
else { |
601
|
23
|
|
|
|
|
101
|
$self->end( \$tag, 'table' ); #' |
602
|
|
|
|
|
|
|
} |
603
|
|
|
|
|
|
|
|
604
|
|
|
|
|
|
|
# Hmm, I guess this is right. To work it out: |
605
|
|
|
|
|
|
|
# tr closes any open tr (limited at a table) |
606
|
|
|
|
|
|
|
# thead closes any open thead (limited at a table) |
607
|
|
|
|
|
|
|
# tbody closes any open tbody (limited at a table) |
608
|
|
|
|
|
|
|
# tfoot closes any open tfoot (limited at a table) |
609
|
|
|
|
|
|
|
# colgroup closes any open colgroup (limited at a table) |
610
|
|
|
|
|
|
|
# col can try, but will always fail, at the enclosing table, |
611
|
|
|
|
|
|
|
# as col is empty, and therefore never open! |
612
|
|
|
|
|
|
|
# But! |
613
|
|
|
|
|
|
|
# td closes any open td OR th (limited at a table) |
614
|
|
|
|
|
|
|
# th closes any open th OR td (limited at a table) |
615
|
|
|
|
|
|
|
# ...implementable as "close to a tr, or make a tr" |
616
|
|
|
|
|
|
|
|
617
|
|
|
|
|
|
|
} |
618
|
|
|
|
|
|
|
elsif ( $HTML::DOM::_TreeBuilder::isPhraseMarkup{$tag} ) { |
619
|
61
|
50
|
66
|
|
|
176
|
if ( $ptag eq 'body' and $self->{'_implicit_body_p_tag'} ) { |
620
|
0
|
|
|
|
|
0
|
print |
621
|
|
|
|
|
|
|
" * Phrasal \U$tag\E right under BODY makes an implicit P\n" |
622
|
|
|
|
|
|
|
if DEBUG > 1; |
623
|
0
|
|
|
|
|
0
|
$pos = $self->insert_element( 'p', 1 ); |
624
|
|
|
|
|
|
|
|
625
|
|
|
|
|
|
|
# is $pos's value used after this? |
626
|
|
|
|
|
|
|
} |
627
|
|
|
|
|
|
|
} |
628
|
|
|
|
|
|
|
|
629
|
|
|
|
|
|
|
# End of implicit endings logic |
630
|
|
|
|
|
|
|
|
631
|
|
|
|
|
|
|
# End of "elsif ($HTML::DOM::_TreeBuilder::isBodyElement{$tag}" |
632
|
|
|
|
|
|
|
#---------------------------------------------------------------------- |
633
|
|
|
|
|
|
|
|
634
|
|
|
|
|
|
|
} |
635
|
|
|
|
|
|
|
elsif ( $HTML::DOM::_TreeBuilder::isHeadElement{$tag} ) { |
636
|
42
|
50
|
|
|
|
133
|
if ( $pos->is_inside('body') ) { |
|
|
100
|
|
|
|
|
|
637
|
0
|
|
|
|
|
0
|
print $indent, " * head element \U$tag\E found inside BODY!\n" |
638
|
|
|
|
|
|
|
if DEBUG; |
639
|
0
|
|
|
|
|
0
|
$self->warning("Header element <$tag> in body"); # [sic] |
640
|
|
|
|
|
|
|
} |
641
|
|
|
|
|
|
|
elsif ( !$pos->is_inside('head') ) { |
642
|
21
|
|
|
|
|
32
|
print $indent, |
643
|
|
|
|
|
|
|
" * head element \U$tag\E makes an implicit HEAD.\n" |
644
|
|
|
|
|
|
|
if DEBUG > 1; |
645
|
|
|
|
|
|
|
} |
646
|
|
|
|
|
|
|
else { |
647
|
21
|
|
|
|
|
32
|
print $indent, |
648
|
|
|
|
|
|
|
" * head element \U$tag\E goes inside existing HEAD.\n" |
649
|
|
|
|
|
|
|
if DEBUG > 1; |
650
|
|
|
|
|
|
|
} |
651
|
42
|
|
50
|
|
|
120
|
$self->{'_pos'} = $self->{'_head'} || die "Where'd my head go?"; |
652
|
|
|
|
|
|
|
|
653
|
|
|
|
|
|
|
#---------------------------------------------------------------------- |
654
|
|
|
|
|
|
|
} |
655
|
|
|
|
|
|
|
elsif ( $tag eq 'html' ) { |
656
|
11
|
50
|
|
|
|
30
|
if ( delete $self->{'_implicit'} ) { # first time here |
657
|
11
|
|
|
|
|
16
|
print $indent, " * good! found the real HTML element!\n" |
658
|
|
|
|
|
|
|
if DEBUG > 1; |
659
|
|
|
|
|
|
|
} |
660
|
|
|
|
|
|
|
else { |
661
|
0
|
|
|
|
|
0
|
print $indent, " * Found a second HTML element\n" |
662
|
|
|
|
|
|
|
if DEBUG; |
663
|
0
|
|
|
|
|
0
|
$self->warning("Found a nested element"); |
664
|
|
|
|
|
|
|
} |
665
|
|
|
|
|
|
|
|
666
|
|
|
|
|
|
|
# in either case, migrate attributes to the real element |
667
|
11
|
|
|
|
|
29
|
for ( keys %$attr ) { |
668
|
7
|
|
|
|
|
18
|
$self->attr( $_, $attr->{$_} ); |
669
|
|
|
|
|
|
|
} |
670
|
11
|
|
|
|
|
18
|
$self->{'_pos'} = undef; |
671
|
11
|
|
|
|
|
45
|
return $self; # bypass tweaking. |
672
|
|
|
|
|
|
|
|
673
|
|
|
|
|
|
|
#---------------------------------------------------------------------- |
674
|
|
|
|
|
|
|
} |
675
|
|
|
|
|
|
|
elsif ( $tag eq 'head' ) { |
676
|
5
|
|
50
|
|
|
17
|
my $head = $self->{'_head'} || die "Where'd my head go?"; |
677
|
5
|
50
|
|
|
|
15
|
if ( delete $head->{'_implicit'} ) { # first time here |
678
|
5
|
|
|
|
|
9
|
print $indent, " * good! found the real HEAD element!\n" |
679
|
|
|
|
|
|
|
if DEBUG > 1; |
680
|
|
|
|
|
|
|
} |
681
|
|
|
|
|
|
|
else { # been here before |
682
|
0
|
|
|
|
|
0
|
print $indent, " * Found a second HEAD element\n" |
683
|
|
|
|
|
|
|
if DEBUG; |
684
|
0
|
|
|
|
|
0
|
$self->warning("Found a second element"); |
685
|
|
|
|
|
|
|
} |
686
|
|
|
|
|
|
|
|
687
|
|
|
|
|
|
|
# in either case, migrate attributes to the real element |
688
|
5
|
|
|
|
|
14
|
for ( keys %$attr ) { |
689
|
0
|
|
|
|
|
0
|
$head->attr( $_, $attr->{$_} ); |
690
|
|
|
|
|
|
|
} |
691
|
5
|
|
|
|
|
19
|
return $self->{'_pos'} = $head; # bypass tweaking. |
692
|
|
|
|
|
|
|
|
693
|
|
|
|
|
|
|
#---------------------------------------------------------------------- |
694
|
|
|
|
|
|
|
} |
695
|
|
|
|
|
|
|
elsif ( $tag eq 'body' ) { |
696
|
22
|
|
50
|
|
|
74
|
my $body = $self->{'_body'} || die "Where'd my body go?"; |
697
|
22
|
50
|
|
|
|
57
|
if ( delete $body->{'_implicit'} ) { # first time here |
698
|
22
|
|
|
|
|
31
|
print $indent, " * good! found the real BODY element!\n" |
699
|
|
|
|
|
|
|
if DEBUG > 1; |
700
|
|
|
|
|
|
|
} |
701
|
|
|
|
|
|
|
else { # been here before |
702
|
0
|
|
|
|
|
0
|
print $indent, " * Found a second BODY element\n" |
703
|
|
|
|
|
|
|
if DEBUG; |
704
|
0
|
|
|
|
|
0
|
$self->warning("Found a second element"); |
705
|
|
|
|
|
|
|
} |
706
|
|
|
|
|
|
|
|
707
|
|
|
|
|
|
|
# in either case, migrate attributes to the real element |
708
|
22
|
|
|
|
|
64
|
for ( keys %$attr ) { |
709
|
21
|
|
|
|
|
52
|
$body->attr( $_, $attr->{$_} ); |
710
|
|
|
|
|
|
|
} |
711
|
22
|
100
|
|
|
|
87
|
$self->{'_pos'} = $body unless $pos->is_inside('body'); |
712
|
22
|
|
|
|
|
89
|
return $body; # bypass tweaking. |
713
|
|
|
|
|
|
|
|
714
|
|
|
|
|
|
|
#---------------------------------------------------------------------- |
715
|
|
|
|
|
|
|
} |
716
|
|
|
|
|
|
|
elsif ( $tag eq 'frameset' ) { |
717
|
0
|
0
|
0
|
|
|
0
|
if (!( $self->{'_frameset_seen'}++ ) # first frameset seen |
|
|
|
0
|
|
|
|
|
718
|
|
|
|
|
|
|
and !$self->{'_noframes_seen'} |
719
|
|
|
|
|
|
|
|
720
|
|
|
|
|
|
|
# otherwise it'll be under the noframes already |
721
|
|
|
|
|
|
|
and !$self->is_inside('body') |
722
|
|
|
|
|
|
|
) |
723
|
|
|
|
|
|
|
{ |
724
|
|
|
|
|
|
|
|
725
|
|
|
|
|
|
|
# The following is a bit of a hack. We don't use the normal |
726
|
|
|
|
|
|
|
# insert_element because 1) we don't want it as _pos, but instead |
727
|
|
|
|
|
|
|
# right under $self, and 2), more importantly, that we don't want |
728
|
|
|
|
|
|
|
# this inserted at the /end/ of $self's content_list, but instead |
729
|
|
|
|
|
|
|
# in the middle of it, specifiaclly right before the body element. |
730
|
|
|
|
|
|
|
# |
731
|
0
|
|
0
|
|
|
0
|
my $c = $self->{'_content'} || die "Contentless root?"; |
732
|
0
|
|
0
|
|
|
0
|
my $body = $self->{'_body'} || die "Where'd my BODY go?"; |
733
|
0
|
|
|
|
|
0
|
for ( my $i = 0; $i < @$c; ++$i ) { |
734
|
0
|
0
|
|
|
|
0
|
if ( $c->[$i] eq $body ) { |
735
|
0
|
|
|
|
|
0
|
splice( @$c, $i, 0, $self->{'_pos'} = $pos = $e ); |
736
|
0
|
|
|
|
|
0
|
$e->{'_parent'} = $self; |
737
|
0
|
|
|
|
|
0
|
$already_inserted = 1; |
738
|
0
|
|
|
|
|
0
|
print $indent, |
739
|
|
|
|
|
|
|
" * inserting 'frameset' right before BODY.\n" |
740
|
|
|
|
|
|
|
if DEBUG > 1; |
741
|
0
|
|
|
|
|
0
|
last; |
742
|
|
|
|
|
|
|
} |
743
|
|
|
|
|
|
|
} |
744
|
0
|
0
|
|
|
|
0
|
die "BODY not found in children of root?" |
745
|
|
|
|
|
|
|
unless $already_inserted; |
746
|
|
|
|
|
|
|
} |
747
|
|
|
|
|
|
|
|
748
|
|
|
|
|
|
|
} |
749
|
|
|
|
|
|
|
elsif ( $tag eq 'frame' ) { |
750
|
|
|
|
|
|
|
|
751
|
|
|
|
|
|
|
# Okay, fine, pass thru. |
752
|
|
|
|
|
|
|
# Should probably enforce that these should be under a frameset. |
753
|
|
|
|
|
|
|
# But hey. Ditto for enforcing that 'noframes' should be under |
754
|
|
|
|
|
|
|
# a 'frameset', as the DTDs say. |
755
|
|
|
|
|
|
|
|
756
|
|
|
|
|
|
|
} |
757
|
|
|
|
|
|
|
elsif ( $tag eq 'noframes' ) { |
758
|
|
|
|
|
|
|
|
759
|
|
|
|
|
|
|
# This basically assumes there'll be exactly one 'noframes' element |
760
|
|
|
|
|
|
|
# per document. At least, only the first one gets to have the |
761
|
|
|
|
|
|
|
# body under it. And if there are no noframes elements, then |
762
|
|
|
|
|
|
|
# the body pretty much stays where it is. Is that ever a problem? |
763
|
0
|
0
|
|
|
|
0
|
if ( $self->{'_noframes_seen'}++ ) { |
764
|
0
|
|
|
|
|
0
|
print $indent, " * ANOTHER noframes element?\n" if DEBUG; |
765
|
|
|
|
|
|
|
} |
766
|
|
|
|
|
|
|
else { |
767
|
0
|
0
|
|
|
|
0
|
if ( $pos->is_inside('body') ) { |
768
|
0
|
|
|
|
|
0
|
print $indent, " * 'noframes' inside 'body'. Odd!\n" |
769
|
|
|
|
|
|
|
if DEBUG; |
770
|
|
|
|
|
|
|
|
771
|
|
|
|
|
|
|
# In that odd case, we /can't/ make body a child of 'noframes', |
772
|
|
|
|
|
|
|
# because it's an ancestor of the 'noframes'! |
773
|
|
|
|
|
|
|
} |
774
|
|
|
|
|
|
|
else { |
775
|
0
|
|
0
|
|
|
0
|
$e->push_content( $self->{'_body'} |
776
|
|
|
|
|
|
|
|| die "Where'd my body go?" ); |
777
|
0
|
|
|
|
|
0
|
print $indent, " * Moving body to be under noframes.\n" |
778
|
|
|
|
|
|
|
if DEBUG; |
779
|
|
|
|
|
|
|
} |
780
|
|
|
|
|
|
|
} |
781
|
|
|
|
|
|
|
|
782
|
|
|
|
|
|
|
#---------------------------------------------------------------------- |
783
|
|
|
|
|
|
|
} |
784
|
|
|
|
|
|
|
else { |
785
|
|
|
|
|
|
|
|
786
|
|
|
|
|
|
|
# unknown tag |
787
|
2
|
50
|
|
|
|
5
|
if ( $self->{'_ignore_unknown'} ) { |
788
|
2
|
|
|
|
|
3
|
print $indent, " * Ignoring unknown tag \U$tag\E\n" if DEBUG; |
789
|
2
|
|
|
|
|
17
|
$self->warning("Skipping unknown tag $tag"); |
790
|
2
|
|
|
|
|
5
|
return; |
791
|
|
|
|
|
|
|
} |
792
|
|
|
|
|
|
|
else { |
793
|
0
|
|
|
|
|
0
|
print $indent, " * Accepting unknown tag \U$tag\E\n" |
794
|
|
|
|
|
|
|
if DEBUG; |
795
|
|
|
|
|
|
|
} |
796
|
|
|
|
|
|
|
} |
797
|
|
|
|
|
|
|
|
798
|
|
|
|
|
|
|
#---------------------------------------------------------------------- |
799
|
|
|
|
|
|
|
# End of mumbo-jumbo |
800
|
|
|
|
|
|
|
|
801
|
|
|
|
|
|
|
print $indent, "(Attaching ", $e->{'_tag'}, " under ", |
802
|
555
|
|
|
|
|
679
|
( $self->{'_pos'} || $self )->{'_tag'}, ")\n" |
803
|
|
|
|
|
|
|
|
804
|
|
|
|
|
|
|
# because if _pos isn't defined, it goes under self |
805
|
|
|
|
|
|
|
if DEBUG; |
806
|
|
|
|
|
|
|
|
807
|
|
|
|
|
|
|
# The following if-clause is to delete /some/ ignorable whitespace |
808
|
|
|
|
|
|
|
# nodes, as we're making the tree. |
809
|
|
|
|
|
|
|
# This'd be a node we'd catch later anyway, but we might as well |
810
|
|
|
|
|
|
|
# nip it in the bud now. |
811
|
|
|
|
|
|
|
# This doesn't catch /all/ deletable WS-nodes, so we do have to call |
812
|
|
|
|
|
|
|
# the tightener later to catch the rest. |
813
|
|
|
|
|
|
|
|
814
|
555
|
50
|
33
|
|
|
1172
|
if ( $self->{'_tighten'} and !$self->{'_ignore_text'} ) |
815
|
|
|
|
|
|
|
{ # if tightenable |
816
|
0
|
|
|
|
|
0
|
my ( $sibs, $par ); |
817
|
0
|
0
|
0
|
|
|
0
|
if (( $sibs = ( $par = $self->{'_pos'} || $self )->{'_content'} ) |
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
818
|
|
|
|
|
|
|
and @$sibs # parent already has content |
819
|
|
|
|
|
|
|
and ! |
820
|
|
|
|
|
|
|
ref( $sibs->[-1] ) # and the last one there is a text node |
821
|
|
|
|
|
|
|
and $sibs->[-1] !~ m<[^\n\r\f\t ]>s # and it's all whitespace |
822
|
|
|
|
|
|
|
|
823
|
|
|
|
|
|
|
and ( # one of these has to be eligible... |
824
|
|
|
|
|
|
|
$HTML::DOM::_TreeBuilder::canTighten{$tag} |
825
|
|
|
|
|
|
|
or (( @$sibs == 1 ) |
826
|
|
|
|
|
|
|
? # WS is leftmost -- so parent matters |
827
|
|
|
|
|
|
|
$HTML::DOM::_TreeBuilder::canTighten{ $par->{'_tag'} } |
828
|
|
|
|
|
|
|
: # WS is after another node -- it matters |
829
|
|
|
|
|
|
|
( ref $sibs->[-2] |
830
|
|
|
|
|
|
|
and |
831
|
|
|
|
|
|
|
$HTML::DOM::_TreeBuilder::canTighten{ $sibs->[-2] |
832
|
|
|
|
|
|
|
{'_tag'} } |
833
|
|
|
|
|
|
|
) |
834
|
|
|
|
|
|
|
) |
835
|
|
|
|
|
|
|
) |
836
|
|
|
|
|
|
|
|
837
|
|
|
|
|
|
|
and !$par->is_inside( 'pre', 'xmp', 'textarea', 'plaintext' ) |
838
|
|
|
|
|
|
|
|
839
|
|
|
|
|
|
|
# we're clear |
840
|
|
|
|
|
|
|
) |
841
|
|
|
|
|
|
|
{ |
842
|
0
|
|
|
|
|
0
|
pop @$sibs; |
843
|
0
|
|
|
|
|
0
|
print $indent, "Popping a preceding all-WS node\n" if DEBUG; |
844
|
|
|
|
|
|
|
} |
845
|
|
|
|
|
|
|
} |
846
|
|
|
|
|
|
|
|
847
|
555
|
50
|
|
|
|
1647
|
$self->insert_element($e) unless $already_inserted; |
848
|
|
|
|
|
|
|
|
849
|
555
|
|
|
|
|
614
|
if (DEBUG) { |
850
|
|
|
|
|
|
|
if ( $self->{'_pos'} ) { |
851
|
|
|
|
|
|
|
print $indent, "(Current lineage of pos: \U$tag\E under ", |
852
|
|
|
|
|
|
|
join( |
853
|
|
|
|
|
|
|
'/', |
854
|
|
|
|
|
|
|
reverse( |
855
|
|
|
|
|
|
|
|
856
|
|
|
|
|
|
|
# $self->{'_pos'}{'_tag'}, # don't list myself! |
857
|
|
|
|
|
|
|
$self->{'_pos'}->lineage_tag_names |
858
|
|
|
|
|
|
|
) |
859
|
|
|
|
|
|
|
), |
860
|
|
|
|
|
|
|
".)\n"; |
861
|
|
|
|
|
|
|
} |
862
|
|
|
|
|
|
|
else { |
863
|
|
|
|
|
|
|
print $indent, "(Pos points nowhere!?)\n"; |
864
|
|
|
|
|
|
|
} |
865
|
|
|
|
|
|
|
} |
866
|
|
|
|
|
|
|
|
867
|
555
|
100
|
50
|
|
|
1791
|
unless ( ( $self->{'_pos'} || '' ) eq $e ) { |
868
|
|
|
|
|
|
|
|
869
|
|
|
|
|
|
|
# if it's an empty element -- i.e., if it didn't change the _pos |
870
|
107
|
|
|
|
|
342
|
&{ $self->{"_tweak_$tag"} |
871
|
107
|
50
|
33
|
|
|
614
|
|| $self->{'_tweak_*'} |
872
|
|
|
|
|
|
|
|| return $e }( map $_, $e, $tag, $self ) |
873
|
|
|
|
|
|
|
; # make a list so the user can't clobber |
874
|
|
|
|
|
|
|
} |
875
|
|
|
|
|
|
|
|
876
|
555
|
|
|
|
|
1324
|
return $e; |
877
|
|
|
|
|
|
|
} |
878
|
|
|
|
|
|
|
} |
879
|
|
|
|
|
|
|
|
880
|
|
|
|
|
|
|
#========================================================================== |
881
|
|
|
|
|
|
|
|
882
|
|
|
|
|
|
|
{ |
883
|
|
|
|
|
|
|
my $indent; |
884
|
|
|
|
|
|
|
|
885
|
|
|
|
|
|
|
sub end { |
886
|
602
|
50
|
|
602
|
1
|
1136
|
return if $_[0]{'_stunted'}; |
887
|
|
|
|
|
|
|
|
888
|
|
|
|
|
|
|
# Either: Acccept an end-tag signal from HTML::Parser |
889
|
|
|
|
|
|
|
# Or: Method for closing currently open elements in some fairly complex |
890
|
|
|
|
|
|
|
# way, as used by other methods in this class. |
891
|
602
|
|
|
|
|
1432
|
my ( $self, $tag, @stop ) = @_; |
892
|
602
|
50
|
|
|
|
1091
|
if ( $tag eq 'x-html' ) { |
893
|
0
|
|
|
|
|
0
|
print "Ignoring close-x-html tag.\n" if DEBUG; |
894
|
|
|
|
|
|
|
|
895
|
|
|
|
|
|
|
# inserted by some lame code-generators. |
896
|
0
|
|
|
|
|
0
|
return; |
897
|
|
|
|
|
|
|
} |
898
|
|
|
|
|
|
|
|
899
|
602
|
50
|
66
|
|
|
2248
|
unless ( ref($tag) or $tag =~ m/^[-_a-zA-Z0-9:%]+$/s ) { |
900
|
0
|
|
|
|
|
0
|
DEBUG and print "End-tag name $tag is no good. Skipping.\n"; |
901
|
0
|
|
|
|
|
0
|
return; |
902
|
|
|
|
|
|
|
|
903
|
|
|
|
|
|
|
# This avoids having Element's new() throw an exception. |
904
|
|
|
|
|
|
|
} |
905
|
|
|
|
|
|
|
|
906
|
|
|
|
|
|
|
# This method accepts two calling formats: |
907
|
|
|
|
|
|
|
# 1) from Parser: $self->end('tag_name', 'origtext') |
908
|
|
|
|
|
|
|
# in which case we shouldn't mistake origtext as a blocker tag |
909
|
|
|
|
|
|
|
# 2) from myself: $self->end(\q{tagname1}, 'blk1', ... ) |
910
|
|
|
|
|
|
|
# from myself: $self->end(['tagname1', 'tagname2'], 'blk1', ... ) |
911
|
|
|
|
|
|
|
|
912
|
|
|
|
|
|
|
# End the specified tag, but don't move above any of the blocker tags. |
913
|
|
|
|
|
|
|
# The tag can also be a reference to an array. Terminate the first |
914
|
|
|
|
|
|
|
# tag found. |
915
|
|
|
|
|
|
|
|
916
|
602
|
|
66
|
|
|
1432
|
my $ptag = ( my $p = $self->{'_pos'} || $self )->{'_tag'}; |
917
|
|
|
|
|
|
|
|
918
|
|
|
|
|
|
|
# $p and $ptag are sort-of stratch |
919
|
|
|
|
|
|
|
|
920
|
602
|
100
|
|
|
|
988
|
if ( ref($tag) ) { |
921
|
|
|
|
|
|
|
|
922
|
|
|
|
|
|
|
# First param is a ref of one sort or another -- |
923
|
|
|
|
|
|
|
# THE CALL IS COMING FROM INSIDE THE HOUSE! |
924
|
244
|
100
|
|
|
|
494
|
$tag = $$tag if ref($tag) eq 'SCALAR'; |
925
|
|
|
|
|
|
|
|
926
|
|
|
|
|
|
|
# otherwise it's an arrayref. |
927
|
|
|
|
|
|
|
} |
928
|
|
|
|
|
|
|
else { |
929
|
|
|
|
|
|
|
|
930
|
|
|
|
|
|
|
# the call came from Parser -- just ignore origtext |
931
|
|
|
|
|
|
|
# except in a table ignore unmatched table tags RT #59980 |
932
|
358
|
100
|
|
|
|
767
|
@stop = $tag =~ /^t[hdr]\z/ ? 'table' : (); |
933
|
|
|
|
|
|
|
} |
934
|
|
|
|
|
|
|
|
935
|
|
|
|
|
|
|
#my($indent); |
936
|
602
|
|
|
|
|
662
|
if (DEBUG) { |
937
|
|
|
|
|
|
|
|
938
|
|
|
|
|
|
|
# optimization -- don't figure out depth unless we're in debug mode |
939
|
|
|
|
|
|
|
my @lineage_tags = $p->lineage_tag_names; |
940
|
|
|
|
|
|
|
$indent = ' ' x ( 1 + @lineage_tags ); |
941
|
|
|
|
|
|
|
|
942
|
|
|
|
|
|
|
# now announce ourselves |
943
|
|
|
|
|
|
|
print $indent, "Ending ", |
944
|
|
|
|
|
|
|
ref($tag) ? ( '[', join( ' ', @$tag ), ']' ) : "\U$tag\E", |
945
|
|
|
|
|
|
|
scalar(@stop) |
946
|
|
|
|
|
|
|
? ( " no higher than [", join( ' ', @stop ), "]" ) |
947
|
|
|
|
|
|
|
: (), ".\n"; |
948
|
|
|
|
|
|
|
|
949
|
|
|
|
|
|
|
print $indent, " (Current lineage: ", join( '/', @lineage_tags ), |
950
|
|
|
|
|
|
|
".)\n" |
951
|
|
|
|
|
|
|
if DEBUG > 1; |
952
|
|
|
|
|
|
|
|
953
|
|
|
|
|
|
|
if ( DEBUG > 3 ) { |
954
|
|
|
|
|
|
|
|
955
|
|
|
|
|
|
|
#my( |
956
|
|
|
|
|
|
|
# $package, $filename, $line, $subroutine, |
957
|
|
|
|
|
|
|
# $hasargs, $wantarray, $evaltext, $is_require) = caller; |
958
|
|
|
|
|
|
|
print $indent, |
959
|
|
|
|
|
|
|
" (Called from ", ( caller(1) )[3], ' line ', |
960
|
|
|
|
|
|
|
( caller(1) )[2], |
961
|
|
|
|
|
|
|
")\n"; |
962
|
|
|
|
|
|
|
} |
963
|
|
|
|
|
|
|
|
964
|
|
|
|
|
|
|
#} else { |
965
|
|
|
|
|
|
|
# $indent = ' '; |
966
|
|
|
|
|
|
|
} |
967
|
|
|
|
|
|
|
|
968
|
|
|
|
|
|
|
# End of if DEBUG |
969
|
|
|
|
|
|
|
|
970
|
|
|
|
|
|
|
# Now actually do it |
971
|
602
|
|
|
|
|
713
|
my @to_close; |
972
|
602
|
100
|
|
|
|
1157
|
if ( $tag eq '*' ) { |
|
|
100
|
|
|
|
|
|
973
|
|
|
|
|
|
|
|
974
|
|
|
|
|
|
|
# Special -- close everything up to (but not including) the first |
975
|
|
|
|
|
|
|
# limiting tag, or return if none found. Somewhat of a special case. |
976
|
|
|
|
|
|
|
PARENT: |
977
|
8
|
|
|
|
|
17
|
while ( defined $p ) { |
978
|
21
|
|
|
|
|
29
|
$ptag = $p->{'_tag'}; |
979
|
21
|
|
|
|
|
20
|
print $indent, " (Looking at $ptag.)\n" if DEBUG > 2; |
980
|
21
|
|
|
|
|
31
|
for (@stop) { |
981
|
34
|
100
|
|
|
|
52
|
if ( $ptag eq $_ ) { |
982
|
8
|
|
|
|
|
9
|
print $indent, |
983
|
|
|
|
|
|
|
" (Hit a $_; closing everything up to here.)\n" |
984
|
|
|
|
|
|
|
if DEBUG > 2; |
985
|
8
|
|
|
|
|
15
|
last PARENT; |
986
|
|
|
|
|
|
|
} |
987
|
|
|
|
|
|
|
} |
988
|
13
|
|
|
|
|
17
|
push @to_close, $p; |
989
|
13
|
|
|
|
|
17
|
$p = $p->{'_parent'}; # no match so far? keep moving up |
990
|
|
|
|
|
|
|
print $indent, |
991
|
13
|
|
|
|
|
18
|
" (Moving on up to ", $p ? $p->{'_tag'} : 'nil', ")\n" |
992
|
|
|
|
|
|
|
if DEBUG > 1; |
993
|
|
|
|
|
|
|
} |
994
|
8
|
50
|
|
|
|
15
|
unless ( defined $p ) { # We never found what we were looking for. |
995
|
0
|
|
|
|
|
0
|
print $indent, " (We never found a limit.)\n" if DEBUG > 1; |
996
|
0
|
|
|
|
|
0
|
return; |
997
|
|
|
|
|
|
|
} |
998
|
|
|
|
|
|
|
|
999
|
|
|
|
|
|
|
#print |
1000
|
|
|
|
|
|
|
# $indent, |
1001
|
|
|
|
|
|
|
# " (To close: ", join('/', map $_->tag, @to_close), ".)\n" |
1002
|
|
|
|
|
|
|
# if DEBUG > 4; |
1003
|
|
|
|
|
|
|
|
1004
|
|
|
|
|
|
|
# Otherwise update pos and fall thru. |
1005
|
8
|
|
|
|
|
14
|
$self->{'_pos'} = $p; |
1006
|
|
|
|
|
|
|
} |
1007
|
|
|
|
|
|
|
elsif ( ref $tag ) { |
1008
|
|
|
|
|
|
|
|
1009
|
|
|
|
|
|
|
# Close the first of any of the matching tags, giving up if you hit |
1010
|
|
|
|
|
|
|
# any of the stop-tags. |
1011
|
|
|
|
|
|
|
PARENT: |
1012
|
162
|
|
|
|
|
315
|
while ( defined $p ) { |
1013
|
285
|
|
|
|
|
454
|
$ptag = $p->{'_tag'}; |
1014
|
285
|
|
|
|
|
352
|
print $indent, " (Looking at $ptag.)\n" if DEBUG > 2; |
1015
|
285
|
|
|
|
|
422
|
for (@$tag) { |
1016
|
2045
|
100
|
|
|
|
3001
|
if ( $ptag eq $_ ) { |
1017
|
65
|
|
|
|
|
72
|
print $indent, " (Closing $_.)\n" if DEBUG > 2; |
1018
|
65
|
|
|
|
|
111
|
last PARENT; |
1019
|
|
|
|
|
|
|
} |
1020
|
|
|
|
|
|
|
} |
1021
|
220
|
|
|
|
|
343
|
for (@stop) { |
1022
|
3270
|
100
|
|
|
|
4581
|
if ( $ptag eq $_ ) { |
1023
|
33
|
|
|
|
|
38
|
print $indent, |
1024
|
|
|
|
|
|
|
" (Hit a limiting $_ -- bailing out.)\n" |
1025
|
|
|
|
|
|
|
if DEBUG > 1; |
1026
|
33
|
|
|
|
|
98
|
return; # so it was all for naught |
1027
|
|
|
|
|
|
|
} |
1028
|
|
|
|
|
|
|
} |
1029
|
187
|
|
|
|
|
265
|
push @to_close, $p; |
1030
|
187
|
|
|
|
|
314
|
$p = $p->{'_parent'}; |
1031
|
|
|
|
|
|
|
} |
1032
|
129
|
100
|
|
|
|
412
|
return unless defined $p; # We went off the top of the tree. |
1033
|
|
|
|
|
|
|
# Otherwise specified element was found; set pos to its parent. |
1034
|
65
|
|
|
|
|
99
|
push @to_close, $p; |
1035
|
65
|
|
|
|
|
93
|
$self->{'_pos'} = $p->{'_parent'}; |
1036
|
|
|
|
|
|
|
} |
1037
|
|
|
|
|
|
|
else { |
1038
|
|
|
|
|
|
|
|
1039
|
|
|
|
|
|
|
# Close the first of the specified tag, giving up if you hit |
1040
|
|
|
|
|
|
|
# any of the stop-tags. |
1041
|
432
|
|
|
|
|
801
|
while ( defined $p ) { |
1042
|
746
|
|
|
|
|
1034
|
$ptag = $p->{'_tag'}; |
1043
|
746
|
|
|
|
|
834
|
print $indent, " (Looking at $ptag.)\n" if DEBUG > 2; |
1044
|
746
|
100
|
|
|
|
1216
|
if ( $ptag eq $tag ) { |
1045
|
389
|
|
|
|
|
402
|
print $indent, " (Closing $tag.)\n" if DEBUG > 2; |
1046
|
389
|
|
|
|
|
513
|
last; |
1047
|
|
|
|
|
|
|
} |
1048
|
357
|
|
|
|
|
555
|
for (@stop) { |
1049
|
35
|
100
|
|
|
|
68
|
if ( $ptag eq $_ ) { |
1050
|
23
|
|
|
|
|
27
|
print $indent, |
1051
|
|
|
|
|
|
|
" (Hit a limiting $_ -- bailing out.)\n" |
1052
|
|
|
|
|
|
|
if DEBUG > 1; |
1053
|
23
|
|
|
|
|
57
|
return; # so it was all for naught |
1054
|
|
|
|
|
|
|
} |
1055
|
|
|
|
|
|
|
} |
1056
|
334
|
|
|
|
|
472
|
push @to_close, $p; |
1057
|
334
|
|
|
|
|
566
|
$p = $p->{'_parent'}; |
1058
|
|
|
|
|
|
|
} |
1059
|
409
|
100
|
|
|
|
727
|
return unless defined $p; # We went off the top of the tree. |
1060
|
|
|
|
|
|
|
# Otherwise specified element was found; set pos to its parent. |
1061
|
389
|
|
|
|
|
513
|
push @to_close, $p; |
1062
|
389
|
|
|
|
|
570
|
$self->{'_pos'} = $p->{'_parent'}; |
1063
|
|
|
|
|
|
|
} |
1064
|
|
|
|
|
|
|
|
1065
|
462
|
100
|
100
|
|
|
1432
|
$self->{'_pos'} = undef if $self eq ( $self->{'_pos'} || '' ); |
1066
|
|
|
|
|
|
|
print $indent, "(Pos now points to ", |
1067
|
462
|
|
|
|
|
548
|
$self->{'_pos'} ? $self->{'_pos'}{'_tag'} : '???', ".)\n" |
1068
|
|
|
|
|
|
|
if DEBUG > 1; |
1069
|
|
|
|
|
|
|
|
1070
|
|
|
|
|
|
|
### EXPENSIVE, because has to check that it's not under a pre |
1071
|
|
|
|
|
|
|
### or a CDATA-parent. That's one more method call per end()! |
1072
|
|
|
|
|
|
|
### Might as well just do this at the end of the tree-parse, I guess, |
1073
|
|
|
|
|
|
|
### at which point we'd be parsing top-down, and just not traversing |
1074
|
|
|
|
|
|
|
### under pre's or CDATA-parents. |
1075
|
|
|
|
|
|
|
## |
1076
|
|
|
|
|
|
|
## Take this opportunity to nix any terminal whitespace nodes. |
1077
|
|
|
|
|
|
|
## TODO: consider whether this (plus the logic in start(), above) |
1078
|
|
|
|
|
|
|
## would ever leave any WS nodes in the tree. |
1079
|
|
|
|
|
|
|
## If not, then there's no reason to have eof() call |
1080
|
|
|
|
|
|
|
## delete_ignorable_whitespace on the tree, is there? |
1081
|
|
|
|
|
|
|
## |
1082
|
|
|
|
|
|
|
#if(@to_close and $self->{'_tighten'} and !$self->{'_ignore_text'} and |
1083
|
|
|
|
|
|
|
# ! $to_close[-1]->is_inside('pre', keys %HTML::Tagset::isCDATA_Parent) |
1084
|
|
|
|
|
|
|
#) { # if tightenable |
1085
|
|
|
|
|
|
|
# my($children, $e_tag); |
1086
|
|
|
|
|
|
|
# foreach my $e (reverse @to_close) { # going top-down |
1087
|
|
|
|
|
|
|
# last if 'pre' eq ($e_tag = $e->{'_tag'}) or |
1088
|
|
|
|
|
|
|
# $HTML::Tagset::isCDATA_Parent{$e_tag}; |
1089
|
|
|
|
|
|
|
# |
1090
|
|
|
|
|
|
|
# if( |
1091
|
|
|
|
|
|
|
# $children = $e->{'_content'} |
1092
|
|
|
|
|
|
|
# and @$children # has children |
1093
|
|
|
|
|
|
|
# and !ref($children->[-1]) |
1094
|
|
|
|
|
|
|
# and $children->[-1] =~ m<^\s+$>s # last node is all-WS |
1095
|
|
|
|
|
|
|
# and |
1096
|
|
|
|
|
|
|
# ( |
1097
|
|
|
|
|
|
|
# # has a tightable parent: |
1098
|
|
|
|
|
|
|
# $HTML::DOM::_TreeBuilder::canTighten{ $e_tag } |
1099
|
|
|
|
|
|
|
# or |
1100
|
|
|
|
|
|
|
# ( # has a tightenable left sibling: |
1101
|
|
|
|
|
|
|
# @$children > 1 and |
1102
|
|
|
|
|
|
|
# ref($children->[-2]) |
1103
|
|
|
|
|
|
|
# and $HTML::DOM::_TreeBuilder::canTighten{ $children->[-2]{'_tag'} } |
1104
|
|
|
|
|
|
|
# ) |
1105
|
|
|
|
|
|
|
# ) |
1106
|
|
|
|
|
|
|
# ) { |
1107
|
|
|
|
|
|
|
# pop @$children; |
1108
|
|
|
|
|
|
|
# #print $indent, "Popping a terminal WS node from ", $e->{'_tag'}, |
1109
|
|
|
|
|
|
|
# # " (", $e->address, ") while exiting.\n" if DEBUG; |
1110
|
|
|
|
|
|
|
# } |
1111
|
|
|
|
|
|
|
# } |
1112
|
|
|
|
|
|
|
#} |
1113
|
|
|
|
|
|
|
|
1114
|
462
|
|
|
|
|
668
|
foreach my $e (@to_close) { |
1115
|
|
|
|
|
|
|
|
1116
|
|
|
|
|
|
|
# Call the applicable callback, if any |
1117
|
694
|
|
|
|
|
881
|
$ptag = $e->{'_tag'}; |
1118
|
694
|
|
|
|
|
1747
|
&{ $self->{"_tweak_$ptag"} |
1119
|
694
|
50
|
66
|
|
|
3298
|
|| $self->{'_tweak_*'} |
1120
|
|
|
|
|
|
|
|| next }( map $_, $e, $ptag, $self ); |
1121
|
693
|
|
|
|
|
1016
|
print $indent, "Back from tweaking.\n" if DEBUG; |
1122
|
|
|
|
|
|
|
last |
1123
|
693
|
50
|
|
|
|
1308
|
if $self->{ '_stunted' |
1124
|
|
|
|
|
|
|
}; # in case one of the handlers called stunt |
1125
|
|
|
|
|
|
|
} |
1126
|
461
|
|
|
|
|
1204
|
return @to_close; |
1127
|
|
|
|
|
|
|
} |
1128
|
|
|
|
|
|
|
} |
1129
|
|
|
|
|
|
|
|
1130
|
|
|
|
|
|
|
#========================================================================== |
1131
|
|
|
|
|
|
|
{ |
1132
|
|
|
|
|
|
|
my ( $indent, $nugget ); |
1133
|
|
|
|
|
|
|
|
1134
|
|
|
|
|
|
|
sub text { |
1135
|
548
|
50
|
|
548
|
1
|
1086
|
return if $_[0]{'_stunted'}; |
1136
|
|
|
|
|
|
|
|
1137
|
|
|
|
|
|
|
# Accept a "here's a text token" signal from HTML::Parser. |
1138
|
548
|
|
|
|
|
1054
|
my ( $self, $text, $is_cdata ) = @_; |
1139
|
|
|
|
|
|
|
|
1140
|
|
|
|
|
|
|
# the >3.0 versions of Parser may pass a cdata node. |
1141
|
|
|
|
|
|
|
# Thanks to Gisle Aas for pointing this out. |
1142
|
|
|
|
|
|
|
|
1143
|
548
|
50
|
|
|
|
1070
|
return unless length $text; # I guess that's always right |
1144
|
|
|
|
|
|
|
|
1145
|
548
|
|
|
|
|
757
|
my $ignore_text = $self->{'_ignore_text'}; |
1146
|
548
|
|
|
|
|
671
|
my $no_space_compacting = $self->{'_no_space_compacting'}; |
1147
|
548
|
|
|
|
|
679
|
my $no_expand_entities = $self->{'_no_expand_entities'}; |
1148
|
548
|
|
66
|
|
|
1020
|
my $pos = $self->{'_pos'} || $self; |
1149
|
|
|
|
|
|
|
|
1150
|
|
|
|
|
|
|
HTML::Entities::decode($text) |
1151
|
|
|
|
|
|
|
unless $ignore_text |
1152
|
|
|
|
|
|
|
|| $is_cdata |
1153
|
548
|
100
|
66
|
|
|
3750
|
|| $HTML::Tagset::isCDATA_Parent{ $pos->{'_tag'} } |
|
|
|
100
|
|
|
|
|
|
|
|
66
|
|
|
|
|
1154
|
|
|
|
|
|
|
|| $no_expand_entities; |
1155
|
|
|
|
|
|
|
|
1156
|
|
|
|
|
|
|
#my($indent, $nugget); |
1157
|
548
|
|
|
|
|
658
|
if (DEBUG) { |
1158
|
|
|
|
|
|
|
|
1159
|
|
|
|
|
|
|
# optimization -- don't figure out depth unless we're in debug mode |
1160
|
|
|
|
|
|
|
my @lineage_tags = $pos->lineage_tag_names; |
1161
|
|
|
|
|
|
|
$indent = ' ' x ( 1 + @lineage_tags ); |
1162
|
|
|
|
|
|
|
|
1163
|
|
|
|
|
|
|
$nugget |
1164
|
|
|
|
|
|
|
= ( length($text) <= 25 ) |
1165
|
|
|
|
|
|
|
? $text |
1166
|
|
|
|
|
|
|
: ( substr( $text, 0, 25 ) . '...' ); |
1167
|
|
|
|
|
|
|
$nugget =~ s<([\x00-\x1F])> |
1168
|
|
|
|
|
|
|
<'\\x'.(unpack("H2",$1))>eg; |
1169
|
|
|
|
|
|
|
print $indent, "Proposing a new text node ($nugget) under ", |
1170
|
|
|
|
|
|
|
join( '/', reverse( $pos->{'_tag'}, @lineage_tags ) ) |
1171
|
|
|
|
|
|
|
|| 'Root', |
1172
|
|
|
|
|
|
|
".\n"; |
1173
|
|
|
|
|
|
|
|
1174
|
|
|
|
|
|
|
#} else { |
1175
|
|
|
|
|
|
|
# $indent = ' '; |
1176
|
|
|
|
|
|
|
} |
1177
|
548
|
|
|
|
|
612
|
|
1178
|
548
|
100
|
100
|
|
|
1960
|
my $ptag; |
1179
|
|
|
|
|
|
|
if ($HTML::Tagset::isCDATA_Parent{ $ptag = $pos->{'_tag'} } |
1180
|
|
|
|
|
|
|
|
1181
|
|
|
|
|
|
|
#or $pos->is_inside('pre') |
1182
|
|
|
|
|
|
|
or $pos->is_inside( 'pre', 'textarea' ) |
1183
|
|
|
|
|
|
|
) |
1184
|
15
|
50
|
|
|
|
36
|
{ |
1185
|
15
|
|
|
|
|
52
|
return if $ignore_text; |
1186
|
|
|
|
|
|
|
$pos->push_content($text); |
1187
|
|
|
|
|
|
|
} |
1188
|
|
|
|
|
|
|
else { |
1189
|
|
|
|
|
|
|
|
1190
|
|
|
|
|
|
|
# return unless $text =~ /\S/; # This is sometimes wrong |
1191
|
533
|
100
|
66
|
|
|
3065
|
|
|
|
100
|
66
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
1192
|
|
|
|
|
|
|
if ( !$self->{'_implicit_tags'} || $text !~ /[^\n\r\f\t ]/ ) { |
1193
|
|
|
|
|
|
|
|
1194
|
|
|
|
|
|
|
# don't change anything |
1195
|
|
|
|
|
|
|
} |
1196
|
2
|
50
|
|
|
|
7
|
elsif ( $ptag eq 'head' or $ptag eq 'noframes' ) { |
1197
|
0
|
|
|
|
|
0
|
if ( $self->{'_implicit_body_p_tag'} ) { |
1198
|
|
|
|
|
|
|
print $indent, |
1199
|
|
|
|
|
|
|
" * Text node under \U$ptag\E closes \U$ptag\E, implicates BODY and P.\n" |
1200
|
0
|
|
|
|
|
0
|
if DEBUG > 1; |
1201
|
|
|
|
|
|
|
$self->end( \$ptag ); |
1202
|
|
|
|
|
|
|
$pos = $self->{'_body'} |
1203
|
0
|
0
|
|
|
|
0
|
? ( $self->{'_pos'} |
1204
|
|
|
|
|
|
|
= $self->{'_body'} ) # expected case |
1205
|
0
|
|
|
|
|
0
|
: $self->insert_element( 'body', 1 ); |
1206
|
|
|
|
|
|
|
$pos = $self->insert_element( 'p', 1 ); |
1207
|
|
|
|
|
|
|
} |
1208
|
2
|
|
|
|
|
3
|
else { |
1209
|
|
|
|
|
|
|
print $indent, |
1210
|
|
|
|
|
|
|
" * Text node under \U$ptag\E closes, implicates BODY.\n" |
1211
|
2
|
|
|
|
|
9
|
if DEBUG > 1; |
1212
|
|
|
|
|
|
|
$self->end( \$ptag ); |
1213
|
|
|
|
|
|
|
$pos = $self->{'_body'} |
1214
|
2
|
50
|
|
|
|
9
|
? ( $self->{'_pos'} |
1215
|
|
|
|
|
|
|
= $self->{'_body'} ) # expected case |
1216
|
|
|
|
|
|
|
: $self->insert_element( 'body', 1 ); |
1217
|
|
|
|
|
|
|
} |
1218
|
|
|
|
|
|
|
} |
1219
|
4
|
50
|
|
|
|
13
|
elsif ( $ptag eq 'html' ) { |
1220
|
0
|
|
|
|
|
0
|
if ( $self->{'_implicit_body_p_tag'} ) { |
1221
|
|
|
|
|
|
|
print $indent, |
1222
|
|
|
|
|
|
|
" * Text node under HTML implicates BODY and P.\n" |
1223
|
|
|
|
|
|
|
if DEBUG > 1; |
1224
|
|
|
|
|
|
|
$pos = $self->{'_body'} |
1225
|
0
|
0
|
|
|
|
0
|
? ( $self->{'_pos'} |
1226
|
|
|
|
|
|
|
= $self->{'_body'} ) # expected case |
1227
|
0
|
|
|
|
|
0
|
: $self->insert_element( 'body', 1 ); |
1228
|
|
|
|
|
|
|
$pos = $self->insert_element( 'p', 1 ); |
1229
|
|
|
|
|
|
|
} |
1230
|
4
|
|
|
|
|
4
|
else { |
1231
|
|
|
|
|
|
|
print $indent, |
1232
|
|
|
|
|
|
|
" * Text node under HTML implicates BODY.\n" |
1233
|
|
|
|
|
|
|
if DEBUG > 1; |
1234
|
|
|
|
|
|
|
$pos = $self->{'_body'} |
1235
|
4
|
50
|
|
|
|
15
|
? ( $self->{'_pos'} |
1236
|
|
|
|
|
|
|
= $self->{'_body'} ) # expected case |
1237
|
|
|
|
|
|
|
: $self->insert_element( 'body', 1 ); |
1238
|
|
|
|
|
|
|
|
1239
|
|
|
|
|
|
|
#print "POS is $pos, ", $pos->{'_tag'}, "\n"; |
1240
|
|
|
|
|
|
|
} |
1241
|
|
|
|
|
|
|
} |
1242
|
4
|
50
|
|
|
|
14
|
elsif ( $ptag eq 'body' ) { |
1243
|
0
|
|
|
|
|
0
|
if ( $self->{'_implicit_body_p_tag'} ) { |
1244
|
|
|
|
|
|
|
print $indent, " * Text node under BODY implicates P.\n" |
1245
|
0
|
|
|
|
|
0
|
if DEBUG > 1; |
1246
|
|
|
|
|
|
|
$pos = $self->insert_element( 'p', 1 ); |
1247
|
|
|
|
|
|
|
} |
1248
|
|
|
|
|
|
|
} |
1249
|
0
|
|
|
|
|
0
|
elsif ( $ptag eq 'table' ) { |
1250
|
|
|
|
|
|
|
print $indent, |
1251
|
|
|
|
|
|
|
" * Text node under TABLE implicates TR and TD.\n" |
1252
|
0
|
|
|
|
|
0
|
if DEBUG > 1; |
1253
|
0
|
|
|
|
|
0
|
$self->insert_element( 'tr', 1 ); |
1254
|
|
|
|
|
|
|
$pos = $self->insert_element( 'td', 1 ); |
1255
|
|
|
|
|
|
|
|
1256
|
|
|
|
|
|
|
# double whammy! |
1257
|
|
|
|
|
|
|
} |
1258
|
0
|
|
|
|
|
0
|
elsif ( $ptag eq 'tr' ) { |
1259
|
|
|
|
|
|
|
print $indent, " * Text node under TR implicates TD.\n" |
1260
|
0
|
|
|
|
|
0
|
if DEBUG > 1; |
1261
|
|
|
|
|
|
|
$pos = $self->insert_element( 'td', 1 ); |
1262
|
|
|
|
|
|
|
} |
1263
|
|
|
|
|
|
|
|
1264
|
|
|
|
|
|
|
# elsif ( |
1265
|
|
|
|
|
|
|
# # $ptag eq 'li' || |
1266
|
|
|
|
|
|
|
# # $ptag eq 'dd' || |
1267
|
|
|
|
|
|
|
# $ptag eq 'form') { |
1268
|
|
|
|
|
|
|
# $pos = $self->insert_element('p', 1); |
1269
|
|
|
|
|
|
|
#} |
1270
|
|
|
|
|
|
|
|
1271
|
|
|
|
|
|
|
# Whatever we've done above should have had the side |
1272
|
|
|
|
|
|
|
# effect of updating $self->{'_pos'} |
1273
|
|
|
|
|
|
|
|
1274
|
|
|
|
|
|
|
#print "POS is now $pos, ", $pos->{'_tag'}, "\n"; |
1275
|
533
|
50
|
|
|
|
893
|
|
1276
|
533
|
50
|
|
|
|
880
|
return if $ignore_text; |
1277
|
|
|
|
|
|
|
$text =~ s/[\n\r\f\t ]+/ /g # canonical space |
1278
|
|
|
|
|
|
|
unless $no_space_compacting; |
1279
|
|
|
|
|
|
|
|
1280
|
|
|
|
|
|
|
print $indent, " (Attaching text node ($nugget) under ", |
1281
|
|
|
|
|
|
|
|
1282
|
533
|
|
|
|
|
583
|
# was: $self->{'_pos'} ? $self->{'_pos'}{'_tag'} : $self->{'_tag'}, |
1283
|
|
|
|
|
|
|
$pos->{'_tag'}, ").\n" |
1284
|
|
|
|
|
|
|
if DEBUG > 1; |
1285
|
533
|
|
|
|
|
1285
|
|
1286
|
|
|
|
|
|
|
$pos->push_content($text); |
1287
|
|
|
|
|
|
|
} |
1288
|
548
|
50
|
|
|
|
1787
|
|
1289
|
548
|
|
|
|
|
1012
|
&{ $self->{'_tweak_~text'} || return }( $text, $pos, |
1290
|
|
|
|
|
|
|
$pos->{'_tag'} . '' ); |
1291
|
|
|
|
|
|
|
|
1292
|
|
|
|
|
|
|
# Note that this is very exceptional -- it doesn't fall back to |
1293
|
548
|
|
|
|
|
3581
|
# _tweak_*, and it gives its tweak different arguments. |
1294
|
|
|
|
|
|
|
return; |
1295
|
|
|
|
|
|
|
} |
1296
|
|
|
|
|
|
|
} |
1297
|
|
|
|
|
|
|
|
1298
|
|
|
|
|
|
|
#========================================================================== |
1299
|
|
|
|
|
|
|
|
1300
|
|
|
|
|
|
|
# TODO: test whether comment(), declaration(), and process(), do the right |
1301
|
|
|
|
|
|
|
# thing as far as tightening and whatnot. |
1302
|
|
|
|
|
|
|
# Also, currently, doctypes and comments that appear before head or body |
1303
|
|
|
|
|
|
|
# show up in the tree in the wrong place. Something should be done about |
1304
|
|
|
|
|
|
|
# this. Tricky. Maybe this whole business of pre-making the body and |
1305
|
|
|
|
|
|
|
# whatnot is wrong. |
1306
|
|
|
|
|
|
|
|
1307
|
7
|
50
|
|
7
|
1
|
124
|
sub comment { |
1308
|
|
|
|
|
|
|
return if $_[0]{'_stunted'}; |
1309
|
|
|
|
|
|
|
|
1310
|
|
|
|
|
|
|
# Accept a "here's a comment" signal from HTML::Parser. |
1311
|
7
|
|
|
|
|
19
|
|
1312
|
7
|
|
66
|
|
|
26
|
my ( $self, $text ) = @_; |
1313
|
|
|
|
|
|
|
my $pos = $self->{'_pos'} || $self; |
1314
|
|
|
|
|
|
|
return |
1315
|
7
|
50
|
33
|
|
|
26
|
unless $self->{'_store_comments'} |
1316
|
|
|
|
|
|
|
|| $HTML::Tagset::isCDATA_Parent{ $pos->{'_tag'} }; |
1317
|
7
|
|
|
|
|
8
|
|
1318
|
|
|
|
|
|
|
if (DEBUG) { |
1319
|
|
|
|
|
|
|
my @lineage_tags = $pos->lineage_tag_names; |
1320
|
|
|
|
|
|
|
my $indent = ' ' x ( 1 + @lineage_tags ); |
1321
|
|
|
|
|
|
|
|
1322
|
|
|
|
|
|
|
my $nugget |
1323
|
|
|
|
|
|
|
= ( length($text) <= 25 ) |
1324
|
|
|
|
|
|
|
? $text |
1325
|
|
|
|
|
|
|
: ( substr( $text, 0, 25 ) . '...' ); |
1326
|
|
|
|
|
|
|
$nugget =~ s<([\x00-\x1F])> |
1327
|
|
|
|
|
|
|
<'\\x'.(unpack("H2",$1))>eg; |
1328
|
|
|
|
|
|
|
print $indent, "Proposing a Comment ($nugget) under ", |
1329
|
|
|
|
|
|
|
join( '/', reverse( $pos->{'_tag'}, @lineage_tags ) ) || 'Root', |
1330
|
|
|
|
|
|
|
".\n"; |
1331
|
7
|
|
|
|
|
23
|
} |
1332
|
7
|
|
|
|
|
52
|
|
1333
|
7
|
|
|
|
|
15
|
( my $e = $self->element_class->new('~comment') )->{'text'} = $text; |
1334
|
|
|
|
|
|
|
$pos->push_content($e); |
1335
|
7
|
|
|
|
|
30
|
++( $self->{'_element_count'} ); |
1336
|
7
|
50
|
33
|
|
|
59
|
|
1337
|
|
|
|
|
|
|
&{ $self->{'_tweak_~comment'} |
1338
|
|
|
|
|
|
|
|| $self->{'_tweak_*'} |
1339
|
7
|
|
|
|
|
19
|
|| return $e }( map $_, $e, '~comment', $self ); |
1340
|
|
|
|
|
|
|
|
1341
|
|
|
|
|
|
|
return $e; |
1342
|
|
|
|
|
|
|
} |
1343
|
0
|
0
|
|
0
|
1
|
0
|
|
1344
|
|
|
|
|
|
|
sub declaration { |
1345
|
|
|
|
|
|
|
return if $_[0]{'_stunted'}; |
1346
|
|
|
|
|
|
|
|
1347
|
0
|
|
|
|
|
0
|
# Accept a "here's a markup declaration" signal from HTML::Parser. |
1348
|
0
|
|
0
|
|
|
0
|
|
1349
|
|
|
|
|
|
|
my ( $self, $text ) = @_; |
1350
|
0
|
|
|
|
|
0
|
my $pos = $self->{'_pos'} || $self; |
1351
|
|
|
|
|
|
|
|
1352
|
|
|
|
|
|
|
if (DEBUG) { |
1353
|
|
|
|
|
|
|
my @lineage_tags = $pos->lineage_tag_names; |
1354
|
|
|
|
|
|
|
my $indent = ' ' x ( 1 + @lineage_tags ); |
1355
|
|
|
|
|
|
|
|
1356
|
|
|
|
|
|
|
my $nugget |
1357
|
|
|
|
|
|
|
= ( length($text) <= 25 ) |
1358
|
|
|
|
|
|
|
? $text |
1359
|
|
|
|
|
|
|
: ( substr( $text, 0, 25 ) . '...' ); |
1360
|
|
|
|
|
|
|
$nugget =~ s<([\x00-\x1F])> |
1361
|
|
|
|
|
|
|
<'\\x'.(unpack("H2",$1))>eg; |
1362
|
|
|
|
|
|
|
print $indent, "Proposing a Declaration ($nugget) under ", |
1363
|
0
|
|
|
|
|
0
|
join( '/', reverse( $pos->{'_tag'}, @lineage_tags ) ) || 'Root', |
1364
|
|
|
|
|
|
|
".\n"; |
1365
|
0
|
|
|
|
|
0
|
} |
1366
|
0
|
|
|
|
|
0
|
( my $e = $self->element_class->new('~declaration') )->{'text'} = $text; |
1367
|
|
|
|
|
|
|
|
1368
|
|
|
|
|
|
|
$self->{_decl} = $e; |
1369
|
|
|
|
|
|
|
return $e; |
1370
|
|
|
|
|
|
|
} |
1371
|
|
|
|
|
|
|
|
1372
|
0
|
0
|
|
0
|
1
|
0
|
#========================================================================== |
1373
|
|
|
|
|
|
|
|
1374
|
|
|
|
|
|
|
sub process { |
1375
|
|
|
|
|
|
|
return if $_[0]{'_stunted'}; |
1376
|
0
|
0
|
|
|
|
0
|
|
1377
|
0
|
|
|
|
|
0
|
# Accept a "here's a PI" signal from HTML::Parser. |
1378
|
0
|
|
0
|
|
|
0
|
|
1379
|
|
|
|
|
|
|
return unless $_[0]->{'_store_pis'}; |
1380
|
0
|
|
|
|
|
0
|
my ( $self, $text ) = @_; |
1381
|
|
|
|
|
|
|
my $pos = $self->{'_pos'} || $self; |
1382
|
|
|
|
|
|
|
|
1383
|
|
|
|
|
|
|
if (DEBUG) { |
1384
|
|
|
|
|
|
|
my @lineage_tags = $pos->lineage_tag_names; |
1385
|
|
|
|
|
|
|
my $indent = ' ' x ( 1 + @lineage_tags ); |
1386
|
|
|
|
|
|
|
|
1387
|
|
|
|
|
|
|
my $nugget |
1388
|
|
|
|
|
|
|
= ( length($text) <= 25 ) |
1389
|
|
|
|
|
|
|
? $text |
1390
|
|
|
|
|
|
|
: ( substr( $text, 0, 25 ) . '...' ); |
1391
|
|
|
|
|
|
|
$nugget =~ s<([\x00-\x1F])> |
1392
|
|
|
|
|
|
|
<'\\x'.(unpack("H2",$1))>eg; |
1393
|
0
|
|
|
|
|
0
|
print $indent, "Proposing a PI ($nugget) under ", |
1394
|
0
|
|
|
|
|
0
|
join( '/', reverse( $pos->{'_tag'}, @lineage_tags ) ) || 'Root', |
1395
|
0
|
|
|
|
|
0
|
".\n"; |
1396
|
|
|
|
|
|
|
} |
1397
|
0
|
0
|
0
|
|
|
0
|
( my $e = $self->element_class->new('~pi') )->{'text'} = $text; |
|
0
|
|
|
|
|
0
|
|
1398
|
|
|
|
|
|
|
$pos->push_content($e); |
1399
|
|
|
|
|
|
|
++( $self->{'_element_count'} ); |
1400
|
0
|
|
|
|
|
0
|
|
1401
|
|
|
|
|
|
|
&{ $self->{'_tweak_~pi'} || $self->{'_tweak_*'} || return $e }( map $_, |
1402
|
|
|
|
|
|
|
$e, '~pi', $self ); |
1403
|
|
|
|
|
|
|
|
1404
|
|
|
|
|
|
|
return $e; |
1405
|
|
|
|
|
|
|
} |
1406
|
|
|
|
|
|
|
|
1407
|
|
|
|
|
|
|
#========================================================================== |
1408
|
|
|
|
|
|
|
|
1409
|
|
|
|
|
|
|
#When you call $tree->parse_file($filename), and the |
1410
|
|
|
|
|
|
|
#tree's ignore_ignorable_whitespace attribute is on (as it is |
1411
|
|
|
|
|
|
|
#by default), HTML::DOM::_TreeBuilder's logic will manage to avoid |
1412
|
|
|
|
|
|
|
#creating some, but not all, nodes that represent ignorable |
1413
|
|
|
|
|
|
|
#whitespace. However, at the end of its parse, it traverses the |
1414
|
|
|
|
|
|
|
#tree and deletes any that it missed. (It does this with an |
1415
|
|
|
|
|
|
|
#around-method around HTML::Parser's eof method.) |
1416
|
|
|
|
|
|
|
# |
1417
|
|
|
|
|
|
|
#However, with $tree->parse($content), the cleanup-traversal step |
1418
|
|
|
|
|
|
|
#doesn't happen automatically -- so when you're done parsing all |
1419
|
|
|
|
|
|
|
#content for a document (regardless of whether $content is the only |
1420
|
|
|
|
|
|
|
#bit, or whether it's just another chunk of content you're parsing into |
1421
|
|
|
|
|
|
|
#the tree), call $tree->eof() to signal that you're at the end of the |
1422
|
|
|
|
|
|
|
#text you're inputting to the tree. Besides properly cleaning any bits |
1423
|
|
|
|
|
|
|
#of ignorable whitespace from the tree, this will also ensure that |
1424
|
|
|
|
|
|
|
#HTML::Parser's internal buffer is flushed. |
1425
|
|
|
|
|
|
|
|
1426
|
128
|
100
|
|
128
|
1
|
325
|
sub eof { |
1427
|
|
|
|
|
|
|
|
1428
|
125
|
50
|
|
|
|
297
|
# Accept an "end-of-file" signal from HTML::Parser, or thrown by the user. |
1429
|
|
|
|
|
|
|
|
1430
|
125
|
|
|
|
|
164
|
return if $_[0]->{'_done'}; # we've already been here |
1431
|
125
|
|
|
|
|
159
|
|
1432
|
125
|
|
|
|
|
166
|
return $_[0]->SUPER::eof() if $_[0]->{'_stunted'}; |
1433
|
125
|
50
|
|
|
|
249
|
|
1434
|
|
|
|
|
|
|
my $x = $_[0]; |
1435
|
|
|
|
|
|
|
print "EOF received.\n" if DEBUG; |
1436
|
|
|
|
|
|
|
my (@rv); |
1437
|
0
|
|
|
|
|
0
|
if (wantarray) { |
1438
|
|
|
|
|
|
|
|
1439
|
|
|
|
|
|
|
# I don't think this makes any difference for this particular |
1440
|
125
|
|
|
|
|
554
|
# method, but let's be scrupulous, for once. |
1441
|
|
|
|
|
|
|
@rv = $x->SUPER::eof(); |
1442
|
|
|
|
|
|
|
} |
1443
|
125
|
100
|
66
|
|
|
712
|
else { |
1444
|
|
|
|
|
|
|
$rv[0] = $x->SUPER::eof(); |
1445
|
|
|
|
|
|
|
} |
1446
|
|
|
|
|
|
|
|
1447
|
|
|
|
|
|
|
$x->end('html') unless $x eq ( $x->{'_pos'} || $x ); |
1448
|
|
|
|
|
|
|
|
1449
|
124
|
50
|
|
|
|
273
|
# That SHOULD close everything, and will run the appropriate tweaks. |
1450
|
|
|
|
|
|
|
# We /could/ be running under some insane mode such that there's more |
1451
|
|
|
|
|
|
|
# than one HTML element, but really, that's just insane to do anyhow. |
1452
|
|
|
|
|
|
|
|
1453
|
0
|
|
|
|
|
0
|
unless ( $x->{'_implicit_tags'} ) { |
1454
|
|
|
|
|
|
|
|
1455
|
|
|
|
|
|
|
# delete those silly implicit head and body in case we put |
1456
|
|
|
|
|
|
|
# them there in implicit tags mode |
1457
|
|
|
|
|
|
|
foreach my $node ( $x->{'_head'}, $x->{'_body'} ) { |
1458
|
0
|
0
|
0
|
|
|
0
|
$node->replace_with_content |
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
1459
|
|
|
|
|
|
|
if defined $node |
1460
|
|
|
|
|
|
|
and ref $node |
1461
|
|
|
|
|
|
|
and $node->{'_implicit'} |
1462
|
|
|
|
|
|
|
and $node->{'_parent'}; |
1463
|
|
|
|
|
|
|
|
1464
|
|
|
|
|
|
|
# I think they should be empty anyhow, since the only |
1465
|
|
|
|
|
|
|
# logic that'd insert under them can apply only, I think, |
1466
|
|
|
|
|
|
|
# in the case where _implicit_tags is on |
1467
|
|
|
|
|
|
|
} |
1468
|
|
|
|
|
|
|
|
1469
|
|
|
|
|
|
|
# this may still leave an implicit 'html' at the top, but there's |
1470
|
|
|
|
|
|
|
# nothing we can do about that, is there? |
1471
|
|
|
|
|
|
|
} |
1472
|
124
|
50
|
33
|
|
|
309
|
|
1473
|
124
|
|
|
|
|
203
|
$x->delete_ignorable_whitespace() |
1474
|
|
|
|
|
|
|
|
1475
|
124
|
50
|
|
|
|
227
|
# this's why we trap this -- an after-method |
1476
|
124
|
|
|
|
|
269
|
if $x->{'_tighten'} and !$x->{'_ignore_text'}; |
1477
|
|
|
|
|
|
|
$x->{'_done'} = 1; |
1478
|
|
|
|
|
|
|
|
1479
|
|
|
|
|
|
|
return @rv if wantarray; |
1480
|
|
|
|
|
|
|
return $rv[0]; |
1481
|
|
|
|
|
|
|
} |
1482
|
|
|
|
|
|
|
|
1483
|
|
|
|
|
|
|
#========================================================================== |
1484
|
0
|
|
|
0
|
0
|
0
|
|
1485
|
0
|
|
|
|
|
0
|
# TODO: document |
1486
|
0
|
|
|
|
|
0
|
|
1487
|
|
|
|
|
|
|
sub stunt { |
1488
|
0
|
0
|
|
|
|
0
|
my $self = $_[0]; |
1489
|
|
|
|
|
|
|
print "Stunting the tree.\n" if DEBUG; |
1490
|
|
|
|
|
|
|
$self->{'_done'} = 1; |
1491
|
0
|
|
|
|
|
0
|
|
1492
|
0
|
|
|
|
|
0
|
if ( $HTML::Parser::VERSION < 3 ) { |
1493
|
|
|
|
|
|
|
|
1494
|
|
|
|
|
|
|
#This is a MEAN MEAN HACK. And it works most of the time! |
1495
|
0
|
0
|
|
|
|
0
|
$self->{'_buf'} = ''; |
1496
|
0
|
|
|
|
|
0
|
my $fh = *HTML::Parser::F{IO}; |
1497
|
0
|
|
|
|
|
0
|
|
1498
|
|
|
|
|
|
|
# the local'd FH used by parse_file loop |
1499
|
|
|
|
|
|
|
if ( defined $fh ) { |
1500
|
|
|
|
|
|
|
print "Closing Parser's filehandle $fh\n" if DEBUG; |
1501
|
|
|
|
|
|
|
close($fh); |
1502
|
|
|
|
|
|
|
} |
1503
|
|
|
|
|
|
|
|
1504
|
|
|
|
|
|
|
# But if they called $tree->parse_file($filehandle) |
1505
|
|
|
|
|
|
|
# or $tree->parse_file(*IO), then there will be no *HTML::Parser::F{IO} |
1506
|
0
|
|
|
|
|
0
|
# to close. Ahwell. Not a problem for most users these days. |
1507
|
|
|
|
|
|
|
|
1508
|
|
|
|
|
|
|
} |
1509
|
|
|
|
|
|
|
else { |
1510
|
|
|
|
|
|
|
$self->SUPER::eof(); |
1511
|
|
|
|
|
|
|
|
1512
|
|
|
|
|
|
|
# Under 3+ versions, calling eof from inside a parse will abort the |
1513
|
|
|
|
|
|
|
# parse / parse_file |
1514
|
0
|
|
|
|
|
0
|
} |
1515
|
0
|
|
|
|
|
0
|
|
1516
|
|
|
|
|
|
|
# In the off chance that the above didn't work, we'll throw |
1517
|
|
|
|
|
|
|
# this flag to make any future events be no-ops. |
1518
|
|
|
|
|
|
|
$self->stunted(1); |
1519
|
0
|
|
|
0
|
0
|
0
|
return; |
1520
|
0
|
|
|
0
|
0
|
0
|
} |
1521
|
|
|
|
|
|
|
|
1522
|
|
|
|
|
|
|
# TODO: document |
1523
|
|
|
|
|
|
|
sub stunted { shift->_elem( '_stunted', @_ ); } |
1524
|
|
|
|
|
|
|
sub done { shift->_elem( '_done', @_ ); } |
1525
|
|
|
|
|
|
|
|
1526
|
|
|
|
|
|
|
#========================================================================== |
1527
|
|
|
|
|
|
|
|
1528
|
|
|
|
|
|
|
sub delete { |
1529
|
|
|
|
|
|
|
|
1530
|
|
|
|
|
|
|
# Override Element's delete method. |
1531
|
0
|
|
|
0
|
0
|
0
|
# This does most, if not all, of what Element's delete does anyway. |
1532
|
|
|
|
|
|
|
# Deletes content, including content in some special attributes. |
1533
|
0
|
|
|
|
|
0
|
# But doesn't empty out the hash. |
|
0
|
|
|
|
|
0
|
|
1534
|
0
|
|
|
|
|
0
|
|
1535
|
0
|
0
|
|
|
|
0
|
$_[0]->{'_element_count'} = 1; # never hurts to be scrupulously correct |
1536
|
|
|
|
|
|
|
|
1537
|
|
|
|
|
|
|
delete @{ $_[0] }{ '_body', '_head', '_pos' }; |
1538
|
|
|
|
|
|
|
for ( |
1539
|
|
|
|
|
|
|
@{ delete( $_[0]->{'_content'} ) || [] }, # all/any content |
1540
|
|
|
|
|
|
|
|
1541
|
|
|
|
|
|
|
# delete @{$_[0]}{'_body', '_head', '_pos'} |
1542
|
|
|
|
|
|
|
# ...and these, in case these elements don't appear in the |
1543
|
|
|
|
|
|
|
# content, which is possible. If they did appear (as they |
1544
|
|
|
|
|
|
|
# usually do), then calling $_->delete on them again is harmless. |
1545
|
|
|
|
|
|
|
# I don't think that's such a hot idea now. Thru creative reattachment, |
1546
|
|
|
|
|
|
|
# those could actually now point to elements in OTHER trees (which we do |
1547
|
|
|
|
|
|
|
# NOT want to delete!). |
1548
|
|
|
|
|
|
|
## Reasoned out: |
1549
|
|
|
|
|
|
|
# If these point to elements not in the content list of any element in this |
1550
|
|
|
|
|
|
|
# tree, but not in the content list of any element in any OTHER tree, then |
1551
|
|
|
|
|
|
|
# just deleting these will make their refcounts hit zero. |
1552
|
|
|
|
|
|
|
# If these point to elements in the content lists of elements in THIS tree, |
1553
|
|
|
|
|
|
|
# then we'll get to deleting them when we delete from the top. |
1554
|
0
|
0
|
0
|
|
|
0
|
# If these point to elements in the content lists of elements in SOME OTHER |
|
|
|
0
|
|
|
|
|
1555
|
|
|
|
|
|
|
# tree, then they're not to be deleted. |
1556
|
|
|
|
|
|
|
) |
1557
|
|
|
|
|
|
|
{ |
1558
|
|
|
|
|
|
|
$_->delete |
1559
|
0
|
0
|
0
|
|
|
0
|
if defined $_ and ref $_ # Make sure it's an object. |
1560
|
|
|
|
|
|
|
and $_ ne $_[0]; # And avoid hitting myself, just in case! |
1561
|
|
|
|
|
|
|
} |
1562
|
|
|
|
|
|
|
|
1563
|
0
|
|
|
|
|
0
|
$_[0]->detach if $_[0]->{'_parent'} and $_[0]->{'_parent'}{'_content'}; |
1564
|
|
|
|
|
|
|
|
1565
|
|
|
|
|
|
|
# An 'html' element having a parent is quite unlikely. |
1566
|
|
|
|
|
|
|
|
1567
|
0
|
|
|
0
|
0
|
0
|
return; |
1568
|
|
|
|
|
|
|
} |
1569
|
|
|
|
|
|
|
|
1570
|
|
|
|
|
|
|
sub tighten_up { # legacy |
1571
|
|
|
|
|
|
|
shift->delete_ignorable_whitespace(@_); |
1572
|
|
|
|
|
|
|
} |
1573
|
93
|
|
|
93
|
0
|
142
|
|
1574
|
93
|
|
|
|
|
236
|
sub elementify { |
1575
|
93
|
|
|
|
|
854
|
|
1576
|
|
|
|
|
|
|
# Rebless this object down into the normal element class. |
1577
|
93
|
|
|
|
|
335
|
my $self = $_[0]; |
1578
|
2679
|
100
|
66
|
|
|
17653
|
my $to_class = $self->element_class; |
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
1579
|
|
|
|
|
|
|
delete @{$self}{ |
1580
|
|
|
|
|
|
|
grep { |
1581
|
|
|
|
|
|
|
; |
1582
|
|
|
|
|
|
|
length $_ and substr( $_, 0, 1 ) eq '_' |
1583
|
|
|
|
|
|
|
|
1584
|
|
|
|
|
|
|
# The private attributes that we'll retain: |
1585
|
|
|
|
|
|
|
and $_ ne '_tag' |
1586
|
|
|
|
|
|
|
and $_ ne '_parent' |
1587
|
|
|
|
|
|
|
and $_ ne '_content' |
1588
|
|
|
|
|
|
|
and $_ ne '_implicit' |
1589
|
93
|
|
|
|
|
371
|
and $_ ne '_pos' |
1590
|
|
|
|
|
|
|
and $_ ne '_element_class' |
1591
|
|
|
|
|
|
|
} keys %$self |
1592
|
|
|
|
|
|
|
}; |
1593
|
0
|
0
|
|
0
|
0
|
0
|
bless $self, $to_class; # Returns the same object we were fed |
1594
|
0
|
|
0
|
|
|
0
|
} |
1595
|
|
|
|
|
|
|
|
1596
|
|
|
|
|
|
|
sub element_class { |
1597
|
|
|
|
|
|
|
return 'HTML::DOM::_Element' if not ref $_[0]; |
1598
|
|
|
|
|
|
|
return $_[0]->{_element_class} || 'HTML::DOM::_Element'; |
1599
|
|
|
|
|
|
|
} |
1600
|
31
|
|
|
31
|
0
|
46
|
|
1601
|
31
|
|
|
|
|
66
|
#-------------------------------------------------------------------------- |
1602
|
31
|
|
|
|
|
47
|
|
1603
|
31
|
|
|
|
|
39
|
sub guts { |
1604
|
31
|
|
|
|
|
73
|
my @out; |
1605
|
135
|
|
|
|
|
162
|
my @stack = ( $_[0] ); |
1606
|
135
|
50
|
|
|
|
242
|
my $destructive = $_[1]; |
|
|
100
|
|
|
|
|
|
1607
|
0
|
|
|
|
|
0
|
my $this; |
1608
|
|
|
|
|
|
|
while (@stack) { |
1609
|
|
|
|
|
|
|
$this = shift @stack; |
1610
|
38
|
|
|
|
|
56
|
if ( !ref $this ) { |
1611
|
38
|
50
|
|
|
|
89
|
push @out, $this; # yes, it can include text nodes |
1612
|
|
|
|
|
|
|
} |
1613
|
|
|
|
|
|
|
elsif ( !$this->{'_implicit'} ) { |
1614
|
|
|
|
|
|
|
push @out, $this; |
1615
|
|
|
|
|
|
|
delete $this->{'_parent'} if $destructive; |
1616
|
97
|
50
|
|
|
|
165
|
} |
1617
|
|
|
|
|
|
|
else { |
1618
|
|
|
|
|
|
|
|
1619
|
97
|
|
|
|
|
100
|
# it's an implicit node. Delete it and recurse |
1620
|
|
|
|
|
|
|
delete $this->{'_parent'} if $destructive; |
1621
|
97
|
50
|
|
|
|
334
|
unshift @stack, |
|
|
100
|
|
|
|
|
|
1622
|
|
|
|
|
|
|
@{ |
1623
|
|
|
|
|
|
|
( $destructive |
1624
|
|
|
|
|
|
|
? delete( $this->{'_content'} ) |
1625
|
|
|
|
|
|
|
: $this->{'_content'} |
1626
|
|
|
|
|
|
|
) |
1627
|
|
|
|
|
|
|
|| [] |
1628
|
|
|
|
|
|
|
}; |
1629
|
|
|
|
|
|
|
} |
1630
|
|
|
|
|
|
|
} |
1631
|
31
|
50
|
|
|
|
136
|
|
1632
|
0
|
0
|
|
|
|
|
# Doesn't call a real $root->delete on the (when implicit) root, |
1633
|
0
|
0
|
0
|
|
|
|
# but I don't think it needs to. |
1634
|
0
|
|
|
|
|
|
|
1635
|
0
|
|
|
|
|
|
return @out if wantarray; # one simple normal case. |
1636
|
0
|
|
|
|
|
|
return unless @out; |
1637
|
|
|
|
|
|
|
return $out[0] if @out == 1 and ref( $out[0] ); |
1638
|
|
|
|
|
|
|
my $x = HTML::DOM::_Element->new( 'div', '_implicit' => 1 ); |
1639
|
0
|
|
|
0
|
0
|
|
$x->push_content(@out); |
1640
|
|
|
|
|
|
|
return $x; |
1641
|
|
|
|
|
|
|
} |
1642
|
|
|
|
|
|
|
|
1643
|
|
|
|
|
|
|
sub disembowel { $_[0]->guts(1) } |
1644
|
|
|
|
|
|
|
|
1645
|
|
|
|
|
|
|
#-------------------------------------------------------------------------- |
1646
|
|
|
|
|
|
|
1; |
1647
|
|
|
|
|
|
|
|
1648
|
|
|
|
|
|
|
__END__ |