line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package HTML::TreeBuilder; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
# ABSTRACT: Parser that builds a HTML syntax tree |
4
|
|
|
|
|
|
|
|
5
|
20
|
|
|
20
|
|
263963
|
use 5.008; |
|
20
|
|
|
|
|
67
|
|
6
|
20
|
|
|
20
|
|
95
|
use warnings; |
|
20
|
|
|
|
|
39
|
|
|
20
|
|
|
|
|
489
|
|
7
|
20
|
|
|
20
|
|
92
|
use strict; |
|
20
|
|
|
|
|
47
|
|
|
20
|
|
|
|
|
360
|
|
8
|
20
|
|
|
20
|
|
7480
|
use integer; # vroom vroom! |
|
20
|
|
|
|
|
270
|
|
|
20
|
|
|
|
|
82
|
|
9
|
20
|
|
|
20
|
|
474
|
use Carp (); |
|
20
|
|
|
|
|
41
|
|
|
20
|
|
|
|
|
299
|
|
10
|
20
|
|
|
20
|
|
86
|
use Scalar::Util qw(openhandle); |
|
20
|
|
|
|
|
35
|
|
|
20
|
|
|
|
|
3106
|
|
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
our $VERSION = '5.910'; # TRIAL VERSION from OurPkgVersion |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
#--------------------------------------------------------------------------- |
15
|
|
|
|
|
|
|
# Make a 'DEBUG' constant... |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
our $DEBUG; # Must be set BEFORE loading this file |
18
|
|
|
|
|
|
|
BEGIN { |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
# We used to have things like |
21
|
|
|
|
|
|
|
# print $indent, "lalala" if $Debug; |
22
|
|
|
|
|
|
|
# But there were an awful lot of having to evaluate $Debug's value. |
23
|
|
|
|
|
|
|
# If we make that depend on a constant, like so: |
24
|
|
|
|
|
|
|
# sub DEBUG () { 1 } # or whatever value. |
25
|
|
|
|
|
|
|
# ... |
26
|
|
|
|
|
|
|
# print $indent, "lalala" if DEBUG; |
27
|
|
|
|
|
|
|
# Which at compile-time (thru the miracle of constant folding) turns into: |
28
|
|
|
|
|
|
|
# print $indent, "lalala"; |
29
|
|
|
|
|
|
|
# or, if DEBUG is a constant with a true value, then that print statement |
30
|
|
|
|
|
|
|
# is simply optimized away, and doesn't appear in the target code at all. |
31
|
|
|
|
|
|
|
# If you don't believe me, run: |
32
|
|
|
|
|
|
|
# perl -MO=Deparse,-uHTML::TreeBuilder -e 'BEGIN { \ |
33
|
|
|
|
|
|
|
# $HTML::TreeBuilder::DEBUG = 4} use HTML::TreeBuilder' |
34
|
|
|
|
|
|
|
# and see for yourself (substituting whatever value you want for $DEBUG |
35
|
|
|
|
|
|
|
# there). |
36
|
|
|
|
|
|
|
## no critic |
37
|
20
|
50
|
|
20
|
|
232
|
if ( defined &DEBUG ) { |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
# Already been defined! Do nothing. |
40
|
|
|
|
|
|
|
} |
41
|
|
|
|
|
|
|
elsif ( $] < 5.00404 ) { |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
# Grudgingly accommodate ancient (pre-constant) versions. |
44
|
0
|
|
|
|
|
0
|
eval 'sub DEBUG { $Debug } '; |
45
|
|
|
|
|
|
|
} |
46
|
|
|
|
|
|
|
elsif ( !$DEBUG ) { |
47
|
20
|
|
|
|
|
979
|
eval 'sub DEBUG () {0}'; # Make it a constant. |
48
|
|
|
|
|
|
|
} |
49
|
|
|
|
|
|
|
elsif ( $DEBUG =~ m<^\d+$>s ) { |
50
|
0
|
|
|
|
|
0
|
eval 'sub DEBUG () { ' . $DEBUG . ' }'; # Make THAT a constant. |
51
|
|
|
|
|
|
|
} |
52
|
|
|
|
|
|
|
else { # WTF? |
53
|
0
|
|
|
|
|
0
|
warn "Non-numeric value \"$DEBUG\" in \$HTML::Element::DEBUG"; |
54
|
0
|
|
|
|
|
0
|
eval 'sub DEBUG () { $DEBUG }'; # I guess. |
55
|
|
|
|
|
|
|
} |
56
|
|
|
|
|
|
|
## use critic |
57
|
|
|
|
|
|
|
} |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
#--------------------------------------------------------------------------- |
60
|
|
|
|
|
|
|
|
61
|
20
|
|
|
20
|
|
7624
|
use HTML::Entities (); |
|
20
|
|
|
|
|
97165
|
|
|
20
|
|
|
|
|
671
|
|
62
|
20
|
|
|
20
|
|
7579
|
use HTML::Tagset 3.02 (); |
|
20
|
|
|
|
|
20179
|
|
|
20
|
|
|
|
|
491
|
|
63
|
|
|
|
|
|
|
|
64
|
20
|
|
|
20
|
|
13364
|
use HTML::Element (); |
|
20
|
|
|
|
|
53
|
|
|
20
|
|
|
|
|
545
|
|
65
|
20
|
|
|
20
|
|
113
|
use HTML::Parser 3.46 (); |
|
20
|
|
|
|
|
393
|
|
|
20
|
|
|
|
|
23659
|
|
66
|
|
|
|
|
|
|
our @ISA = qw(HTML::Element HTML::Parser); |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
# This looks schizoid, I know. |
69
|
|
|
|
|
|
|
# It's not that we ARE an element AND a parser. |
70
|
|
|
|
|
|
|
# We ARE an element, but one that knows how to handle signals |
71
|
|
|
|
|
|
|
# (method calls) from Parser in order to elaborate its subtree. |
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
# Legacy aliases: |
74
|
|
|
|
|
|
|
*HTML::TreeBuilder::isKnown = \%HTML::Tagset::isKnown; |
75
|
|
|
|
|
|
|
*HTML::TreeBuilder::canTighten = \%HTML::Tagset::canTighten; |
76
|
|
|
|
|
|
|
*HTML::TreeBuilder::isHeadElement = \%HTML::Tagset::isHeadElement; |
77
|
|
|
|
|
|
|
*HTML::TreeBuilder::isBodyElement = \%HTML::Tagset::isBodyElement; |
78
|
|
|
|
|
|
|
*HTML::TreeBuilder::isPhraseMarkup = \%HTML::Tagset::isPhraseMarkup; |
79
|
|
|
|
|
|
|
*HTML::TreeBuilder::isHeadOrBodyElement = \%HTML::Tagset::isHeadOrBodyElement; |
80
|
|
|
|
|
|
|
*HTML::TreeBuilder::isList = \%HTML::Tagset::isList; |
81
|
|
|
|
|
|
|
*HTML::TreeBuilder::isTableElement = \%HTML::Tagset::isTableElement; |
82
|
|
|
|
|
|
|
*HTML::TreeBuilder::isFormElement = \%HTML::Tagset::isFormElement; |
83
|
|
|
|
|
|
|
*HTML::TreeBuilder::p_closure_barriers = \@HTML::Tagset::p_closure_barriers; |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
#========================================================================== |
86
|
|
|
|
|
|
|
# Two little shortcut constructors: |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
sub new_from_file { # or from a FH |
89
|
17
|
|
|
17
|
1
|
11852
|
my $class = shift; |
90
|
17
|
50
|
|
|
|
66
|
Carp::croak("new_from_file takes an odd number of arguments") |
91
|
|
|
|
|
|
|
unless @_ % 2; |
92
|
17
|
50
|
|
|
|
48
|
Carp::croak("new_from_file is a class method only") |
93
|
|
|
|
|
|
|
if ref $class; |
94
|
17
|
|
|
|
|
36
|
my $file = shift; |
95
|
17
|
|
|
|
|
56
|
my $new = $class->new(@_); |
96
|
17
|
100
|
|
|
|
48
|
defined $new->parse_file( $file ) |
97
|
|
|
|
|
|
|
or Carp::croak("unable to parse file: $!"); |
98
|
16
|
|
|
|
|
55
|
return $new; |
99
|
|
|
|
|
|
|
} |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
sub new_from_content { # from any number of scalars |
102
|
8
|
|
|
8
|
1
|
1664
|
my $class = shift; |
103
|
8
|
50
|
|
|
|
34
|
Carp::croak("new_from_content is a class method only") |
104
|
|
|
|
|
|
|
if ref $class; |
105
|
8
|
|
|
|
|
31
|
my $new = $class->new(); |
106
|
8
|
|
|
|
|
25
|
foreach my $whunk (@_) { |
107
|
9
|
100
|
|
|
|
40
|
if ( ref($whunk) eq 'SCALAR' ) { |
108
|
2
|
|
|
|
|
18
|
$new->parse($$whunk); |
109
|
|
|
|
|
|
|
} |
110
|
|
|
|
|
|
|
else { |
111
|
7
|
|
|
|
|
77
|
$new->parse($whunk); |
112
|
|
|
|
|
|
|
} |
113
|
9
|
50
|
|
|
|
37
|
last if $new->{'_stunted'}; # might as well check that. |
114
|
|
|
|
|
|
|
} |
115
|
8
|
|
|
|
|
30
|
$new->eof(); |
116
|
8
|
|
|
|
|
29
|
return $new; |
117
|
|
|
|
|
|
|
} |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
sub new_from_string { # from a single scalar (plus options) |
120
|
16
|
|
|
16
|
1
|
7050
|
my $class = shift; |
121
|
16
|
50
|
|
|
|
58
|
Carp::croak("new_from_string takes an odd number of arguments") |
122
|
|
|
|
|
|
|
unless @_ % 2; |
123
|
16
|
50
|
|
|
|
46
|
Carp::croak("new_from_string is a class method only") |
124
|
|
|
|
|
|
|
if ref $class; |
125
|
16
|
|
|
|
|
32
|
my $string = shift; |
126
|
16
|
|
|
|
|
52
|
my $new = $class->new(@_); |
127
|
16
|
|
|
|
|
49
|
$new->parse_content($string); |
128
|
16
|
|
|
|
|
39
|
return $new; |
129
|
|
|
|
|
|
|
} |
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
sub new_from_url { # should accept anything that LWP does. |
132
|
4
|
|
|
4
|
1
|
14
|
undef our $lwp_response; |
133
|
4
|
|
|
|
|
9
|
my $class = shift; |
134
|
4
|
50
|
|
|
|
19
|
Carp::croak("new_from_url takes an odd number of arguments") |
135
|
|
|
|
|
|
|
unless @_ % 2; |
136
|
4
|
50
|
|
|
|
13
|
Carp::croak("new_from_url is a class method only") |
137
|
|
|
|
|
|
|
if ref $class; |
138
|
4
|
|
|
|
|
9
|
my $url = shift; |
139
|
|
|
|
|
|
|
|
140
|
4
|
|
|
|
|
27
|
require LWP::UserAgent; |
141
|
|
|
|
|
|
|
# RECOMMEND PREREQ: LWP::UserAgent 5.815 |
142
|
4
|
|
|
|
|
62
|
LWP::UserAgent->VERSION( 5.815 ); # HTTP::Headers content_is_html method |
143
|
4
|
|
|
|
|
24
|
$lwp_response = LWP::UserAgent->new->get( $url ); |
144
|
|
|
|
|
|
|
|
145
|
4
|
100
|
|
|
|
32475
|
Carp::croak("GET failed on $url: " . $lwp_response->status_line) |
146
|
|
|
|
|
|
|
unless $lwp_response->is_success; |
147
|
3
|
100
|
|
|
|
37
|
Carp::croak("$url returned " . $lwp_response->content_type . " not HTML") |
148
|
|
|
|
|
|
|
unless $lwp_response->content_is_html; |
149
|
|
|
|
|
|
|
|
150
|
2
|
|
|
|
|
70
|
my $new = $class->new_from_http($lwp_response, @_); |
151
|
|
|
|
|
|
|
|
152
|
2
|
|
|
|
|
26
|
undef $lwp_response; # Processed successfully |
153
|
2
|
|
|
|
|
9
|
return $new; |
154
|
|
|
|
|
|
|
} |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
sub new_from_http { # from a HTTP::Message (or subclass) |
157
|
3
|
|
|
3
|
1
|
6
|
my $class = shift; |
158
|
3
|
50
|
|
|
|
11
|
Carp::croak("new_from_http takes an odd number of arguments") |
159
|
|
|
|
|
|
|
unless @_ % 2; |
160
|
3
|
50
|
|
|
|
9
|
Carp::croak("new_from_http is a class method only") |
161
|
|
|
|
|
|
|
if ref $class; |
162
|
3
|
|
|
|
|
7
|
my $message = shift; |
163
|
3
|
|
|
|
|
12
|
my $new = $class->new(@_); |
164
|
|
|
|
|
|
|
|
165
|
3
|
|
|
|
|
5
|
my $cref; |
166
|
|
|
|
|
|
|
|
167
|
3
|
|
|
|
|
8
|
my %opt = @_; |
168
|
3
|
50
|
|
|
|
10
|
if (defined $opt{encoding}) { |
169
|
|
|
|
|
|
|
# User-specified charset: |
170
|
0
|
|
0
|
|
|
0
|
my $charset = ($opt{encoding} || 'none'); |
171
|
0
|
|
|
|
|
0
|
$charset =~ s/:BOM\z//; |
172
|
0
|
|
|
|
|
0
|
$cref = $message->decoded_content(ref => 1, charset => $charset); |
173
|
|
|
|
|
|
|
} else { |
174
|
|
|
|
|
|
|
# Auto-detect charset: |
175
|
3
|
|
50
|
|
|
14
|
my $charset = $message->content_charset || 'cp1252'; |
176
|
3
|
|
|
|
|
2871
|
$cref = $message->decoded_content(ref => 1, charset => $charset); |
177
|
3
|
50
|
|
|
|
336
|
if ($charset eq 'none') { |
178
|
0
|
|
|
|
|
0
|
$charset = ''; |
179
|
|
|
|
|
|
|
} else { |
180
|
3
|
|
|
|
|
18
|
require Encode; |
181
|
3
|
50
|
|
|
|
11
|
if (my $encoding = Encode::find_encoding($charset)) { |
182
|
3
|
|
|
|
|
60
|
$charset = $encoding->name; # canonical name |
183
|
3
|
50
|
|
|
|
13
|
$charset .= ':BOM' if $$cref =~ /^\x{FeFF}/; |
184
|
|
|
|
|
|
|
} else { |
185
|
0
|
|
|
|
|
0
|
undef $charset; # Encode doesn't recognize it |
186
|
|
|
|
|
|
|
} |
187
|
|
|
|
|
|
|
} |
188
|
3
|
|
|
|
|
8
|
$new->{_encoding} = $charset; |
189
|
|
|
|
|
|
|
} # end else auto-detect charset |
190
|
|
|
|
|
|
|
|
191
|
3
|
|
|
|
|
24
|
$new->parse( $$cref ); |
192
|
3
|
|
|
|
|
13
|
$new->eof; |
193
|
3
|
|
|
|
|
11
|
return $new; |
194
|
|
|
|
|
|
|
} |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
# TODO: document more fully? |
197
|
|
|
|
|
|
|
sub parse_content { # from any number of scalars |
198
|
20
|
|
|
20
|
1
|
53
|
my $tree = shift; |
199
|
20
|
|
|
|
|
36
|
my $retval; |
200
|
20
|
|
|
|
|
46
|
foreach my $whunk (@_) { |
201
|
21
|
100
|
|
|
|
58
|
if ( ref($whunk) eq 'SCALAR' ) { |
202
|
3
|
|
|
|
|
24
|
$retval = $tree->parse($$whunk); |
203
|
|
|
|
|
|
|
} |
204
|
|
|
|
|
|
|
else { |
205
|
18
|
|
|
|
|
176
|
$retval = $tree->parse($whunk); |
206
|
|
|
|
|
|
|
} |
207
|
21
|
50
|
|
|
|
73
|
last if $tree->{'_stunted'}; # might as well check that. |
208
|
|
|
|
|
|
|
} |
209
|
20
|
|
|
|
|
69
|
$tree->eof(); |
210
|
20
|
|
|
|
|
46
|
return $retval; |
211
|
|
|
|
|
|
|
} |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
#--------------------------------------------------------------------- |
214
|
|
|
|
|
|
|
sub parse_file { |
215
|
20
|
|
|
20
|
1
|
55
|
my ($self, $file) = @_; |
216
|
|
|
|
|
|
|
|
217
|
20
|
50
|
|
|
|
51
|
Carp::croak("parse_file requires file argument") unless defined $file; |
218
|
|
|
|
|
|
|
|
219
|
20
|
|
|
|
|
58
|
my $fh = openhandle($file); |
220
|
20
|
100
|
|
|
|
52
|
unless (defined $fh) { |
221
|
17
|
|
|
|
|
35
|
my $encoding = $self->{_encoding}; |
222
|
|
|
|
|
|
|
|
223
|
17
|
100
|
|
|
|
47
|
if (not defined $encoding) { |
224
|
11
|
|
|
|
|
2151
|
require IO::HTML; |
225
|
|
|
|
|
|
|
|
226
|
11
|
|
|
|
|
28822
|
{ local $@; |
|
11
|
|
|
|
|
20
|
|
227
|
11
|
|
|
|
|
21
|
eval { |
228
|
11
|
|
|
|
|
41
|
($fh, $encoding, my $bom) = |
229
|
|
|
|
|
|
|
IO::HTML::file_and_encoding($file); |
230
|
10
|
100
|
|
|
|
5113
|
$encoding .= ':BOM' if $bom; |
231
|
|
|
|
|
|
|
}; |
232
|
|
|
|
|
|
|
} # end local $@ |
233
|
11
|
|
|
|
|
311
|
$self->{_encoding} = $encoding; |
234
|
|
|
|
|
|
|
} # end if auto encoding |
235
|
|
|
|
|
|
|
else { |
236
|
6
|
|
|
|
|
13
|
$encoding =~ s/:BOM$//; |
237
|
6
|
100
|
|
|
|
168
|
open($fh, (length($encoding) ? "<:encoding($encoding):crlf" |
|
|
50
|
|
|
|
|
|
238
|
|
|
|
|
|
|
: "<:raw"), $file) |
239
|
|
|
|
|
|
|
or undef $fh; |
240
|
|
|
|
|
|
|
} |
241
|
|
|
|
|
|
|
|
242
|
17
|
100
|
|
|
|
857
|
return undef unless defined $fh; |
243
|
|
|
|
|
|
|
} # end unless filehandle was passed in |
244
|
|
|
|
|
|
|
|
245
|
19
|
|
|
|
|
107
|
$self->SUPER::parse_file($fh); |
246
|
|
|
|
|
|
|
} |
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
#--------------------------------------------------------------------------- |
249
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
sub new { # constructor! |
251
|
379
|
|
|
379
|
1
|
517955
|
my $class = shift; |
252
|
379
|
|
33
|
|
|
12870
|
$class = ref($class) || $class; |
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
# Initialize HTML::Element part |
255
|
379
|
|
|
|
|
12157
|
my $self = $class->element_class->new('html'); |
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
{ |
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
# A hack for certain strange versions of Parser: |
260
|
379
|
|
|
|
|
11759
|
my $other_self = HTML::Parser->new(); |
|
379
|
|
|
|
|
12495
|
|
261
|
379
|
|
|
|
|
264707
|
%$self = ( %$self, %$other_self ); # copy fields |
262
|
|
|
|
|
|
|
# Yes, multiple inheritance is messy. Kids, don't try this at home. |
263
|
379
|
|
|
|
|
23807
|
bless $other_self, "HTML::TreeBuilder::_hideyhole"; |
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
# whack it out of the HTML::Parser class, to avoid the destructor |
266
|
|
|
|
|
|
|
} |
267
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
# The root of the tree is special, as it has these funny attributes, |
269
|
|
|
|
|
|
|
# and gets reblessed into this class. |
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
# Initialize parser settings |
272
|
379
|
|
|
|
|
11920
|
$self->{'_implicit_tags'} = 1; |
273
|
379
|
|
|
|
|
11727
|
$self->{'_implicit_body_p_tag'} = 0; |
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
# If true, trying to insert text, or any of %isPhraseMarkup right |
276
|
|
|
|
|
|
|
# under 'body' will implicate a 'p'. If false, will just go there. |
277
|
|
|
|
|
|
|
|
278
|
379
|
|
|
|
|
11821
|
$self->{'_tighten'} = 1; |
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
# whether ignorable WS in this tree should be deleted |
281
|
|
|
|
|
|
|
|
282
|
379
|
|
|
|
|
11985
|
$self->{'_implicit'} = 1; # to delete, once we find a real open-"html" tag |
283
|
|
|
|
|
|
|
|
284
|
379
|
|
|
|
|
11926
|
$self->{'_ignore_unknown'} = 1; |
285
|
379
|
|
|
|
|
11820
|
$self->{'_ignore_text'} = 0; |
286
|
379
|
|
|
|
|
11712
|
$self->{'_warn'} = 0; |
287
|
379
|
|
|
|
|
11681
|
$self->{'_no_space_compacting'} = 0; |
288
|
379
|
|
|
|
|
11693
|
$self->{'_self_closed_tags'} = 0; |
289
|
379
|
|
|
|
|
11627
|
$self->{'_store_comments'} = 0; |
290
|
379
|
|
|
|
|
11649
|
$self->{'_store_declarations'} = 1; |
291
|
379
|
|
|
|
|
11970
|
$self->{'_store_pis'} = 0; |
292
|
379
|
|
|
|
|
11718
|
$self->{'_p_strict'} = 0; |
293
|
379
|
|
|
|
|
11598
|
$HTML::Element::encoded_content = $self->{'_no_expand_entities'} = 0; |
294
|
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
# rebless to our class |
296
|
379
|
|
|
|
|
11706
|
bless $self, $class; |
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
# Parse attributes passed in as arguments |
299
|
379
|
100
|
|
|
|
12077
|
if (@_) { |
300
|
17
|
50
|
|
|
|
53
|
Carp::croak("new must be passed key => value pairs") if @_ % 2; |
301
|
|
|
|
|
|
|
|
302
|
17
|
|
|
|
|
55
|
my %attr = @_; |
303
|
17
|
|
|
|
|
80
|
my $allowed = $self->_is_attr; |
304
|
|
|
|
|
|
|
|
305
|
17
|
|
|
|
|
71
|
while (my ($attr, $value) = each %attr ) { |
306
|
28
|
50
|
|
|
|
77
|
if ($allowed->{$attr}) { |
307
|
28
|
|
|
|
|
496
|
$self->$attr($value); |
308
|
|
|
|
|
|
|
} else { |
309
|
0
|
|
|
|
|
0
|
Carp::carp("Ignoring unknown attribute $attr"); |
310
|
|
|
|
|
|
|
} |
311
|
|
|
|
|
|
|
} # end while each $attr |
312
|
|
|
|
|
|
|
} # end if attributes passed to new |
313
|
|
|
|
|
|
|
|
314
|
379
|
|
|
|
|
12032
|
$self->{'_element_count'} = 1; |
315
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
# undocumented, informal, and maybe not exactly correct |
317
|
|
|
|
|
|
|
|
318
|
379
|
|
|
|
|
12320
|
$self->{'_head'} = $self->insert_element( 'head', 1 ); |
319
|
379
|
|
|
|
|
11732
|
$self->{'_pos'} = undef; # pull it back up |
320
|
379
|
|
|
|
|
12008
|
$self->{'_body'} = $self->insert_element( 'body', 1 ); |
321
|
379
|
|
|
|
|
11711
|
$self->{'_pos'} = undef; # pull it back up again |
322
|
|
|
|
|
|
|
|
323
|
379
|
|
|
|
|
22980
|
return $self; |
324
|
|
|
|
|
|
|
} |
325
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
#========================================================================== |
327
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
sub _elem # universal accessor... |
329
|
|
|
|
|
|
|
{ |
330
|
68
|
|
|
68
|
|
5544
|
my ( $self, $elem, $val ) = @_; |
331
|
68
|
|
|
|
|
5564
|
my $old = $self->{$elem}; |
332
|
68
|
50
|
|
|
|
5619
|
$self->{$elem} = $val if defined $val; |
333
|
68
|
|
|
|
|
16347
|
return $old; |
334
|
|
|
|
|
|
|
} |
335
|
|
|
|
|
|
|
|
336
|
0
|
|
|
|
|
0
|
BEGIN { |
337
|
20
|
|
|
20
|
|
114
|
my @attributes = qw( |
338
|
|
|
|
|
|
|
implicit_tags |
339
|
|
|
|
|
|
|
implicit_body_p_tag |
340
|
|
|
|
|
|
|
p_strict |
341
|
|
|
|
|
|
|
no_space_compacting |
342
|
|
|
|
|
|
|
ignore_unknown |
343
|
|
|
|
|
|
|
ignore_text |
344
|
|
|
|
|
|
|
self_closed_tags |
345
|
|
|
|
|
|
|
store_comments |
346
|
|
|
|
|
|
|
store_declarations |
347
|
|
|
|
|
|
|
store_pis |
348
|
|
|
|
|
|
|
warn |
349
|
|
|
|
|
|
|
); |
350
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
# Create accessor methods: |
352
|
20
|
|
|
|
|
47
|
my $code = join('', map { "sub $_ { shift->_elem( '_$_', \@_ ); }\n" } |
|
220
|
|
|
|
|
623
|
|
353
|
|
|
|
|
|
|
@attributes); |
354
|
20
|
|
|
|
|
52
|
my $err; |
355
|
|
|
|
|
|
|
{ |
356
|
20
|
|
|
|
|
33
|
local $@; |
|
20
|
|
|
|
|
38
|
|
357
|
20
|
50
|
0
|
2
|
1
|
4202
|
$err = $@ || "UNKNOWN ERROR" unless eval "$code 1"; ## no critic |
|
2
|
|
|
2
|
1
|
38
|
|
|
2
|
|
|
36
|
1
|
8
|
|
|
36
|
|
|
11
|
1
|
60581
|
|
|
11
|
|
|
5
|
1
|
244
|
|
|
5
|
|
|
0
|
1
|
19
|
|
|
0
|
|
|
1
|
1
|
0
|
|
|
1
|
|
|
1
|
1
|
6
|
|
|
1
|
|
|
1
|
1
|
47
|
|
|
1
|
|
|
0
|
1
|
411
|
|
|
0
|
|
|
2
|
1
|
0
|
|
|
2
|
|
|
|
|
8
|
|
358
|
|
|
|
|
|
|
} |
359
|
20
|
50
|
|
|
|
68
|
die "$code$err" if $err; |
360
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
# Record names of class attributes: |
362
|
20
|
|
|
|
|
49
|
my %is_attr = map { $_ => 1 } (@attributes, qw( |
|
280
|
|
|
|
|
85297
|
|
363
|
|
|
|
|
|
|
encoding |
364
|
|
|
|
|
|
|
ignore_ignorable_whitespace |
365
|
|
|
|
|
|
|
no_expand_entities |
366
|
|
|
|
|
|
|
)); |
367
|
|
|
|
|
|
|
|
368
|
17
|
|
|
17
|
|
39
|
sub _is_attr { return \%is_attr } |
369
|
|
|
|
|
|
|
} |
370
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
# Custom accessors: |
372
|
|
|
|
|
|
|
sub ignore_ignorable_whitespace { |
373
|
5
|
|
|
5
|
1
|
13
|
shift->_elem( '_tighten', @_ ); # internal name is different |
374
|
|
|
|
|
|
|
} |
375
|
|
|
|
|
|
|
|
376
|
|
|
|
|
|
|
sub no_expand_entities { |
377
|
2
|
|
|
2
|
1
|
6
|
my $self = shift; |
378
|
2
|
|
|
|
|
8
|
my $return = $self->_elem( '_no_expand_entities', @_ ); |
379
|
2
|
|
|
|
|
5
|
$HTML::Element::encoded_content = $self->{_no_expand_entities}; |
380
|
2
|
|
|
|
|
9
|
$return; |
381
|
|
|
|
|
|
|
} |
382
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
#========================================================================== |
384
|
|
|
|
|
|
|
|
385
|
|
|
|
|
|
|
sub warning { |
386
|
3
|
|
|
3
|
1
|
6
|
my $self = shift; |
387
|
3
|
50
|
|
|
|
12
|
CORE::warn("HTML::Parse: $_[0]\n") if $self->{'_warn'}; |
388
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
# should maybe say HTML::TreeBuilder instead |
390
|
|
|
|
|
|
|
} |
391
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
#========================================================================== |
393
|
|
|
|
|
|
|
|
394
|
|
|
|
|
|
|
{ |
395
|
|
|
|
|
|
|
|
396
|
|
|
|
|
|
|
# To avoid having to rebuild these lists constantly... |
397
|
|
|
|
|
|
|
my $_Closed_by_structurals = [qw(p h1 h2 h3 h4 h5 h6 pre textarea)]; |
398
|
|
|
|
|
|
|
my $indent; |
399
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
sub start { |
401
|
1252
|
50
|
|
1252
|
1
|
16007
|
return if $_[0]{'_stunted'}; |
402
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
# Accept a signal from HTML::Parser for start-tags. |
404
|
1252
|
|
|
|
|
6879
|
my ( $self, $tag, $attr ) = @_; |
405
|
|
|
|
|
|
|
|
406
|
1252
|
|
100
|
|
|
7447
|
my $self_closed = ($self->{'_self_closed_tags'} and |
407
|
|
|
|
|
|
|
$_[4] =~ m!/[\n\r\f\t ]*>\z!); |
408
|
1252
|
100
|
|
|
|
7001
|
delete $attr->{'/'} if $self_closed; |
409
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
# Parser passes more, actually: |
411
|
|
|
|
|
|
|
# $self->start($tag, $attr, $attrseq, $origtext) |
412
|
|
|
|
|
|
|
# But we can merrily ignore $attrseq and $origtext. |
413
|
|
|
|
|
|
|
|
414
|
1252
|
50
|
|
|
|
7321
|
if ( $tag eq 'x-html' ) { |
415
|
0
|
|
|
|
|
0
|
print "Ignoring open-x-html tag.\n" if DEBUG; |
416
|
|
|
|
|
|
|
|
417
|
|
|
|
|
|
|
# inserted by some lame code-generators. |
418
|
0
|
|
|
|
|
0
|
return; # bypass tweaking. |
419
|
|
|
|
|
|
|
} |
420
|
|
|
|
|
|
|
|
421
|
1252
|
|
|
|
|
6559
|
$tag =~ s{/$}{}s; # So turns into . Silently forgive. |
422
|
|
|
|
|
|
|
|
423
|
1252
|
50
|
|
|
|
8661
|
unless ( $tag =~ m/^[-_a-zA-Z0-9:%]+$/s ) { |
424
|
0
|
|
|
|
|
0
|
DEBUG and print "Start-tag name $tag is no good. Skipping.\n"; |
425
|
0
|
|
|
|
|
0
|
return; |
426
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
# This avoids having Element's new() throw an exception. |
428
|
|
|
|
|
|
|
} |
429
|
|
|
|
|
|
|
|
430
|
1252
|
|
66
|
|
|
8260
|
my $ptag = ( my $pos = $self->{'_pos'} || $self )->{'_tag'}; |
431
|
1252
|
|
|
|
|
6004
|
my $already_inserted; |
432
|
|
|
|
|
|
|
|
433
|
|
|
|
|
|
|
#my($indent); |
434
|
1252
|
|
|
|
|
5918
|
if (DEBUG) { |
435
|
|
|
|
|
|
|
|
436
|
|
|
|
|
|
|
# optimization -- don't figure out indenting unless we're in debug mode |
437
|
|
|
|
|
|
|
my @lineage = $pos->lineage; |
438
|
|
|
|
|
|
|
$indent = ' ' x ( 1 + @lineage ); |
439
|
|
|
|
|
|
|
print $indent, "Proposing a new \U$tag\E under ", |
440
|
|
|
|
|
|
|
join( '/', map $_->{'_tag'}, reverse( $pos, @lineage ) ) |
441
|
|
|
|
|
|
|
|| 'Root', |
442
|
|
|
|
|
|
|
".\n"; |
443
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
#} else { |
445
|
|
|
|
|
|
|
# $indent = ' '; |
446
|
|
|
|
|
|
|
} |
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
#print $indent, "POS: $pos ($ptag)\n" if DEBUG > 2; |
449
|
|
|
|
|
|
|
# $attr = {%$attr}; |
450
|
|
|
|
|
|
|
|
451
|
1252
|
|
|
|
|
7708
|
foreach my $k ( keys %$attr ) { |
452
|
|
|
|
|
|
|
|
453
|
|
|
|
|
|
|
# Make sure some stooge doesn't have "". |
454
|
|
|
|
|
|
|
# That happens every few million Web pages. |
455
|
597
|
50
|
33
|
|
|
2815
|
$attr->{ ' ' . $k } = delete $attr->{$k} |
456
|
|
|
|
|
|
|
if length $k and substr( $k, 0, 1 ) eq '_'; |
457
|
|
|
|
|
|
|
|
458
|
|
|
|
|
|
|
# Looks bad, but is fine for round-tripping. |
459
|
|
|
|
|
|
|
} |
460
|
|
|
|
|
|
|
|
461
|
1252
|
|
|
|
|
6969
|
my $e = $self->element_class->new( $tag, %$attr ); |
462
|
|
|
|
|
|
|
|
463
|
|
|
|
|
|
|
# Make a new element object. |
464
|
|
|
|
|
|
|
# (Only rarely do we end up just throwing it away later in this call.) |
465
|
|
|
|
|
|
|
|
466
|
|
|
|
|
|
|
# Some prep -- custom messiness for those damned tables, and strict P's. |
467
|
1252
|
100
|
|
|
|
7805
|
if ( $self->{'_implicit_tags'} ) { # wallawallawalla! |
468
|
|
|
|
|
|
|
|
469
|
1214
|
100
|
|
|
|
7238
|
unless ( $HTML::TreeBuilder::isTableElement{$tag} ) { |
470
|
1036
|
50
|
|
|
|
7126
|
if ( $ptag eq 'table' ) { |
|
|
50
|
|
|
|
|
|
471
|
0
|
|
|
|
|
0
|
print $indent, |
472
|
|
|
|
|
|
|
" * Phrasal \U$tag\E right under TABLE makes implicit TR and TD\n" |
473
|
|
|
|
|
|
|
if DEBUG > 1; |
474
|
0
|
|
|
|
|
0
|
$self->insert_element( 'tr', 1 ); |
475
|
0
|
|
|
|
|
0
|
$pos = $self->insert_element( 'td', 1 ) |
476
|
|
|
|
|
|
|
; # yes, needs updating |
477
|
|
|
|
|
|
|
} |
478
|
|
|
|
|
|
|
elsif ( $ptag eq 'tr' ) { |
479
|
0
|
|
|
|
|
0
|
print $indent, |
480
|
|
|
|
|
|
|
" * Phrasal \U$tag\E right under TR makes an implicit TD\n" |
481
|
|
|
|
|
|
|
if DEBUG > 1; |
482
|
0
|
|
|
|
|
0
|
$pos = $self->insert_element( 'td', 1 ) |
483
|
|
|
|
|
|
|
; # yes, needs updating |
484
|
|
|
|
|
|
|
} |
485
|
1036
|
|
|
|
|
10209
|
$ptag = $pos->{'_tag'}; # yes, needs updating |
486
|
|
|
|
|
|
|
} |
487
|
|
|
|
|
|
|
|
488
|
|
|
|
|
|
|
# end of table-implication block. |
489
|
|
|
|
|
|
|
|
490
|
|
|
|
|
|
|
# Now maybe do a little dance to enforce P-strictness. |
491
|
|
|
|
|
|
|
# This seems like it should be integrated with the big |
492
|
|
|
|
|
|
|
# "ALL HOPE..." block, further below, but that doesn't |
493
|
|
|
|
|
|
|
# seem feasible. |
494
|
1214
|
0
|
33
|
|
|
11250
|
if ( $self->{'_p_strict'} |
|
|
|
0
|
|
|
|
|
495
|
|
|
|
|
|
|
and $HTML::TreeBuilder::isKnown{$tag} |
496
|
|
|
|
|
|
|
and not $HTML::Tagset::is_Possible_Strict_P_Content{$tag} ) |
497
|
|
|
|
|
|
|
{ |
498
|
0
|
|
|
|
|
0
|
my $here = $pos; |
499
|
0
|
|
|
|
|
0
|
my $here_tag = $ptag; |
500
|
0
|
|
|
|
|
0
|
while (1) { |
501
|
0
|
0
|
|
|
|
0
|
if ( $here_tag eq 'p' ) { |
502
|
0
|
|
|
|
|
0
|
print $indent, " * Inserting $tag closes strict P.\n" |
503
|
|
|
|
|
|
|
if DEBUG > 1; |
504
|
0
|
|
|
|
|
0
|
$self->end( \q{p} ); |
505
|
|
|
|
|
|
|
|
506
|
|
|
|
|
|
|
# NB: same as \'q', but less confusing to emacs cperl-mode |
507
|
0
|
|
|
|
|
0
|
last; |
508
|
|
|
|
|
|
|
} |
509
|
|
|
|
|
|
|
|
510
|
|
|
|
|
|
|
#print("Lasting from $here_tag\n"), |
511
|
|
|
|
|
|
|
last |
512
|
|
|
|
|
|
|
if $HTML::TreeBuilder::isKnown{$here_tag} |
513
|
|
|
|
|
|
|
and |
514
|
|
|
|
|
|
|
not $HTML::Tagset::is_Possible_Strict_P_Content{ |
515
|
0
|
0
|
0
|
|
|
0
|
$here_tag}; |
516
|
|
|
|
|
|
|
|
517
|
|
|
|
|
|
|
# Don't keep looking up the tree if we see something that can't |
518
|
|
|
|
|
|
|
# be strict-P content. |
519
|
|
|
|
|
|
|
|
520
|
|
|
|
|
|
|
$here_tag |
521
|
0
|
|
0
|
|
|
0
|
= ( $here = $here->{'_parent'} || last )->{'_tag'}; |
522
|
|
|
|
|
|
|
} # end while |
523
|
|
|
|
|
|
|
$ptag = ( $pos = $self->{'_pos'} || $self ) |
524
|
0
|
|
0
|
|
|
0
|
->{'_tag'}; # better update! |
525
|
|
|
|
|
|
|
} |
526
|
|
|
|
|
|
|
|
527
|
|
|
|
|
|
|
# end of strict-p block. |
528
|
|
|
|
|
|
|
} |
529
|
|
|
|
|
|
|
|
530
|
|
|
|
|
|
|
# And now, get busy... |
531
|
|
|
|
|
|
|
#---------------------------------------------------------------------- |
532
|
1252
|
100
|
|
|
|
9073
|
if ( !$self->{'_implicit_tags'} ) { # bimskalabim |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
533
|
|
|
|
|
|
|
# do nothing |
534
|
38
|
|
|
|
|
59
|
print $indent, " * _implicit_tags is off. doing nothing\n" |
535
|
|
|
|
|
|
|
if DEBUG > 1; |
536
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
#---------------------------------------------------------------------- |
538
|
|
|
|
|
|
|
} |
539
|
|
|
|
|
|
|
elsif ( $HTML::TreeBuilder::isHeadOrBodyElement{$tag} ) { |
540
|
2
|
100
|
|
|
|
10
|
if ( $pos->is_inside('body') ) { # all is well |
|
|
50
|
|
|
|
|
|
541
|
1
|
|
|
|
|
2
|
print $indent, |
542
|
|
|
|
|
|
|
" * ambilocal element \U$tag\E is fine under BODY.\n" |
543
|
|
|
|
|
|
|
if DEBUG > 1; |
544
|
|
|
|
|
|
|
} |
545
|
|
|
|
|
|
|
elsif ( $pos->is_inside('head') ) { |
546
|
1
|
|
|
|
|
2
|
print $indent, |
547
|
|
|
|
|
|
|
" * ambilocal element \U$tag\E is fine under HEAD.\n" |
548
|
|
|
|
|
|
|
if DEBUG > 1; |
549
|
|
|
|
|
|
|
} |
550
|
|
|
|
|
|
|
else { |
551
|
|
|
|
|
|
|
|
552
|
|
|
|
|
|
|
# In neither head nor body! mmmmm... put under head? |
553
|
|
|
|
|
|
|
|
554
|
0
|
0
|
|
|
|
0
|
if ( $ptag eq 'html' ) { # expected case |
555
|
|
|
|
|
|
|
# TODO?? : would there ever be a case where _head would be |
556
|
|
|
|
|
|
|
# absent from a tree that would ever be accessed at this |
557
|
|
|
|
|
|
|
# point? |
558
|
0
|
0
|
|
|
|
0
|
die "Where'd my head go?" unless ref $self->{'_head'}; |
559
|
0
|
0
|
|
|
|
0
|
if ( $self->{'_head'}{'_implicit'} ) { |
560
|
0
|
|
|
|
|
0
|
print $indent, |
561
|
|
|
|
|
|
|
" * ambilocal element \U$tag\E makes an implicit HEAD.\n" |
562
|
|
|
|
|
|
|
if DEBUG > 1; |
563
|
|
|
|
|
|
|
|
564
|
|
|
|
|
|
|
# or rather, points us at it. |
565
|
|
|
|
|
|
|
$self->{'_pos'} |
566
|
0
|
|
|
|
|
0
|
= $self->{'_head'}; # to insert under... |
567
|
|
|
|
|
|
|
} |
568
|
|
|
|
|
|
|
else { |
569
|
0
|
|
|
|
|
0
|
$self->warning( |
570
|
|
|
|
|
|
|
"Ambilocal element <$tag> not under HEAD or BODY!?" |
571
|
|
|
|
|
|
|
); |
572
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
# Put it under HEAD by default, I guess |
574
|
|
|
|
|
|
|
$self->{'_pos'} |
575
|
0
|
|
|
|
|
0
|
= $self->{'_head'}; # to insert under... |
576
|
|
|
|
|
|
|
} |
577
|
|
|
|
|
|
|
|
578
|
|
|
|
|
|
|
} |
579
|
|
|
|
|
|
|
else { |
580
|
|
|
|
|
|
|
|
581
|
|
|
|
|
|
|
# Neither under head nor body, nor right under html... pass thru? |
582
|
0
|
|
|
|
|
0
|
$self->warning( |
583
|
|
|
|
|
|
|
"Ambilocal element <$tag> neither under head nor body, nor right under html!?" |
584
|
|
|
|
|
|
|
); |
585
|
|
|
|
|
|
|
} |
586
|
|
|
|
|
|
|
} |
587
|
|
|
|
|
|
|
|
588
|
|
|
|
|
|
|
#---------------------------------------------------------------------- |
589
|
|
|
|
|
|
|
} |
590
|
|
|
|
|
|
|
elsif ( $HTML::TreeBuilder::isBodyElement{$tag} ) { |
591
|
|
|
|
|
|
|
|
592
|
|
|
|
|
|
|
# Ensure that we are within |
593
|
943
|
100
|
66
|
|
|
7840
|
if ( $ptag eq 'body' ) { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
594
|
|
|
|
|
|
|
|
595
|
|
|
|
|
|
|
# We're good. |
596
|
|
|
|
|
|
|
} |
597
|
|
|
|
|
|
|
elsif ( |
598
|
|
|
|
|
|
|
$HTML::TreeBuilder::isBodyElement{$ptag} # glarg |
599
|
|
|
|
|
|
|
and not $HTML::TreeBuilder::isHeadOrBodyElement{$ptag} |
600
|
|
|
|
|
|
|
) |
601
|
|
|
|
|
|
|
{ |
602
|
|
|
|
|
|
|
|
603
|
|
|
|
|
|
|
# Special case: Save ourselves a call to is_inside further down. |
604
|
|
|
|
|
|
|
# If our $ptag is an isBodyElement element (but not an |
605
|
|
|
|
|
|
|
# isHeadOrBodyElement element), then we must be under body! |
606
|
258
|
|
|
|
|
385
|
print $indent, " * Inferring that $ptag is under BODY.\n", |
607
|
|
|
|
|
|
|
if DEBUG > 3; |
608
|
|
|
|
|
|
|
|
609
|
|
|
|
|
|
|
# I think this and the test for 'body' trap everything |
610
|
|
|
|
|
|
|
# bodyworthy, except the case where the parent element is |
611
|
|
|
|
|
|
|
# under an unknown element that's a descendant of body. |
612
|
|
|
|
|
|
|
} |
613
|
|
|
|
|
|
|
elsif ( $pos->is_inside('head') ) { |
614
|
169
|
|
|
|
|
349
|
print $indent, |
615
|
|
|
|
|
|
|
" * body-element \U$tag\E minimizes HEAD, makes implicit BODY.\n" |
616
|
|
|
|
|
|
|
if DEBUG > 1; |
617
|
|
|
|
|
|
|
$ptag = ( |
618
|
|
|
|
|
|
|
$pos = $self->{'_pos'} |
619
|
|
|
|
|
|
|
= $self->{'_body'} # yes, needs updating |
620
|
|
|
|
|
|
|
|| die "Where'd my body go?" |
621
|
169
|
|
50
|
|
|
593
|
)->{'_tag'}; # yes, needs updating |
622
|
|
|
|
|
|
|
} |
623
|
|
|
|
|
|
|
elsif ( !$pos->is_inside('body') ) { |
624
|
104
|
|
|
|
|
4336
|
print $indent, |
625
|
|
|
|
|
|
|
" * body-element \U$tag\E makes implicit BODY.\n" |
626
|
|
|
|
|
|
|
if DEBUG > 1; |
627
|
|
|
|
|
|
|
$ptag = ( |
628
|
|
|
|
|
|
|
$pos = $self->{'_pos'} |
629
|
|
|
|
|
|
|
= $self->{'_body'} # yes, needs updating |
630
|
|
|
|
|
|
|
|| die "Where'd my body go?" |
631
|
104
|
|
50
|
|
|
8681
|
)->{'_tag'}; # yes, needs updating |
632
|
|
|
|
|
|
|
} |
633
|
|
|
|
|
|
|
|
634
|
|
|
|
|
|
|
# else we ARE under body, so okay. |
635
|
|
|
|
|
|
|
|
636
|
|
|
|
|
|
|
# Handle implicit endings and insert based on and position |
637
|
|
|
|
|
|
|
# ... ALL HOPE ABANDON ALL YE WHO ENTER HERE ... |
638
|
943
|
100
|
100
|
|
|
20273
|
if ( $tag eq 'p' |
|
|
100
|
100
|
|
|
|
|
|
|
100
|
100
|
|
|
|
|
|
|
50
|
66
|
|
|
|
|
|
|
100
|
66
|
|
|
|
|
|
|
100
|
33
|
|
|
|
|
|
|
100
|
66
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
|
33
|
|
|
|
|
639
|
|
|
|
|
|
|
or $tag eq 'h1' |
640
|
|
|
|
|
|
|
or $tag eq 'h2' |
641
|
|
|
|
|
|
|
or $tag eq 'h3' |
642
|
|
|
|
|
|
|
or $tag eq 'h4' |
643
|
|
|
|
|
|
|
or $tag eq 'h5' |
644
|
|
|
|
|
|
|
or $tag eq 'h6' |
645
|
|
|
|
|
|
|
or $tag eq 'form' |
646
|
|
|
|
|
|
|
|
647
|
|
|
|
|
|
|
# Hm, should |
648
|
|
|
|
|
|
|
) |
649
|
|
|
|
|
|
|
{ |
650
|
|
|
|
|
|
|
|
651
|
|
|
|
|
|
|
# Can't have , or |
652
|
307
|
|
|
|
|
914
|
$self->end( |
653
|
|
|
|
|
|
|
$_Closed_by_structurals, |
654
|
|
|
|
|
|
|
@HTML::TreeBuilder::p_closure_barriers |
655
|
|
|
|
|
|
|
|
656
|
|
|
|
|
|
|
# used to be just li! |
657
|
|
|
|
|
|
|
); |
658
|
|
|
|
|
|
|
|
659
|
|
|
|
|
|
|
} |
660
|
|
|
|
|
|
|
elsif ( $tag eq 'ol' or $tag eq 'ul' or $tag eq 'dl' ) { |
661
|
|
|
|
|
|
|
|
662
|
|
|
|
|
|
|
# Can't have lists inside -- in the unlikely |
663
|
|
|
|
|
|
|
# event anyone tries to put them there! |
664
|
9
|
50
|
33
|
|
|
114
|
if ( $ptag eq 'h1' |
|
|
|
33
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
|
33
|
|
|
|
|
665
|
|
|
|
|
|
|
or $ptag eq 'h2' |
666
|
|
|
|
|
|
|
or $ptag eq 'h3' |
667
|
|
|
|
|
|
|
or $ptag eq 'h4' |
668
|
|
|
|
|
|
|
or $ptag eq 'h5' |
669
|
|
|
|
|
|
|
or $ptag eq 'h6' ) |
670
|
|
|
|
|
|
|
{ |
671
|
0
|
|
|
|
|
0
|
$self->end( \$ptag ); |
672
|
|
|
|
|
|
|
} |
673
|
|
|
|
|
|
|
|
674
|
|
|
|
|
|
|
# TODO: Maybe keep closing up the tree until |
675
|
|
|
|
|
|
|
# the ptag isn't any of the above? |
676
|
|
|
|
|
|
|
# But anyone that says
|
677
|
|
|
|
|
|
|
# deserves what they get anyway. |
678
|
|
|
|
|
|
|
|
679
|
|
|
|
|
|
|
} |
680
|
|
|
|
|
|
|
elsif ( $tag eq 'li' ) { # list item |
681
|
|
|
|
|
|
|
# Get under a list tag, one way or another |
682
|
12
|
50
|
66
|
|
|
47
|
unless ( |
683
|
|
|
|
|
|
|
exists $HTML::TreeBuilder::isList{$ptag} |
684
|
|
|
|
|
|
|
or $self->end( \q{*}, keys %HTML::TreeBuilder::isList ) #' |
685
|
|
|
|
|
|
|
) |
686
|
|
|
|
|
|
|
{ |
687
|
0
|
|
|
|
|
0
|
print $indent, |
688
|
|
|
|
|
|
|
" * inserting implicit UL for lack of containing ", |
689
|
|
|
|
|
|
|
join( '|', keys %HTML::TreeBuilder::isList ), ".\n" |
690
|
|
|
|
|
|
|
if DEBUG > 1; |
691
|
0
|
|
|
|
|
0
|
$self->insert_element( 'ul', 1 ); |
692
|
|
|
|
|
|
|
} |
693
|
|
|
|
|
|
|
|
694
|
|
|
|
|
|
|
} |
695
|
|
|
|
|
|
|
elsif ( $tag eq 'dt' or $tag eq 'dd' ) { |
696
|
|
|
|
|
|
|
|
697
|
|
|
|
|
|
|
# Get under a DL, one way or another |
698
|
0
|
0
|
0
|
|
|
0
|
unless ( $ptag eq 'dl' or $self->end( \q{*}, 'dl' ) ) { #' |
699
|
0
|
|
|
|
|
0
|
print $indent, |
700
|
|
|
|
|
|
|
" * inserting implicit DL for lack of containing DL.\n" |
701
|
|
|
|
|
|
|
if DEBUG > 1; |
702
|
0
|
|
|
|
|
0
|
$self->insert_element( 'dl', 1 ); |
703
|
|
|
|
|
|
|
} |
704
|
|
|
|
|
|
|
|
705
|
|
|
|
|
|
|
} |
706
|
|
|
|
|
|
|
elsif ( $HTML::TreeBuilder::isFormElement{$tag} ) { |
707
|
1
|
50
|
33
|
|
|
5
|
if ($self->{ |
708
|
|
|
|
|
|
|
'_ignore_formies_outside_form'} # TODO: document this |
709
|
|
|
|
|
|
|
and not $pos->is_inside('form') |
710
|
|
|
|
|
|
|
) |
711
|
|
|
|
|
|
|
{ |
712
|
0
|
|
|
|
|
0
|
print $indent, |
713
|
|
|
|
|
|
|
" * ignoring \U$tag\E because not in a FORM.\n" |
714
|
|
|
|
|
|
|
if DEBUG > 1; |
715
|
0
|
|
|
|
|
0
|
return; # bypass tweaking. |
716
|
|
|
|
|
|
|
} |
717
|
1
|
50
|
|
|
|
4
|
if ( $tag eq 'option' ) { |
718
|
|
|
|
|
|
|
|
719
|
|
|
|
|
|
|
# return unless $ptag eq 'select'; |
720
|
0
|
|
|
|
|
0
|
$self->end( \q{option} ); |
721
|
0
|
|
0
|
|
|
0
|
$ptag = ( $self->{'_pos'} || $self )->{'_tag'}; |
722
|
0
|
0
|
0
|
|
|
0
|
unless ( $ptag eq 'select' or $ptag eq 'optgroup' ) { |
723
|
0
|
|
|
|
|
0
|
print $indent, |
724
|
|
|
|
|
|
|
" * \U$tag\E makes an implicit SELECT.\n" |
725
|
|
|
|
|
|
|
if DEBUG > 1; |
726
|
0
|
|
|
|
|
0
|
$pos = $self->insert_element( 'select', 1 ); |
727
|
|
|
|
|
|
|
|
728
|
|
|
|
|
|
|
# but not a very useful select -- has no 'name' attribute! |
729
|
|
|
|
|
|
|
# is $pos's value used after this? |
730
|
|
|
|
|
|
|
} |
731
|
|
|
|
|
|
|
} |
732
|
|
|
|
|
|
|
} |
733
|
|
|
|
|
|
|
elsif ( $HTML::TreeBuilder::isTableElement{$tag} ) { |
734
|
178
|
100
|
|
|
|
473
|
if ( !$pos->is_inside('table') ) { |
735
|
3
|
|
|
|
|
6
|
print $indent, " * \U$tag\E makes an implicit TABLE\n" |
736
|
|
|
|
|
|
|
if DEBUG > 1; |
737
|
3
|
|
|
|
|
9
|
$self->insert_element( 'table', 1 ); |
738
|
|
|
|
|
|
|
} |
739
|
|
|
|
|
|
|
|
740
|
178
|
100
|
100
|
|
|
606
|
if ( $tag eq 'td' or $tag eq 'th' ) { |
741
|
|
|
|
|
|
|
|
742
|
|
|
|
|
|
|
# Get under a tr one way or another |
743
|
111
|
100
|
100
|
|
|
330
|
unless ( |
744
|
|
|
|
|
|
|
$ptag eq 'tr' # either under a tr |
745
|
|
|
|
|
|
|
or $self->end( \q{*}, 'tr', |
746
|
|
|
|
|
|
|
'table' ) #or we can get under one |
747
|
|
|
|
|
|
|
) |
748
|
|
|
|
|
|
|
{ |
749
|
2
|
|
|
|
|
5
|
print $indent, |
750
|
|
|
|
|
|
|
" * \U$tag\E under \U$ptag\E makes an implicit TR\n" |
751
|
|
|
|
|
|
|
if DEBUG > 1; |
752
|
2
|
|
|
|
|
7
|
$self->insert_element( 'tr', 1 ); |
753
|
|
|
|
|
|
|
|
754
|
|
|
|
|
|
|
# presumably pos's value isn't used after this. |
755
|
|
|
|
|
|
|
} |
756
|
|
|
|
|
|
|
} |
757
|
|
|
|
|
|
|
else { |
758
|
67
|
|
|
|
|
185
|
$self->end( \$tag, 'table' ); #' |
759
|
|
|
|
|
|
|
} |
760
|
|
|
|
|
|
|
|
761
|
|
|
|
|
|
|
# Hmm, I guess this is right. To work it out: |
762
|
|
|
|
|
|
|
# tr closes any open tr (limited at a table) |
763
|
|
|
|
|
|
|
# thead closes any open thead (limited at a table) |
764
|
|
|
|
|
|
|
# tbody closes any open tbody (limited at a table) |
765
|
|
|
|
|
|
|
# tfoot closes any open tfoot (limited at a table) |
766
|
|
|
|
|
|
|
# colgroup closes any open colgroup (limited at a table) |
767
|
|
|
|
|
|
|
# col can try, but will always fail, at the enclosing table, |
768
|
|
|
|
|
|
|
# as col is empty, and therefore never open! |
769
|
|
|
|
|
|
|
# But! |
770
|
|
|
|
|
|
|
# td closes any open td OR th (limited at a table) |
771
|
|
|
|
|
|
|
# th closes any open th OR td (limited at a table) |
772
|
|
|
|
|
|
|
# ...implementable as "close to a tr, or make a tr" |
773
|
|
|
|
|
|
|
|
774
|
|
|
|
|
|
|
} |
775
|
|
|
|
|
|
|
elsif ( $HTML::TreeBuilder::isPhraseMarkup{$tag} ) { |
776
|
387
|
100
|
66
|
|
|
7335
|
if ( $ptag eq 'body' and $self->{'_implicit_body_p_tag'} ) { |
777
|
9
|
|
|
|
|
1369
|
print |
778
|
|
|
|
|
|
|
" * Phrasal \U$tag\E right under BODY makes an implicit P\n" |
779
|
|
|
|
|
|
|
if DEBUG > 1; |
780
|
9
|
|
|
|
|
1371
|
$pos = $self->insert_element( 'p', 1 ); |
781
|
|
|
|
|
|
|
|
782
|
|
|
|
|
|
|
# is $pos's value used after this? |
783
|
|
|
|
|
|
|
} |
784
|
|
|
|
|
|
|
} |
785
|
|
|
|
|
|
|
|
786
|
|
|
|
|
|
|
# End of implicit endings logic |
787
|
|
|
|
|
|
|
|
788
|
|
|
|
|
|
|
# End of "elsif ($HTML::TreeBuilder::isBodyElement{$tag}" |
789
|
|
|
|
|
|
|
#---------------------------------------------------------------------- |
790
|
|
|
|
|
|
|
|
791
|
|
|
|
|
|
|
} |
792
|
|
|
|
|
|
|
elsif ( $HTML::TreeBuilder::isHeadElement{$tag} ) { |
793
|
204
|
100
|
|
|
|
585
|
if ( $pos->is_inside('body') ) { |
|
|
100
|
|
|
|
|
|
794
|
3
|
|
|
|
|
6
|
print $indent, " * head element \U$tag\E found inside BODY!\n" |
795
|
|
|
|
|
|
|
if DEBUG; |
796
|
3
|
|
|
|
|
16
|
$self->warning("Header element <$tag> in body"); # [sic] |
797
|
|
|
|
|
|
|
} |
798
|
|
|
|
|
|
|
elsif ( !$pos->is_inside('head') ) { |
799
|
167
|
|
|
|
|
300
|
print $indent, |
800
|
|
|
|
|
|
|
" * head element \U$tag\E makes an implicit HEAD.\n" |
801
|
|
|
|
|
|
|
if DEBUG > 1; |
802
|
|
|
|
|
|
|
} |
803
|
|
|
|
|
|
|
else { |
804
|
34
|
|
|
|
|
62
|
print $indent, |
805
|
|
|
|
|
|
|
" * head element \U$tag\E goes inside existing HEAD.\n" |
806
|
|
|
|
|
|
|
if DEBUG > 1; |
807
|
|
|
|
|
|
|
} |
808
|
204
|
|
50
|
|
|
648
|
$self->{'_pos'} = $self->{'_head'} || die "Where'd my head go?"; |
809
|
|
|
|
|
|
|
|
810
|
|
|
|
|
|
|
#---------------------------------------------------------------------- |
811
|
|
|
|
|
|
|
} |
812
|
|
|
|
|
|
|
elsif ( $tag eq 'html' ) { |
813
|
21
|
50
|
|
|
|
61
|
if ( delete $self->{'_implicit'} ) { # first time here |
814
|
21
|
|
|
|
|
33
|
print $indent, " * good! found the real HTML element!\n" |
815
|
|
|
|
|
|
|
if DEBUG > 1; |
816
|
|
|
|
|
|
|
} |
817
|
|
|
|
|
|
|
else { |
818
|
0
|
|
|
|
|
0
|
print $indent, " * Found a second HTML element\n" |
819
|
|
|
|
|
|
|
if DEBUG; |
820
|
0
|
|
|
|
|
0
|
$self->warning("Found a nested element"); |
821
|
|
|
|
|
|
|
} |
822
|
|
|
|
|
|
|
|
823
|
|
|
|
|
|
|
# in either case, migrate attributes to the real element |
824
|
21
|
|
|
|
|
56
|
for ( keys %$attr ) { |
825
|
6
|
|
|
|
|
27
|
$self->attr( $_, $attr->{$_} ); |
826
|
|
|
|
|
|
|
} |
827
|
21
|
|
|
|
|
39
|
$self->{'_pos'} = undef; |
828
|
21
|
|
|
|
|
160
|
return $self; # bypass tweaking. |
829
|
|
|
|
|
|
|
|
830
|
|
|
|
|
|
|
#---------------------------------------------------------------------- |
831
|
|
|
|
|
|
|
} |
832
|
|
|
|
|
|
|
elsif ( $tag eq 'head' ) { |
833
|
22
|
|
50
|
|
|
67
|
my $head = $self->{'_head'} || die "Where'd my head go?"; |
834
|
22
|
50
|
|
|
|
65
|
if ( delete $head->{'_implicit'} ) { # first time here |
835
|
22
|
|
|
|
|
38
|
print $indent, " * good! found the real HEAD element!\n" |
836
|
|
|
|
|
|
|
if DEBUG > 1; |
837
|
|
|
|
|
|
|
} |
838
|
|
|
|
|
|
|
else { # been here before |
839
|
0
|
|
|
|
|
0
|
print $indent, " * Found a second HEAD element\n" |
840
|
|
|
|
|
|
|
if DEBUG; |
841
|
0
|
|
|
|
|
0
|
$self->warning("Found a second element"); |
842
|
|
|
|
|
|
|
} |
843
|
|
|
|
|
|
|
|
844
|
|
|
|
|
|
|
# in either case, migrate attributes to the real element |
845
|
22
|
|
|
|
|
60
|
for ( keys %$attr ) { |
846
|
0
|
|
|
|
|
0
|
$head->attr( $_, $attr->{$_} ); |
847
|
|
|
|
|
|
|
} |
848
|
22
|
|
|
|
|
138
|
return $self->{'_pos'} = $head; # bypass tweaking. |
849
|
|
|
|
|
|
|
|
850
|
|
|
|
|
|
|
#---------------------------------------------------------------------- |
851
|
|
|
|
|
|
|
} |
852
|
|
|
|
|
|
|
elsif ( $tag eq 'body' ) { |
853
|
22
|
|
50
|
|
|
77
|
my $body = $self->{'_body'} || die "Where'd my body go?"; |
854
|
22
|
50
|
|
|
|
66
|
if ( delete $body->{'_implicit'} ) { # first time here |
855
|
22
|
|
|
|
|
37
|
print $indent, " * good! found the real BODY element!\n" |
856
|
|
|
|
|
|
|
if DEBUG > 1; |
857
|
|
|
|
|
|
|
} |
858
|
|
|
|
|
|
|
else { # been here before |
859
|
0
|
|
|
|
|
0
|
print $indent, " * Found a second BODY element\n" |
860
|
|
|
|
|
|
|
if DEBUG; |
861
|
0
|
|
|
|
|
0
|
$self->warning("Found a second element"); |
862
|
|
|
|
|
|
|
} |
863
|
|
|
|
|
|
|
|
864
|
|
|
|
|
|
|
# in either case, migrate attributes to the real element |
865
|
22
|
|
|
|
|
62
|
for ( keys %$attr ) { |
866
|
0
|
|
|
|
|
0
|
$body->attr( $_, $attr->{$_} ); |
867
|
|
|
|
|
|
|
} |
868
|
22
|
|
|
|
|
164
|
return $self->{'_pos'} = $body; # bypass tweaking. |
869
|
|
|
|
|
|
|
|
870
|
|
|
|
|
|
|
#---------------------------------------------------------------------- |
871
|
|
|
|
|
|
|
} |
872
|
|
|
|
|
|
|
elsif ( $tag eq 'frameset' ) { |
873
|
0
|
0
|
0
|
|
|
0
|
if (!( $self->{'_frameset_seen'}++ ) # first frameset seen |
|
|
|
0
|
|
|
|
|
874
|
|
|
|
|
|
|
and !$self->{'_noframes_seen'} |
875
|
|
|
|
|
|
|
|
876
|
|
|
|
|
|
|
# otherwise it'll be under the noframes already |
877
|
|
|
|
|
|
|
and !$self->is_inside('body') |
878
|
|
|
|
|
|
|
) |
879
|
|
|
|
|
|
|
{ |
880
|
|
|
|
|
|
|
|
881
|
|
|
|
|
|
|
# The following is a bit of a hack. We don't use the normal |
882
|
|
|
|
|
|
|
# insert_element because 1) we don't want it as _pos, but instead |
883
|
|
|
|
|
|
|
# right under $self, and 2), more importantly, that we don't want |
884
|
|
|
|
|
|
|
# this inserted at the /end/ of $self's content_list, but instead |
885
|
|
|
|
|
|
|
# in the middle of it, specifically right before the body element. |
886
|
|
|
|
|
|
|
# |
887
|
0
|
|
0
|
|
|
0
|
my $c = $self->{'_content'} || die "Contentless root?"; |
888
|
0
|
|
0
|
|
|
0
|
my $body = $self->{'_body'} || die "Where'd my BODY go?"; |
889
|
0
|
|
|
|
|
0
|
for ( my $i = 0; $i < @$c; ++$i ) { |
890
|
0
|
0
|
|
|
|
0
|
if ( $c->[$i] eq $body ) { |
891
|
0
|
|
|
|
|
0
|
splice( @$c, $i, 0, $self->{'_pos'} = $pos = $e ); |
892
|
0
|
|
|
|
|
0
|
HTML::Element::_weaken($e->{'_parent'} = $self); |
893
|
0
|
|
|
|
|
0
|
$already_inserted = 1; |
894
|
0
|
|
|
|
|
0
|
print $indent, |
895
|
|
|
|
|
|
|
" * inserting 'frameset' right before BODY.\n" |
896
|
|
|
|
|
|
|
if DEBUG > 1; |
897
|
0
|
|
|
|
|
0
|
last; |
898
|
|
|
|
|
|
|
} |
899
|
|
|
|
|
|
|
} |
900
|
0
|
0
|
|
|
|
0
|
die "BODY not found in children of root?" |
901
|
|
|
|
|
|
|
unless $already_inserted; |
902
|
|
|
|
|
|
|
} |
903
|
|
|
|
|
|
|
|
904
|
|
|
|
|
|
|
} |
905
|
|
|
|
|
|
|
elsif ( $tag eq 'frame' ) { |
906
|
|
|
|
|
|
|
|
907
|
|
|
|
|
|
|
# Okay, fine, pass thru. |
908
|
|
|
|
|
|
|
# Should probably enforce that these should be under a frameset. |
909
|
|
|
|
|
|
|
# But hey. Ditto for enforcing that 'noframes' should be under |
910
|
|
|
|
|
|
|
# a 'frameset', as the DTDs say. |
911
|
|
|
|
|
|
|
|
912
|
|
|
|
|
|
|
} |
913
|
|
|
|
|
|
|
elsif ( $tag eq 'noframes' ) { |
914
|
|
|
|
|
|
|
|
915
|
|
|
|
|
|
|
# This basically assumes there'll be exactly one 'noframes' element |
916
|
|
|
|
|
|
|
# per document. At least, only the first one gets to have the |
917
|
|
|
|
|
|
|
# body under it. And if there are no noframes elements, then |
918
|
|
|
|
|
|
|
# the body pretty much stays where it is. Is that ever a problem? |
919
|
0
|
0
|
|
|
|
0
|
if ( $self->{'_noframes_seen'}++ ) { |
920
|
0
|
|
|
|
|
0
|
print $indent, " * ANOTHER noframes element?\n" if DEBUG; |
921
|
|
|
|
|
|
|
} |
922
|
|
|
|
|
|
|
else { |
923
|
0
|
0
|
|
|
|
0
|
if ( $pos->is_inside('body') ) { |
924
|
0
|
|
|
|
|
0
|
print $indent, " * 'noframes' inside 'body'. Odd!\n" |
925
|
|
|
|
|
|
|
if DEBUG; |
926
|
|
|
|
|
|
|
|
927
|
|
|
|
|
|
|
# In that odd case, we /can't/ make body a child of 'noframes', |
928
|
|
|
|
|
|
|
# because it's an ancestor of the 'noframes'! |
929
|
|
|
|
|
|
|
} |
930
|
|
|
|
|
|
|
else { |
931
|
0
|
|
0
|
|
|
0
|
$e->push_content( $self->{'_body'} |
932
|
|
|
|
|
|
|
|| die "Where'd my body go?" ); |
933
|
0
|
|
|
|
|
0
|
print $indent, " * Moving body to be under noframes.\n" |
934
|
|
|
|
|
|
|
if DEBUG; |
935
|
|
|
|
|
|
|
} |
936
|
|
|
|
|
|
|
} |
937
|
|
|
|
|
|
|
|
938
|
|
|
|
|
|
|
#---------------------------------------------------------------------- |
939
|
|
|
|
|
|
|
} |
940
|
|
|
|
|
|
|
else { |
941
|
|
|
|
|
|
|
|
942
|
|
|
|
|
|
|
# unknown tag |
943
|
0
|
0
|
|
|
|
0
|
if ( $self->{'_ignore_unknown'} ) { |
944
|
0
|
|
|
|
|
0
|
print $indent, " * Ignoring unknown tag \U$tag\E\n" if DEBUG; |
945
|
0
|
|
|
|
|
0
|
$self->warning("Skipping unknown tag $tag"); |
946
|
0
|
|
|
|
|
0
|
return; |
947
|
|
|
|
|
|
|
} |
948
|
|
|
|
|
|
|
else { |
949
|
0
|
|
|
|
|
0
|
print $indent, " * Accepting unknown tag \U$tag\E\n" |
950
|
|
|
|
|
|
|
if DEBUG; |
951
|
|
|
|
|
|
|
} |
952
|
|
|
|
|
|
|
} |
953
|
|
|
|
|
|
|
|
954
|
|
|
|
|
|
|
#---------------------------------------------------------------------- |
955
|
|
|
|
|
|
|
# End of mumbo-jumbo |
956
|
|
|
|
|
|
|
|
957
|
|
|
|
|
|
|
print $indent, "(Attaching ", $e->{'_tag'}, " under ", |
958
|
1187
|
|
|
|
|
5905
|
( $self->{'_pos'} || $self )->{'_tag'}, ")\n" |
959
|
|
|
|
|
|
|
|
960
|
|
|
|
|
|
|
# because if _pos isn't defined, it goes under self |
961
|
|
|
|
|
|
|
if DEBUG; |
962
|
|
|
|
|
|
|
|
963
|
|
|
|
|
|
|
# The following if-clause is to delete /some/ ignorable whitespace |
964
|
|
|
|
|
|
|
# nodes, as we're making the tree. |
965
|
|
|
|
|
|
|
# This'd be a node we'd catch later anyway, but we might as well |
966
|
|
|
|
|
|
|
# nip it in the bud now. |
967
|
|
|
|
|
|
|
# This doesn't catch /all/ deletable WS-nodes, so we do have to call |
968
|
|
|
|
|
|
|
# the tightener later to catch the rest. |
969
|
|
|
|
|
|
|
|
970
|
1187
|
100
|
66
|
|
|
9127
|
if ( $self->{'_tighten'} and !$self->{'_ignore_text'} ) |
971
|
|
|
|
|
|
|
{ # if tightenable |
972
|
1178
|
|
|
|
|
6092
|
my ( $sibs, $par ); |
973
|
1178
|
100
|
66
|
|
|
18215
|
if (( $sibs = ( $par = $self->{'_pos'} || $self )->{'_content'} ) |
|
|
|
66
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
|
66
|
|
|
|
|
974
|
|
|
|
|
|
|
and @$sibs # parent already has content |
975
|
|
|
|
|
|
|
and ! |
976
|
|
|
|
|
|
|
ref( $sibs->[-1] ) # and the last one there is a text node |
977
|
|
|
|
|
|
|
and $sibs->[-1] !~ m<[^\n\r\f\t ]>s # and it's all whitespace |
978
|
|
|
|
|
|
|
|
979
|
|
|
|
|
|
|
and ( # one of these has to be eligible... |
980
|
|
|
|
|
|
|
$HTML::TreeBuilder::canTighten{$tag} |
981
|
|
|
|
|
|
|
or (( @$sibs == 1 ) |
982
|
|
|
|
|
|
|
? # WS is leftmost -- so parent matters |
983
|
|
|
|
|
|
|
$HTML::TreeBuilder::canTighten{ $par->{'_tag'} } |
984
|
|
|
|
|
|
|
: # WS is after another node -- it matters |
985
|
|
|
|
|
|
|
( ref $sibs->[-2] |
986
|
|
|
|
|
|
|
and |
987
|
|
|
|
|
|
|
$HTML::TreeBuilder::canTighten{ $sibs->[-2] |
988
|
|
|
|
|
|
|
{'_tag'} } |
989
|
|
|
|
|
|
|
) |
990
|
|
|
|
|
|
|
) |
991
|
|
|
|
|
|
|
) |
992
|
|
|
|
|
|
|
|
993
|
|
|
|
|
|
|
and !$par->is_inside( 'pre', 'xmp', 'textarea', 'plaintext' ) |
994
|
|
|
|
|
|
|
|
995
|
|
|
|
|
|
|
# we're clear |
996
|
|
|
|
|
|
|
) |
997
|
|
|
|
|
|
|
{ |
998
|
131
|
|
|
|
|
224
|
pop @$sibs; |
999
|
131
|
|
|
|
|
216
|
print $indent, "Popping a preceding all-WS node\n" if DEBUG; |
1000
|
|
|
|
|
|
|
} |
1001
|
|
|
|
|
|
|
} |
1002
|
|
|
|
|
|
|
|
1003
|
1187
|
50
|
|
|
|
6967
|
unless ($already_inserted) { |
1004
|
1187
|
100
|
|
|
|
6564
|
if ($self_closed) { $self->pos->push_content($e) } |
|
8
|
|
|
|
|
24
|
|
1005
|
1179
|
|
|
|
|
7228
|
else { $self->insert_element($e) } |
1006
|
|
|
|
|
|
|
} |
1007
|
|
|
|
|
|
|
|
1008
|
1187
|
|
|
|
|
6183
|
if (DEBUG) { |
1009
|
|
|
|
|
|
|
if ( $self->{'_pos'} ) { |
1010
|
|
|
|
|
|
|
print $indent, "(Current lineage of pos: \U$tag\E under ", |
1011
|
|
|
|
|
|
|
join( |
1012
|
|
|
|
|
|
|
'/', |
1013
|
|
|
|
|
|
|
reverse( |
1014
|
|
|
|
|
|
|
|
1015
|
|
|
|
|
|
|
# $self->{'_pos'}{'_tag'}, # don't list myself! |
1016
|
|
|
|
|
|
|
$self->{'_pos'}->lineage_tag_names |
1017
|
|
|
|
|
|
|
) |
1018
|
|
|
|
|
|
|
), |
1019
|
|
|
|
|
|
|
".)\n"; |
1020
|
|
|
|
|
|
|
} |
1021
|
|
|
|
|
|
|
else { |
1022
|
|
|
|
|
|
|
print $indent, "(Pos points nowhere!?)\n"; |
1023
|
|
|
|
|
|
|
} |
1024
|
|
|
|
|
|
|
} |
1025
|
|
|
|
|
|
|
|
1026
|
1187
|
100
|
50
|
|
|
8452
|
unless ( ( $self->{'_pos'} || '' ) eq $e ) { |
1027
|
|
|
|
|
|
|
|
1028
|
|
|
|
|
|
|
# if it's an empty element -- i.e., if it didn't change the _pos |
1029
|
51
|
|
|
|
|
1548
|
&{ $self->{"_tweak_$tag"} |
1030
|
51
|
50
|
33
|
|
|
3219
|
|| $self->{'_tweak_*'} |
1031
|
|
|
|
|
|
|
|| return $e }( map $_, $e, $tag, $self ) |
1032
|
|
|
|
|
|
|
; # make a list so the user can't clobber |
1033
|
|
|
|
|
|
|
} |
1034
|
|
|
|
|
|
|
|
1035
|
1136
|
|
|
|
|
12770
|
return $e; |
1036
|
|
|
|
|
|
|
} |
1037
|
|
|
|
|
|
|
} |
1038
|
|
|
|
|
|
|
|
1039
|
|
|
|
|
|
|
#========================================================================== |
1040
|
|
|
|
|
|
|
|
1041
|
|
|
|
|
|
|
{ |
1042
|
|
|
|
|
|
|
my $indent; |
1043
|
|
|
|
|
|
|
|
1044
|
|
|
|
|
|
|
sub end { |
1045
|
1714
|
50
|
|
1714
|
1
|
18274
|
return if $_[0]{'_stunted'}; |
1046
|
|
|
|
|
|
|
|
1047
|
|
|
|
|
|
|
# Either: Acccept an end-tag signal from HTML::Parser |
1048
|
|
|
|
|
|
|
# Or: Method for closing currently open elements in some fairly complex |
1049
|
|
|
|
|
|
|
# way, as used by other methods in this class. |
1050
|
1714
|
|
|
|
|
18619
|
my ( $self, $tag, @stop ) = @_; |
1051
|
1714
|
50
|
|
|
|
18082
|
if ( $tag eq 'x-html' ) { |
1052
|
0
|
|
|
|
|
0
|
print "Ignoring close-x-html tag.\n" if DEBUG; |
1053
|
|
|
|
|
|
|
|
1054
|
|
|
|
|
|
|
# inserted by some lame code-generators. |
1055
|
0
|
|
|
|
|
0
|
return; |
1056
|
|
|
|
|
|
|
} |
1057
|
|
|
|
|
|
|
|
1058
|
1714
|
50
|
66
|
|
|
21789
|
unless ( ref($tag) or $tag =~ m/^[-_a-zA-Z0-9:%]+$/s ) { |
1059
|
0
|
|
|
|
|
0
|
DEBUG and print "End-tag name $tag is no good. Skipping.\n"; |
1060
|
0
|
|
|
|
|
0
|
return; |
1061
|
|
|
|
|
|
|
|
1062
|
|
|
|
|
|
|
# This avoids having Element's new() throw an exception. |
1063
|
|
|
|
|
|
|
} |
1064
|
|
|
|
|
|
|
|
1065
|
|
|
|
|
|
|
# This method accepts two calling formats: |
1066
|
|
|
|
|
|
|
# 1) from Parser: $self->end('tag_name', 'origtext') |
1067
|
|
|
|
|
|
|
# in which case we shouldn't mistake origtext as a blocker tag |
1068
|
|
|
|
|
|
|
# 2) from myself: $self->end(\q{tagname1}, 'blk1', ... ) |
1069
|
|
|
|
|
|
|
# from myself: $self->end(['tagname1', 'tagname2'], 'blk1', ... ) |
1070
|
|
|
|
|
|
|
|
1071
|
|
|
|
|
|
|
# End the specified tag, but don't move above any of the blocker tags. |
1072
|
|
|
|
|
|
|
# The tag can also be a reference to an array. Terminate the first |
1073
|
|
|
|
|
|
|
# tag found. |
1074
|
|
|
|
|
|
|
|
1075
|
1714
|
|
66
|
|
|
18873
|
my $ptag = ( my $p = $self->{'_pos'} || $self )->{'_tag'}; |
1076
|
|
|
|
|
|
|
|
1077
|
|
|
|
|
|
|
# $p and $ptag are sort-of scratch |
1078
|
|
|
|
|
|
|
|
1079
|
1714
|
100
|
|
|
|
17449
|
if ( ref($tag) ) { |
1080
|
|
|
|
|
|
|
|
1081
|
|
|
|
|
|
|
# First param is a ref of one sort or another -- |
1082
|
|
|
|
|
|
|
# THE CALL IS COMING FROM INSIDE THE HOUSE! |
1083
|
415
|
100
|
|
|
|
973
|
$tag = $$tag if ref($tag) eq 'SCALAR'; |
1084
|
|
|
|
|
|
|
|
1085
|
|
|
|
|
|
|
# otherwise it's an arrayref. |
1086
|
|
|
|
|
|
|
} |
1087
|
|
|
|
|
|
|
else { |
1088
|
|
|
|
|
|
|
|
1089
|
|
|
|
|
|
|
# the call came from Parser -- just ignore origtext |
1090
|
|
|
|
|
|
|
# except in a table ignore unmatched table tags RT #59980 |
1091
|
1299
|
100
|
|
|
|
31119
|
@stop = $tag =~ /^t[hdr]\z/ ? 'table' : (); |
1092
|
|
|
|
|
|
|
} |
1093
|
|
|
|
|
|
|
|
1094
|
|
|
|
|
|
|
#my($indent); |
1095
|
1714
|
|
|
|
|
16308
|
if (DEBUG) { |
1096
|
|
|
|
|
|
|
|
1097
|
|
|
|
|
|
|
# optimization -- don't figure out depth unless we're in debug mode |
1098
|
|
|
|
|
|
|
my @lineage_tags = $p->lineage_tag_names; |
1099
|
|
|
|
|
|
|
$indent = ' ' x ( 1 + @lineage_tags ); |
1100
|
|
|
|
|
|
|
|
1101
|
|
|
|
|
|
|
# now announce ourselves |
1102
|
|
|
|
|
|
|
print $indent, "Ending ", |
1103
|
|
|
|
|
|
|
ref($tag) ? ( '[', join( ' ', @$tag ), ']' ) : "\U$tag\E", |
1104
|
|
|
|
|
|
|
scalar(@stop) |
1105
|
|
|
|
|
|
|
? ( " no higher than [", join( ' ', @stop ), "]" ) |
1106
|
|
|
|
|
|
|
: (), ".\n"; |
1107
|
|
|
|
|
|
|
|
1108
|
|
|
|
|
|
|
print $indent, " (Current lineage: ", join( '/', @lineage_tags ), |
1109
|
|
|
|
|
|
|
".)\n" |
1110
|
|
|
|
|
|
|
if DEBUG > 1; |
1111
|
|
|
|
|
|
|
|
1112
|
|
|
|
|
|
|
if ( DEBUG > 3 ) { |
1113
|
|
|
|
|
|
|
|
1114
|
|
|
|
|
|
|
#my( |
1115
|
|
|
|
|
|
|
# $package, $filename, $line, $subroutine, |
1116
|
|
|
|
|
|
|
# $hasargs, $wantarray, $evaltext, $is_require) = caller; |
1117
|
|
|
|
|
|
|
print $indent, |
1118
|
|
|
|
|
|
|
" (Called from ", ( caller(1) )[3], ' line ', |
1119
|
|
|
|
|
|
|
( caller(1) )[2], |
1120
|
|
|
|
|
|
|
")\n"; |
1121
|
|
|
|
|
|
|
} |
1122
|
|
|
|
|
|
|
|
1123
|
|
|
|
|
|
|
#} else { |
1124
|
|
|
|
|
|
|
# $indent = ' '; |
1125
|
|
|
|
|
|
|
} |
1126
|
|
|
|
|
|
|
|
1127
|
|
|
|
|
|
|
# End of if DEBUG |
1128
|
|
|
|
|
|
|
|
1129
|
|
|
|
|
|
|
# Now actually do it |
1130
|
1714
|
|
|
|
|
16430
|
my @to_close; |
1131
|
1714
|
100
|
|
|
|
18135
|
if ( $tag eq '*' ) { |
|
|
100
|
|
|
|
|
|
1132
|
|
|
|
|
|
|
|
1133
|
|
|
|
|
|
|
# Special -- close everything up to (but not including) the first |
1134
|
|
|
|
|
|
|
# limiting tag, or return if none found. Somewhat of a special case. |
1135
|
|
|
|
|
|
|
PARENT: |
1136
|
41
|
|
|
|
|
93
|
while ( defined $p ) { |
1137
|
80
|
|
|
|
|
128
|
$ptag = $p->{'_tag'}; |
1138
|
80
|
|
|
|
|
106
|
print $indent, " (Looking at $ptag.)\n" if DEBUG > 2; |
1139
|
80
|
|
|
|
|
141
|
for (@stop) { |
1140
|
131
|
100
|
|
|
|
314
|
if ( $ptag eq $_ ) { |
1141
|
41
|
|
|
|
|
62
|
print $indent, |
1142
|
|
|
|
|
|
|
" (Hit a $_; closing everything up to here.)\n" |
1143
|
|
|
|
|
|
|
if DEBUG > 2; |
1144
|
41
|
|
|
|
|
78
|
last PARENT; |
1145
|
|
|
|
|
|
|
} |
1146
|
|
|
|
|
|
|
} |
1147
|
39
|
|
|
|
|
67
|
push @to_close, $p; |
1148
|
39
|
|
|
|
|
67
|
$p = $p->{'_parent'}; # no match so far? keep moving up |
1149
|
|
|
|
|
|
|
print $indent, |
1150
|
39
|
|
|
|
|
83
|
" (Moving on up to ", $p ? $p->{'_tag'} : 'nil', ")\n" |
1151
|
|
|
|
|
|
|
if DEBUG > 1; |
1152
|
|
|
|
|
|
|
} |
1153
|
41
|
50
|
|
|
|
95
|
unless ( defined $p ) { # We never found what we were looking for. |
1154
|
0
|
|
|
|
|
0
|
print $indent, " (We never found a limit.)\n" if DEBUG > 1; |
1155
|
0
|
|
|
|
|
0
|
return; |
1156
|
|
|
|
|
|
|
} |
1157
|
|
|
|
|
|
|
|
1158
|
|
|
|
|
|
|
#print |
1159
|
|
|
|
|
|
|
# $indent, |
1160
|
|
|
|
|
|
|
# " (To close: ", join('/', map $_->tag, @to_close), ".)\n" |
1161
|
|
|
|
|
|
|
# if DEBUG > 4; |
1162
|
|
|
|
|
|
|
|
1163
|
|
|
|
|
|
|
# Otherwise update pos and fall thru. |
1164
|
41
|
|
|
|
|
74
|
$self->{'_pos'} = $p; |
1165
|
|
|
|
|
|
|
} |
1166
|
|
|
|
|
|
|
elsif ( ref $tag ) { |
1167
|
|
|
|
|
|
|
|
1168
|
|
|
|
|
|
|
# Close the first of any of the matching tags, giving up if you hit |
1169
|
|
|
|
|
|
|
# any of the stop-tags. |
1170
|
|
|
|
|
|
|
PARENT: |
1171
|
307
|
|
|
|
|
757
|
while ( defined $p ) { |
1172
|
584
|
|
|
|
|
991
|
$ptag = $p->{'_tag'}; |
1173
|
584
|
|
|
|
|
785
|
print $indent, " (Looking at $ptag.)\n" if DEBUG > 2; |
1174
|
584
|
|
|
|
|
1096
|
for (@$tag) { |
1175
|
5014
|
100
|
|
|
|
10511
|
if ( $ptag eq $_ ) { |
1176
|
31
|
|
|
|
|
42
|
print $indent, " (Closing $_.)\n" if DEBUG > 2; |
1177
|
31
|
|
|
|
|
62
|
last PARENT; |
1178
|
|
|
|
|
|
|
} |
1179
|
|
|
|
|
|
|
} |
1180
|
553
|
|
|
|
|
985
|
for (@stop) { |
1181
|
8295
|
50
|
|
|
|
16900
|
if ( $ptag eq $_ ) { |
1182
|
0
|
|
|
|
|
0
|
print $indent, |
1183
|
|
|
|
|
|
|
" (Hit a limiting $_ -- bailing out.)\n" |
1184
|
|
|
|
|
|
|
if DEBUG > 1; |
1185
|
0
|
|
|
|
|
0
|
return; # so it was all for naught |
1186
|
|
|
|
|
|
|
} |
1187
|
|
|
|
|
|
|
} |
1188
|
553
|
|
|
|
|
922
|
push @to_close, $p; |
1189
|
553
|
|
|
|
|
1300
|
$p = $p->{'_parent'}; |
1190
|
|
|
|
|
|
|
} |
1191
|
307
|
100
|
|
|
|
1180
|
return unless defined $p; # We went off the top of the tree. |
1192
|
|
|
|
|
|
|
# Otherwise specified element was found; set pos to its parent. |
1193
|
31
|
|
|
|
|
65
|
push @to_close, $p; |
1194
|
31
|
|
|
|
|
64
|
$self->{'_pos'} = $p->{'_parent'}; |
1195
|
|
|
|
|
|
|
} |
1196
|
|
|
|
|
|
|
else { |
1197
|
|
|
|
|
|
|
|
1198
|
|
|
|
|
|
|
# Close the first of the specified tag, giving up if you hit |
1199
|
|
|
|
|
|
|
# any of the stop-tags. |
1200
|
1366
|
|
|
|
|
16988
|
while ( defined $p ) { |
1201
|
1876
|
|
|
|
|
32053
|
$ptag = $p->{'_tag'}; |
1202
|
1876
|
|
|
|
|
31446
|
print $indent, " (Looking at $ptag.)\n" if DEBUG > 2; |
1203
|
1876
|
100
|
|
|
|
33525
|
if ( $ptag eq $tag ) { |
1204
|
1324
|
|
|
|
|
14532
|
print $indent, " (Closing $tag.)\n" if DEBUG > 2; |
1205
|
1324
|
|
|
|
|
27417
|
last; |
1206
|
|
|
|
|
|
|
} |
1207
|
552
|
|
|
|
|
17839
|
for (@stop) { |
1208
|
67
|
100
|
|
|
|
167
|
if ( $ptag eq $_ ) { |
1209
|
32
|
|
|
|
|
41
|
print $indent, |
1210
|
|
|
|
|
|
|
" (Hit a limiting $_ -- bailing out.)\n" |
1211
|
|
|
|
|
|
|
if DEBUG > 1; |
1212
|
32
|
|
|
|
|
94
|
return; # so it was all for naught |
1213
|
|
|
|
|
|
|
} |
1214
|
|
|
|
|
|
|
} |
1215
|
520
|
|
|
|
|
17715
|
push @to_close, $p; |
1216
|
520
|
|
|
|
|
19113
|
$p = $p->{'_parent'}; |
1217
|
|
|
|
|
|
|
} |
1218
|
1334
|
100
|
|
|
|
18382
|
return unless defined $p; # We went off the top of the tree. |
1219
|
|
|
|
|
|
|
# Otherwise specified element was found; set pos to its parent. |
1220
|
1324
|
|
|
|
|
14740
|
push @to_close, $p; |
1221
|
1324
|
|
|
|
|
27324
|
$self->{'_pos'} = $p->{'_parent'}; |
1222
|
|
|
|
|
|
|
} |
1223
|
|
|
|
|
|
|
|
1224
|
1396
|
100
|
100
|
|
|
17696
|
$self->{'_pos'} = undef if $self eq ( $self->{'_pos'} || '' ); |
1225
|
|
|
|
|
|
|
print $indent, "(Pos now points to ", |
1226
|
1396
|
|
|
|
|
14626
|
$self->{'_pos'} ? $self->{'_pos'}{'_tag'} : '???', ".)\n" |
1227
|
|
|
|
|
|
|
if DEBUG > 1; |
1228
|
|
|
|
|
|
|
|
1229
|
|
|
|
|
|
|
### EXPENSIVE, because has to check that it's not under a pre |
1230
|
|
|
|
|
|
|
### or a CDATA-parent. That's one more method call per end()! |
1231
|
|
|
|
|
|
|
### Might as well just do this at the end of the tree-parse, I guess, |
1232
|
|
|
|
|
|
|
### at which point we'd be parsing top-down, and just not traversing |
1233
|
|
|
|
|
|
|
### under pre's or CDATA-parents. |
1234
|
|
|
|
|
|
|
## |
1235
|
|
|
|
|
|
|
## Take this opportunity to nix any terminal whitespace nodes. |
1236
|
|
|
|
|
|
|
## TODO: consider whether this (plus the logic in start(), above) |
1237
|
|
|
|
|
|
|
## would ever leave any WS nodes in the tree. |
1238
|
|
|
|
|
|
|
## If not, then there's no reason to have eof() call |
1239
|
|
|
|
|
|
|
## delete_ignorable_whitespace on the tree, is there? |
1240
|
|
|
|
|
|
|
## |
1241
|
|
|
|
|
|
|
#if(@to_close and $self->{'_tighten'} and !$self->{'_ignore_text'} and |
1242
|
|
|
|
|
|
|
# ! $to_close[-1]->is_inside('pre', keys %HTML::Tagset::isCDATA_Parent) |
1243
|
|
|
|
|
|
|
#) { # if tightenable |
1244
|
|
|
|
|
|
|
# my($children, $e_tag); |
1245
|
|
|
|
|
|
|
# foreach my $e (reverse @to_close) { # going top-down |
1246
|
|
|
|
|
|
|
# last if 'pre' eq ($e_tag = $e->{'_tag'}) or |
1247
|
|
|
|
|
|
|
# $HTML::Tagset::isCDATA_Parent{$e_tag}; |
1248
|
|
|
|
|
|
|
# |
1249
|
|
|
|
|
|
|
# if( |
1250
|
|
|
|
|
|
|
# $children = $e->{'_content'} |
1251
|
|
|
|
|
|
|
# and @$children # has children |
1252
|
|
|
|
|
|
|
# and !ref($children->[-1]) |
1253
|
|
|
|
|
|
|
# and $children->[-1] =~ m<^\s+$>s # last node is all-WS |
1254
|
|
|
|
|
|
|
# and |
1255
|
|
|
|
|
|
|
# ( |
1256
|
|
|
|
|
|
|
# # has a tightenable parent: |
1257
|
|
|
|
|
|
|
# $HTML::TreeBuilder::canTighten{ $e_tag } |
1258
|
|
|
|
|
|
|
# or |
1259
|
|
|
|
|
|
|
# ( # has a tightenable left sibling: |
1260
|
|
|
|
|
|
|
# @$children > 1 and |
1261
|
|
|
|
|
|
|
# ref($children->[-2]) |
1262
|
|
|
|
|
|
|
# and $HTML::TreeBuilder::canTighten{ $children->[-2]{'_tag'} } |
1263
|
|
|
|
|
|
|
# ) |
1264
|
|
|
|
|
|
|
# ) |
1265
|
|
|
|
|
|
|
# ) { |
1266
|
|
|
|
|
|
|
# pop @$children; |
1267
|
|
|
|
|
|
|
# #print $indent, "Popping a terminal WS node from ", $e->{'_tag'}, |
1268
|
|
|
|
|
|
|
# # " (", $e->address, ") while exiting.\n" if DEBUG; |
1269
|
|
|
|
|
|
|
# } |
1270
|
|
|
|
|
|
|
# } |
1271
|
|
|
|
|
|
|
#} |
1272
|
|
|
|
|
|
|
|
1273
|
1396
|
|
|
|
|
14932
|
foreach my $e (@to_close) { |
1274
|
|
|
|
|
|
|
|
1275
|
|
|
|
|
|
|
# Call the applicable callback, if any |
1276
|
1885
|
|
|
|
|
28117
|
$ptag = $e->{'_tag'}; |
1277
|
1885
|
|
|
|
|
29966
|
&{ $self->{"_tweak_$ptag"} |
1278
|
1885
|
50
|
33
|
|
|
58609
|
|| $self->{'_tweak_*'} |
1279
|
|
|
|
|
|
|
|| next }( map $_, $e, $ptag, $self ); |
1280
|
0
|
|
|
|
|
0
|
print $indent, "Back from tweaking.\n" if DEBUG; |
1281
|
|
|
|
|
|
|
last |
1282
|
0
|
0
|
|
|
|
0
|
if $self->{ '_stunted' |
1283
|
|
|
|
|
|
|
}; # in case one of the handlers called stunt |
1284
|
|
|
|
|
|
|
} |
1285
|
1396
|
|
|
|
|
30331
|
return @to_close; |
1286
|
|
|
|
|
|
|
} |
1287
|
|
|
|
|
|
|
} |
1288
|
|
|
|
|
|
|
|
1289
|
|
|
|
|
|
|
#========================================================================== |
1290
|
|
|
|
|
|
|
{ |
1291
|
|
|
|
|
|
|
my ( $indent, $nugget ); |
1292
|
|
|
|
|
|
|
|
1293
|
|
|
|
|
|
|
sub text { |
1294
|
2587
|
50
|
|
2587
|
1
|
93193
|
return if $_[0]{'_stunted'}; |
1295
|
|
|
|
|
|
|
|
1296
|
|
|
|
|
|
|
# Accept a "here's a text token" signal from HTML::Parser. |
1297
|
2587
|
|
|
|
|
16780
|
my ( $self, $text, $is_cdata ) = @_; |
1298
|
|
|
|
|
|
|
|
1299
|
|
|
|
|
|
|
# the >3.0 versions of Parser may pass a cdata node. |
1300
|
|
|
|
|
|
|
# Thanks to Gisle Aas for pointing this out. |
1301
|
|
|
|
|
|
|
|
1302
|
2587
|
50
|
|
|
|
17186
|
return unless length $text; # I guess that's always right |
1303
|
|
|
|
|
|
|
|
1304
|
2587
|
|
|
|
|
15308
|
my $ignore_text = $self->{'_ignore_text'}; |
1305
|
2587
|
|
|
|
|
15180
|
my $no_space_compacting = $self->{'_no_space_compacting'}; |
1306
|
2587
|
|
|
|
|
15061
|
my $no_expand_entities = $self->{'_no_expand_entities'}; |
1307
|
2587
|
|
66
|
|
|
17801
|
my $pos = $self->{'_pos'} || $self; |
1308
|
|
|
|
|
|
|
|
1309
|
|
|
|
|
|
|
HTML::Entities::decode($text) |
1310
|
|
|
|
|
|
|
unless $ignore_text |
1311
|
|
|
|
|
|
|
|| $is_cdata |
1312
|
2587
|
100
|
33
|
|
|
35009
|
|| $HTML::Tagset::isCDATA_Parent{ $pos->{'_tag'} } |
|
|
|
33
|
|
|
|
|
|
|
|
66
|
|
|
|
|
1313
|
|
|
|
|
|
|
|| $no_expand_entities; |
1314
|
|
|
|
|
|
|
|
1315
|
|
|
|
|
|
|
#my($indent, $nugget); |
1316
|
2587
|
|
|
|
|
14960
|
if (DEBUG) { |
1317
|
|
|
|
|
|
|
|
1318
|
|
|
|
|
|
|
# optimization -- don't figure out depth unless we're in debug mode |
1319
|
|
|
|
|
|
|
my @lineage_tags = $pos->lineage_tag_names; |
1320
|
|
|
|
|
|
|
$indent = ' ' x ( 1 + @lineage_tags ); |
1321
|
|
|
|
|
|
|
|
1322
|
|
|
|
|
|
|
$nugget |
1323
|
|
|
|
|
|
|
= ( length($text) <= 25 ) |
1324
|
|
|
|
|
|
|
? $text |
1325
|
|
|
|
|
|
|
: ( substr( $text, 0, 25 ) . '...' ); |
1326
|
|
|
|
|
|
|
$nugget =~ s<([\x00-\x1F])> |
1327
|
|
|
|
|
|
|
<'\\x'.(unpack("H2",$1))>eg; |
1328
|
|
|
|
|
|
|
print $indent, "Proposing a new text node ($nugget) under ", |
1329
|
|
|
|
|
|
|
join( '/', reverse( $pos->{'_tag'}, @lineage_tags ) ) |
1330
|
|
|
|
|
|
|
|| 'Root', |
1331
|
|
|
|
|
|
|
".\n"; |
1332
|
|
|
|
|
|
|
|
1333
|
|
|
|
|
|
|
#} else { |
1334
|
|
|
|
|
|
|
# $indent = ' '; |
1335
|
|
|
|
|
|
|
} |
1336
|
|
|
|
|
|
|
|
1337
|
2587
|
|
|
|
|
15087
|
my $ptag; |
1338
|
2587
|
50
|
33
|
|
|
20861
|
if ($HTML::Tagset::isCDATA_Parent{ $ptag = $pos->{'_tag'} } |
1339
|
|
|
|
|
|
|
|
1340
|
|
|
|
|
|
|
#or $pos->is_inside('pre') |
1341
|
|
|
|
|
|
|
or $pos->is_inside( 'pre', 'textarea' ) |
1342
|
|
|
|
|
|
|
) |
1343
|
|
|
|
|
|
|
{ |
1344
|
0
|
0
|
|
|
|
0
|
return if $ignore_text; |
1345
|
0
|
|
|
|
|
0
|
$pos->push_content($text); |
1346
|
|
|
|
|
|
|
} |
1347
|
|
|
|
|
|
|
else { |
1348
|
|
|
|
|
|
|
|
1349
|
|
|
|
|
|
|
# return unless $text =~ /\S/; # This is sometimes wrong |
1350
|
|
|
|
|
|
|
|
1351
|
2587
|
100
|
100
|
|
|
30332
|
if ( !$self->{'_implicit_tags'} || $text !~ /[^\n\r\f\t ]/ ) { |
|
|
50
|
33
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
1352
|
|
|
|
|
|
|
|
1353
|
|
|
|
|
|
|
# don't change anything |
1354
|
|
|
|
|
|
|
} |
1355
|
|
|
|
|
|
|
elsif ( $ptag eq 'head' or $ptag eq 'noframes' ) { |
1356
|
0
|
0
|
|
|
|
0
|
if ( $self->{'_implicit_body_p_tag'} ) { |
1357
|
0
|
|
|
|
|
0
|
print $indent, |
1358
|
|
|
|
|
|
|
" * Text node under \U$ptag\E closes \U$ptag\E, implicates BODY and P.\n" |
1359
|
|
|
|
|
|
|
if DEBUG > 1; |
1360
|
0
|
|
|
|
|
0
|
$self->end( \$ptag ); |
1361
|
|
|
|
|
|
|
$pos = $self->{'_body'} |
1362
|
|
|
|
|
|
|
? ( $self->{'_pos'} |
1363
|
0
|
0
|
|
|
|
0
|
= $self->{'_body'} ) # expected case |
1364
|
|
|
|
|
|
|
: $self->insert_element( 'body', 1 ); |
1365
|
0
|
|
|
|
|
0
|
$pos = $self->insert_element( 'p', 1 ); |
1366
|
|
|
|
|
|
|
} |
1367
|
|
|
|
|
|
|
else { |
1368
|
0
|
|
|
|
|
0
|
print $indent, |
1369
|
|
|
|
|
|
|
" * Text node under \U$ptag\E closes, implicates BODY.\n" |
1370
|
|
|
|
|
|
|
if DEBUG > 1; |
1371
|
0
|
|
|
|
|
0
|
$self->end( \$ptag ); |
1372
|
|
|
|
|
|
|
$pos = $self->{'_body'} |
1373
|
|
|
|
|
|
|
? ( $self->{'_pos'} |
1374
|
0
|
0
|
|
|
|
0
|
= $self->{'_body'} ) # expected case |
1375
|
|
|
|
|
|
|
: $self->insert_element( 'body', 1 ); |
1376
|
|
|
|
|
|
|
} |
1377
|
|
|
|
|
|
|
} |
1378
|
|
|
|
|
|
|
elsif ( $ptag eq 'html' ) { |
1379
|
54
|
100
|
|
|
|
5695
|
if ( $self->{'_implicit_body_p_tag'} ) { |
1380
|
9
|
|
|
|
|
1389
|
print $indent, |
1381
|
|
|
|
|
|
|
" * Text node under HTML implicates BODY and P.\n" |
1382
|
|
|
|
|
|
|
if DEBUG > 1; |
1383
|
|
|
|
|
|
|
$pos = $self->{'_body'} |
1384
|
|
|
|
|
|
|
? ( $self->{'_pos'} |
1385
|
9
|
50
|
|
|
|
1378
|
= $self->{'_body'} ) # expected case |
1386
|
|
|
|
|
|
|
: $self->insert_element( 'body', 1 ); |
1387
|
9
|
|
|
|
|
1373
|
$pos = $self->insert_element( 'p', 1 ); |
1388
|
|
|
|
|
|
|
} |
1389
|
|
|
|
|
|
|
else { |
1390
|
45
|
|
|
|
|
4317
|
print $indent, |
1391
|
|
|
|
|
|
|
" * Text node under HTML implicates BODY.\n" |
1392
|
|
|
|
|
|
|
if DEBUG > 1; |
1393
|
|
|
|
|
|
|
$pos = $self->{'_body'} |
1394
|
|
|
|
|
|
|
? ( $self->{'_pos'} |
1395
|
45
|
50
|
|
|
|
12686
|
= $self->{'_body'} ) # expected case |
1396
|
|
|
|
|
|
|
: $self->insert_element( 'body', 1 ); |
1397
|
|
|
|
|
|
|
|
1398
|
|
|
|
|
|
|
#print "POS is $pos, ", $pos->{'_tag'}, "\n"; |
1399
|
|
|
|
|
|
|
} |
1400
|
|
|
|
|
|
|
} |
1401
|
|
|
|
|
|
|
elsif ( $ptag eq 'body' ) { |
1402
|
400
|
50
|
|
|
|
9760
|
if ( $self->{'_implicit_body_p_tag'} ) { |
1403
|
0
|
|
|
|
|
0
|
print $indent, " * Text node under BODY implicates P.\n" |
1404
|
|
|
|
|
|
|
if DEBUG > 1; |
1405
|
0
|
|
|
|
|
0
|
$pos = $self->insert_element( 'p', 1 ); |
1406
|
|
|
|
|
|
|
} |
1407
|
|
|
|
|
|
|
} |
1408
|
|
|
|
|
|
|
elsif ( $ptag eq 'table' ) { |
1409
|
8
|
|
|
|
|
12
|
print $indent, |
1410
|
|
|
|
|
|
|
" * Text node under TABLE implicates TR and TD.\n" |
1411
|
|
|
|
|
|
|
if DEBUG > 1; |
1412
|
8
|
|
|
|
|
28
|
$self->insert_element( 'tr', 1 ); |
1413
|
8
|
|
|
|
|
21
|
$pos = $self->insert_element( 'td', 1 ); |
1414
|
|
|
|
|
|
|
|
1415
|
|
|
|
|
|
|
# double whammy! |
1416
|
|
|
|
|
|
|
} |
1417
|
|
|
|
|
|
|
elsif ( $ptag eq 'tr' ) { |
1418
|
3
|
|
|
|
|
7
|
print $indent, " * Text node under TR implicates TD.\n" |
1419
|
|
|
|
|
|
|
if DEBUG > 1; |
1420
|
3
|
|
|
|
|
8
|
$pos = $self->insert_element( 'td', 1 ); |
1421
|
|
|
|
|
|
|
} |
1422
|
|
|
|
|
|
|
|
1423
|
|
|
|
|
|
|
# elsif ( |
1424
|
|
|
|
|
|
|
# # $ptag eq 'li' || |
1425
|
|
|
|
|
|
|
# # $ptag eq 'dd' || |
1426
|
|
|
|
|
|
|
# $ptag eq 'form') { |
1427
|
|
|
|
|
|
|
# $pos = $self->insert_element('p', 1); |
1428
|
|
|
|
|
|
|
#} |
1429
|
|
|
|
|
|
|
|
1430
|
|
|
|
|
|
|
# Whatever we've done above should have had the side |
1431
|
|
|
|
|
|
|
# effect of updating $self->{'_pos'} |
1432
|
|
|
|
|
|
|
|
1433
|
|
|
|
|
|
|
#print "POS is now $pos, ", $pos->{'_tag'}, "\n"; |
1434
|
|
|
|
|
|
|
|
1435
|
2587
|
50
|
|
|
|
17024
|
return if $ignore_text; |
1436
|
2587
|
100
|
|
|
|
21919
|
$text =~ s/[\n\r\f\t ]+/ /g # canonical space |
1437
|
|
|
|
|
|
|
unless $no_space_compacting; |
1438
|
|
|
|
|
|
|
|
1439
|
|
|
|
|
|
|
print $indent, " (Attaching text node ($nugget) under ", |
1440
|
|
|
|
|
|
|
|
1441
|
|
|
|
|
|
|
# was: $self->{'_pos'} ? $self->{'_pos'}{'_tag'} : $self->{'_tag'}, |
1442
|
2587
|
|
|
|
|
15537
|
$pos->{'_tag'}, ").\n" |
1443
|
|
|
|
|
|
|
if DEBUG > 1; |
1444
|
|
|
|
|
|
|
|
1445
|
2587
|
|
|
|
|
17650
|
$pos->push_content($text); |
1446
|
|
|
|
|
|
|
} |
1447
|
|
|
|
|
|
|
|
1448
|
2587
|
50
|
|
|
|
43748
|
&{ $self->{'_tweak_~text'} || return }( $text, $pos, |
1449
|
2587
|
|
|
|
|
16509
|
$pos->{'_tag'} . '' ); |
1450
|
|
|
|
|
|
|
|
1451
|
|
|
|
|
|
|
# Note that this is very exceptional -- it doesn't fall back to |
1452
|
|
|
|
|
|
|
# _tweak_*, and it gives its tweak different arguments. |
1453
|
0
|
|
|
|
|
0
|
return; |
1454
|
|
|
|
|
|
|
} |
1455
|
|
|
|
|
|
|
} |
1456
|
|
|
|
|
|
|
|
1457
|
|
|
|
|
|
|
#========================================================================== |
1458
|
|
|
|
|
|
|
|
1459
|
|
|
|
|
|
|
# TODO: test whether comment(), declaration(), and process(), do the right |
1460
|
|
|
|
|
|
|
# thing as far as tightening and whatnot. |
1461
|
|
|
|
|
|
|
# Also, currently, doctypes and comments that appear before head or body |
1462
|
|
|
|
|
|
|
# show up in the tree in the wrong place. Something should be done about |
1463
|
|
|
|
|
|
|
# this. Tricky. Maybe this whole business of pre-making the body and |
1464
|
|
|
|
|
|
|
# whatnot is wrong. |
1465
|
|
|
|
|
|
|
|
1466
|
|
|
|
|
|
|
sub comment { |
1467
|
195
|
50
|
|
195
|
1
|
1948
|
return if $_[0]{'_stunted'}; |
1468
|
|
|
|
|
|
|
|
1469
|
|
|
|
|
|
|
# Accept a "here's a comment" signal from HTML::Parser. |
1470
|
|
|
|
|
|
|
|
1471
|
195
|
|
|
|
|
414
|
my ( $self, $text ) = @_; |
1472
|
195
|
|
66
|
|
|
583
|
my $pos = $self->{'_pos'} || $self; |
1473
|
|
|
|
|
|
|
return |
1474
|
|
|
|
|
|
|
unless $self->{'_store_comments'} |
1475
|
195
|
100
|
66
|
|
|
1207
|
|| $HTML::Tagset::isCDATA_Parent{ $pos->{'_tag'} }; |
1476
|
|
|
|
|
|
|
|
1477
|
1
|
|
|
|
|
1
|
if (DEBUG) { |
1478
|
|
|
|
|
|
|
my @lineage_tags = $pos->lineage_tag_names; |
1479
|
|
|
|
|
|
|
my $indent = ' ' x ( 1 + @lineage_tags ); |
1480
|
|
|
|
|
|
|
|
1481
|
|
|
|
|
|
|
my $nugget |
1482
|
|
|
|
|
|
|
= ( length($text) <= 25 ) |
1483
|
|
|
|
|
|
|
? $text |
1484
|
|
|
|
|
|
|
: ( substr( $text, 0, 25 ) . '...' ); |
1485
|
|
|
|
|
|
|
$nugget =~ s<([\x00-\x1F])> |
1486
|
|
|
|
|
|
|
<'\\x'.(unpack("H2",$1))>eg; |
1487
|
|
|
|
|
|
|
print $indent, "Proposing a Comment ($nugget) under ", |
1488
|
|
|
|
|
|
|
join( '/', reverse( $pos->{'_tag'}, @lineage_tags ) ) || 'Root', |
1489
|
|
|
|
|
|
|
".\n"; |
1490
|
|
|
|
|
|
|
} |
1491
|
|
|
|
|
|
|
|
1492
|
1
|
|
|
|
|
4
|
( my $e = $self->element_class->new('~comment') )->{'text'} = $text; |
1493
|
1
|
|
|
|
|
4
|
$pos->push_content($e); |
1494
|
1
|
|
|
|
|
2
|
++( $self->{'_element_count'} ); |
1495
|
|
|
|
|
|
|
|
1496
|
1
|
|
|
|
|
4
|
&{ $self->{'_tweak_~comment'} |
1497
|
1
|
50
|
33
|
|
|
7
|
|| $self->{'_tweak_*'} |
1498
|
|
|
|
|
|
|
|| return $e }( map $_, $e, '~comment', $self ); |
1499
|
|
|
|
|
|
|
|
1500
|
0
|
|
|
|
|
0
|
return $e; |
1501
|
|
|
|
|
|
|
} |
1502
|
|
|
|
|
|
|
|
1503
|
|
|
|
|
|
|
sub declaration { |
1504
|
17
|
50
|
|
17
|
1
|
1158
|
return if $_[0]{'_stunted'}; |
1505
|
|
|
|
|
|
|
|
1506
|
|
|
|
|
|
|
# Accept a "here's a markup declaration" signal from HTML::Parser. |
1507
|
|
|
|
|
|
|
|
1508
|
17
|
|
|
|
|
77
|
my ( $self, $text ) = @_; |
1509
|
17
|
|
33
|
|
|
80
|
my $pos = $self->{'_pos'} || $self; |
1510
|
|
|
|
|
|
|
|
1511
|
17
|
|
|
|
|
28
|
if (DEBUG) { |
1512
|
|
|
|
|
|
|
my @lineage_tags = $pos->lineage_tag_names; |
1513
|
|
|
|
|
|
|
my $indent = ' ' x ( 1 + @lineage_tags ); |
1514
|
|
|
|
|
|
|
|
1515
|
|
|
|
|
|
|
my $nugget |
1516
|
|
|
|
|
|
|
= ( length($text) <= 25 ) |
1517
|
|
|
|
|
|
|
? $text |
1518
|
|
|
|
|
|
|
: ( substr( $text, 0, 25 ) . '...' ); |
1519
|
|
|
|
|
|
|
$nugget =~ s<([\x00-\x1F])> |
1520
|
|
|
|
|
|
|
<'\\x'.(unpack("H2",$1))>eg; |
1521
|
|
|
|
|
|
|
print $indent, "Proposing a Declaration ($nugget) under ", |
1522
|
|
|
|
|
|
|
join( '/', reverse( $pos->{'_tag'}, @lineage_tags ) ) || 'Root', |
1523
|
|
|
|
|
|
|
".\n"; |
1524
|
|
|
|
|
|
|
} |
1525
|
17
|
|
|
|
|
47
|
( my $e = $self->element_class->new('~declaration') )->{'text'} = $text; |
1526
|
|
|
|
|
|
|
|
1527
|
17
|
|
|
|
|
39
|
$self->{_decl} = $e; |
1528
|
17
|
|
|
|
|
106
|
return $e; |
1529
|
|
|
|
|
|
|
} |
1530
|
|
|
|
|
|
|
|
1531
|
|
|
|
|
|
|
#========================================================================== |
1532
|
|
|
|
|
|
|
|
1533
|
|
|
|
|
|
|
sub process { |
1534
|
2
|
50
|
|
2
|
1
|
138
|
return if $_[0]{'_stunted'}; |
1535
|
|
|
|
|
|
|
|
1536
|
|
|
|
|
|
|
# Accept a "here's a PI" signal from HTML::Parser. |
1537
|
|
|
|
|
|
|
|
1538
|
2
|
50
|
|
|
|
17
|
return unless $_[0]->{'_store_pis'}; |
1539
|
0
|
|
|
|
|
0
|
my ( $self, $text ) = @_; |
1540
|
0
|
|
0
|
|
|
0
|
my $pos = $self->{'_pos'} || $self; |
1541
|
|
|
|
|
|
|
|
1542
|
0
|
|
|
|
|
0
|
if (DEBUG) { |
1543
|
|
|
|
|
|
|
my @lineage_tags = $pos->lineage_tag_names; |
1544
|
|
|
|
|
|
|
my $indent = ' ' x ( 1 + @lineage_tags ); |
1545
|
|
|
|
|
|
|
|
1546
|
|
|
|
|
|
|
my $nugget |
1547
|
|
|
|
|
|
|
= ( length($text) <= 25 ) |
1548
|
|
|
|
|
|
|
? $text |
1549
|
|
|
|
|
|
|
: ( substr( $text, 0, 25 ) . '...' ); |
1550
|
|
|
|
|
|
|
$nugget =~ s<([\x00-\x1F])> |
1551
|
|
|
|
|
|
|
<'\\x'.(unpack("H2",$1))>eg; |
1552
|
|
|
|
|
|
|
print $indent, "Proposing a PI ($nugget) under ", |
1553
|
|
|
|
|
|
|
join( '/', reverse( $pos->{'_tag'}, @lineage_tags ) ) || 'Root', |
1554
|
|
|
|
|
|
|
".\n"; |
1555
|
|
|
|
|
|
|
} |
1556
|
0
|
|
|
|
|
0
|
( my $e = $self->element_class->new('~pi') )->{'text'} = $text; |
1557
|
0
|
|
|
|
|
0
|
$pos->push_content($e); |
1558
|
0
|
|
|
|
|
0
|
++( $self->{'_element_count'} ); |
1559
|
|
|
|
|
|
|
|
1560
|
0
|
0
|
0
|
|
|
0
|
&{ $self->{'_tweak_~pi'} || $self->{'_tweak_*'} || return $e }( map $_, |
|
0
|
|
|
|
|
0
|
|
1561
|
|
|
|
|
|
|
$e, '~pi', $self ); |
1562
|
|
|
|
|
|
|
|
1563
|
0
|
|
|
|
|
0
|
return $e; |
1564
|
|
|
|
|
|
|
} |
1565
|
|
|
|
|
|
|
|
1566
|
|
|
|
|
|
|
#========================================================================== |
1567
|
|
|
|
|
|
|
|
1568
|
|
|
|
|
|
|
#When you call $tree->parse_file($filename), and the |
1569
|
|
|
|
|
|
|
#tree's ignore_ignorable_whitespace attribute is on (as it is |
1570
|
|
|
|
|
|
|
#by default), HTML::TreeBuilder's logic will manage to avoid |
1571
|
|
|
|
|
|
|
#creating some, but not all, nodes that represent ignorable |
1572
|
|
|
|
|
|
|
#whitespace. However, at the end of its parse, it traverses the |
1573
|
|
|
|
|
|
|
#tree and deletes any that it missed. (It does this with an |
1574
|
|
|
|
|
|
|
#around-method around HTML::Parser's eof method.) |
1575
|
|
|
|
|
|
|
# |
1576
|
|
|
|
|
|
|
#However, with $tree->parse($content), the cleanup-traversal step |
1577
|
|
|
|
|
|
|
#doesn't happen automatically -- so when you're done parsing all |
1578
|
|
|
|
|
|
|
#content for a document (regardless of whether $content is the only |
1579
|
|
|
|
|
|
|
#bit, or whether it's just another chunk of content you're parsing into |
1580
|
|
|
|
|
|
|
#the tree), call $tree->eof() to signal that you're at the end of the |
1581
|
|
|
|
|
|
|
#text you're inputting to the tree. Besides properly cleaning any bits |
1582
|
|
|
|
|
|
|
#of ignorable whitespace from the tree, this will also ensure that |
1583
|
|
|
|
|
|
|
#HTML::Parser's internal buffer is flushed. |
1584
|
|
|
|
|
|
|
|
1585
|
|
|
|
|
|
|
sub eof { |
1586
|
|
|
|
|
|
|
|
1587
|
|
|
|
|
|
|
# Accept an "end-of-file" signal from HTML::Parser, or thrown by the user. |
1588
|
|
|
|
|
|
|
|
1589
|
373
|
50
|
|
373
|
1
|
24207
|
return if $_[0]->{'_done'}; # we've already been here |
1590
|
|
|
|
|
|
|
|
1591
|
373
|
50
|
|
|
|
11906
|
return $_[0]->SUPER::eof() if $_[0]->{'_stunted'}; |
1592
|
|
|
|
|
|
|
|
1593
|
373
|
|
|
|
|
11746
|
my $x = $_[0]; |
1594
|
373
|
|
|
|
|
11587
|
print "EOF received.\n" if DEBUG; |
1595
|
373
|
|
|
|
|
11560
|
my (@rv); |
1596
|
373
|
50
|
|
|
|
11901
|
if (wantarray) { |
1597
|
|
|
|
|
|
|
|
1598
|
|
|
|
|
|
|
# I don't think this makes any difference for this particular |
1599
|
|
|
|
|
|
|
# method, but let's be scrupulous, for once. |
1600
|
0
|
|
|
|
|
0
|
@rv = $x->SUPER::eof(); |
1601
|
|
|
|
|
|
|
} |
1602
|
|
|
|
|
|
|
else { |
1603
|
373
|
|
|
|
|
18397
|
$rv[0] = $x->SUPER::eof(); |
1604
|
|
|
|
|
|
|
} |
1605
|
|
|
|
|
|
|
|
1606
|
373
|
100
|
66
|
|
|
13467
|
$x->end('html') unless $x eq ( $x->{'_pos'} || $x ); |
1607
|
|
|
|
|
|
|
|
1608
|
|
|
|
|
|
|
# That SHOULD close everything, and will run the appropriate tweaks. |
1609
|
|
|
|
|
|
|
# We /could/ be running under some insane mode such that there's more |
1610
|
|
|
|
|
|
|
# than one HTML element, but really, that's just insane to do anyhow. |
1611
|
|
|
|
|
|
|
|
1612
|
373
|
100
|
|
|
|
12124
|
unless ( $x->{'_implicit_tags'} ) { |
1613
|
|
|
|
|
|
|
|
1614
|
|
|
|
|
|
|
# delete those silly implicit head and body in case we put |
1615
|
|
|
|
|
|
|
# them there in implicit tags mode |
1616
|
9
|
|
|
|
|
22
|
foreach my $node ( $x->{'_head'}, $x->{'_body'} ) { |
1617
|
|
|
|
|
|
|
$node->replace_with_content |
1618
|
|
|
|
|
|
|
if defined $node |
1619
|
|
|
|
|
|
|
and ref $node |
1620
|
|
|
|
|
|
|
and $node->{'_implicit'} |
1621
|
18
|
50
|
33
|
|
|
167
|
and $node->{'_parent'}; |
|
|
|
33
|
|
|
|
|
|
|
|
33
|
|
|
|
|
1622
|
|
|
|
|
|
|
|
1623
|
|
|
|
|
|
|
# I think they should be empty anyhow, since the only |
1624
|
|
|
|
|
|
|
# logic that'd insert under them can apply only, I think, |
1625
|
|
|
|
|
|
|
# in the case where _implicit_tags is on |
1626
|
|
|
|
|
|
|
} |
1627
|
|
|
|
|
|
|
|
1628
|
|
|
|
|
|
|
# this may still leave an implicit 'html' at the top, but there's |
1629
|
|
|
|
|
|
|
# nothing we can do about that, is there? |
1630
|
|
|
|
|
|
|
} |
1631
|
|
|
|
|
|
|
|
1632
|
|
|
|
|
|
|
$x->delete_ignorable_whitespace() |
1633
|
|
|
|
|
|
|
|
1634
|
|
|
|
|
|
|
# this's why we trap this -- an after-method |
1635
|
373
|
100
|
66
|
|
|
13757
|
if $x->{'_tighten'} and !$x->{'_ignore_text'}; |
1636
|
373
|
|
|
|
|
11808
|
$x->{'_done'} = 1; |
1637
|
|
|
|
|
|
|
|
1638
|
373
|
50
|
|
|
|
11946
|
return @rv if wantarray; |
1639
|
373
|
|
|
|
|
23805
|
return $rv[0]; |
1640
|
|
|
|
|
|
|
} |
1641
|
|
|
|
|
|
|
|
1642
|
|
|
|
|
|
|
#========================================================================== |
1643
|
|
|
|
|
|
|
|
1644
|
|
|
|
|
|
|
# TODO: document |
1645
|
|
|
|
|
|
|
|
1646
|
|
|
|
|
|
|
sub stunt { |
1647
|
0
|
|
|
0
|
1
|
0
|
my $self = $_[0]; |
1648
|
0
|
|
|
|
|
0
|
print "Stunting the tree.\n" if DEBUG; |
1649
|
0
|
|
|
|
|
0
|
$self->{'_done'} = 1; |
1650
|
|
|
|
|
|
|
|
1651
|
0
|
0
|
|
|
|
0
|
if ( $HTML::Parser::VERSION < 3 ) { |
1652
|
|
|
|
|
|
|
|
1653
|
|
|
|
|
|
|
#This is a MEAN MEAN HACK. And it works most of the time! |
1654
|
0
|
|
|
|
|
0
|
$self->{'_buf'} = ''; |
1655
|
0
|
|
|
|
|
0
|
my $fh = *HTML::Parser::F{IO}; |
1656
|
|
|
|
|
|
|
|
1657
|
|
|
|
|
|
|
# the local'd FH used by parse_file loop |
1658
|
0
|
0
|
|
|
|
0
|
if ( defined $fh ) { |
1659
|
0
|
|
|
|
|
0
|
print "Closing Parser's filehandle $fh\n" if DEBUG; |
1660
|
0
|
|
|
|
|
0
|
close($fh); |
1661
|
|
|
|
|
|
|
} |
1662
|
|
|
|
|
|
|
|
1663
|
|
|
|
|
|
|
# But if they called $tree->parse_file($filehandle) |
1664
|
|
|
|
|
|
|
# or $tree->parse_file(*IO), then there will be no *HTML::Parser::F{IO} |
1665
|
|
|
|
|
|
|
# to close. Ahwell. Not a problem for most users these days. |
1666
|
|
|
|
|
|
|
|
1667
|
|
|
|
|
|
|
} |
1668
|
|
|
|
|
|
|
else { |
1669
|
0
|
|
|
|
|
0
|
$self->SUPER::eof(); |
1670
|
|
|
|
|
|
|
|
1671
|
|
|
|
|
|
|
# Under 3+ versions, calling eof from inside a parse will abort the |
1672
|
|
|
|
|
|
|
# parse / parse_file |
1673
|
|
|
|
|
|
|
} |
1674
|
|
|
|
|
|
|
|
1675
|
|
|
|
|
|
|
# In the off chance that the above didn't work, we'll throw |
1676
|
|
|
|
|
|
|
# this flag to make any future events be no-ops. |
1677
|
0
|
|
|
|
|
0
|
$self->stunted(1); |
1678
|
0
|
|
|
|
|
0
|
return; |
1679
|
|
|
|
|
|
|
} |
1680
|
|
|
|
|
|
|
|
1681
|
|
|
|
|
|
|
# TODO: document |
1682
|
0
|
|
|
0
|
1
|
0
|
sub stunted { shift->_elem( '_stunted', @_ ); } |
1683
|
0
|
|
|
0
|
1
|
0
|
sub done { shift->_elem( '_done', @_ ); } |
1684
|
|
|
|
|
|
|
|
1685
|
|
|
|
|
|
|
#========================================================================== |
1686
|
|
|
|
|
|
|
|
1687
|
|
|
|
|
|
|
sub delete { |
1688
|
|
|
|
|
|
|
|
1689
|
|
|
|
|
|
|
# Override Element's delete method. |
1690
|
|
|
|
|
|
|
# This does most, if not all, of what Element's delete does anyway. |
1691
|
|
|
|
|
|
|
# Deletes content, including content in some special attributes. |
1692
|
|
|
|
|
|
|
# But doesn't empty out the hash. |
1693
|
|
|
|
|
|
|
|
1694
|
270
|
|
|
270
|
1
|
90470
|
$_[0]->{'_element_count'} = 1; # never hurts to be scrupulously correct |
1695
|
|
|
|
|
|
|
|
1696
|
270
|
|
|
|
|
4095
|
delete @{ $_[0] }{ '_body', '_head', '_pos' }; |
|
270
|
|
|
|
|
8212
|
|
1697
|
270
|
|
|
|
|
4053
|
for ( |
1698
|
270
|
50
|
|
|
|
8288
|
@{ delete( $_[0]->{'_content'} ) || [] }, # all/any content |
1699
|
|
|
|
|
|
|
|
1700
|
|
|
|
|
|
|
# delete @{$_[0]}{'_body', '_head', '_pos'} |
1701
|
|
|
|
|
|
|
# ...and these, in case these elements don't appear in the |
1702
|
|
|
|
|
|
|
# content, which is possible. If they did appear (as they |
1703
|
|
|
|
|
|
|
# usually do), then calling $_->delete on them again is harmless. |
1704
|
|
|
|
|
|
|
# I don't think that's such a hot idea now. Thru creative reattachment, |
1705
|
|
|
|
|
|
|
# those could actually now point to elements in OTHER trees (which we do |
1706
|
|
|
|
|
|
|
# NOT want to delete!). |
1707
|
|
|
|
|
|
|
## Reasoned out: |
1708
|
|
|
|
|
|
|
# If these point to elements not in the content list of any element in this |
1709
|
|
|
|
|
|
|
# tree, but not in the content list of any element in any OTHER tree, then |
1710
|
|
|
|
|
|
|
# just deleting these will make their refcounts hit zero. |
1711
|
|
|
|
|
|
|
# If these point to elements in the content lists of elements in THIS tree, |
1712
|
|
|
|
|
|
|
# then we'll get to deleting them when we delete from the top. |
1713
|
|
|
|
|
|
|
# If these point to elements in the content lists of elements in SOME OTHER |
1714
|
|
|
|
|
|
|
# tree, then they're not to be deleted. |
1715
|
|
|
|
|
|
|
) |
1716
|
|
|
|
|
|
|
{ |
1717
|
540
|
50
|
33
|
|
|
7840
|
$_->delete |
|
|
|
33
|
|
|
|
|
1718
|
|
|
|
|
|
|
if defined $_ and ref $_ # Make sure it's an object. |
1719
|
|
|
|
|
|
|
and $_ ne $_[0]; # And avoid hitting myself, just in case! |
1720
|
|
|
|
|
|
|
} |
1721
|
|
|
|
|
|
|
|
1722
|
270
|
0
|
33
|
|
|
4545
|
$_[0]->detach if $_[0]->{'_parent'} and $_[0]->{'_parent'}{'_content'}; |
1723
|
|
|
|
|
|
|
|
1724
|
|
|
|
|
|
|
# An 'html' element having a parent is quite unlikely. |
1725
|
|
|
|
|
|
|
|
1726
|
270
|
|
|
|
|
17306
|
return; |
1727
|
|
|
|
|
|
|
} |
1728
|
|
|
|
|
|
|
|
1729
|
|
|
|
|
|
|
sub tighten_up { # legacy |
1730
|
0
|
|
|
0
|
1
|
0
|
shift->delete_ignorable_whitespace(@_); |
1731
|
|
|
|
|
|
|
} |
1732
|
|
|
|
|
|
|
|
1733
|
|
|
|
|
|
|
sub elementify { |
1734
|
|
|
|
|
|
|
|
1735
|
|
|
|
|
|
|
# Rebless this object down into the normal element class. |
1736
|
27
|
|
|
27
|
1
|
4295
|
my $self = $_[0]; |
1737
|
27
|
|
|
|
|
4337
|
my $to_class = $self->element_class; |
1738
|
27
|
|
|
|
|
8874
|
delete @{$self}{ |
1739
|
|
|
|
|
|
|
grep { |
1740
|
27
|
|
|
|
|
4412
|
; |
1741
|
621
|
100
|
33
|
|
|
206618
|
length $_ and substr( $_, 0, 1 ) eq '_' |
|
|
|
66
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
66
|
|
|
|
|
1742
|
|
|
|
|
|
|
|
1743
|
|
|
|
|
|
|
# The private attributes that we'll retain: |
1744
|
|
|
|
|
|
|
and $_ ne '_tag' |
1745
|
|
|
|
|
|
|
and $_ ne '_parent' |
1746
|
|
|
|
|
|
|
and $_ ne '_content' |
1747
|
|
|
|
|
|
|
and $_ ne '_implicit' |
1748
|
|
|
|
|
|
|
and $_ ne '_pos' |
1749
|
|
|
|
|
|
|
and $_ ne '_element_class' |
1750
|
|
|
|
|
|
|
and $_ ne '_encoding' |
1751
|
|
|
|
|
|
|
} keys %$self |
1752
|
|
|
|
|
|
|
}; |
1753
|
27
|
|
|
|
|
8696
|
bless $self, $to_class; # Returns the same object we were fed |
1754
|
|
|
|
|
|
|
} |
1755
|
|
|
|
|
|
|
|
1756
|
|
|
|
|
|
|
sub element_class { |
1757
|
4637
|
100
|
|
4637
|
1
|
67216
|
return 'HTML::Element' if not ref $_[0]; |
1758
|
4258
|
|
50
|
|
|
89971
|
return $_[0]->{_element_class} || 'HTML::Element'; |
1759
|
|
|
|
|
|
|
} |
1760
|
|
|
|
|
|
|
|
1761
|
|
|
|
|
|
|
#-------------------------------------------------------------------------- |
1762
|
|
|
|
|
|
|
|
1763
|
|
|
|
|
|
|
sub guts { |
1764
|
0
|
|
|
0
|
1
|
|
my @out; |
1765
|
0
|
|
|
|
|
|
my @stack = ( $_[0] ); |
1766
|
0
|
|
|
|
|
|
my $destructive = $_[1]; |
1767
|
0
|
|
|
|
|
|
my $this; |
1768
|
0
|
|
|
|
|
|
while (@stack) { |
1769
|
0
|
|
|
|
|
|
$this = shift @stack; |
1770
|
0
|
0
|
|
|
|
|
if ( !ref $this ) { |
|
|
0
|
|
|
|
|
|
1771
|
0
|
|
|
|
|
|
push @out, $this; # yes, it can include text nodes |
1772
|
|
|
|
|
|
|
} |
1773
|
|
|
|
|
|
|
elsif ( !$this->{'_implicit'} ) { |
1774
|
0
|
|
|
|
|
|
push @out, $this; |
1775
|
0
|
0
|
|
|
|
|
delete $this->{'_parent'} if $destructive; |
1776
|
|
|
|
|
|
|
} |
1777
|
|
|
|
|
|
|
else { |
1778
|
|
|
|
|
|
|
|
1779
|
|
|
|
|
|
|
# it's an implicit node. Delete it and recurse |
1780
|
0
|
0
|
|
|
|
|
delete $this->{'_parent'} if $destructive; |
1781
|
|
|
|
|
|
|
unshift @stack, |
1782
|
|
|
|
|
|
|
@{ |
1783
|
0
|
|
|
|
|
|
( $destructive |
1784
|
|
|
|
|
|
|
? delete( $this->{'_content'} ) |
1785
|
0
|
0
|
|
|
|
|
: $this->{'_content'} |
|
|
0
|
|
|
|
|
|
1786
|
|
|
|
|
|
|
) |
1787
|
|
|
|
|
|
|
|| [] |
1788
|
|
|
|
|
|
|
}; |
1789
|
|
|
|
|
|
|
} |
1790
|
|
|
|
|
|
|
} |
1791
|
|
|
|
|
|
|
|
1792
|
|
|
|
|
|
|
# Doesn't call a real $root->delete on the (when implicit) root, |
1793
|
|
|
|
|
|
|
# but I don't think it needs to. |
1794
|
|
|
|
|
|
|
|
1795
|
0
|
0
|
|
|
|
|
return @out if wantarray; # one simple normal case. |
1796
|
0
|
0
|
|
|
|
|
return unless @out; |
1797
|
0
|
0
|
0
|
|
|
|
return $out[0] if @out == 1 and ref( $out[0] ); |
1798
|
0
|
|
|
|
|
|
my $x = HTML::Element->new( 'div', '_implicit' => 1 ); |
1799
|
0
|
|
|
|
|
|
$x->push_content(@out); |
1800
|
0
|
|
|
|
|
|
return $x; |
1801
|
|
|
|
|
|
|
} |
1802
|
|
|
|
|
|
|
|
1803
|
0
|
|
|
0
|
1
|
|
sub disembowel { $_[0]->guts(1) } |
1804
|
|
|
|
|
|
|
|
1805
|
|
|
|
|
|
|
#-------------------------------------------------------------------------- |
1806
|
|
|
|
|
|
|
1; |
1807
|
|
|
|
|
|
|
|
1808
|
|
|
|
|
|
|
__END__ |