line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package HTML::TokeParser::Simple; |
2
|
|
|
|
|
|
|
|
3
|
5
|
|
|
5
|
|
153280
|
use strict; |
|
5
|
|
|
|
|
11
|
|
|
5
|
|
|
|
|
216
|
|
4
|
5
|
|
|
5
|
|
5239
|
use HTML::TokeParser; |
|
5
|
|
|
|
|
63654
|
|
|
5
|
|
|
|
|
160
|
|
5
|
5
|
|
|
5
|
|
4804
|
use HTML::TokeParser::Simple::Token; |
|
5
|
|
|
|
|
14
|
|
|
5
|
|
|
|
|
135
|
|
6
|
5
|
|
|
5
|
|
3773
|
use HTML::TokeParser::Simple::Token::Tag; |
|
5
|
|
|
|
|
12
|
|
|
5
|
|
|
|
|
167
|
|
7
|
5
|
|
|
5
|
|
2568
|
use HTML::TokeParser::Simple::Token::Tag::Start; |
|
5
|
|
|
|
|
10
|
|
|
5
|
|
|
|
|
127
|
|
8
|
5
|
|
|
5
|
|
2473
|
use HTML::TokeParser::Simple::Token::Tag::End; |
|
5
|
|
|
|
|
11
|
|
|
5
|
|
|
|
|
126
|
|
9
|
5
|
|
|
5
|
|
8332
|
use HTML::TokeParser::Simple::Token::Text; |
|
5
|
|
|
|
|
11
|
|
|
5
|
|
|
|
|
122
|
|
10
|
5
|
|
|
5
|
|
2738
|
use HTML::TokeParser::Simple::Token::Comment; |
|
5
|
|
|
|
|
11
|
|
|
5
|
|
|
|
|
119
|
|
11
|
5
|
|
|
5
|
|
2228
|
use HTML::TokeParser::Simple::Token::Declaration; |
|
5
|
|
|
|
|
39
|
|
|
5
|
|
|
|
|
102
|
|
12
|
5
|
|
|
5
|
|
2160
|
use HTML::TokeParser::Simple::Token::ProcessInstruction; |
|
5
|
|
|
|
|
10
|
|
|
5
|
|
|
|
|
167
|
|
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
our $VERSION = '3.16'; |
15
|
5
|
|
|
5
|
|
22
|
use base 'HTML::TokeParser'; |
|
5
|
|
|
|
|
7
|
|
|
5
|
|
|
|
|
3841
|
|
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
# constructors |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
my %FACTORY_CLASSES = ( |
20
|
|
|
|
|
|
|
S => 'HTML::TokeParser::Simple::Token::Tag::Start', |
21
|
|
|
|
|
|
|
E => 'HTML::TokeParser::Simple::Token::Tag::End', |
22
|
|
|
|
|
|
|
T => 'HTML::TokeParser::Simple::Token::Text', |
23
|
|
|
|
|
|
|
C => 'HTML::TokeParser::Simple::Token::Comment', |
24
|
|
|
|
|
|
|
D => 'HTML::TokeParser::Simple::Token::Declaration', |
25
|
|
|
|
|
|
|
PI => 'HTML::TokeParser::Simple::Token::ProcessInstruction', |
26
|
|
|
|
|
|
|
); |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
sub _croak { |
29
|
3
|
|
|
3
|
|
8
|
my ($proto, $message) = @_; |
30
|
3
|
|
|
|
|
23
|
require Carp; |
31
|
3
|
|
|
|
|
765
|
Carp::croak($message); |
32
|
|
|
|
|
|
|
} |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
sub new { |
35
|
19
|
|
|
19
|
1
|
130031
|
my ($class, @args) = @_; |
36
|
19
|
100
|
|
|
|
158
|
return 1 == @args |
37
|
|
|
|
|
|
|
? $class->SUPER::new(@args) |
38
|
|
|
|
|
|
|
: $class->_init(@args); |
39
|
|
|
|
|
|
|
} |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
sub _init { |
42
|
8
|
|
|
8
|
|
143
|
my ($class, $source_type, $source) = @_; |
43
|
|
|
|
|
|
|
my %sources = ( |
44
|
3
|
|
|
3
|
|
16
|
file => sub { $source }, |
45
|
1
|
|
|
1
|
|
4
|
handle => sub { $source }, |
46
|
1
|
|
|
1
|
|
3
|
string => sub { \$source }, |
47
|
|
|
|
|
|
|
url => sub { |
48
|
2
|
|
|
2
|
|
100
|
eval "require LWP::Simple"; |
49
|
2
|
50
|
|
|
|
11
|
$class->_croak("Cannot load LWP::Simple: $@") if $@; |
50
|
2
|
|
|
|
|
7
|
my $content = LWP::Simple::get($source); |
51
|
2
|
100
|
|
|
|
16
|
$class->_croak("Could not fetch content from ($source)") |
52
|
|
|
|
|
|
|
unless defined $content; |
53
|
1
|
|
|
|
|
6
|
return \$content; |
54
|
|
|
|
|
|
|
}, |
55
|
8
|
|
|
|
|
111
|
); |
56
|
8
|
100
|
|
|
|
32
|
unless (exists $sources{$source_type}) { |
57
|
1
|
|
|
|
|
6
|
$class->_croak("Unknown source type ($source_type)"); |
58
|
|
|
|
|
|
|
} |
59
|
7
|
|
|
|
|
22
|
return $class->new($sources{$source_type}->()); |
60
|
|
|
|
|
|
|
} |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
sub get_token { |
63
|
226
|
|
|
226
|
1
|
16232
|
my $self = shift; |
64
|
226
|
|
|
|
|
450
|
my @args = @_; |
65
|
226
|
|
|
|
|
667
|
my $token = $self->SUPER::get_token( @args ); |
66
|
226
|
100
|
|
|
|
4273
|
return unless defined $token; |
67
|
223
|
50
|
|
|
|
542
|
if (my $factory_class = $FACTORY_CLASSES{$token->[0]}) { |
68
|
223
|
|
|
|
|
965
|
return $factory_class->new($token); |
69
|
|
|
|
|
|
|
} |
70
|
|
|
|
|
|
|
else { |
71
|
|
|
|
|
|
|
# this should never happen |
72
|
0
|
|
|
|
|
0
|
$self->_croak("Cannot determine token class for token (@$token)"); |
73
|
|
|
|
|
|
|
} |
74
|
|
|
|
|
|
|
} |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
sub get_tag { |
77
|
12
|
|
|
12
|
1
|
21104
|
my $self = shift; |
78
|
12
|
|
|
|
|
31
|
my @args = @_; |
79
|
12
|
|
|
|
|
150
|
my $token = $self->SUPER::get_tag( @args ); |
80
|
12
|
50
|
|
|
|
144
|
return unless defined $token; |
81
|
12
|
100
|
|
|
|
68
|
return $token->[0] =~ /^\// |
82
|
|
|
|
|
|
|
? HTML::TokeParser::Simple::Token::Tag::End->new($token) |
83
|
|
|
|
|
|
|
: HTML::TokeParser::Simple::Token::Tag::Start->new($token); |
84
|
|
|
|
|
|
|
} |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
sub peek { |
87
|
7
|
|
|
7
|
1
|
12
|
my ($self, $count) = @_; |
88
|
7
|
|
100
|
|
|
22
|
$count ||= 1; |
89
|
|
|
|
|
|
|
|
90
|
7
|
100
|
|
|
|
31
|
unless ($count =~ /^\d+$/) { |
91
|
1
|
|
|
|
|
5
|
$self->_croak("Argument to peek() must be a positive integer, not ($count)"); |
92
|
|
|
|
|
|
|
} |
93
|
|
|
|
|
|
|
|
94
|
6
|
|
|
|
|
7
|
my $items = 0; |
95
|
6
|
|
|
|
|
10
|
my $html = ''; |
96
|
6
|
|
|
|
|
7
|
my @tokens; |
97
|
6
|
|
100
|
|
|
22
|
while ( $items++ < $count && defined ( my $token = $self->get_token ) ) { |
98
|
61
|
|
|
|
|
166
|
$html .= $token->as_is; |
99
|
61
|
|
|
|
|
237
|
push @tokens, $token; |
100
|
|
|
|
|
|
|
} |
101
|
6
|
|
|
|
|
25
|
$self->unget_token(@tokens); |
102
|
6
|
|
|
|
|
60
|
return $html; |
103
|
|
|
|
|
|
|
} |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
1; |
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
__END__ |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
=head1 NAME |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
HTML::TokeParser::Simple - Easy to use C<HTML::TokeParser> interface |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
=head1 SYNOPSIS |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
use HTML::TokeParser::Simple; |
116
|
|
|
|
|
|
|
my $p = HTML::TokeParser::Simple->new( $somefile ); |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
while ( my $token = $p->get_token ) { |
119
|
|
|
|
|
|
|
# This prints all text in an HTML doc (i.e., it strips the HTML) |
120
|
|
|
|
|
|
|
next unless $token->is_text; |
121
|
|
|
|
|
|
|
print $token->as_is; |
122
|
|
|
|
|
|
|
} |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
=head1 DESCRIPTION |
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
C<HTML::TokeParser> is an excellent module that's often used for parsing HTML. |
128
|
|
|
|
|
|
|
However, the tokens returned are not exactly intuitive to parse: |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
["S", $tag, $attr, $attrseq, $text] |
131
|
|
|
|
|
|
|
["E", $tag, $text] |
132
|
|
|
|
|
|
|
["T", $text, $is_data] |
133
|
|
|
|
|
|
|
["C", $text] |
134
|
|
|
|
|
|
|
["D", $text] |
135
|
|
|
|
|
|
|
["PI", $token0, $text] |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
To simplify this, C<HTML::TokeParser::Simple> allows the user ask more |
138
|
|
|
|
|
|
|
intuitive (read: more self-documenting) questions about the tokens returned. |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
You can also rebuild some tags on the fly. Frequently, the attributes |
141
|
|
|
|
|
|
|
associated with start tags need to be altered, added to, or deleted. This |
142
|
|
|
|
|
|
|
functionality is built in. |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
Since this is a subclass of C<HTML::TokeParser>, all C<HTML::TokeParser> |
145
|
|
|
|
|
|
|
methods are available. To truly appreciate the power of this module, please |
146
|
|
|
|
|
|
|
read the documentation for C<HTML::TokeParser> and C<HTML::Parser>. |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
=head1 CONTRUCTORS |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
=head2 C<new($source)> |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
The constructor for C<HTML::TokeParser::Simple> can be used just like |
153
|
|
|
|
|
|
|
C<HTML::TokeParser>'s constructor: |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
my $parser = HTML::TokeParser::Simple->new($filename); |
156
|
|
|
|
|
|
|
# or |
157
|
|
|
|
|
|
|
my $parser = HTML::TokeParser::Simple->new($filehandle); |
158
|
|
|
|
|
|
|
# or |
159
|
|
|
|
|
|
|
my $parser = HTML::TokeParser::Simple->new(\$html_string); |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
=head2 C<new($source_type, $source)> |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
If you wish to be more explicit, there is a new style of |
164
|
|
|
|
|
|
|
constructor available. |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
my $parser = HTML::TokeParser::Simple->new(file => $filename); |
167
|
|
|
|
|
|
|
# or |
168
|
|
|
|
|
|
|
my $parser = HTML::TokeParser::Simple->new(handle => $filehandle); |
169
|
|
|
|
|
|
|
# or |
170
|
|
|
|
|
|
|
my $parser = HTML::TokeParser::Simple->new(string => $html_string); |
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
Note that you do not have to provide a reference for the string if using the |
173
|
|
|
|
|
|
|
string constructor. |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
As a convenience, you can also attempt to fetch the HTML directly from a URL. |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
my $parser = HTML::TokeParser::Simple->new(url => 'http://some.url'); |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
This method relies on C<LWP::Simple>. If this module is not found or the page |
180
|
|
|
|
|
|
|
cannot be fetched, the constructor will C<croak()>. |
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
=head1 PARSER METHODS |
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
=head2 get_token |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
This method will return the next token that C<HTML::TokeParser::get_token()> |
187
|
|
|
|
|
|
|
method would return. However, it will be blessed into a class appropriate |
188
|
|
|
|
|
|
|
which represents the token type. |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
=head2 get_tag |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
This method will return the next token that C<HTML::TokeParser::get_tag()> |
193
|
|
|
|
|
|
|
method would return. However, it will be blessed into either the |
194
|
|
|
|
|
|
|
L<HTML::TokeParser::Simple::Token::Tag::Start> or |
195
|
|
|
|
|
|
|
L<HTML::TokeParser::Simple::Token::Tag::End> class. |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
=head2 peek |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
As of version C<3.14>, you can now C<peek()> at the upcomings tokens without |
200
|
|
|
|
|
|
|
affecting the state of the parser. By default, C<peek()> will return the text |
201
|
|
|
|
|
|
|
of the next token, but specifying an integer C<$count> will return the text of |
202
|
|
|
|
|
|
|
the next C<$count> tokens. |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
This is useful when you're trying to debug where you are in a document. |
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
warn $parser->peek(3); # show the next 3 tokens |
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
=head1 ACCESSORS |
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
The following methods may be called on the token object which is returned, |
211
|
|
|
|
|
|
|
not on the parser object. |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
=head2 Boolean Accessors |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
These accessors return true or false. |
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
=over 4 |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
=item * C<is_tag([$tag])> |
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
Use this to determine if you have any tag. An optional "tag type" may be |
222
|
|
|
|
|
|
|
passed. This will allow you to match if it's a I<particular> tag. The |
223
|
|
|
|
|
|
|
supplied tag is case-insensitive. |
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
if ( $token->is_tag ) { ... } |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
Optionally, you may pass a regular expression as an argument. |
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
=item * C<is_start_tag([$tag])> |
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
Use this to determine if you have a start tag. An optional "tag type" may be |
232
|
|
|
|
|
|
|
passed. This will allow you to match if it's a I<particular> start tag. The |
233
|
|
|
|
|
|
|
supplied tag is case-insensitive. |
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
if ( $token->is_start_tag ) { ... } |
236
|
|
|
|
|
|
|
if ( $token->is_start_tag( 'font' ) ) { ... } |
237
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
Optionally, you may pass a regular expression as an argument. To match all |
239
|
|
|
|
|
|
|
header (h1, h2, ... h6) tags: |
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
if ( $token->is_start_tag( qr/^h[123456]$/ ) ) { ... } |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
=item * C<is_end_tag([$tag])> |
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
Use this to determine if you have an end tag. An optional "tag type" may be |
246
|
|
|
|
|
|
|
passed. This will allow you to match if it's a I<particular> end tag. The |
247
|
|
|
|
|
|
|
supplied tag is case-insensitive. |
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
When testing for an end tag, the forward slash on the tag is optional. |
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
while ( $token = $p->get_token ) { |
252
|
|
|
|
|
|
|
if ( $token->is_end_tag( 'form' ) ) { ... } |
253
|
|
|
|
|
|
|
} |
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
Or: |
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
while ( $token = $p->get_token ) { |
258
|
|
|
|
|
|
|
if ( $token->is_end_tag( '/form' ) ) { ... } |
259
|
|
|
|
|
|
|
} |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
Optionally, you may pass a regular expression as an argument. |
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
=item * C<is_text()> |
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
Use this to determine if you have text. Note that this is I<not> to be |
266
|
|
|
|
|
|
|
confused with the C<return_text> (I<deprecated>) method described below! |
267
|
|
|
|
|
|
|
C<is_text> will identify text that the user typically sees display in the Web |
268
|
|
|
|
|
|
|
browser. |
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
=item * C<is_comment()> |
271
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
Are you still reading this? Nobody reads POD. Don't you know you're supposed |
273
|
|
|
|
|
|
|
to go to CLPM, ask a question that's answered in the POD and get flamed? It's |
274
|
|
|
|
|
|
|
a rite of passage. |
275
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
Really. |
277
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
C<is_comment> is used to identify comments. See the HTML::Parser documentation |
279
|
|
|
|
|
|
|
for more information about comments. There's more than you might think. |
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
=item * C<is_declaration()> |
282
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
This will match the DTD at the top of your HTML. (You I<do> use DTD's, don't |
284
|
|
|
|
|
|
|
you?) |
285
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
=item * C<is_process_instruction()> |
287
|
|
|
|
|
|
|
|
288
|
|
|
|
|
|
|
Process Instructions are from XML. This is very handy if you need to parse out |
289
|
|
|
|
|
|
|
PHP and similar things with a parser. |
290
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
Currently, there appear to be some problems with process instructions. You can |
292
|
|
|
|
|
|
|
override C<HTML::TokeParser::Simple::Token::ProcessInstruction> if you need to. |
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
=item * C<is_pi()> |
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
This is a shorthand for C<is_process_instruction()>. |
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
=back |
299
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
=head2 Data Accessors |
301
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
Some of these were originally C<return_> methods, but that name was not only |
303
|
|
|
|
|
|
|
unwieldy, but also went against reasonable conventions. The C<get_> methods |
304
|
|
|
|
|
|
|
listed below still have C<return_> methods available for backwards |
305
|
|
|
|
|
|
|
compatibility reasons, but they merely call their C<get_> counterpart. For |
306
|
|
|
|
|
|
|
example, calling C<return_tag()> actually calls C<get_tag()> internally. |
307
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
=over 4 |
309
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
=item * C<get_tag()> |
311
|
|
|
|
|
|
|
|
312
|
|
|
|
|
|
|
Do you have a start tag or end tag? This will return the type (lower case). |
313
|
|
|
|
|
|
|
Note that this is I<not> the same as the C<get_tag()> method on the actual |
314
|
|
|
|
|
|
|
parser object. |
315
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
=item * C<get_attr([$attribute])> |
317
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
If you have a start tag, this will return a hash ref with the attribute names |
319
|
|
|
|
|
|
|
as keys and the values as the values. |
320
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
If you pass in an attribute name, it will return the value for just that |
322
|
|
|
|
|
|
|
attribute. |
323
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
Returns false if the token is not a start tag. |
325
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
=item * C<get_attrseq()> |
327
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
For a start tag, this is an array reference with the sequence of the |
329
|
|
|
|
|
|
|
attributes, if any. |
330
|
|
|
|
|
|
|
|
331
|
|
|
|
|
|
|
Returns false if the token is not a start tag. |
332
|
|
|
|
|
|
|
|
333
|
|
|
|
|
|
|
=item * C<return_text()> |
334
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
This method has been heavily deprecated (for a couple of years) in favor of |
336
|
|
|
|
|
|
|
C<as_is>. Programmers were getting confused over the difference between |
337
|
|
|
|
|
|
|
C<is_text>, C<return_text>, and some parser methods such as |
338
|
|
|
|
|
|
|
C<HTML::TokeParser::get_text> and friends. |
339
|
|
|
|
|
|
|
|
340
|
|
|
|
|
|
|
Using this method still succeeds, but will now carp and B<will be removed> |
341
|
|
|
|
|
|
|
in the next major release of this module. |
342
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
=item * C<as_is()> |
344
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
This is the exact text of whatever the token is representing. |
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
=item * C<get_token0()> |
348
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
For processing instructions, this will return the token found immediately after |
350
|
|
|
|
|
|
|
the opening tag. Example: For <?php, "php" will be the start of the returned |
351
|
|
|
|
|
|
|
string. |
352
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
Note that process instruction handling appears to be incomplete in |
354
|
|
|
|
|
|
|
C<HTML::TokeParser>. |
355
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
Returns false if the token is not a process instruction. |
357
|
|
|
|
|
|
|
|
358
|
|
|
|
|
|
|
=back |
359
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
=head1 MUTATORS |
361
|
|
|
|
|
|
|
|
362
|
|
|
|
|
|
|
The C<delete_attr()> and C<set_attr()> methods allow the programmer to rewrite |
363
|
|
|
|
|
|
|
start tag attributes on the fly. It should be noted that bad HTML will be |
364
|
|
|
|
|
|
|
"corrected" by this. Specifically, the new tag will have all attributes |
365
|
|
|
|
|
|
|
lower-cased with the values properly quoted. |
366
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
Self-closing tags (e.g. E<lt>hr /E<gt>) are also handled correctly. Some older |
368
|
|
|
|
|
|
|
browsers require a space prior to the final slash in a self-closed tag. If |
369
|
|
|
|
|
|
|
such a space is detected in the original HTML, it will be preserved. |
370
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
Calling a mutator on an token type that does not support that property is a |
372
|
|
|
|
|
|
|
no-op. For example: |
373
|
|
|
|
|
|
|
|
374
|
|
|
|
|
|
|
if ($token->is_comment) { |
375
|
|
|
|
|
|
|
$token->set_attr(foo => 'bar'); # does nothing |
376
|
|
|
|
|
|
|
} |
377
|
|
|
|
|
|
|
|
378
|
|
|
|
|
|
|
=over 4 |
379
|
|
|
|
|
|
|
|
380
|
|
|
|
|
|
|
=item * C<delete_attr($name)> |
381
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
This method attempts to delete the attribute specified. It will silently fail |
383
|
|
|
|
|
|
|
if called on anything other than a start tag. The argument is |
384
|
|
|
|
|
|
|
case-insensitive, but must otherwise be an exact match of the attribute you are |
385
|
|
|
|
|
|
|
attempting to delete. If the attribute is not found, the method will return |
386
|
|
|
|
|
|
|
without changing the tag. |
387
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
# <body bgcolor="#FFFFFF"> |
389
|
|
|
|
|
|
|
$token->delete_attr('bgcolor'); |
390
|
|
|
|
|
|
|
print $token->as_is; |
391
|
|
|
|
|
|
|
# <body> |
392
|
|
|
|
|
|
|
|
393
|
|
|
|
|
|
|
After this method is called, if successful, the C<as_is()>, C<get_attr()> |
394
|
|
|
|
|
|
|
and C<get_attrseq()> methods will all return updated results. |
395
|
|
|
|
|
|
|
|
396
|
|
|
|
|
|
|
=item * C<set_attr($name,$value)> |
397
|
|
|
|
|
|
|
|
398
|
|
|
|
|
|
|
This method will set the value of an attribute. If the attribute is not found, |
399
|
|
|
|
|
|
|
then C<get_attrseq()> will have the new attribute listed at the end. |
400
|
|
|
|
|
|
|
|
401
|
|
|
|
|
|
|
# <p> |
402
|
|
|
|
|
|
|
$token->set_attr(class => 'some_class'); |
403
|
|
|
|
|
|
|
print $token->as_is; |
404
|
|
|
|
|
|
|
# <p class="some_class"> |
405
|
|
|
|
|
|
|
|
406
|
|
|
|
|
|
|
# <body bgcolor="#FFFFFF"> |
407
|
|
|
|
|
|
|
$token->set_attr('bgcolor','red'); |
408
|
|
|
|
|
|
|
print $token->as_is; |
409
|
|
|
|
|
|
|
# <body bgcolor="red"> |
410
|
|
|
|
|
|
|
|
411
|
|
|
|
|
|
|
After this method is called, if successful, the C<as_is()>, C<get_attr()> |
412
|
|
|
|
|
|
|
and C<get_attrseq()> methods will all return updated results. |
413
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
=item * C<set_attr($hashref)> |
415
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
Under the premise that C<set_> methods should accept what their corresponding |
417
|
|
|
|
|
|
|
C<get_> methods emit, the following works: |
418
|
|
|
|
|
|
|
|
419
|
|
|
|
|
|
|
$tag->set_attr($tag->get_attr); |
420
|
|
|
|
|
|
|
|
421
|
|
|
|
|
|
|
Theoretically that's a no-op and for purposes of rendering HTML, it should be. |
422
|
|
|
|
|
|
|
However, internally this calls C<$tag-E<gt>rewrite_tag>, so see that method to |
423
|
|
|
|
|
|
|
understand how this may affect you. |
424
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
Of course, this is useless if you want to actually change the attributes, so you |
426
|
|
|
|
|
|
|
can do this: |
427
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
my $attrs = { |
429
|
|
|
|
|
|
|
class => 'headline', |
430
|
|
|
|
|
|
|
valign => 'top' |
431
|
|
|
|
|
|
|
}; |
432
|
|
|
|
|
|
|
$token->set_attr($attrs) |
433
|
|
|
|
|
|
|
if $token->is_start_tag('td') && $token->get_attr('class') eq 'stories'; |
434
|
|
|
|
|
|
|
|
435
|
|
|
|
|
|
|
=item * C<rewrite_tag()> |
436
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
This method rewrites the tag. The tag name and the name of all attributes will |
438
|
|
|
|
|
|
|
be lower-cased. Values that are not quoted with double quotes will be. This |
439
|
|
|
|
|
|
|
may be called on both start or end tags. Note that both C<set_attr()> and |
440
|
|
|
|
|
|
|
C<delete_attr()> call this method prior to returning. |
441
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
If called on a token that is not a tag, it simply returns. Regardless of how |
443
|
|
|
|
|
|
|
it is called, it returns the token. |
444
|
|
|
|
|
|
|
|
445
|
|
|
|
|
|
|
# <body alink=#0000ff BGCOLOR=#ffffff class='none'> |
446
|
|
|
|
|
|
|
$token->rewrite_tag; |
447
|
|
|
|
|
|
|
print $token->as_is; |
448
|
|
|
|
|
|
|
# <body alink="#0000ff" bgcolor="#ffffff" class="none"> |
449
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
A quick cleanup of sloppy HTML is now the following: |
451
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
my $parser = HTML::TokeParser::Simple->new( string => $ugly_html ); |
453
|
|
|
|
|
|
|
while (my $token = $parser->get_token) { |
454
|
|
|
|
|
|
|
$token->rewrite_tag; |
455
|
|
|
|
|
|
|
print $token->as_is; |
456
|
|
|
|
|
|
|
} |
457
|
|
|
|
|
|
|
|
458
|
|
|
|
|
|
|
=back |
459
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
=head1 PARSER VERSUS TOKENS |
461
|
|
|
|
|
|
|
|
462
|
|
|
|
|
|
|
The parser returns tokens that are blessed into appropriate classes. Some |
463
|
|
|
|
|
|
|
people get confused and try to call parser methods on tokens and token methods |
464
|
|
|
|
|
|
|
on the parser. To prevent this, C<HTML::TokeParser::Simple> versions 1.4 and |
465
|
|
|
|
|
|
|
above now bless all tokens into appropriate token classes. Please keep this in |
466
|
|
|
|
|
|
|
mind while using this module (and many thanks to PodMaster |
467
|
|
|
|
|
|
|
L<http://www.perlmonks.org/index.pl?node_id=107642> for pointing out this issue |
468
|
|
|
|
|
|
|
to me.) |
469
|
|
|
|
|
|
|
|
470
|
|
|
|
|
|
|
=head1 EXAMPLES |
471
|
|
|
|
|
|
|
|
472
|
|
|
|
|
|
|
=head2 Finding comments |
473
|
|
|
|
|
|
|
|
474
|
|
|
|
|
|
|
For some strange reason, your Pointy-Haired Boss (PHB) is convinced that the |
475
|
|
|
|
|
|
|
graphics department is making fun of him by embedding rude things about him in |
476
|
|
|
|
|
|
|
HTML comments. You need to get all HTML comments from the HTML. |
477
|
|
|
|
|
|
|
|
478
|
|
|
|
|
|
|
use strict; |
479
|
|
|
|
|
|
|
use HTML::TokeParser::Simple; |
480
|
|
|
|
|
|
|
|
481
|
|
|
|
|
|
|
my @html_docs = glob( "*.html" ); |
482
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
open PHB, "> phbreport.txt" or die "Cannot open phbreport for writing: $!"; |
484
|
|
|
|
|
|
|
|
485
|
|
|
|
|
|
|
foreach my $doc ( @html_docs ) { |
486
|
|
|
|
|
|
|
print "Processing $doc\n"; |
487
|
|
|
|
|
|
|
my $p = HTML::TokeParser::Simple->new( file => $doc ); |
488
|
|
|
|
|
|
|
while ( my $token = $p->get_token ) { |
489
|
|
|
|
|
|
|
next unless $token->is_comment; |
490
|
|
|
|
|
|
|
print PHB $token->as_is, "\n"; |
491
|
|
|
|
|
|
|
} |
492
|
|
|
|
|
|
|
} |
493
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
close PHB; |
495
|
|
|
|
|
|
|
|
496
|
|
|
|
|
|
|
=head2 Stripping Comments |
497
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
Uh oh. Turns out that your PHB was right for a change. Many of the comments |
499
|
|
|
|
|
|
|
in the HTML weren't very polite. Since your entire graphics department was |
500
|
|
|
|
|
|
|
just fired, it falls on you need to strip those comments from the HTML. |
501
|
|
|
|
|
|
|
|
502
|
|
|
|
|
|
|
use strict; |
503
|
|
|
|
|
|
|
use HTML::TokeParser::Simple; |
504
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
my $new_folder = 'no_comment/'; |
506
|
|
|
|
|
|
|
my @html_docs = glob( "*.html" ); |
507
|
|
|
|
|
|
|
|
508
|
|
|
|
|
|
|
foreach my $doc ( @html_docs ) { |
509
|
|
|
|
|
|
|
print "Processing $doc\n"; |
510
|
|
|
|
|
|
|
my $new_file = "$new_folder$doc"; |
511
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
open PHB, "> $new_file" or die "Cannot open $new_file for writing: $!"; |
513
|
|
|
|
|
|
|
|
514
|
|
|
|
|
|
|
my $p = HTML::TokeParser::Simple->new( $file => doc ); |
515
|
|
|
|
|
|
|
while ( my $token = $p->get_token ) { |
516
|
|
|
|
|
|
|
next if $token->is_comment; |
517
|
|
|
|
|
|
|
print PHB $token->as_is; |
518
|
|
|
|
|
|
|
} |
519
|
|
|
|
|
|
|
close PHB; |
520
|
|
|
|
|
|
|
} |
521
|
|
|
|
|
|
|
|
522
|
|
|
|
|
|
|
=head2 Changing form tags |
523
|
|
|
|
|
|
|
|
524
|
|
|
|
|
|
|
Your company was foo.com and now is bar.com. Unfortunately, whoever wrote your |
525
|
|
|
|
|
|
|
HTML decided to hardcode "http://www.foo.com/" into the C<action> attribute of |
526
|
|
|
|
|
|
|
the form tags. You need to change it to "http://www.bar.com/". |
527
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
use strict; |
529
|
|
|
|
|
|
|
use HTML::TokeParser::Simple; |
530
|
|
|
|
|
|
|
|
531
|
|
|
|
|
|
|
my $new_folder = 'new_html/'; |
532
|
|
|
|
|
|
|
my @html_docs = glob( "*.html" ); |
533
|
|
|
|
|
|
|
|
534
|
|
|
|
|
|
|
foreach my $doc ( @html_docs ) { |
535
|
|
|
|
|
|
|
print "Processing $doc\n"; |
536
|
|
|
|
|
|
|
my $new_file = "$new_folder$doc"; |
537
|
|
|
|
|
|
|
|
538
|
|
|
|
|
|
|
open FILE, "> $new_file" or die "Cannot open $new_file for writing: $!"; |
539
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
my $p = HTML::TokeParser::Simple->new( file => $doc ); |
541
|
|
|
|
|
|
|
while ( my $token = $p->get_token ) { |
542
|
|
|
|
|
|
|
if ( $token->is_start_tag('form') ) { |
543
|
|
|
|
|
|
|
my $action = $token->get_attr(action); |
544
|
|
|
|
|
|
|
$action =~ s/www\.foo\.com/www.bar.com/; |
545
|
|
|
|
|
|
|
$token->set_attr('action', $action); |
546
|
|
|
|
|
|
|
} |
547
|
|
|
|
|
|
|
print FILE $token->as_is; |
548
|
|
|
|
|
|
|
} |
549
|
|
|
|
|
|
|
close FILE; |
550
|
|
|
|
|
|
|
} |
551
|
|
|
|
|
|
|
|
552
|
|
|
|
|
|
|
=head1 CAVEATS |
553
|
|
|
|
|
|
|
|
554
|
|
|
|
|
|
|
For compatibility reasons with C<HTML::TokeParser>, methods that return |
555
|
|
|
|
|
|
|
references are violating encapsulation and altering the references directly |
556
|
|
|
|
|
|
|
B<will> alter the state of the object. Subsequent calls to C<rewrite_tag()> |
557
|
|
|
|
|
|
|
can thus have unexpected results. Do not alter these references directly |
558
|
|
|
|
|
|
|
unless you are following behavior described in these docs. In the future, |
559
|
|
|
|
|
|
|
certain methods such as C<get_attr>, C<get_attrseq> and others may return a |
560
|
|
|
|
|
|
|
copy of the reference rather than the original reference. This behavior has |
561
|
|
|
|
|
|
|
not yet been changed in order to maintain compatibility with previous versions |
562
|
|
|
|
|
|
|
of this module. At the present time, your author is not aware of anyone taking |
563
|
|
|
|
|
|
|
advantage of this "feature," but it's better to be safe than sorry. |
564
|
|
|
|
|
|
|
|
565
|
|
|
|
|
|
|
Use of C<$HTML::Parser::VERSION> which is less than 3.25 may result in |
566
|
|
|
|
|
|
|
incorrect behavior as older versions do not always handle XHTML correctly. It |
567
|
|
|
|
|
|
|
is the programmer's responsibility to verify that the behavior of this code |
568
|
|
|
|
|
|
|
matches the programmer's needs. |
569
|
|
|
|
|
|
|
|
570
|
|
|
|
|
|
|
Note that C<HTML::Parser> processes text in 512 byte chunks. This sometimes |
571
|
|
|
|
|
|
|
will cause strange behavior and cause text to be broken into more than one |
572
|
|
|
|
|
|
|
token. You can suppress this behavior with the following command: |
573
|
|
|
|
|
|
|
|
574
|
|
|
|
|
|
|
$p->unbroken_text( [$bool] ); |
575
|
|
|
|
|
|
|
|
576
|
|
|
|
|
|
|
See the C<HTML::Parser> documentation and |
577
|
|
|
|
|
|
|
http://www.perlmonks.org/index.pl?node_id=230667 for more information. |
578
|
|
|
|
|
|
|
|
579
|
|
|
|
|
|
|
=head1 BUGS |
580
|
|
|
|
|
|
|
|
581
|
|
|
|
|
|
|
There are no known bugs, but that's no guarantee. |
582
|
|
|
|
|
|
|
|
583
|
|
|
|
|
|
|
Address bug reports and comments to: E<lt>eop_divo_sitruc@yahoo.comE<gt>. When |
584
|
|
|
|
|
|
|
sending bug reports, please provide the version of C<HTML::Parser>, |
585
|
|
|
|
|
|
|
C<HTML::TokeParser>, C<HTML::TokeParser::Simple>, the version of Perl, and the |
586
|
|
|
|
|
|
|
version of the operating system you are using. |
587
|
|
|
|
|
|
|
|
588
|
|
|
|
|
|
|
Reverse the name to email the author. |
589
|
|
|
|
|
|
|
|
590
|
|
|
|
|
|
|
=head1 SUBCLASSING |
591
|
|
|
|
|
|
|
|
592
|
|
|
|
|
|
|
You may wish to change the behavior of this module. You probably do not want |
593
|
|
|
|
|
|
|
to subclass C<HTML::TokeParser::Simple>. Instead, you'll want to subclass one |
594
|
|
|
|
|
|
|
of the token classes. C<HTML::TokeParser::Simple::Token> is the base class for |
595
|
|
|
|
|
|
|
all tokens. Global behavioral changes should go there. Otherwise, see the |
596
|
|
|
|
|
|
|
appropriate token class for the behavior you wish to alter. |
597
|
|
|
|
|
|
|
|
598
|
|
|
|
|
|
|
=head1 SEE ALSO |
599
|
|
|
|
|
|
|
|
600
|
|
|
|
|
|
|
L<HTML::TokeParser::Simple::Token> |
601
|
|
|
|
|
|
|
|
602
|
|
|
|
|
|
|
L<HTML::TokeParser::Simple::Token::Tag> |
603
|
|
|
|
|
|
|
|
604
|
|
|
|
|
|
|
L<HTML::TokeParser::Simple::Token::Text> |
605
|
|
|
|
|
|
|
|
606
|
|
|
|
|
|
|
L<HTML::TokeParser::Simple::Token::Comment> |
607
|
|
|
|
|
|
|
|
608
|
|
|
|
|
|
|
L<HTML::TokeParser::Simple::Token::Declaration> |
609
|
|
|
|
|
|
|
|
610
|
|
|
|
|
|
|
L<HTML::TokeParser::Simple::Token::ProcessInstruction> |
611
|
|
|
|
|
|
|
|
612
|
|
|
|
|
|
|
=head1 COPYRIGHT |
613
|
|
|
|
|
|
|
|
614
|
|
|
|
|
|
|
Copyright (c) 2004 by Curtis "Ovid" Poe. All rights reserved. This program is |
615
|
|
|
|
|
|
|
free software; you may redistribute it and/or modify it under the same terms as |
616
|
|
|
|
|
|
|
Perl itself |
617
|
|
|
|
|
|
|
|
618
|
|
|
|
|
|
|
=head1 AUTHOR |
619
|
|
|
|
|
|
|
|
620
|
|
|
|
|
|
|
Curtis "Ovid" Poe E<lt>eop_divo_sitruc@yahoo.comE<gt> |
621
|
|
|
|
|
|
|
|
622
|
|
|
|
|
|
|
Reverse the name to email the author. |
623
|
|
|
|
|
|
|
|
624
|
|
|
|
|
|
|
=cut |