File Coverage

lib/HTML/TokeParser/Simple.pm

Criterion	Covered	Total	%
statement	74	75	98.6
branch	15	18	83.3
condition	5	5	100.0
subroutine	21	21	100.0
pod	4	4	100.0
total	119	123	96.7

line	stmt	bran	cond	sub	pod	time	code
1							package HTML::TokeParser::Simple;
2
3	5			5		153280	use strict;
	5					11
	5					216
4	5			5		5239	use HTML::TokeParser;
	5					63654
	5					160
5	5			5		4804	use HTML::TokeParser::Simple::Token;
	5					14
	5					135
6	5			5		3773	use HTML::TokeParser::Simple::Token::Tag;
	5					12
	5					167
7	5			5		2568	use HTML::TokeParser::Simple::Token::Tag::Start;
	5					10
	5					127
8	5			5		2473	use HTML::TokeParser::Simple::Token::Tag::End;
	5					11
	5					126
9	5			5		8332	use HTML::TokeParser::Simple::Token::Text;
	5					11
	5					122
10	5			5		2738	use HTML::TokeParser::Simple::Token::Comment;
	5					11
	5					119
11	5			5		2228	use HTML::TokeParser::Simple::Token::Declaration;
	5					39
	5					102
12	5			5		2160	use HTML::TokeParser::Simple::Token::ProcessInstruction;
	5					10
	5					167
13
14							our $VERSION = '3.16';
15	5			5		22	use base 'HTML::TokeParser';
	5					7
	5					3841
16
17							# constructors
18
19							my %FACTORY_CLASSES = (
20							S => 'HTML::TokeParser::Simple::Token::Tag::Start',
21							E => 'HTML::TokeParser::Simple::Token::Tag::End',
22							T => 'HTML::TokeParser::Simple::Token::Text',
23							C => 'HTML::TokeParser::Simple::Token::Comment',
24							D => 'HTML::TokeParser::Simple::Token::Declaration',
25							PI => 'HTML::TokeParser::Simple::Token::ProcessInstruction',
26							);
27
28							sub _croak {
29	3			3		8	my ($proto, $message) = @_;
30	3					23	require Carp;
31	3					765	Carp::croak($message);
32							}
33
34							sub new {
35	19			19	1	130031	my ($class, @args) = @_;
36	19	100				158	return 1 == @args
37							? $class->SUPER::new(@args)
38							: $class->_init(@args);
39							}
40
41							sub _init {
42	8			8		143	my ($class, $source_type, $source) = @_;
43							my %sources = (
44	3			3		16	file => sub { $source },
45	1			1		4	handle => sub { $source },
46	1			1		3	string => sub { \$source },
47							url => sub {
48	2			2		100	eval "require LWP::Simple";
49	2	50				11	$class->_croak("Cannot load LWP::Simple: $@") if $@;
50	2					7	my $content = LWP::Simple::get($source);
51	2	100				16	$class->_croak("Could not fetch content from ($source)")
52							unless defined $content;
53	1					6	return \$content;
54							},
55	8					111	);
56	8	100				32	unless (exists $sources{$source_type}) {
57	1					6	$class->_croak("Unknown source type ($source_type)");
58							}
59	7					22	return $class->new($sources{$source_type}->());
60							}
61
62							sub get_token {
63	226			226	1	16232	my $self = shift;
64	226					450	my @args = @_;
65	226					667	my $token = $self->SUPER::get_token( @args );
66	226	100				4273	return unless defined $token;
67	223	50				542	if (my $factory_class = $FACTORY_CLASSES{$token->[0]}) {
68	223					965	return $factory_class->new($token);
69							}
70							else {
71							# this should never happen
72	0					0	$self->_croak("Cannot determine token class for token (@$token)");
73							}
74							}
75
76							sub get_tag {
77	12			12	1	21104	my $self = shift;
78	12					31	my @args = @_;
79	12					150	my $token = $self->SUPER::get_tag( @args );
80	12	50				144	return unless defined $token;
81	12	100				68	return $token->[0] =~ /^\//
82							? HTML::TokeParser::Simple::Token::Tag::End->new($token)
83							: HTML::TokeParser::Simple::Token::Tag::Start->new($token);
84							}
85
86							sub peek {
87	7			7	1	12	my ($self, $count) = @_;
88	7		100			22	$count \|\|= 1;
89
90	7	100				31	unless ($count =~ /^\d+$/) {
91	1					5	$self->_croak("Argument to peek() must be a positive integer, not ($count)");
92							}
93
94	6					7	my $items = 0;
95	6					10	my $html = '';
96	6					7	my @tokens;
97	6		100			22	while ( $items++ < $count && defined ( my $token = $self->get_token ) ) {
98	61					166	$html .= $token->as_is;
99	61					237	push @tokens, $token;
100							}
101	6					25	$self->unget_token(@tokens);
102	6					60	return $html;
103							}
104
105							1;
106
107							__END__
108
109							=head1 NAME
110
111							HTML::TokeParser::Simple - Easy to use C<HTML::TokeParser> interface
112
113							=head1 SYNOPSIS
114
115							use HTML::TokeParser::Simple;
116							my $p = HTML::TokeParser::Simple->new( $somefile );
117
118							while ( my $token = $p->get_token ) {
119							# This prints all text in an HTML doc (i.e., it strips the HTML)
120							next unless $token->is_text;
121							print $token->as_is;
122							}
123
124
125							=head1 DESCRIPTION
126
127							C<HTML::TokeParser> is an excellent module that's often used for parsing HTML.
128							However, the tokens returned are not exactly intuitive to parse:
129
130							["S", $tag, $attr, $attrseq, $text]
131							["E", $tag, $text]
132							["T", $text, $is_data]
133							["C", $text]
134							["D", $text]
135							["PI", $token0, $text]
136
137							To simplify this, C<HTML::TokeParser::Simple> allows the user ask more
138							intuitive (read: more self-documenting) questions about the tokens returned.
139
140							You can also rebuild some tags on the fly. Frequently, the attributes
141							associated with start tags need to be altered, added to, or deleted. This
142							functionality is built in.
143
144							Since this is a subclass of C<HTML::TokeParser>, all C<HTML::TokeParser>
145							methods are available. To truly appreciate the power of this module, please
146							read the documentation for C<HTML::TokeParser> and C<HTML::Parser>.
147
148							=head1 CONTRUCTORS
149
150							=head2 C<new($source)>
151
152							The constructor for C<HTML::TokeParser::Simple> can be used just like
153							C<HTML::TokeParser>'s constructor:
154
155							my $parser = HTML::TokeParser::Simple->new($filename);
156							# or
157							my $parser = HTML::TokeParser::Simple->new($filehandle);
158							# or
159							my $parser = HTML::TokeParser::Simple->new(\$html_string);
160
161							=head2 C<new($source_type, $source)>
162
163							If you wish to be more explicit, there is a new style of
164							constructor available.
165
166							my $parser = HTML::TokeParser::Simple->new(file => $filename);
167							# or
168							my $parser = HTML::TokeParser::Simple->new(handle => $filehandle);
169							# or
170							my $parser = HTML::TokeParser::Simple->new(string => $html_string);
171
172							Note that you do not have to provide a reference for the string if using the
173							string constructor.
174
175							As a convenience, you can also attempt to fetch the HTML directly from a URL.
176
177							my $parser = HTML::TokeParser::Simple->new(url => 'http://some.url');
178
179							This method relies on C<LWP::Simple>. If this module is not found or the page
180							cannot be fetched, the constructor will C<croak()>.
181
182							=head1 PARSER METHODS
183
184							=head2 get_token
185
186							This method will return the next token that C<HTML::TokeParser::get_token()>
187							method would return. However, it will be blessed into a class appropriate
188							which represents the token type.
189
190							=head2 get_tag
191
192							This method will return the next token that C<HTML::TokeParser::get_tag()>
193							method would return. However, it will be blessed into either the
194							L<HTML::TokeParser::Simple::Token::Tag::Start> or
195							L<HTML::TokeParser::Simple::Token::Tag::End> class.
196
197							=head2 peek
198
199							As of version C<3.14>, you can now C<peek()> at the upcomings tokens without
200							affecting the state of the parser. By default, C<peek()> will return the text
201							of the next token, but specifying an integer C<$count> will return the text of
202							the next C<$count> tokens.
203
204							This is useful when you're trying to debug where you are in a document.
205
206							warn $parser->peek(3); # show the next 3 tokens
207
208							=head1 ACCESSORS
209
210							The following methods may be called on the token object which is returned,
211							not on the parser object.
212
213							=head2 Boolean Accessors
214
215							These accessors return true or false.
216
217							=over 4
218
219							=item * C<is_tag([$tag])>
220
221							Use this to determine if you have any tag. An optional "tag type" may be
222							passed. This will allow you to match if it's a I<particular> tag. The
223							supplied tag is case-insensitive.
224
225							if ( $token->is_tag ) { ... }
226
227							Optionally, you may pass a regular expression as an argument.
228
229							=item * C<is_start_tag([$tag])>
230
231							Use this to determine if you have a start tag. An optional "tag type" may be
232							passed. This will allow you to match if it's a I<particular> start tag. The
233							supplied tag is case-insensitive.
234
235							if ( $token->is_start_tag ) { ... }
236							if ( $token->is_start_tag( 'font' ) ) { ... }
237
238							Optionally, you may pass a regular expression as an argument. To match all
239							header (h1, h2, ... h6) tags:
240
241							if ( $token->is_start_tag( qr/^h[123456]$/ ) ) { ... }
242
243							=item * C<is_end_tag([$tag])>
244
245							Use this to determine if you have an end tag. An optional "tag type" may be
246							passed. This will allow you to match if it's a I<particular> end tag. The
247							supplied tag is case-insensitive.
248
249							When testing for an end tag, the forward slash on the tag is optional.
250
251							while ( $token = $p->get_token ) {
252							if ( $token->is_end_tag( 'form' ) ) { ... }
253							}
254
255							Or:
256
257							while ( $token = $p->get_token ) {
258							if ( $token->is_end_tag( '/form' ) ) { ... }
259							}
260
261							Optionally, you may pass a regular expression as an argument.
262
263							=item * C<is_text()>
264
265							Use this to determine if you have text. Note that this is I<not> to be
266							confused with the C<return_text> (I<deprecated>) method described below!
267							C<is_text> will identify text that the user typically sees display in the Web
268							browser.
269
270							=item * C<is_comment()>
271
272							Are you still reading this? Nobody reads POD. Don't you know you're supposed
273							to go to CLPM, ask a question that's answered in the POD and get flamed? It's
274							a rite of passage.
275
276							Really.
277
278							C<is_comment> is used to identify comments. See the HTML::Parser documentation
279							for more information about comments. There's more than you might think.
280
281							=item * C<is_declaration()>
282
283							This will match the DTD at the top of your HTML. (You I<do> use DTD's, don't
284							you?)
285
286							=item * C<is_process_instruction()>
287
288							Process Instructions are from XML. This is very handy if you need to parse out
289							PHP and similar things with a parser.
290
291							Currently, there appear to be some problems with process instructions. You can
292							override C<HTML::TokeParser::Simple::Token::ProcessInstruction> if you need to.
293
294							=item * C<is_pi()>
295
296							This is a shorthand for C<is_process_instruction()>.
297
298							=back
299
300							=head2 Data Accessors
301
302							Some of these were originally C<return_> methods, but that name was not only
303							unwieldy, but also went against reasonable conventions. The C<get_> methods
304							listed below still have C<return_> methods available for backwards
305							compatibility reasons, but they merely call their C<get_> counterpart. For
306							example, calling C<return_tag()> actually calls C<get_tag()> internally.
307
308							=over 4
309
310							=item * C<get_tag()>
311
312							Do you have a start tag or end tag? This will return the type (lower case).
313							Note that this is I<not> the same as the C<get_tag()> method on the actual
314							parser object.
315
316							=item * C<get_attr([$attribute])>
317
318							If you have a start tag, this will return a hash ref with the attribute names
319							as keys and the values as the values.
320
321							If you pass in an attribute name, it will return the value for just that
322							attribute.
323
324							Returns false if the token is not a start tag.
325
326							=item * C<get_attrseq()>
327
328							For a start tag, this is an array reference with the sequence of the
329							attributes, if any.
330
331							Returns false if the token is not a start tag.
332
333							=item * C<return_text()>
334
335							This method has been heavily deprecated (for a couple of years) in favor of
336							C<as_is>. Programmers were getting confused over the difference between
337							C<is_text>, C<return_text>, and some parser methods such as
338							C<HTML::TokeParser::get_text> and friends.
339
340							Using this method still succeeds, but will now carp and B<will be removed>
341							in the next major release of this module.
342
343							=item * C<as_is()>
344
345							This is the exact text of whatever the token is representing.
346
347							=item * C<get_token0()>
348
349							For processing instructions, this will return the token found immediately after
350							the opening tag. Example: For <?php, "php" will be the start of the returned
351							string.
352
353							Note that process instruction handling appears to be incomplete in
354							C<HTML::TokeParser>.
355
356							Returns false if the token is not a process instruction.
357
358							=back
359
360							=head1 MUTATORS
361
362							The C<delete_attr()> and C<set_attr()> methods allow the programmer to rewrite
363							start tag attributes on the fly. It should be noted that bad HTML will be
364							"corrected" by this. Specifically, the new tag will have all attributes
365							lower-cased with the values properly quoted.
366
367							Self-closing tags (e.g. E<lt>hr /E<gt>) are also handled correctly. Some older
368							browsers require a space prior to the final slash in a self-closed tag. If
369							such a space is detected in the original HTML, it will be preserved.
370
371							Calling a mutator on an token type that does not support that property is a
372							no-op. For example:
373
374							if ($token->is_comment) {
375							$token->set_attr(foo => 'bar'); # does nothing
376							}
377
378							=over 4
379
380							=item * C<delete_attr($name)>
381
382							This method attempts to delete the attribute specified. It will silently fail
383							if called on anything other than a start tag. The argument is
384							case-insensitive, but must otherwise be an exact match of the attribute you are
385							attempting to delete. If the attribute is not found, the method will return
386							without changing the tag.
387
388							# <body bgcolor="#FFFFFF">
389							$token->delete_attr('bgcolor');
390							print $token->as_is;
391							# <body>
392
393							After this method is called, if successful, the C<as_is()>, C<get_attr()>
394							and C<get_attrseq()> methods will all return updated results.
395
396							=item * C<set_attr($name,$value)>
397
398							This method will set the value of an attribute. If the attribute is not found,
399							then C<get_attrseq()> will have the new attribute listed at the end.
400
401							# <p>
402							$token->set_attr(class => 'some_class');
403							print $token->as_is;
404							# <p class="some_class">
405
406							# <body bgcolor="#FFFFFF">
407							$token->set_attr('bgcolor','red');
408							print $token->as_is;
409							# <body bgcolor="red">
410
411							After this method is called, if successful, the C<as_is()>, C<get_attr()>
412							and C<get_attrseq()> methods will all return updated results.
413
414							=item * C<set_attr($hashref)>
415
416							Under the premise that C<set_> methods should accept what their corresponding
417							C<get_> methods emit, the following works:
418
419							$tag->set_attr($tag->get_attr);
420
421							Theoretically that's a no-op and for purposes of rendering HTML, it should be.
422							However, internally this calls C<$tag-E<gt>rewrite_tag>, so see that method to
423							understand how this may affect you.
424
425							Of course, this is useless if you want to actually change the attributes, so you
426							can do this:
427
428							my $attrs = {
429							class => 'headline',
430							valign => 'top'
431							};
432							$token->set_attr($attrs)
433							if $token->is_start_tag('td') && $token->get_attr('class') eq 'stories';
434
435							=item * C<rewrite_tag()>
436
437							This method rewrites the tag. The tag name and the name of all attributes will
438							be lower-cased. Values that are not quoted with double quotes will be. This
439							may be called on both start or end tags. Note that both C<set_attr()> and
440							C<delete_attr()> call this method prior to returning.
441
442							If called on a token that is not a tag, it simply returns. Regardless of how
443							it is called, it returns the token.
444
445							# <body alink=#0000ff BGCOLOR=#ffffff class='none'>
446							$token->rewrite_tag;
447							print $token->as_is;
448							# <body alink="#0000ff" bgcolor="#ffffff" class="none">
449
450							A quick cleanup of sloppy HTML is now the following:
451
452							my $parser = HTML::TokeParser::Simple->new( string => $ugly_html );
453							while (my $token = $parser->get_token) {
454							$token->rewrite_tag;
455							print $token->as_is;
456							}
457
458							=back
459
460							=head1 PARSER VERSUS TOKENS
461
462							The parser returns tokens that are blessed into appropriate classes. Some
463							people get confused and try to call parser methods on tokens and token methods
464							on the parser. To prevent this, C<HTML::TokeParser::Simple> versions 1.4 and
465							above now bless all tokens into appropriate token classes. Please keep this in
466							mind while using this module (and many thanks to PodMaster
467							L<http://www.perlmonks.org/index.pl?node_id=107642> for pointing out this issue
468							to me.)
469
470							=head1 EXAMPLES
471
472							=head2 Finding comments
473
474							For some strange reason, your Pointy-Haired Boss (PHB) is convinced that the
475							graphics department is making fun of him by embedding rude things about him in
476							HTML comments. You need to get all HTML comments from the HTML.
477
478							use strict;
479							use HTML::TokeParser::Simple;
480
481							my @html_docs = glob( "*.html" );
482
483							open PHB, "> phbreport.txt" or die "Cannot open phbreport for writing: $!";
484
485							foreach my $doc ( @html_docs ) {
486							print "Processing $doc\n";
487							my $p = HTML::TokeParser::Simple->new( file => $doc );
488							while ( my $token = $p->get_token ) {
489							next unless $token->is_comment;
490							print PHB $token->as_is, "\n";
491							}
492							}
493
494							close PHB;
495
496							=head2 Stripping Comments
497
498							Uh oh. Turns out that your PHB was right for a change. Many of the comments
499							in the HTML weren't very polite. Since your entire graphics department was
500							just fired, it falls on you need to strip those comments from the HTML.
501
502							use strict;
503							use HTML::TokeParser::Simple;
504
505							my $new_folder = 'no_comment/';
506							my @html_docs = glob( "*.html" );
507
508							foreach my $doc ( @html_docs ) {
509							print "Processing $doc\n";
510							my $new_file = "$new_folder$doc";
511
512							open PHB, "> $new_file" or die "Cannot open $new_file for writing: $!";
513
514							my $p = HTML::TokeParser::Simple->new( $file => doc );
515							while ( my $token = $p->get_token ) {
516							next if $token->is_comment;
517							print PHB $token->as_is;
518							}
519							close PHB;
520							}
521
522							=head2 Changing form tags
523
524							Your company was foo.com and now is bar.com. Unfortunately, whoever wrote your
525							HTML decided to hardcode "http://www.foo.com/" into the C<action> attribute of
526							the form tags. You need to change it to "http://www.bar.com/".
527
528							use strict;
529							use HTML::TokeParser::Simple;
530
531							my $new_folder = 'new_html/';
532							my @html_docs = glob( "*.html" );
533
534							foreach my $doc ( @html_docs ) {
535							print "Processing $doc\n";
536							my $new_file = "$new_folder$doc";
537
538							open FILE, "> $new_file" or die "Cannot open $new_file for writing: $!";
539
540							my $p = HTML::TokeParser::Simple->new( file => $doc );
541							while ( my $token = $p->get_token ) {
542							if ( $token->is_start_tag('form') ) {
543							my $action = $token->get_attr(action);
544							$action =~ s/www\.foo\.com/www.bar.com/;
545							$token->set_attr('action', $action);
546							}
547							print FILE $token->as_is;
548							}
549							close FILE;
550							}
551
552							=head1 CAVEATS
553
554							For compatibility reasons with C<HTML::TokeParser>, methods that return
555							references are violating encapsulation and altering the references directly
556							B<will> alter the state of the object. Subsequent calls to C<rewrite_tag()>
557							can thus have unexpected results. Do not alter these references directly
558							unless you are following behavior described in these docs. In the future,
559							certain methods such as C<get_attr>, C<get_attrseq> and others may return a
560							copy of the reference rather than the original reference. This behavior has
561							not yet been changed in order to maintain compatibility with previous versions
562							of this module. At the present time, your author is not aware of anyone taking
563							advantage of this "feature," but it's better to be safe than sorry.
564
565							Use of C<$HTML::Parser::VERSION> which is less than 3.25 may result in
566							incorrect behavior as older versions do not always handle XHTML correctly. It
567							is the programmer's responsibility to verify that the behavior of this code
568							matches the programmer's needs.
569
570							Note that C<HTML::Parser> processes text in 512 byte chunks. This sometimes
571							will cause strange behavior and cause text to be broken into more than one
572							token. You can suppress this behavior with the following command:
573
574							$p->unbroken_text( [$bool] );
575
576							See the C<HTML::Parser> documentation and
577							http://www.perlmonks.org/index.pl?node_id=230667 for more information.
578
579							=head1 BUGS
580
581							There are no known bugs, but that's no guarantee.
582
583							Address bug reports and comments to: E<lt>eop_divo_sitruc@yahoo.comE<gt>. When
584							sending bug reports, please provide the version of C<HTML::Parser>,
585							C<HTML::TokeParser>, C<HTML::TokeParser::Simple>, the version of Perl, and the
586							version of the operating system you are using.
587
588							Reverse the name to email the author.
589
590							=head1 SUBCLASSING
591
592							You may wish to change the behavior of this module. You probably do not want
593							to subclass C<HTML::TokeParser::Simple>. Instead, you'll want to subclass one
594							of the token classes. C<HTML::TokeParser::Simple::Token> is the base class for
595							all tokens. Global behavioral changes should go there. Otherwise, see the
596							appropriate token class for the behavior you wish to alter.
597
598							=head1 SEE ALSO
599
600							L<HTML::TokeParser::Simple::Token>
601
602							L<HTML::TokeParser::Simple::Token::Tag>
603
604							L<HTML::TokeParser::Simple::Token::Text>
605
606							L<HTML::TokeParser::Simple::Token::Comment>
607
608							L<HTML::TokeParser::Simple::Token::Declaration>
609
610							L<HTML::TokeParser::Simple::Token::ProcessInstruction>
611
612							=head1 COPYRIGHT
613
614							Copyright (c) 2004 by Curtis "Ovid" Poe. All rights reserved. This program is
615							free software; you may redistribute it and/or modify it under the same terms as
616							Perl itself
617
618							=head1 AUTHOR
619
620							Curtis "Ovid" Poe E<lt>eop_divo_sitruc@yahoo.comE<gt>
621
622							Reverse the name to email the author.
623
624							=cut