File Coverage

lib/HTML/TokeParser/Simple.pm
Criterion Covered Total %
statement 74 75 98.6
branch 15 18 83.3
condition 5 5 100.0
subroutine 21 21 100.0
pod 4 4 100.0
total 119 123 96.7


line stmt bran cond sub pod time code
1             package HTML::TokeParser::Simple;
2              
3 5     5   153280 use strict;
  5         11  
  5         216  
4 5     5   5239 use HTML::TokeParser;
  5         63654  
  5         160  
5 5     5   4804 use HTML::TokeParser::Simple::Token;
  5         14  
  5         135  
6 5     5   3773 use HTML::TokeParser::Simple::Token::Tag;
  5         12  
  5         167  
7 5     5   2568 use HTML::TokeParser::Simple::Token::Tag::Start;
  5         10  
  5         127  
8 5     5   2473 use HTML::TokeParser::Simple::Token::Tag::End;
  5         11  
  5         126  
9 5     5   8332 use HTML::TokeParser::Simple::Token::Text;
  5         11  
  5         122  
10 5     5   2738 use HTML::TokeParser::Simple::Token::Comment;
  5         11  
  5         119  
11 5     5   2228 use HTML::TokeParser::Simple::Token::Declaration;
  5         39  
  5         102  
12 5     5   2160 use HTML::TokeParser::Simple::Token::ProcessInstruction;
  5         10  
  5         167  
13              
14             our $VERSION = '3.16';
15 5     5   22 use base 'HTML::TokeParser';
  5         7  
  5         3841  
16              
17             # constructors
18              
19             my %FACTORY_CLASSES = (
20             S => 'HTML::TokeParser::Simple::Token::Tag::Start',
21             E => 'HTML::TokeParser::Simple::Token::Tag::End',
22             T => 'HTML::TokeParser::Simple::Token::Text',
23             C => 'HTML::TokeParser::Simple::Token::Comment',
24             D => 'HTML::TokeParser::Simple::Token::Declaration',
25             PI => 'HTML::TokeParser::Simple::Token::ProcessInstruction',
26             );
27              
28             sub _croak {
29 3     3   8 my ($proto, $message) = @_;
30 3         23 require Carp;
31 3         765 Carp::croak($message);
32             }
33              
34             sub new {
35 19     19 1 130031 my ($class, @args) = @_;
36 19 100       158 return 1 == @args
37             ? $class->SUPER::new(@args)
38             : $class->_init(@args);
39             }
40              
41             sub _init {
42 8     8   143 my ($class, $source_type, $source) = @_;
43             my %sources = (
44 3     3   16 file => sub { $source },
45 1     1   4 handle => sub { $source },
46 1     1   3 string => sub { \$source },
47             url => sub {
48 2     2   100 eval "require LWP::Simple";
49 2 50       11 $class->_croak("Cannot load LWP::Simple: $@") if $@;
50 2         7 my $content = LWP::Simple::get($source);
51 2 100       16 $class->_croak("Could not fetch content from ($source)")
52             unless defined $content;
53 1         6 return \$content;
54             },
55 8         111 );
56 8 100       32 unless (exists $sources{$source_type}) {
57 1         6 $class->_croak("Unknown source type ($source_type)");
58             }
59 7         22 return $class->new($sources{$source_type}->());
60             }
61              
62             sub get_token {
63 226     226 1 16232 my $self = shift;
64 226         450 my @args = @_;
65 226         667 my $token = $self->SUPER::get_token( @args );
66 226 100       4273 return unless defined $token;
67 223 50       542 if (my $factory_class = $FACTORY_CLASSES{$token->[0]}) {
68 223         965 return $factory_class->new($token);
69             }
70             else {
71             # this should never happen
72 0         0 $self->_croak("Cannot determine token class for token (@$token)");
73             }
74             }
75              
76             sub get_tag {
77 12     12 1 21104 my $self = shift;
78 12         31 my @args = @_;
79 12         150 my $token = $self->SUPER::get_tag( @args );
80 12 50       144 return unless defined $token;
81 12 100       68 return $token->[0] =~ /^\//
82             ? HTML::TokeParser::Simple::Token::Tag::End->new($token)
83             : HTML::TokeParser::Simple::Token::Tag::Start->new($token);
84             }
85              
86             sub peek {
87 7     7 1 12 my ($self, $count) = @_;
88 7   100     22 $count ||= 1;
89            
90 7 100       31 unless ($count =~ /^\d+$/) {
91 1         5 $self->_croak("Argument to peek() must be a positive integer, not ($count)");
92             }
93              
94 6         7 my $items = 0;
95 6         10 my $html = '';
96 6         7 my @tokens;
97 6   100     22 while ( $items++ < $count && defined ( my $token = $self->get_token ) ) {
98 61         166 $html .= $token->as_is;
99 61         237 push @tokens, $token;
100             }
101 6         25 $self->unget_token(@tokens);
102 6         60 return $html;
103             }
104              
105             1;
106              
107             __END__
108              
109             =head1 NAME
110              
111             HTML::TokeParser::Simple - Easy to use C<HTML::TokeParser> interface
112              
113             =head1 SYNOPSIS
114              
115             use HTML::TokeParser::Simple;
116             my $p = HTML::TokeParser::Simple->new( $somefile );
117              
118             while ( my $token = $p->get_token ) {
119             # This prints all text in an HTML doc (i.e., it strips the HTML)
120             next unless $token->is_text;
121             print $token->as_is;
122             }
123              
124              
125             =head1 DESCRIPTION
126              
127             C<HTML::TokeParser> is an excellent module that's often used for parsing HTML.
128             However, the tokens returned are not exactly intuitive to parse:
129              
130             ["S", $tag, $attr, $attrseq, $text]
131             ["E", $tag, $text]
132             ["T", $text, $is_data]
133             ["C", $text]
134             ["D", $text]
135             ["PI", $token0, $text]
136              
137             To simplify this, C<HTML::TokeParser::Simple> allows the user ask more
138             intuitive (read: more self-documenting) questions about the tokens returned.
139              
140             You can also rebuild some tags on the fly. Frequently, the attributes
141             associated with start tags need to be altered, added to, or deleted. This
142             functionality is built in.
143              
144             Since this is a subclass of C<HTML::TokeParser>, all C<HTML::TokeParser>
145             methods are available. To truly appreciate the power of this module, please
146             read the documentation for C<HTML::TokeParser> and C<HTML::Parser>.
147              
148             =head1 CONTRUCTORS
149              
150             =head2 C<new($source)>
151              
152             The constructor for C<HTML::TokeParser::Simple> can be used just like
153             C<HTML::TokeParser>'s constructor:
154              
155             my $parser = HTML::TokeParser::Simple->new($filename);
156             # or
157             my $parser = HTML::TokeParser::Simple->new($filehandle);
158             # or
159             my $parser = HTML::TokeParser::Simple->new(\$html_string);
160              
161             =head2 C<new($source_type, $source)>
162              
163             If you wish to be more explicit, there is a new style of
164             constructor available.
165              
166             my $parser = HTML::TokeParser::Simple->new(file => $filename);
167             # or
168             my $parser = HTML::TokeParser::Simple->new(handle => $filehandle);
169             # or
170             my $parser = HTML::TokeParser::Simple->new(string => $html_string);
171              
172             Note that you do not have to provide a reference for the string if using the
173             string constructor.
174              
175             As a convenience, you can also attempt to fetch the HTML directly from a URL.
176              
177             my $parser = HTML::TokeParser::Simple->new(url => 'http://some.url');
178              
179             This method relies on C<LWP::Simple>. If this module is not found or the page
180             cannot be fetched, the constructor will C<croak()>.
181              
182             =head1 PARSER METHODS
183              
184             =head2 get_token
185              
186             This method will return the next token that C<HTML::TokeParser::get_token()>
187             method would return. However, it will be blessed into a class appropriate
188             which represents the token type.
189              
190             =head2 get_tag
191              
192             This method will return the next token that C<HTML::TokeParser::get_tag()>
193             method would return. However, it will be blessed into either the
194             L<HTML::TokeParser::Simple::Token::Tag::Start> or
195             L<HTML::TokeParser::Simple::Token::Tag::End> class.
196              
197             =head2 peek
198              
199             As of version C<3.14>, you can now C<peek()> at the upcomings tokens without
200             affecting the state of the parser. By default, C<peek()> will return the text
201             of the next token, but specifying an integer C<$count> will return the text of
202             the next C<$count> tokens.
203              
204             This is useful when you're trying to debug where you are in a document.
205              
206             warn $parser->peek(3); # show the next 3 tokens
207              
208             =head1 ACCESSORS
209              
210             The following methods may be called on the token object which is returned,
211             not on the parser object.
212              
213             =head2 Boolean Accessors
214              
215             These accessors return true or false.
216              
217             =over 4
218              
219             =item * C<is_tag([$tag])>
220              
221             Use this to determine if you have any tag. An optional "tag type" may be
222             passed. This will allow you to match if it's a I<particular> tag. The
223             supplied tag is case-insensitive.
224              
225             if ( $token->is_tag ) { ... }
226              
227             Optionally, you may pass a regular expression as an argument.
228              
229             =item * C<is_start_tag([$tag])>
230              
231             Use this to determine if you have a start tag. An optional "tag type" may be
232             passed. This will allow you to match if it's a I<particular> start tag. The
233             supplied tag is case-insensitive.
234              
235             if ( $token->is_start_tag ) { ... }
236             if ( $token->is_start_tag( 'font' ) ) { ... }
237              
238             Optionally, you may pass a regular expression as an argument. To match all
239             header (h1, h2, ... h6) tags:
240              
241             if ( $token->is_start_tag( qr/^h[123456]$/ ) ) { ... }
242              
243             =item * C<is_end_tag([$tag])>
244              
245             Use this to determine if you have an end tag. An optional "tag type" may be
246             passed. This will allow you to match if it's a I<particular> end tag. The
247             supplied tag is case-insensitive.
248              
249             When testing for an end tag, the forward slash on the tag is optional.
250              
251             while ( $token = $p->get_token ) {
252             if ( $token->is_end_tag( 'form' ) ) { ... }
253             }
254              
255             Or:
256              
257             while ( $token = $p->get_token ) {
258             if ( $token->is_end_tag( '/form' ) ) { ... }
259             }
260              
261             Optionally, you may pass a regular expression as an argument.
262              
263             =item * C<is_text()>
264              
265             Use this to determine if you have text. Note that this is I<not> to be
266             confused with the C<return_text> (I<deprecated>) method described below!
267             C<is_text> will identify text that the user typically sees display in the Web
268             browser.
269              
270             =item * C<is_comment()>
271              
272             Are you still reading this? Nobody reads POD. Don't you know you're supposed
273             to go to CLPM, ask a question that's answered in the POD and get flamed? It's
274             a rite of passage.
275              
276             Really.
277              
278             C<is_comment> is used to identify comments. See the HTML::Parser documentation
279             for more information about comments. There's more than you might think.
280              
281             =item * C<is_declaration()>
282              
283             This will match the DTD at the top of your HTML. (You I<do> use DTD's, don't
284             you?)
285              
286             =item * C<is_process_instruction()>
287              
288             Process Instructions are from XML. This is very handy if you need to parse out
289             PHP and similar things with a parser.
290              
291             Currently, there appear to be some problems with process instructions. You can
292             override C<HTML::TokeParser::Simple::Token::ProcessInstruction> if you need to.
293              
294             =item * C<is_pi()>
295              
296             This is a shorthand for C<is_process_instruction()>.
297              
298             =back
299              
300             =head2 Data Accessors
301              
302             Some of these were originally C<return_> methods, but that name was not only
303             unwieldy, but also went against reasonable conventions. The C<get_> methods
304             listed below still have C<return_> methods available for backwards
305             compatibility reasons, but they merely call their C<get_> counterpart. For
306             example, calling C<return_tag()> actually calls C<get_tag()> internally.
307              
308             =over 4
309              
310             =item * C<get_tag()>
311              
312             Do you have a start tag or end tag? This will return the type (lower case).
313             Note that this is I<not> the same as the C<get_tag()> method on the actual
314             parser object.
315              
316             =item * C<get_attr([$attribute])>
317              
318             If you have a start tag, this will return a hash ref with the attribute names
319             as keys and the values as the values.
320              
321             If you pass in an attribute name, it will return the value for just that
322             attribute.
323              
324             Returns false if the token is not a start tag.
325              
326             =item * C<get_attrseq()>
327              
328             For a start tag, this is an array reference with the sequence of the
329             attributes, if any.
330              
331             Returns false if the token is not a start tag.
332              
333             =item * C<return_text()>
334              
335             This method has been heavily deprecated (for a couple of years) in favor of
336             C<as_is>. Programmers were getting confused over the difference between
337             C<is_text>, C<return_text>, and some parser methods such as
338             C<HTML::TokeParser::get_text> and friends.
339              
340             Using this method still succeeds, but will now carp and B<will be removed>
341             in the next major release of this module.
342              
343             =item * C<as_is()>
344              
345             This is the exact text of whatever the token is representing.
346              
347             =item * C<get_token0()>
348              
349             For processing instructions, this will return the token found immediately after
350             the opening tag. Example: For <?php, "php" will be the start of the returned
351             string.
352              
353             Note that process instruction handling appears to be incomplete in
354             C<HTML::TokeParser>.
355              
356             Returns false if the token is not a process instruction.
357              
358             =back
359              
360             =head1 MUTATORS
361              
362             The C<delete_attr()> and C<set_attr()> methods allow the programmer to rewrite
363             start tag attributes on the fly. It should be noted that bad HTML will be
364             "corrected" by this. Specifically, the new tag will have all attributes
365             lower-cased with the values properly quoted.
366              
367             Self-closing tags (e.g. E<lt>hr /E<gt>) are also handled correctly. Some older
368             browsers require a space prior to the final slash in a self-closed tag. If
369             such a space is detected in the original HTML, it will be preserved.
370              
371             Calling a mutator on an token type that does not support that property is a
372             no-op. For example:
373              
374             if ($token->is_comment) {
375             $token->set_attr(foo => 'bar'); # does nothing
376             }
377              
378             =over 4
379              
380             =item * C<delete_attr($name)>
381              
382             This method attempts to delete the attribute specified. It will silently fail
383             if called on anything other than a start tag. The argument is
384             case-insensitive, but must otherwise be an exact match of the attribute you are
385             attempting to delete. If the attribute is not found, the method will return
386             without changing the tag.
387              
388             # <body bgcolor="#FFFFFF">
389             $token->delete_attr('bgcolor');
390             print $token->as_is;
391             # <body>
392            
393             After this method is called, if successful, the C<as_is()>, C<get_attr()>
394             and C<get_attrseq()> methods will all return updated results.
395              
396             =item * C<set_attr($name,$value)>
397              
398             This method will set the value of an attribute. If the attribute is not found,
399             then C<get_attrseq()> will have the new attribute listed at the end.
400              
401             # <p>
402             $token->set_attr(class => 'some_class');
403             print $token->as_is;
404             # <p class="some_class">
405              
406             # <body bgcolor="#FFFFFF">
407             $token->set_attr('bgcolor','red');
408             print $token->as_is;
409             # <body bgcolor="red">
410              
411             After this method is called, if successful, the C<as_is()>, C<get_attr()>
412             and C<get_attrseq()> methods will all return updated results.
413              
414             =item * C<set_attr($hashref)>
415              
416             Under the premise that C<set_> methods should accept what their corresponding
417             C<get_> methods emit, the following works:
418              
419             $tag->set_attr($tag->get_attr);
420              
421             Theoretically that's a no-op and for purposes of rendering HTML, it should be.
422             However, internally this calls C<$tag-E<gt>rewrite_tag>, so see that method to
423             understand how this may affect you.
424              
425             Of course, this is useless if you want to actually change the attributes, so you
426             can do this:
427              
428             my $attrs = {
429             class => 'headline',
430             valign => 'top'
431             };
432             $token->set_attr($attrs)
433             if $token->is_start_tag('td') && $token->get_attr('class') eq 'stories';
434              
435             =item * C<rewrite_tag()>
436              
437             This method rewrites the tag. The tag name and the name of all attributes will
438             be lower-cased. Values that are not quoted with double quotes will be. This
439             may be called on both start or end tags. Note that both C<set_attr()> and
440             C<delete_attr()> call this method prior to returning.
441              
442             If called on a token that is not a tag, it simply returns. Regardless of how
443             it is called, it returns the token.
444              
445             # <body alink=#0000ff BGCOLOR=#ffffff class='none'>
446             $token->rewrite_tag;
447             print $token->as_is;
448             # <body alink="#0000ff" bgcolor="#ffffff" class="none">
449              
450             A quick cleanup of sloppy HTML is now the following:
451              
452             my $parser = HTML::TokeParser::Simple->new( string => $ugly_html );
453             while (my $token = $parser->get_token) {
454             $token->rewrite_tag;
455             print $token->as_is;
456             }
457              
458             =back
459              
460             =head1 PARSER VERSUS TOKENS
461              
462             The parser returns tokens that are blessed into appropriate classes. Some
463             people get confused and try to call parser methods on tokens and token methods
464             on the parser. To prevent this, C<HTML::TokeParser::Simple> versions 1.4 and
465             above now bless all tokens into appropriate token classes. Please keep this in
466             mind while using this module (and many thanks to PodMaster
467             L<http://www.perlmonks.org/index.pl?node_id=107642> for pointing out this issue
468             to me.)
469              
470             =head1 EXAMPLES
471              
472             =head2 Finding comments
473              
474             For some strange reason, your Pointy-Haired Boss (PHB) is convinced that the
475             graphics department is making fun of him by embedding rude things about him in
476             HTML comments. You need to get all HTML comments from the HTML.
477              
478             use strict;
479             use HTML::TokeParser::Simple;
480              
481             my @html_docs = glob( "*.html" );
482              
483             open PHB, "> phbreport.txt" or die "Cannot open phbreport for writing: $!";
484              
485             foreach my $doc ( @html_docs ) {
486             print "Processing $doc\n";
487             my $p = HTML::TokeParser::Simple->new( file => $doc );
488             while ( my $token = $p->get_token ) {
489             next unless $token->is_comment;
490             print PHB $token->as_is, "\n";
491             }
492             }
493              
494             close PHB;
495              
496             =head2 Stripping Comments
497              
498             Uh oh. Turns out that your PHB was right for a change. Many of the comments
499             in the HTML weren't very polite. Since your entire graphics department was
500             just fired, it falls on you need to strip those comments from the HTML.
501              
502             use strict;
503             use HTML::TokeParser::Simple;
504              
505             my $new_folder = 'no_comment/';
506             my @html_docs = glob( "*.html" );
507              
508             foreach my $doc ( @html_docs ) {
509             print "Processing $doc\n";
510             my $new_file = "$new_folder$doc";
511              
512             open PHB, "> $new_file" or die "Cannot open $new_file for writing: $!";
513              
514             my $p = HTML::TokeParser::Simple->new( $file => doc );
515             while ( my $token = $p->get_token ) {
516             next if $token->is_comment;
517             print PHB $token->as_is;
518             }
519             close PHB;
520             }
521              
522             =head2 Changing form tags
523              
524             Your company was foo.com and now is bar.com. Unfortunately, whoever wrote your
525             HTML decided to hardcode "http://www.foo.com/" into the C<action> attribute of
526             the form tags. You need to change it to "http://www.bar.com/".
527              
528             use strict;
529             use HTML::TokeParser::Simple;
530              
531             my $new_folder = 'new_html/';
532             my @html_docs = glob( "*.html" );
533              
534             foreach my $doc ( @html_docs ) {
535             print "Processing $doc\n";
536             my $new_file = "$new_folder$doc";
537              
538             open FILE, "> $new_file" or die "Cannot open $new_file for writing: $!";
539              
540             my $p = HTML::TokeParser::Simple->new( file => $doc );
541             while ( my $token = $p->get_token ) {
542             if ( $token->is_start_tag('form') ) {
543             my $action = $token->get_attr(action);
544             $action =~ s/www\.foo\.com/www.bar.com/;
545             $token->set_attr('action', $action);
546             }
547             print FILE $token->as_is;
548             }
549             close FILE;
550             }
551              
552             =head1 CAVEATS
553              
554             For compatibility reasons with C<HTML::TokeParser>, methods that return
555             references are violating encapsulation and altering the references directly
556             B<will> alter the state of the object. Subsequent calls to C<rewrite_tag()>
557             can thus have unexpected results. Do not alter these references directly
558             unless you are following behavior described in these docs. In the future,
559             certain methods such as C<get_attr>, C<get_attrseq> and others may return a
560             copy of the reference rather than the original reference. This behavior has
561             not yet been changed in order to maintain compatibility with previous versions
562             of this module. At the present time, your author is not aware of anyone taking
563             advantage of this "feature," but it's better to be safe than sorry.
564              
565             Use of C<$HTML::Parser::VERSION> which is less than 3.25 may result in
566             incorrect behavior as older versions do not always handle XHTML correctly. It
567             is the programmer's responsibility to verify that the behavior of this code
568             matches the programmer's needs.
569              
570             Note that C<HTML::Parser> processes text in 512 byte chunks. This sometimes
571             will cause strange behavior and cause text to be broken into more than one
572             token. You can suppress this behavior with the following command:
573              
574             $p->unbroken_text( [$bool] );
575              
576             See the C<HTML::Parser> documentation and
577             http://www.perlmonks.org/index.pl?node_id=230667 for more information.
578              
579             =head1 BUGS
580              
581             There are no known bugs, but that's no guarantee.
582              
583             Address bug reports and comments to: E<lt>eop_divo_sitruc@yahoo.comE<gt>. When
584             sending bug reports, please provide the version of C<HTML::Parser>,
585             C<HTML::TokeParser>, C<HTML::TokeParser::Simple>, the version of Perl, and the
586             version of the operating system you are using.
587              
588             Reverse the name to email the author.
589              
590             =head1 SUBCLASSING
591              
592             You may wish to change the behavior of this module. You probably do not want
593             to subclass C<HTML::TokeParser::Simple>. Instead, you'll want to subclass one
594             of the token classes. C<HTML::TokeParser::Simple::Token> is the base class for
595             all tokens. Global behavioral changes should go there. Otherwise, see the
596             appropriate token class for the behavior you wish to alter.
597              
598             =head1 SEE ALSO
599              
600             L<HTML::TokeParser::Simple::Token>
601              
602             L<HTML::TokeParser::Simple::Token::Tag>
603              
604             L<HTML::TokeParser::Simple::Token::Text>
605              
606             L<HTML::TokeParser::Simple::Token::Comment>
607              
608             L<HTML::TokeParser::Simple::Token::Declaration>
609              
610             L<HTML::TokeParser::Simple::Token::ProcessInstruction>
611              
612             =head1 COPYRIGHT
613              
614             Copyright (c) 2004 by Curtis "Ovid" Poe. All rights reserved. This program is
615             free software; you may redistribute it and/or modify it under the same terms as
616             Perl itself
617              
618             =head1 AUTHOR
619              
620             Curtis "Ovid" Poe E<lt>eop_divo_sitruc@yahoo.comE<gt>
621              
622             Reverse the name to email the author.
623              
624             =cut