File Coverage

blib/lib/HTML/PullParser/Nested.pm
Criterion Covered Total %
statement 138 138 100.0
branch 73 80 91.2
condition 17 24 70.8
subroutine 13 13 100.0
pod 6 6 100.0
total 247 261 94.6


line stmt bran cond sub pod time code
1             # $Id: Nested.pm 4647 2010-03-09 18:10:10Z chris $
2              
3             =head1 NAME
4              
5             HTML::PullParser::Nested - Wrapper around HTML::PullParser with awareness of tag nesting.
6              
7              
8             =head1 SYNOPSIS
9              
10             use HTML::PullParser::Nested;
11              
12             my $p = HTML::PullParser::Nested->new(
13             doc => \ "...
  • abcd
  • efgh
  • wvyz
...
  • 1
  • 2
  • 9
",
14             start => "'S',tagname,attr,attrseq,text",
15             end => "'E',tagname,text",
16             text => "'T',text,is_cdata",
17             );
18            
19             while (my $token = $p->get_token()) {
20             if ($token->[0] eq "S" && $token->[1] eq "ul") {
21             $p->push_nest($token);
22             print "List:\n";
23             while (my $token = $p->get_token()) {
24             if ($token->[0] eq "S" && $token->[1] eq "li") {
25             print $p->get_token()->[1], "\n";
26             }
27             }
28             print "\n";
29             $p->pop_nest();
30             }
31             }
32              
33              
34             =head1 DESCRIPTION
35              
36             This class is a wrapper around HTML::PullParser with awareness of the nesting
37             of tags.
38              
39             There is a cursor, which points to the current position within the
40             document. It should be thought of as pointing to the start of the
41             next token, or to 'EOL' (eof of level).
42              
43             Tokens can be read sequentially, and the cursor will be advanced after
44             each read. They can also be unread, reversing any effects of their having
45             been read.
46              
47             As noted, the class is aware of tag nesting, giving the concept of
48             nesting levels. Level 1 encompasses the whole document. As any point
49             a new nesting level can be pushed on, specifying a tag type. In effect,
50             the parser then behaves as if it had instead been opened on a document
51             only containing the content up the closing tag. It is then possible to
52             pop a nesting level, which then moves the cursor to the start of the
53             closing tag and switches to the parent nesting level.
54              
55             =cut
56              
57              
58             package HTML::PullParser::Nested;
59              
60 1     1   823 use strict;
  1         2  
  1         33  
61 1     1   5 use warnings;
  1         2  
  1         38  
62              
63             our $VERSION = '0.04';
64              
65 1     1   5 use Carp;
  1         4  
  1         85  
66              
67 1     1   906 use HTML::PullParser;
  1         13856  
  1         2712  
68              
69             =head1 METHODS
70              
71             =head2 new(file => $file, %options), new(doc => \$doc, %options)
72              
73             Constructor. %options gets passed to the encapsulated HTML::PullParser
74             object and largely has the same restrictions. As HTML::PullParser::Nested
75             needs to be able to process tokens returned by HTML::PullParser, there are
76             some restrictions on the argspecs for each token type. Firstly, so that
77             the token type can be identified, either event, or distinct literal strings
78             must be present at the same array index in the argspec for each returned
79             token type. For start and end tags, tagname must also be present somewhere.
80              
81             =head2 get_token()
82              
83             Read and return the next token and advance the cursor. If the cursor
84             points to eol, undef will be returned on the first read attempt, and
85             an error raised thereafter.
86              
87             =head2 unget_token(@tokens)
88              
89             Reverse the effects of get_token().
90              
91             =head2 eol()
92              
93             End of level flag. Returns true after get_token() has returned undef to
94             signify end of level.
95              
96             =head2 push_nest($token)
97              
98             Push a new nesting level onto the stack. $token should be on start
99             tag. The current level will now correspond of all tags up to the
100             corresponding close tag.
101              
102             The corresponding closing tag is determined by counting the start and
103             end tags of the current nesting level. This means that if
104              
105            
106            
107            
108            
109            
110            
111            
112              
113             is encountered whilst the current nesting level is tracking tags,
114             the parser will end either end up 3 tags deeper or at the same depth
115             depending whether push_nest(), pop_nest() are called for the tag.
116              
117             It is safe to call push_nest() twice for the same tag type.
118              
119             =head2 pop_nest()
120              
121             Pop a nesting level from the stack. Skips to the end of the current
122             nesting level if necessary.
123              
124             =cut
125              
126              
127             sub new {
128 23     23 1 1400 my $class = shift;
129 23         90 my %args = @_;
130              
131 23         40 my $self = {};
132              
133 23         144 bless $self, $class;
134              
135 23         91 $self->_parse_argspecs(%args);
136 18         165 $self->{'p'} = HTML::PullParser->new(%args);
137 18         1455 $self->{'nest'} = [{'tagname' => undef, 'depth' => 0}];
138            
139 18         65 return $self;
140              
141             }
142              
143             sub push_nest {
144 9     9 1 61 my $self = shift;
145 9         12 my $token = shift;
146              
147 9         20 my $tagname = $self->_canon_token($token)->[1];;
148              
149 9         17 unshift @{$self->{'nest'}}, {'tagname' => $tagname, 'depth' => 0};
  9         47  
150              
151             }
152              
153             sub pop_nest {
154 7     7 1 33 my $self = shift;
155              
156 7         14 my $nest = $self->{'nest'}->[0];
157              
158 7 100       9 if (scalar @{$self->{'nest'}} == 1) {
  7         24  
159 1         185 croak "nesting level underflow";
160             }
161              
162 6 100       17 if ($nest->{'depth'} >= 0) {
163 1         4 while ($self->get_token()) { }
164 1 50       5 die "Assert failed" unless ($nest->{'depth'} == -1);
165             }
166              
167 6         8 shift @{$self->{'nest'}};
  6         15  
168              
169             }
170              
171             sub eol {
172 53     53 1 302 my $self = shift;
173              
174 53         191 my $nest = $self->{'nest'}->[0];
175              
176 53         138 return $nest->{'depth'} == -1;
177              
178             }
179              
180             sub get_token {
181 114     114 1 610 my $self = shift;
182              
183 114         177 my $nest = $self->{'nest'}->[0];
184              
185 114 100       266 if ($nest->{'depth'} == -1) { croak "read past eol"; }
  12         4246  
186              
187 102         288 my $token = $self->{'p'}->get_token();
188 102         1230 my $canon = $self->_canon_token($token);
189              
190 102 100       151 if (scalar @{$self->{'nest'}} == 1) {
  102         219  
191 63 100       130 if (!defined $canon) {
192 14         21 $nest->{'depth'}--;
193 14 50       32 die "Assert failed" unless ($nest->{'depth'} == -1);
194             }
195             } else {
196 39 100       180 if (!defined $canon) {
    50          
197 1         111 croak "tokens don't nest correctly";
198             } elsif ($canon->[0] =~ m/^(?:start|end)$/) {
199 38 100       250 if ($canon->[1] eq $nest->{'tagname'}) {
200 22 100       45 if ($canon->[0] eq "start") {
201 9         15 $nest->{'depth'}++;
202             } else {
203 13         16 $nest->{'depth'}--;
204 13 100       36 if ($nest->{'depth'} == -1) {
205 9         31 $self->{'p'}->unget_token($token); # Leave token for parent level;
206 9         53 $token = undef; $canon = undef;
  9         13  
207             }
208             }
209             }
210             }
211             }
212              
213 101         307 return $token;
214              
215             }
216              
217             sub unget_token {
218 14     14 1 92 my $self = shift;
219              
220 14         23 my $nest = $self->{'nest'}->[0];
221              
222 14         34 while (@_) { # Need to recognise undef items in arg list.
223 17         38 my $token = shift;
224 17         33 my $canon = $self->_canon_token($token);
225 17 100       18 if (scalar @{$self->{'nest'}} == 1) {
  17         44  
226 6 100       158 if (!defined $canon) {
227 3         5 $nest->{'depth'}++;
228 3 50       10 die "Assert failed" unless ($nest->{'depth'} == 0);
229             }
230             } else {
231 11 100       50 if (!defined $canon) {
    50          
232 2         5 $nest->{'depth'}++;
233 2 50       7 die "Assert failed" unless ($nest->{'depth'} == 0);
234 2         9 next; # Don't want to add token back onto stack, that was done in get_token()
235            
236             } elsif ($canon->[0] =~ m/^(?:start|end)$/) {
237 9 100       24 if ($canon->[1] eq $nest->{'tagname'}) {
238 8 100       15 if ($canon->[0] eq "start") {
239 7         11 $nest->{'depth'}--;
240 7 100       18 if ($nest->{'depth'} == -1) {
241 1         121 croak "nesting tag underflow";
242             }
243             } else {
244 1         3 $nest->{'depth'}++;
245             }
246             }
247             }
248             }
249              
250 14         50 $self->{'p'}->unget_token($token);
251              
252             }
253              
254             }
255              
256             # HTML::PullParser allows the client to supply an argspec, specifying what data about a token should be
257             # returned by get_token(). We want to preserve this flexibility, but also need to be able to process
258             # start and end tags returned by get_token(). We therefore parse the argspecs supplied by the client
259             # to try to find a way to turn this format into a canonical token with argspec "event,tagname" for
260             # start / end tags (and "'other'" for other tokens).
261             sub _parse_argspecs {
262 23     23   36 my $self = shift;
263 23         68 my %args = @_;
264 23 100 66     139 if (!defined $args{'start'} || !defined $args{'end'}) { croak "need argspec for start and end"; }
  1         148  
265              
266             # Firstly, for each token type, get the array index of (if present) event, tagname and literal string (plus the string content)
267 22         35 my $argspec_info = {};
268 22         43 foreach (qw(start end text process comment declaration)) {
269 132 100       333 if (defined $args{$_}) {
270 51         116 $argspec_info->{$_} = $self->_parse_argspec($args{$_});
271             }
272             }
273              
274             # Now try to find an array index corresponding to either event or a literal string for each token type.
275 22         50 my $arg_idx = { map {$_ => -1} qw(event_idx string_idx) };
  44         118  
276 22         85 foreach my $event (keys %$argspec_info) {
277 51         78 foreach ( qw(event_idx string_idx) ) {
278 102 100 100     3087 if (defined $arg_idx->{$_} && $arg_idx->{$_} == -1) {
    100 66        
      66        
279 44         132 $arg_idx->{$_} = $argspec_info->{$event}->{$_};
280             } elsif ( defined $arg_idx->{$_} && (!defined $argspec_info->{$event}->{$_} || $arg_idx->{$_} != $argspec_info->{$event}->{$_}) ) {
281 1         3 $arg_idx->{$_} = undef;
282             }
283             }
284             }
285              
286              
287             # Finally, store the info require to identify each token type (and tag name).
288 22         70 $self->{'arg_info'} = {};
289              
290             # We can now identidy the token type either by event, or by the literal string.
291 22 100       70 if (defined $arg_idx->{'event_idx'}) {
    100          
292 2         6 $self->{'arg_info'}->{'event_idx'} = $arg_idx->{'event_idx'};
293             } elsif (defined $arg_idx->{'string_idx'}) {
294 18         39 my %strs = map {$argspec_info->{$_}->{'string'} => 1} keys %$argspec_info;
  39         140  
295 18 100       61 if (keys %strs != keys %$argspec_info) { croak "'string' must be unique across all argspecs"; }
  1         148  
296 17         38 $self->{'arg_info'}->{'string_idx'} = $arg_idx->{'string_idx'};
297 17         2471 $self->{'arg_info'}->{'start_string'} = $argspec_info->{'start'}->{'string'};
298 17         56 $self->{'arg_info'}->{'end_string'} = $argspec_info->{'end'}->{'string'};
299             } else {
300 2         434 croak "need either event or 'string' at a consistent index across all argspecs"
301             }
302              
303             # For start and end tags, we also need the tagname.
304 19 100 66     94 if (defined $argspec_info->{'start'}->{'tagname_idx'} && defined defined $argspec_info->{'end'}->{'tagname_idx'} ) {
305 18         48 $self->{'arg_info'}->{'start_tagname_idx'} = $argspec_info->{'start'}->{'tagname_idx'};
306 18         113 $self->{'arg_info'}->{'end_tagname_idx'} = $argspec_info->{'end'}->{'tagname_idx'};
307             } else {
308 1         141 croak "need tagname in argspec for start and end tags";
309             }
310              
311             }
312              
313             # Get the array index of (if present) event, tagname and literal string (plus the string content)
314             sub _parse_argspec {
315 51     51   67 my $self = shift;
316 51         299 my @argspec = split(/,/, shift);
317              
318 51         66 my $i;
319 51         56 my ($event_idx, $tagname_idx, $string_idx, $string);
320              
321 51         137 for ($i = 0; $i < @argspec; $i++) {
322 192 100 66     2077 if ($argspec[$i] eq "event" && !defined $event_idx) {
    100 66        
    100 66        
323 9         28 $event_idx = $i;
324             } elsif ($argspec[$i] eq "tagname" && !defined $tagname_idx) {
325 42         115 $tagname_idx = $i;
326             } elsif ((my ($str) = $argspec[$i] =~ m/^'(.+)'$/) && !defined $string_idx) {
327 39         47 $string_idx = $i;
328 39         115 $string = $str;
329             }
330             }
331              
332 51         360 return {'event_idx' => $event_idx, 'tagname_idx' => $tagname_idx, 'string_idx' => $string_idx, 'string' => $string};
333              
334             }
335              
336             # For start + end tags, return result in the form "event,tagname". For other tokens, uses the form "'other'"
337             sub _canon_token {
338 128     128   137 my $self = shift;
339 128         135 my $token = shift;
340 128         163 my $canon = [];
341              
342 128 100       492 if (!defined $token) {
    100          
    50          
343 20         42 return undef;
344             } elsif (defined $self->{'arg_info'}->{'event_idx'}) {
345 5         12 $canon->[0] = $token->[$self->{'arg_info'}->{'event_idx'}];
346 5 100       22 if ($canon->[0] !~ m/^(?:start|end)$/) { $canon->[0] = "other"; } # Flatten other token types to 'other' for consistency with detection based upon string.
  1         3  
347             } elsif (defined $self->{'arg_info'}->{'string_idx'}) {
348 103 100       283 if ($token->[$self->{'arg_info'}->{'string_idx'}] eq $self->{'arg_info'}->{'start_string'}) {
    100          
349 79         171 $canon->[0] = "start";
350             } elsif ($token->[$self->{'arg_info'}->{'string_idx'}] eq $self->{'arg_info'}->{'end_string'}) {
351 22         43 $canon->[0] = "end";
352             } else {
353 2         5 $canon->[0] = "other";
354             }
355             }
356              
357 108 100       226 if ($canon->[0] eq "start") {
    100          
358 82         179 $canon->[1] = $token->[$self->{'arg_info'}->{'start_tagname_idx'}];
359             } elsif ($canon->[0] eq "end") {
360 23         52 $canon->[1] = $token->[$self->{'arg_info'}->{'end_tagname_idx'}];
361             }
362              
363 108         182 return $canon;
364             }
365              
366             1;
367              
368             =head1 SEE ALSO
369              
370             L
371              
372              
373             =head1 AUTHOR
374              
375             Christopher Key
376              
377              
378             =head1 COPYRIGHT AND LICENCE
379              
380             Copyright (C) 2010 Christopher Key
381              
382             This library is free software; you can redistribute it and/or modify
383             it under the same terms as Perl itself, either Perl version 5.8.4 or,
384             at your option, any later version of Perl 5 you may have available.
385              
386             =cut