File Coverage

blib/lib/HTML/PullParser/Nested.pm

Criterion	Covered	Total	%
statement	138	138	100.0
branch	73	80	91.2
condition	17	24	70.8
subroutine	13	13	100.0
pod	6	6	100.0
total	247	261	94.6

line	stmt	bran	cond	sub	pod	time	code
1							# $Id: Nested.pm 4647 2010-03-09 18:10:10Z chris $
2
3							=head1 NAME
4
5							HTML::PullParser::Nested - Wrapper around HTML::PullParser with awareness of tag nesting.
6
7
8							=head1 SYNOPSIS
9
10							use HTML::PullParser::Nested;
11
12							my $p = HTML::PullParser::Nested->new(
13							doc => \ "... abcd efgh wvyz ... 1 2 9 ",
14							start => "'S',tagname,attr,attrseq,text",
15							end => "'E',tagname,text",
16							text => "'T',text,is_cdata",
17							);
18
19							while (my $token = $p->get_token()) {
20							if ($token->[0] eq "S" && $token->[1] eq "ul") {
21							$p->push_nest($token);
22							print "List:\n";
23							while (my $token = $p->get_token()) {
24							if ($token->[0] eq "S" && $token->[1] eq "li") {
25							print $p->get_token()->[1], "\n";
26							}
27							}
28							print "\n";
29							$p->pop_nest();
30							}
31							}
32
33
34							=head1 DESCRIPTION
35
36							This class is a wrapper around HTML::PullParser with awareness of the nesting
37							of tags.
38
39							There is a cursor, which points to the current position within the
40							document. It should be thought of as pointing to the start of the
41							next token, or to 'EOL' (eof of level).
42
43							Tokens can be read sequentially, and the cursor will be advanced after
44							each read. They can also be unread, reversing any effects of their having
45							been read.
46
47							As noted, the class is aware of tag nesting, giving the concept of
48							nesting levels. Level 1 encompasses the whole document. As any point
49							a new nesting level can be pushed on, specifying a tag type. In effect,
50							the parser then behaves as if it had instead been opened on a document
51							only containing the content up the closing tag. It is then possible to
52							pop a nesting level, which then moves the cursor to the start of the
53							closing tag and switches to the parent nesting level.
54
55							=cut
56
57
58							package HTML::PullParser::Nested;
59
60	1			1		823	use strict;
	1					2
	1					33
61	1			1		5	use warnings;
	1					2
	1					38
62
63							our $VERSION = '0.04';
64
65	1			1		5	use Carp;
	1					4
	1					85
66
67	1			1		906	use HTML::PullParser;
	1					13856
	1					2712
68
69							=head1 METHODS
70
71							=head2 new(file => $file, %options), new(doc => \$doc, %options)
72
73							Constructor. %options gets passed to the encapsulated HTML::PullParser
74							object and largely has the same restrictions. As HTML::PullParser::Nested
75							needs to be able to process tokens returned by HTML::PullParser, there are
76							some restrictions on the argspecs for each token type. Firstly, so that
77							the token type can be identified, either event, or distinct literal strings
78							must be present at the same array index in the argspec for each returned
79							token type. For start and end tags, tagname must also be present somewhere.
80
81							=head2 get_token()
82
83							Read and return the next token and advance the cursor. If the cursor
84							points to eol, undef will be returned on the first read attempt, and
85							an error raised thereafter.
86
87							=head2 unget_token(@tokens)
88
89							Reverse the effects of get_token().
90
91							=head2 eol()
92
93							End of level flag. Returns true after get_token() has returned undef to
94							signify end of level.
95
96							=head2 push_nest($token)
97
98							Push a new nesting level onto the stack. $token should be on start
99							tag. The current level will now correspond of all tags up to the
100							corresponding close tag.
101
102							The corresponding closing tag is determined by counting the start and
103							end tags of the current nesting level. This means that if
104
105
106
107
108
109
110
111
112
113							is encountered whilst the current nesting level is tracking tags,
114							the parser will end either end up 3 tags deeper or at the same depth
115							depending whether push_nest(), pop_nest() are called for the tag.
116
117							It is safe to call push_nest() twice for the same tag type.
118
119							=head2 pop_nest()
120
121							Pop a nesting level from the stack. Skips to the end of the current
122							nesting level if necessary.
123
124							=cut
125
126
127							sub new {
128	23			23	1	1400	my $class = shift;
129	23					90	my %args = @_;
130
131	23					40	my $self = {};
132
133	23					144	bless $self, $class;
134
135	23					91	$self->_parse_argspecs(%args);
136	18					165	$self->{'p'} = HTML::PullParser->new(%args);
137	18					1455	$self->{'nest'} = [{'tagname' => undef, 'depth' => 0}];
138
139	18					65	return $self;
140
141							}
142
143							sub push_nest {
144	9			9	1	61	my $self = shift;
145	9					12	my $token = shift;
146
147	9					20	my $tagname = $self->_canon_token($token)->[1];;
148
149	9					17	unshift @{$self->{'nest'}}, {'tagname' => $tagname, 'depth' => 0};
	9					47
150
151							}
152
153							sub pop_nest {
154	7			7	1	33	my $self = shift;
155
156	7					14	my $nest = $self->{'nest'}->[0];
157
158	7	100				9	if (scalar @{$self->{'nest'}} == 1) {
	7					24
159	1					185	croak "nesting level underflow";
160							}
161
162	6	100				17	if ($nest->{'depth'} >= 0) {
163	1					4	while ($self->get_token()) { }
164	1	50				5	die "Assert failed" unless ($nest->{'depth'} == -1);
165							}
166
167	6					8	shift @{$self->{'nest'}};
	6					15
168
169							}
170
171							sub eol {
172	53			53	1	302	my $self = shift;
173
174	53					191	my $nest = $self->{'nest'}->[0];
175
176	53					138	return $nest->{'depth'} == -1;
177
178							}
179
180							sub get_token {
181	114			114	1	610	my $self = shift;
182
183	114					177	my $nest = $self->{'nest'}->[0];
184
185	114	100				266	if ($nest->{'depth'} == -1) { croak "read past eol"; }
	12					4246
186
187	102					288	my $token = $self->{'p'}->get_token();
188	102					1230	my $canon = $self->_canon_token($token);
189
190	102	100				151	if (scalar @{$self->{'nest'}} == 1) {
	102					219
191	63	100				130	if (!defined $canon) {
192	14					21	$nest->{'depth'}--;
193	14	50				32	die "Assert failed" unless ($nest->{'depth'} == -1);
194							}
195							} else {
196	39	100				180	if (!defined $canon) {
		50
197	1					111	croak "tokens don't nest correctly";
198							} elsif ($canon->[0] =~ m/^(?:start\|end)$/) {
199	38	100				250	if ($canon->[1] eq $nest->{'tagname'}) {
200	22	100				45	if ($canon->[0] eq "start") {
201	9					15	$nest->{'depth'}++;
202							} else {
203	13					16	$nest->{'depth'}--;
204	13	100				36	if ($nest->{'depth'} == -1) {
205	9					31	$self->{'p'}->unget_token($token); # Leave token for parent level;
206	9					53	$token = undef; $canon = undef;
	9					13
207							}
208							}
209							}
210							}
211							}
212
213	101					307	return $token;
214
215							}
216
217							sub unget_token {
218	14			14	1	92	my $self = shift;
219
220	14					23	my $nest = $self->{'nest'}->[0];
221
222	14					34	while (@_) { # Need to recognise undef items in arg list.
223	17					38	my $token = shift;
224	17					33	my $canon = $self->_canon_token($token);
225	17	100				18	if (scalar @{$self->{'nest'}} == 1) {
	17					44
226	6	100				158	if (!defined $canon) {
227	3					5	$nest->{'depth'}++;
228	3	50				10	die "Assert failed" unless ($nest->{'depth'} == 0);
229							}
230							} else {
231	11	100				50	if (!defined $canon) {
		50
232	2					5	$nest->{'depth'}++;
233	2	50				7	die "Assert failed" unless ($nest->{'depth'} == 0);
234	2					9	next; # Don't want to add token back onto stack, that was done in get_token()
235
236							} elsif ($canon->[0] =~ m/^(?:start\|end)$/) {
237	9	100				24	if ($canon->[1] eq $nest->{'tagname'}) {
238	8	100				15	if ($canon->[0] eq "start") {
239	7					11	$nest->{'depth'}--;
240	7	100				18	if ($nest->{'depth'} == -1) {
241	1					121	croak "nesting tag underflow";
242							}
243							} else {
244	1					3	$nest->{'depth'}++;
245							}
246							}
247							}
248							}
249
250	14					50	$self->{'p'}->unget_token($token);
251
252							}
253
254							}
255
256							# HTML::PullParser allows the client to supply an argspec, specifying what data about a token should be
257							# returned by get_token(). We want to preserve this flexibility, but also need to be able to process
258							# start and end tags returned by get_token(). We therefore parse the argspecs supplied by the client
259							# to try to find a way to turn this format into a canonical token with argspec "event,tagname" for
260							# start / end tags (and "'other'" for other tokens).
261							sub _parse_argspecs {
262	23			23		36	my $self = shift;
263	23					68	my %args = @_;
264	23	100	66			139	if (!defined $args{'start'} \|\| !defined $args{'end'}) { croak "need argspec for start and end"; }
	1					148
265
266							# Firstly, for each token type, get the array index of (if present) event, tagname and literal string (plus the string content)
267	22					35	my $argspec_info = {};
268	22					43	foreach (qw(start end text process comment declaration)) {
269	132	100				333	if (defined $args{$_}) {
270	51					116	$argspec_info->{$_} = $self->_parse_argspec($args{$_});
271							}
272							}
273
274							# Now try to find an array index corresponding to either event or a literal string for each token type.
275	22					50	my $arg_idx = { map {$_ => -1} qw(event_idx string_idx) };
	44					118
276	22					85	foreach my $event (keys %$argspec_info) {
277	51					78	foreach ( qw(event_idx string_idx) ) {
278	102	100	100			3087	if (defined $arg_idx->{$_} && $arg_idx->{$_} == -1) {
		100	66
			66
279	44					132	$arg_idx->{$_} = $argspec_info->{$event}->{$_};
280							} elsif ( defined $arg_idx->{$_} && (!defined $argspec_info->{$event}->{$_} \|\| $arg_idx->{$_} != $argspec_info->{$event}->{$_}) ) {
281	1					3	$arg_idx->{$_} = undef;
282							}
283							}
284							}
285
286
287							# Finally, store the info require to identify each token type (and tag name).
288	22					70	$self->{'arg_info'} = {};
289
290							# We can now identidy the token type either by event, or by the literal string.
291	22	100				70	if (defined $arg_idx->{'event_idx'}) {
		100
292	2					6	$self->{'arg_info'}->{'event_idx'} = $arg_idx->{'event_idx'};
293							} elsif (defined $arg_idx->{'string_idx'}) {
294	18					39	my %strs = map {$argspec_info->{$_}->{'string'} => 1} keys %$argspec_info;
	39					140
295	18	100				61	if (keys %strs != keys %$argspec_info) { croak "'string' must be unique across all argspecs"; }
	1					148
296	17					38	$self->{'arg_info'}->{'string_idx'} = $arg_idx->{'string_idx'};
297	17					2471	$self->{'arg_info'}->{'start_string'} = $argspec_info->{'start'}->{'string'};
298	17					56	$self->{'arg_info'}->{'end_string'} = $argspec_info->{'end'}->{'string'};
299							} else {
300	2					434	croak "need either event or 'string' at a consistent index across all argspecs"
301							}
302
303							# For start and end tags, we also need the tagname.
304	19	100	66			94	if (defined $argspec_info->{'start'}->{'tagname_idx'} && defined defined $argspec_info->{'end'}->{'tagname_idx'} ) {
305	18					48	$self->{'arg_info'}->{'start_tagname_idx'} = $argspec_info->{'start'}->{'tagname_idx'};
306	18					113	$self->{'arg_info'}->{'end_tagname_idx'} = $argspec_info->{'end'}->{'tagname_idx'};
307							} else {
308	1					141	croak "need tagname in argspec for start and end tags";
309							}
310
311							}
312
313							# Get the array index of (if present) event, tagname and literal string (plus the string content)
314							sub _parse_argspec {
315	51			51		67	my $self = shift;
316	51					299	my @argspec = split(/,/, shift);
317
318	51					66	my $i;
319	51					56	my ($event_idx, $tagname_idx, $string_idx, $string);
320
321	51					137	for ($i = 0; $i < @argspec; $i++) {
322	192	100	66			2077	if ($argspec[$i] eq "event" && !defined $event_idx) {
		100	66
		100	66
323	9					28	$event_idx = $i;
324							} elsif ($argspec[$i] eq "tagname" && !defined $tagname_idx) {
325	42					115	$tagname_idx = $i;
326							} elsif ((my ($str) = $argspec[$i] =~ m/^'(.+)'$/) && !defined $string_idx) {
327	39					47	$string_idx = $i;
328	39					115	$string = $str;
329							}
330							}
331
332	51					360	return {'event_idx' => $event_idx, 'tagname_idx' => $tagname_idx, 'string_idx' => $string_idx, 'string' => $string};
333
334							}
335
336							# For start + end tags, return result in the form "event,tagname". For other tokens, uses the form "'other'"
337							sub _canon_token {
338	128			128		137	my $self = shift;
339	128					135	my $token = shift;
340	128					163	my $canon = [];
341
342	128	100				492	if (!defined $token) {
		100
		50
343	20					42	return undef;
344							} elsif (defined $self->{'arg_info'}->{'event_idx'}) {
345	5					12	$canon->[0] = $token->[$self->{'arg_info'}->{'event_idx'}];
346	5	100				22	if ($canon->[0] !~ m/^(?:start\|end)$/) { $canon->[0] = "other"; } # Flatten other token types to 'other' for consistency with detection based upon string.
	1					3
347							} elsif (defined $self->{'arg_info'}->{'string_idx'}) {
348	103	100				283	if ($token->[$self->{'arg_info'}->{'string_idx'}] eq $self->{'arg_info'}->{'start_string'}) {
		100
349	79					171	$canon->[0] = "start";
350							} elsif ($token->[$self->{'arg_info'}->{'string_idx'}] eq $self->{'arg_info'}->{'end_string'}) {
351	22					43	$canon->[0] = "end";
352							} else {
353	2					5	$canon->[0] = "other";
354							}
355							}
356
357	108	100				226	if ($canon->[0] eq "start") {
		100
358	82					179	$canon->[1] = $token->[$self->{'arg_info'}->{'start_tagname_idx'}];
359							} elsif ($canon->[0] eq "end") {
360	23					52	$canon->[1] = $token->[$self->{'arg_info'}->{'end_tagname_idx'}];
361							}
362
363	108					182	return $canon;
364							}
365
366							1;
367
368							=head1 SEE ALSO
369
370							L
371
372
373							=head1 AUTHOR
374
375							Christopher Key
376
377
378							=head1 COPYRIGHT AND LICENCE
379
380							Copyright (C) 2010 Christopher Key
381
382							This library is free software; you can redistribute it and/or modify
383							it under the same terms as Perl itself, either Perl version 5.8.4 or,
384							at your option, any later version of Perl 5 you may have available.
385
386							=cut