line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# $Id: Nested.pm 4647 2010-03-09 18:10:10Z chris $ |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
=head1 NAME |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
HTML::PullParser::Nested - Wrapper around HTML::PullParser with awareness of tag nesting. |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
=head1 SYNOPSIS |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
use HTML::PullParser::Nested; |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
my $p = HTML::PullParser::Nested->new( |
13
|
|
|
|
|
|
|
doc => \ "......", |
14
|
|
|
|
|
|
|
start => "'S',tagname,attr,attrseq,text", |
15
|
|
|
|
|
|
|
end => "'E',tagname,text", |
16
|
|
|
|
|
|
|
text => "'T',text,is_cdata", |
17
|
|
|
|
|
|
|
); |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
while (my $token = $p->get_token()) { |
20
|
|
|
|
|
|
|
if ($token->[0] eq "S" && $token->[1] eq "ul") { |
21
|
|
|
|
|
|
|
$p->push_nest($token); |
22
|
|
|
|
|
|
|
print "List:\n"; |
23
|
|
|
|
|
|
|
while (my $token = $p->get_token()) { |
24
|
|
|
|
|
|
|
if ($token->[0] eq "S" && $token->[1] eq "li") { |
25
|
|
|
|
|
|
|
print $p->get_token()->[1], "\n"; |
26
|
|
|
|
|
|
|
} |
27
|
|
|
|
|
|
|
} |
28
|
|
|
|
|
|
|
print "\n"; |
29
|
|
|
|
|
|
|
$p->pop_nest(); |
30
|
|
|
|
|
|
|
} |
31
|
|
|
|
|
|
|
} |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
=head1 DESCRIPTION |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
This class is a wrapper around HTML::PullParser with awareness of the nesting |
37
|
|
|
|
|
|
|
of tags. |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
There is a cursor, which points to the current position within the |
40
|
|
|
|
|
|
|
document. It should be thought of as pointing to the start of the |
41
|
|
|
|
|
|
|
next token, or to 'EOL' (eof of level). |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
Tokens can be read sequentially, and the cursor will be advanced after |
44
|
|
|
|
|
|
|
each read. They can also be unread, reversing any effects of their having |
45
|
|
|
|
|
|
|
been read. |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
As noted, the class is aware of tag nesting, giving the concept of |
48
|
|
|
|
|
|
|
nesting levels. Level 1 encompasses the whole document. As any point |
49
|
|
|
|
|
|
|
a new nesting level can be pushed on, specifying a tag type. In effect, |
50
|
|
|
|
|
|
|
the parser then behaves as if it had instead been opened on a document |
51
|
|
|
|
|
|
|
only containing the content up the closing tag. It is then possible to |
52
|
|
|
|
|
|
|
pop a nesting level, which then moves the cursor to the start of the |
53
|
|
|
|
|
|
|
closing tag and switches to the parent nesting level. |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
=cut |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
package HTML::PullParser::Nested; |
59
|
|
|
|
|
|
|
|
60
|
1
|
|
|
1
|
|
823
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
33
|
|
61
|
1
|
|
|
1
|
|
5
|
use warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
38
|
|
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
our $VERSION = '0.04'; |
64
|
|
|
|
|
|
|
|
65
|
1
|
|
|
1
|
|
5
|
use Carp; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
85
|
|
66
|
|
|
|
|
|
|
|
67
|
1
|
|
|
1
|
|
906
|
use HTML::PullParser; |
|
1
|
|
|
|
|
13856
|
|
|
1
|
|
|
|
|
2712
|
|
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
=head1 METHODS |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
=head2 new(file => $file, %options), new(doc => \$doc, %options) |
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
Constructor. %options gets passed to the encapsulated HTML::PullParser |
74
|
|
|
|
|
|
|
object and largely has the same restrictions. As HTML::PullParser::Nested |
75
|
|
|
|
|
|
|
needs to be able to process tokens returned by HTML::PullParser, there are |
76
|
|
|
|
|
|
|
some restrictions on the argspecs for each token type. Firstly, so that |
77
|
|
|
|
|
|
|
the token type can be identified, either event, or distinct literal strings |
78
|
|
|
|
|
|
|
must be present at the same array index in the argspec for each returned |
79
|
|
|
|
|
|
|
token type. For start and end tags, tagname must also be present somewhere. |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
=head2 get_token() |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
Read and return the next token and advance the cursor. If the cursor |
84
|
|
|
|
|
|
|
points to eol, undef will be returned on the first read attempt, and |
85
|
|
|
|
|
|
|
an error raised thereafter. |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
=head2 unget_token(@tokens) |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
Reverse the effects of get_token(). |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
=head2 eol() |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
End of level flag. Returns true after get_token() has returned undef to |
94
|
|
|
|
|
|
|
signify end of level. |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
=head2 push_nest($token) |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
Push a new nesting level onto the stack. $token should be on start |
99
|
|
|
|
|
|
|
tag. The current level will now correspond of all tags up to the |
100
|
|
|
|
|
|
|
corresponding close tag. |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
The corresponding closing tag is determined by counting the start and |
103
|
|
|
|
|
|
|
end tags of the current nesting level. This means that if |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
is encountered whilst the current nesting level is tracking tags, |
114
|
|
|
|
|
|
|
the parser will end either end up 3 tags deeper or at the same depth |
115
|
|
|
|
|
|
|
depending whether push_nest(), pop_nest() are called for the tag. |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
It is safe to call push_nest() twice for the same tag type. |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
=head2 pop_nest() |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
Pop a nesting level from the stack. Skips to the end of the current |
122
|
|
|
|
|
|
|
nesting level if necessary. |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
=cut |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
sub new { |
128
|
23
|
|
|
23
|
1
|
1400
|
my $class = shift; |
129
|
23
|
|
|
|
|
90
|
my %args = @_; |
130
|
|
|
|
|
|
|
|
131
|
23
|
|
|
|
|
40
|
my $self = {}; |
132
|
|
|
|
|
|
|
|
133
|
23
|
|
|
|
|
144
|
bless $self, $class; |
134
|
|
|
|
|
|
|
|
135
|
23
|
|
|
|
|
91
|
$self->_parse_argspecs(%args); |
136
|
18
|
|
|
|
|
165
|
$self->{'p'} = HTML::PullParser->new(%args); |
137
|
18
|
|
|
|
|
1455
|
$self->{'nest'} = [{'tagname' => undef, 'depth' => 0}]; |
138
|
|
|
|
|
|
|
|
139
|
18
|
|
|
|
|
65
|
return $self; |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
} |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
sub push_nest { |
144
|
9
|
|
|
9
|
1
|
61
|
my $self = shift; |
145
|
9
|
|
|
|
|
12
|
my $token = shift; |
146
|
|
|
|
|
|
|
|
147
|
9
|
|
|
|
|
20
|
my $tagname = $self->_canon_token($token)->[1];; |
148
|
|
|
|
|
|
|
|
149
|
9
|
|
|
|
|
17
|
unshift @{$self->{'nest'}}, {'tagname' => $tagname, 'depth' => 0}; |
|
9
|
|
|
|
|
47
|
|
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
} |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
sub pop_nest { |
154
|
7
|
|
|
7
|
1
|
33
|
my $self = shift; |
155
|
|
|
|
|
|
|
|
156
|
7
|
|
|
|
|
14
|
my $nest = $self->{'nest'}->[0]; |
157
|
|
|
|
|
|
|
|
158
|
7
|
100
|
|
|
|
9
|
if (scalar @{$self->{'nest'}} == 1) { |
|
7
|
|
|
|
|
24
|
|
159
|
1
|
|
|
|
|
185
|
croak "nesting level underflow"; |
160
|
|
|
|
|
|
|
} |
161
|
|
|
|
|
|
|
|
162
|
6
|
100
|
|
|
|
17
|
if ($nest->{'depth'} >= 0) { |
163
|
1
|
|
|
|
|
4
|
while ($self->get_token()) { } |
164
|
1
|
50
|
|
|
|
5
|
die "Assert failed" unless ($nest->{'depth'} == -1); |
165
|
|
|
|
|
|
|
} |
166
|
|
|
|
|
|
|
|
167
|
6
|
|
|
|
|
8
|
shift @{$self->{'nest'}}; |
|
6
|
|
|
|
|
15
|
|
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
} |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
sub eol { |
172
|
53
|
|
|
53
|
1
|
302
|
my $self = shift; |
173
|
|
|
|
|
|
|
|
174
|
53
|
|
|
|
|
191
|
my $nest = $self->{'nest'}->[0]; |
175
|
|
|
|
|
|
|
|
176
|
53
|
|
|
|
|
138
|
return $nest->{'depth'} == -1; |
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
} |
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
sub get_token { |
181
|
114
|
|
|
114
|
1
|
610
|
my $self = shift; |
182
|
|
|
|
|
|
|
|
183
|
114
|
|
|
|
|
177
|
my $nest = $self->{'nest'}->[0]; |
184
|
|
|
|
|
|
|
|
185
|
114
|
100
|
|
|
|
266
|
if ($nest->{'depth'} == -1) { croak "read past eol"; } |
|
12
|
|
|
|
|
4246
|
|
186
|
|
|
|
|
|
|
|
187
|
102
|
|
|
|
|
288
|
my $token = $self->{'p'}->get_token(); |
188
|
102
|
|
|
|
|
1230
|
my $canon = $self->_canon_token($token); |
189
|
|
|
|
|
|
|
|
190
|
102
|
100
|
|
|
|
151
|
if (scalar @{$self->{'nest'}} == 1) { |
|
102
|
|
|
|
|
219
|
|
191
|
63
|
100
|
|
|
|
130
|
if (!defined $canon) { |
192
|
14
|
|
|
|
|
21
|
$nest->{'depth'}--; |
193
|
14
|
50
|
|
|
|
32
|
die "Assert failed" unless ($nest->{'depth'} == -1); |
194
|
|
|
|
|
|
|
} |
195
|
|
|
|
|
|
|
} else { |
196
|
39
|
100
|
|
|
|
180
|
if (!defined $canon) { |
|
|
50
|
|
|
|
|
|
197
|
1
|
|
|
|
|
111
|
croak "tokens don't nest correctly"; |
198
|
|
|
|
|
|
|
} elsif ($canon->[0] =~ m/^(?:start|end)$/) { |
199
|
38
|
100
|
|
|
|
250
|
if ($canon->[1] eq $nest->{'tagname'}) { |
200
|
22
|
100
|
|
|
|
45
|
if ($canon->[0] eq "start") { |
201
|
9
|
|
|
|
|
15
|
$nest->{'depth'}++; |
202
|
|
|
|
|
|
|
} else { |
203
|
13
|
|
|
|
|
16
|
$nest->{'depth'}--; |
204
|
13
|
100
|
|
|
|
36
|
if ($nest->{'depth'} == -1) { |
205
|
9
|
|
|
|
|
31
|
$self->{'p'}->unget_token($token); # Leave token for parent level; |
206
|
9
|
|
|
|
|
53
|
$token = undef; $canon = undef; |
|
9
|
|
|
|
|
13
|
|
207
|
|
|
|
|
|
|
} |
208
|
|
|
|
|
|
|
} |
209
|
|
|
|
|
|
|
} |
210
|
|
|
|
|
|
|
} |
211
|
|
|
|
|
|
|
} |
212
|
|
|
|
|
|
|
|
213
|
101
|
|
|
|
|
307
|
return $token; |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
} |
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
sub unget_token { |
218
|
14
|
|
|
14
|
1
|
92
|
my $self = shift; |
219
|
|
|
|
|
|
|
|
220
|
14
|
|
|
|
|
23
|
my $nest = $self->{'nest'}->[0]; |
221
|
|
|
|
|
|
|
|
222
|
14
|
|
|
|
|
34
|
while (@_) { # Need to recognise undef items in arg list. |
223
|
17
|
|
|
|
|
38
|
my $token = shift; |
224
|
17
|
|
|
|
|
33
|
my $canon = $self->_canon_token($token); |
225
|
17
|
100
|
|
|
|
18
|
if (scalar @{$self->{'nest'}} == 1) { |
|
17
|
|
|
|
|
44
|
|
226
|
6
|
100
|
|
|
|
158
|
if (!defined $canon) { |
227
|
3
|
|
|
|
|
5
|
$nest->{'depth'}++; |
228
|
3
|
50
|
|
|
|
10
|
die "Assert failed" unless ($nest->{'depth'} == 0); |
229
|
|
|
|
|
|
|
} |
230
|
|
|
|
|
|
|
} else { |
231
|
11
|
100
|
|
|
|
50
|
if (!defined $canon) { |
|
|
50
|
|
|
|
|
|
232
|
2
|
|
|
|
|
5
|
$nest->{'depth'}++; |
233
|
2
|
50
|
|
|
|
7
|
die "Assert failed" unless ($nest->{'depth'} == 0); |
234
|
2
|
|
|
|
|
9
|
next; # Don't want to add token back onto stack, that was done in get_token() |
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
} elsif ($canon->[0] =~ m/^(?:start|end)$/) { |
237
|
9
|
100
|
|
|
|
24
|
if ($canon->[1] eq $nest->{'tagname'}) { |
238
|
8
|
100
|
|
|
|
15
|
if ($canon->[0] eq "start") { |
239
|
7
|
|
|
|
|
11
|
$nest->{'depth'}--; |
240
|
7
|
100
|
|
|
|
18
|
if ($nest->{'depth'} == -1) { |
241
|
1
|
|
|
|
|
121
|
croak "nesting tag underflow"; |
242
|
|
|
|
|
|
|
} |
243
|
|
|
|
|
|
|
} else { |
244
|
1
|
|
|
|
|
3
|
$nest->{'depth'}++; |
245
|
|
|
|
|
|
|
} |
246
|
|
|
|
|
|
|
} |
247
|
|
|
|
|
|
|
} |
248
|
|
|
|
|
|
|
} |
249
|
|
|
|
|
|
|
|
250
|
14
|
|
|
|
|
50
|
$self->{'p'}->unget_token($token); |
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
} |
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
} |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
# HTML::PullParser allows the client to supply an argspec, specifying what data about a token should be |
257
|
|
|
|
|
|
|
# returned by get_token(). We want to preserve this flexibility, but also need to be able to process |
258
|
|
|
|
|
|
|
# start and end tags returned by get_token(). We therefore parse the argspecs supplied by the client |
259
|
|
|
|
|
|
|
# to try to find a way to turn this format into a canonical token with argspec "event,tagname" for |
260
|
|
|
|
|
|
|
# start / end tags (and "'other'" for other tokens). |
261
|
|
|
|
|
|
|
sub _parse_argspecs { |
262
|
23
|
|
|
23
|
|
36
|
my $self = shift; |
263
|
23
|
|
|
|
|
68
|
my %args = @_; |
264
|
23
|
100
|
66
|
|
|
139
|
if (!defined $args{'start'} || !defined $args{'end'}) { croak "need argspec for start and end"; } |
|
1
|
|
|
|
|
148
|
|
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
# Firstly, for each token type, get the array index of (if present) event, tagname and literal string (plus the string content) |
267
|
22
|
|
|
|
|
35
|
my $argspec_info = {}; |
268
|
22
|
|
|
|
|
43
|
foreach (qw(start end text process comment declaration)) { |
269
|
132
|
100
|
|
|
|
333
|
if (defined $args{$_}) { |
270
|
51
|
|
|
|
|
116
|
$argspec_info->{$_} = $self->_parse_argspec($args{$_}); |
271
|
|
|
|
|
|
|
} |
272
|
|
|
|
|
|
|
} |
273
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
# Now try to find an array index corresponding to either event or a literal string for each token type. |
275
|
22
|
|
|
|
|
50
|
my $arg_idx = { map {$_ => -1} qw(event_idx string_idx) }; |
|
44
|
|
|
|
|
118
|
|
276
|
22
|
|
|
|
|
85
|
foreach my $event (keys %$argspec_info) { |
277
|
51
|
|
|
|
|
78
|
foreach ( qw(event_idx string_idx) ) { |
278
|
102
|
100
|
100
|
|
|
3087
|
if (defined $arg_idx->{$_} && $arg_idx->{$_} == -1) { |
|
|
100
|
66
|
|
|
|
|
|
|
|
66
|
|
|
|
|
279
|
44
|
|
|
|
|
132
|
$arg_idx->{$_} = $argspec_info->{$event}->{$_}; |
280
|
|
|
|
|
|
|
} elsif ( defined $arg_idx->{$_} && (!defined $argspec_info->{$event}->{$_} || $arg_idx->{$_} != $argspec_info->{$event}->{$_}) ) { |
281
|
1
|
|
|
|
|
3
|
$arg_idx->{$_} = undef; |
282
|
|
|
|
|
|
|
} |
283
|
|
|
|
|
|
|
} |
284
|
|
|
|
|
|
|
} |
285
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
# Finally, store the info require to identify each token type (and tag name). |
288
|
22
|
|
|
|
|
70
|
$self->{'arg_info'} = {}; |
289
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
# We can now identidy the token type either by event, or by the literal string. |
291
|
22
|
100
|
|
|
|
70
|
if (defined $arg_idx->{'event_idx'}) { |
|
|
100
|
|
|
|
|
|
292
|
2
|
|
|
|
|
6
|
$self->{'arg_info'}->{'event_idx'} = $arg_idx->{'event_idx'}; |
293
|
|
|
|
|
|
|
} elsif (defined $arg_idx->{'string_idx'}) { |
294
|
18
|
|
|
|
|
39
|
my %strs = map {$argspec_info->{$_}->{'string'} => 1} keys %$argspec_info; |
|
39
|
|
|
|
|
140
|
|
295
|
18
|
100
|
|
|
|
61
|
if (keys %strs != keys %$argspec_info) { croak "'string' must be unique across all argspecs"; } |
|
1
|
|
|
|
|
148
|
|
296
|
17
|
|
|
|
|
38
|
$self->{'arg_info'}->{'string_idx'} = $arg_idx->{'string_idx'}; |
297
|
17
|
|
|
|
|
2471
|
$self->{'arg_info'}->{'start_string'} = $argspec_info->{'start'}->{'string'}; |
298
|
17
|
|
|
|
|
56
|
$self->{'arg_info'}->{'end_string'} = $argspec_info->{'end'}->{'string'}; |
299
|
|
|
|
|
|
|
} else { |
300
|
2
|
|
|
|
|
434
|
croak "need either event or 'string' at a consistent index across all argspecs" |
301
|
|
|
|
|
|
|
} |
302
|
|
|
|
|
|
|
|
303
|
|
|
|
|
|
|
# For start and end tags, we also need the tagname. |
304
|
19
|
100
|
66
|
|
|
94
|
if (defined $argspec_info->{'start'}->{'tagname_idx'} && defined defined $argspec_info->{'end'}->{'tagname_idx'} ) { |
305
|
18
|
|
|
|
|
48
|
$self->{'arg_info'}->{'start_tagname_idx'} = $argspec_info->{'start'}->{'tagname_idx'}; |
306
|
18
|
|
|
|
|
113
|
$self->{'arg_info'}->{'end_tagname_idx'} = $argspec_info->{'end'}->{'tagname_idx'}; |
307
|
|
|
|
|
|
|
} else { |
308
|
1
|
|
|
|
|
141
|
croak "need tagname in argspec for start and end tags"; |
309
|
|
|
|
|
|
|
} |
310
|
|
|
|
|
|
|
|
311
|
|
|
|
|
|
|
} |
312
|
|
|
|
|
|
|
|
313
|
|
|
|
|
|
|
# Get the array index of (if present) event, tagname and literal string (plus the string content) |
314
|
|
|
|
|
|
|
sub _parse_argspec { |
315
|
51
|
|
|
51
|
|
67
|
my $self = shift; |
316
|
51
|
|
|
|
|
299
|
my @argspec = split(/,/, shift); |
317
|
|
|
|
|
|
|
|
318
|
51
|
|
|
|
|
66
|
my $i; |
319
|
51
|
|
|
|
|
56
|
my ($event_idx, $tagname_idx, $string_idx, $string); |
320
|
|
|
|
|
|
|
|
321
|
51
|
|
|
|
|
137
|
for ($i = 0; $i < @argspec; $i++) { |
322
|
192
|
100
|
66
|
|
|
2077
|
if ($argspec[$i] eq "event" && !defined $event_idx) { |
|
|
100
|
66
|
|
|
|
|
|
|
100
|
66
|
|
|
|
|
323
|
9
|
|
|
|
|
28
|
$event_idx = $i; |
324
|
|
|
|
|
|
|
} elsif ($argspec[$i] eq "tagname" && !defined $tagname_idx) { |
325
|
42
|
|
|
|
|
115
|
$tagname_idx = $i; |
326
|
|
|
|
|
|
|
} elsif ((my ($str) = $argspec[$i] =~ m/^'(.+)'$/) && !defined $string_idx) { |
327
|
39
|
|
|
|
|
47
|
$string_idx = $i; |
328
|
39
|
|
|
|
|
115
|
$string = $str; |
329
|
|
|
|
|
|
|
} |
330
|
|
|
|
|
|
|
} |
331
|
|
|
|
|
|
|
|
332
|
51
|
|
|
|
|
360
|
return {'event_idx' => $event_idx, 'tagname_idx' => $tagname_idx, 'string_idx' => $string_idx, 'string' => $string}; |
333
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
} |
335
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
# For start + end tags, return result in the form "event,tagname". For other tokens, uses the form "'other'" |
337
|
|
|
|
|
|
|
sub _canon_token { |
338
|
128
|
|
|
128
|
|
137
|
my $self = shift; |
339
|
128
|
|
|
|
|
135
|
my $token = shift; |
340
|
128
|
|
|
|
|
163
|
my $canon = []; |
341
|
|
|
|
|
|
|
|
342
|
128
|
100
|
|
|
|
492
|
if (!defined $token) { |
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
343
|
20
|
|
|
|
|
42
|
return undef; |
344
|
|
|
|
|
|
|
} elsif (defined $self->{'arg_info'}->{'event_idx'}) { |
345
|
5
|
|
|
|
|
12
|
$canon->[0] = $token->[$self->{'arg_info'}->{'event_idx'}]; |
346
|
5
|
100
|
|
|
|
22
|
if ($canon->[0] !~ m/^(?:start|end)$/) { $canon->[0] = "other"; } # Flatten other token types to 'other' for consistency with detection based upon string. |
|
1
|
|
|
|
|
3
|
|
347
|
|
|
|
|
|
|
} elsif (defined $self->{'arg_info'}->{'string_idx'}) { |
348
|
103
|
100
|
|
|
|
283
|
if ($token->[$self->{'arg_info'}->{'string_idx'}] eq $self->{'arg_info'}->{'start_string'}) { |
|
|
100
|
|
|
|
|
|
349
|
79
|
|
|
|
|
171
|
$canon->[0] = "start"; |
350
|
|
|
|
|
|
|
} elsif ($token->[$self->{'arg_info'}->{'string_idx'}] eq $self->{'arg_info'}->{'end_string'}) { |
351
|
22
|
|
|
|
|
43
|
$canon->[0] = "end"; |
352
|
|
|
|
|
|
|
} else { |
353
|
2
|
|
|
|
|
5
|
$canon->[0] = "other"; |
354
|
|
|
|
|
|
|
} |
355
|
|
|
|
|
|
|
} |
356
|
|
|
|
|
|
|
|
357
|
108
|
100
|
|
|
|
226
|
if ($canon->[0] eq "start") { |
|
|
100
|
|
|
|
|
|
358
|
82
|
|
|
|
|
179
|
$canon->[1] = $token->[$self->{'arg_info'}->{'start_tagname_idx'}]; |
359
|
|
|
|
|
|
|
} elsif ($canon->[0] eq "end") { |
360
|
23
|
|
|
|
|
52
|
$canon->[1] = $token->[$self->{'arg_info'}->{'end_tagname_idx'}]; |
361
|
|
|
|
|
|
|
} |
362
|
|
|
|
|
|
|
|
363
|
108
|
|
|
|
|
182
|
return $canon; |
364
|
|
|
|
|
|
|
} |
365
|
|
|
|
|
|
|
|
366
|
|
|
|
|
|
|
1; |
367
|
|
|
|
|
|
|
|
368
|
|
|
|
|
|
|
=head1 SEE ALSO |
369
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
L |
371
|
|
|
|
|
|
|
|
372
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
=head1 AUTHOR |
374
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
Christopher Key |
376
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
|
378
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENCE |
379
|
|
|
|
|
|
|
|
380
|
|
|
|
|
|
|
Copyright (C) 2010 Christopher Key |
381
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or modify |
383
|
|
|
|
|
|
|
it under the same terms as Perl itself, either Perl version 5.8.4 or, |
384
|
|
|
|
|
|
|
at your option, any later version of Perl 5 you may have available. |
385
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
=cut |