line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
=head1 NAME
|
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
XML::TokeParser - Simplified interface to XML::Parser
|
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
=head1 SYNOPSIS
|
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
use XML::TokeParser;
|
8
|
|
|
|
|
|
|
#
|
9
|
|
|
|
|
|
|
#parse from file
|
10
|
|
|
|
|
|
|
my $p = XML::TokeParser->new('file.xml');
|
11
|
|
|
|
|
|
|
#
|
12
|
|
|
|
|
|
|
#parse from open handle
|
13
|
|
|
|
|
|
|
open IN, 'file.xml' or die $!;
|
14
|
|
|
|
|
|
|
my $p = XML::TokeParser->new( \*IN, Noempty => 1 );
|
15
|
|
|
|
|
|
|
#
|
16
|
|
|
|
|
|
|
#parse literal text
|
17
|
|
|
|
|
|
|
my $text = 'text';
|
18
|
|
|
|
|
|
|
my $p = XML::TokeParser->new( \$text, Namespaces => 1 );
|
19
|
|
|
|
|
|
|
#
|
20
|
|
|
|
|
|
|
#read next token
|
21
|
|
|
|
|
|
|
my $token = $p->get_token();
|
22
|
|
|
|
|
|
|
#
|
23
|
|
|
|
|
|
|
#skip to and read text
|
24
|
|
|
|
|
|
|
$p->get_tag('title');
|
25
|
|
|
|
|
|
|
$p->get_text();
|
26
|
|
|
|
|
|
|
#
|
27
|
|
|
|
|
|
|
#read text of next <para>, ignoring any internal markup
|
28
|
|
|
|
|
|
|
$p->get_tag('para');
|
29
|
|
|
|
|
|
|
$p->get_trimmed_text('/para');
|
30
|
|
|
|
|
|
|
#
|
31
|
|
|
|
|
|
|
#process if interesting text
|
32
|
|
|
|
|
|
|
$t = $p->get_tag('para');
|
33
|
|
|
|
|
|
|
$p->begin_saving($t);
|
34
|
|
|
|
|
|
|
if ( $p->get_trimmed_text('/para') =~ /interesting stuff/ ) {
|
35
|
|
|
|
|
|
|
$p->restore_saved();
|
36
|
|
|
|
|
|
|
process_para($p);
|
37
|
|
|
|
|
|
|
}
|
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
=head1 DESCRIPTION
|
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
XML::TokeParser provides a procedural ("pull mode") interface to XML::Parser
|
42
|
|
|
|
|
|
|
in much the same way that Gisle Aas' HTML::TokeParser provides a procedural
|
43
|
|
|
|
|
|
|
interface to HTML::Parser. XML::TokeParser splits its XML input up into
|
44
|
|
|
|
|
|
|
"tokens," each corresponding to an XML::Parser event.
|
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
A token is a B<bless>'d reference to an array whose first element is an event-type
|
47
|
|
|
|
|
|
|
string and whose last element is the literal text of the XML input that
|
48
|
|
|
|
|
|
|
generated the event, with intermediate elements varying according to the
|
49
|
|
|
|
|
|
|
event type.
|
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
Each token is an I<object> of type L<XML::TokeParser::Token>.
|
52
|
|
|
|
|
|
|
Read
|
53
|
|
|
|
|
|
|
L<"XML::TokeParser::Token"|"XML::TokeParser::Token">
|
54
|
|
|
|
|
|
|
to learn what methods are available for inspecting the token,
|
55
|
|
|
|
|
|
|
and retrieving data from it.
|
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
=cut
|
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
package XML::TokeParser;
|
60
|
|
|
|
|
|
|
|
61
|
2
|
|
|
2
|
|
16849
|
use strict;
|
|
2
|
|
|
|
|
6
|
|
|
2
|
|
|
|
|
103
|
|
62
|
2
|
|
|
2
|
|
12
|
use vars qw($VERSION);
|
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
6007
|
|
63
|
2
|
|
|
2
|
|
15
|
use Carp;# qw( carp croak );
|
|
2
|
|
|
|
|
8
|
|
|
2
|
|
|
|
|
173
|
|
64
|
2
|
|
|
2
|
|
4251
|
use XML::Parser;
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
$VERSION = '0.05';
|
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
=head1 METHODS
|
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
=over 4
|
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
=item $p = XML::TokeParser->new($input, [options])
|
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
Creates a new parser, specifying the input source and any options. If
|
75
|
|
|
|
|
|
|
$input is a string, it is the name of the file to parse. If $input is a
|
76
|
|
|
|
|
|
|
reference to a string, that string is the actual text to parse. If $input
|
77
|
|
|
|
|
|
|
is a reference to a typeglob or an IO::Handle object corresponding to an
|
78
|
|
|
|
|
|
|
open file or socket, the text read from the handle will be parsed.
|
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
Options are name=>value pairs and can be any of the following:
|
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
=over 4
|
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
=item Namespaces
|
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
If set to a true value, namespace processing is enabled.
|
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
=item ParseParamEnt
|
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
This option is passed on to the underlying XML::Parser object; see that
|
91
|
|
|
|
|
|
|
module's documentation for details.
|
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
=item Noempty
|
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
If set to a true value, text tokens consisting of only whitespace (such as
|
96
|
|
|
|
|
|
|
those created by indentation and line breaks in between tags) will be
|
97
|
|
|
|
|
|
|
ignored.
|
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
=item Latin
|
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
If set to a true value, all text other than the literal text elements of
|
102
|
|
|
|
|
|
|
tokens will be translated into the ISO 8859-1 (Latin-1) character encoding
|
103
|
|
|
|
|
|
|
rather than the normal UTF-8 encoding.
|
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
=item Catalog
|
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
The value is the URI of a catalog file used to resolve PUBLIC and SYSTEM
|
108
|
|
|
|
|
|
|
identifiers. See XML::Catalog for details.
|
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
=back
|
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
=cut
|
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
# Constructor: XML::TokeParser->new( $input, %options ).
#
# $input is a file name (plain string), a reference to a string holding the
# XML text, or a GLOB / IO::* reference to an already open handle.
# The Noempty, Latin and Catalog options are consumed here; every other
# option is passed straight to XML::Parser->new.
#
# Returns the new object, or undef when a named file cannot be opened.
# Croaks if the underlying parser (or the catalog) cannot be created.
sub new {
    my $class  = shift;
    my $source = shift;
    my %args   = ( Noempty => 0, Latin => 0, Catalog => 0, @_ );
    my $self   = { output => [], EOF => 0 };
    $self->{noempty} = delete $args{Noempty};
    $self->{latin}   = delete $args{Latin};
    my $catname = delete $args{Catalog};
    my $parser  = XML::Parser->new(%args) or croak "$!";
    $parser->setHandlers(
        Start   => \&start,
        End     => \&end,
        Char    => \&char,
        Proc    => \&proc,
        Comment => \&comment
    );

    if ($catname) {
        require XML::Catalog;
        my $catalog = XML::Catalog->new($catname) or croak "$!";
        $parser->setHandlers( ExternEnt => $catalog->get_handler($parser) );
    }
    $self->{parser} = $parser->parse_start( TokeParser => $self ) or croak "$!";

    # The handlers find this object through $parser->{TokeParser}, while this
    # object holds the parser in {parser}: a reference cycle that would keep
    # both alive (and DESTROY from ever running) after the caller drops the
    # object.  Weakening the back-reference breaks the cycle; the handlers
    # only run from inside our own methods, when $self is necessarily alive.
    require Scalar::Util;
    Scalar::Util::weaken( $self->{parser}{TokeParser} );

    if ( ref($source) eq 'SCALAR' ) {

        # Literal XML text, consumed in chunks starting at offset 0.
        $self->{src}        = $source;
        $self->{src_offset} = 0;
    }
    elsif ( ref($source) =~ /^IO:|^GLOB$/ ) {

        # Caller-owned handle: read from it but never close it.
        $self->{srcfile} = $source;
    }
    else {

        # File name: we open it, so DESTROY is responsible for closing it.
        require IO::File;
        $self->{srcfile} = IO::File->new( $source, 'r' ) or return undef;
        $self->{opened} = 1;
    }
    return bless $self, $class;
}
|
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
# Destructor: closes the source handle, but only when this object opened it
# itself (the {opened} flag), then drops the reference to the parser.
sub DESTROY {
    my ($self) = @_;
    my $handle = $self->{srcfile};
    $handle->close() if $handle && $self->{opened};
    $self->{parser} = undef;
}
|
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
=item $token = $p->get_token()
|
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
Returns the next token, as an array reference, from the input. Returns
|
162
|
|
|
|
|
|
|
undef if there are no remaining tokens.
|
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
=cut
|
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
# Returns the next token blessed into XML::TokeParser::Token, or nothing
# (undef / empty list) when no tokens remain.
#
# With the Noempty option set, text tokens made only of whitespace are
# skipped.  While saving is active ({savebuff} exists) a shallow copy of
# every returned token is appended to the save buffer.
sub get_token {
    local $_;
    my ($self) = @_;

    my $token;
    while (1) {
        $self->parsechunks();
        $token = shift @{ $self->{output} };
        last unless $self->{noempty};
        last unless $token;
        last unless $token->[0] eq 'T' && $token->[1] =~ /^\s*$/;
    }

    return() unless defined $token;

    # Save a copy, so later in-place edits (get_tag strips the type code)
    # do not alter what restore_saved() will replay.
    push @{ $self->{savebuff} }, [@$token] if exists $self->{savebuff};

    return bless $token, 'XML::TokeParser::Token';
}
|
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
=item $p->unget_token($token,...)
|
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
Pushes tokens back so they will be re-read. Useful if you've read one or
|
191
|
|
|
|
|
|
|
more tokens too far. Correctly handles "partial" tokens returned by
|
192
|
|
|
|
|
|
|
get_tag().
|
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
=cut
|
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
# Pushes the given tokens back onto the front of the output queue so they
# are read again, preserving their order.
#
# "Partial" tokens as returned by get_tag() (type code stripped) are
# rebuilt first: a 4-element token whose second element is a hash becomes
# an 'S' token; a 2-element token whose name starts with '/' becomes an
# 'E' token with the slash removed.
sub unget_token {
    my ( $self, @tokens ) = @_;
    while ( my $token = pop @tokens ) {
        my $size = @$token;
        if ( $size == 4 && ref( $token->[1] ) eq 'HASH' ) {
            $token = [ 'S', @$token ];
        }
        elsif ( $size == 2 && substr( $token->[0], 0, 1 ) eq '/' ) {
            my ( $name, $raw ) = @$token;
            $token = [ 'E', substr( $name, 1 ), $raw ];
        }
        unshift @{ $self->{output} }, $token;
    }
}
|
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
=item $token = $p->get_tag( [$token] )
|
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
If no argument given, skips tokens until the next start tag or end tag
|
213
|
|
|
|
|
|
|
token. If an argument is given, skips tokens until the start tag or end tag
|
214
|
|
|
|
|
|
|
(if the argument begins with '/') for the named element. The returned
|
215
|
|
|
|
|
|
|
token does not include an event type code; its first element is the element
|
216
|
|
|
|
|
|
|
name, prefixed by a '/' if the token is for an end tag.
|
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
=cut
|
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
# Skips forward to the next start or end tag and returns it as a "partial"
# token: the type code is removed, and the name of an end tag is prefixed
# with '/'.  With $tag given, keeps skipping until the (slash-prefixed, for
# end tags) name equals $tag.  Returns undef when the input runs out.
sub get_tag {
    my ( $self, $tag ) = @_;
    my $token;
  TOKEN:
    while ( $token = $self->get_token() ) {
        my $type = shift @$token;
        next TOKEN if $type !~ /[SE]/;
        $token->[0] = '/' . $token->[0] if $type eq 'E';
        last TOKEN if !defined($tag) || $token->[0] eq $tag;
    }
    return $token;
}
|
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
=item $text = $p->get_text( [$token] )
|
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
If no argument given, returns the text at the current position, or an empty
|
236
|
|
|
|
|
|
|
string if the next token is not a 'T' token. If an argument is given,
|
237
|
|
|
|
|
|
|
gathers up all text between the current position and the specified start or
|
238
|
|
|
|
|
|
|
end tag, stripping out any intervening tags (much like the way a typical
|
239
|
|
|
|
|
|
|
Web browser deals with unknown tags).
|
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
=cut
|
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
# Collects text from the current position.
#
# Without $tag: stops at the first start tag, end tag or processing
# instruction.  With $tag: concatenates all text up to the named start tag
# (or end tag, when $tag begins with '/'), ignoring other tags; a
# processing instruction still stops the scan.  Comments are skipped.
# The token that stopped the scan is pushed back, and removed from the save
# buffer again if saving is active.
sub get_text {
    my ( $self, $tag ) = @_;
    my $text = "";
    my $token;
    while ( $token = $self->get_token() ) {
        my $type = $token->[0];
        if ( $type eq 'T' ) {
            $text .= $token->[1];
            next;
        }
        if ( $type =~ /[SE]/ ) {
            my $name = $type eq 'E' ? "/$token->[1]" : $token->[1];
            last if !defined($tag) || $name eq $tag;
            next;
        }
        last if $type eq 'PI';
    }
    if ($token) {
        $self->unget_token($token);
        pop @{ $self->{savebuff} } if exists $self->{savebuff};
    }
    return $text;
}
|
267
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
=item $text = $p->get_trimmed_text( [$token] )
|
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
Like get_text(), but deletes any leading or trailing whitespaces and
|
272
|
|
|
|
|
|
|
collapses multiple whitespace (including newlines) into single spaces.
|
273
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
=cut
|
275
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
# Same as get_text(), but with leading and trailing whitespace removed and
# every internal run of whitespace collapsed to a single space.
sub get_trimmed_text {
    my ( $self, @args ) = @_;
    my $text = $self->get_text(@args);
    for ($text) {
        s/^\s+//;
        s/\s+$//;
        s/\s+/ /g;
    }
    return $text;
}
|
284
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
=item $p->begin_saving( [$token] )
|
287
|
|
|
|
|
|
|
|
288
|
|
|
|
|
|
|
Causes subsequent calls to get_token(), get_tag(), get_text(), and
|
289
|
|
|
|
|
|
|
get_trimmed_text() to save the returned tokens. In conjunction with
|
290
|
|
|
|
|
|
|
restore_saved(), allows you to "back up" within a token stream. If an
|
291
|
|
|
|
|
|
|
argument is supplied, it is placed at the beginning of the list of saved
|
292
|
|
|
|
|
|
|
tokens (useful because you often won't know you want to begin saving until
|
293
|
|
|
|
|
|
|
you've already read the first token you want saved).
|
294
|
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
=cut
|
296
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
# Starts (or restarts) saving: any previous save buffer is discarded and a
# fresh one is created, seeded with the tokens passed as arguments, if any.
sub begin_saving {
    my ( $self, @seed ) = @_;
    $self->{savebuff} = [];
    push @{ $self->{savebuff} }, @seed if @seed;
}
|
303
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
=item $p->restore_saved()
|
306
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
Pushes all the tokens saved by begin_saving() back onto the token stream.
|
308
|
|
|
|
|
|
|
Stops saving tokens. To cancel saving without backing up, call
|
309
|
|
|
|
|
|
|
begin_saving() and restore_saved() in succession.
|
310
|
|
|
|
|
|
|
|
311
|
|
|
|
|
|
|
=back
|
312
|
|
|
|
|
|
|
|
313
|
|
|
|
|
|
|
=cut
|
314
|
|
|
|
|
|
|
|
315
|
|
|
|
|
|
|
# Replays every saved token back onto the token stream and stops saving.
# Does nothing when saving is not active.
sub restore_saved {
    my ($self) = @_;
    if ( exists $self->{savebuff} ) {
        my @saved = @{ $self->{savebuff} };
        $self->unget_token(@saved);
        delete $self->{savebuff};
    }
}
|
322
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
=for comment
|
325
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
=cut
|
327
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
# Feeds the parser until at least one complete token is queued.
#
# Input is read in 4096-byte chunks, from the in-memory string ({src},
# tracked by {src_offset}) or from the handle in {srcfile}.  Reading
# continues while the queue is empty or ends in a text token (more text
# might still be appended to it) and end of input has not been reached.
# An empty chunk marks end of input: {EOF} is set and parse_done is called.
sub parsechunks {
    my ($self) = @_;
    while ( ( !@{ $self->{output} } || $self->{output}[-1][0] eq 'T' )
        && !$self->{EOF} )
    {

        # Must start empty on every pass: once a string source is exhausted
        # nothing is assigned below, and a buffer left over from the
        # previous pass would be parsed a second time (and never reach EOF).
        my $buf = '';

        if ( defined( $self->{src} ) ) {
            if ( $self->{src_offset} < length( ${ $self->{src} } ) ) {
                $buf = substr( ${ $self->{src} }, $self->{src_offset}, 4096 );
                $self->{src_offset} += 4096;
            }
        }
        else {
            read( $self->{srcfile}, $buf, 4096 );
        }

        if ( length($buf) == 0 ) {
            $self->{EOF} = 1;
            $self->{parser}->parse_done();
        }
        else {
            $self->{parser}->parse_more($buf);
        }
    }
}
|
357
|
|
|
|
|
|
|
|
358
|
|
|
|
|
|
|
|
359
|
|
|
|
|
|
|
=for comment Start handler
|
360
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
=cut
|
362
|
|
|
|
|
|
|
|
363
|
|
|
|
|
|
|
# Start-tag handler: queues
#   [ 'S', name, { attr => value, ... }, [ attr names in order ], raw text ]
# Element and attribute names go through nsname(), values through encode().
sub start {
    my ( $parser, $element, @attrs ) = @_;
    my $self = $parser->{TokeParser};

    my ( %value_of, @order );
    push @{ $self->{output} },
      [ 'S', $self->nsname($element), \%value_of, \@order,
        $parser->original_string() ];

    # @attrs is a flat name, value, name, value ... list.
    while (@attrs) {
        my $name  = $self->nsname( shift @attrs );
        my $value = $self->encode( shift @attrs );
        $value_of{$name} = $value;
        push @order, $name;
    }
}
|
376
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
|
378
|
|
|
|
|
|
|
=for comment End handler
|
379
|
|
|
|
|
|
|
|
380
|
|
|
|
|
|
|
=cut
|
381
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
# End-tag handler: queues [ 'E', name, raw text ].
sub end {
    my ( $parser, $element ) = @_;
    my $self  = $parser->{TokeParser};
    my @token = ( 'E', $self->nsname($element), $parser->original_string() );
    push @{ $self->{output} }, \@token;
}
|
388
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
=for comment Char handler
|
391
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
=cut
|
393
|
|
|
|
|
|
|
|
394
|
|
|
|
|
|
|
# Character-data handler.  Consecutive pieces of text are merged: when the
# newest queued token is already a 'T' token, both its decoded text and its
# raw text are extended; otherwise a new [ 'T', text, raw text ] is queued.
sub char {
    my ( $parser, $text ) = @_;
    my $self = $parser->{TokeParser};
    $text = $self->encode($text);

    my $queue  = $self->{output};
    my $newest = @$queue ? $queue->[-1] : undef;
    if ( $newest && $newest->[0] eq 'T' ) {
        $newest->[1]  .= $text;
        $newest->[-1] .= $parser->original_string();
    }
    else {
        push @$queue, [ 'T', $text, $parser->original_string() ];
    }
}
|
406
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
=for comment
|
409
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
=cut
|
411
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
# Processing-instruction handler: queues [ 'PI', target, data, raw text ].
sub proc {
    my ( $parser, $target, $value ) = @_;
    my $self  = $parser->{TokeParser};
    my @token = (
        "PI",
        $self->encode($target),
        $self->encode($value),
        $parser->original_string()
    );
    push @{ $self->{output} }, \@token;
}
|
421
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
=for comment Comment handler
|
424
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
=cut
|
426
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
# Comment handler: queues [ 'C', comment text, raw text ].
sub comment {
    my ( $parser, $text ) = @_;
    my $self  = $parser->{TokeParser};
    my @token = ( "C", $self->encode($text), $parser->original_string() );
    push @{ $self->{output} }, \@token;
}
|
433
|
|
|
|
|
|
|
|
434
|
|
|
|
|
|
|
|
435
|
|
|
|
|
|
|
=for comment nsname
|
436
|
|
|
|
|
|
|
figures out the Namespace if Namespaces is on
|
437
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
=cut
|
439
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
# Returns $name passed through encode().  When the parser was created with
# Namespaces enabled, the name is first prefixed with its namespace URI in
# curly brackets ("{uri}name"; "{}name" when it has no namespace).
sub nsname {
    my ( $self, $name ) = @_;
    my $parser = $self->{parser};
    if ( $parser->{Namespaces} ) {
        my $uri = $parser->namespace($name) || '';
        $name = '{' . $uri . '}' . $name;
    }
    return $self->encode($name);
}
|
449
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
|
451
|
|
|
|
|
|
|
=for comment
|
452
|
|
|
|
|
|
|
|
453
|
|
|
|
|
|
|
=cut
|
454
|
|
|
|
|
|
|
|
455
|
|
|
|
|
|
|
# Returns $text converted to ISO 8859-1 when the Latin option is set, and
# unchanged otherwise.
#
# Two input forms are handled:
#  * a byte string holding UTF-8: every two-byte sequence with a lead byte
#    0xC0-0xC3 (code points up to U+00FF) is folded into one Latin-1 byte;
#  * a character string (UTF-8 flag on): it is downgraded in place.  Running
#    the byte-level substitution on such a string would instead treat the
#    characters U+00C0-U+00C3 as lead bytes and corrupt the text.
# Characters above U+00FF have no Latin-1 form and are left as they are.
sub encode {
    my ( $self, $text ) = @_;
    return $text unless $self->{latin} && defined $text;

    if ( utf8::is_utf8($text) ) {

        # Second argument: do not die when a character does not fit.
        utf8::downgrade( $text, 1 );
    }
    else {
        $text =~ s{([\xc0-\xc3])([\x80-\xbf])}{
            chr( ( ( ord($1) & 0x03 ) << 6 ) | ( ord($2) & 0x3F ) )
        }ge;
    }
    return $text;
}
|
466
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
|
468
|
|
|
|
|
|
|
package XML::TokeParser::Token;
|
469
|
|
|
|
|
|
|
use strict;
|
470
|
|
|
|
|
|
|
|
471
|
|
|
|
|
|
|
=head2 XML::TokeParser::Token
|
472
|
|
|
|
|
|
|
|
473
|
|
|
|
|
|
|
A token is a blessed array reference,
|
474
|
|
|
|
|
|
|
that you acquire using C<$p-E<gt>get_token> or C<$p-E<gt>get_tag>,
|
475
|
|
|
|
|
|
|
and that might look like:
|
476
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
["S", $tag, $attr, $attrseq, $raw]
|
478
|
|
|
|
|
|
|
["E", $tag, $raw]
|
479
|
|
|
|
|
|
|
["T", $text, $raw]
|
480
|
|
|
|
|
|
|
["C", $text, $raw]
|
481
|
|
|
|
|
|
|
["PI", $target, $data, $raw]
|
482
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
If you don't like remembering array indices (you're a real programmer),
|
484
|
|
|
|
|
|
|
you may access the attributes of a token like:
|
485
|
|
|
|
|
|
|
|
486
|
|
|
|
|
|
|
C<$t-E<gt>tag>, C<$t-E<gt>attr>, C<$t-E<gt>attrseq>, C<$t-E<gt>raw>,
|
487
|
|
|
|
|
|
|
C<$t-E<gt>text>, C<$t-E<gt>target>, C<$t-E<gt>data>.
|
488
|
|
|
|
|
|
|
|
489
|
|
|
|
|
|
|
B<****Please note that this may change in the future,>
|
490
|
|
|
|
|
|
|
B
|
491
|
|
|
|
|
|
|
|
492
|
|
|
|
|
|
|
What kind of token is it?
|
493
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
To find out, inspect your token using any of these is_* methods
|
495
|
|
|
|
|
|
|
(1 == true, 0 == false, d'oh):
|
496
|
|
|
|
|
|
|
|
497
|
|
|
|
|
|
|
=over 4
|
498
|
|
|
|
|
|
|
|
499
|
|
|
|
|
|
|
=item is_text
|
500
|
|
|
|
|
|
|
|
501
|
|
|
|
|
|
|
=item is_comment
|
502
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
=item is_pi which is short for is_process_instruction
|
504
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
=item is_start_tag
|
506
|
|
|
|
|
|
|
|
507
|
|
|
|
|
|
|
=item is_end_tag
|
508
|
|
|
|
|
|
|
|
509
|
|
|
|
|
|
|
=item is_tag
|
510
|
|
|
|
|
|
|
|
511
|
|
|
|
|
|
|
=back
|
512
|
|
|
|
|
|
|
|
513
|
|
|
|
|
|
|
=cut
|
514
|
|
|
|
|
|
|
|
515
|
|
|
|
|
|
|
# test your token, but don't toke
|
516
|
|
|
|
|
|
|
#sub toke { croak "Don't toke!!!!"; }
|
517
|
|
|
|
|
|
|
|
518
|
|
|
|
|
|
|
# True (1) for a text token [ 'T', text, raw ], false (0) otherwise.
# The element count is checked as well, so that a type-stripped start tag
# from get_tag() for an element named "T" ( [ 'T', {attr}, [seq], raw ] )
# is not mistaken for text.
sub is_text {
    my ($token) = @_;
    return ( @$token == 3 && $token->[0] eq 'T' ) ? 1 : 0;
}
|
519
|
|
|
|
|
|
|
# True (1) for a comment token [ 'C', text, raw ], false (0) otherwise.
# The element count is checked as well, so that a type-stripped start tag
# from get_tag() for an element named "C" ( [ 'C', {attr}, [seq], raw ] )
# is not mistaken for a comment.
sub is_comment {
    my ($token) = @_;
    return ( @$token == 3 && $token->[0] eq 'C' ) ? 1 : 0;
}
|
520
|
|
|
|
|
|
|
# True (1) for a processing-instruction token [ 'PI', target, data, raw ],
# false (0) otherwise.  A type-stripped start tag from get_tag() for an
# element named "PI" has the same length but carries its attribute hash in
# the second slot, so that shape is excluded.
sub is_pi {
    my ($token) = @_;
    return 0 unless $token->[0] eq 'PI';
    return 0 if @$token == 4 && ref( $token->[1] ) eq 'HASH';
    return 1;
}
|
521
|
|
|
|
|
|
|
|
522
|
|
|
|
|
|
|
#sub is_process_instruction { goto &is_pi; }
|
523
|
|
|
|
|
|
|
# Long-form alias: is_process_instruction is made another name for is_pi by
# assigning one typeglob to the other.
{
    no strict;
    *is_process_instruction = *is_pi;
}
|
527
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
|
529
|
|
|
|
|
|
|
# $token->is_start_tag( [$name] )
#
# Returns 1 when the token is a start tag, and, if $name is given, only when
# it is the start tag of that element; returns 0 otherwise.
#
# Both token shapes are understood:
#   full    [ 'S', name, {attr}, [attrseq], raw ]   (from get_token)
#   partial [ name, {attr}, [attrseq], raw ]        (from get_tag)
# The partial shape is tested first, so its name is read from slot 0 and an
# element that happens to be called "S" or "E" is classified correctly.
sub is_start_tag {
    my ( $token, $name ) = @_;
    my $tag;
    if ( @$token == 4 && ref( $token->[1] ) eq 'HASH' ) {
        $tag = $token->[0];
    }
    elsif ( $token->[0] eq 'S' ) {
        $tag = $token->[1];
    }
    else {
        return 0;
    }
    return 1 unless defined $name;
    return $tag eq $name ? 1 : 0;
}
|
541
|
|
|
|
|
|
|
|
542
|
|
|
|
|
|
|
# $token->is_end_tag( [$name] )
#
# Returns 1 when the token is an end tag, and, if $name is given, only when
# it closes that element ($name is the bare element name, without '/');
# returns 0 otherwise.
#
# Both token shapes are understood:
#   full    [ 'E', name, raw ]     (from get_token)
#   partial [ '/name', raw ]       (from get_tag)
# For the partial shape the name is taken from slot 0 with the slash
# removed.  A partial start tag of an element named "E"
# ( [ 'E', {attr}, [attrseq], raw ] ) is not an end tag.
sub is_end_tag {
    my ( $token, $name ) = @_;
    my $size = @$token;
    my $tag;
    if ( $size == 2 && substr( $token->[0], 0, 1 ) eq '/' ) {
        $tag = substr( $token->[0], 1 );
    }
    elsif ( $token->[0] eq 'E'
        && !( $size == 4 && ref( $token->[1] ) eq 'HASH' ) )
    {
        $tag = $token->[1];
    }
    else {
        return 0;
    }
    return 1 unless defined $name;
    return $tag eq $name ? 1 : 0;
}
|
554
|
|
|
|
|
|
|
|
555
|
|
|
|
|
|
|
|
556
|
|
|
|
|
|
|
|
557
|
|
|
|
|
|
|
# $token->is_tag( [$name] )
#
# Returns 1 when the token is a start tag or an end tag, and, if $name is
# given, only when it belongs to that element ($name is the bare element
# name); returns 0 otherwise.
#
# Full tokens ( [ 'S', name, ... ] / [ 'E', name, raw ] ) carry the name in
# slot 1.  Partial tokens from get_tag() carry it in slot 0:
# [ name, {attr}, [attrseq], raw ] for a start tag and [ '/name', raw ] for
# an end tag.  The partial shapes are tested first so the right slot is used.
sub is_tag {
    my ( $token, $name ) = @_;
    my $size = @$token;
    my $tag;
    if ( $size == 4 && ref( $token->[1] ) eq 'HASH' ) {
        $tag = $token->[0];
    }
    elsif ( $size == 2 && substr( $token->[0], 0, 1 ) eq '/' ) {
        $tag = substr( $token->[0], 1 );
    }
    elsif ( $token->[0] eq 'S' || $token->[0] eq 'E' ) {
        $tag = $token->[1];
    }
    else {
        return 0;
    }
    return 1 unless defined $name;
    return $tag eq $name ? 1 : 0;
}
|
571
|
|
|
|
|
|
|
|
572
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
=pod
|
574
|
|
|
|
|
|
|
|
575
|
|
|
|
|
|
|
What's that token made of?
|
576
|
|
|
|
|
|
|
To retrieve data from your token, use any of the following methods,
|
577
|
|
|
|
|
|
|
depending on the kind of token you have:
|
578
|
|
|
|
|
|
|
|
579
|
|
|
|
|
|
|
=over 4
|
580
|
|
|
|
|
|
|
|
581
|
|
|
|
|
|
|
=item target
|
582
|
|
|
|
|
|
|
|
583
|
|
|
|
|
|
|
only for process instructions
|
584
|
|
|
|
|
|
|
|
585
|
|
|
|
|
|
|
=cut
|
586
|
|
|
|
|
|
|
|
587
|
|
|
|
|
|
|
# Target of a processing-instruction token (second element); for any other
# token the false result of is_pi is returned.
sub target {
    my ($token) = @_;
    my $is_pi = $token->is_pi;
    return $is_pi ? $token->[1] : $is_pi;
}
|
588
|
|
|
|
|
|
|
|
589
|
|
|
|
|
|
|
=item data
|
590
|
|
|
|
|
|
|
|
591
|
|
|
|
|
|
|
only for process instructions
|
592
|
|
|
|
|
|
|
|
593
|
|
|
|
|
|
|
=cut
|
594
|
|
|
|
|
|
|
|
595
|
|
|
|
|
|
|
# Data of a processing-instruction token (third element); for any other
# token the false result of is_pi is returned.
sub data {
    my ($token) = @_;
    my $is_pi = $token->is_pi;
    return $is_pi ? $token->[2] : $is_pi;
}
|
596
|
|
|
|
|
|
|
|
597
|
|
|
|
|
|
|
=item raw
|
598
|
|
|
|
|
|
|
|
599
|
|
|
|
|
|
|
for all tokens
|
600
|
|
|
|
|
|
|
|
601
|
|
|
|
|
|
|
=cut
|
602
|
|
|
|
|
|
|
|
603
|
|
|
|
|
|
|
# Literal source text of the token: always its last element, whatever the
# token type.
sub raw {
    my ($token) = @_;
    return $token->[-1];
}
|
604
|
|
|
|
|
|
|
|
605
|
|
|
|
|
|
|
|
606
|
|
|
|
|
|
|
=item attr
|
607
|
|
|
|
|
|
|
|
608
|
|
|
|
|
|
|
only for start tags, returns a hashref ( C<$t-E<gt>attr-E<gt>{href}> ).
|
609
|
|
|
|
|
|
|
|
610
|
|
|
|
|
|
|
=cut
|
611
|
|
|
|
|
|
|
|
612
|
|
|
|
|
|
|
#sub attr { return $_[0]->[2] if $_[0]->is_start_tag(); }
|
613
|
|
|
|
|
|
|
# Attribute hash of a start tag.  It is addressed from the end of the token
# (third from last), which is the same slot for full tokens and for the
# type-stripped tokens returned by get_tag().  For any other token the
# false result of is_start_tag is returned.
sub attr {
    my ($token) = @_;
    my $is_start = $token->is_start_tag();
    return $is_start ? $token->[-3] : $is_start;
}
|
614
|
|
|
|
|
|
|
|
615
|
|
|
|
|
|
|
=item my $attrseq = $t->attrseq
|
616
|
|
|
|
|
|
|
|
617
|
|
|
|
|
|
|
only for start tags, returns an array ref of the keys found in C<$t-E<gt>attr>
|
618
|
|
|
|
|
|
|
in the order they originally appeared in.
|
619
|
|
|
|
|
|
|
|
620
|
|
|
|
|
|
|
=cut
|
621
|
|
|
|
|
|
|
|
622
|
|
|
|
|
|
|
#sub attrseq { return $_[0]->[3] if $_[0]->is_start_tag(); }
|
623
|
|
|
|
|
|
|
# Attribute names of a start tag in document order.  The array is addressed
# from the end of the token (second from last), which is the same slot for
# full tokens and for the type-stripped tokens returned by get_tag().  For
# any other token the false result of is_start_tag is returned.
sub attrseq {
    my ($token) = @_;
    my $is_start = $token->is_start_tag();
    return $is_start ? $token->[-2] : $is_start;
}
|
624
|
|
|
|
|
|
|
|
625
|
|
|
|
|
|
|
#for S|E
|
626
|
|
|
|
|
|
|
|
627
|
|
|
|
|
|
|
|
628
|
|
|
|
|
|
|
=item my $tagname = $t->tag
|
629
|
|
|
|
|
|
|
|
630
|
|
|
|
|
|
|
only for tags ( C<$t-E<gt>tag if $t-E<gt>is_start_tag> ).
|
631
|
|
|
|
|
|
|
|
632
|
|
|
|
|
|
|
=cut
|
633
|
|
|
|
|
|
|
|
634
|
|
|
|
|
|
|
# Element name of a start-tag or end-tag token; 0 for any other token.
#
# Full tokens ( [ 'S', name, ... ] / [ 'E', name, raw ] ) hold the name in
# slot 1.  The type-stripped tokens returned by get_tag() hold it in slot 0
# ( [ name, {attr}, [attrseq], raw ] and [ '/name', raw ] ), so reading
# slot 1 there would hand back the attribute hash or the raw text.  For a
# partial end tag the leading '/' is removed, so the result is the bare
# element name for every shape.
sub tag {
    my ($token) = @_;
    my $size = @$token;
    return $token->[0]
      if $size == 4 && ref( $token->[1] ) eq 'HASH';
    return substr( $token->[0], 1 )
      if $size == 2 && substr( $token->[0], 0, 1 ) eq '/';
    return $token->[1]
      if $token->[0] eq 'S' || $token->[0] eq 'E';
    return 0;
}
|
635
|
|
|
|
|
|
|
|
636
|
|
|
|
|
|
|
=item my $text = $token->text
|
637
|
|
|
|
|
|
|
|
638
|
|
|
|
|
|
|
only for tokens of type text and comment
|
639
|
|
|
|
|
|
|
|
640
|
|
|
|
|
|
|
=back
|
641
|
|
|
|
|
|
|
|
642
|
|
|
|
|
|
|
=cut
|
643
|
|
|
|
|
|
|
|
644
|
|
|
|
|
|
|
# Decoded text of a text or comment token (second element); for any other
# token the false result of the type checks is returned.
sub text {
    my ($token) = @_;
    my $has_text = $token->is_text || $token->is_comment;
    return $has_text ? $token->[1] : $has_text;
}
|
645
|
|
|
|
|
|
|
|
646
|
|
|
|
|
|
|
1;
|
647
|
|
|
|
|
|
|
|
648
|
|
|
|
|
|
|
|
649
|
|
|
|
|
|
|
=pod
|
650
|
|
|
|
|
|
|
|
651
|
|
|
|
|
|
|
Here's more detailed info about the tokens.
|
652
|
|
|
|
|
|
|
|
653
|
|
|
|
|
|
|
=over 4
|
654
|
|
|
|
|
|
|
|
655
|
|
|
|
|
|
|
=item Start tag
|
656
|
|
|
|
|
|
|
|
657
|
|
|
|
|
|
|
The token has five elements: 'S', the element's name, a reference to a hash
|
658
|
|
|
|
|
|
|
of attribute values keyed by attribute names, a reference to an array of
|
659
|
|
|
|
|
|
|
attribute names in the order in which they appeared in the tag, and the
|
660
|
|
|
|
|
|
|
literal text.
|
661
|
|
|
|
|
|
|
|
662
|
|
|
|
|
|
|
=item End tag
|
663
|
|
|
|
|
|
|
|
664
|
|
|
|
|
|
|
The token has three elements: 'E', the element's name, and the literal text.
|
665
|
|
|
|
|
|
|
|
666
|
|
|
|
|
|
|
=item Character data (text)
|
667
|
|
|
|
|
|
|
|
668
|
|
|
|
|
|
|
The token has three elements: 'T', the parsed text, and the literal text.
|
669
|
|
|
|
|
|
|
All contiguous runs of text are gathered into single tokens; there will
|
670
|
|
|
|
|
|
|
never be two 'T' tokens in a row.
|
671
|
|
|
|
|
|
|
|
672
|
|
|
|
|
|
|
=item Comment
|
673
|
|
|
|
|
|
|
|
674
|
|
|
|
|
|
|
The token has three elements: 'C', the parsed text of the comment, and the
|
675
|
|
|
|
|
|
|
literal text.
|
676
|
|
|
|
|
|
|
|
677
|
|
|
|
|
|
|
=item Processing instruction
|
678
|
|
|
|
|
|
|
|
679
|
|
|
|
|
|
|
The token has four elements: 'PI', the target, the data, and the literal
|
680
|
|
|
|
|
|
|
text.
|
681
|
|
|
|
|
|
|
|
682
|
|
|
|
|
|
|
=back
|
683
|
|
|
|
|
|
|
|
684
|
|
|
|
|
|
|
The literal text includes any markup delimiters (pointy brackets, <![CDATA[, etc.), entity references, and numeric character references and
|
685
|
|
|
|
|
|
|
|
686
|
|
|
|
|
|
|
is in the XML document's original character encoding. All other text is in
|
687
|
|
|
|
|
|
|
UTF-8 (unless the Latin option is set, in which case it's in ISO-8859-1)
|
688
|
|
|
|
|
|
|
regardless of the original encoding, and all entity and character
|
689
|
|
|
|
|
|
|
references are expanded.
|
690
|
|
|
|
|
|
|
|
691
|
|
|
|
|
|
|
If the Namespaces option is set, element and attribute names are prefixed
|
692
|
|
|
|
|
|
|
by their (possibly empty) namespace URIs enclosed in curly brackets and
|
693
|
|
|
|
|
|
|
xmlns:* attributes do not appear in 'S' tokens.
|
694
|
|
|
|
|
|
|
|
695
|
|
|
|
|
|
|
=head1 DIFFERENCES FROM HTML::TokeParser
|
696
|
|
|
|
|
|
|
|
697
|
|
|
|
|
|
|
Uses a true XML parser rather than a modified HTML parser.
|
698
|
|
|
|
|
|
|
|
699
|
|
|
|
|
|
|
Text and comment tokens include extracted text as well as literal text.
|
700
|
|
|
|
|
|
|
|
701
|
|
|
|
|
|
|
PI tokens include target and data as well as literal text.
|
702
|
|
|
|
|
|
|
|
703
|
|
|
|
|
|
|
No tokens for declarations.
|
704
|
|
|
|
|
|
|
|
705
|
|
|
|
|
|
|
No "textify" hash.
|
706
|
|
|
|
|
|
|
|
707
|
|
|
|
|
|
|
unget_token correctly handles partial tokens returned by get_tag().
|
708
|
|
|
|
|
|
|
|
709
|
|
|
|
|
|
|
begin_saving() and restore_saved()
|
710
|
|
|
|
|
|
|
|
711
|
|
|
|
|
|
|
=head1 EXAMPLES
|
712
|
|
|
|
|
|
|
|
713
|
|
|
|
|
|
|
Example:
|
714
|
|
|
|
|
|
|
|
715
|
|
|
|
|
|
|
use XML::TokeParser;
|
716
|
|
|
|
|
|
|
use strict;
|
717
|
|
|
|
|
|
|
#
|
718
|
|
|
|
|
|
|
my $text = '<tag foo="bar" foy="floy"> some text <!--comment--></tag>';
|
719
|
|
|
|
|
|
|
my $p = XML::TokeParser->new( \$text );
|
720
|
|
|
|
|
|
|
#
|
721
|
|
|
|
|
|
|
print $/;
|
722
|
|
|
|
|
|
|
#
|
723
|
|
|
|
|
|
|
while( defined( my $t = $p->get_token() ) ){
|
724
|
|
|
|
|
|
|
local $\="\n";
|
725
|
|
|
|
|
|
|
print ' raw = ', $t->raw;
|
726
|
|
|
|
|
|
|
#
|
727
|
|
|
|
|
|
|
if( $t->tag ){
|
728
|
|
|
|
|
|
|
print ' tag = ', $t->tag;
|
729
|
|
|
|
|
|
|
#
|
730
|
|
|
|
|
|
|
if( $t->is_start_tag ) {
|
731
|
|
|
|
|
|
|
print ' attr = ', join ',', %{$t->attr};
|
732
|
|
|
|
|
|
|
print ' attrseq = ', join ',', @{$t->attrseq};
|
733
|
|
|
|
|
|
|
}
|
734
|
|
|
|
|
|
|
#
|
735
|
|
|
|
|
|
|
print 'is_tag ', $t->is_tag;
|
736
|
|
|
|
|
|
|
print 'is_start_tag ', $t->is_start_tag;
|
737
|
|
|
|
|
|
|
print 'is_end_tag ', $t->is_end_tag;
|
738
|
|
|
|
|
|
|
}
|
739
|
|
|
|
|
|
|
elsif( $t->is_pi ){
|
740
|
|
|
|
|
|
|
print ' target = ', $t->target;
|
741
|
|
|
|
|
|
|
print ' data = ', $t->data;
|
742
|
|
|
|
|
|
|
print 'is_pi ', $t->is_pi;
|
743
|
|
|
|
|
|
|
}
|
744
|
|
|
|
|
|
|
else {
|
745
|
|
|
|
|
|
|
print ' text = ', $t->text;
|
746
|
|
|
|
|
|
|
print 'is_text ', $t->is_text;
|
747
|
|
|
|
|
|
|
print 'is_comment ', $t->is_comment;
|
748
|
|
|
|
|
|
|
}
|
749
|
|
|
|
|
|
|
#
|
750
|
|
|
|
|
|
|
print $/;
|
751
|
|
|
|
|
|
|
}
|
752
|
|
|
|
|
|
|
__END__
|
753
|
|
|
|
|
|
|
|
754
|
|
|
|
|
|
|
|
755
|
|
|
|
|
|
|
Output:
|
756
|
|
|
|
|
|
|
|
757
|
|
|
|
|
|
|
raw = <tag foo="bar" foy="floy">
|
758
|
|
|
|
|
|
|
tag = tag
|
759
|
|
|
|
|
|
|
attr = foo,bar,foy,floy
|
760
|
|
|
|
|
|
|
attrseq = foo,foy
|
761
|
|
|
|
|
|
|
is_tag 1
|
762
|
|
|
|
|
|
|
is_start_tag 1
|
763
|
|
|
|
|
|
|
is_end_tag 0
|
764
|
|
|
|
|
|
|
|
765
|
|
|
|
|
|
|
|
766
|
|
|
|
|
|
|
raw = some text
|
767
|
|
|
|
|
|
|
text = some text
|
768
|
|
|
|
|
|
|
is_text 1
|
769
|
|
|
|
|
|
|
is_comment 0
|
770
|
|
|
|
|
|
|
|
771
|
|
|
|
|
|
|
|
772
|
|
|
|
|
|
|
raw = <!--comment-->
|
773
|
|
|
|
|
|
|
text = comment
|
774
|
|
|
|
|
|
|
is_text 0
|
775
|
|
|
|
|
|
|
is_comment 1
|
776
|
|
|
|
|
|
|
|
777
|
|
|
|
|
|
|
|
778
|
|
|
|
|
|
|
raw = </tag>
|
779
|
|
|
|
|
|
|
tag = tag
|
780
|
|
|
|
|
|
|
is_tag 1
|
781
|
|
|
|
|
|
|
is_start_tag 0
|
782
|
|
|
|
|
|
|
is_end_tag 1
|
783
|
|
|
|
|
|
|
|
784
|
|
|
|
|
|
|
|
785
|
|
|
|
|
|
|
|
786
|
|
|
|
|
|
|
=head1 BUGS
|
787
|
|
|
|
|
|
|
|
788
|
|
|
|
|
|
|
To report bugs, go to
|
789
|
|
|
|
|
|
|
E<lt>http://rt.cpan.org/NoAuth/Bugs.html?Dist=XML-TokeParserE<gt>
|
790
|
|
|
|
|
|
|
or send mail to E<lt>bug-XML-Tokeparser@rt.cpan.orgE<gt>
|
791
|
|
|
|
|
|
|
|
792
|
|
|
|
|
|
|
=head1 AUTHOR
|
793
|
|
|
|
|
|
|
|
794
|
|
|
|
|
|
|
Copyright (c) 2003 D.H. aka PodMaster (current maintainer).
|
795
|
|
|
|
|
|
|
Copyright (c) 2001 Eric Bohlman (original author).
|
796
|
|
|
|
|
|
|
|
797
|
|
|
|
|
|
|
All rights reserved.
|
798
|
|
|
|
|
|
|
This program is free software;
|
799
|
|
|
|
|
|
|
you can redistribute it and/or modify it
|
800
|
|
|
|
|
|
|
under the same terms as Perl itself.
|
801
|
|
|
|
|
|
|
If you don't know what this means,
|
802
|
|
|
|
|
|
|
visit E<lt>http://perl.com/E<gt> or E<lt>http://cpan.org/E<gt>.
|
803
|
|
|
|
|
|
|
|
804
|
|
|
|
|
|
|
=head1 SEE ALSO
|
805
|
|
|
|
|
|
|
|
806
|
|
|
|
|
|
|
L,
|
807
|
|
|
|
|
|
|
L,
|
808
|
|
|
|
|
|
|
L,
|
809
|
|
|
|
|
|
|
L,
|
810
|
|
|
|
|
|
|
L.
|
811
|
|
|
|
|
|
|
|
812
|
|
|
|
|
|
|
=cut
|