line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package XML::CompactTree; |
2
|
|
|
|
|
|
|
|
3
|
3
|
|
|
3
|
|
96764
|
use warnings; |
|
3
|
|
|
|
|
8
|
|
|
3
|
|
|
|
|
228
|
|
4
|
3
|
|
|
3
|
|
17
|
use strict; |
|
3
|
|
|
|
|
4
|
|
|
3
|
|
|
|
|
163
|
|
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
=head1 NAME |
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
XML::CompactTree - builder of compact tree structures from XML documents |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
=head1 VERSION |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
Version 0.03 |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
=cut |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
our $VERSION = '0.03'; |
17
|
|
|
|
|
|
|
|
18
|
3
|
|
|
3
|
|
15
|
use base qw(Exporter); |
|
3
|
|
|
|
|
9
|
|
|
3
|
|
|
|
|
720
|
|
19
|
3
|
|
|
3
|
|
16
|
use vars qw( @EXPORT @EXPORT_OK %EXPORT_TAGS ); |
|
3
|
|
|
|
|
5
|
|
|
3
|
|
|
|
|
212
|
|
20
|
3
|
|
|
3
|
|
1290
|
use XML::LibXML::Reader; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
# XCT_USE_QNAMES /* not yet implemented */ |
23
|
|
|
|
|
|
|
# XCT_TEXT_AS_STRING /* not yet implemented */ |
24
|
|
|
|
|
|
|
# XCT_PRESERVE_PARENT /* not yet implemented */ |
25
|
|
|
|
|
|
|
# XCT_MERGE_TEXT_NODES /* not yet implemented */ |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
# Flag constants for the tree builders.  Each name in the list below becomes a
# constant whose value is a single bit: the first name is 1, the second 2, the
# third 4, and so on in list order.  The list itself is also published as the
# ":flags" export tag.
use constant do {
    my @flag_names = qw(
        XCT_IGNORE_WS
        XCT_IGNORE_SIGNIFICANT_WS
        XCT_IGNORE_PROCESSING_INSTRUCTIONS
        XCT_IGNORE_COMMENTS
        XCT_USE_QNAMES
        XCT_KEEP_NS_DECLS
        XCT_TEXT_AS_STRING
        XCT_ATTRIBUTE_ARRAY
        XCT_PRESERVE_PARENT
        XCT_MERGE_TEXT_NODES
        XCT_LINE_NUMBERS
        XCT_DOCUMENT_ROOT
    );
    $EXPORT_TAGS{flags} = \@flag_names;

    # Hand out one bit per name, in declaration order.
    my %bit_for;
    my $shift = 0;
    $bit_for{$_} = 1 << $shift++ for @flag_names;
    \%bit_for;
};

# Export lists are derived from the tags defined so far; this runs at compile
# time, after the "use constant" above has filled in $EXPORT_TAGS{flags}.
BEGIN {
    @EXPORT           = map { @$_ } values %EXPORT_TAGS;
    @EXPORT_OK        = @EXPORT;
    $EXPORT_TAGS{all} = \@EXPORT_OK;
}
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
=head1 SYNOPSIS |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
use XML::CompactTree; |
56
|
|
|
|
|
|
|
use XML::LibXML::Reader; |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
my $reader = XML::LibXML::Reader->new(location => $url); |
59
|
|
|
|
|
|
|
... |
60
|
|
|
|
|
|
|
my $tree = XML::CompactTree::readSubtreeToPerl($reader); |
61
|
|
|
|
|
|
|
... |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
=head1 DESCRIPTION |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
This module provides functions that use XML::LibXML::Reader to parse |
66
|
|
|
|
|
|
|
an XML document into a parse tree formed of nested arrays (and hashes). |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
It aims to be fast in doing that and to preserve all relevant |
69
|
|
|
|
|
|
|
information from the XML (including namespaces, document order, mixed |
70
|
|
|
|
|
|
|
content, etc.). It sacrifices user friendliness for speed. |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
IMPORTANT: There is an even more efficient XS implementation of this |
73
|
|
|
|
|
|
|
module called XML::CompactTree::XS with 100% equivalent functionality. |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
=head1 PURPOSE |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
I wrote this module because I noticed that repeated calls to methods |
78
|
|
|
|
|
|
|
implemented in C (XS) were very expensive in Perl. |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
Therefore traversing a large DOM tree using XML::LibXML or iterating |
81
|
|
|
|
|
|
|
over an XML stream using XML::LibXML::Reader was much slower than |
82
|
|
|
|
|
|
|
traversing similarly large and structured native Perl data |
83
|
|
|
|
|
|
|
structures. |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
This module allows the user to build a document parse tree consisting |
86
|
|
|
|
|
|
|
of native Perl data structures (arrays and optionally hashes) using |
87
|
|
|
|
|
|
|
XML::LibXML::Reader with minimal number of XS calls. |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
(Note that XML::CompactTree::XS is a 100% equivalent of this |
90
|
|
|
|
|
|
|
module that manages the same with just one XS call.) |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
It does not provide full DOM navigation but attempts to provide |
93
|
|
|
|
|
|
|
maximum amount of information. Its memory footprint should be |
94
|
|
|
|
|
|
|
somewhat smaller than that of a corresponding XML::LibXML DOM tree. |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
=head1 EXPORT |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
By default, the following constants are exported (C<:flags> export |
99
|
|
|
|
|
|
|
tag) to be used as flags for the tree builder: |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
XCT_IGNORE_WS |
102
|
|
|
|
|
|
|
XCT_IGNORE_SIGNIFICANT_WS |
103
|
|
|
|
|
|
|
XCT_IGNORE_PROCESSING_INSTRUCTIONS |
104
|
|
|
|
|
|
|
XCT_IGNORE_COMMENTS |
105
|
|
|
|
|
|
|
XCT_USE_QNAMES /* not yet implemented */ |
106
|
|
|
|
|
|
|
XCT_KEEP_NS_DECLS |
107
|
|
|
|
|
|
|
XCT_TEXT_AS_STRING /* not yet implemented */ |
108
|
|
|
|
|
|
|
XCT_ATTRIBUTE_ARRAY |
109
|
|
|
|
|
|
|
XCT_PRESERVE_PARENT /* not yet implemented */ |
110
|
|
|
|
|
|
|
XCT_MERGE_TEXT_NODES /* not yet implemented */ |
111
|
|
|
|
|
|
|
XCT_DOCUMENT_ROOT |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
=head1 FUNCTIONS |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
=head2 readSubtreeToPerl( $reader, $flags, \my %ns ) |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
Uses a given XML::LibXML::Reader parser object to parse a subtree at |
118
|
|
|
|
|
|
|
the current reader position to build a tree formed of nested arrays |
119
|
|
|
|
|
|
|
(see L</"OUTPUT FORMAT">). |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
=over 4 |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
=item reader |
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
A XML::LibXML::Reader object to use as the reader. While building the |
126
|
|
|
|
|
|
|
tree, the reader moves to the next node on the current or higher |
127
|
|
|
|
|
|
|
level. |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
=item flags |
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
An integer consisting of 1 bit flags (see constants in the EXPORT section). |
132
|
|
|
|
|
|
|
Use binary or (|) to combine individual flags. |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
The following flags are NOT implemented yet: |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
XCT_USE_QNAMES, XCT_TEXT_AS_STRING, XCT_PRESERVE_PARENT, XCT_MERGE_TEXT_NODES |
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
=item ns |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
You may pass an empty hash reference that will be populated by a |
141
|
|
|
|
|
|
|
namespace_uri to namespace_index map, that can be used to decode |
142
|
|
|
|
|
|
|
namespace indexes in the resulting data structure (see L</"OUTPUT |
143
|
|
|
|
|
|
|
FORMAT">). |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
=back |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
=cut |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
# Build the tree for the single subtree at the reader's current position.
#
#   $reader - reader object handed straight to _readSubtreeToPerl()
#   $flags  - optional bitwise OR of XCT_* flags; treated as 0 when omitted
#   $ns     - optional hash ref that receives the namespace URI => index map
#
# Returns the first node record produced by _readSubtreeToPerl(), or undef
# when it produced none.
sub readSubtreeToPerl {
    my ($reader, $flags, $ns) = @_;

    # The flags argument is optional (the SYNOPSIS passes the reader only).
    # Without a default, every "$flags & XCT_..." test in the builder would
    # emit an "uninitialized value" warning.
    $flags ||= 0;

    $ns ||= {};
    $ns->{''} = 0;    # index 0 always stands for "no namespace"

    # 1 = first namespace index to hand out; 0 = do not read following siblings.
    my $subtrees = _readSubtreeToPerl($reader, $flags, $ns, 1, 0);
    return $subtrees->[0];
}
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
=head2 readLevelToPerl( $reader, $flags, $ns ) |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
Like C<readSubtreeToPerl>, but reads the subtree |
161
|
|
|
|
|
|
|
at the current reader position and all its following siblings. |
162
|
|
|
|
|
|
|
It returns an array reference of representations of these subtrees |
163
|
|
|
|
|
|
|
as in the format described in L</"OUTPUT FORMAT">. |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
=cut |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
# Build trees for the subtree at the reader's current position and for all of
# its following siblings.
#
#   $reader - reader object handed straight to _readSubtreeToPerl()
#   $flags  - optional bitwise OR of XCT_* flags; treated as 0 when omitted
#   $ns     - optional hash ref that receives the namespace URI => index map
#
# Returns whatever _readSubtreeToPerl() returns: an array reference of node
# records, or nothing if the builder bailed out early.
sub readLevelToPerl {
    my ($reader, $flags, $ns) = @_;

    # Default the optional flags so the bit tests in the builder do not warn
    # about an uninitialized value.
    $flags ||= 0;

    $ns ||= {};
    $ns->{''} = 0;    # index 0 always stands for "no namespace"

    # 1 = first namespace index to hand out; 1 = also read following siblings.
    my $subtrees = _readSubtreeToPerl($reader, $flags, $ns, 1, 1);
    return $subtrees;
}
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
# Shared tree builder behind readSubtreeToPerl() and readLevelToPerl().
#
#   $reader        - reader object to pull nodes from
#   $flags         - bitwise OR of XCT_* flags
#   $ns_map        - hash ref mapping namespace URI => index; extended in place
#   $free_ns_index - index given to the next namespace URI not yet in $ns_map
#   $read_siblings - true to continue over nodes at the starting depth as well
#                    as deeper ones; false to stop once the depth drops back
#                    to the starting depth
#
# Returns an array reference of the top-level node records, or nothing when
# the reader is in its initial state and the first read() does not succeed.
sub _readSubtreeToPerl {
    my ($reader, $flags, $ns_map, $free_ns_index, $read_siblings) = @_;

    my @ancestors;    # nodes whose child lists are currently open
    my ($node, $last_node, $children, $status, $type, $name);
    my $depth      = $reader->depth();
    my $base_depth = $depth;
    my $last_depth = $base_depth;
    my $result     = [];

    # Reader not positioned on any node yet: advance to the first one and,
    # if requested, fabricate a document node to hang everything under.
    if ($reader->nodeType() == 0) {
        return if $reader->read() != 1;
        if ($flags & XCT_DOCUMENT_ROOT) {
            $last_node = [ XML_READER_TYPE_DOCUMENT, $reader->encoding ];
            $base_depth--;
            $last_depth--;
            push @$result,   $last_node;
            push @ancestors, $last_node;
        }
    }

    # Reading continues only while the reader stays deeper than this limit.
    my $depth_limit = $base_depth - ($read_siblings ? 1 : 0);

    do {
        # The inner labelled block runs once per iteration; "next NODE" leaves
        # it early and falls through to the loop condition below.
        NODE: {
            $type = $reader->nodeType();

            # Node types that never get a record of their own.
            my $never_recorded = (
                   $type == XML_READER_TYPE_NONE
                or $type == XML_READER_TYPE_ATTRIBUTE
                or $type == XML_READER_TYPE_DOCUMENT_TYPE
                or $type == XML_READER_TYPE_END_ELEMENT
                or $type == XML_READER_TYPE_ENTITY
                or $type == XML_READER_TYPE_END_ENTITY
                or $type == XML_READER_TYPE_XML_DECLARATION
            );

            # Node types the caller asked to drop via XCT_IGNORE_* flags.
            my $ignored_by_flags = (
                (       ($flags & (XCT_IGNORE_WS|XCT_IGNORE_SIGNIFICANT_WS))
                    and $type == XML_READER_TYPE_WHITESPACE)
                or
                (       ($flags & XCT_IGNORE_SIGNIFICANT_WS)
                    and $type == XML_READER_TYPE_SIGNIFICANT_WHITESPACE)
                or
                (       ($flags & XCT_IGNORE_COMMENTS)
                    and $type == XML_READER_TYPE_COMMENT)
                or
                (       ($flags & XCT_IGNORE_PROCESSING_INSTRUCTIONS)
                    and $type == XML_READER_TYPE_PROCESSING_INSTRUCTION)
            );

            if ($never_recorded or $ignored_by_flags) {
                $status = $reader->read();
                next NODE;
            }

            # Every record starts with the node type.
            $node = [ $type ];

            if ($type == XML_READER_TYPE_ELEMENT) {
                # Element record: type, local name, namespace index,
                # attributes (or undef), optional line number.
                push @$node, $reader->localName();

                $name = $reader->namespaceURI();
                my $ns_index = 0;    # 0 = no namespace
                if ($name) {
                    if (exists $ns_map->{$name}) {
                        $ns_index = $ns_map->{$name} || 0;
                    }
                    else {
                        # First sighting of this URI: assign the next index.
                        $ns_index = $free_ns_index;
                        $ns_map->{$name} = $free_ns_index;
                        $free_ns_index++;
                    }
                }
                push @$node, $ns_index;

                if ($reader->hasAttributes() && $reader->moveToFirstAttribute() == 1) {
                    my $as_array      = $flags & XCT_ATTRIBUTE_ARRAY;
                    my $keep_ns_decls = $flags & XCT_KEEP_NS_DECLS;
                    my (@pairs, %value_of);

                    do {
                        $name = $reader->name();
                        # Names starting with "xmlns" are dropped unless
                        # XCT_KEEP_NS_DECLS is set.
                        if ($keep_ns_decls || substr($name, 0, 5) ne 'xmlns') {
                            if ($as_array) {
                                push @pairs, $name, $reader->value();
                            }
                            else {
                                $value_of{$name} = $reader->value();
                            }
                        }
                    } while ($reader->moveToNextAttribute() == 1);

                    if ($as_array) {
                        push @$node, \@pairs;
                    }
                    else {
                        # Only the hash form steps back onto the element.
                        $reader->moveToElement();
                        push @$node, \%value_of;
                    }
                }
                else {
                    push @$node, undef;    # no attributes
                }

                if ($flags & XCT_LINE_NUMBERS) {
                    push @$node, $reader->lineNumber();
                }
            }
            elsif (   $type == XML_READER_TYPE_TEXT
                   or $type == XML_READER_TYPE_CDATA
                   or $type == XML_READER_TYPE_COMMENT
                   or $type == XML_READER_TYPE_WHITESPACE
                   or $type == XML_READER_TYPE_SIGNIFICANT_WHITESPACE) {
                # Character-data record: type, value.
                push @$node, $reader->value();
            }
            elsif (   $type == XML_READER_TYPE_ENTITY_REFERENCE
                   or $type == XML_READER_TYPE_PROCESSING_INSTRUCTION
                   or $type == XML_READER_TYPE_NOTATION) {
                # Named record: type, local name, value.
                push @$node, $reader->localName();
                push @$node, $reader->value();
            }
            elsif (   $type == XML_READER_TYPE_DOCUMENT
                   or $type == XML_READER_TYPE_DOCUMENT_FRAGMENT) {
                # Document record: type, encoding.
                push @$node, $reader->encoding();
            }

            # Attach the record according to how the depth moved since the
            # previously recorded node.
            if ($depth == $base_depth) {
                # Back at the starting level: a new top-level record.
                push @$result, $node;
                $last_depth = $depth;
                $children   = undef;
            }
            elsif ($depth > $last_depth) {
                # Went deeper: open a child list on the previous node.
                $children = [ $node ];
                push @$last_node, $children;
                push @ancestors,  $last_node;
                $last_depth = $depth;
            }
            elsif ($depth == $last_depth) {
                # Same level: another sibling in the open child list.
                push @$children, $node if $children;
            }
            else {
                # Came back up: unwind to the matching ancestor and append to
                # its child list (the last slot of that ancestor's record).
                do {
                    $last_depth--;
                    pop @ancestors;
                } while ($depth < $last_depth);

                my $parent = $ancestors[-1];
                if ($parent) {
                    $last_node = $parent;
                    my $open_list = $last_node->[-1];
                    if ($open_list) {
                        $children = $open_list;
                        push @$children, $node;
                    }
                }
            }

            $last_node = $node;
            $status    = $reader->read();
        }
    } while ($status == 1 && ($depth = $reader->depth()) > $depth_limit);

    # If reading stopped on the end tag at the starting depth, step past it.
    if (   $status == 1
        && $reader->depth() == $base_depth
        && $reader->nodeType() == XML_READER_TYPE_END_ELEMENT) {
        $reader->read();
    }

    return $result;
}
324
|
|
|
|
|
|
|
|
325
|
|
|
|
|
|
|
=head1 OUTPUT FORMAT |
326
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
The result of parsing a subtree is a Perl array reference C<$node> |
328
|
|
|
|
|
|
|
that contains a node type followed by node data whose interpretation on |
329
|
|
|
|
|
|
|
further positions in $node depends on the node type, as described |
330
|
|
|
|
|
|
|
below: |
331
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
=head2 Any Node |
333
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
=over 5 |
335
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
=item * |
337
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
$node->[0] is an integer representing the node type. Use |
339
|
|
|
|
|
|
|
XML::LibXML::Reader node-type constants, e.g. XML_READER_TYPE_ELEMENT |
340
|
|
|
|
|
|
|
for an element node, XML_READER_TYPE_TEXT for text node, etc. |
341
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
=back |
343
|
|
|
|
|
|
|
|
344
|
|
|
|
|
|
|
=head2 Document or Document Fragment Nodes |
345
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
=over 5 |
347
|
|
|
|
|
|
|
|
348
|
|
|
|
|
|
|
=item * |
349
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
$node->[1] contains the document encoding |
351
|
|
|
|
|
|
|
|
352
|
|
|
|
|
|
|
=item * |
353
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
$node->[2] is an array reference containing similar representations of |
355
|
|
|
|
|
|
|
all the child nodes of the document (fragment). |
356
|
|
|
|
|
|
|
|
357
|
|
|
|
|
|
|
=back |
358
|
|
|
|
|
|
|
|
359
|
|
|
|
|
|
|
Note: XML::LibXML::Reader does not provide a document node by default, which |
360
|
|
|
|
|
|
|
means that calling readSubtreeToPerl on a reader object in its initial |
361
|
|
|
|
|
|
|
state only parses the first node in the document (which can be the |
362
|
|
|
|
|
|
|
root element, but also a comment or a processing instruction). Use |
363
|
|
|
|
|
|
|
XCT_DOCUMENT_ROOT flag to force creating a document node in such case. |
364
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
=head2 Element nodes |
366
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
=over 5 |
368
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
=item * |
370
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
$node->[1] is the local name (UTF-8 encoded character string) |
372
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
=item * |
374
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
$node->[2] is the namespace index (see L</NAMESPACES> below) |
376
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
=item * |
378
|
|
|
|
|
|
|
|
379
|
|
|
|
|
|
|
$node->[3] is undef if the element has no attributes. Otherwise if |
380
|
|
|
|
|
|
|
XCT_ATTRIBUTE_ARRAY flag was used, $node->[3] is an array reference of |
381
|
|
|
|
|
|
|
the form C<[ name1, value1, name2, value2, ....]> of attribute names and |
382
|
|
|
|
|
|
|
corresponding values. If XCT_ATTRIBUTE_ARRAY flag was not used, then |
383
|
|
|
|
|
|
|
$node->[3] is a hash reference mapping attribute names to the |
384
|
|
|
|
|
|
|
corresponding attribute values C<{ name1=>value1, name2=>value2...}> |
385
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
The flag XCT_KEEP_NS_DECLS controls whether namespace declarations |
387
|
|
|
|
|
|
|
(xmlns=... or xmlns:prefix=...) are included along with normal |
388
|
|
|
|
|
|
|
attributes or not. |
389
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
Note: there is no support for namespaced attributes yet, but the |
391
|
|
|
|
|
|
|
attribute names are stored as QNames, so one can always use |
392
|
|
|
|
|
|
|
XCT_KEEP_NS_DECLS to keep track of namespace prefix declarations and |
393
|
|
|
|
|
|
|
do the resolving manually. Support for namespaced attributes is |
394
|
|
|
|
|
|
|
planned. |
395
|
|
|
|
|
|
|
|
396
|
|
|
|
|
|
|
=item * |
397
|
|
|
|
|
|
|
|
398
|
|
|
|
|
|
|
If XCT_LINE_NUMBERS flag was used, $node->[4] contains the line number |
399
|
|
|
|
|
|
|
of the element and $node->[5] contains an array reference containing |
400
|
|
|
|
|
|
|
similar representations of the child nodes of the current node. |
401
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
=item * |
403
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
If XCT_LINE_NUMBERS flag was NOT used, $node->[4] contains an array |
405
|
|
|
|
|
|
|
reference of similar representations of the child nodes of the current |
406
|
|
|
|
|
|
|
node. |
407
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
=back |
409
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
=head2 Text, CDATA, Comment and White-Space Nodes |
411
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
=over 5 |
413
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
=item * |
415
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
$node->[1] contains the node value (UTF-8 encoded character string) |
417
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
=back |
419
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
=head2 Unparsed Entity, Processing-Instruction, and Notation Nodes |
421
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
=over 5 |
423
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
=item * |
425
|
|
|
|
|
|
|
|
426
|
|
|
|
|
|
|
$node->[1] contains the local name (there is no support for |
427
|
|
|
|
|
|
|
namespaces on these types of nodes yet) |
428
|
|
|
|
|
|
|
|
429
|
|
|
|
|
|
|
=item * |
430
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
$node->[2] contains the node value |
432
|
|
|
|
|
|
|
|
433
|
|
|
|
|
|
|
=back |
434
|
|
|
|
|
|
|
|
435
|
|
|
|
|
|
|
=head2 Skipping Less-Significant Nodes |
436
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
White-space (non-significant or significant), processing-instruction |
438
|
|
|
|
|
|
|
and comment nodes can be completely skipped, using the following |
439
|
|
|
|
|
|
|
flags: |
440
|
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
XCT_IGNORE_WS |
442
|
|
|
|
|
|
|
XCT_IGNORE_SIGNIFICANT_WS |
443
|
|
|
|
|
|
|
XCT_IGNORE_PROCESSING_INSTRUCTIONS |
444
|
|
|
|
|
|
|
XCT_IGNORE_COMMENTS |
445
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
=head1 NAMESPACES |
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
Namespaces of element nodes are stored in the element node as an |
449
|
|
|
|
|
|
|
integer. 0 always represents nodes without namespace, all other |
450
|
|
|
|
|
|
|
namespaces are assigned unique numbers in an increasing order as they |
451
|
|
|
|
|
|
|
appear. You can pass an empty hash reference to the parsing functions |
452
|
|
|
|
|
|
|
to obtain the mapping. |
453
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
=head2 Example |
455
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
use XML::CompactTree; |
457
|
|
|
|
|
|
|
use XML::LibXML::Reader; |
458
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
my $reader = XML::LibXML::Reader->new(location => $ARGV[0]); |
460
|
|
|
|
|
|
|
my %ns; |
461
|
|
|
|
|
|
|
my $data = XML::CompactTree::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT, \%ns ); |
462
|
|
|
|
|
|
|
my @ns_map; $ns_map[$ns{$_}]=$_ for keys %ns; |
463
|
|
|
|
|
|
|
my @nodes = ($data); |
464
|
|
|
|
|
|
|
while (@nodes) { |
465
|
|
|
|
|
|
|
my $node = shift @nodes; |
466
|
|
|
|
|
|
|
my $type = $node->[0]; |
467
|
|
|
|
|
|
|
if ($type == XML_READER_TYPE_ELEMENT) { |
468
|
|
|
|
|
|
|
print "element $node->[1] is from ns $node->[2] '$ns_map[$node->[2]]'\n"; |
469
|
|
|
|
|
|
|
push @nodes, @{$node->[4] || []}; # queue children (none for childless elements) |
470
|
|
|
|
|
|
|
} elsif ($type == XML_READER_TYPE_DOCUMENT) { |
471
|
|
|
|
|
|
|
push @nodes, @{$node->[2]}; # queue children |
472
|
|
|
|
|
|
|
} |
473
|
|
|
|
|
|
|
} |
474
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
=head1 PLANNED FEATURES |
476
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
Planned flags: |
478
|
|
|
|
|
|
|
|
479
|
|
|
|
|
|
|
XCT_USE_QNAMES - use QNames instead of local names for all nodes |
480
|
|
|
|
|
|
|
XCT_TEXT_AS_STRING - put text nodes into the tree as plain scalars |
481
|
|
|
|
|
|
|
XCT_PRESERVE_PARENT - add a slot with a weak reference to the parent node |
482
|
|
|
|
|
|
|
XCT_MERGE_TEXT_NODES - merge adjacent text/cdata nodes together |
483
|
|
|
|
|
|
|
|
484
|
|
|
|
|
|
|
Features: allow blessing the array refs to default or user-specified |
485
|
|
|
|
|
|
|
classes; the default classes would provide a very small subset of DOM |
486
|
|
|
|
|
|
|
methods to retrieve node information, manipulate the tree, and |
487
|
|
|
|
|
|
|
possibly serialize the parse tree back to XML. |
488
|
|
|
|
|
|
|
|
489
|
|
|
|
|
|
|
=head1 AUTHOR |
490
|
|
|
|
|
|
|
|
491
|
|
|
|
|
|
|
Petr Pajas, C<< >> |
492
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
=head1 BUGS |
494
|
|
|
|
|
|
|
|
495
|
|
|
|
|
|
|
Please report any bugs or feature requests to |
496
|
|
|
|
|
|
|
C, or through the web interface at |
497
|
|
|
|
|
|
|
L. |
498
|
|
|
|
|
|
|
I will be notified, and then you'll automatically be notified of progress on |
499
|
|
|
|
|
|
|
your bug as I make changes. |
500
|
|
|
|
|
|
|
|
501
|
|
|
|
|
|
|
=head1 COPYRIGHT & LICENSE |
502
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
Copyright 2008-2009 Petr Pajas, All Rights Reserved. |
504
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it |
506
|
|
|
|
|
|
|
under the same terms as Perl itself. |
507
|
|
|
|
|
|
|
|
508
|
|
|
|
|
|
|
=head1 SEE ALSO |
509
|
|
|
|
|
|
|
|
510
|
|
|
|
|
|
|
XML::CompactTree::XS |
511
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
XML::LibXML::Reader |
513
|
|
|
|
|
|
|
|
514
|
|
|
|
|
|
|
=cut |
515
|
|
|
|
|
|
|
|
516
|
|
|
|
|
|
|
|
517
|
|
|
|
|
|
|
1; # End of XML::CompactTree |