line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package XML::Rules;
|
2
|
|
|
|
|
|
|
|
3
|
14
|
|
|
14
|
|
703797
|
use warnings;
|
|
14
|
|
|
|
|
35
|
|
|
14
|
|
|
|
|
563
|
|
4
|
14
|
|
|
14
|
|
80
|
no warnings qw(uninitialized);
|
|
14
|
|
|
|
|
27
|
|
|
14
|
|
|
|
|
519
|
|
5
|
14
|
|
|
14
|
|
75
|
use strict;
|
|
14
|
|
|
|
|
58
|
|
|
14
|
|
|
|
|
480
|
|
6
|
14
|
|
|
14
|
|
80
|
use Carp;
|
|
14
|
|
|
|
|
29
|
|
|
14
|
|
|
|
|
1103
|
|
7
|
14
|
|
|
14
|
|
380
|
use 5.008;
|
|
14
|
|
|
|
|
49
|
|
|
14
|
|
|
|
|
642
|
|
8
|
14
|
|
|
14
|
|
80
|
use Scalar::Util qw(weaken);
|
|
14
|
|
|
|
|
55
|
|
|
14
|
|
|
|
|
1910
|
|
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
require Exporter;
|
11
|
|
|
|
|
|
|
our @ISA = qw(Exporter);
|
12
|
|
|
|
|
|
|
our @EXPORT_OK = qw(paths2rules);
|
13
|
|
|
|
|
|
|
|
14
|
14
|
|
|
14
|
|
30218
|
use XML::Parser::Expat;
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
use constant STRIP => "0000";
|
17
|
|
|
|
|
|
|
use constant STRIP_RULE => 'pass';
|
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
#use Data::Dumper;
|
20
|
|
|
|
|
|
|
#$Data::Dumper::Indent = 1;
|
21
|
|
|
|
|
|
|
#$Data::Dumper::Terse = 1;
|
22
|
|
|
|
|
|
|
#$Data::Dumper::Quotekeys = 0;
|
23
|
|
|
|
|
|
|
#$Data::Dumper::Sortkeys = 1;
|
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
=head1 NAME
|
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
XML::Rules - parse XML and specify what and how to keep/process for individual tags
|
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
=head1 VERSION
|
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
Version 1.16
|
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
=cut
|
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
our $VERSION = '1.16';
|
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
=head1 SYNOPSIS
|
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
use XML::Rules;
|
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
$xml = <<'*END*';
|
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
...
|
46
|
|
|
|
|
|
|
...
|
47
|
|
|
|
|
|
|
...
|
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
...
|
50
|
|
|
|
|
|
|
...
|
51
|
|
|
|
|
|
|
...
|
52
|
|
|
|
|
|
|
...
|
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
123-456-7890
|
56
|
|
|
|
|
|
|
663-486-7890
|
57
|
|
|
|
|
|
|
663-486-7000
|
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
...
|
62
|
|
|
|
|
|
|
...
|
63
|
|
|
|
|
|
|
...
|
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
...
|
66
|
|
|
|
|
|
|
...
|
67
|
|
|
|
|
|
|
...
|
68
|
|
|
|
|
|
|
...
|
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
663-486-7891
|
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
*END*
|
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
@rules = (
|
78
|
|
|
|
|
|
|
_default => sub {$_[0] => $_[1]->{_content}},
|
79
|
|
|
|
|
|
|
# by default I'm only interested in the content of the tag, not the attributes
|
80
|
|
|
|
|
|
|
bogus => undef,
|
81
|
|
|
|
|
|
|
# let's ignore this tag and all inner ones as well
|
82
|
|
|
|
|
|
|
address => sub {address => "$_[1]->{street}, $_[1]->{city} ($_[1]->{country})"},
|
83
|
|
|
|
|
|
|
# merge the address into a single string
|
84
|
|
|
|
|
|
|
phone => sub {$_[1]->{type} => $_[1]->{_content}},
|
85
|
|
|
|
|
|
|
# let's use the "type" attribute as the key and the content as the value
|
86
|
|
|
|
|
|
|
phones => sub {delete $_[1]->{_content}; %{$_[1]}},
|
87
|
|
|
|
|
|
|
# remove the text content and pass along the type => content from the child nodes
|
88
|
|
|
|
|
|
|
person => sub { # lets print the values, all the data is readily available in the attributes
|
89
|
|
|
|
|
|
|
print "$_[1]->{lname}, $_[1]->{fname} <$_[1]->{email}>\n";
|
90
|
|
|
|
|
|
|
print "Home phone: $_[1]->{home}\n" if $_[1]->{home};
|
91
|
|
|
|
|
|
|
print "Office phone: $_[1]->{office}\n" if $_[1]->{office};
|
92
|
|
|
|
|
|
|
print "Fax: $_[1]->{fax}\n" if $_[1]->{fax};
|
93
|
|
|
|
|
|
|
print "$_[1]->{address}\n\n";
|
94
|
|
|
|
|
|
|
return; # the tag is processed, no need to remember what it contained
|
95
|
|
|
|
|
|
|
},
|
96
|
|
|
|
|
|
|
);
|
97
|
|
|
|
|
|
|
$parser = XML::Rules->new(rules => \@rules);
|
98
|
|
|
|
|
|
|
$parser->parse( $xml);
|
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
=head1 INTRODUCTION
|
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
There are several ways to extract data from XML. One that's often used is to read the whole file and transform it into a huge maze of objects and then write code like
|
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
foreach my $obj ($XML->forTheLifeOfMyMotherGiveMeTheFirstChildNamed("Peter")->pleaseBeSoKindAndGiveMeAllChildrenNamedSomethingLike("Jane")) {
|
105
|
|
|
|
|
|
|
my $obj2 = $obj->sorryToKeepBotheringButINeedTheChildNamed("Theophile");
|
106
|
|
|
|
|
|
|
my $birth = $obj2->whatsTheValueOfAttribute("BirthDate");
|
107
|
|
|
|
|
|
|
print "Theophile was born at $birth\n";
|
108
|
|
|
|
|
|
|
}
|
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
I'm exagerating of course, but you probably know what I mean. You can of course shorten the path and call just one method ... that is if you spend the time to learn one more "cool" thing starting with X. XPath.
|
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
You can also use XML::Simple and generate an almost equaly huge maze of hashes and arrays ... which may make the code more or less complex. In either case you need to have enough memory
|
113
|
|
|
|
|
|
|
to store all that data, even if you only need a piece here and there.
|
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
Another way to parse the XML is to create some subroutines that handle the start and end tags and the text and whatever else may appear in the XML. Some modules will let you specify just one for start tag, one for text and one for end tag, others will let you install different handlers for different tags. The catch is that you have to build your data structures yourself, you have to know where you are, what tag is just open and what is the parent and its parent etc. so that you could add the attributes and especially the text to the right place. And the handlers have to do everything as their side effect. Does anyone remember what do they say about side efects? They make the code hard to debug, they tend to change the code into a maze of interdependent snippets of code.
|
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
So what's the difference in the way XML::Rules works? At the first glance, not much. You can also specify subroutines to be called for the tags encountered while parsing the XML, just like the other even based XML parsers. The difference is that you do not have to rely on side-effects if all you want is to store the value of a tag. You simply return whatever you need from the current tag and the module will add it at the right place in the data structure it builds and will provide it to the handlers for the parent tag. And if the parent tag does return that data again it will be passed to its parent and so forth. Until we get to the level at which it's convenient to handle all the data we accumulated from the twig.
|
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
Do we want to keep just the content and access it in the parent tag handler under a specific name?
|
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
foo => sub {return 'foo' => $_[1]->{_content}}
|
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
Do we want to ornament the content a bit and add it to the parent tag's content?
|
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
u => sub {return '_' . $_[1]->{_content} . '_'}
|
126
|
|
|
|
|
|
|
strong => sub {return '*' . $_[1]->{_content} . '*'}
|
127
|
|
|
|
|
|
|
uc => sub {return uc($_[1]->{_content})}
|
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
Do we want to merge the attributes into a string and access the string from the parent tag under a specified name?
|
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
address => sub {return 'Address' => "Street: $_[1]->{street} $_[1]->{bldngNo}\nCity: $_[1]->{city}\nCountry: $_[1]->{country}\nPostal code: $_[1]->{zip}"}
|
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
and in this case the $_[1]->{street} may either be an attribute of the tag or it may be ther result of the handler (rule)
|
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
street => sub {return 'street' => $_[1]->{_content}}
|
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
and thus come from a child tag . You may also use the rules to convert codes to values
|
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
our %states = (
|
140
|
|
|
|
|
|
|
AL => 'Alabama',
|
141
|
|
|
|
|
|
|
AK => 'Alaska',
|
142
|
|
|
|
|
|
|
...
|
143
|
|
|
|
|
|
|
);
|
144
|
|
|
|
|
|
|
...
|
145
|
|
|
|
|
|
|
state => sub {return 'state' => $states{$_[1]->{_content}}; }
|
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
or
|
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
address => sub {
|
150
|
|
|
|
|
|
|
if (exists $_[1]->{id}) {
|
151
|
|
|
|
|
|
|
$sthFetchAddress->execute($_[1]->{id});
|
152
|
|
|
|
|
|
|
my $addr = $sthFetchAddress->fetchrow_hashref();
|
153
|
|
|
|
|
|
|
$sthFetchAddress->finish();
|
154
|
|
|
|
|
|
|
return 'address' => $addr;
|
155
|
|
|
|
|
|
|
} else {
|
156
|
|
|
|
|
|
|
return 'address' => $_[1];
|
157
|
|
|
|
|
|
|
}
|
158
|
|
|
|
|
|
|
}
|
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
so that you do not have to care whether there was
|
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
or
|
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
Larry Wall's St.478CoreThe Programming Republic of Perl
|
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
And if you do not like to end up with a datastructure of plain old arrays and hashes, you can create
|
169
|
|
|
|
|
|
|
application specific objects in the rules
|
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
address => sub {
|
172
|
|
|
|
|
|
|
my $type = lc(delete $_[1]->{type});
|
173
|
|
|
|
|
|
|
$type.'Address' => MyApp::Address->new(%{$_[1]})
|
174
|
|
|
|
|
|
|
},
|
175
|
|
|
|
|
|
|
person => sub {
|
176
|
|
|
|
|
|
|
'@person' => MyApp::Person->new(
|
177
|
|
|
|
|
|
|
firstname => $_[1]->{fname},
|
178
|
|
|
|
|
|
|
lastname => $_[1]->{lname},
|
179
|
|
|
|
|
|
|
deliveryAddress => $_[1]->{deliveryAddress},
|
180
|
|
|
|
|
|
|
billingAddress => $_[1]->{billingAddress},
|
181
|
|
|
|
|
|
|
phone => $_[1]->{phone},
|
182
|
|
|
|
|
|
|
)
|
183
|
|
|
|
|
|
|
}
|
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
At each level in the tree structure serialized as XML you can decide what to keep, what to throw away, what to transform and
|
187
|
|
|
|
|
|
|
then just return the stuff you care about and it will be available to the handler at the next level.
|
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
=head1 CONSTRUCTOR
|
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
my $parser = XML::Rules->new(
|
192
|
|
|
|
|
|
|
rules => \@rules,
|
193
|
|
|
|
|
|
|
[ start_rules => \@start_rules, ]
|
194
|
|
|
|
|
|
|
[ stripspaces => 0 / 1 / 2 / 3 + 0 / 4 + 0 / 8, ]
|
195
|
|
|
|
|
|
|
[ normalisespaces => 0 / 1, ]
|
196
|
|
|
|
|
|
|
[ style => 'parser' / 'filter', ]
|
197
|
|
|
|
|
|
|
[ ident => ' ', [reformat_all => 0 / 1] ],
|
198
|
|
|
|
|
|
|
[ encode => 'encoding specification', ]
|
199
|
|
|
|
|
|
|
[ output_encoding => 'encoding specification', ]
|
200
|
|
|
|
|
|
|
[ namespaces => \%namespace2alias_mapping, ]
|
201
|
|
|
|
|
|
|
[ handlers => \%additional_expat_handlers, ]
|
202
|
|
|
|
|
|
|
# and optionaly parameters passed to XML::Parser::Expat
|
203
|
|
|
|
|
|
|
);
|
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
Options passed to XML::Parser::Expat : ProtocolEncoding Namespaces NoExpand Stream_Delimiter ErrorContext ParseParamEnt Base
|
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
The "stripspaces" controls the handling of whitespace. Please see the C bellow.
|
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
The "style" specifies whether you want to build a parser used to extract stuff from the XML or filter/modify the XML. If you specify
|
210
|
|
|
|
|
|
|
style => 'filter' then all tags for which you do not specify a subroutine rule or that occure inside such a tag are copied to the output filehandle
|
211
|
|
|
|
|
|
|
passed to the ->filter() or ->filterfile() methods.
|
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
The "ident" specifies what character(s) to use to ident the tags when filtering, by default the tags are not formatted in any way. If the
|
214
|
|
|
|
|
|
|
"reformat_all" is not set then this affects only the tags that have a rule and their subtags. And in case of subtags only those that were
|
215
|
|
|
|
|
|
|
added into the attribute hash by their rules, not those left in the _content array!
|
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
The "warnoverwrite" instructs XML::Rules to issue a warning whenever the rule cause a key in a tag's hash to be overwritten by new
|
218
|
|
|
|
|
|
|
data produced by the rule of a subtag. This happens eg. if a tag is repeated and its rule doesn't expect it.
|
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
The "encode" allows you to ask the module to run all data through Encode::encode( 'encoding_specification', ...)
|
221
|
|
|
|
|
|
|
before being passed to the rules. Otherwise all data comes as UTF8.
|
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
The "output_encoding" on the other hand specifies in what encoding is the resulting data going to be, the default is again UTF8.
|
224
|
|
|
|
|
|
|
This means that if you specify
|
225
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
encode => 'windows-1250',
|
227
|
|
|
|
|
|
|
output_encoding => 'utf8',
|
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
and the XML is in ISO-8859-2 (Latin2) then the filter will 1) convert the content and attributes of the tags you are not interested in from Latin2
|
230
|
|
|
|
|
|
|
directly to utf8 and output and 2) convert the content and attributes of the tags you want to process from Latin2 to Windows-1250, let you mangle
|
231
|
|
|
|
|
|
|
the data and then convert the results to utf8 for the output.
|
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
The C and C affects also the C<$parser->toXML(...)>, if they are different then the data are converted from
|
234
|
|
|
|
|
|
|
one encoding to the other.
|
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
The C allow you to set additional handlers for XML::Parser::Expat->setHandlers.
|
237
|
|
|
|
|
|
|
Your Start, End, Char and XMLDecl handlers are evaluated before the ones installed by XML::Rules and may
|
238
|
|
|
|
|
|
|
modify the values in @_, but you should be very carefull with that. Consider that experimental and if you do make
|
239
|
|
|
|
|
|
|
that work the way you needed, please let me know so that I know what was it good for and can make sure
|
240
|
|
|
|
|
|
|
it doesn't break in a new version.
|
241
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
=head2 The Rules
|
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
The rules option may be either an arrayref or a hashref, the module doesn't care, but if you want to use regexps to specify the groups of tags to be handled
|
245
|
|
|
|
|
|
|
by the same rule you should use the array ref. The rules array/hash is made of pairs in form
|
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
tagspecification => action
|
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
where the tagspecification may be either a name of a tag, a string containing comma or pipe ( "|" ) delimited list of tag names
|
250
|
|
|
|
|
|
|
or a string containing a regexp enclosed in // optionaly followed by the regular expression modifiers or a qr// compiled regular expressions.
|
251
|
|
|
|
|
|
|
The tag names and tag name lists take precedence to the regexps, the regexps are (in case of arrayrefs only!!!) tested in the order in which
|
252
|
|
|
|
|
|
|
they are specified.
|
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
These rules are evaluated/executed whenever a tag if fully parsed including all the content and child tags and they may access the content and attributes of the
|
255
|
|
|
|
|
|
|
specified tag plus the stuff produced by the rules evaluated for the child tags.
|
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
The action may be either
|
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
- an undef or empty string = ignore the tag and all its children
|
260
|
|
|
|
|
|
|
- a subroutine reference = the subroutine will be called to handle the tag data&contents
|
261
|
|
|
|
|
|
|
sub { my ($tagname, $attrHash, $contexArray, $parentDataArray, $parser) = @_; ...}
|
262
|
|
|
|
|
|
|
- one of the built in rules below
|
263
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
=head3 Custom rules
|
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
The subroutines in the rules specification receive five parameters:
|
267
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
$rule->( $tag_name, \%attrs, \@context, \@parent_data, $parser)
|
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
It's OK to destroy the first two parameters, but you should treat the other three as read only
|
271
|
|
|
|
|
|
|
or at least treat them with care!
|
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
$tag_name = string containing the tag name
|
274
|
|
|
|
|
|
|
\%attrs = hash containing the attributes of the tag plus the _content key
|
275
|
|
|
|
|
|
|
containing the text content of the tag. If it's not a leaf tag it may
|
276
|
|
|
|
|
|
|
also contain the data returned by the rules invoked for the child tags.
|
277
|
|
|
|
|
|
|
\@context = an array containing the names of the tags enclosing the current
|
278
|
|
|
|
|
|
|
one. The parent tag name is the last element of the array. (READONLY!)
|
279
|
|
|
|
|
|
|
\@parent_data = an array containing the hashes with the attributes
|
280
|
|
|
|
|
|
|
and content read&produced for the enclosing tags so far.
|
281
|
|
|
|
|
|
|
You may need to access this for example to find out the version
|
282
|
|
|
|
|
|
|
of the format specified as an attribute of the root tag. You may
|
283
|
|
|
|
|
|
|
safely add, change or delete attributes in the hashes, but all bets
|
284
|
|
|
|
|
|
|
are off if you change the number or type of elements of this array!
|
285
|
|
|
|
|
|
|
$parser = the parser object
|
286
|
|
|
|
|
|
|
you may use $parser->{pad} or $parser->{parameters} to store any data
|
287
|
|
|
|
|
|
|
you need. The first is never touched by XML::Rules, the second is set to
|
288
|
|
|
|
|
|
|
the last argument of parse() or filter() methods and reset to undef
|
289
|
|
|
|
|
|
|
before those methods exit.
|
290
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
The subroutine may decide to handle the data and return nothing or
|
292
|
|
|
|
|
|
|
tweak the data as necessary and return just the relevant bits. It may also
|
293
|
|
|
|
|
|
|
load more information from elsewhere based on the ids found in the XML
|
294
|
|
|
|
|
|
|
and provide it to the rules of the ancestor tags as if it was part of the XML.
|
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
The possible return values of the subroutines are:
|
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
1) nothing or undef or "" - nothing gets added to the parent tag's hash
|
299
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
2) a single string - if the parent's _content is a string then the one produced by this rule is appended to the parent's _content.
|
301
|
|
|
|
|
|
|
If the parent's _content is an array, then the string is push()ed to the array.
|
302
|
|
|
|
|
|
|
|
303
|
|
|
|
|
|
|
3) a single reference - if the parent's _content is a string then it's changed to an array containing the original string and this reference.
|
304
|
|
|
|
|
|
|
If the parent's _content is an array, then the string is push()ed to the array.
|
305
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
4) an even numbered list - it's a list of key & value pairs to be added to the parent's hash.
|
307
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
The handling of the attributes may be changed by adding '@', '%', '+', '*' or '.' before the attribute name.
|
309
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
Without any "sigil" the key & value is added to the hash overwriting any previous values.
|
311
|
|
|
|
|
|
|
|
312
|
|
|
|
|
|
|
The values for the keys starting with '@' are push()ed to the arrays referenced by the key name
|
313
|
|
|
|
|
|
|
without the @. If there already is an attribute of the same name then the value will be preserved and will become
|
314
|
|
|
|
|
|
|
the first element in the array.
|
315
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
The values for the keys starting with '%' have to be either hash or array references. The key&value pairs
|
317
|
|
|
|
|
|
|
in the referenced hash or array will be added to the hash referenced by the key. This is nice for rows of tags like this:
|
318
|
|
|
|
|
|
|
|
319
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
|
322
|
|
|
|
|
|
|
if you specify the rule as
|
323
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
field => sub { '%fields' => [$_[1]->{name} => $_[1]->{value}]}
|
325
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
then the parent tag's has will contain
|
327
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
fields => {
|
329
|
|
|
|
|
|
|
foo => 12,
|
330
|
|
|
|
|
|
|
bar => 24,
|
331
|
|
|
|
|
|
|
}
|
332
|
|
|
|
|
|
|
|
333
|
|
|
|
|
|
|
The values for the keys starting with '+' are added to the current value, the ones starting with '.' are
|
334
|
|
|
|
|
|
|
appended to the current value and the ones starting with '*' multiply the current value.
|
335
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
5) an odd numbered list - the last element is appended or push()ed to the parent's _content, the rest is handled as in the previous case.
|
337
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
=head3 Builtin rules
|
339
|
|
|
|
|
|
|
|
340
|
|
|
|
|
|
|
'content' = only the content of the tag is preserved and added to
|
341
|
|
|
|
|
|
|
the parent tag's hash as an attribute named after the tag. Equivalent to:
|
342
|
|
|
|
|
|
|
sub { $_[0] => $_[1]->{_content}}
|
343
|
|
|
|
|
|
|
'content trim' = only the content of the tag is preserved, trimmed and added to
|
344
|
|
|
|
|
|
|
the parent tag's hash as an attribute named after the tag
|
345
|
|
|
|
|
|
|
sub { s/^\s+//,s/\s+$// for ($_[1]->{_content}); $_[0] => $_[1]->{_content}}
|
346
|
|
|
|
|
|
|
'content array' = only the content of the tag is preserved and pushed
|
347
|
|
|
|
|
|
|
to the array pointed to by the attribute
|
348
|
|
|
|
|
|
|
sub { '@' . $_[0] => $_[1]->{_content}}
|
349
|
|
|
|
|
|
|
'as is' = the tag's hash is added to the parent tag's hash
|
350
|
|
|
|
|
|
|
as an attribute named after the tag
|
351
|
|
|
|
|
|
|
sub { $_[0] => $_[1]}
|
352
|
|
|
|
|
|
|
'as is trim' = the tag's hash is added to the parent tag's hash
|
353
|
|
|
|
|
|
|
as an attribute named after the tag, the content is trimmed
|
354
|
|
|
|
|
|
|
sub { $_[0] => $_[1]}
|
355
|
|
|
|
|
|
|
'as array' = the tag's hash is pushed to the attribute named after the tag
|
356
|
|
|
|
|
|
|
in the parent tag's hash
|
357
|
|
|
|
|
|
|
sub { '@'.$_[0] => $_[1]}
|
358
|
|
|
|
|
|
|
'as array trim' = the tag's hash is pushed to the attribute named after the tag
|
359
|
|
|
|
|
|
|
in the parent tag's hash, the content is trimmed
|
360
|
|
|
|
|
|
|
sub { '@'.$_[0] => $_[1]}
|
361
|
|
|
|
|
|
|
'no content' = the _content is removed from the tag's hash and the hash
|
362
|
|
|
|
|
|
|
is added to the parent's hash into the attribute named after the tag
|
363
|
|
|
|
|
|
|
sub { delete $_[1]->{_content}; $_[0] => $_[1]}
|
364
|
|
|
|
|
|
|
'no content array' = similar to 'no content' except the hash is pushed
|
365
|
|
|
|
|
|
|
into the array referenced by the attribute
|
366
|
|
|
|
|
|
|
'as array no content' = same as 'no content array'
|
367
|
|
|
|
|
|
|
'pass' = the tag's hash is dissolved into the parent's hash,
|
368
|
|
|
|
|
|
|
that is all tag's attributes become the parent's attributes.
|
369
|
|
|
|
|
|
|
The _content is appended to the parent's _content.
|
370
|
|
|
|
|
|
|
sub { %{$_[1]}}
|
371
|
|
|
|
|
|
|
'pass no content' = the _content is removed and the hash is dissolved
|
372
|
|
|
|
|
|
|
into the parent's hash.
|
373
|
|
|
|
|
|
|
sub { delete $_[1]->{_content}; %{$_[1]}}
|
374
|
|
|
|
|
|
|
'pass without content' = same as 'pass no content'
|
375
|
|
|
|
|
|
|
'raw' = the [tagname => attrs] is pushed to the parent tag's _content.
|
376
|
|
|
|
|
|
|
You would use this style if you wanted to be able to print
|
377
|
|
|
|
|
|
|
the parent tag as XML preserving the whitespace or other textual content
|
378
|
|
|
|
|
|
|
sub { [$_[0] => $_[1]]}
|
379
|
|
|
|
|
|
|
'raw extended' = the [tagname => attrs] is pushed to the parent tag's _content
|
380
|
|
|
|
|
|
|
and the attrs are added to the parent's attribute hash with ":$tagname" as the key
|
381
|
|
|
|
|
|
|
sub { (':'.$_[0] => $_[1], [$_[0] => $_[1]])};
|
382
|
|
|
|
|
|
|
'raw extended array' = the [tagname => attrs] is pushed to the parent tag's _content
|
383
|
|
|
|
|
|
|
and the attrs are pushed to the parent's attribute hash with ":$tagname" as the key
|
384
|
|
|
|
|
|
|
sub { ('@:'.$_[0] => $_[1], [$_[0] => $_[1]])};
|
385
|
|
|
|
|
|
|
'by ' = uses the value of the specified attribute as the key when adding the
|
386
|
|
|
|
|
|
|
attribute hash into the parent tag's hash. You can specify more names, in that case
|
387
|
|
|
|
|
|
|
the first found is used.
|
388
|
|
|
|
|
|
|
sub {delete($_[1]->{name}) => $_[1]}
|
389
|
|
|
|
|
|
|
'content by ' = uses the value of the specified attribute as the key when adding the
|
390
|
|
|
|
|
|
|
tags content into the parent tag's hash. You can specify more names, in that case
|
391
|
|
|
|
|
|
|
the first found is used.
|
392
|
|
|
|
|
|
|
sub {$_[1]->{name} => $_[1]->{_content}}
|
393
|
|
|
|
|
|
|
'no content by ' = uses the value of the specified attribute as the key when adding the
|
394
|
|
|
|
|
|
|
attribute hash into the parent tag's hash. The content is dropped. You can specify more names,
|
395
|
|
|
|
|
|
|
in that case the first found is used.
|
396
|
|
|
|
|
|
|
sub {delete($_[1]->{_content}); delete($_[1]->{name}) => $_[1]}
|
397
|
|
|
|
|
|
|
'==...' = replace the tag by the specified string. That is the string will be added to
|
398
|
|
|
|
|
|
|
the parent tag's _content
|
399
|
|
|
|
|
|
|
sub { return '...' }
|
400
|
|
|
|
|
|
|
'=...' = replace the tag contents by the specified string and forget the attributes.
|
401
|
|
|
|
|
|
|
sub { return $_[0] => '...' }
|
402
|
|
|
|
|
|
|
'' = forget the tag's contents (after processing the rules for subtags)
|
403
|
|
|
|
|
|
|
sub { return };
|
404
|
|
|
|
|
|
|
|
405
|
|
|
|
|
|
|
I include the unnamed subroutines that would be equivalent to the builtin rule in case you need to add
|
406
|
|
|
|
|
|
|
some tests and then behave as if one of the builtins was used.
|
407
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
=head3 Builtin rule modifiers
|
409
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
You can add these modifiers to most rules, just add them to the string literal, at the end, separated from the base rule by a space.
|
411
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
no xmlns = strip the namespace alias from the $_[0] (tag name)
|
413
|
|
|
|
|
|
|
remove(list,of,attributes) = remove all specified attributes (or keys produced by child tag rules) from the tag data
|
414
|
|
|
|
|
|
|
only(list,of,attributes) = filter the hash of attributes and keys+values produced by child tag rules in the tag data
|
415
|
|
|
|
|
|
|
to only include those specified here. In case you need to include the tag content do not forget to include
|
416
|
|
|
|
|
|
|
_content in the list!
|
417
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
Not all modifiers make sense for all rules. For example if the rule is 'content', it's pointless to filter the attributes, because the only one
|
419
|
|
|
|
|
|
|
used will be the content anyway.
|
420
|
|
|
|
|
|
|
|
421
|
|
|
|
|
|
|
The behaviour of the combination of the 'raw...' rules and the rule modifiers is UNDEFINED!
|
422
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
=head3 Different rules for different paths to tags
|
424
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
Since 0.19 it's possible to specify several actions for a tag if you need to do something different based on the path to the tag like this:
|
426
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
tagname => [
|
428
|
|
|
|
|
|
|
'tag/path' => action,
|
429
|
|
|
|
|
|
|
'/root/tag/path' => action,
|
430
|
|
|
|
|
|
|
'/root/*/path' => action,
|
431
|
|
|
|
|
|
|
qr{^root/ns:[^/]+/par$} => action,
|
432
|
|
|
|
|
|
|
default_action
|
433
|
|
|
|
|
|
|
],
|
434
|
|
|
|
|
|
|
|
435
|
|
|
|
|
|
|
The path is matched against the list of parent tags joined by slashes.
|
436
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
If you need to use more complex conditions to select the actions you have to use a single subroutine rule and implement
|
438
|
|
|
|
|
|
|
the conditions within that subroutine. You have access both to the list of enclosing tags and their attribute hashes (including
|
439
|
|
|
|
|
|
|
the data obtained from the rules of the already closed subtags of the enclosing tags.
|
440
|
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
=head2 The Start Rules
|
443
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
Apart from the normal rules that get invoked once the tag is fully parsed, including the contents and child tags, you may want to
|
445
|
|
|
|
|
|
|
attach some code to the start tag to (optionaly) skip whole branches of XML or set up attributes and variables. You may set up
|
446
|
|
|
|
|
|
|
the start rules either in a separate parameter to the constructor or in the rules=> by prepending the tag name(s) by ^.
|
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
These rules are in form
|
449
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
tagspecification => undef / '' / 'skip' --> skip the element, including child tags
|
451
|
|
|
|
|
|
|
tagspecification => 1 / 'handle' --> handle the element, may be needed
|
452
|
|
|
|
|
|
|
if you specify the _default rule.
|
453
|
|
|
|
|
|
|
tagspecification => \&subroutine
|
454
|
|
|
|
|
|
|
|
455
|
|
|
|
|
|
|
The subroutines receive the same parameters as for the "end tag" rules except of course the _content, but their return value is treated differently.
|
456
|
|
|
|
|
|
|
If the subroutine returns a false value then the whole branch enclosed by the current tag is skipped, no data are stored and no rules are
|
457
|
|
|
|
|
|
|
executed. You may modify the hash referenced by $attr.
|
458
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
You may even tie() the hash referenced by $attr, for example in case you want to store the parsed data in a DBM::Deep.
|
460
|
|
|
|
|
|
|
In such case all the data returned by the immediate subtags of this tag will be stored in the DBM::Deep.
|
461
|
|
|
|
|
|
|
Make sure you do not overwrite the data by data from another occurance of the same tag if you return $_[1]/$attr from the rule!
|
462
|
|
|
|
|
|
|
|
463
|
|
|
|
|
|
|
YourHugeTag => sub {
|
464
|
|
|
|
|
|
|
my %temp = %{$_[1]};
|
465
|
|
|
|
|
|
|
tie %{$_[1]}, 'DBM::Deep', $filename;
|
466
|
|
|
|
|
|
|
%{$_[1]} = %temp;
|
467
|
|
|
|
|
|
|
1;
|
468
|
|
|
|
|
|
|
}
|
469
|
|
|
|
|
|
|
|
470
|
|
|
|
|
|
|
Both types of rules are free to store any data they want in $parser->{pad}. This property is NOT emptied
|
471
|
|
|
|
|
|
|
after the parsing!
|
472
|
|
|
|
|
|
|
|
473
|
|
|
|
|
|
|
=head2 Whitespace handling
|
474
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
There are two options that affect the whitespace handling: stripspaces and normalisespaces. The normalisespaces is a simple flag that controls
|
476
|
|
|
|
|
|
|
whether multiple spaces/tabs/newlines are collapsed into a single space or not. The stripspaces is more complex, it's a bit-mask,
|
477
|
|
|
|
|
|
|
an ORed combination of the following options:
|
478
|
|
|
|
|
|
|
|
479
|
|
|
|
|
|
|
0 - don't remove whitespace around tags
|
480
|
|
|
|
|
|
|
(around tags means before the opening tag and after the closing tag, not in the tag's content!)
|
481
|
|
|
|
|
|
|
1 - remove whitespace before tags whose rules did not return any text content
|
482
|
|
|
|
|
|
|
(the rule specified for the tag caused the data of the tag to be ignored,
|
483
|
|
|
|
|
|
|
processed them already or added them as attributes to parent's \%attr)
|
484
|
|
|
|
|
|
|
2 - remove whitespace around tags whose rules did not return any text content
|
485
|
|
|
|
|
|
|
3 - remove whitespace around all tags
|
486
|
|
|
|
|
|
|
|
487
|
|
|
|
|
|
|
0 - remove only whitespace-only content
|
488
|
|
|
|
|
|
|
(that is remove the whitespace around in this case " "
|
489
|
|
|
|
|
|
|
but not this one "blah blah")
|
490
|
|
|
|
|
|
|
4 - remove trailing/leading whitespace
|
491
|
|
|
|
|
|
|
(remove the whitespace in both cases above)
|
492
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
0 - don't trim content
|
494
|
|
|
|
|
|
|
8 - do trim content
|
495
|
|
|
|
|
|
|
(That is for " blah " only pass to the rule {_content => 'blah'})
|
496
|
|
|
|
|
|
|
|
497
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
That is if you have a data oriented XML in which each tag contains either text content or subtags, but not both,
|
499
|
|
|
|
|
|
|
you want to use stripspaces => 3 or stripspaces => 3|4. This will not only make sure you don't need to bother
|
500
|
|
|
|
|
|
|
with the whitespace-only _content of the tags with subtags, but will also make sure you do not keep on wasting
|
501
|
|
|
|
|
|
|
memory while parsing a huge XML and processing the "twigs". Without that option the parent tag of
|
502
|
|
|
|
|
|
|
the repeated tag would keep on accumulating unneeded whitespace in its _content.
|
503
|
|
|
|
|
|
|
|
504
|
|
|
|
|
|
|
=cut
|
505
|
|
|
|
|
|
|
|
506
|
|
|
|
|
|
|
sub new {
|
507
|
|
|
|
|
|
|
my $class = shift;
|
508
|
|
|
|
|
|
|
my %params = @_;
|
509
|
|
|
|
|
|
|
croak "Please specify the rules=> for the parser!" unless $params{rules} and ref($params{rules});
|
510
|
|
|
|
|
|
|
|
511
|
|
|
|
|
|
|
my $self = {rules => {}, start_rules => {}};
|
512
|
|
|
|
|
|
|
bless $self, $class;
|
513
|
|
|
|
|
|
|
|
514
|
|
|
|
|
|
|
my @rules = (ref($params{rules}) eq 'HASH' ? %{$params{rules}} : @{$params{rules}}); # dereference and copy
|
515
|
|
|
|
|
|
|
delete $params{rules};
|
516
|
|
|
|
|
|
|
|
517
|
|
|
|
|
|
|
my @start_rules;
|
518
|
|
|
|
|
|
|
if ($params{start_rules} and ref($params{start_rules})) {
|
519
|
|
|
|
|
|
|
@start_rules = ref($params{start_rules}) eq 'HASH' ? %{$params{start_rules}} : @{$params{start_rules}}; # dereference and copy
|
520
|
|
|
|
|
|
|
};
|
521
|
|
|
|
|
|
|
delete $params{start_rules};
|
522
|
|
|
|
|
|
|
|
523
|
|
|
|
|
|
|
for (my $i=0; $i <= $#rules; $i+=2) {
|
524
|
|
|
|
|
|
|
next unless $rules[$i] =~ s/^\^//;
|
525
|
|
|
|
|
|
|
push @start_rules, splice( @rules, $i, 2);
|
526
|
|
|
|
|
|
|
$i-=2;
|
527
|
|
|
|
|
|
|
}
|
528
|
|
|
|
|
|
|
|
529
|
|
|
|
|
|
|
$self->_split_rules( \@rules, 'rules', 'as is');
|
530
|
|
|
|
|
|
|
$self->_split_rules( \@start_rules, 'start_rules', 'handle');
|
531
|
|
|
|
|
|
|
|
532
|
|
|
|
|
|
|
$self->{for_parser} = {};
|
533
|
|
|
|
|
|
|
{ # extract the params for the XML::Parser::Expat constructor
|
534
|
|
|
|
|
|
|
my @for_parser = grep exists($params{$_}), qw(ProtocolEncoding Namespaces NoExpand Stream_Delimiter ErrorContext ParseParamEnt Base);
|
535
|
|
|
|
|
|
|
if (@for_parser) {
|
536
|
|
|
|
|
|
|
@{$self->{for_parser}}{@for_parser} = @params{@for_parser};
|
537
|
|
|
|
|
|
|
delete @params{@for_parser};
|
538
|
|
|
|
|
|
|
}
|
539
|
|
|
|
|
|
|
}
|
540
|
|
|
|
|
|
|
|
541
|
|
|
|
|
|
|
$self->{namespaces} = delete($params{namespaces});
|
542
|
|
|
|
|
|
|
if (defined($self->{namespaces})) {
|
543
|
|
|
|
|
|
|
croak 'XML::Rules->new( ... , namespaces => ...HERE...) must be a hash reference!'
|
544
|
|
|
|
|
|
|
unless ref($self->{namespaces}) eq 'HASH';
|
545
|
|
|
|
|
|
|
$self->{xmlns_map} = {};
|
546
|
|
|
|
|
|
|
if (defined $self->{namespaces}{'*'}) {
|
547
|
|
|
|
|
|
|
if (! grep $_ eq $self->{namespaces}{'*'}, qw(warn die keep strip), '') {
|
548
|
|
|
|
|
|
|
# local $Carp::CarpLevel = 2;
|
549
|
|
|
|
|
|
|
croak qq{Unknown namespaces->{'*'} option '$self->{namespaces}{'*'}'!};
|
550
|
|
|
|
|
|
|
}
|
551
|
|
|
|
|
|
|
} else {
|
552
|
|
|
|
|
|
|
$self->{namespaces}{'*'} = 'warn';
|
553
|
|
|
|
|
|
|
}
|
554
|
|
|
|
|
|
|
}
|
555
|
|
|
|
|
|
|
|
556
|
|
|
|
|
|
|
$self->{custom_escape} = delete($params{custom_escape}) if exists $params{custom_escape};
|
557
|
|
|
|
|
|
|
$self->{style} = delete($params{style}) || 'parser';
|
558
|
|
|
|
|
|
|
|
559
|
|
|
|
|
|
|
my $handlers = delete $params{handlers}; # need to remove it so that it doesn't end up in opt
|
560
|
|
|
|
|
|
|
|
561
|
|
|
|
|
|
|
$self->{opt}{lc $_} = $params{$_} for keys %params;
|
562
|
|
|
|
|
|
|
|
563
|
|
|
|
|
|
|
delete $self->{opt}{encode} if $self->{opt}{encode} =~ /^utf-?8$/i;
|
564
|
|
|
|
|
|
|
delete $self->{opt}{output_encoding} if $self->{opt}{output_encoding} =~ /^utf-?8$/i;
|
565
|
|
|
|
|
|
|
|
566
|
|
|
|
|
|
|
for (qw(normalisespace normalizespace normalizespaces)) {
|
567
|
|
|
|
|
|
|
last if defined($self->{opt}{normalisespaces});
|
568
|
|
|
|
|
|
|
$self->{opt}{normalisespaces} = $self->{opt}{$_};
|
569
|
|
|
|
|
|
|
delete $self->{opt}{$_};
|
570
|
|
|
|
|
|
|
}
|
571
|
|
|
|
|
|
|
$self->{opt}{normalisespaces} = 0 unless(defined($self->{opt}{normalisespaces}));
|
572
|
|
|
|
|
|
|
$self->{opt}{stripspaces} = 0 unless(defined($self->{opt}{stripspaces}));
|
573
|
|
|
|
|
|
|
|
574
|
|
|
|
|
|
|
require 'Encode.pm' if ($self->{opt}{encode} or $self->{opt}{output_encoding});
|
575
|
|
|
|
|
|
|
|
576
|
|
|
|
|
|
|
if ($handlers) {
|
577
|
|
|
|
|
|
|
croak qq{The 'handlers' option must be a hashref!} unless ref($handlers) eq 'HASH';
|
578
|
|
|
|
|
|
|
my %handlers = %{$handlers}; # shallow copy
|
579
|
|
|
|
|
|
|
|
580
|
|
|
|
|
|
|
for (qw(Start End Char XMLDecl), ($self->{style} eq 'filter' ? qw(CdataStart CdataEnd) : ())) {
|
581
|
|
|
|
|
|
|
no strict 'refs';
|
582
|
|
|
|
|
|
|
if ($handlers{$_}) {
|
583
|
|
|
|
|
|
|
my $custom = $handlers{$_};
|
584
|
|
|
|
|
|
|
my $mine = "_$_"->($self);
|
585
|
|
|
|
|
|
|
# $handlers{$_} = sub {$custom->(@_); $mine->(@_)}
|
586
|
|
|
|
|
|
|
$handlers{$_} = sub {&$custom; &$mine}
|
587
|
|
|
|
|
|
|
} else {
|
588
|
|
|
|
|
|
|
$handlers{$_} = "_$_"->($self);
|
589
|
|
|
|
|
|
|
}
|
590
|
|
|
|
|
|
|
}
|
591
|
|
|
|
|
|
|
|
592
|
|
|
|
|
|
|
for (qw(Start End Char XMLDecl)) {
|
593
|
|
|
|
|
|
|
$self->{basic_handlers}{$_} = delete $self->{other_handlers}{$_} if exists $self->{other_handlers}{$_};
|
594
|
|
|
|
|
|
|
}
|
595
|
|
|
|
|
|
|
$self->{normal_handlers} = [ %handlers ];
|
596
|
|
|
|
|
|
|
} else {
|
597
|
|
|
|
|
|
|
$self->{normal_handlers} = [
|
598
|
|
|
|
|
|
|
Start => _Start($self),
|
599
|
|
|
|
|
|
|
End => _End($self),
|
600
|
|
|
|
|
|
|
Char => _Char($self),
|
601
|
|
|
|
|
|
|
XMLDecl => _XMLDecl($self),
|
602
|
|
|
|
|
|
|
(
|
603
|
|
|
|
|
|
|
$self->{style} eq 'filter' ? (CdataStart => _CdataStart($self), CdataEnd => _CdataEnd ($self)) : ()
|
604
|
|
|
|
|
|
|
)
|
605
|
|
|
|
|
|
|
];
|
606
|
|
|
|
|
|
|
}
|
607
|
|
|
|
|
|
|
$self->{ignore_handlers} = [
|
608
|
|
|
|
|
|
|
Start => _StartIgnore($self),
|
609
|
|
|
|
|
|
|
Char => undef,
|
610
|
|
|
|
|
|
|
End => _EndIgnore($self),
|
611
|
|
|
|
|
|
|
];
|
612
|
|
|
|
|
|
|
|
613
|
|
|
|
|
|
|
return $self;
|
614
|
|
|
|
|
|
|
}
|
615
|
|
|
|
|
|
|
|
616
|
|
|
|
|
|
|
sub _split_rules {
|
617
|
|
|
|
|
|
|
my ($self, $rules, $type, $default) = @_;
|
618
|
|
|
|
|
|
|
|
619
|
|
|
|
|
|
|
$self->{$type}{_default} = $default unless exists($self->{$type}{_default});
|
620
|
|
|
|
|
|
|
|
621
|
|
|
|
|
|
|
while (@$rules) {
|
622
|
|
|
|
|
|
|
my ($tag, $code) = (shift(@$rules), shift(@$rules));
|
623
|
|
|
|
|
|
|
|
624
|
|
|
|
|
|
|
if (ref($code) eq 'ARRAY') {
|
625
|
|
|
|
|
|
|
for( my $i = 0; $i < $#$code; $i+=2) {
|
626
|
|
|
|
|
|
|
$code->[$i] = _xpath2re($code->[$i]);
|
627
|
|
|
|
|
|
|
}
|
628
|
|
|
|
|
|
|
push @$code, $self->{$type}{_default} if @$code % 2 == 0; # add the default type if there's even number of items (path => code, path => code)
|
629
|
|
|
|
|
|
|
}
|
630
|
|
|
|
|
|
|
|
631
|
|
|
|
|
|
|
if ($tag =~ m{^/([^/].*)/([imosx]*)$}) { # string with a '/regexp/'
|
632
|
|
|
|
|
|
|
if ($2) {
|
633
|
|
|
|
|
|
|
push @{$self->{$type.'_re'}}, qr/(?$2)$1/;
|
634
|
|
|
|
|
|
|
} else {
|
635
|
|
|
|
|
|
|
push @{$self->{$type.'_re'}}, qr/$1/;
|
636
|
|
|
|
|
|
|
}
|
637
|
|
|
|
|
|
|
push @{$self->{$type.'_re_code'}}, $code;
|
638
|
|
|
|
|
|
|
} elsif (ref($tag) eq 'Regexp') { # a qr// created regexp
|
639
|
|
|
|
|
|
|
push @{$self->{$type.'_re'}}, $tag;
|
640
|
|
|
|
|
|
|
push @{$self->{$type.'_re_code'}}, $code;
|
641
|
|
|
|
|
|
|
} elsif ($tag =~ /[,\|]/) { # a , or | separated list
|
642
|
|
|
|
|
|
|
if ($tag =~ s/^\^//) {
|
643
|
|
|
|
|
|
|
my @tags = split(/\s*[,\|]\s*/, $tag);
|
644
|
|
|
|
|
|
|
$self->{$type}{'^'.$_} = $code for (@tags);
|
645
|
|
|
|
|
|
|
} else {
|
646
|
|
|
|
|
|
|
my @tags = split(/\s*[,\|]\s*/, $tag);
|
647
|
|
|
|
|
|
|
$self->{$type}{$_} = $code for (@tags);
|
648
|
|
|
|
|
|
|
}
|
649
|
|
|
|
|
|
|
} else { # a single tag
|
650
|
|
|
|
|
|
|
$self->{$type}{$tag} = $code;
|
651
|
|
|
|
|
|
|
}
|
652
|
|
|
|
|
|
|
}
|
653
|
|
|
|
|
|
|
}
|
654
|
|
|
|
|
|
|
|
655
|
|
|
|
|
|
|
sub _xpath2re {
|
656
|
|
|
|
|
|
|
my $s = shift;
|
657
|
|
|
|
|
|
|
return $s if ref($s);
|
658
|
|
|
|
|
|
|
for ($s) {
|
659
|
|
|
|
|
|
|
s/([\.\[\]+{}\-])/\\$1/g;
|
660
|
|
|
|
|
|
|
s{\*}{.+}g;
|
661
|
|
|
|
|
|
|
s{^//}{}s;
|
662
|
|
|
|
|
|
|
s{^/}{^}s;
|
663
|
|
|
|
|
|
|
}
|
664
|
|
|
|
|
|
|
return qr{$s$};
|
665
|
|
|
|
|
|
|
}
|
666
|
|
|
|
|
|
|
|
667
|
|
|
|
|
|
|
sub _import_usage {
|
668
|
|
|
|
|
|
|
croak
|
669
|
|
|
|
|
|
|
"Usage: use XML::Rules subroutine_name => {method => '...', rules => {...}, ...};
|
670
|
|
|
|
|
|
|
or use XML::Rules inferRules => 'file/path.dtd';
|
671
|
|
|
|
|
|
|
or use XML::Rules inferRules => 'file/path.xml';
|
672
|
|
|
|
|
|
|
or use XML::Rules inferRules => ['file/path1.xml','file/path2.xml'];"
|
673
|
|
|
|
|
|
|
}
|
674
|
|
|
|
|
|
|
|
675
|
|
|
|
|
|
|
sub import {
|
676
|
|
|
|
|
|
|
my $class = shift();
|
677
|
|
|
|
|
|
|
return unless @_;
|
678
|
|
|
|
|
|
|
_import_usage() unless scalar(@_) % 2 == 0;
|
679
|
|
|
|
|
|
|
my $caller_pack = caller;
|
680
|
|
|
|
|
|
|
while (@_) {
|
681
|
|
|
|
|
|
|
my $subname = shift;
|
682
|
|
|
|
|
|
|
my $params = shift;
|
683
|
|
|
|
|
|
|
|
684
|
|
|
|
|
|
|
if (lc($subname) eq 'inferrules') {
|
685
|
|
|
|
|
|
|
require Data::Dumper;
|
686
|
|
|
|
|
|
|
local $Data::Dumper::Terse = 1;
|
687
|
|
|
|
|
|
|
local $Data::Dumper::Indent = 1;
|
688
|
|
|
|
|
|
|
if (ref $params) {
|
689
|
|
|
|
|
|
|
if (ref $params eq 'ARRAY') {
|
690
|
|
|
|
|
|
|
print Data::Dumper::Dumper(inferRulesFromExample(@$params))
|
691
|
|
|
|
|
|
|
} else {
|
692
|
|
|
|
|
|
|
_import_usage()
|
693
|
|
|
|
|
|
|
}
|
694
|
|
|
|
|
|
|
} elsif ($params =~ /\.dtd$/i) {
|
695
|
|
|
|
|
|
|
print Data::Dumper::Dumper(inferRulesFromDTD($params))
|
696
|
|
|
|
|
|
|
} else {
|
697
|
|
|
|
|
|
|
print Data::Dumper::Dumper(inferRulesFromExample($params))
|
698
|
|
|
|
|
|
|
}
|
699
|
|
|
|
|
|
|
} else {
|
700
|
|
|
|
|
|
|
_import_usage()
|
701
|
|
|
|
|
|
|
unless !ref($subname) and ref($params) eq 'HASH';
|
702
|
|
|
|
|
|
|
|
703
|
|
|
|
|
|
|
my $method = delete $params->{method} || $subname;
|
704
|
|
|
|
|
|
|
if (!$params->{rules} && $method =~ /^[tT]oXML$/) {
|
705
|
|
|
|
|
|
|
$params->{rules} = {};
|
706
|
|
|
|
|
|
|
}
|
707
|
|
|
|
|
|
|
my $parser = XML::Rules->new(%$params);
|
708
|
|
|
|
|
|
|
|
709
|
|
|
|
|
|
|
no strict 'refs';
|
710
|
|
|
|
|
|
|
*{$caller_pack . '::' . $subname} = sub {unshift @_, $parser; goto &$method; };
|
711
|
|
|
|
|
|
|
}
|
712
|
|
|
|
|
|
|
}
|
713
|
|
|
|
|
|
|
}
|
714
|
|
|
|
|
|
|
|
715
|
|
|
|
|
|
|
sub skip_rest {
|
716
|
|
|
|
|
|
|
die "[XML::Rules] skip rest\n";
|
717
|
|
|
|
|
|
|
}
|
718
|
|
|
|
|
|
|
|
719
|
|
|
|
|
|
|
sub return_nothing {
|
720
|
|
|
|
|
|
|
die "[XML::Rules] return nothing\n";
|
721
|
|
|
|
|
|
|
}
|
722
|
|
|
|
|
|
|
|
723
|
|
|
|
|
|
|
sub return_this {
|
724
|
|
|
|
|
|
|
my $self = shift();
|
725
|
|
|
|
|
|
|
die bless({val => [@_]}, "XML::Rules::return_this");
|
726
|
|
|
|
|
|
|
}
|
727
|
|
|
|
|
|
|
|
728
|
|
|
|
|
|
|
sub _run {
|
729
|
|
|
|
|
|
|
my $self = shift;
|
730
|
|
|
|
|
|
|
my $string = shift;
|
731
|
|
|
|
|
|
|
|
732
|
|
|
|
|
|
|
croak "This parser is already busy parsing a document!" if exists $self->{parser};
|
733
|
|
|
|
|
|
|
|
734
|
|
|
|
|
|
|
$self->{parameters} = shift;
|
735
|
|
|
|
|
|
|
|
736
|
|
|
|
|
|
|
$self->{parser} = XML::Parser::Expat->new( %{$self->{for_parser}});
|
737
|
|
|
|
|
|
|
|
738
|
|
|
|
|
|
|
$self->{parser}->setHandlers( @{$self->{normal_handlers}} );
|
739
|
|
|
|
|
|
|
|
740
|
|
|
|
|
|
|
$self->{data} = [];
|
741
|
|
|
|
|
|
|
$self->{context} = [];
|
742
|
|
|
|
|
|
|
$self->{_ltrim} = [0];
|
743
|
|
|
|
|
|
|
|
744
|
|
|
|
|
|
|
if (! eval {
|
745
|
|
|
|
|
|
|
$self->{parser}->parse($string) and 1;
|
746
|
|
|
|
|
|
|
}) {
|
747
|
|
|
|
|
|
|
my $err = $@;
|
748
|
|
|
|
|
|
|
undef $@;
|
749
|
|
|
|
|
|
|
if ($err =~ /^\[XML::Rules\] skip rest/) {
|
750
|
|
|
|
|
|
|
my (undef, $handler) = $self->{parser}->setHandlers(End => undef);
|
751
|
|
|
|
|
|
|
foreach my $tag (reverse @{$self->{context} = []}) {
|
752
|
|
|
|
|
|
|
$handler->( $self->{parser}, $tag);
|
753
|
|
|
|
|
|
|
}
|
754
|
|
|
|
|
|
|
} else {
|
755
|
|
|
|
|
|
|
|
756
|
|
|
|
|
|
|
delete $self->{parameters};
|
757
|
|
|
|
|
|
|
$self->{parser}->release();
|
758
|
|
|
|
|
|
|
|
759
|
|
|
|
|
|
|
$self->{data} = [];
|
760
|
|
|
|
|
|
|
$self->{context} = [];
|
761
|
|
|
|
|
|
|
|
762
|
|
|
|
|
|
|
if ($err =~ /^\[XML::Rules\] return nothing/) {
|
763
|
|
|
|
|
|
|
return;
|
764
|
|
|
|
|
|
|
} elsif (ref $err eq 'XML::Rules::return_this') {
|
765
|
|
|
|
|
|
|
if (wantarray()) {
|
766
|
|
|
|
|
|
|
return @{$err->{val}}
|
767
|
|
|
|
|
|
|
} else {
|
768
|
|
|
|
|
|
|
return ${$err->{val}}[-1]
|
769
|
|
|
|
|
|
|
}
|
770
|
|
|
|
|
|
|
}
|
771
|
|
|
|
|
|
|
|
772
|
|
|
|
|
|
|
$err =~ s/at \S+Rules\.pm line \d+$//
|
773
|
|
|
|
|
|
|
and croak $err or die $err;
|
774
|
|
|
|
|
|
|
}
|
775
|
|
|
|
|
|
|
};
|
776
|
|
|
|
|
|
|
|
777
|
|
|
|
|
|
|
$self->{parser}->release();
|
778
|
|
|
|
|
|
|
delete $self->{parser};
|
779
|
|
|
|
|
|
|
|
780
|
|
|
|
|
|
|
delete $self->{parameters};
|
781
|
|
|
|
|
|
|
my $data; # return the accumulated data, without keeping a copy inside the object
|
782
|
|
|
|
|
|
|
($data, $self->{data}) = ($self->{data}[0], undef);
|
783
|
|
|
|
|
|
|
if (!defined(wantarray()) or ! keys(%$data)) {
|
784
|
|
|
|
|
|
|
return;
|
785
|
|
|
|
|
|
|
|
786
|
|
|
|
|
|
|
} elsif (keys(%$data) == 1 and exists(${$data}{_content})) {
|
787
|
|
|
|
|
|
|
if (ref(${$data}{_content}) eq 'ARRAY' and @{${$data}{_content}} == 1) {
|
788
|
|
|
|
|
|
|
return ${${$data}{_content}}[0]
|
789
|
|
|
|
|
|
|
} else {
|
790
|
|
|
|
|
|
|
return ${$data}{_content}
|
791
|
|
|
|
|
|
|
}
|
792
|
|
|
|
|
|
|
|
793
|
|
|
|
|
|
|
} else {
|
794
|
|
|
|
|
|
|
return $data;
|
795
|
|
|
|
|
|
|
}
|
796
|
|
|
|
|
|
|
}
|
797
|
|
|
|
|
|
|
|
798
|
|
|
|
|
|
|
|
799
|
|
|
|
|
|
|
sub parsestring;
|
800
|
|
|
|
|
|
|
*parsestring = \&parse;
|
801
|
|
|
|
|
|
|
sub parse_string;
|
802
|
|
|
|
|
|
|
*parse_string = \&parse;
|
803
|
|
|
|
|
|
|
sub parse {
|
804
|
|
|
|
|
|
|
if (!ref $_[0] and $_[0] eq 'XML::Rules') {
|
805
|
|
|
|
|
|
|
my $parser = &new; # get's the current @_
|
806
|
|
|
|
|
|
|
return sub {unshift @_, $parser; goto &parse;}
|
807
|
|
|
|
|
|
|
}
|
808
|
|
|
|
|
|
|
my $self = shift;
|
809
|
|
|
|
|
|
|
croak("This XML::Rules object may only be used as a filter!") if ($self->{style} eq 'filter');
|
810
|
|
|
|
|
|
|
$self->_run(@_);
|
811
|
|
|
|
|
|
|
}
|
812
|
|
|
|
|
|
|
|
813
|
|
|
|
|
|
|
sub parse_file;
|
814
|
|
|
|
|
|
|
*parse_file = \&parsefile;
|
815
|
|
|
|
|
|
|
sub parsefile {
|
816
|
|
|
|
|
|
|
if (!ref $_[0] and $_[0] eq 'XML::Rules') {
|
817
|
|
|
|
|
|
|
my $parser = &new; # get's the current @_
|
818
|
|
|
|
|
|
|
return sub {unshift @_, $parser; goto &parsefile;}
|
819
|
|
|
|
|
|
|
}
|
820
|
|
|
|
|
|
|
my $self = shift;
|
821
|
|
|
|
|
|
|
croak("This XML::Rules object may only be used as a filter!") if ($self->{style} eq 'filter');
|
822
|
|
|
|
|
|
|
my $filename = shift;
|
823
|
|
|
|
|
|
|
open my $IN, '<', $filename or croak "Cannot open '$filename' for reading: $^E";
|
824
|
|
|
|
|
|
|
return $self->_run($IN, @_);
|
825
|
|
|
|
|
|
|
}
|
826
|
|
|
|
|
|
|
|
827
|
|
|
|
|
|
|
|
828
|
|
|
|
|
|
|
sub filterstring;
|
829
|
|
|
|
|
|
|
*filterstring = \&filter;
|
830
|
|
|
|
|
|
|
sub filter_string;
|
831
|
|
|
|
|
|
|
*filter_string = \&filter;
|
832
|
|
|
|
|
|
|
sub filter {
|
833
|
|
|
|
|
|
|
if (!ref $_[0] and $_[0] eq 'XML::Rules') {
|
834
|
|
|
|
|
|
|
my $parser = &new; # get's the current @_
|
835
|
|
|
|
|
|
|
return sub {unshift @_, $parser; goto &filter;}
|
836
|
|
|
|
|
|
|
}
|
837
|
|
|
|
|
|
|
my $self = shift;
|
838
|
|
|
|
|
|
|
croak("This XML::Rules object may only be used as a parser!") unless ($self->{style} eq 'filter');
|
839
|
|
|
|
|
|
|
|
840
|
|
|
|
|
|
|
my $XML = shift;
|
841
|
|
|
|
|
|
|
$self->{FH} = shift || select(); # either passed or the selected filehandle
|
842
|
|
|
|
|
|
|
if (!ref($self->{FH})) {
|
843
|
|
|
|
|
|
|
if ($self->{FH} =~ /^main::(?:STDOUT|STDERR)$/) {
|
844
|
|
|
|
|
|
|
# yeah, select sometimes returns the name of the filehandle, not the filehandle itself. eg. "main::STDOUT"
|
845
|
|
|
|
|
|
|
no strict;
|
846
|
|
|
|
|
|
|
$self->{FH} = \*{$self->{FH}};
|
847
|
|
|
|
|
|
|
} else {
|
848
|
|
|
|
|
|
|
open my $FH, '>:utf8', $self->{FH} or croak(qq{Failed to open "$self->{FH}" for writing: $^E});
|
849
|
|
|
|
|
|
|
$self->{FH} = $FH;
|
850
|
|
|
|
|
|
|
}
|
851
|
|
|
|
|
|
|
} elsif (ref($self->{FH}) eq 'SCALAR') {
|
852
|
|
|
|
|
|
|
open my $FH, '>', $self->{FH};
|
853
|
|
|
|
|
|
|
$self->{FH} = $FH;
|
854
|
|
|
|
|
|
|
}
|
855
|
|
|
|
|
|
|
if (! $self->{opt}{skip_xml_version}) {
|
856
|
|
|
|
|
|
|
if ($self->{opt}{output_encoding}) {
|
857
|
|
|
|
|
|
|
print {$self->{FH}} qq{{opt}{output_encoding}"?>\n};
|
858
|
|
|
|
|
|
|
} else {
|
859
|
|
|
|
|
|
|
print {$self->{FH}} qq{\n};
|
860
|
|
|
|
|
|
|
}
|
861
|
|
|
|
|
|
|
}
|
862
|
|
|
|
|
|
|
|
863
|
|
|
|
|
|
|
$self->_run($XML, @_);
|
864
|
|
|
|
|
|
|
print {$self->{FH}} "\n";
|
865
|
|
|
|
|
|
|
delete $self->{FH};
|
866
|
|
|
|
|
|
|
}
|
867
|
|
|
|
|
|
|
|
868
|
|
|
|
|
|
|
sub filterfile {
|
869
|
|
|
|
|
|
|
if (!ref $_[0] and $_[0] eq 'XML::Rules') {
|
870
|
|
|
|
|
|
|
my $parser = &new; # get's the current @_
|
871
|
|
|
|
|
|
|
return sub {unshift @_, $parser; goto &filterfile;}
|
872
|
|
|
|
|
|
|
}
|
873
|
|
|
|
|
|
|
my $self = shift;
|
874
|
|
|
|
|
|
|
croak("This XML::Rules object may only be used as a parser!") unless ($self->{style} eq 'filter');
|
875
|
|
|
|
|
|
|
|
876
|
|
|
|
|
|
|
my $filename = shift;
|
877
|
|
|
|
|
|
|
open my $IN, '<', $filename or croak "Cannot open '$filename' for reading: $^E";
|
878
|
|
|
|
|
|
|
|
879
|
|
|
|
|
|
|
$self->{FH} = shift || select(); # either passed or the selected filehandle
|
880
|
|
|
|
|
|
|
if (!ref($self->{FH})) {
|
881
|
|
|
|
|
|
|
if ($self->{FH} =~ /^main::(?:STDOUT|STDERR)$/) {
|
882
|
|
|
|
|
|
|
# yeah, select sometimes returns the name of the filehandle, not the filehandle itself. eg. "main::STDOUT"
|
883
|
|
|
|
|
|
|
no strict;
|
884
|
|
|
|
|
|
|
$self->{FH} = \*{$self->{FH}};
|
885
|
|
|
|
|
|
|
} else {
|
886
|
|
|
|
|
|
|
open my $FH, '>:utf8', $self->{FH} or croak(qq{Failed to open "$self->{FH}" for writing: $^E});
|
887
|
|
|
|
|
|
|
$self->{FH} = $FH;
|
888
|
|
|
|
|
|
|
}
|
889
|
|
|
|
|
|
|
} elsif (ref($self->{FH}) eq 'SCALAR') {
|
890
|
|
|
|
|
|
|
open $self->{FH}, '>', $self->{FH};
|
891
|
|
|
|
|
|
|
}
|
892
|
|
|
|
|
|
|
if (! $self->{opt}{skip_xml_version}) {
|
893
|
|
|
|
|
|
|
if ($self->{opt}{output_encoding}) {
|
894
|
|
|
|
|
|
|
print {$self->{FH}} qq{{opt}{output_encoding}"?>\n};
|
895
|
|
|
|
|
|
|
} else {
|
896
|
|
|
|
|
|
|
print {$self->{FH}} qq{\n};
|
897
|
|
|
|
|
|
|
}
|
898
|
|
|
|
|
|
|
}
|
899
|
|
|
|
|
|
|
$self->_run($IN, @_);
|
900
|
|
|
|
|
|
|
print {$self->{FH}} "\n";
|
901
|
|
|
|
|
|
|
delete $self->{FH};
|
902
|
|
|
|
|
|
|
}
|
903
|
|
|
|
|
|
|
|
904
|
|
|
|
|
|
|
## chunk processing
|
905
|
|
|
|
|
|
|
|
906
|
|
|
|
|
|
|
sub parse_chunk {
|
907
|
|
|
|
|
|
|
my $self = shift;
|
908
|
|
|
|
|
|
|
croak("This XML::Rules object may only be used as a filter!") if ($self->{style} eq 'filter');
|
909
|
|
|
|
|
|
|
$self->_parse_or_filter_chunk(@_);
|
910
|
|
|
|
|
|
|
}
|
911
|
|
|
|
|
|
|
|
912
|
|
|
|
|
|
|
sub _parse_or_filter_chunk {
|
913
|
|
|
|
|
|
|
my $self = shift;
|
914
|
|
|
|
|
|
|
my $string = shift;
|
915
|
|
|
|
|
|
|
|
916
|
|
|
|
|
|
|
if (exists $self->{parser}) {
|
917
|
|
|
|
|
|
|
if (ref($self->{parser}) ne 'XML::Parser::ExpatNB') {
|
918
|
|
|
|
|
|
|
croak "This parser is already busy parsing a full document!";
|
919
|
|
|
|
|
|
|
} else {
|
920
|
|
|
|
|
|
|
if (exists $self->{chunk_processing_result}) {
|
921
|
|
|
|
|
|
|
if (defined $self->{chunk_processing_result}) {
|
922
|
|
|
|
|
|
|
if (wantarray()) {
|
923
|
|
|
|
|
|
|
return @{$self->{chunk_processing_result}}
|
924
|
|
|
|
|
|
|
} else {
|
925
|
|
|
|
|
|
|
return ${$self->{chunk_processing_result}}[-1]
|
926
|
|
|
|
|
|
|
}
|
927
|
|
|
|
|
|
|
} else {
|
928
|
|
|
|
|
|
|
return;
|
929
|
|
|
|
|
|
|
}
|
930
|
|
|
|
|
|
|
}
|
931
|
|
|
|
|
|
|
|
932
|
|
|
|
|
|
|
if (! eval {
|
933
|
|
|
|
|
|
|
$self->{parser}->parse_more($string) and 1;
|
934
|
|
|
|
|
|
|
}) {
|
935
|
|
|
|
|
|
|
my $err = $@;
|
936
|
|
|
|
|
|
|
undef $@;
|
937
|
|
|
|
|
|
|
if ($err =~ /^\[XML::Rules\] skip rest/) {
|
938
|
|
|
|
|
|
|
my (undef, $handler) = $self->{parser}->setHandlers(End => undef);
|
939
|
|
|
|
|
|
|
foreach my $tag (reverse @{$self->{context} = []}) {
|
940
|
|
|
|
|
|
|
$handler->( $self->{parser}, $tag);
|
941
|
|
|
|
|
|
|
}
|
942
|
|
|
|
|
|
|
} else {
|
943
|
|
|
|
|
|
|
|
944
|
|
|
|
|
|
|
delete $self->{parameters};
|
945
|
|
|
|
|
|
|
$self->{parser}->release();
|
946
|
|
|
|
|
|
|
|
947
|
|
|
|
|
|
|
$self->{data} = [];
|
948
|
|
|
|
|
|
|
$self->{context} = [];
|
949
|
|
|
|
|
|
|
|
950
|
|
|
|
|
|
|
if ($err =~ /^\[XML::Rules\] return nothing/) {
|
951
|
|
|
|
|
|
|
$self->{chunk_processing_result} = undef;
|
952
|
|
|
|
|
|
|
return;
|
953
|
|
|
|
|
|
|
} elsif (ref $err eq 'XML::Rules::return_this') {
|
954
|
|
|
|
|
|
|
$self->{chunk_processing_result} = $err->{val};
|
955
|
|
|
|
|
|
|
if (wantarray()) {
|
956
|
|
|
|
|
|
|
return @{$err->{val}}
|
957
|
|
|
|
|
|
|
} else {
|
958
|
|
|
|
|
|
|
return ${$err->{val}}[-1]
|
959
|
|
|
|
|
|
|
}
|
960
|
|
|
|
|
|
|
}
|
961
|
|
|
|
|
|
|
|
962
|
|
|
|
|
|
|
$err =~ s/at \S+Rules\.pm line \d+$//
|
963
|
|
|
|
|
|
|
and croak $err or die $err;
|
964
|
|
|
|
|
|
|
}
|
965
|
|
|
|
|
|
|
};
|
966
|
|
|
|
|
|
|
return 1;
|
967
|
|
|
|
|
|
|
}
|
968
|
|
|
|
|
|
|
}
|
969
|
|
|
|
|
|
|
|
970
|
|
|
|
|
|
|
$self->{parameters} = shift;
|
971
|
|
|
|
|
|
|
|
972
|
|
|
|
|
|
|
$self->{parser} = XML::Parser::ExpatNB->new( %{$self->{for_parser}});
|
973
|
|
|
|
|
|
|
|
974
|
|
|
|
|
|
|
$self->{parser}->setHandlers( @{$self->{normal_handlers}} );
|
975
|
|
|
|
|
|
|
|
976
|
|
|
|
|
|
|
$self->{data} = [];
|
977
|
|
|
|
|
|
|
$self->{context} = [];
|
978
|
|
|
|
|
|
|
$self->{_ltrim} = [0];
|
979
|
|
|
|
|
|
|
|
980
|
|
|
|
|
|
|
return $self->_parse_or_filter_chunk($string);
|
981
|
|
|
|
|
|
|
}
|
982
|
|
|
|
|
|
|
|
983
|
|
|
|
|
|
|
sub filter_chunk {
|
984
|
|
|
|
|
|
|
my $self = shift;
|
985
|
|
|
|
|
|
|
croak("This XML::Rules object may only be used as a parser!") unless ($self->{style} eq 'filter');
|
986
|
|
|
|
|
|
|
|
987
|
|
|
|
|
|
|
my $XML = shift;
|
988
|
|
|
|
|
|
|
|
989
|
|
|
|
|
|
|
if (!exists $self->{FH}) {
|
990
|
|
|
|
|
|
|
$self->{FH} = shift || select(); # either passed or the selected filehandle
|
991
|
|
|
|
|
|
|
if (!ref($self->{FH})) {
|
992
|
|
|
|
|
|
|
if ($self->{FH} =~ /^main::(?:STDOUT|STDERR)$/) {
|
993
|
|
|
|
|
|
|
# yeah, select sometimes returns the name of the filehandle, not the filehandle itself. eg. "main::STDOUT"
|
994
|
|
|
|
|
|
|
no strict;
|
995
|
|
|
|
|
|
|
$self->{FH} = \*{$self->{FH}};
|
996
|
|
|
|
|
|
|
} else {
|
997
|
|
|
|
|
|
|
open my $FH, '>:utf8', $self->{FH} or croak(qq{Failed to open "$self->{FH}" for writing: $^E});
|
998
|
|
|
|
|
|
|
$self->{FH} = $FH;
|
999
|
|
|
|
|
|
|
}
|
1000
|
|
|
|
|
|
|
} elsif (ref($self->{FH}) eq 'SCALAR') {
|
1001
|
|
|
|
|
|
|
open my $FH, '>', $self->{FH};
|
1002
|
|
|
|
|
|
|
$self->{FH} = $FH;
|
1003
|
|
|
|
|
|
|
}
|
1004
|
|
|
|
|
|
|
if (! $self->{opt}{skip_xml_version}) {
|
1005
|
|
|
|
|
|
|
if ($self->{opt}{output_encoding}) {
|
1006
|
|
|
|
|
|
|
print {$self->{FH}} qq{{opt}{output_encoding}"?>\n};
|
1007
|
|
|
|
|
|
|
} else {
|
1008
|
|
|
|
|
|
|
print {$self->{FH}} qq{\n};
|
1009
|
|
|
|
|
|
|
}
|
1010
|
|
|
|
|
|
|
}
|
1011
|
|
|
|
|
|
|
}
|
1012
|
|
|
|
|
|
|
|
1013
|
|
|
|
|
|
|
$self->_parse_or_filter_chunk($XML, @_);
|
1014
|
|
|
|
|
|
|
}
|
1015
|
|
|
|
|
|
|
|
1016
|
|
|
|
|
|
|
sub last_chunk {
|
1017
|
|
|
|
|
|
|
my $self = shift;
|
1018
|
|
|
|
|
|
|
my $string = shift;
|
1019
|
|
|
|
|
|
|
if (exists $self->{parser}) {
|
1020
|
|
|
|
|
|
|
if (ref($self->{parser}) ne 'XML::Parser::ExpatNB') {
|
1021
|
|
|
|
|
|
|
if (exists $self->{FH}) { # in case it was a filter ...
|
1022
|
|
|
|
|
|
|
print {$self->{FH}} "\n";
|
1023
|
|
|
|
|
|
|
delete $self->{FH};
|
1024
|
|
|
|
|
|
|
}
|
1025
|
|
|
|
|
|
|
croak "This parser is already busy parsing a full document!";
|
1026
|
|
|
|
|
|
|
} else {
|
1027
|
|
|
|
|
|
|
if (exists $self->{chunk_processing_result}) {
|
1028
|
|
|
|
|
|
|
if (exists $self->{FH}) { # in case it was a filter ...
|
1029
|
|
|
|
|
|
|
print {$self->{FH}} "\n";
|
1030
|
|
|
|
|
|
|
delete $self->{FH};
|
1031
|
|
|
|
|
|
|
}
|
1032
|
|
|
|
|
|
|
if (defined $self->{chunk_processing_result}) {
|
1033
|
|
|
|
|
|
|
if (wantarray()) {
|
1034
|
|
|
|
|
|
|
return @{$self->{chunk_processing_result}}
|
1035
|
|
|
|
|
|
|
} else {
|
1036
|
|
|
|
|
|
|
return ${$self->{chunk_processing_result}}[-1]
|
1037
|
|
|
|
|
|
|
}
|
1038
|
|
|
|
|
|
|
} else {
|
1039
|
|
|
|
|
|
|
return;
|
1040
|
|
|
|
|
|
|
}
|
1041
|
|
|
|
|
|
|
}
|
1042
|
|
|
|
|
|
|
}
|
1043
|
|
|
|
|
|
|
} elsif (defined $string) {
|
1044
|
|
|
|
|
|
|
return ($self->{style} eq 'filter') ? $self->filter($string,@_) : $self->parse($string); # no chunks in processing
|
1045
|
|
|
|
|
|
|
} else {
|
1046
|
|
|
|
|
|
|
return;
|
1047
|
|
|
|
|
|
|
}
|
1048
|
|
|
|
|
|
|
|
1049
|
|
|
|
|
|
|
if (defined $string) {
|
1050
|
|
|
|
|
|
|
$self->_parse_or_filter_chunk($string);
|
1051
|
|
|
|
|
|
|
}
|
1052
|
|
|
|
|
|
|
|
1053
|
|
|
|
|
|
|
$self->{parser}->parse_done();
|
1054
|
|
|
|
|
|
|
delete $self->{parser};
|
1055
|
|
|
|
|
|
|
|
1056
|
|
|
|
|
|
|
if (exists $self->{FH}) {
|
1057
|
|
|
|
|
|
|
print {$self->{FH}} "\n";
|
1058
|
|
|
|
|
|
|
delete $self->{FH};
|
1059
|
|
|
|
|
|
|
}
|
1060
|
|
|
|
|
|
|
|
1061
|
|
|
|
|
|
|
delete $self->{parameters};
|
1062
|
|
|
|
|
|
|
my $data; # return the accumulated data, without keeping a copy inside the object
|
1063
|
|
|
|
|
|
|
($data, $self->{data}) = ($self->{data}[0], undef);
|
1064
|
|
|
|
|
|
|
if (!defined(wantarray()) or ! keys(%$data)) {
|
1065
|
|
|
|
|
|
|
return;
|
1066
|
|
|
|
|
|
|
|
1067
|
|
|
|
|
|
|
} elsif (keys(%$data) == 1 and exists(${$data}{_content})) {
|
1068
|
|
|
|
|
|
|
if (ref(${$data}{_content}) eq 'ARRAY' and @{${$data}{_content}} == 1) {
|
1069
|
|
|
|
|
|
|
return ${${$data}{_content}}[0]
|
1070
|
|
|
|
|
|
|
} else {
|
1071
|
|
|
|
|
|
|
return ${$data}{_content}
|
1072
|
|
|
|
|
|
|
}
|
1073
|
|
|
|
|
|
|
|
1074
|
|
|
|
|
|
|
} else {
|
1075
|
|
|
|
|
|
|
return $data;
|
1076
|
|
|
|
|
|
|
}
|
1077
|
|
|
|
|
|
|
}
|
1078
|
|
|
|
|
|
|
|
1079
|
|
|
|
|
|
|
##
|
1080
|
|
|
|
|
|
|
|
1081
|
|
|
|
|
|
|
sub _XMLDecl {
|
1082
|
|
|
|
|
|
|
weaken( my $self = shift);
|
1083
|
|
|
|
|
|
|
return sub {
|
1084
|
|
|
|
|
|
|
my ( $Parser, $Version, $Encoding, $Standalone) = @_;
|
1085
|
|
|
|
|
|
|
$self->{opt}{original_encoding} = $Encoding
|
1086
|
|
|
|
|
|
|
}
|
1087
|
|
|
|
|
|
|
}
|
1088
|
|
|
|
|
|
|
|
1089
|
|
|
|
|
|
|
=begin comment
|
1090
|
|
|
|
|
|
|
|
1091
|
|
|
|
|
|
|
start tag
|
1092
|
|
|
|
|
|
|
& 3 = 3 -> rtrim parent's _content
|
1093
|
|
|
|
|
|
|
& 8 = 8 -> $ltrim = 1
|
1094
|
|
|
|
|
|
|
|
1095
|
|
|
|
|
|
|
string content
|
1096
|
|
|
|
|
|
|
$ltrim -> ltrim the string, if not completely whitespace set $ltrim 0
|
1097
|
|
|
|
|
|
|
|
1098
|
|
|
|
|
|
|
end tag
|
1099
|
|
|
|
|
|
|
& 8 = 8 -> rtrim own content
|
1100
|
|
|
|
|
|
|
& 3 = 3 -> $ltrim = 1
|
1101
|
|
|
|
|
|
|
empty_returned_content and & 3 in (1,2) -> rtrim parent content
|
1102
|
|
|
|
|
|
|
empty_returned_content and & 3 = 2 -> $ltrim
|
1103
|
|
|
|
|
|
|
|
1104
|
|
|
|
|
|
|
=end comment
|
1105
|
|
|
|
|
|
|
|
1106
|
|
|
|
|
|
|
=cut
|
1107
|
|
|
|
|
|
|
|
1108
|
|
|
|
|
|
|
sub _rtrim {
|
1109
|
|
|
|
|
|
|
my ($self, $attr, $more) = @_;
|
1110
|
|
|
|
|
|
|
|
1111
|
|
|
|
|
|
|
if ($more) {
|
1112
|
|
|
|
|
|
|
if (ref $attr->{_content}) {
|
1113
|
|
|
|
|
|
|
if (!ref($attr->{_content}[-1])) {
|
1114
|
|
|
|
|
|
|
$attr->{_content}[-1] =~ s/\s+$//s;
|
1115
|
|
|
|
|
|
|
pop @{$attr->{_content}} if $attr->{_content}[-1] eq '';
|
1116
|
|
|
|
|
|
|
delete $attr->{_content} unless @{$attr->{_content}};
|
1117
|
|
|
|
|
|
|
}
|
1118
|
|
|
|
|
|
|
} else {
|
1119
|
|
|
|
|
|
|
$attr->{_content} =~ s/\s+$//s;
|
1120
|
|
|
|
|
|
|
delete $attr->{_content} if $attr->{_content} eq '';
|
1121
|
|
|
|
|
|
|
}
|
1122
|
|
|
|
|
|
|
} else {
|
1123
|
|
|
|
|
|
|
if (ref $attr->{_content}) {
|
1124
|
|
|
|
|
|
|
if (!ref($attr->{_content}[-1]) and $attr->{_content}[-1] =~ /^\s*$/s) {
|
1125
|
|
|
|
|
|
|
pop @{$attr->{_content}} ;
|
1126
|
|
|
|
|
|
|
delete $attr->{_content} unless @{$attr->{_content}};
|
1127
|
|
|
|
|
|
|
}
|
1128
|
|
|
|
|
|
|
} else {
|
1129
|
|
|
|
|
|
|
delete $attr->{_content} if $attr->{_content} =~ /^\s*$/s;
|
1130
|
|
|
|
|
|
|
}
|
1131
|
|
|
|
|
|
|
}
|
1132
|
|
|
|
|
|
|
}
|
1133
|
|
|
|
|
|
|
|
1134
|
|
|
|
|
|
|
sub _findUnusedNs {
|
1135
|
|
|
|
|
|
|
my ($self, $old_ns) = @_;
|
1136
|
|
|
|
|
|
|
my $new_ns = $old_ns;
|
1137
|
|
|
|
|
|
|
my %used;
|
1138
|
|
|
|
|
|
|
@used{values %{$self->{namespaces}}, values %{$self->{xmlns_map}}}= ();
|
1139
|
|
|
|
|
|
|
no warnings 'numeric';
|
1140
|
|
|
|
|
|
|
while (exists $used{$new_ns}) {
|
1141
|
|
|
|
|
|
|
$new_ns =~ s/(\d*)$/$1+1/e;
|
1142
|
|
|
|
|
|
|
}
|
1143
|
|
|
|
|
|
|
return $new_ns;
|
1144
|
|
|
|
|
|
|
}
|
1145
|
|
|
|
|
|
|
|
1146
|
|
|
|
|
|
|
sub _Start {
|
1147
|
|
|
|
|
|
|
weaken( my $self = shift);
|
1148
|
|
|
|
|
|
|
my $encode = $self->{opt}{encode};
|
1149
|
|
|
|
|
|
|
my $output_encoding = $self->{opt}{output_encoding};
|
1150
|
|
|
|
|
|
|
return sub {
|
1151
|
|
|
|
|
|
|
my ( $Parser, $Element , %Attr) = @_;
|
1152
|
|
|
|
|
|
|
|
1153
|
|
|
|
|
|
|
if (($self->{opt}{stripspaces} & 3) == 3) {
|
1154
|
|
|
|
|
|
|
#rtrim parent
|
1155
|
|
|
|
|
|
|
#print "rtrim parent content in _Start\n";
|
1156
|
|
|
|
|
|
|
if ($self->{data}[-1] and $self->{data}[-1]{_content}) {
|
1157
|
|
|
|
|
|
|
$self->_rtrim( $self->{data}[-1], ($self->{opt}{stripspaces} & 4));
|
1158
|
|
|
|
|
|
|
}
|
1159
|
|
|
|
|
|
|
}
|
1160
|
|
|
|
|
|
|
if ($self->{opt}{stripspaces} & 8) {
|
1161
|
|
|
|
|
|
|
#print "ltrim own content in _Start\n";
|
1162
|
|
|
|
|
|
|
push @{$self->{_ltrim}}, 2;
|
1163
|
|
|
|
|
|
|
} else {
|
1164
|
|
|
|
|
|
|
push @{$self->{_ltrim}}, 0;
|
1165
|
|
|
|
|
|
|
}
|
1166
|
|
|
|
|
|
|
|
1167
|
|
|
|
|
|
|
if ($self->{namespaces}) {
|
1168
|
|
|
|
|
|
|
my %restore;
|
1169
|
|
|
|
|
|
|
foreach my $attr (keys %Attr) { # find the namespace aliases
|
1170
|
|
|
|
|
|
|
next unless $attr =~ /^xmlns:(.*)$/;
|
1171
|
|
|
|
|
|
|
my $orig_ns = $1;
|
1172
|
|
|
|
|
|
|
$restore{$orig_ns} = $self->{xmlns_map}{$orig_ns};
|
1173
|
|
|
|
|
|
|
if (! exists($self->{namespaces}{ $Attr{$attr} })) {
|
1174
|
|
|
|
|
|
|
if ($self->{namespaces}{'*'} eq 'die') {
|
1175
|
|
|
|
|
|
|
local $Carp::CarpLevel = 2;
|
1176
|
|
|
|
|
|
|
croak qq{Unexpected namespace "$Attr{$attr}" found in the XML!};
|
1177
|
|
|
|
|
|
|
} elsif ($self->{namespaces}{'*'} eq '') {
|
1178
|
|
|
|
|
|
|
delete $Attr{$attr};
|
1179
|
|
|
|
|
|
|
$self->{xmlns_map}{$orig_ns} = '';
|
1180
|
|
|
|
|
|
|
} elsif ($self->{namespaces}{'*'} eq 'strip') {
|
1181
|
|
|
|
|
|
|
delete $Attr{$attr};
|
1182
|
|
|
|
|
|
|
$self->{xmlns_map}{$orig_ns} = STRIP;
|
1183
|
|
|
|
|
|
|
} else {
|
1184
|
|
|
|
|
|
|
warn qq{Unexpected namespace "$Attr{$attr}" found in the XML!\n} if ($self->{namespaces}{'*'} eq 'warn');
|
1185
|
|
|
|
|
|
|
my $new_ns = $self->_findUnusedNs( $orig_ns);
|
1186
|
|
|
|
|
|
|
if ($orig_ns ne $new_ns) {
|
1187
|
|
|
|
|
|
|
$Attr{'xmlns:' . $new_ns} = delete $Attr{$attr};
|
1188
|
|
|
|
|
|
|
}
|
1189
|
|
|
|
|
|
|
$self->{xmlns_map}{$orig_ns} = $new_ns;
|
1190
|
|
|
|
|
|
|
}
|
1191
|
|
|
|
|
|
|
} else {
|
1192
|
|
|
|
|
|
|
$self->{xmlns_map}{$orig_ns} = $self->{namespaces}{ delete($Attr{$attr}) };
|
1193
|
|
|
|
|
|
|
}
|
1194
|
|
|
|
|
|
|
}
|
1195
|
|
|
|
|
|
|
if (exists $Attr{xmlns}) { # find the default namespace
|
1196
|
|
|
|
|
|
|
#print "Found a xmlns attribute in $Element!\n";
|
1197
|
|
|
|
|
|
|
$restore{''} = $self->{xmlns_map}{''};
|
1198
|
|
|
|
|
|
|
if (!exists($self->{namespaces}{ $Attr{xmlns} })) { # unknown namespace
|
1199
|
|
|
|
|
|
|
if ($self->{namespaces}{'*'} eq 'die') {
|
1200
|
|
|
|
|
|
|
local $Carp::CarpLevel = 2;
|
1201
|
|
|
|
|
|
|
croak qq{Unexpected namespace "$Attr{xmlns}" found in the XML!};
|
1202
|
|
|
|
|
|
|
} elsif ($self->{namespaces}{'*'} eq '') {
|
1203
|
|
|
|
|
|
|
delete $Attr{xmlns};
|
1204
|
|
|
|
|
|
|
} elsif ($self->{namespaces}{'*'} eq 'strip') {
|
1205
|
|
|
|
|
|
|
delete $Attr{xmlns};
|
1206
|
|
|
|
|
|
|
$self->{xmlns_map}{''} = STRIP;
|
1207
|
|
|
|
|
|
|
} else { # warn or keep
|
1208
|
|
|
|
|
|
|
warn qq{Unexpected namespace "$Attr{xmlns}" found in the XML!\n} if ($self->{namespaces}{'*'} eq 'warn');
|
1209
|
|
|
|
|
|
|
my $new_ns = $self->_findUnusedNs( 'ns1');
|
1210
|
|
|
|
|
|
|
$Attr{'xmlns:'.$new_ns} = delete $Attr{xmlns};
|
1211
|
|
|
|
|
|
|
$self->{xmlns_map}{''} = $new_ns;
|
1212
|
|
|
|
|
|
|
}
|
1213
|
|
|
|
|
|
|
} else {
|
1214
|
|
|
|
|
|
|
$self->{xmlns_map}{''} = $self->{namespaces}{ delete($Attr{xmlns}) };
|
1215
|
|
|
|
|
|
|
}
|
1216
|
|
|
|
|
|
|
}
|
1217
|
|
|
|
|
|
|
if (%restore) {
|
1218
|
|
|
|
|
|
|
push @{$self->{xmlns_restore}}, \%restore;
|
1219
|
|
|
|
|
|
|
} else {
|
1220
|
|
|
|
|
|
|
push @{$self->{xmlns_restore}}, undef;
|
1221
|
|
|
|
|
|
|
}
|
1222
|
|
|
|
|
|
|
|
1223
|
|
|
|
|
|
|
if (%{$self->{xmlns_map}}) {
|
1224
|
|
|
|
|
|
|
#print "About to map aliases for $Element\n";
|
1225
|
|
|
|
|
|
|
# add or map the alias for the tag
|
1226
|
|
|
|
|
|
|
if ($Element =~ /^([^:]+):(.*)$/) {
|
1227
|
|
|
|
|
|
|
#print "Mapping an alias $1 for tag $Element\n";
|
1228
|
|
|
|
|
|
|
if (exists($self->{xmlns_map}{$1})) {
|
1229
|
|
|
|
|
|
|
if ($self->{xmlns_map}{$1} eq '') {
|
1230
|
|
|
|
|
|
|
$Element = $2 ;
|
1231
|
|
|
|
|
|
|
} else {
|
1232
|
|
|
|
|
|
|
$Element = $self->{xmlns_map}{$1} . ':' . $2 ;
|
1233
|
|
|
|
|
|
|
}
|
1234
|
|
|
|
|
|
|
}
|
1235
|
|
|
|
|
|
|
#print " -> $Element\n";
|
1236
|
|
|
|
|
|
|
} elsif (defined($self->{xmlns_map}{''}) and $self->{xmlns_map}{''} ne '') { # no namespace alias in the tag and there's a default
|
1237
|
|
|
|
|
|
|
#print "Adding default alias $self->{xmlns_map}{''}:\n";
|
1238
|
|
|
|
|
|
|
$Element = $self->{xmlns_map}{''} . ':' . $Element;
|
1239
|
|
|
|
|
|
|
#print " -> $Element\n";
|
1240
|
|
|
|
|
|
|
}
|
1241
|
|
|
|
|
|
|
if (substr( $Element, 0, length(STRIP)+1) eq STRIP.':') {%Attr = ()}
|
1242
|
|
|
|
|
|
|
|
1243
|
|
|
|
|
|
|
# map the aliases for the attributes
|
1244
|
|
|
|
|
|
|
foreach my $attr (keys %Attr) {
|
1245
|
|
|
|
|
|
|
next unless $attr =~ /^([^:]+):(.*)$/; # there's an alias
|
1246
|
|
|
|
|
|
|
next unless exists($self->{xmlns_map}{$1}); # and there's a mapping
|
1247
|
|
|
|
|
|
|
if ($self->{xmlns_map}{$1} eq '') {
|
1248
|
|
|
|
|
|
|
$Attr{$2} = delete($Attr{$attr}); # rename the attribute
|
1249
|
|
|
|
|
|
|
} elsif ($self->{xmlns_map}{$1} eq STRIP) {
|
1250
|
|
|
|
|
|
|
delete($Attr{$attr}); # remove the attribute
|
1251
|
|
|
|
|
|
|
} else {
|
1252
|
|
|
|
|
|
|
$Attr{$self->{xmlns_map}{$1} . ':' . $2} = delete($Attr{$attr}); # rename the attribute
|
1253
|
|
|
|
|
|
|
}
|
1254
|
|
|
|
|
|
|
}
|
1255
|
|
|
|
|
|
|
}
|
1256
|
|
|
|
|
|
|
} # /of namespace handling
|
1257
|
|
|
|
|
|
|
|
1258
|
|
|
|
|
|
|
|
1259
|
|
|
|
|
|
|
my ( $start_rule, $end_rule) = map {
|
1260
|
|
|
|
|
|
|
if ($self->{$_}{$Element} and ref($self->{$_}{$Element}) ne 'ARRAY') {
|
1261
|
|
|
|
|
|
|
$self->{$_}{$Element}
|
1262
|
|
|
|
|
|
|
} else {
|
1263
|
|
|
|
|
|
|
$self->_find_rule( $_, $Element, $self->{context})
|
1264
|
|
|
|
|
|
|
}
|
1265
|
|
|
|
|
|
|
} ( 'start_rules', 'rules');
|
1266
|
|
|
|
|
|
|
|
1267
|
|
|
|
|
|
|
if ($start_rule ne 'handle'
|
1268
|
|
|
|
|
|
|
and (
|
1269
|
|
|
|
|
|
|
!$start_rule
|
1270
|
|
|
|
|
|
|
or $start_rule eq 'skip'
|
1271
|
|
|
|
|
|
|
or !$start_rule->($Element,\%Attr, $self->{context}, $self->{data}, $self)
|
1272
|
|
|
|
|
|
|
)
|
1273
|
|
|
|
|
|
|
) {
|
1274
|
|
|
|
|
|
|
# ignore the tag and the ones below
|
1275
|
|
|
|
|
|
|
$Parser->setHandlers(@{$self->{ignore_handlers}});
|
1276
|
|
|
|
|
|
|
$self->{ignore_level}=1;
|
1277
|
|
|
|
|
|
|
|
1278
|
|
|
|
|
|
|
} else {
|
1279
|
|
|
|
|
|
|
# process the tag and the ones below
|
1280
|
|
|
|
|
|
|
if ($encode) {
|
1281
|
|
|
|
|
|
|
foreach my $value (values %Attr) {
|
1282
|
|
|
|
|
|
|
$value = Encode::encode( $encode, $value);
|
1283
|
|
|
|
|
|
|
}
|
1284
|
|
|
|
|
|
|
}
|
1285
|
|
|
|
|
|
|
|
1286
|
|
|
|
|
|
|
push @{$self->{context}}, $Element;
|
1287
|
|
|
|
|
|
|
push @{$self->{data}}, \%Attr;
|
1288
|
|
|
|
|
|
|
$self->{lastempty} = 0;
|
1289
|
|
|
|
|
|
|
|
1290
|
|
|
|
|
|
|
if ($self->{style} eq 'filter') {
|
1291
|
|
|
|
|
|
|
$self->{in_interesting}++ if ref($end_rule) or $end_rule =~ /^=/s; # is this tag interesting?
|
1292
|
|
|
|
|
|
|
|
1293
|
|
|
|
|
|
|
if (! $self->{in_interesting}) { # it neither this tag not an acestor is interesting, just copy the tag
|
1294
|
|
|
|
|
|
|
#print "Start:R ".$Parser->recognized_string()."\n";
|
1295
|
|
|
|
|
|
|
#print "Start:O ".$Parser->original_string()."\n";
|
1296
|
|
|
|
|
|
|
#print "Start:R ".$Parser->recognized_string()."\n";
|
1297
|
|
|
|
|
|
|
#print "Start:O ".$Parser->original_string()."\n";
|
1298
|
|
|
|
|
|
|
if (! $output_encoding) {
|
1299
|
|
|
|
|
|
|
print {$self->{FH}} $Parser->recognized_string();
|
1300
|
|
|
|
|
|
|
} elsif ($output_encoding eq $self->{opt}{original_encoding}) {
|
1301
|
|
|
|
|
|
|
print {$self->{FH}} $Parser->original_string();
|
1302
|
|
|
|
|
|
|
} else {
|
1303
|
|
|
|
|
|
|
print {$self->{FH}} $self->toXML($Element, \%Attr, "don't close");
|
1304
|
|
|
|
|
|
|
}
|
1305
|
|
|
|
|
|
|
}
|
1306
|
|
|
|
|
|
|
}
|
1307
|
|
|
|
|
|
|
|
1308
|
|
|
|
|
|
|
}
|
1309
|
|
|
|
|
|
|
}
|
1310
|
|
|
|
|
|
|
}
|
1311
|
|
|
|
|
|
|
|
1312
|
|
|
|
|
|
|
sub _find_rule {
|
1313
|
|
|
|
|
|
|
my ($self, $type, $Element, $path) = @_;
|
1314
|
|
|
|
|
|
|
|
1315
|
|
|
|
|
|
|
if (substr( $Element, 0, length(STRIP)+1) eq STRIP.':') {
|
1316
|
|
|
|
|
|
|
return ($type eq 'rules' ? STRIP_RULE : 'handle');
|
1317
|
|
|
|
|
|
|
}
|
1318
|
|
|
|
|
|
|
|
1319
|
|
|
|
|
|
|
if (exists($self->{$type.'_re'})) {
|
1320
|
|
|
|
|
|
|
for(my $i = 0; $i < @{$self->{$type.'_re'}}; $i++) {
|
1321
|
|
|
|
|
|
|
if ($Element =~ $self->{$type.'_re'}[$i]) {
|
1322
|
|
|
|
|
|
|
$self->{$type}{$Element} = $self->{$type.'_re_code'}[$i];
|
1323
|
|
|
|
|
|
|
last;
|
1324
|
|
|
|
|
|
|
}
|
1325
|
|
|
|
|
|
|
}
|
1326
|
|
|
|
|
|
|
}
|
1327
|
|
|
|
|
|
|
if (! exists $self->{$type}{$Element}) {
|
1328
|
|
|
|
|
|
|
$self->{$type}{$Element} = $self->{$type}{_default};
|
1329
|
|
|
|
|
|
|
}
|
1330
|
|
|
|
|
|
|
|
1331
|
|
|
|
|
|
|
if (ref $self->{$type}{$Element} eq 'ARRAY') {
|
1332
|
|
|
|
|
|
|
$path = join( '/', @$path);
|
1333
|
|
|
|
|
|
|
for(my $i=0; $i < $#{$self->{$type}{$Element}}; $i+=2) {
|
1334
|
|
|
|
|
|
|
if ($path =~ $self->{$type}{$Element}[$i]) {
|
1335
|
|
|
|
|
|
|
return $self->{$type}{$Element}[$i+1];
|
1336
|
|
|
|
|
|
|
}
|
1337
|
|
|
|
|
|
|
}
|
1338
|
|
|
|
|
|
|
return $self->{$type}{$Element}[-1];
|
1339
|
|
|
|
|
|
|
} else {
|
1340
|
|
|
|
|
|
|
return $self->{$type}{$Element};
|
1341
|
|
|
|
|
|
|
}
|
1342
|
|
|
|
|
|
|
}
|
1343
|
|
|
|
|
|
|
|
1344
|
|
|
|
|
|
|
sub _CdataStart {
|
1345
|
|
|
|
|
|
|
my $self = shift;
|
1346
|
|
|
|
|
|
|
my $encode = $self->{opt}{encode};
|
1347
|
|
|
|
|
|
|
return $self->{style} eq 'filter'
|
1348
|
|
|
|
|
|
|
? sub {
|
1349
|
|
|
|
|
|
|
my ( $Parser, $String) = @_;
|
1350
|
|
|
|
|
|
|
|
1351
|
|
|
|
|
|
|
return if (substr( $self->{context}[-1], 0, length(STRIP)+1) eq STRIP.':');
|
1352
|
|
|
|
|
|
|
|
1353
|
|
|
|
|
|
|
if (! $self->{in_interesting}) {
|
1354
|
|
|
|
|
|
|
print {$self->{FH}} '
|
1355
|
|
|
|
|
|
|
}
|
1356
|
|
|
|
|
|
|
}
|
1357
|
|
|
|
|
|
|
: undef;
|
1358
|
|
|
|
|
|
|
}
|
1359
|
|
|
|
|
|
|
|
1360
|
|
|
|
|
|
|
sub _CdataEnd {
|
1361
|
|
|
|
|
|
|
my $self = shift;
|
1362
|
|
|
|
|
|
|
my $encode = $self->{opt}{encode};
|
1363
|
|
|
|
|
|
|
return $self->{style} eq 'filter'
|
1364
|
|
|
|
|
|
|
? sub {
|
1365
|
|
|
|
|
|
|
my ( $Parser, $String) = @_;
|
1366
|
|
|
|
|
|
|
|
1367
|
|
|
|
|
|
|
return if (substr( $self->{context}[-1], 0, length(STRIP)+1) eq STRIP.':');
|
1368
|
|
|
|
|
|
|
|
1369
|
|
|
|
|
|
|
if (! $self->{in_interesting}) {
|
1370
|
|
|
|
|
|
|
print {$self->{FH}} ']]>';
|
1371
|
|
|
|
|
|
|
}
|
1372
|
|
|
|
|
|
|
}
|
1373
|
|
|
|
|
|
|
: undef;
|
1374
|
|
|
|
|
|
|
}
|
1375
|
|
|
|
|
|
|
|
1376
|
|
|
|
|
|
|
sub _Char {
|
1377
|
|
|
|
|
|
|
weaken( my $self = shift);
|
1378
|
|
|
|
|
|
|
my $encode = $self->{opt}{encode};
|
1379
|
|
|
|
|
|
|
return sub {
|
1380
|
|
|
|
|
|
|
my ( $Parser, $String) = @_;
|
1381
|
|
|
|
|
|
|
|
1382
|
|
|
|
|
|
|
return if (substr( $self->{context}[-1], 0, length(STRIP)+1) eq STRIP.':');
|
1383
|
|
|
|
|
|
|
|
1384
|
|
|
|
|
|
|
if ($self->{style} eq 'filter' and ! $self->{in_interesting}) {
|
1385
|
|
|
|
|
|
|
if (! $self->{opt}{output_encoding}) {
|
1386
|
|
|
|
|
|
|
print {$self->{FH}} $Parser->recognized_string();
|
1387
|
|
|
|
|
|
|
} elsif ($self->{opt}{output_encoding} eq $self->{opt}{original_encoding}) {
|
1388
|
|
|
|
|
|
|
print {$self->{FH}} $Parser->original_string();
|
1389
|
|
|
|
|
|
|
} else {
|
1390
|
|
|
|
|
|
|
print {$self->{FH}} encode($self->{opt}{output_encoding}, $Parser->recognized_string());
|
1391
|
|
|
|
|
|
|
}
|
1392
|
|
|
|
|
|
|
return;
|
1393
|
|
|
|
|
|
|
}
|
1394
|
|
|
|
|
|
|
|
1395
|
|
|
|
|
|
|
if ($encode) {
|
1396
|
|
|
|
|
|
|
$String = Encode::encode( $encode, $String);
|
1397
|
|
|
|
|
|
|
}
|
1398
|
|
|
|
|
|
|
|
1399
|
|
|
|
|
|
|
if ($self->{_ltrim}[-1]) {
|
1400
|
|
|
|
|
|
|
#print "ltrim in $self->{context}[-1] ($String)\n";
|
1401
|
|
|
|
|
|
|
if ($self->{_ltrim}[-1] == 2) {
|
1402
|
|
|
|
|
|
|
$String =~ s/^\s+//s;
|
1403
|
|
|
|
|
|
|
return if $String eq '';
|
1404
|
|
|
|
|
|
|
} else {
|
1405
|
|
|
|
|
|
|
return if $String =~ /^\s*$/s;
|
1406
|
|
|
|
|
|
|
}
|
1407
|
|
|
|
|
|
|
$self->{_ltrim}[-1] = 0;
|
1408
|
|
|
|
|
|
|
#print " ($String)\n";
|
1409
|
|
|
|
|
|
|
}
|
1410
|
|
|
|
|
|
|
$String =~ s/\s+/ /gs if ($self->{opt}{normalisespaces});
|
1411
|
|
|
|
|
|
|
|
1412
|
|
|
|
|
|
|
if (!exists $self->{data}[-1]{_content}) {
|
1413
|
|
|
|
|
|
|
$self->{data}[-1]{_content} = $String;
|
1414
|
|
|
|
|
|
|
} elsif (!ref $self->{data}[-1]{_content}) {
|
1415
|
|
|
|
|
|
|
if ($self->{opt}{normalisespaces} and $self->{data}[-1]{_content} =~ /\s$/ and $String =~ /^\s/) {
|
1416
|
|
|
|
|
|
|
$String =~ s/^\s+//s;
|
1417
|
|
|
|
|
|
|
}
|
1418
|
|
|
|
|
|
|
$self->{data}[-1]{_content} .= $String;
|
1419
|
|
|
|
|
|
|
} else {
|
1420
|
|
|
|
|
|
|
if (ref $self->{data}[-1]{_content}[-1]) {
|
1421
|
|
|
|
|
|
|
push @{$self->{data}[-1]{_content}}, $String;
|
1422
|
|
|
|
|
|
|
} else {
|
1423
|
|
|
|
|
|
|
if ($self->{opt}{normalisespaces} and $self->{data}[-1]{_content}[-1] =~ /\s$/ and $String =~ /^\s/) {
|
1424
|
|
|
|
|
|
|
$String =~ s/^\s+//s;
|
1425
|
|
|
|
|
|
|
}
|
1426
|
|
|
|
|
|
|
$self->{data}[-1]{_content}[-1] .= $String;
|
1427
|
|
|
|
|
|
|
}
|
1428
|
|
|
|
|
|
|
}
|
1429
|
|
|
|
|
|
|
}
|
1430
|
|
|
|
|
|
|
}
|
1431
|
|
|
|
|
|
|
|
1432
|
|
|
|
|
|
|
sub _End {
|
1433
|
|
|
|
|
|
|
weaken( my $self = shift);
|
1434
|
|
|
|
|
|
|
return sub {
|
1435
|
|
|
|
|
|
|
my ( $Parser, $Element) = @_;
|
1436
|
|
|
|
|
|
|
$Element = pop @{$self->{context}}; # the element name may have been mangled by XMLNS aliasing
|
1437
|
|
|
|
|
|
|
|
1438
|
|
|
|
|
|
|
if ($self->{opt}{stripspaces} & 8) {
|
1439
|
|
|
|
|
|
|
#print "rtrim own content\n";
|
1440
|
|
|
|
|
|
|
if ($self->{data}[-1] and $self->{data}[-1]{_content}) {
|
1441
|
|
|
|
|
|
|
$self->_rtrim( $self->{data}[-1], 1);
|
1442
|
|
|
|
|
|
|
}
|
1443
|
|
|
|
|
|
|
}
|
1444
|
|
|
|
|
|
|
pop(@{$self->{_ltrim}});
|
1445
|
|
|
|
|
|
|
|
1446
|
|
|
|
|
|
|
if ($self->{namespaces}) {
|
1447
|
|
|
|
|
|
|
if (my $restore = pop @{$self->{xmlns_restore}}) { # restore the old default namespace and/or alias mapping
|
1448
|
|
|
|
|
|
|
while (my ($their, $our) = each %$restore) {
|
1449
|
|
|
|
|
|
|
if (defined($our)) {
|
1450
|
|
|
|
|
|
|
$self->{xmlns_map}{$their} = $our;
|
1451
|
|
|
|
|
|
|
} else {
|
1452
|
|
|
|
|
|
|
delete $self->{xmlns_map}{$their};
|
1453
|
|
|
|
|
|
|
}
|
1454
|
|
|
|
|
|
|
}
|
1455
|
|
|
|
|
|
|
}
|
1456
|
|
|
|
|
|
|
}
|
1457
|
|
|
|
|
|
|
|
1458
|
|
|
|
|
|
|
my ($rule) = map {
|
1459
|
|
|
|
|
|
|
if ($self->{$_}{$Element} and ref($self->{$_}{$Element}) ne 'ARRAY') {
|
1460
|
|
|
|
|
|
|
$self->{$_}{$Element}
|
1461
|
|
|
|
|
|
|
} else {
|
1462
|
|
|
|
|
|
|
$self->_find_rule( $_, $Element, $self->{context})
|
1463
|
|
|
|
|
|
|
}
|
1464
|
|
|
|
|
|
|
} ('rules');
|
1465
|
|
|
|
|
|
|
|
1466
|
|
|
|
|
|
|
my $data = pop @{$self->{data}};
|
1467
|
|
|
|
|
|
|
|
1468
|
|
|
|
|
|
|
my @results;
|
1469
|
|
|
|
|
|
|
if (ref $rule or $rule =~ /^=/s) {
|
1470
|
|
|
|
|
|
|
if ($rule =~ /^==(.*)$/s) { # change the whole tag to a string
|
1471
|
|
|
|
|
|
|
@results = ($1);
|
1472
|
|
|
|
|
|
|
} elsif ($rule =~ /^=(.*)$/s) { # change the contents to a string
|
1473
|
|
|
|
|
|
|
@results = ($Element => $1);
|
1474
|
|
|
|
|
|
|
} else {
|
1475
|
|
|
|
|
|
|
@results = $rule->($Element, $data, $self->{context}, $self->{data}, $self);
|
1476
|
|
|
|
|
|
|
}
|
1477
|
|
|
|
|
|
|
|
1478
|
|
|
|
|
|
|
if ($self->{style} eq 'filter') {
|
1479
|
|
|
|
|
|
|
|
1480
|
|
|
|
|
|
|
$self->{in_interesting}--;
|
1481
|
|
|
|
|
|
|
if (!$self->{in_interesting}) {
|
1482
|
|
|
|
|
|
|
if (@{$self->{data}}) {
|
1483
|
|
|
|
|
|
|
print {$self->{FH}} $self->escape_value($self->{data}[-1]{_content});
|
1484
|
|
|
|
|
|
|
delete $self->{data}[-1]{_content};
|
1485
|
|
|
|
|
|
|
}
|
1486
|
|
|
|
|
|
|
my $base;
|
1487
|
|
|
|
|
|
|
if ($self->{opt}{ident} ne '') {
|
1488
|
|
|
|
|
|
|
$base = $self->{opt}{ident} x scalar(@{$self->{context}});
|
1489
|
|
|
|
|
|
|
}
|
1490
|
|
|
|
|
|
|
@results and $results[0] =~ s/^[\@%\+\*\.]//;
|
1491
|
|
|
|
|
|
|
while (@results) {
|
1492
|
|
|
|
|
|
|
#use Data::Dumper;
|
1493
|
|
|
|
|
|
|
#print "\@results=".Dumper(\@results)."\n";
|
1494
|
|
|
|
|
|
|
if (ref($results[0])) {
|
1495
|
|
|
|
|
|
|
croak(ref($results[0]) . " not supported as the return value of a filter") unless ref($results[0]) eq 'ARRAY';
|
1496
|
|
|
|
|
|
|
if (@{$results[0]} ==2 and ref($results[0][1]) eq 'HASH') {
|
1497
|
|
|
|
|
|
|
print {$self->{FH}} $self->toXML(@{$results[0]}[0,1], 0, $self->{opt}{ident}, $base);
|
1498
|
|
|
|
|
|
|
} else {
|
1499
|
|
|
|
|
|
|
foreach my $item (@{$results[0]}) {
|
1500
|
|
|
|
|
|
|
if (ref($item)) {
|
1501
|
|
|
|
|
|
|
croak(ref($item) . " not supported in the return value of a filter") unless ref($item) eq 'ARRAY';
|
1502
|
|
|
|
|
|
|
croak("Empty array not supported in the return value of a filter") unless @$item;
|
1503
|
|
|
|
|
|
|
if (@$item <= 2) {
|
1504
|
|
|
|
|
|
|
print {$self->{FH}} $self->toXML(@{$item}[0,1], 0, $self->{opt}{ident}, $base);
|
1505
|
|
|
|
|
|
|
} else { # we suppose the 3rd and following elements are parameters to ->toXML()
|
1506
|
|
|
|
|
|
|
print {$self->{FH}} $self->toXML(@$item);
|
1507
|
|
|
|
|
|
|
}
|
1508
|
|
|
|
|
|
|
} else {
|
1509
|
|
|
|
|
|
|
print {$self->{FH}} $self->escape_value($item);
|
1510
|
|
|
|
|
|
|
}
|
1511
|
|
|
|
|
|
|
}
|
1512
|
|
|
|
|
|
|
}
|
1513
|
|
|
|
|
|
|
shift(@results);
|
1514
|
|
|
|
|
|
|
} else {
|
1515
|
|
|
|
|
|
|
if (@results == 1) {
|
1516
|
|
|
|
|
|
|
print {$self->{FH}} $self->escape_value($results[0]);
|
1517
|
|
|
|
|
|
|
@results = ();last;
|
1518
|
|
|
|
|
|
|
} else {
|
1519
|
|
|
|
|
|
|
print {$self->{FH}} $self->toXML(shift(@results), shift(@results), 0, $self->{opt}{ident}, $base);
|
1520
|
|
|
|
|
|
|
}
|
1521
|
|
|
|
|
|
|
}
|
1522
|
|
|
|
|
|
|
}
|
1523
|
|
|
|
|
|
|
}
|
1524
|
|
|
|
|
|
|
}
|
1525
|
|
|
|
|
|
|
} elsif ($self->{style} eq 'filter' and ! $self->{in_interesting}) {
|
1526
|
|
|
|
|
|
|
#print "End: \$Element=$Element; \$Parser->recognized_string()=".$Parser->recognized_string()."; \$Parser->original_string()=".$Parser->original_string()."\n";
|
1527
|
|
|
|
|
|
|
die "Unexpected \$data->{_content}={$data->{_content}} in filter outside interesting nodes!\n" if $data->{_content} ne '';
|
1528
|
|
|
|
|
|
|
if (! $self->{opt}{output_encoding}) {
|
1529
|
|
|
|
|
|
|
print {$self->{FH}} $Parser->recognized_string();
|
1530
|
|
|
|
|
|
|
} elsif ($self->{opt}{output_encoding} eq $self->{opt}{original_encoding}) {
|
1531
|
|
|
|
|
|
|
print {$self->{FH}} $Parser->original_string();
|
1532
|
|
|
|
|
|
|
} else {
|
1533
|
|
|
|
|
|
|
print {$self->{FH}} encode($self->{opt}{output_encoding}, $Parser->recognized_string());
|
1534
|
|
|
|
|
|
|
}
|
1535
|
|
|
|
|
|
|
# print {$self->{FH}} $self->escape_value($data->{_content})."$Element>";
|
1536
|
|
|
|
|
|
|
|
1537
|
|
|
|
|
|
|
} else { # a predefined rule
|
1538
|
|
|
|
|
|
|
|
1539
|
|
|
|
|
|
|
if ($rule =~ s/(?:^| )no\s+xmlns$//) {
|
1540
|
|
|
|
|
|
|
$Element =~ s/^\w+://;
|
1541
|
|
|
|
|
|
|
$rule = 'as is' if $rule eq '';
|
1542
|
|
|
|
|
|
|
}
|
1543
|
|
|
|
|
|
|
if ($rule =~ s/^((?:(?:no\s+)?content\s+)?by\s+(\S+))\s+remove\(([^\)]+)\)$/$1/) {
|
1544
|
|
|
|
|
|
|
my $keep = $2;
|
1545
|
|
|
|
|
|
|
my @remove = split /\s*,\s*/, $3;
|
1546
|
|
|
|
|
|
|
foreach (@remove) {
|
1547
|
|
|
|
|
|
|
next if $_ eq $keep;
|
1548
|
|
|
|
|
|
|
delete $data->{$_};
|
1549
|
|
|
|
|
|
|
}
|
1550
|
|
|
|
|
|
|
$rule = 'as is' if $rule eq '';
|
1551
|
|
|
|
|
|
|
} elsif ($rule =~ s/\s*\bremove\(([^\)]+)\)//) {
|
1552
|
|
|
|
|
|
|
my @remove = split /\s*,\s*/, $1;
|
1553
|
|
|
|
|
|
|
foreach (@remove) {
|
1554
|
|
|
|
|
|
|
delete $data->{$_};
|
1555
|
|
|
|
|
|
|
}
|
1556
|
|
|
|
|
|
|
$rule = 'as is' if $rule eq '';
|
1557
|
|
|
|
|
|
|
}
|
1558
|
|
|
|
|
|
|
if ($rule =~ s/^((?:(?:no\s+)?content\s+)?by\s+(\S+))\s+only\(([^\)]+)\)$/$1/) {
|
1559
|
|
|
|
|
|
|
my %only;
|
1560
|
|
|
|
|
|
|
$only{$2} = undef;
|
1561
|
|
|
|
|
|
|
@only{split /\s*,\s*/, $3} = ();
|
1562
|
|
|
|
|
|
|
foreach (keys %$data) {
|
1563
|
|
|
|
|
|
|
delete $data->{$_} unless exists $only{$_};
|
1564
|
|
|
|
|
|
|
}
|
1565
|
|
|
|
|
|
|
$rule = 'as is' if $rule eq '';
|
1566
|
|
|
|
|
|
|
} elsif ($rule =~ s/\s*\bonly\(([^\)]+)\)//) {
|
1567
|
|
|
|
|
|
|
my %only;
|
1568
|
|
|
|
|
|
|
@only{split /\s*,\s*/, $1} = ();
|
1569
|
|
|
|
|
|
|
foreach (keys %$data) {
|
1570
|
|
|
|
|
|
|
delete $data->{$_} unless exists $only{$_};
|
1571
|
|
|
|
|
|
|
}
|
1572
|
|
|
|
|
|
|
$rule = 'as is' if $rule eq '';
|
1573
|
|
|
|
|
|
|
}
|
1574
|
|
|
|
|
|
|
|
1575
|
|
|
|
|
|
|
if ($rule eq '') {
|
1576
|
|
|
|
|
|
|
@results = ();
|
1577
|
|
|
|
|
|
|
} elsif ($rule eq 'content') {
|
1578
|
|
|
|
|
|
|
@results = ($Element => $data->{_content});
|
1579
|
|
|
|
|
|
|
} elsif ($rule eq 'content trim') {
|
1580
|
|
|
|
|
|
|
s/^\s+//,s/\s+$// for ($data->{_content});
|
1581
|
|
|
|
|
|
|
@results = ($Element => $data->{_content});
|
1582
|
|
|
|
|
|
|
} elsif ($rule eq 'content array') {
|
1583
|
|
|
|
|
|
|
@results = ('@'.$Element => $data->{_content});
|
1584
|
|
|
|
|
|
|
} elsif ($rule eq 'as is') {
|
1585
|
|
|
|
|
|
|
@results = ($Element => $data);
|
1586
|
|
|
|
|
|
|
} elsif ($rule eq 'as is trim') {
|
1587
|
|
|
|
|
|
|
s/^\s+//,s/\s+$// for ($data->{_content});
|
1588
|
|
|
|
|
|
|
@results = ($Element => $data);
|
1589
|
|
|
|
|
|
|
} elsif ($rule eq 'as array') {
|
1590
|
|
|
|
|
|
|
@results = ('@'.$Element => $data);
|
1591
|
|
|
|
|
|
|
} elsif ($rule eq 'as array trim') {
|
1592
|
|
|
|
|
|
|
s/^\s+//,s/\s+$// for ($data->{_content});
|
1593
|
|
|
|
|
|
|
@results = ('@'.$Element => $data);
|
1594
|
|
|
|
|
|
|
} elsif ($rule eq 'no content') {
|
1595
|
|
|
|
|
|
|
delete ${$data}{_content}; @results = ($Element => $data);
|
1596
|
|
|
|
|
|
|
} elsif ($rule eq 'no content array' or $rule eq 'as array no content') {
|
1597
|
|
|
|
|
|
|
delete ${$data}{_content}; @results = ('@' . $Element => $data);
|
1598
|
|
|
|
|
|
|
|
1599
|
|
|
|
|
|
|
} elsif ($rule eq 'pass') {
|
1600
|
|
|
|
|
|
|
@results = (%$data);
|
1601
|
|
|
|
|
|
|
} elsif ($rule eq 'pass trim') {
|
1602
|
|
|
|
|
|
|
s/^\s+//,s/\s+$// for ($data->{_content});
|
1603
|
|
|
|
|
|
|
@results = (%$data);
|
1604
|
|
|
|
|
|
|
} elsif ($rule eq 'pass no content' or $rule eq 'pass without content') {
|
1605
|
|
|
|
|
|
|
delete ${$data}{_content}; @results = (%$data);
|
1606
|
|
|
|
|
|
|
} elsif ($rule =~ /^pass\s+(\S+)$/) {
|
1607
|
|
|
|
|
|
|
my %allowed = map {$_ => 1} split( /\s*,\s*/, $1);
|
1608
|
|
|
|
|
|
|
@results = map { $_ => $data->{$_} } grep {$allowed{$_}} keys %allowed;
|
1609
|
|
|
|
|
|
|
|
1610
|
|
|
|
|
|
|
} elsif ($rule eq 'raw') {
|
1611
|
|
|
|
|
|
|
@results = [$Element => $data];
|
1612
|
|
|
|
|
|
|
|
1613
|
|
|
|
|
|
|
} elsif ($rule eq 'raw extended') {
|
1614
|
|
|
|
|
|
|
@results = (':'.$Element => $data, [$Element => $data]);
|
1615
|
|
|
|
|
|
|
|
1616
|
|
|
|
|
|
|
} elsif ($rule eq 'raw extended array') {
|
1617
|
|
|
|
|
|
|
@results = ('@:'.$Element => $data, [$Element => $data]);
|
1618
|
|
|
|
|
|
|
|
1619
|
|
|
|
|
|
|
} elsif ($rule =~ /^((?:no )?content )?by\s+(\S+)$/) {
|
1620
|
|
|
|
|
|
|
my ($cnt,$attr) = ($1,$2);
|
1621
|
|
|
|
|
|
|
if ($cnt eq 'no content ') {
|
1622
|
|
|
|
|
|
|
delete $data->{_content};
|
1623
|
|
|
|
|
|
|
}
|
1624
|
|
|
|
|
|
|
if ($attr =~ /,/) {
|
1625
|
|
|
|
|
|
|
my @attr = split /,/, $attr;
|
1626
|
|
|
|
|
|
|
foreach (@attr) {
|
1627
|
|
|
|
|
|
|
next unless exists ($data->{$_});
|
1628
|
|
|
|
|
|
|
if ($cnt eq 'content ') {
|
1629
|
|
|
|
|
|
|
@results = ($data->{$_} => $data->{_content})
|
1630
|
|
|
|
|
|
|
} else {
|
1631
|
|
|
|
|
|
|
@results = (delete $data->{$_} => $data)
|
1632
|
|
|
|
|
|
|
}
|
1633
|
|
|
|
|
|
|
last;
|
1634
|
|
|
|
|
|
|
}
|
1635
|
|
|
|
|
|
|
} else {
|
1636
|
|
|
|
|
|
|
if ($cnt eq 'content ') {
|
1637
|
|
|
|
|
|
|
@results = ($data->{$attr} => $data->{_content})
|
1638
|
|
|
|
|
|
|
} else {
|
1639
|
|
|
|
|
|
|
@results = (delete $data->{$attr} => $data);
|
1640
|
|
|
|
|
|
|
}
|
1641
|
|
|
|
|
|
|
}
|
1642
|
|
|
|
|
|
|
|
1643
|
|
|
|
|
|
|
} else {
|
1644
|
|
|
|
|
|
|
croak "Unknown predefined rule '$rule'!";
|
1645
|
|
|
|
|
|
|
}
|
1646
|
|
|
|
|
|
|
}
|
1647
|
|
|
|
|
|
|
|
1648
|
|
|
|
|
|
|
if (! @results or (@results % 2 == 0) or $results[-1] eq '') {
|
1649
|
|
|
|
|
|
|
if ($self->{opt}{stripspaces} & 3 and @{$self->{data}} and $self->{data}[-1]{_content}) { # stripping some spaces, it's not root and it did not return content
|
1650
|
|
|
|
|
|
|
#print "maybe stripping some spaces in $Element, it's not root and it did not return content\n";
|
1651
|
|
|
|
|
|
|
if (($self->{opt}{stripspaces} & 3) < 3 and $self->{data}[-1]{_content}) {
|
1652
|
|
|
|
|
|
|
# rtrim parent content
|
1653
|
|
|
|
|
|
|
#print " yes, rtrim parent '$self->{data}[-1]{_content}'\n";
|
1654
|
|
|
|
|
|
|
$self->_rtrim( $self->{data}[-1], ($self->{opt}{stripspaces} & 4));
|
1655
|
|
|
|
|
|
|
#print " result '$self->{data}[-1]{_content}'\n";
|
1656
|
|
|
|
|
|
|
}
|
1657
|
|
|
|
|
|
|
|
1658
|
|
|
|
|
|
|
$self->{_ltrim}[-1] = (($self->{opt}{stripspaces} & 4) ? 2 : 1)
|
1659
|
|
|
|
|
|
|
if ($self->{opt}{stripspaces} & 3) == 2;
|
1660
|
|
|
|
|
|
|
}
|
1661
|
|
|
|
|
|
|
} else {
|
1662
|
|
|
|
|
|
|
$self->{_ltrim}[-1] = 0;
|
1663
|
|
|
|
|
|
|
}
|
1664
|
|
|
|
|
|
|
if (($self->{opt}{stripspaces} & 3) == 3) {
|
1665
|
|
|
|
|
|
|
$self->{_ltrim}[-1] = (($self->{opt}{stripspaces} & 4) ? 2 : 1);
|
1666
|
|
|
|
|
|
|
}
|
1667
|
|
|
|
|
|
|
|
1668
|
|
|
|
|
|
|
|
1669
|
|
|
|
|
|
|
return unless scalar(@results) or scalar(@results) == 1 and ($results[0] eq '' or !defined($results[0]));
|
1670
|
|
|
|
|
|
|
|
1671
|
|
|
|
|
|
|
@{$self->{data}} = ({}) unless @{$self->{data}}; # oops we are already closing the root tag! We do need there to be at least one hashref in $self->{data}
|
1672
|
|
|
|
|
|
|
|
1673
|
|
|
|
|
|
|
if (scalar(@results) % 2) {
|
1674
|
|
|
|
|
|
|
# odd number of items, last is content
|
1675
|
|
|
|
|
|
|
my $value = pop(@results);
|
1676
|
|
|
|
|
|
|
_add_content( $self->{data}[-1], $value);
|
1677
|
|
|
|
|
|
|
}
|
1678
|
|
|
|
|
|
|
|
1679
|
|
|
|
|
|
|
while (@results) {
|
1680
|
|
|
|
|
|
|
my ($key, $value) = ( shift(@results), shift(@results));
|
1681
|
|
|
|
|
|
|
if ($key eq '_content') {
|
1682
|
|
|
|
|
|
|
_add_content( $self->{data}[-1], $value);
|
1683
|
|
|
|
|
|
|
} elsif ($key =~ s/^\@//) {
|
1684
|
|
|
|
|
|
|
if (exists($self->{data}[-1]{$key}) and ref($self->{data}[-1]{$key}) ne 'ARRAY') {
|
1685
|
|
|
|
|
|
|
$self->{data}[-1]{$key} = [$self->{data}[-1]{$key}, $value];
|
1686
|
|
|
|
|
|
|
} else {
|
1687
|
|
|
|
|
|
|
push @{$self->{data}[-1]{$key}}, $value;
|
1688
|
|
|
|
|
|
|
}
|
1689
|
|
|
|
|
|
|
} elsif ($key =~ s/^\+//) {
|
1690
|
|
|
|
|
|
|
if (exists($self->{data}[-1]{$key})) {
|
1691
|
|
|
|
|
|
|
$self->{data}[-1]{$key} += $value;
|
1692
|
|
|
|
|
|
|
} else {
|
1693
|
|
|
|
|
|
|
$self->{data}[-1]{$key} = $value;
|
1694
|
|
|
|
|
|
|
}
|
1695
|
|
|
|
|
|
|
} elsif ($key =~ s/^\*//) {
|
1696
|
|
|
|
|
|
|
if (exists($self->{data}[-1]{$key})) {
|
1697
|
|
|
|
|
|
|
$self->{data}[-1]{$key} *= $value;
|
1698
|
|
|
|
|
|
|
} else {
|
1699
|
|
|
|
|
|
|
$self->{data}[-1]{$key} = $value;
|
1700
|
|
|
|
|
|
|
}
|
1701
|
|
|
|
|
|
|
} elsif ($key =~ s/^\.//) {
|
1702
|
|
|
|
|
|
|
if (exists($self->{data}[-1]{$key})) {
|
1703
|
|
|
|
|
|
|
$self->{data}[-1]{$key} .= $value;
|
1704
|
|
|
|
|
|
|
} else {
|
1705
|
|
|
|
|
|
|
$self->{data}[-1]{$key} = $value;
|
1706
|
|
|
|
|
|
|
}
|
1707
|
|
|
|
|
|
|
# } elsif ($key =~ s/^\%//) {
|
1708
|
|
|
|
|
|
|
# if (exists($self->{data}[-1]{$key})) {
|
1709
|
|
|
|
|
|
|
# if (ref($value) eq 'HASH') {
|
1710
|
|
|
|
|
|
|
# %{$self->{data}[-1]{$key}} = (%{$self->{data}[-1]{$key}}, %$value);
|
1711
|
|
|
|
|
|
|
# } elsif (ref($value) eq 'ARRAY') {
|
1712
|
|
|
|
|
|
|
# %{$self->{data}[-1]{$key}} = (%{$self->{data}[-1]{$key}}, @$value);
|
1713
|
|
|
|
|
|
|
# } else {
|
1714
|
|
|
|
|
|
|
# croak "The value of the rule return \%$key must be a hash or array ref!";
|
1715
|
|
|
|
|
|
|
# }
|
1716
|
|
|
|
|
|
|
} elsif ($key =~ s/^\%//) {
|
1717
|
|
|
|
|
|
|
if (exists($self->{data}[-1]{$key})) {
|
1718
|
|
|
|
|
|
|
if (ref($value) eq 'HASH') {
|
1719
|
|
|
|
|
|
|
if ($self->{opt}{warnoverwrite}) {
|
1720
|
|
|
|
|
|
|
foreach my $subkey (%$value) {
|
1721
|
|
|
|
|
|
|
warn "The key '$subkey' already exists in attribute $key for tag $self->{context}[-1].\n old value: $self->{data}[-1]{$key}{$subkey}\n new value: $value->{$subkey}\n"
|
1722
|
|
|
|
|
|
|
if (exists $self->{data}[-1]{$key}{$subkey} and $self->{data}[-1]{$key}{$subkey} ne $value->{$subkey});
|
1723
|
|
|
|
|
|
|
$self->{data}[-1]{$key}{$subkey} = $value->{$subkey};
|
1724
|
|
|
|
|
|
|
}
|
1725
|
|
|
|
|
|
|
} else {
|
1726
|
|
|
|
|
|
|
%{$self->{data}[-1]{$key}} = (%{$self->{data}[-1]{$key}}, %$value);
|
1727
|
|
|
|
|
|
|
}
|
1728
|
|
|
|
|
|
|
} elsif (ref($value) eq 'ARRAY') {
|
1729
|
|
|
|
|
|
|
if ($self->{opt}{warnoverwrite}) {
|
1730
|
|
|
|
|
|
|
$value = {@$value}; # convert to hash
|
1731
|
|
|
|
|
|
|
foreach my $subkey (%$value) {
|
1732
|
|
|
|
|
|
|
warn "The key '$subkey' already exists in attribute $key for tag $self->{context}[-1].\n old value: $self->{data}[-1]{$key}{$subkey}\n new value: $value->{$subkey}\n"
|
1733
|
|
|
|
|
|
|
if (exists $self->{data}[-1]{$key}{$subkey} and $self->{data}[-1]{$key}{$subkey} ne $value->{$subkey});
|
1734
|
|
|
|
|
|
|
$self->{data}[-1]{$key}{$subkey} = $value->{$subkey};
|
1735
|
|
|
|
|
|
|
}
|
1736
|
|
|
|
|
|
|
} else {
|
1737
|
|
|
|
|
|
|
%{$self->{data}[-1]{$key}} = (%{$self->{data}[-1]{$key}}, @$value);
|
1738
|
|
|
|
|
|
|
}
|
1739
|
|
|
|
|
|
|
} else {
|
1740
|
|
|
|
|
|
|
croak "The value of the rule return \%$key must be a hash or array ref!";
|
1741
|
|
|
|
|
|
|
}
|
1742
|
|
|
|
|
|
|
} else {
|
1743
|
|
|
|
|
|
|
if (ref($value) eq 'HASH') {
|
1744
|
|
|
|
|
|
|
$self->{data}[-1]{$key} = $value;
|
1745
|
|
|
|
|
|
|
} elsif (ref($value) eq 'ARRAY') {
|
1746
|
|
|
|
|
|
|
$self->{data}[-1]{$key} = {@$value};
|
1747
|
|
|
|
|
|
|
} else {
|
1748
|
|
|
|
|
|
|
croak "The value of the rule return \%$key must be a hash or array ref!";
|
1749
|
|
|
|
|
|
|
}
|
1750
|
|
|
|
|
|
|
}
|
1751
|
|
|
|
|
|
|
} else {
|
1752
|
|
|
|
|
|
|
warn "The attribute '$key' already exists for tag $self->{context}[-1].\n old value: $self->{data}[-1]{$key}\n new value: $value\n"
|
1753
|
|
|
|
|
|
|
if ($self->{opt}{warnoverwrite} and exists $self->{data}[-1]{$key} and $self->{data}[-1]{$key} ne $value);
|
1754
|
|
|
|
|
|
|
$self->{data}[-1]{$key} = $value;
|
1755
|
|
|
|
|
|
|
}
|
1756
|
|
|
|
|
|
|
}
|
1757
|
|
|
|
|
|
|
}
|
1758
|
|
|
|
|
|
|
}
|
1759
|
|
|
|
|
|
|
|
1760
|
|
|
|
|
|
|
sub _StartIgnore {
|
1761
|
|
|
|
|
|
|
weaken( my $self = shift);
|
1762
|
|
|
|
|
|
|
return sub {
|
1763
|
|
|
|
|
|
|
$self->{ignore_level}++
|
1764
|
|
|
|
|
|
|
}
|
1765
|
|
|
|
|
|
|
}
|
1766
|
|
|
|
|
|
|
|
1767
|
|
|
|
|
|
|
sub _EndIgnore {
|
1768
|
|
|
|
|
|
|
weaken( my $self = shift);
|
1769
|
|
|
|
|
|
|
return sub {
|
1770
|
|
|
|
|
|
|
return if --$self->{ignore_level};
|
1771
|
|
|
|
|
|
|
|
1772
|
|
|
|
|
|
|
$self->{parser}->setHandlers(@{$self->{normal_handlers}})
|
1773
|
|
|
|
|
|
|
}
|
1774
|
|
|
|
|
|
|
}
|
1775
|
|
|
|
|
|
|
|
1776
|
|
|
|
|
|
|
sub _add_content {
|
1777
|
|
|
|
|
|
|
my ($hash, $value) = @_;
|
1778
|
|
|
|
|
|
|
if (ref($value)) {
|
1779
|
|
|
|
|
|
|
if (ref($hash->{_content})) {
|
1780
|
|
|
|
|
|
|
# both are refs, push to @_content
|
1781
|
|
|
|
|
|
|
push @{$hash->{_content}}, $value;
|
1782
|
|
|
|
|
|
|
} elsif (exists($hash->{_content})) {
|
1783
|
|
|
|
|
|
|
# result is ref, _content is not -> convert to an arrayref containing old _content and result
|
1784
|
|
|
|
|
|
|
$hash->{_content} = [ $hash->{_content}, $value]
|
1785
|
|
|
|
|
|
|
} else {
|
1786
|
|
|
|
|
|
|
# result is ref, _content is not present
|
1787
|
|
|
|
|
|
|
$hash->{_content} = [ $value]
|
1788
|
|
|
|
|
|
|
}
|
1789
|
|
|
|
|
|
|
} else {
|
1790
|
|
|
|
|
|
|
if (ref($hash->{_content})) {
|
1791
|
|
|
|
|
|
|
# _content is an arrayref, value is a string
|
1792
|
|
|
|
|
|
|
if (ref $hash->{_content}[-1]) {
|
1793
|
|
|
|
|
|
|
# the last element is a ref -> push
|
1794
|
|
|
|
|
|
|
push @{$hash->{_content}}, $value;
|
1795
|
|
|
|
|
|
|
} else {
|
1796
|
|
|
|
|
|
|
# the last element is a string -> concatenate
|
1797
|
|
|
|
|
|
|
$hash->{_content}[-1] .= $value;
|
1798
|
|
|
|
|
|
|
}
|
1799
|
|
|
|
|
|
|
} else {
|
1800
|
|
|
|
|
|
|
# neither is ref, concatenate
|
1801
|
|
|
|
|
|
|
$hash->{_content} .= $value;
|
1802
|
|
|
|
|
|
|
}
|
1803
|
|
|
|
|
|
|
}
|
1804
|
|
|
|
|
|
|
}
|
1805
|
|
|
|
|
|
|
|
1806
|
|
|
|
|
|
|
=head1 INSTANCE METHODS
|
1807
|
|
|
|
|
|
|
|
1808
|
|
|
|
|
|
|
=head2 parse
|
1809
|
|
|
|
|
|
|
|
1810
|
|
|
|
|
|
|
$parser->parse( $string [, $parameters]);
|
1811
|
|
|
|
|
|
|
$parser->parse( $IOhandle [, $parameters]);
|
1812
|
|
|
|
|
|
|
|
1813
|
|
|
|
|
|
|
Parses the XML in the string or reads and parses the XML from the opened IO handle,
|
1814
|
|
|
|
|
|
|
executes the rules as it encounters the closing tags and returns the resulting structure.
|
1815
|
|
|
|
|
|
|
|
1816
|
|
|
|
|
|
|
The scalar or reference passed as the second parameter to the parse() method is assigned to
|
1817
|
|
|
|
|
|
|
$parser->{parameters} for the parsing of the file or string. Once the XML is parsed the key is
|
1818
|
|
|
|
|
|
|
deleted. This means that the $parser does not retain a reference to the $parameters after the parsing.
|
1819
|
|
|
|
|
|
|
|
1820
|
|
|
|
|
|
|
=head2 parsestring
|
1821
|
|
|
|
|
|
|
|
1822
|
|
|
|
|
|
|
$parser->parsestring( $string [, $parameters]);
|
1823
|
|
|
|
|
|
|
|
1824
|
|
|
|
|
|
|
Just an alias to ->parse().
|
1825
|
|
|
|
|
|
|
|
1826
|
|
|
|
|
|
|
=head2 parse_string
|
1827
|
|
|
|
|
|
|
|
1828
|
|
|
|
|
|
|
$parser->parse_string( $string [, $parameters]);
|
1829
|
|
|
|
|
|
|
|
1830
|
|
|
|
|
|
|
Just an alias to ->parse().
|
1831
|
|
|
|
|
|
|
|
1832
|
|
|
|
|
|
|
=head2 parsefile
|
1833
|
|
|
|
|
|
|
|
1834
|
|
|
|
|
|
|
$parser->parsefile( $filename [, $parameters]);
|
1835
|
|
|
|
|
|
|
|
1836
|
|
|
|
|
|
|
Opens the specified file and parses the XML and executes the rules as it encounters
|
1837
|
|
|
|
|
|
|
the closing tags and returns the resulting structure.
|
1838
|
|
|
|
|
|
|
|
1839
|
|
|
|
|
|
|
=head2 parse_file
|
1840
|
|
|
|
|
|
|
|
1841
|
|
|
|
|
|
|
$parser->parse_file( $filename [, $parameters]);
|
1842
|
|
|
|
|
|
|
|
1843
|
|
|
|
|
|
|
Just an alias to ->parsefile().
|
1844
|
|
|
|
|
|
|
|
1845
|
|
|
|
|
|
|
=head2 parse_chunk
|
1846
|
|
|
|
|
|
|
|
1847
|
|
|
|
|
|
|
while (my $chunk = read_chunk_of_data()) {
|
1848
|
|
|
|
|
|
|
$parser->parse_chunk($chunk);
|
1849
|
|
|
|
|
|
|
}
|
1850
|
|
|
|
|
|
|
my $data = $parser->last_chunk();
|
1851
|
|
|
|
|
|
|
|
1852
|
|
|
|
|
|
|
This method allows you to process the XML in chunks as you receive them. The chunks do not need to be in any
|
1853
|
|
|
|
|
|
|
way valid ... it's fine if the chunk ends in the middle of a tag or attribute.
|
1854
|
|
|
|
|
|
|
|
1855
|
|
|
|
|
|
|
If you need to set the $parser->{parameters}, pass it to the first call to parse_chunk() the same way you would to parse().
|
1856
|
|
|
|
|
|
|
The first chunk may be empty so if you need to set up the parameters, but read the chunks in a loop or in a callback, you can do this:
|
1857
|
|
|
|
|
|
|
|
1858
|
|
|
|
|
|
|
$parser->parse_chunk('', {foo => 15, bar => "Hello World!"});
|
1859
|
|
|
|
|
|
|
while (my $chunk = read_chunk_of_data()) {
|
1860
|
|
|
|
|
|
|
$parser->parse_chunk($chunk);
|
1861
|
|
|
|
|
|
|
}
|
1862
|
|
|
|
|
|
|
my $data = $parser->last_chunk();
|
1863
|
|
|
|
|
|
|
|
1864
|
|
|
|
|
|
|
or
|
1865
|
|
|
|
|
|
|
|
1866
|
|
|
|
|
|
|
$parser->parse_chunk('', {foo => 15, bar => "Hello World!"});
|
1867
|
|
|
|
|
|
|
$ua->get($url, ':content_cb' => sub { my($data, $response, $protocol) = @_; $parser->parse_chunk($data); return 1 });
|
1868
|
|
|
|
|
|
|
my $data = $parser->last_chunk();
|
1869
|
|
|
|
|
|
|
|
1870
|
|
|
|
|
|
|
The parse_chunk() returns 1 or dies, to get the accumulated data, you need to call last_chunk(). You will want to either agressively trim the data remembered
|
1871
|
|
|
|
|
|
|
or handle parts of the file using custom rules as the XML is being parsed.
|
1872
|
|
|
|
|
|
|
|
1873
|
|
|
|
|
|
|
=head2 filter
|
1874
|
|
|
|
|
|
|
|
1875
|
|
|
|
|
|
|
$parser->filter( $string);
|
1876
|
|
|
|
|
|
|
$parser->filter( $string, $LexicalOutputIOhandle [, $parameters]);
|
1877
|
|
|
|
|
|
|
$parser->filter( $LexicalInputIOhandle, $LexicalOutputIOhandle [, $parameters]);
|
1878
|
|
|
|
|
|
|
$parser->filter( $string, \*OutputIOhandle [, $parameters]);
|
1879
|
|
|
|
|
|
|
$parser->filter( $LexicalInputIOhandle, \*OutputIOhandle [, $parameters]);
|
1880
|
|
|
|
|
|
|
$parser->filter( $string, $OutputFilename [, $parameters]);
|
1881
|
|
|
|
|
|
|
$parser->filter( $LexicalInputIOhandle, $OutputFilename [, $parameters]);
|
1882
|
|
|
|
|
|
|
$parser->filter( $string, $StringReference [, $parameters]);
|
1883
|
|
|
|
|
|
|
$parser->filter( $LexicalInputIOhandle, $StringReference [, $parameters]);
|
1884
|
|
|
|
|
|
|
|
1885
|
|
|
|
|
|
|
Parses the XML in the string or reads and parses the XML from the opened IO handle,
|
1886
|
|
|
|
|
|
|
copies the tags that do not have a subroutine rule specified and do not occure under such a tag,
|
1887
|
|
|
|
|
|
|
executes the specified rules and prints the results to select()ed filehandle, $OutputFilename or
|
1888
|
|
|
|
|
|
|
$OutputIOhandle or stores them in the scalar referenced by $StringReference using the ->ToXML() method.
|
1889
|
|
|
|
|
|
|
|
1890
|
|
|
|
|
|
|
The scalar or reference passed as the third parameter to the filter() method is assigned to
|
1891
|
|
|
|
|
|
|
$parser->{parameters} for the parsing of the file or string. Once the XML is parsed the key is
|
1892
|
|
|
|
|
|
|
deleted. This means that the $parser does not retain a reference to the $parameters after the parsing.
|
1893
|
|
|
|
|
|
|
|
1894
|
|
|
|
|
|
|
=head2 filterstring
|
1895
|
|
|
|
|
|
|
|
1896
|
|
|
|
|
|
|
$parser->filterstring( ...);
|
1897
|
|
|
|
|
|
|
|
1898
|
|
|
|
|
|
|
Just an alias to ->filter().
|
1899
|
|
|
|
|
|
|
|
1900
|
|
|
|
|
|
|
=head2 filter_string
|
1901
|
|
|
|
|
|
|
|
1902
|
|
|
|
|
|
|
$parser->filter_string( ...);
|
1903
|
|
|
|
|
|
|
|
1904
|
|
|
|
|
|
|
Just an alias to ->filter().
|
1905
|
|
|
|
|
|
|
|
1906
|
|
|
|
|
|
|
=head2 filterfile
|
1907
|
|
|
|
|
|
|
|
1908
|
|
|
|
|
|
|
$parser->filterfile( $filename);
|
1909
|
|
|
|
|
|
|
$parser->filterfile( $filename, $LexicalOutputIOhandle [, $parameters]);
|
1910
|
|
|
|
|
|
|
$parser->filterfile( $filename, \*OutputIOhandle [, $parameters]);
|
1911
|
|
|
|
|
|
|
$parser->filterfile( $filename, $OutputFilename [, $parameters]);
|
1912
|
|
|
|
|
|
|
|
1913
|
|
|
|
|
|
|
Parses the XML in the specified file, copies the tags that do not have a subroutine rule specified
|
1914
|
|
|
|
|
|
|
and do not occure under such a tag, executes the specified rules and prints the results to select()ed
|
1915
|
|
|
|
|
|
|
filehandle, $OutputFilename or $OutputIOhandle or stores them in the scalar
|
1916
|
|
|
|
|
|
|
referenced by $StringReference.
|
1917
|
|
|
|
|
|
|
|
1918
|
|
|
|
|
|
|
The scalar or reference passed as the third parameter to the filter() method is assigned to
|
1919
|
|
|
|
|
|
|
$parser->{parameters} for the parsing of the file or string. Once the XML is parsed the key is
|
1920
|
|
|
|
|
|
|
deleted. This means that the $parser does not retain a reference to the $parameters after the parsing.
|
1921
|
|
|
|
|
|
|
|
1922
|
|
|
|
|
|
|
=head2 filter_file
|
1923
|
|
|
|
|
|
|
|
1924
|
|
|
|
|
|
|
Just an alias to ->filterfile().
|
1925
|
|
|
|
|
|
|
|
1926
|
|
|
|
|
|
|
=head2 filter_chunk
|
1927
|
|
|
|
|
|
|
|
1928
|
|
|
|
|
|
|
while (my $chunk = read_chunk_of_data()) {
|
1929
|
|
|
|
|
|
|
$parser->filter_chunk($chunk);
|
1930
|
|
|
|
|
|
|
}
|
1931
|
|
|
|
|
|
|
$parser->last_chunk();
|
1932
|
|
|
|
|
|
|
|
1933
|
|
|
|
|
|
|
This method allows you to process the XML in chunks as you receive them. The chunks do not need to be in any
|
1934
|
|
|
|
|
|
|
way valid ... it's fine if the chunk ends in the middle of a tag or attribute.
|
1935
|
|
|
|
|
|
|
|
1936
|
|
|
|
|
|
|
If you need to set the file to store the result to (default is the select()ed filehandle) or set the $parser->{parameters}, pass it to the first call to filter_chunk() the same way you would to filter().
|
1937
|
|
|
|
|
|
|
The first chunk may be empty so if you need to set up the parameters, but read the chunks in a loop or in a callback, you can do this:
|
1938
|
|
|
|
|
|
|
|
1939
|
|
|
|
|
|
|
$parser->filter_chunk('', "the-filtered.xml", {foo => 15, bar => "Hello World!"});
|
1940
|
|
|
|
|
|
|
while (my $chunk = read_chunk_of_data()) {
|
1941
|
|
|
|
|
|
|
$parser->filter_chunk($chunk);
|
1942
|
|
|
|
|
|
|
}
|
1943
|
|
|
|
|
|
|
$parser->last_chunk();
|
1944
|
|
|
|
|
|
|
|
1945
|
|
|
|
|
|
|
or
|
1946
|
|
|
|
|
|
|
|
1947
|
|
|
|
|
|
|
$parser->filter_chunk('', "the_filtered.xml", {foo => 15, bar => "Hello World!"});
|
1948
|
|
|
|
|
|
|
$ua->get($url, ':content_cb' => sub { my($data, $response, $protocol) = @_; $parser->filter_chunk($data); return 1 });
|
1949
|
|
|
|
|
|
|
filter_chunk$parser->last_chunk();
|
1950
|
|
|
|
|
|
|
|
1951
|
|
|
|
|
|
|
The filter_chunk() returns 1 or dies, you need to call last_chunk() to sign the end of the data and close the filehandles and clean the parser status.
|
1952
|
|
|
|
|
|
|
Make sure you do not set a rule for the root tag or other tag containing way too much data. Keep in mind that even if the parser works as a filter,
|
1953
|
|
|
|
|
|
|
the data for a custom rule must be kept in memory for the rule to execute!
|
1954
|
|
|
|
|
|
|
|
1955
|
|
|
|
|
|
|
=head2 last_chunk
|
1956
|
|
|
|
|
|
|
|
1957
|
|
|
|
|
|
|
my $data = $parser->last_chunk();
|
1958
|
|
|
|
|
|
|
my $data = $parser->last_chunk($the_last_chunk_contents);
|
1959
|
|
|
|
|
|
|
|
1960
|
|
|
|
|
|
|
Finishes the processing of a XML fed to the parser in chunks. In case of the parser style, returns the accumulated data. In case of the filter style,
|
1961
|
|
|
|
|
|
|
flushes and closes the output file. You can pass the last piece of the XML to this method or call it without parameters if all the data was passed to parse_chunk()/filter_chunk().
|
1962
|
|
|
|
|
|
|
|
1963
|
|
|
|
|
|
|
You HAVE to execute this method after call(s) to parse_chunk() or filter_chunk()! Until you do, the parser will refuse to process full documents and
|
1964
|
|
|
|
|
|
|
expect another call to parse_chunk()/filter_chunk()!
|
1965
|
|
|
|
|
|
|
|
1966
|
|
|
|
|
|
|
=cut
|
1967
|
|
|
|
|
|
|
|
1968
|
|
|
|
|
|
|
sub escape_value {
|
1969
|
|
|
|
|
|
|
my($self, $data, $level) = @_;
|
1970
|
|
|
|
|
|
|
|
1971
|
|
|
|
|
|
|
if (exists $self->{custom_escape}) {
|
1972
|
|
|
|
|
|
|
if (ref $self->{custom_escape}) {
|
1973
|
|
|
|
|
|
|
return $self->{custom_escape}->($data,$level);
|
1974
|
|
|
|
|
|
|
} else {
|
1975
|
|
|
|
|
|
|
return $data;
|
1976
|
|
|
|
|
|
|
}
|
1977
|
|
|
|
|
|
|
}
|
1978
|
|
|
|
|
|
|
|
1979
|
|
|
|
|
|
|
return '' unless(defined($data) and $data ne '');
|
1980
|
|
|
|
|
|
|
|
1981
|
|
|
|
|
|
|
if ($self->{opt}{output_encoding} ne $self->{opt}{encode}) {
|
1982
|
|
|
|
|
|
|
$data = Encode::decode( $self->{opt}{encode}, $data) if $self->{opt}{encode};
|
1983
|
|
|
|
|
|
|
$data = Encode::encode( $self->{opt}{output_encoding}, $data) if $self->{opt}{output_encoding};
|
1984
|
|
|
|
|
|
|
}
|
1985
|
|
|
|
|
|
|
|
1986
|
|
|
|
|
|
|
$data =~ s/&/&/sg;
|
1987
|
|
|
|
|
|
|
$data =~ s/</sg;
|
1988
|
|
|
|
|
|
|
$data =~ s/>/>/sg;
|
1989
|
|
|
|
|
|
|
$data =~ s/"/"/sg;
|
1990
|
|
|
|
|
|
|
|
1991
|
|
|
|
|
|
|
$level = $self->{opt}->{numericescape} unless defined $level;
|
1992
|
|
|
|
|
|
|
return $data unless $level;
|
1993
|
|
|
|
|
|
|
|
1994
|
|
|
|
|
|
|
if($self->{opt}->{numericescape} eq '2') {
|
1995
|
|
|
|
|
|
|
$data =~ s/([^\x00-\x7F])/'' . ord($1) . ';'/gse;
|
1996
|
|
|
|
|
|
|
} else {
|
1997
|
|
|
|
|
|
|
$data =~ s/([^\x00-\xFF])/'' . ord($1) . ';'/gse;
|
1998
|
|
|
|
|
|
|
}
|
1999
|
|
|
|
|
|
|
return $data;
|
2000
|
|
|
|
|
|
|
}
|
2001
|
|
|
|
|
|
|
|
2002
|
|
|
|
|
|
|
=head2 escape_value
|
2003
|
|
|
|
|
|
|
|
2004
|
|
|
|
|
|
|
$parser->escape_value( $data [, $numericescape])
|
2005
|
|
|
|
|
|
|
|
2006
|
|
|
|
|
|
|
This method escapes the $data for inclusion in XML, the $numericescape may be 0, 1 or 2
|
2007
|
|
|
|
|
|
|
and controls whether to convert 'high' (non ASCII) characters to XML entities.
|
2008
|
|
|
|
|
|
|
|
2009
|
|
|
|
|
|
|
0 - default: no numeric escaping (OK if you're writing out UTF8)
|
2010
|
|
|
|
|
|
|
|
2011
|
|
|
|
|
|
|
1 - only characters above 0xFF are escaped (ie: characters in the 0x80-FF range are not escaped), possibly useful with ISO8859-1 output
|
2012
|
|
|
|
|
|
|
|
2013
|
|
|
|
|
|
|
2 - all characters above 0x7F are escaped (good for plain ASCII output)
|
2014
|
|
|
|
|
|
|
|
2015
|
|
|
|
|
|
|
You can also specify the default value in the constructor
|
2016
|
|
|
|
|
|
|
|
2017
|
|
|
|
|
|
|
my $parser = XML::Rules->new(
|
2018
|
|
|
|
|
|
|
...
|
2019
|
|
|
|
|
|
|
NumericEscape => 2,
|
2020
|
|
|
|
|
|
|
);
|
2021
|
|
|
|
|
|
|
|
2022
|
|
|
|
|
|
|
=cut
|
2023
|
|
|
|
|
|
|
|
2024
|
|
|
|
|
|
|
sub ToXML;*ToXML=\&toXML;
|
2025
|
|
|
|
|
|
|
sub toXML {
|
2026
|
|
|
|
|
|
|
my $self = shift;
|
2027
|
|
|
|
|
|
|
if (!ref($self) and $self eq 'XML::Rules') {
|
2028
|
|
|
|
|
|
|
$self = XML::Rules->new(rules=>{}, ident => ' ');
|
2029
|
|
|
|
|
|
|
}
|
2030
|
|
|
|
|
|
|
my ($tag, $attrs, $no_close, $ident, $base);
|
2031
|
|
|
|
|
|
|
if (ref $_[0]) {
|
2032
|
|
|
|
|
|
|
($tag, $no_close, $ident, $base) = @_;
|
2033
|
|
|
|
|
|
|
} else {
|
2034
|
|
|
|
|
|
|
($tag, $attrs, $no_close, $ident, $base) = @_;
|
2035
|
|
|
|
|
|
|
}
|
2036
|
|
|
|
|
|
|
$ident = $self->{opt}{ident} unless defined $ident;
|
2037
|
|
|
|
|
|
|
|
2038
|
|
|
|
|
|
|
if ($ident eq '') {
|
2039
|
|
|
|
|
|
|
$self->_toXMLnoformat(@_)
|
2040
|
|
|
|
|
|
|
} else {
|
2041
|
|
|
|
|
|
|
$base = '' unless defined $base;
|
2042
|
|
|
|
|
|
|
$base = "\n" . $base unless $base =~ /^\n/s;
|
2043
|
|
|
|
|
|
|
if (ref $tag) {
|
2044
|
|
|
|
|
|
|
$self->_toXMLformat($tag, $no_close, $ident, $base)
|
2045
|
|
|
|
|
|
|
} else {
|
2046
|
|
|
|
|
|
|
$self->_toXMLformat($tag, $attrs, $no_close, $ident, $base)
|
2047
|
|
|
|
|
|
|
}
|
2048
|
|
|
|
|
|
|
}
|
2049
|
|
|
|
|
|
|
}
|
2050
|
|
|
|
|
|
|
|
2051
|
|
|
|
|
|
|
sub _toXMLnoformat {
|
2052
|
|
|
|
|
|
|
my ($self, $tag, $attrs, @body, $no_close);
|
2053
|
|
|
|
|
|
|
if (ref $_[1]) {
|
2054
|
|
|
|
|
|
|
if (ref $_[1] eq 'ARRAY') {
|
2055
|
|
|
|
|
|
|
($self, $tag, $no_close) = @_;
|
2056
|
|
|
|
|
|
|
($tag, $attrs, @body) = @$tag;
|
2057
|
|
|
|
|
|
|
if (defined $attrs and ref $attrs ne 'HASH') {
|
2058
|
|
|
|
|
|
|
unshift( @body, $attrs);
|
2059
|
|
|
|
|
|
|
$attrs = undef;
|
2060
|
|
|
|
|
|
|
}
|
2061
|
|
|
|
|
|
|
} else {
|
2062
|
|
|
|
|
|
|
croak("The first parameter to ->ToXML() must be the tag name or the arrayref containing [tagname, {attributes}, content]")
|
2063
|
|
|
|
|
|
|
}
|
2064
|
|
|
|
|
|
|
} else {
|
2065
|
|
|
|
|
|
|
($self, $tag, $attrs, $no_close) = @_;
|
2066
|
|
|
|
|
|
|
if (ref $attrs ne 'HASH') {
|
2067
|
|
|
|
|
|
|
if (defined $attrs and ref $attrs eq 'ARRAY') {
|
2068
|
|
|
|
|
|
|
return '' unless @$attrs;
|
2069
|
|
|
|
|
|
|
($attrs,@body) = (undef,@$attrs);
|
2070
|
|
|
|
|
|
|
} else {
|
2071
|
|
|
|
|
|
|
($attrs,@body) = (undef,$attrs);
|
2072
|
|
|
|
|
|
|
}
|
2073
|
|
|
|
|
|
|
}
|
2074
|
|
|
|
|
|
|
}
|
2075
|
|
|
|
|
|
|
|
2076
|
|
|
|
|
|
|
push @body, $attrs->{_content} if $attrs and defined $attrs->{_content};
|
2077
|
|
|
|
|
|
|
$attrs = undef if (ref $attrs eq 'HASH' and (keys(%{$attrs}) == 0 or keys(%{$attrs}) == 1 and exists $attrs->{_content})); # ->toXML( $tagname, {}, ...)
|
2078
|
|
|
|
|
|
|
|
2079
|
|
|
|
|
|
|
#use Data::Dumper;
|
2080
|
|
|
|
|
|
|
#print Dumper( [$tag, $attrs, \@body]);
|
2081
|
|
|
|
|
|
|
#sleep(1);
|
2082
|
|
|
|
|
|
|
|
2083
|
|
|
|
|
|
|
if ($tag eq '') {
|
2084
|
|
|
|
|
|
|
# \%attrs is ignored
|
2085
|
|
|
|
|
|
|
if (@body) {
|
2086
|
|
|
|
|
|
|
return join( '', map {
|
2087
|
|
|
|
|
|
|
if (!ref($_)) {
|
2088
|
|
|
|
|
|
|
$self->escape_value($_)
|
2089
|
|
|
|
|
|
|
} elsif (ref($_) eq 'ARRAY') {
|
2090
|
|
|
|
|
|
|
$self->_toXMLnoformat($_, 0)
|
2091
|
|
|
|
|
|
|
} else {
|
2092
|
|
|
|
|
|
|
croak "The content in XML::Rules->ToXML( '', here) must be a string or an arrayref containing strings and arrayrefs!";
|
2093
|
|
|
|
|
|
|
}
|
2094
|
|
|
|
|
|
|
} @body);
|
2095
|
|
|
|
|
|
|
} else {
|
2096
|
|
|
|
|
|
|
return '';
|
2097
|
|
|
|
|
|
|
}
|
2098
|
|
|
|
|
|
|
}
|
2099
|
|
|
|
|
|
|
|
2100
|
|
|
|
|
|
|
if (@body > 1) {
|
2101
|
|
|
|
|
|
|
if (! $attrs) {
|
2102
|
|
|
|
|
|
|
my $result = '';
|
2103
|
|
|
|
|
|
|
while (@body) {
|
2104
|
|
|
|
|
|
|
my $content = shift(@body);
|
2105
|
|
|
|
|
|
|
if (ref $content eq 'HASH') {
|
2106
|
|
|
|
|
|
|
if (@body and ref($body[0]) ne 'HASH') {
|
2107
|
|
|
|
|
|
|
$result .= $self->_toXMLnoformat([$tag, $content, shift(@body)], 0)
|
2108
|
|
|
|
|
|
|
} else {
|
2109
|
|
|
|
|
|
|
$result .= $self->_toXMLnoformat([$tag, $content], 0)
|
2110
|
|
|
|
|
|
|
}
|
2111
|
|
|
|
|
|
|
} else {
|
2112
|
|
|
|
|
|
|
$result .= $self->_toXMLnoformat([$tag, undef, $content], 0)
|
2113
|
|
|
|
|
|
|
}
|
2114
|
|
|
|
|
|
|
}
|
2115
|
|
|
|
|
|
|
return $result;
|
2116
|
|
|
|
|
|
|
} else {
|
2117
|
|
|
|
|
|
|
my $result = '';
|
2118
|
|
|
|
|
|
|
while (@body) {
|
2119
|
|
|
|
|
|
|
my $content = shift(@body);
|
2120
|
|
|
|
|
|
|
if (ref $content eq 'HASH') {
|
2121
|
|
|
|
|
|
|
my %h = (%$attrs, %$content);
|
2122
|
|
|
|
|
|
|
if (@body and ref($body[0]) ne 'HASH') {
|
2123
|
|
|
|
|
|
|
$result .= $self->_toXMLnoformat([$tag, \%h, shift(@body)], 0)
|
2124
|
|
|
|
|
|
|
} else {
|
2125
|
|
|
|
|
|
|
$result .= $self->_toXMLnoformat([$tag, \%h], 0)
|
2126
|
|
|
|
|
|
|
}
|
2127
|
|
|
|
|
|
|
} else {
|
2128
|
|
|
|
|
|
|
$result .= $self->_toXMLnoformat([$tag, $attrs, $content])
|
2129
|
|
|
|
|
|
|
}
|
2130
|
|
|
|
|
|
|
}
|
2131
|
|
|
|
|
|
|
return $result;
|
2132
|
|
|
|
|
|
|
}
|
2133
|
|
|
|
|
|
|
}
|
2134
|
|
|
|
|
|
|
|
2135
|
|
|
|
|
|
|
if (! $attrs and !ref($body[0])) { # ->toXML( $tagname, $string_content, ...)
|
2136
|
|
|
|
|
|
|
if ($no_close) {
|
2137
|
|
|
|
|
|
|
return "<$tag>" . $self->escape_value($body[0]);
|
2138
|
|
|
|
|
|
|
} elsif (! defined $body[0]) {
|
2139
|
|
|
|
|
|
|
return "<$tag/>";
|
2140
|
|
|
|
|
|
|
} else {
|
2141
|
|
|
|
|
|
|
return "<$tag>" . $self->escape_value($body[0]) . "$tag>";
|
2142
|
|
|
|
|
|
|
}
|
2143
|
|
|
|
|
|
|
}
|
2144
|
|
|
|
|
|
|
|
2145
|
|
|
|
|
|
|
my $content = $body[0];
|
2146
|
|
|
|
|
|
|
my $result = "<$tag";
|
2147
|
|
|
|
|
|
|
my $subtags = '';
|
2148
|
|
|
|
|
|
|
foreach my $key (sort keys %$attrs) {
|
2149
|
|
|
|
|
|
|
next if $key =~ /^:/ or $key eq '_content';
|
2150
|
|
|
|
|
|
|
if (ref $attrs->{$key}) {
|
2151
|
|
|
|
|
|
|
if (ref $attrs->{$key} eq 'ARRAY') {
|
2152
|
|
|
|
|
|
|
if (@{$attrs->{$key}}) {
|
2153
|
|
|
|
|
|
|
foreach my $subtag (@{$attrs->{$key}}) {
|
2154
|
|
|
|
|
|
|
$subtags .= $self->_toXMLnoformat($key, $subtag, 0);
|
2155
|
|
|
|
|
|
|
}
|
2156
|
|
|
|
|
|
|
} else {
|
2157
|
|
|
|
|
|
|
$subtags .= "<$key/>";
|
2158
|
|
|
|
|
|
|
}
|
2159
|
|
|
|
|
|
|
} elsif (ref $attrs->{$key} eq 'HASH') {
|
2160
|
|
|
|
|
|
|
$subtags .= $self->_toXMLnoformat($key, $attrs->{$key}, 0)
|
2161
|
|
|
|
|
|
|
} else {
|
2162
|
|
|
|
|
|
|
croak(ref($attrs->{$key}) . " attributes not supported in XML::Rules->toXML()!");
|
2163
|
|
|
|
|
|
|
}
|
2164
|
|
|
|
|
|
|
} else {
|
2165
|
|
|
|
|
|
|
$result .= qq{ $key="} . $self->escape_value($attrs->{$key}) . qq{"};
|
2166
|
|
|
|
|
|
|
}
|
2167
|
|
|
|
|
|
|
}
|
2168
|
|
|
|
|
|
|
if (! defined $content and $subtags eq '') {
|
2169
|
|
|
|
|
|
|
if ($no_close) {
|
2170
|
|
|
|
|
|
|
return $result.">";
|
2171
|
|
|
|
|
|
|
} else {
|
2172
|
|
|
|
|
|
|
return $result."/>";
|
2173
|
|
|
|
|
|
|
}
|
2174
|
|
|
|
|
|
|
|
2175
|
|
|
|
|
|
|
} elsif (!ref($content)) { # content is a string, not an array of strings and subtags
|
2176
|
|
|
|
|
|
|
if ($no_close) {
|
2177
|
|
|
|
|
|
|
return "$result>$subtags" . $self->escape_value($content);
|
2178
|
|
|
|
|
|
|
} elsif ($content eq '' and $subtags ne '') {
|
2179
|
|
|
|
|
|
|
return "$result>$subtags$tag>";
|
2180
|
|
|
|
|
|
|
} else {
|
2181
|
|
|
|
|
|
|
return "$result>$subtags" . $self->escape_value($content) ."$tag>";
|
2182
|
|
|
|
|
|
|
}
|
2183
|
|
|
|
|
|
|
|
2184
|
|
|
|
|
|
|
} elsif (ref($content) eq 'ARRAY') {
|
2185
|
|
|
|
|
|
|
$result .= ">$subtags";
|
2186
|
|
|
|
|
|
|
foreach my $snippet (@$content) {
|
2187
|
|
|
|
|
|
|
if (!ref($snippet)) {
|
2188
|
|
|
|
|
|
|
$result .= $self->escape_value($snippet);
|
2189
|
|
|
|
|
|
|
} elsif (ref($snippet) eq 'ARRAY') {
|
2190
|
|
|
|
|
|
|
$result .= $self->_toXMLnoformat($snippet, 0);
|
2191
|
|
|
|
|
|
|
} else {
|
2192
|
|
|
|
|
|
|
croak(ref($snippet) . " not supported in _content in XML::Rules->toXML()!");
|
2193
|
|
|
|
|
|
|
}
|
2194
|
|
|
|
|
|
|
}
|
2195
|
|
|
|
|
|
|
if ($no_close) {
|
2196
|
|
|
|
|
|
|
return $result;
|
2197
|
|
|
|
|
|
|
} else {
|
2198
|
|
|
|
|
|
|
return $result."$tag>";
|
2199
|
|
|
|
|
|
|
}
|
2200
|
|
|
|
|
|
|
} else {
|
2201
|
|
|
|
|
|
|
croak(ref($content) . " _content not supported in XML::Rules->toXML()!");
|
2202
|
|
|
|
|
|
|
}
|
2203
|
|
|
|
|
|
|
}
|
2204
|
|
|
|
|
|
|
|
2205
|
|
|
|
|
|
|
sub _toXMLformat {
|
2206
|
|
|
|
|
|
|
my ($self, $tag, $attrs, @body, $no_close, $ident, $base);
|
2207
|
|
|
|
|
|
|
if (ref $_[1]) {
|
2208
|
|
|
|
|
|
|
if (ref $_[1] eq 'ARRAY') {
|
2209
|
|
|
|
|
|
|
($self, $tag, $no_close, $ident, $base) = @_;
|
2210
|
|
|
|
|
|
|
($tag, $attrs, @body) = @$tag;
|
2211
|
|
|
|
|
|
|
if (defined $attrs and ref $attrs ne 'HASH') {
|
2212
|
|
|
|
|
|
|
unshift( @body, $attrs);
|
2213
|
|
|
|
|
|
|
$attrs = undef;
|
2214
|
|
|
|
|
|
|
}
|
2215
|
|
|
|
|
|
|
} else {
|
2216
|
|
|
|
|
|
|
croak("The first parameter to ->ToXML() must be the tag name or the arrayref containing [tagname, {attributes}, content]")
|
2217
|
|
|
|
|
|
|
}
|
2218
|
|
|
|
|
|
|
} else {
|
2219
|
|
|
|
|
|
|
($self, $tag, $attrs, $no_close, $ident, $base) = @_;
|
2220
|
|
|
|
|
|
|
if (ref $attrs ne 'HASH') {
|
2221
|
|
|
|
|
|
|
if (defined $attrs and ref $attrs eq 'ARRAY') {
|
2222
|
|
|
|
|
|
|
return '' unless @$attrs;
|
2223
|
|
|
|
|
|
|
($attrs,@body) = (undef,@$attrs);
|
2224
|
|
|
|
|
|
|
} else {
|
2225
|
|
|
|
|
|
|
($attrs,@body) = (undef,$attrs);
|
2226
|
|
|
|
|
|
|
}
|
2227
|
|
|
|
|
|
|
}
|
2228
|
|
|
|
|
|
|
}
|
2229
|
|
|
|
|
|
|
|
2230
|
|
|
|
|
|
|
push @body, $attrs->{_content} if $attrs and defined $attrs->{_content};
|
2231
|
|
|
|
|
|
|
$attrs = undef if (ref $attrs eq 'HASH' and (keys(%{$attrs}) == 0 or keys(%{$attrs}) == 1 and exists $attrs->{_content})); # ->toXML( $tagname, {}, ...)
|
2232
|
|
|
|
|
|
|
|
2233
|
|
|
|
|
|
|
#use Data::Dumper;
|
2234
|
|
|
|
|
|
|
#print Dumper( [$tag, $attrs, \@body]);
|
2235
|
|
|
|
|
|
|
#sleep(1);
|
2236
|
|
|
|
|
|
|
|
2237
|
|
|
|
|
|
|
if ($tag eq '') {
|
2238
|
|
|
|
|
|
|
# \%attrs is ignored
|
2239
|
|
|
|
|
|
|
if (@body) {
|
2240
|
|
|
|
|
|
|
return join( '', map {
|
2241
|
|
|
|
|
|
|
if (!ref($_)) {
|
2242
|
|
|
|
|
|
|
$self->escape_value($_)
|
2243
|
|
|
|
|
|
|
} elsif (ref($_) eq 'ARRAY') {
|
2244
|
|
|
|
|
|
|
$self->_toXMLformat($_, 0, $ident, $base)
|
2245
|
|
|
|
|
|
|
} else {
|
2246
|
|
|
|
|
|
|
croak "The content in XML::Rules->ToXML( '', here) must be a string or an arrayref containing strings and arrayrefs!";
|
2247
|
|
|
|
|
|
|
}
|
2248
|
|
|
|
|
|
|
} @body);
|
2249
|
|
|
|
|
|
|
} else {
|
2250
|
|
|
|
|
|
|
return '';
|
2251
|
|
|
|
|
|
|
}
|
2252
|
|
|
|
|
|
|
}
|
2253
|
|
|
|
|
|
|
|
2254
|
|
|
|
|
|
|
if (@body > 1) {
|
2255
|
|
|
|
|
|
|
if (! $attrs) {
|
2256
|
|
|
|
|
|
|
my $result = '';
|
2257
|
|
|
|
|
|
|
while (@body) {
|
2258
|
|
|
|
|
|
|
$result .= $base if $result ne '';
|
2259
|
|
|
|
|
|
|
my $content = shift(@body);
|
2260
|
|
|
|
|
|
|
if (ref $content eq 'HASH') {
|
2261
|
|
|
|
|
|
|
if (@body and ref($body[0]) ne 'HASH') {
|
2262
|
|
|
|
|
|
|
$result .= $self->_toXMLformat([$tag, $content, shift(@body)], 0, $ident, $base)
|
2263
|
|
|
|
|
|
|
} else {
|
2264
|
|
|
|
|
|
|
$result .= $self->_toXMLformat([$tag, $content], 0, $ident, $base)
|
2265
|
|
|
|
|
|
|
}
|
2266
|
|
|
|
|
|
|
} else {
|
2267
|
|
|
|
|
|
|
$result .= $self->_toXMLformat([$tag, undef, $content], 0, $ident, $base)
|
2268
|
|
|
|
|
|
|
}
|
2269
|
|
|
|
|
|
|
}
|
2270
|
|
|
|
|
|
|
return $result;
|
2271
|
|
|
|
|
|
|
} else {
|
2272
|
|
|
|
|
|
|
my $result = '';
|
2273
|
|
|
|
|
|
|
while (@body) {
|
2274
|
|
|
|
|
|
|
$result .= $base if $result ne '';
|
2275
|
|
|
|
|
|
|
my $content = shift(@body);
|
2276
|
|
|
|
|
|
|
if (ref $content eq 'HASH') {
|
2277
|
|
|
|
|
|
|
my %h = (%$attrs, %$content);
|
2278
|
|
|
|
|
|
|
if (@body and ref($body[0]) ne 'HASH') {
|
2279
|
|
|
|
|
|
|
$result .= $self->_toXMLformat([$tag, \%h, shift(@body)], 0, $ident, $base)
|
2280
|
|
|
|
|
|
|
} else {
|
2281
|
|
|
|
|
|
|
$result .= $self->_toXMLformat([$tag, \%h], 0, $ident, $base)
|
2282
|
|
|
|
|
|
|
}
|
2283
|
|
|
|
|
|
|
} else {
|
2284
|
|
|
|
|
|
|
$result .= $self->_toXMLformat([$tag, $attrs, $content], 0, $ident, $base)
|
2285
|
|
|
|
|
|
|
}
|
2286
|
|
|
|
|
|
|
}
|
2287
|
|
|
|
|
|
|
return $result;
|
2288
|
|
|
|
|
|
|
}
|
2289
|
|
|
|
|
|
|
}
|
2290
|
|
|
|
|
|
|
|
2291
|
|
|
|
|
|
|
if (! $attrs and !ref($body[0])) { # ->toXML( $tagname, $string_content, ...)
|
2292
|
|
|
|
|
|
|
if ($no_close) {
|
2293
|
|
|
|
|
|
|
return "<$tag>" . $self->escape_value($body[0]);
|
2294
|
|
|
|
|
|
|
} elsif (! defined $body[0]) {
|
2295
|
|
|
|
|
|
|
return "<$tag/>";
|
2296
|
|
|
|
|
|
|
} else {
|
2297
|
|
|
|
|
|
|
return "<$tag>" . $self->escape_value($body[0]) . "$tag>";
|
2298
|
|
|
|
|
|
|
}
|
2299
|
|
|
|
|
|
|
}
|
2300
|
|
|
|
|
|
|
|
2301
|
|
|
|
|
|
|
my $content = $body[0];
|
2302
|
|
|
|
|
|
|
my $result = "<$tag";
|
2303
|
|
|
|
|
|
|
my $subtags = '';
|
2304
|
|
|
|
|
|
|
my $had_child = 0;
|
2305
|
|
|
|
|
|
|
foreach my $key (sort keys %$attrs) {
|
2306
|
|
|
|
|
|
|
next if $key =~ /^:/ or $key eq '_content';
|
2307
|
|
|
|
|
|
|
if (ref $attrs->{$key}) {
|
2308
|
|
|
|
|
|
|
if (ref $attrs->{$key} eq 'ARRAY') {
|
2309
|
|
|
|
|
|
|
if (@{$attrs->{$key}}) {
|
2310
|
|
|
|
|
|
|
foreach my $subtag (@{$attrs->{$key}}) {
|
2311
|
|
|
|
|
|
|
$subtags .= $base . $ident . $self->_toXMLformat($key, $subtag, 0, $ident, $base.$ident);
|
2312
|
|
|
|
|
|
|
$had_child = 1;
|
2313
|
|
|
|
|
|
|
}
|
2314
|
|
|
|
|
|
|
} else {
|
2315
|
|
|
|
|
|
|
$subtags .= $base . $ident . "<$key/>";
|
2316
|
|
|
|
|
|
|
}
|
2317
|
|
|
|
|
|
|
} elsif (ref $attrs->{$key} eq 'HASH') {
|
2318
|
|
|
|
|
|
|
$subtags .= $base . $ident . $self->_toXMLformat($key, $attrs->{$key}, 0, $ident, $base.$ident);
|
2319
|
|
|
|
|
|
|
$had_child = 1;
|
2320
|
|
|
|
|
|
|
} else {
|
2321
|
|
|
|
|
|
|
croak(ref($attrs->{$key}) . " attributes not supported in XML::Rules->toXML()!");
|
2322
|
|
|
|
|
|
|
}
|
2323
|
|
|
|
|
|
|
} else {
|
2324
|
|
|
|
|
|
|
$result .= qq{ $key="} . $self->escape_value($attrs->{$key}) . qq{"};
|
2325
|
|
|
|
|
|
|
}
|
2326
|
|
|
|
|
|
|
}
|
2327
|
|
|
|
|
|
|
if (! defined $content and $subtags eq '') {
|
2328
|
|
|
|
|
|
|
if ($no_close) {
|
2329
|
|
|
|
|
|
|
return $result.">";
|
2330
|
|
|
|
|
|
|
} else {
|
2331
|
|
|
|
|
|
|
return $result."/>";
|
2332
|
|
|
|
|
|
|
}
|
2333
|
|
|
|
|
|
|
|
2334
|
|
|
|
|
|
|
} elsif (!ref($content)) { # content is a string, not an array of strings and subtags
|
2335
|
|
|
|
|
|
|
if ($no_close) {
|
2336
|
|
|
|
|
|
|
return "$result>$subtags" . $self->escape_value($content);
|
2337
|
|
|
|
|
|
|
} elsif ($content eq '' and $subtags ne '') {
|
2338
|
|
|
|
|
|
|
return "$result>$subtags".($had_child ? $base : '')."$tag>";
|
2339
|
|
|
|
|
|
|
} else {
|
2340
|
|
|
|
|
|
|
return "$result>$subtags" . $self->escape_value($content) . ($had_child ? $base : '') ."$tag>";
|
2341
|
|
|
|
|
|
|
}
|
2342
|
|
|
|
|
|
|
|
2343
|
|
|
|
|
|
|
} elsif (ref($content) eq 'ARRAY') {
|
2344
|
|
|
|
|
|
|
$result .= ">$subtags";
|
2345
|
|
|
|
|
|
|
foreach my $snippet (@$content) {
|
2346
|
|
|
|
|
|
|
if (!ref($snippet)) {
|
2347
|
|
|
|
|
|
|
$result .= $self->escape_value($snippet);
|
2348
|
|
|
|
|
|
|
} elsif (ref($snippet) eq 'ARRAY') {
|
2349
|
|
|
|
|
|
|
$result .= $base.$ident . $self->_toXMLformat($snippet, 0, $ident, $base.$ident);
|
2350
|
|
|
|
|
|
|
$had_child = 1;
|
2351
|
|
|
|
|
|
|
} else {
|
2352
|
|
|
|
|
|
|
croak(ref($snippet) . " not supported in _content in XML::Rules->toXML()!");
|
2353
|
|
|
|
|
|
|
}
|
2354
|
|
|
|
|
|
|
}
|
2355
|
|
|
|
|
|
|
if ($no_close) {
|
2356
|
|
|
|
|
|
|
return $result;
|
2357
|
|
|
|
|
|
|
} else {
|
2358
|
|
|
|
|
|
|
if ($had_child) {
|
2359
|
|
|
|
|
|
|
return $result.$base."$tag>";
|
2360
|
|
|
|
|
|
|
} else {
|
2361
|
|
|
|
|
|
|
return $result."$tag>";
|
2362
|
|
|
|
|
|
|
}
|
2363
|
|
|
|
|
|
|
}
|
2364
|
|
|
|
|
|
|
} else {
|
2365
|
|
|
|
|
|
|
croak(ref($content) . " _content not supported in XML::Rules->toXML()!");
|
2366
|
|
|
|
|
|
|
}
|
2367
|
|
|
|
|
|
|
}
|
2368
|
|
|
|
|
|
|
|
2369
|
|
|
|
|
|
|
|
2370
|
|
|
|
|
|
|
sub parentsToXML {
|
2371
|
|
|
|
|
|
|
my ($self, $level) = @_;
|
2372
|
|
|
|
|
|
|
my $tag_names = $self->{context};
|
2373
|
|
|
|
|
|
|
my $tag_attrs = $self->{data};
|
2374
|
|
|
|
|
|
|
|
2375
|
|
|
|
|
|
|
$level = scalar(@$tag_names) unless $level;
|
2376
|
|
|
|
|
|
|
|
2377
|
|
|
|
|
|
|
my $result = '';
|
2378
|
|
|
|
|
|
|
for (my $i = -1; -$i <= $level; $i--) {
|
2379
|
|
|
|
|
|
|
$result = $self->toXML( ${$tag_names}[$i], ${$tag_attrs}[$i], 1) . $result;
|
2380
|
|
|
|
|
|
|
}
|
2381
|
|
|
|
|
|
|
return $result;
|
2382
|
|
|
|
|
|
|
}
|
2383
|
|
|
|
|
|
|
|
2384
|
|
|
|
|
|
|
sub closeParentsToXML {
|
2385
|
|
|
|
|
|
|
my ($self, $level) = @_;
|
2386
|
|
|
|
|
|
|
my $tag_names = $self->{context};
|
2387
|
|
|
|
|
|
|
|
2388
|
|
|
|
|
|
|
if ($level) {
|
2389
|
|
|
|
|
|
|
return '' . join( '>', (reverse(@{$tag_names}))[0..$level-1]) . '>';
|
2390
|
|
|
|
|
|
|
} else {
|
2391
|
|
|
|
|
|
|
return '' . join( '>', reverse(@$tag_names)) . '>';
|
2392
|
|
|
|
|
|
|
}
|
2393
|
|
|
|
|
|
|
}
|
2394
|
|
|
|
|
|
|
|
2395
|
|
|
|
|
|
|
=head2 toXML / ToXML
|
2396
|
|
|
|
|
|
|
|
2397
|
|
|
|
|
|
|
$xml = $parser->toXML( $tagname, \%attrs[, $do_not_close, $ident, $base])
|
2398
|
|
|
|
|
|
|
|
2399
|
|
|
|
|
|
|
You may use this method to convert the datastructures created by parsing the XML into the XML format.
|
2400
|
|
|
|
|
|
|
Not all data structures may be printed! I'll add more docs later, for now please do experiment.
|
2401
|
|
|
|
|
|
|
|
2402
|
|
|
|
|
|
|
The $ident and $base, if defined, turn on and control the pretty-printing. The $ident specifies the character(s)
|
2403
|
|
|
|
|
|
|
used for one level of identation, the base contains the identation of the current tag. That is if you want to include the data inside of
|
2404
|
|
|
|
|
|
|
|
2405
|
|
|
|
|
|
|
|
2406
|
|
|
|
|
|
|
|
2407
|
|
|
|
|
|
|
$here
|
2408
|
|
|
|
|
|
|
|
2409
|
|
|
|
|
|
|
|
2410
|
|
|
|
|
|
|
|
2411
|
|
|
|
|
|
|
you will call
|
2412
|
|
|
|
|
|
|
|
2413
|
|
|
|
|
|
|
$parser->toXML( $tagname, \%attrs, 0, "\t", "\t\t\t");
|
2414
|
|
|
|
|
|
|
|
2415
|
|
|
|
|
|
|
The method does NOT validate that the $ident and $base are whitespace only, but of course if it's not you end up with invalid
|
2416
|
|
|
|
|
|
|
XML. Newlines are added only before the start tag and (if the tag has only child tags and no content) before the closing tag,
|
2417
|
|
|
|
|
|
|
but not after the closing tag! Newlines are added even if the $ident is an empty string.
|
2418
|
|
|
|
|
|
|
|
2419
|
|
|
|
|
|
|
=head2 parentsToXML
|
2420
|
|
|
|
|
|
|
|
2421
|
|
|
|
|
|
|
$xml = $parser->parentsToXML( [$level])
|
2422
|
|
|
|
|
|
|
|
2423
|
|
|
|
|
|
|
Prints all or only the topmost $level ancestor tags, including the attributes and content (parsed so far),
|
2424
|
|
|
|
|
|
|
but without the closing tags. You may use this to print the header of the file you are parsing,
|
2425
|
|
|
|
|
|
|
followed by calling toXML() on a structure you build and then by closeParentsToXML() to close
|
2426
|
|
|
|
|
|
|
the tags left opened by parentsToXML(). You most likely want to use the style => 'filter' option
|
2427
|
|
|
|
|
|
|
for the constructor instead.
|
2428
|
|
|
|
|
|
|
|
2429
|
|
|
|
|
|
|
=head2 closeParentsToXML
|
2430
|
|
|
|
|
|
|
|
2431
|
|
|
|
|
|
|
$xml = $parser->closeParentsToXML( [$level])
|
2432
|
|
|
|
|
|
|
|
2433
|
|
|
|
|
|
|
Prints the closing tags for all or the topmost $level ancestor tags of the one currently processed.
|
2434
|
|
|
|
|
|
|
|
2435
|
|
|
|
|
|
|
=head2 paths2rules
|
2436
|
|
|
|
|
|
|
|
2437
|
|
|
|
|
|
|
my $parser = XML::Rules->new(
|
2438
|
|
|
|
|
|
|
rules => paths2rules {
|
2439
|
|
|
|
|
|
|
'/root/subtag/tag' => sub { ...},
|
2440
|
|
|
|
|
|
|
'/root/othertag/tag' => sub {...},
|
2441
|
|
|
|
|
|
|
'tag' => sub{ ... the default code for this tag ...},
|
2442
|
|
|
|
|
|
|
...
|
2443
|
|
|
|
|
|
|
}
|
2444
|
|
|
|
|
|
|
);
|
2445
|
|
|
|
|
|
|
|
2446
|
|
|
|
|
|
|
This helper function converts a hash of "somewhat xpath-like" paths and subs/rules into the format required by the module.
|
2447
|
|
|
|
|
|
|
Due to backwards compatibility and efficiency I can't directly support paths in the rules and the direct syntax for their
|
2448
|
|
|
|
|
|
|
specification is a bit awkward. So if you need the paths and not the regexps, you may use this helper instead of:
|
2449
|
|
|
|
|
|
|
|
2450
|
|
|
|
|
|
|
my $parser = XML::Rules->new(
|
2451
|
|
|
|
|
|
|
rules => {
|
2452
|
|
|
|
|
|
|
'tag' => [
|
2453
|
|
|
|
|
|
|
'/root/subtag' => sub { ...},
|
2454
|
|
|
|
|
|
|
'/root/othertag' => sub {...},
|
2455
|
|
|
|
|
|
|
sub{ ... the default code for this tag ...},
|
2456
|
|
|
|
|
|
|
],
|
2457
|
|
|
|
|
|
|
...
|
2458
|
|
|
|
|
|
|
}
|
2459
|
|
|
|
|
|
|
);
|
2460
|
|
|
|
|
|
|
|
2461
|
|
|
|
|
|
|
=cut
|
2462
|
|
|
|
|
|
|
|
2463
|
|
|
|
|
|
|
sub paths2rules {
|
2464
|
|
|
|
|
|
|
my ($paths) = @_;
|
2465
|
|
|
|
|
|
|
|
2466
|
|
|
|
|
|
|
my %rules;
|
2467
|
|
|
|
|
|
|
while ( my ($tag, $val) = each %$paths) {
|
2468
|
|
|
|
|
|
|
|
2469
|
|
|
|
|
|
|
if ($tag =~ m{^(.*)/(.*)$}) {
|
2470
|
|
|
|
|
|
|
my ($path, $tagname) = ($1, $2);
|
2471
|
|
|
|
|
|
|
|
2472
|
|
|
|
|
|
|
if (exists $rules{$tagname} and ref($rules{$tagname}) eq 'ARRAY') {
|
2473
|
|
|
|
|
|
|
if (@{$rules{$tagname}} % 2) {
|
2474
|
|
|
|
|
|
|
push @{$rules{$tagname}}, $path, $val;
|
2475
|
|
|
|
|
|
|
} else {
|
2476
|
|
|
|
|
|
|
splice @{$rules{$tagname}}, -1, 0, $path, $val;
|
2477
|
|
|
|
|
|
|
}
|
2478
|
|
|
|
|
|
|
} else {
|
2479
|
|
|
|
|
|
|
$rules{$tagname} = [ $path => $val]
|
2480
|
|
|
|
|
|
|
}
|
2481
|
|
|
|
|
|
|
|
2482
|
|
|
|
|
|
|
} elsif (exists $rules{$tag} and ref($rules{$tag}) eq 'ARRAY') {
|
2483
|
|
|
|
|
|
|
push @{$rules{$tag}}, $val;
|
2484
|
|
|
|
|
|
|
} else {
|
2485
|
|
|
|
|
|
|
$rules{$tag} = $val
|
2486
|
|
|
|
|
|
|
}
|
2487
|
|
|
|
|
|
|
}
|
2488
|
|
|
|
|
|
|
|
2489
|
|
|
|
|
|
|
return \%rules;
|
2490
|
|
|
|
|
|
|
}
|
2491
|
|
|
|
|
|
|
|
2492
|
|
|
|
|
|
|
=head2 return_nothing
|
2493
|
|
|
|
|
|
|
|
2494
|
|
|
|
|
|
|
Stop parsing the XML, forget any data we already have and return from the $parser->parse().
|
2495
|
|
|
|
|
|
|
This is only supposed to be used within rules and may be called both as a method and as
|
2496
|
|
|
|
|
|
|
an ordinary function (it's not exported).
|
2497
|
|
|
|
|
|
|
|
2498
|
|
|
|
|
|
|
=head2 return_this
|
2499
|
|
|
|
|
|
|
|
2500
|
|
|
|
|
|
|
Stop parsing the XML, forget any data we already have and return the attributes passed to this subroutine
|
2501
|
|
|
|
|
|
|
from the $parser->parse(). This is only supposed to be used within rules and may be called both as a method
|
2502
|
|
|
|
|
|
|
and as an ordinary function (it's not exported).
|
2503
|
|
|
|
|
|
|
|
2504
|
|
|
|
|
|
|
=head2 skip_rest
|
2505
|
|
|
|
|
|
|
|
2506
|
|
|
|
|
|
|
Stop parsing the XML and return whatever data we already have from the $parser->parse().
|
2507
|
|
|
|
|
|
|
The rules for the currently opened tags are evaluated as if the XML contained all
|
2508
|
|
|
|
|
|
|
the closing tags in the right order.
|
2509
|
|
|
|
|
|
|
|
2510
|
|
|
|
|
|
|
These three work via raising an exception, the exception is caught within the $parser->parse() and does not propagate outside.
|
2511
|
|
|
|
|
|
|
It's also safe to raise any other exception within the rules, the exception will be caught as well, the internal state of the $parser object
|
2512
|
|
|
|
|
|
|
will be cleaned and the exception rethrown.
|
2513
|
|
|
|
|
|
|
|
2514
|
|
|
|
|
|
|
=head1 CLASS METHODS
|
2515
|
|
|
|
|
|
|
|
2516
|
|
|
|
|
|
|
=head2 parse
|
2517
|
|
|
|
|
|
|
|
2518
|
|
|
|
|
|
|
When called as a class method, parse() accepts the same parameters as new(), instantiates a new parser object
|
2519
|
|
|
|
|
|
|
and returns a subroutine reference that calls the parse() method on that instance.
|
2520
|
|
|
|
|
|
|
|
2521
|
|
|
|
|
|
|
my $parser = XML::Rules->new(rules => \%rules);
|
2522
|
|
|
|
|
|
|
my $data = $parser->parse($xml);
|
2523
|
|
|
|
|
|
|
|
2524
|
|
|
|
|
|
|
becomes
|
2525
|
|
|
|
|
|
|
|
2526
|
|
|
|
|
|
|
my $read_data = XML::Rules->parse(rules => \%rules);
|
2527
|
|
|
|
|
|
|
my $data = $read_data->($xml);
|
2528
|
|
|
|
|
|
|
|
2529
|
|
|
|
|
|
|
or
|
2530
|
|
|
|
|
|
|
|
2531
|
|
|
|
|
|
|
sub read_data;
|
2532
|
|
|
|
|
|
|
*read_data = XML::Rules->parse(rules => \%rules);
|
2533
|
|
|
|
|
|
|
my $data = read_data($xml);
|
2534
|
|
|
|
|
|
|
|
2535
|
|
|
|
|
|
|
=head2 parsestring, parsefile, parse_file, filter, filterstring, filter_string, filterfile, filter_file
|
2536
|
|
|
|
|
|
|
|
2537
|
|
|
|
|
|
|
All these methods work the same way as parse() when used as a class method. They accept the same parameters as new(),
|
2538
|
|
|
|
|
|
|
instantiate a new object and return a subroutine reference that calls the respective method.
|
2539
|
|
|
|
|
|
|
|
2540
|
|
|
|
|
|
|
=head2 inferRulesFromExample
|
2541
|
|
|
|
|
|
|
|
2542
|
|
|
|
|
|
|
Dumper(XML::Rules::inferRulesFromExample( $fileOrXML, $fileOrXML, $fileOrXML, ...)
|
2543
|
|
|
|
|
|
|
Dumper(XML::Rules->inferRulesFromExample( $fileOrXML, $fileOrXML, $fileOrXML, ...)
|
2544
|
|
|
|
|
|
|
Dumper($parser->inferRulesFromExample( $fileOrXML, $fileOrXML, $fileOrXML, ...)
|
2545
|
|
|
|
|
|
|
|
2546
|
|
|
|
|
|
|
The subroutine parses the listed files and infers the rules that would produce the minimal, but complete datastructure.
|
2547
|
|
|
|
|
|
|
It finds out what tags may be repeated, whether they contain text content, attributes etc. You may want to give
|
2548
|
|
|
|
|
|
|
the subroutine several examples to make sure it knows about all possibilities. You should use this once and store
|
2549
|
|
|
|
|
|
|
the generated rules in your script or even take this as the basis of a more specific set of rules.
|
2550
|
|
|
|
|
|
|
|
2551
|
|
|
|
|
|
|
=cut
|
2552
|
|
|
|
|
|
|
|
2553
|
|
|
|
|
|
|
sub inferRulesFromExample {
|
2554
|
|
|
|
|
|
|
shift(@_) if $_[0] eq 'XML::Rules' or ref($_[0]);
|
2555
|
|
|
|
|
|
|
my @files = @_;
|
2556
|
|
|
|
|
|
|
|
2557
|
|
|
|
|
|
|
my %rules;
|
2558
|
|
|
|
|
|
|
|
2559
|
|
|
|
|
|
|
my $parser = XML::Rules->new(
|
2560
|
|
|
|
|
|
|
namespaces => { '*' => ''},
|
2561
|
|
|
|
|
|
|
rules => {
|
2562
|
|
|
|
|
|
|
_default => sub {
|
2563
|
|
|
|
|
|
|
my ($tag, $attrs, $context, $parent_data, $parser) = @_;
|
2564
|
|
|
|
|
|
|
my $repeated = (exists $parent_data->[-1] and exists $parent_data->[-1]{$tag});
|
2565
|
|
|
|
|
|
|
my $has_content = (exists $attrs->{_content});
|
2566
|
|
|
|
|
|
|
my $has_children = grep ref($_) eq 'HASH', values %$attrs;
|
2567
|
|
|
|
|
|
|
my $has_attr = grep {$_ ne '_content' and !ref($attrs->{$_})} keys %$attrs;
|
2568
|
|
|
|
|
|
|
|
2569
|
|
|
|
|
|
|
my $rule = do {
|
2570
|
|
|
|
|
|
|
if ($repeated) {
|
2571
|
|
|
|
|
|
|
if ($has_content) {
|
2572
|
|
|
|
|
|
|
if ($has_attr or $has_children) {
|
2573
|
|
|
|
|
|
|
'as array'
|
2574
|
|
|
|
|
|
|
} else {
|
2575
|
|
|
|
|
|
|
'content array'
|
2576
|
|
|
|
|
|
|
}
|
2577
|
|
|
|
|
|
|
} else {
|
2578
|
|
|
|
|
|
|
if ($has_attr or $has_children) {
|
2579
|
|
|
|
|
|
|
'as array no content'
|
2580
|
|
|
|
|
|
|
} else {
|
2581
|
|
|
|
|
|
|
'content array'
|
2582
|
|
|
|
|
|
|
}
|
2583
|
|
|
|
|
|
|
}
|
2584
|
|
|
|
|
|
|
} else {
|
2585
|
|
|
|
|
|
|
if ($has_content) {
|
2586
|
|
|
|
|
|
|
if ($has_attr or $has_children) {
|
2587
|
|
|
|
|
|
|
'as is'
|
2588
|
|
|
|
|
|
|
} else {
|
2589
|
|
|
|
|
|
|
'content'
|
2590
|
|
|
|
|
|
|
}
|
2591
|
|
|
|
|
|
|
} else {
|
2592
|
|
|
|
|
|
|
if ($has_attr or $has_children) {
|
2593
|
|
|
|
|
|
|
'no content'
|
2594
|
|
|
|
|
|
|
} else {
|
2595
|
|
|
|
|
|
|
'content'
|
2596
|
|
|
|
|
|
|
}
|
2597
|
|
|
|
|
|
|
}
|
2598
|
|
|
|
|
|
|
}
|
2599
|
|
|
|
|
|
|
};
|
2600
|
|
|
|
|
|
|
|
2601
|
|
|
|
|
|
|
if (not exists $rules{$tag}) {
|
2602
|
|
|
|
|
|
|
$rules{$tag} = $rule
|
2603
|
|
|
|
|
|
|
} elsif($rules{$tag} ne $rule) {
|
2604
|
|
|
|
|
|
|
# we've already seen the tag and it had different type
|
2605
|
|
|
|
|
|
|
if ($rules{$tag} eq 'raw extended array') {
|
2606
|
|
|
|
|
|
|
} elsif ($rule eq 'raw extended array') {
|
2607
|
|
|
|
|
|
|
$rules{$tag} = 'raw extended array';
|
2608
|
|
|
|
|
|
|
} elsif ($rules{$tag} eq 'raw extended' and $rule =~ /array/
|
2609
|
|
|
|
|
|
|
or $rule eq 'raw extended' and $rules{$tag} =~ /array/) {
|
2610
|
|
|
|
|
|
|
$rules{$tag} = 'raw extended array'
|
2611
|
|
|
|
|
|
|
} elsif ($rules{$tag} eq 'as array' or $rule eq 'as array') {
|
2612
|
|
|
|
|
|
|
$rules{$tag} = 'as array'
|
2613
|
|
|
|
|
|
|
} elsif ($rules{$tag} eq 'content array' and $rule eq 'content'
|
2614
|
|
|
|
|
|
|
or $rule eq 'content array' and $rules{$tag} eq 'content') {
|
2615
|
|
|
|
|
|
|
$rules{$tag} = 'content array'
|
2616
|
|
|
|
|
|
|
} elsif ($rules{$tag} eq 'content array' and $rule eq 'as array no content'
|
2617
|
|
|
|
|
|
|
or $rule eq 'content array' and $rules{$tag} eq 'as array no content') {
|
2618
|
|
|
|
|
|
|
$rules{$tag} = 'as array'
|
2619
|
|
|
|
|
|
|
} elsif ($rules{$tag} eq 'content array' and $rule eq 'as is'
|
2620
|
|
|
|
|
|
|
or $rule eq 'content array' and $rules{$tag} eq 'as is') {
|
2621
|
|
|
|
|
|
|
$rules{$tag} = 'as array'
|
2622
|
|
|
|
|
|
|
} elsif ($rules{$tag} eq 'content array' and $rule eq 'no content'
|
2623
|
|
|
|
|
|
|
or $rule eq 'content array' and $rules{$tag} eq 'no content') {
|
2624
|
|
|
|
|
|
|
$rules{$tag} = 'as array'
|
2625
|
|
|
|
|
|
|
} elsif ($rules{$tag} eq 'as array no content' and $rule eq 'as is'
|
2626
|
|
|
|
|
|
|
or $rule eq 'as array no content' and $rules{$tag} eq 'as is') {
|
2627
|
|
|
|
|
|
|
$rules{$tag} = 'as array'
|
2628
|
|
|
|
|
|
|
} elsif ($rules{$tag} eq 'as array no content' and $rule eq 'content'
|
2629
|
|
|
|
|
|
|
or $rule eq 'as array no content' and $rules{$tag} eq 'content') {
|
2630
|
|
|
|
|
|
|
$rules{$tag} = 'as array'
|
2631
|
|
|
|
|
|
|
} elsif ($rules{$tag} eq 'as array no content' and $rule eq 'no content'
|
2632
|
|
|
|
|
|
|
or $rule eq 'as array no content' and $rules{$tag} eq 'no content') {
|
2633
|
|
|
|
|
|
|
$rules{$tag} = 'as array no content'
|
2634
|
|
|
|
|
|
|
} elsif ($rules{$tag} eq 'as is' and ($rule eq 'no content' or $rule eq 'content')
|
2635
|
|
|
|
|
|
|
or $rule eq 'as is' and ($rules{$tag} eq 'no content' or $rules{$tag} eq 'content')) {
|
2636
|
|
|
|
|
|
|
$rules{$tag} = 'as is'
|
2637
|
|
|
|
|
|
|
} elsif ($rules{$tag} eq 'content' and $rule eq 'no content'
|
2638
|
|
|
|
|
|
|
or $rule eq 'content' and $rules{$tag} eq 'no content') {
|
2639
|
|
|
|
|
|
|
$rules{$tag} = 'as is'
|
2640
|
|
|
|
|
|
|
} else {
|
2641
|
|
|
|
|
|
|
die "Unexpected combination of rules: old=$rules{$tag}, new=$rule for tag $tag\n";
|
2642
|
|
|
|
|
|
|
}
|
2643
|
|
|
|
|
|
|
}
|
2644
|
|
|
|
|
|
|
|
2645
|
|
|
|
|
|
|
if ($has_content and $has_children) { # the tag contains both text content and subtags!, need to use the raw extended rules
|
2646
|
|
|
|
|
|
|
foreach my $child (grep ref($attrs->{$_}) eq 'HASH', keys %$attrs) {
|
2647
|
|
|
|
|
|
|
next if $rules{$child} =~ /^raw extended/;
|
2648
|
|
|
|
|
|
|
if ($rules{$child} =~ /array/) {
|
2649
|
|
|
|
|
|
|
$rules{$child} = 'raw extended array'
|
2650
|
|
|
|
|
|
|
} else {
|
2651
|
|
|
|
|
|
|
$rules{$child} = 'raw extended'
|
2652
|
|
|
|
|
|
|
}
|
2653
|
|
|
|
|
|
|
}
|
2654
|
|
|
|
|
|
|
}
|
2655
|
|
|
|
|
|
|
return $tag => {};
|
2656
|
|
|
|
|
|
|
}
|
2657
|
|
|
|
|
|
|
},
|
2658
|
|
|
|
|
|
|
stripspaces => 7,
|
2659
|
|
|
|
|
|
|
);
|
2660
|
|
|
|
|
|
|
|
2661
|
|
|
|
|
|
|
for (@files) {
|
2662
|
|
|
|
|
|
|
eval {
|
2663
|
|
|
|
|
|
|
if (! ref($_) and $_ !~ /\n/ and $_ !~ /^\s*) {
|
2664
|
|
|
|
|
|
|
$parser->parsefile($_);
|
2665
|
|
|
|
|
|
|
} else {
|
2666
|
|
|
|
|
|
|
$parser->parse($_);
|
2667
|
|
|
|
|
|
|
}
|
2668
|
|
|
|
|
|
|
} or croak "Error parsing $_: $@\n";
|
2669
|
|
|
|
|
|
|
}
|
2670
|
|
|
|
|
|
|
|
2671
|
|
|
|
|
|
|
my %short_rules;
|
2672
|
|
|
|
|
|
|
foreach my $tag (sort keys %rules) {
|
2673
|
|
|
|
|
|
|
push @{$short_rules{$rules{$tag}}}, $tag
|
2674
|
|
|
|
|
|
|
}
|
2675
|
|
|
|
|
|
|
|
2676
|
|
|
|
|
|
|
foreach my $tags (values %short_rules) {
|
2677
|
|
|
|
|
|
|
$tags = join ',', sort @$tags;
|
2678
|
|
|
|
|
|
|
}
|
2679
|
|
|
|
|
|
|
%short_rules = reverse %short_rules;
|
2680
|
|
|
|
|
|
|
|
2681
|
|
|
|
|
|
|
return \%short_rules;
|
2682
|
|
|
|
|
|
|
}
|
2683
|
|
|
|
|
|
|
|
2684
|
|
|
|
|
|
|
=head2 inferRulesFromDTD
|
2685
|
|
|
|
|
|
|
|
2686
|
|
|
|
|
|
|
Dumper(XML::Rules::inferRulesFromDTD( $DTDorDTDfile, [$enableExtended]))
|
2687
|
|
|
|
|
|
|
Dumper(XML::Rules->inferRulesFromDTD( $DTDorDTDfile, [$enableExtended]))
|
2688
|
|
|
|
|
|
|
Dumper($parser->inferRulesFromDTD( $DTDorDTDfile, [$enableExtended]))
|
2689
|
|
|
|
|
|
|
|
2690
|
|
|
|
|
|
|
The subroutine parses the DTD and infers the rules that would produce the minimal, but complete datastructure.
|
2691
|
|
|
|
|
|
|
It finds out what tags may be repeated, whether they contain text content, attributes etc. You may use this
|
2692
|
|
|
|
|
|
|
each time you are about to parse the XML or once and store the generated rules in your script or even take this
|
2693
|
|
|
|
|
|
|
as the basis of a more specific set of rules.
|
2694
|
|
|
|
|
|
|
|
2695
|
|
|
|
|
|
|
With the second parameter set to a true value, the tags included in a mixed content will use the "raw extended"
|
2696
|
|
|
|
|
|
|
or "raw extended array" types instead of just "raw". This makes sure the tag data both stay at the right place in
|
2697
|
|
|
|
|
|
|
the content and are accessible easily from the parent tag's atrribute hash.
|
2698
|
|
|
|
|
|
|
|
2699
|
|
|
|
|
|
|
This subroutine requires the XML::DTDParser module!
|
2700
|
|
|
|
|
|
|
|
2701
|
|
|
|
|
|
|
=cut
|
2702
|
|
|
|
|
|
|
|
2703
|
|
|
|
|
|
|
sub inferRulesFromDTD {
|
2704
|
|
|
|
|
|
|
shift(@_) if $_[0] eq 'XML::Rules' or ref($_[0]);
|
2705
|
|
|
|
|
|
|
require XML::DTDParser;
|
2706
|
|
|
|
|
|
|
|
2707
|
|
|
|
|
|
|
my ($DTDfile, $enable_extended) = @_;
|
2708
|
|
|
|
|
|
|
|
2709
|
|
|
|
|
|
|
my $DTD = ( ($DTDfile=~ /\n/) ? XML::DTDParser::ParseDTD($DTDfile) : XML::DTDParser::ParseDTDFile($DTDfile));
|
2710
|
|
|
|
|
|
|
|
2711
|
|
|
|
|
|
|
my $has_mixed = 0;
|
2712
|
|
|
|
|
|
|
foreach my $tag (values %$DTD) {
|
2713
|
|
|
|
|
|
|
$tag->{is_mixed} = (($tag->{content} and $tag->{children}) ? 1 : 0)
|
2714
|
|
|
|
|
|
|
and $has_mixed = 1;
|
2715
|
|
|
|
|
|
|
}
|
2716
|
|
|
|
|
|
|
|
2717
|
|
|
|
|
|
|
my %settings;
|
2718
|
|
|
|
|
|
|
foreach my $tagname (keys %$DTD) {
|
2719
|
|
|
|
|
|
|
my $tag = $DTD->{$tagname};
|
2720
|
|
|
|
|
|
|
|
2721
|
|
|
|
|
|
|
my $repeated = ($tag->{option} =~ /^[+*]$/ ? 1 : 0);
|
2722
|
|
|
|
|
|
|
my $has_content = $tag->{content};
|
2723
|
|
|
|
|
|
|
|
2724
|
|
|
|
|
|
|
my $in_mixed = grep {$DTD->{$_}{is_mixed}} @{$tag->{parent}};
|
2725
|
|
|
|
|
|
|
|
2726
|
|
|
|
|
|
|
if ($in_mixed) {
|
2727
|
|
|
|
|
|
|
if ($enable_extended) {
|
2728
|
|
|
|
|
|
|
if ($repeated) {
|
2729
|
|
|
|
|
|
|
$settings{$tagname} = "raw extended array"
|
2730
|
|
|
|
|
|
|
} else {
|
2731
|
|
|
|
|
|
|
$settings{$tagname} = "raw extended"
|
2732
|
|
|
|
|
|
|
}
|
2733
|
|
|
|
|
|
|
} else {
|
2734
|
|
|
|
|
|
|
$settings{$tagname} = "raw"
|
2735
|
|
|
|
|
|
|
}
|
2736
|
|
|
|
|
|
|
} else {
|
2737
|
|
|
|
|
|
|
if (exists $tag->{attributes} or exists $tag->{children}) {
|
2738
|
|
|
|
|
|
|
my @ids ;
|
2739
|
|
|
|
|
|
|
if (exists $tag->{attributes}) {
|
2740
|
|
|
|
|
|
|
@ids = grep {$tag->{attributes}{$_}[0] eq 'ID' and $tag->{attributes}{$_}[1] eq '#REQUIRED'} keys %{$tag->{attributes}};
|
2741
|
|
|
|
|
|
|
}
|
2742
|
|
|
|
|
|
|
if (scalar(@ids) == 1) {
|
2743
|
|
|
|
|
|
|
if ($has_content) {
|
2744
|
|
|
|
|
|
|
$settings{$tagname} = "by $ids[0]"
|
2745
|
|
|
|
|
|
|
} else {
|
2746
|
|
|
|
|
|
|
$settings{$tagname} = "no content by $ids[0]"
|
2747
|
|
|
|
|
|
|
}
|
2748
|
|
|
|
|
|
|
} else {
|
2749
|
|
|
|
|
|
|
if ($has_content) {
|
2750
|
|
|
|
|
|
|
if ($repeated) {
|
2751
|
|
|
|
|
|
|
$settings{$tagname} = "as array"
|
2752
|
|
|
|
|
|
|
} else {
|
2753
|
|
|
|
|
|
|
$settings{$tagname} = "as is"
|
2754
|
|
|
|
|
|
|
}
|
2755
|
|
|
|
|
|
|
} else {
|
2756
|
|
|
|
|
|
|
if ($repeated) {
|
2757
|
|
|
|
|
|
|
$settings{$tagname} = "as array no content"
|
2758
|
|
|
|
|
|
|
} else {
|
2759
|
|
|
|
|
|
|
$settings{$tagname} = "no content"
|
2760
|
|
|
|
|
|
|
}
|
2761
|
|
|
|
|
|
|
}
|
2762
|
|
|
|
|
|
|
}
|
2763
|
|
|
|
|
|
|
} elsif ($repeated) {
|
2764
|
|
|
|
|
|
|
$settings{$tagname} = "content array"
|
2765
|
|
|
|
|
|
|
} else {
|
2766
|
|
|
|
|
|
|
$settings{$tagname} = "content"
|
2767
|
|
|
|
|
|
|
}
|
2768
|
|
|
|
|
|
|
}
|
2769
|
|
|
|
|
|
|
}
|
2770
|
|
|
|
|
|
|
|
2771
|
|
|
|
|
|
|
# use Data::Dumper;
|
2772
|
|
|
|
|
|
|
# print Dumper(\%settings);
|
2773
|
|
|
|
|
|
|
|
2774
|
|
|
|
|
|
|
my %compressed;
|
2775
|
|
|
|
|
|
|
{
|
2776
|
|
|
|
|
|
|
my %tmp;
|
2777
|
|
|
|
|
|
|
while (my ($tag, $option) = each %settings) {
|
2778
|
|
|
|
|
|
|
push @{$tmp{$option}}, $tag;
|
2779
|
|
|
|
|
|
|
}
|
2780
|
|
|
|
|
|
|
|
2781
|
|
|
|
|
|
|
while (my ($option, $tags) = each %tmp) {
|
2782
|
|
|
|
|
|
|
$compressed{join ',', sort @$tags} = $option
|
2783
|
|
|
|
|
|
|
}
|
2784
|
|
|
|
|
|
|
}
|
2785
|
|
|
|
|
|
|
|
2786
|
|
|
|
|
|
|
if ($has_mixed) {
|
2787
|
|
|
|
|
|
|
$compressed{"#stripspaces"} = 0;
|
2788
|
|
|
|
|
|
|
} else {
|
2789
|
|
|
|
|
|
|
$compressed{"#stripspaces"} = 7;
|
2790
|
|
|
|
|
|
|
}
|
2791
|
|
|
|
|
|
|
|
2792
|
|
|
|
|
|
|
return \%compressed;
|
2793
|
|
|
|
|
|
|
}
|
2794
|
|
|
|
|
|
|
|
2795
|
|
|
|
|
|
|
=head2 toXML / ToXML
|
2796
|
|
|
|
|
|
|
|
2797
|
|
|
|
|
|
|
The ToXML() method may be called as a class/static method as well. In that case the default identation is two spaces and the output encoding is utf8.
|
2798
|
|
|
|
|
|
|
|
2799
|
|
|
|
|
|
|
=head1 PROPERTIES
|
2800
|
|
|
|
|
|
|
|
2801
|
|
|
|
|
|
|
=head2 parameters
|
2802
|
|
|
|
|
|
|
|
2803
|
|
|
|
|
|
|
You can pass a parameter (scalar or reference) to the parse...() or filter...() methods, this parameter
|
2804
|
|
|
|
|
|
|
is later available to the rules as $parser->{parameters}. The module will never use this parameter for
|
2805
|
|
|
|
|
|
|
any other purpose so you are free to use it for any purposes provided that you expect it to be reset by
|
2806
|
|
|
|
|
|
|
each call to parse...() or filter...() first to the passed value and then, after the parsing is complete, to undef.
|
2807
|
|
|
|
|
|
|
|
2808
|
|
|
|
|
|
|
=head2 pad
|
2809
|
|
|
|
|
|
|
|
2810
|
|
|
|
|
|
|
The $parser->{pad} key is specificaly reserved by the module as a place where the module users can
|
2811
|
|
|
|
|
|
|
store their data. The module doesn't and will not use this key in any way, doesn't set or reset it under any
|
2812
|
|
|
|
|
|
|
circumstances. If you need to share some data between the rules and do not want to use the structure built
|
2813
|
|
|
|
|
|
|
by applying the rules you are free to use this key.
|
2814
|
|
|
|
|
|
|
|
2815
|
|
|
|
|
|
|
You should refrain from modifying or accessing other properties of the XML::Rules object!
|
2816
|
|
|
|
|
|
|
|
2817
|
|
|
|
|
|
|
=head1 IMPORTS
|
2818
|
|
|
|
|
|
|
|
2819
|
|
|
|
|
|
|
When used without parameters, the module does not export anything into the caller's namespace. When used with parameters
|
2820
|
|
|
|
|
|
|
it either infers and prints a set of rules from a DTD or example(s) or instantiates a parser
|
2821
|
|
|
|
|
|
|
and exports a subroutine calling the specified method similar to the parse() and other methods when called as class methods:
|
2822
|
|
|
|
|
|
|
|
2823
|
|
|
|
|
|
|
use XML::Rules inferRules => 'c:\temp\example.xml';
|
2824
|
|
|
|
|
|
|
use XML::Rules inferRules => 'c:\temp\ourOwn.dtd';
|
2825
|
|
|
|
|
|
|
use XML::Rules inferRules => ['c:\temp\example.xml', c:\temp\other.xml'];
|
2826
|
|
|
|
|
|
|
use XML::Rules
|
2827
|
|
|
|
|
|
|
read_data => {
|
2828
|
|
|
|
|
|
|
method => 'parse',
|
2829
|
|
|
|
|
|
|
rules => { ... },
|
2830
|
|
|
|
|
|
|
...
|
2831
|
|
|
|
|
|
|
};
|
2832
|
|
|
|
|
|
|
use XML::Rules ToXML => {
|
2833
|
|
|
|
|
|
|
method => 'ToXML',
|
2834
|
|
|
|
|
|
|
rules => {}, # the option is required, but may be empty
|
2835
|
|
|
|
|
|
|
ident => ' '
|
2836
|
|
|
|
|
|
|
};
|
2837
|
|
|
|
|
|
|
...
|
2838
|
|
|
|
|
|
|
my $data => read_data($xml);
|
2839
|
|
|
|
|
|
|
print ToXML(
|
2840
|
|
|
|
|
|
|
rootTag => {
|
2841
|
|
|
|
|
|
|
thing => [
|
2842
|
|
|
|
|
|
|
{Name => "english", child => [7480], otherChild => ['Hello world']},
|
2843
|
|
|
|
|
|
|
{Name => "espanol", child => [7440], otherChild => ['Hola mundo']},
|
2844
|
|
|
|
|
|
|
]
|
2845
|
|
|
|
|
|
|
});
|
2846
|
|
|
|
|
|
|
|
2847
|
|
|
|
|
|
|
|
2848
|
|
|
|
|
|
|
Please keep in mind that the use statement is executed at "compile time", which means that the variables declared and assigned above the statement
|
2849
|
|
|
|
|
|
|
do not have the value yet! This is wrong!
|
2850
|
|
|
|
|
|
|
|
2851
|
|
|
|
|
|
|
my %rules = ( _default => 'content', foo => 'as is', ...};
|
2852
|
|
|
|
|
|
|
use XML::Rules
|
2853
|
|
|
|
|
|
|
read_data => {
|
2854
|
|
|
|
|
|
|
method => 'parse',
|
2855
|
|
|
|
|
|
|
rules => \%rules,
|
2856
|
|
|
|
|
|
|
...
|
2857
|
|
|
|
|
|
|
};
|
2858
|
|
|
|
|
|
|
|
2859
|
|
|
|
|
|
|
If you do not specify the method, then the method named the same as the import is assumed. You also do not have to specify the rules option for
|
2860
|
|
|
|
|
|
|
the ToXML method as it is not used anyway:
|
2861
|
|
|
|
|
|
|
|
2862
|
|
|
|
|
|
|
use XML::Rules ToXML => { ident => ' ' };
|
2863
|
|
|
|
|
|
|
use XML::Rules parse => {stripspaces => 7, rules => { ... }};
|
2864
|
|
|
|
|
|
|
|
2865
|
|
|
|
|
|
|
You can use the inferRules form the command line like this:
|
2866
|
|
|
|
|
|
|
|
2867
|
|
|
|
|
|
|
perl -e "use XML::Rules inferRules => 'c:\temp\example.xml'"
|
2868
|
|
|
|
|
|
|
|
2869
|
|
|
|
|
|
|
or this:
|
2870
|
|
|
|
|
|
|
|
2871
|
|
|
|
|
|
|
perl -MXML::Rules=inferRules,c:\temp\example.xml -e 1
|
2872
|
|
|
|
|
|
|
|
2873
|
|
|
|
|
|
|
or use the included xml2XMLRules.pl and dtd2XMLRules.pl scripts.
|
2874
|
|
|
|
|
|
|
|
2875
|
|
|
|
|
|
|
=head1 Namespace support
|
2876
|
|
|
|
|
|
|
|
2877
|
|
|
|
|
|
|
By default the module doesn't handle namespaces in any way, it doesn't do anything special with
|
2878
|
|
|
|
|
|
|
xmlns or xmlns:alias attributes and it doesn't strip or mangle the namespace aliases
|
2879
|
|
|
|
|
|
|
in tag or attribute names. This means that if you know for sure what namespace
|
2880
|
|
|
|
|
|
|
aliases will be used you can set up rules for tags including the aliases and unless
|
2881
|
|
|
|
|
|
|
someone decides to use a different alias or makes use of the default namespace
|
2882
|
|
|
|
|
|
|
your script will work without turning the namespace support on.
|
2883
|
|
|
|
|
|
|
|
2884
|
|
|
|
|
|
|
If you do specify any namespace to alias mapping in the constructor it does
|
2885
|
|
|
|
|
|
|
start processing the namespace stuff. The xmlns and xmlns:alias attributes
|
2886
|
|
|
|
|
|
|
for the known namespaces are stripped from the datastructures and
|
2887
|
|
|
|
|
|
|
the aliases are transformed from whatever the XML author decided to use
|
2888
|
|
|
|
|
|
|
to whatever your namespace mapping specifies. Aliases are also added to all
|
2889
|
|
|
|
|
|
|
tags that belong to a default namespace.
|
2890
|
|
|
|
|
|
|
|
2891
|
|
|
|
|
|
|
Assuming the constructor parameters contain
|
2892
|
|
|
|
|
|
|
|
2893
|
|
|
|
|
|
|
namespaces => {
|
2894
|
|
|
|
|
|
|
'http://my.namespaces.com/foo' => 'foo',
|
2895
|
|
|
|
|
|
|
'http://my.namespaces.com/bar' => 'bar',
|
2896
|
|
|
|
|
|
|
}
|
2897
|
|
|
|
|
|
|
|
2898
|
|
|
|
|
|
|
and the XML looks like this:
|
2899
|
|
|
|
|
|
|
|
2900
|
|
|
|
|
|
|
|
2901
|
|
|
|
|
|
|
|
2902
|
|
|
|
|
|
|
Hello world
|
2903
|
|
|
|
|
|
|
|
2904
|
|
|
|
|
|
|
|
2905
|
|
|
|
|
|
|
|
2906
|
|
|
|
|
|
|
NaRuzku
|
2907
|
|
|
|
|
|
|
at any crossroads
|
2908
|
|
|
|
|
|
|
Fakt desnej pajzl.
|
2909
|
|
|
|
|
|
|
|
2910
|
|
|
|
|
|
|
|
2911
|
|
|
|
|
|
|
|
2912
|
|
|
|
|
|
|
|
2913
|
|
|
|
|
|
|
then the rules wil be called as if the XML looked like this
|
2914
|
|
|
|
|
|
|
while the namespace support is turned off:
|
2915
|
|
|
|
|
|
|
|
2916
|
|
|
|
|
|
|
|
2917
|
|
|
|
|
|
|
|
2918
|
|
|
|
|
|
|
Hello world
|
2919
|
|
|
|
|
|
|
|
2920
|
|
|
|
|
|
|
|
2921
|
|
|
|
|
|
|
|
2922
|
|
|
|
|
|
|
NaRuzku
|
2923
|
|
|
|
|
|
|
at any crossroads
|
2924
|
|
|
|
|
|
|
Fakt desnej pajzl.
|
2925
|
|
|
|
|
|
|
|
2926
|
|
|
|
|
|
|
|
2927
|
|
|
|
|
|
|
|
2928
|
|
|
|
|
|
|
|
2929
|
|
|
|
|
|
|
|
2930
|
|
|
|
|
|
|
This means that the namespace handling will normalize the aliases used so that you can use
|
2931
|
|
|
|
|
|
|
them in the rules.
|
2932
|
|
|
|
|
|
|
|
2933
|
|
|
|
|
|
|
It is possible to specify an empty alias, so eg. in case you are processing a SOAP XML
|
2934
|
|
|
|
|
|
|
and know the tags defined by SOAP do not colide with the tags in the enclosed XML
|
2935
|
|
|
|
|
|
|
you may simplify the parsing by removing all namespace aliases.
|
2936
|
|
|
|
|
|
|
|
2937
|
|
|
|
|
|
|
You can control the behaviour with respect to the namespaces that you did not include
|
2938
|
|
|
|
|
|
|
in your mapping by setting the "alias" for the special pseudonamespace '*'. The possible values
|
2939
|
|
|
|
|
|
|
of the "alias"are: "warn" (default), "keep", "strip", "" and "die".
|
2940
|
|
|
|
|
|
|
|
2941
|
|
|
|
|
|
|
warn: whenever an unknown namespace is encountered, XML::Rules prints a warning.
|
2942
|
|
|
|
|
|
|
The xmlns:XX attributes and the XX: aliases are retained for these namespaces.
|
2943
|
|
|
|
|
|
|
If the alias clashes with one specified by your mapping it will be changed in all places,
|
2944
|
|
|
|
|
|
|
the xmlns="..." referencing an unexpected namespace are changed to xmlns:nsN
|
2945
|
|
|
|
|
|
|
and the alias is added to the tag names included.
|
2946
|
|
|
|
|
|
|
|
2947
|
|
|
|
|
|
|
keep: this works just like the "warn" except for the warning.
|
2948
|
|
|
|
|
|
|
|
2949
|
|
|
|
|
|
|
strip: all attributes and tags in the unknown namespaces are stripped. If
|
2950
|
|
|
|
|
|
|
a tag in such a namespace contains a tag from a known namespace,
|
2951
|
|
|
|
|
|
|
then the child tag is retained.
|
2952
|
|
|
|
|
|
|
|
2953
|
|
|
|
|
|
|
"": all the xmlns attributes and the aliases for the unexected namespaces are removed,
|
2954
|
|
|
|
|
|
|
the tags and normal attributes are retained without any alias.
|
2955
|
|
|
|
|
|
|
|
2956
|
|
|
|
|
|
|
die: as soon as any unexpected namespace is encountered, XML::Rules croak()s.
|
2957
|
|
|
|
|
|
|
|
2958
|
|
|
|
|
|
|
|
2959
|
|
|
|
|
|
|
=head1 HOW TO USE
|
2960
|
|
|
|
|
|
|
|
2961
|
|
|
|
|
|
|
You may view the module either as a XML::Simple on steriods and use it to build a data structure
|
2962
|
|
|
|
|
|
|
similar to the one produced by XML::Simple with the added benefit of being able
|
2963
|
|
|
|
|
|
|
to specify what tags or attributes to ignore, when to take just the content, what to store as an array etc.
|
2964
|
|
|
|
|
|
|
|
2965
|
|
|
|
|
|
|
You could also view it as yet another event based XML parser that differs from all the others only in one thing.
|
2966
|
|
|
|
|
|
|
It stores the data for you so that you do not have to use globals or closures and wonder where to attach
|
2967
|
|
|
|
|
|
|
the snippet of data you just received onto the structure you are building.
|
2968
|
|
|
|
|
|
|
|
2969
|
|
|
|
|
|
|
You can use it in a way similar to XML::Twig with simplify(): specify the rules to transform the lower
|
2970
|
|
|
|
|
|
|
level tags into a XML::Simple like (simplify()ed) structure and then handle the structure in the rule for
|
2971
|
|
|
|
|
|
|
the tag(s) you'd specify in XML::Twig's twig_roots.
|
2972
|
|
|
|
|
|
|
|
2973
|
|
|
|
|
|
|
=head1 Unrelated tricks
|
2974
|
|
|
|
|
|
|
|
2975
|
|
|
|
|
|
|
If you need to parse a XML file without the root tag (something that each and any sane person would allow,
|
2976
|
|
|
|
|
|
|
but the XML comitee did not), you can parse
|
2977
|
|
|
|
|
|
|
|
2978
|
|
|
|
|
|
|
]>&real_doc;
|
2979
|
|
|
|
|
|
|
|
2980
|
|
|
|
|
|
|
instead.
|
2981
|
|
|
|
|
|
|
|
2982
|
|
|
|
|
|
|
=head1 AUTHOR
|
2983
|
|
|
|
|
|
|
|
2984
|
|
|
|
|
|
|
Jan Krynicky, C<< >>
|
2985
|
|
|
|
|
|
|
|
2986
|
|
|
|
|
|
|
=head1 BUGS
|
2987
|
|
|
|
|
|
|
|
2988
|
|
|
|
|
|
|
Please report any bugs or feature requests to
|
2989
|
|
|
|
|
|
|
C, or through the web interface at
|
2990
|
|
|
|
|
|
|
L.
|
2991
|
|
|
|
|
|
|
I will be notified, and then you'll automatically be notified of progress on
|
2992
|
|
|
|
|
|
|
your bug as I make changes.
|
2993
|
|
|
|
|
|
|
|
2994
|
|
|
|
|
|
|
=head1 SUPPORT
|
2995
|
|
|
|
|
|
|
|
2996
|
|
|
|
|
|
|
You can find documentation for this module with the perldoc command.
|
2997
|
|
|
|
|
|
|
|
2998
|
|
|
|
|
|
|
perldoc XML::Rules
|
2999
|
|
|
|
|
|
|
|
3000
|
|
|
|
|
|
|
You can also look for information at:
|
3001
|
|
|
|
|
|
|
|
3002
|
|
|
|
|
|
|
=over 4
|
3003
|
|
|
|
|
|
|
|
3004
|
|
|
|
|
|
|
=item * AnnoCPAN: Annotated CPAN documentation
|
3005
|
|
|
|
|
|
|
|
3006
|
|
|
|
|
|
|
L
|
3007
|
|
|
|
|
|
|
|
3008
|
|
|
|
|
|
|
=item * CPAN Ratings
|
3009
|
|
|
|
|
|
|
|
3010
|
|
|
|
|
|
|
L
|
3011
|
|
|
|
|
|
|
|
3012
|
|
|
|
|
|
|
=item * RT: CPAN's request tracker
|
3013
|
|
|
|
|
|
|
|
3014
|
|
|
|
|
|
|
L
|
3015
|
|
|
|
|
|
|
|
3016
|
|
|
|
|
|
|
=item * Search CPAN
|
3017
|
|
|
|
|
|
|
|
3018
|
|
|
|
|
|
|
L
|
3019
|
|
|
|
|
|
|
|
3020
|
|
|
|
|
|
|
=item * PerlMonks
|
3021
|
|
|
|
|
|
|
|
3022
|
|
|
|
|
|
|
Please see L or
|
3023
|
|
|
|
|
|
|
L for discussion.
|
3024
|
|
|
|
|
|
|
|
3025
|
|
|
|
|
|
|
=back
|
3026
|
|
|
|
|
|
|
|
3027
|
|
|
|
|
|
|
=head1 SEE ALSO
|
3028
|
|
|
|
|
|
|
|
3029
|
|
|
|
|
|
|
L, L, L
|
3030
|
|
|
|
|
|
|
|
3031
|
|
|
|
|
|
|
=head1 ACKNOWLEDGEMENTS
|
3032
|
|
|
|
|
|
|
|
3033
|
|
|
|
|
|
|
The escape_value() method is taken with minor changes from XML::Simple.
|
3034
|
|
|
|
|
|
|
|
3035
|
|
|
|
|
|
|
=head1 COPYRIGHT & LICENSE
|
3036
|
|
|
|
|
|
|
|
3037
|
|
|
|
|
|
|
Copyright 2006-2012 Jan Krynicky, all rights reserved.
|
3038
|
|
|
|
|
|
|
|
3039
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it
|
3040
|
|
|
|
|
|
|
under the same terms as Perl itself.
|
3041
|
|
|
|
|
|
|
|
3042
|
|
|
|
|
|
|
=cut
|
3043
|
|
|
|
|
|
|
|
3044
|
|
|
|
|
|
|
# if I ever attempt to switch to SAX I want to look at XML::Handler::Trees
|
3045
|
|
|
|
|
|
|
|
3046
|
|
|
|
|
|
|
1; # End of XML::Rules
|