line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
=head1 NAME |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
XML::Easy::SimpleSchemaUtil - help with simple kinds of XML schema |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
=head1 SYNOPSIS |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
use XML::Easy::SimpleSchemaUtil qw( |
8
|
|
|
|
|
|
|
xml_s_canonise_chars xml_c_canonise_chars |
9
|
|
|
|
|
|
|
xml_c_subelements xml_c_chardata |
10
|
|
|
|
|
|
|
); |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
$chardata = xml_s_canonise_chars($chardata); |
13
|
|
|
|
|
|
|
$content = xml_c_canonise_chars($content); |
14
|
|
|
|
|
|
|
$subelements = xml_c_subelements($content); |
15
|
|
|
|
|
|
|
$chars = xml_c_chardata($content); |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
=head1 DESCRIPTION |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
The rules by which some class of thing is encoded in XML constitute a |
20
|
|
|
|
|
|
|
schema. (A schema does not need to be codified in a formal language such |
21
|
|
|
|
|
|
|
as Schematron: a natural-language specification can also be a schema. |
22
|
|
|
|
|
|
|
Even if there is no explicit specification at all, the behaviour of |
23
|
|
|
|
|
|
|
the interoperating processors of related XML documents constitutes a de |
24
|
|
|
|
|
|
|
facto schema.) Certain kinds of rule are commonly used in all manner |
25
|
|
|
|
|
|
|
of schemata. This module supplies functions that help to implement such |
26
|
|
|
|
|
|
|
common kinds of rule, regardless of how a schema is specified. |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
This module processes XML data in the form used by L, |
29
|
|
|
|
|
|
|
consisting of C and C objects |
30
|
|
|
|
|
|
|
and twine arrays. In this form, character data are stored fully decoded, |
31
|
|
|
|
|
|
|
so they can be manipulated with no knowledge of XML syntax. |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
=cut |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
package XML::Easy::SimpleSchemaUtil; |
36
|
|
|
|
|
|
|
|
37
|
4
|
|
|
4
|
|
202279
|
{ use 5.006; } |
|
4
|
|
|
|
|
15
|
|
|
4
|
|
|
|
|
195
|
|
38
|
4
|
|
|
4
|
|
24
|
use warnings; |
|
4
|
|
|
|
|
7
|
|
|
4
|
|
|
|
|
138
|
|
39
|
4
|
|
|
4
|
|
22
|
use strict; |
|
4
|
|
|
|
|
18
|
|
|
4
|
|
|
|
|
189
|
|
40
|
|
|
|
|
|
|
|
41
|
4
|
|
|
4
|
|
1234
|
use Params::Classify 0.000 qw(is_ref); |
|
4
|
|
|
|
|
2803
|
|
|
4
|
|
|
|
|
259
|
|
42
|
4
|
|
|
4
|
|
833
|
use XML::Easy::Classify 0.006 qw(check_xml_chardata check_xml_content_twine); |
|
4
|
|
|
|
|
42368
|
|
|
4
|
|
|
|
|
291
|
|
43
|
|
|
|
|
|
|
use XML::Easy::NodeBasics 0.007 |
44
|
4
|
|
|
4
|
|
969
|
qw(xml_content_object xml_content_twine xml_c_content_twine); |
|
4
|
|
|
|
|
6287
|
|
|
4
|
|
|
|
|
291
|
|
45
|
4
|
|
|
4
|
|
27
|
use XML::Easy::Syntax 0.000 qw($xml10_s_rx); |
|
4
|
|
|
|
|
67
|
|
|
4
|
|
|
|
|
601
|
|
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
our $VERSION = "0.002"; |
48
|
|
|
|
|
|
|
|
49
|
4
|
|
|
4
|
|
23
|
use parent "Exporter"; |
|
4
|
|
|
|
|
6
|
|
|
4
|
|
|
|
|
22
|
|
50
|
|
|
|
|
|
|
our @EXPORT_OK = qw( |
51
|
|
|
|
|
|
|
xml_s_canonise_chars xs_charcanon xml_c_canonise_chars xc_charcanon |
52
|
|
|
|
|
|
|
xml_c_subelements xc_subelems xml_c_chardata xc_chars |
53
|
|
|
|
|
|
|
); |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
sub _throw_data_error($) { |
56
|
0
|
|
|
0
|
|
0
|
my($msg) = @_; |
57
|
0
|
|
|
|
|
0
|
die "invalid XML data: $msg\n"; |
58
|
|
|
|
|
|
|
} |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
sub _throw_schema_error($) { |
61
|
44
|
|
|
44
|
|
74
|
my($msg) = @_; |
62
|
44
|
|
|
|
|
318
|
die "XML schema error: $msg\n"; |
63
|
|
|
|
|
|
|
} |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
=head1 FUNCTIONS |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
Each function has two names. There is a longer descriptive name, and |
68
|
|
|
|
|
|
|
a shorter name to spare screen space and the programmer's fingers. |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
=over |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
=item xml_s_canonise_chars(STRING, OPTIONS) |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
=item xs_charcanon(STRING, OPTIONS) |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
This function is intended to help in parsing XML data, in situations |
77
|
|
|
|
|
|
|
where the schema states that some aspects of characters are not |
78
|
|
|
|
|
|
|
entirely significant. I must be a plain Perl string consisting |
79
|
|
|
|
|
|
|
of character data that is valid for XML. The function examines the |
80
|
|
|
|
|
|
|
characters, processes them as specified in the I, and returns |
81
|
|
|
|
|
|
|
a modified version of the string. |
82
|
|
|
|
|
|
|
I must be a reference to a hash, in which the permitted keys are: |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
=over |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
=item B |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
=item B |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
=item B |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
Controls handling of sequences of whitespace characters. The three |
93
|
|
|
|
|
|
|
keys control, respectively, whitespace at the beginning of the string, |
94
|
|
|
|
|
|
|
whitespace that is at neither the beginning nor the end, and whitespace at |
95
|
|
|
|
|
|
|
the end of the string. If the entire content of the string is whitespace, |
96
|
|
|
|
|
|
|
it is treated as both leading and trailing. |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
The whitespace characters, for this purpose, are tab, linefeed/newline, |
99
|
|
|
|
|
|
|
carriage return, and space. This is the same set of characters that |
100
|
|
|
|
|
|
|
are whitespace for the purposes of the XML syntax. |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
The value for each key may be: |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
=over |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
=item B |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
Completely remove the whitespace. For situations where the whitespace is |
109
|
|
|
|
|
|
|
of no significance at all. (Common for leading and trailing whitespace, |
110
|
|
|
|
|
|
|
but rare for intermediate whitespace.) |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
=item B |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
Replace the whitespace sequence with a single space character. |
115
|
|
|
|
|
|
|
For situations where the presence of whitespace is significant but the |
116
|
|
|
|
|
|
|
length and type are not. (Common for intermediate whitespace.) |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
=item B (default) |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
Leave the whitespace unchanged. For situations where the exact type of |
121
|
|
|
|
|
|
|
whitespace is significant. |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
=back |
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
=back |
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
=cut |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
sub _canonise_chars($$) { |
130
|
140
|
|
|
140
|
|
260
|
my($string, $options) = @_; |
131
|
140
|
100
|
|
|
|
565
|
my $leading_wsp = exists($options->{leading_wsp}) ? |
132
|
|
|
|
|
|
|
$options->{leading_wsp} : "PRESERVE"; |
133
|
140
|
100
|
|
|
|
440
|
if($leading_wsp eq "DELETE") { |
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
134
|
14
|
|
|
|
|
96
|
$string =~ s/\A$xml10_s_rx//o; |
135
|
|
|
|
|
|
|
} elsif($leading_wsp eq "COMPRESS") { |
136
|
14
|
|
|
|
|
91
|
$string =~ s/\A$xml10_s_rx/ /o; |
137
|
|
|
|
|
|
|
} elsif($leading_wsp ne "PRESERVE") { |
138
|
0
|
|
|
|
|
0
|
_throw_data_error("bad character canonicalisation option"); |
139
|
|
|
|
|
|
|
} |
140
|
140
|
100
|
|
|
|
292
|
my $intermediate_wsp = exists($options->{intermediate_wsp}) ? |
141
|
|
|
|
|
|
|
$options->{intermediate_wsp} : "PRESERVE"; |
142
|
140
|
100
|
|
|
|
741
|
if($intermediate_wsp eq "DELETE") { |
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
143
|
14
|
|
|
|
|
188
|
$string =~ s/(?!$xml10_s_rx)(.)$xml10_s_rx(?!$xml10_s_rx|\z) |
144
|
|
|
|
|
|
|
/$1/xsog; |
145
|
|
|
|
|
|
|
} elsif($intermediate_wsp eq "COMPRESS") { |
146
|
14
|
|
|
|
|
284
|
$string =~ s/(?!$xml10_s_rx)(.)$xml10_s_rx(?!$xml10_s_rx|\z) |
147
|
|
|
|
|
|
|
/$1 /xsog; |
148
|
|
|
|
|
|
|
} elsif($intermediate_wsp ne "PRESERVE") { |
149
|
0
|
|
|
|
|
0
|
_throw_data_error("bad character canonicalisation option"); |
150
|
|
|
|
|
|
|
} |
151
|
140
|
100
|
|
|
|
333
|
my $trailing_wsp = exists($options->{trailing_wsp}) ? |
152
|
|
|
|
|
|
|
$options->{trailing_wsp} : "PRESERVE"; |
153
|
140
|
100
|
|
|
|
1610
|
if($trailing_wsp eq "DELETE") { |
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
154
|
14
|
|
|
|
|
104
|
$string =~ s/$xml10_s_rx\z//o; |
155
|
|
|
|
|
|
|
} elsif($trailing_wsp eq "COMPRESS") { |
156
|
14
|
|
|
|
|
108
|
$string =~ s/$xml10_s_rx\z/ /o; |
157
|
|
|
|
|
|
|
} elsif($trailing_wsp ne "PRESERVE") { |
158
|
0
|
|
|
|
|
0
|
_throw_data_error("bad character canonicalisation option"); |
159
|
|
|
|
|
|
|
} |
160
|
140
|
|
|
|
|
585
|
return $string; |
161
|
|
|
|
|
|
|
} |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
sub xml_s_canonise_chars($$) { |
164
|
22
|
|
|
22
|
1
|
26896
|
check_xml_chardata($_[0]); |
165
|
20
|
|
|
|
|
162
|
return &_canonise_chars; |
166
|
|
|
|
|
|
|
} |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
*xs_charcanon = \&xml_s_canonise_chars; |
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
=item xml_c_canonise_chars(CONTENT, OPTIONS) |
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
=item xc_charcanon(CONTENT, OPTIONS) |
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
This function is intended to help in parsing XML data, in situations |
175
|
|
|
|
|
|
|
where the schema states that some aspects of characters are not |
176
|
|
|
|
|
|
|
entirely significant. I must be a reference to either an |
177
|
|
|
|
|
|
|
L object or a twine array. The function processes its |
178
|
|
|
|
|
|
|
top-level character content in the same way as L, |
179
|
|
|
|
|
|
|
and returns the resulting modified version of the content in the same |
180
|
|
|
|
|
|
|
form that the input supplied. |
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
Any element inside the content chunk acts like a special character that |
183
|
|
|
|
|
|
|
will not be modified. It interrupts any character sequence of interest. |
184
|
|
|
|
|
|
|
Elements are not processed recursively: they are treated as atomic. |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
=cut |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
sub _canonise_chars_twine($$) { |
189
|
60
|
|
|
60
|
|
624
|
my($twine, $options) = @_; |
190
|
60
|
100
|
|
|
|
252
|
return [ _canonise_chars($twine->[0], $options) ] |
191
|
|
|
|
|
|
|
if @$twine == 1; |
192
|
20
|
|
|
|
|
82
|
my $leading_options = {%$options}; |
193
|
20
|
|
|
|
|
57
|
my $intermediate_options = {%$options}; |
194
|
20
|
|
|
|
|
42
|
my $trailing_options = {%$options}; |
195
|
20
|
100
|
|
|
|
91
|
$leading_options->{trailing_wsp} = |
196
|
|
|
|
|
|
|
$intermediate_options->{leading_wsp} = |
197
|
|
|
|
|
|
|
$intermediate_options->{trailing_wsp} = |
198
|
|
|
|
|
|
|
$trailing_options->{leading_wsp} = |
199
|
|
|
|
|
|
|
exists($options->{intermediate_wsp}) ? |
200
|
|
|
|
|
|
|
$options->{intermediate_wsp} : "PRESERVE"; |
201
|
20
|
|
|
|
|
65
|
my @output = @$twine; |
202
|
20
|
|
|
|
|
48
|
$output[0] = _canonise_chars($output[0], $leading_options); |
203
|
20
|
|
|
|
|
46
|
$output[-1] = _canonise_chars($output[-1], $trailing_options); |
204
|
20
|
|
|
|
|
63
|
for(my $i = @output - 3; $i != 0; $i--) { |
205
|
40
|
|
|
|
|
80
|
$output[$i] = |
206
|
|
|
|
|
|
|
_canonise_chars($output[$i], $intermediate_options); |
207
|
|
|
|
|
|
|
} |
208
|
20
|
|
|
|
|
130
|
return \@output; |
209
|
|
|
|
|
|
|
} |
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
sub xml_c_canonise_chars($$) { |
212
|
60
|
100
|
|
60
|
1
|
45447
|
if(is_ref($_[0], "ARRAY")) { |
213
|
30
|
|
|
|
|
108
|
check_xml_content_twine($_[0]); |
214
|
30
|
|
|
|
|
740
|
return xml_content_twine(&_canonise_chars_twine); |
215
|
|
|
|
|
|
|
} else { |
216
|
30
|
|
|
|
|
322
|
return xml_content_object(_canonise_chars_twine( |
217
|
|
|
|
|
|
|
xml_c_content_twine($_[0]), $_[1])); |
218
|
|
|
|
|
|
|
} |
219
|
|
|
|
|
|
|
} |
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
*xc_charcanon = \&xml_c_canonise_chars; |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
=item xml_c_subelements(CONTENT, ALLOW_WSP) |
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
=item xc_subelems(CONTENT, ALLOW_WSP) |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
This function is intended to help in parsing XML data, in situations |
228
|
|
|
|
|
|
|
where the schema calls for an element to contain only subelements, |
229
|
|
|
|
|
|
|
possibly with optional whitespace around and between them. |
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
I must be a reference to either an L object |
232
|
|
|
|
|
|
|
or a twine array. The function checks whether the content includes |
233
|
|
|
|
|
|
|
any unpermitted characters at the top level, and Cs if it does. |
234
|
|
|
|
|
|
|
If the content is of permitted form, the function returns a reference |
235
|
|
|
|
|
|
|
to an array listing all the subelements. |
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
I is a truth value controlling whether whitespace is permitted |
238
|
|
|
|
|
|
|
around and between the subelements. The characters recognised as |
239
|
|
|
|
|
|
|
whitespace are the same as those for XML syntax. Allowing whitespace in |
240
|
|
|
|
|
|
|
this way is easier (and slightly more efficient) than first filtering |
241
|
|
|
|
|
|
|
it out via L. Non-whitespace characters are |
242
|
|
|
|
|
|
|
never permitted. |
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
=cut |
245
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
sub xml_c_subelements($$) { |
247
|
58
|
|
|
58
|
1
|
51803
|
my($content, $allow_wsp) = @_; |
248
|
58
|
|
|
|
|
353
|
$content = xml_c_content_twine($content); |
249
|
56
|
|
|
|
|
1082
|
my $clen = @$content; |
250
|
56
|
|
|
|
|
192
|
for(my $i = $clen-1; $i >= 0; $i -= 2) { |
251
|
112
|
100
|
|
|
|
375
|
if($allow_wsp) { |
252
|
62
|
100
|
|
|
|
381
|
_throw_schema_error("non-whitespace characters ". |
253
|
|
|
|
|
|
|
"where not permitted") |
254
|
|
|
|
|
|
|
unless $content->[$i] =~ /\A$xml10_s_rx?\z/o; |
255
|
|
|
|
|
|
|
} else { |
256
|
50
|
100
|
|
|
|
356
|
_throw_schema_error("characters where not permitted") |
257
|
|
|
|
|
|
|
unless $content->[$i] eq ""; |
258
|
|
|
|
|
|
|
} |
259
|
|
|
|
|
|
|
} |
260
|
22
|
|
|
|
|
207
|
my @subelem; |
261
|
22
|
|
|
|
|
225
|
for(my $i = 1; $i < $clen; $i += 2) { |
262
|
28
|
|
|
|
|
70
|
push @subelem, $content->[$i]; |
263
|
|
|
|
|
|
|
} |
264
|
22
|
|
|
|
|
284
|
return \@subelem; |
265
|
|
|
|
|
|
|
} |
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
*xc_subelems = \&xml_c_subelements; |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
=item xml_c_chardata(CONTENT) |
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
=item xc_chars(CONTENT) |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
This function is intended to help in parsing XML data, in situations |
274
|
|
|
|
|
|
|
where the schema calls for an element to contain only character data. |
275
|
|
|
|
|
|
|
I must be a reference to either an L object |
276
|
|
|
|
|
|
|
or a twine array. The function Cs if it contains any subelements. |
277
|
|
|
|
|
|
|
If the content is of permitted form, the function returns a string |
278
|
|
|
|
|
|
|
containing all the character content. |
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
=cut |
281
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
sub xml_c_chardata($) { |
283
|
26
|
|
|
26
|
1
|
15570
|
my($content) = @_; |
284
|
26
|
|
|
|
|
75
|
$content = xml_c_content_twine($content); |
285
|
24
|
100
|
|
|
|
322
|
_throw_schema_error("subelement where not permitted") |
286
|
|
|
|
|
|
|
unless @$content == 1; |
287
|
14
|
|
|
|
|
59
|
return $content->[0]; |
288
|
|
|
|
|
|
|
} |
289
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
*xc_chars = \&xml_c_chardata; |
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
=back |
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
=head1 SEE ALSO |
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
L |
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
=head1 AUTHOR |
299
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
Andrew Main (Zefram) |
301
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
=head1 COPYRIGHT |
303
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
Copyright (C) 2010 PhotoBox Ltd |
305
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
Copyright (C) 2011 Andrew Main (Zefram) |
307
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
=head1 LICENSE |
309
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
This module is free software; you can redistribute it and/or modify it |
311
|
|
|
|
|
|
|
under the same terms as Perl itself. |
312
|
|
|
|
|
|
|
|
313
|
|
|
|
|
|
|
=cut |
314
|
|
|
|
|
|
|
|
315
|
|
|
|
|
|
|
1; |