line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
=head1 NAME |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
XML::Easy::SimpleSchemaUtil - help with simple kinds of XML schema |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
=head1 SYNOPSIS |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
use XML::Easy::SimpleSchemaUtil qw( |
8
|
|
|
|
|
|
|
xml_s_canonise_chars xml_c_canonise_chars |
9
|
|
|
|
|
|
|
xml_c_subelements xml_c_chardata); |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
$chardata = xml_s_canonise_chars($chardata); |
12
|
|
|
|
|
|
|
$content = xml_c_canonise_chars($content); |
13
|
|
|
|
|
|
|
$subelements = xml_c_subelements($content); |
14
|
|
|
|
|
|
|
$chars = xml_c_chardata($content); |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
=head1 DESCRIPTION |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
The rules by which some class of thing is encoded in XML constitute a |
19
|
|
|
|
|
|
|
schema. (A schema does not need to be codified in a formal language such |
20
|
|
|
|
|
|
|
as Schematron: a natural-language specification can also be a schema. |
21
|
|
|
|
|
|
|
Even if there is no explicit specification at all, the behaviour of |
22
|
|
|
|
|
|
|
the interoperating processors of related XML documents constitutes a de |
23
|
|
|
|
|
|
|
facto schema.) Certain kinds of rule are commonly used in all manner |
24
|
|
|
|
|
|
|
of schemata. This module supplies functions that help to implement such |
25
|
|
|
|
|
|
|
common kinds of rule, regardless of how a schema is specified. |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
This module processes XML data in the form used by L, |
28
|
|
|
|
|
|
|
consisting of C and C objects |
29
|
|
|
|
|
|
|
and twine arrays. In this form, character data are stored fully decoded, |
30
|
|
|
|
|
|
|
so they can be manipulated with no knowledge of XML syntax. |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
=cut |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
package XML::Easy::SimpleSchemaUtil; |
35
|
|
|
|
|
|
|
|
36
|
4
|
|
|
4
|
|
243108
|
{ use 5.006; } |
|
4
|
|
|
|
|
13
|
|
37
|
4
|
|
|
4
|
|
22
|
use warnings; |
|
4
|
|
|
|
|
7
|
|
|
4
|
|
|
|
|
111
|
|
38
|
4
|
|
|
4
|
|
21
|
use strict; |
|
4
|
|
|
|
|
13
|
|
|
4
|
|
|
|
|
113
|
|
39
|
|
|
|
|
|
|
|
40
|
4
|
|
|
4
|
|
292
|
use Params::Classify 0.000 qw(is_ref); |
|
4
|
|
|
|
|
2166
|
|
|
4
|
|
|
|
|
207
|
|
41
|
4
|
|
|
4
|
|
271
|
use XML::Easy::Classify 0.006 qw(check_xml_chardata check_xml_content_twine); |
|
4
|
|
|
|
|
16277
|
|
|
4
|
|
|
|
|
204
|
|
42
|
|
|
|
|
|
|
use XML::Easy::NodeBasics 0.007 |
43
|
4
|
|
|
4
|
|
291
|
qw(xml_content_object xml_content_twine xml_c_content_twine); |
|
4
|
|
|
|
|
3537
|
|
|
4
|
|
|
|
|
216
|
|
44
|
4
|
|
|
4
|
|
25
|
use XML::Easy::Syntax 0.000 qw($xml10_s_rx); |
|
4
|
|
|
|
|
45
|
|
|
4
|
|
|
|
|
461
|
|
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
our $VERSION = "0.003"; |
47
|
|
|
|
|
|
|
|
48
|
4
|
|
|
4
|
|
26
|
use parent "Exporter"; |
|
4
|
|
|
|
|
11
|
|
|
4
|
|
|
|
|
25
|
|
49
|
|
|
|
|
|
|
our @EXPORT_OK = qw( |
50
|
|
|
|
|
|
|
xml_s_canonise_chars xs_charcanon xml_c_canonise_chars xc_charcanon |
51
|
|
|
|
|
|
|
xml_c_subelements xc_subelems xml_c_chardata xc_chars |
52
|
|
|
|
|
|
|
); |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
sub _throw_data_error($) { |
55
|
0
|
|
|
0
|
|
0
|
my($msg) = @_; |
56
|
0
|
|
|
|
|
0
|
die "invalid XML data: $msg\n"; |
57
|
|
|
|
|
|
|
} |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
sub _throw_schema_error($) { |
60
|
44
|
|
|
44
|
|
92
|
my($msg) = @_; |
61
|
44
|
|
|
|
|
272
|
die "XML schema error: $msg\n"; |
62
|
|
|
|
|
|
|
} |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
=head1 FUNCTIONS |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
Each function has two names. There is a longer descriptive name, and |
67
|
|
|
|
|
|
|
a shorter name to spare screen space and the programmer's fingers. |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
=over |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
=item xml_s_canonise_chars(STRING, OPTIONS) |
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
=item xs_charcanon(STRING, OPTIONS) |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
This function is intended to help in parsing XML data, in situations |
76
|
|
|
|
|
|
|
where the schema states that some aspects of characters are not |
77
|
|
|
|
|
|
|
entirely significant. I must be a plain Perl string consisting |
78
|
|
|
|
|
|
|
of character data that is valid for XML. The function examines the |
79
|
|
|
|
|
|
|
characters, processes them as specified in the I, and returns |
80
|
|
|
|
|
|
|
a modified version of the string. |
81
|
|
|
|
|
|
|
I must be a reference to a hash, in which the permitted keys are: |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
=over |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
=item B |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
=item B |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
=item B |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
Controls handling of sequences of whitespace characters. The three |
92
|
|
|
|
|
|
|
keys control, respectively, whitespace at the beginning of the string, |
93
|
|
|
|
|
|
|
whitespace that is at neither the beginning nor the end, and whitespace at |
94
|
|
|
|
|
|
|
the end of the string. If the entire content of the string is whitespace, |
95
|
|
|
|
|
|
|
it is treated as both leading and trailing. |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
The whitespace characters, for this purpose, are tab, linefeed/newline, |
98
|
|
|
|
|
|
|
carriage return, and space. This is the same set of characters that |
99
|
|
|
|
|
|
|
are whitespace for the purposes of the XML syntax. |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
The value for each key may be: |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
=over |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
=item B |
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
Completely remove the whitespace. For situations where the whitespace is |
108
|
|
|
|
|
|
|
of no significance at all. (Common for leading and trailing whitespace, |
109
|
|
|
|
|
|
|
but rare for intermediate whitespace.) |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
=item B |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
Replace the whitespace sequence with a single space character. |
114
|
|
|
|
|
|
|
For situations where the presence of whitespace is significant but the |
115
|
|
|
|
|
|
|
length and type are not. (Common for intermediate whitespace.) |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
=item B (default) |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
Leave the whitespace unchanged. For situations where the exact type of |
120
|
|
|
|
|
|
|
whitespace is significant. |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
=back |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
=back |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
=cut |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
sub _canonise_chars($$) { |
129
|
140
|
|
|
140
|
|
263
|
my($string, $options) = @_; |
130
|
|
|
|
|
|
|
my $leading_wsp = exists($options->{leading_wsp}) ? |
131
|
140
|
100
|
|
|
|
290
|
$options->{leading_wsp} : "PRESERVE"; |
132
|
140
|
100
|
|
|
|
360
|
if($leading_wsp eq "DELETE") { |
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
133
|
14
|
|
|
|
|
89
|
$string =~ s/\A$xml10_s_rx//o; |
134
|
|
|
|
|
|
|
} elsif($leading_wsp eq "COMPRESS") { |
135
|
14
|
|
|
|
|
74
|
$string =~ s/\A$xml10_s_rx/ /o; |
136
|
|
|
|
|
|
|
} elsif($leading_wsp ne "PRESERVE") { |
137
|
0
|
|
|
|
|
0
|
_throw_data_error("bad character canonicalisation option"); |
138
|
|
|
|
|
|
|
} |
139
|
|
|
|
|
|
|
my $intermediate_wsp = exists($options->{intermediate_wsp}) ? |
140
|
140
|
100
|
|
|
|
270
|
$options->{intermediate_wsp} : "PRESERVE"; |
141
|
140
|
100
|
|
|
|
304
|
if($intermediate_wsp eq "DELETE") { |
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
142
|
14
|
|
|
|
|
156
|
$string =~ s/(?!$xml10_s_rx)(.)$xml10_s_rx(?!$xml10_s_rx|\z) |
143
|
|
|
|
|
|
|
/$1/xsog; |
144
|
|
|
|
|
|
|
} elsif($intermediate_wsp eq "COMPRESS") { |
145
|
14
|
|
|
|
|
167
|
$string =~ s/(?!$xml10_s_rx)(.)$xml10_s_rx(?!$xml10_s_rx|\z) |
146
|
|
|
|
|
|
|
/$1 /xsog; |
147
|
|
|
|
|
|
|
} elsif($intermediate_wsp ne "PRESERVE") { |
148
|
0
|
|
|
|
|
0
|
_throw_data_error("bad character canonicalisation option"); |
149
|
|
|
|
|
|
|
} |
150
|
|
|
|
|
|
|
my $trailing_wsp = exists($options->{trailing_wsp}) ? |
151
|
140
|
100
|
|
|
|
278
|
$options->{trailing_wsp} : "PRESERVE"; |
152
|
140
|
100
|
|
|
|
366
|
if($trailing_wsp eq "DELETE") { |
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
153
|
14
|
|
|
|
|
102
|
$string =~ s/$xml10_s_rx\z//o; |
154
|
|
|
|
|
|
|
} elsif($trailing_wsp eq "COMPRESS") { |
155
|
14
|
|
|
|
|
93
|
$string =~ s/$xml10_s_rx\z/ /o; |
156
|
|
|
|
|
|
|
} elsif($trailing_wsp ne "PRESERVE") { |
157
|
0
|
|
|
|
|
0
|
_throw_data_error("bad character canonicalisation option"); |
158
|
|
|
|
|
|
|
} |
159
|
140
|
|
|
|
|
520
|
return $string; |
160
|
|
|
|
|
|
|
} |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
sub xml_s_canonise_chars($$) { |
163
|
22
|
|
|
22
|
1
|
15421
|
check_xml_chardata($_[0]); |
164
|
20
|
|
|
|
|
189
|
return &_canonise_chars; |
165
|
|
|
|
|
|
|
} |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
*xs_charcanon = \&xml_s_canonise_chars; |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
=item xml_c_canonise_chars(CONTENT, OPTIONS) |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
=item xc_charcanon(CONTENT, OPTIONS) |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
This function is intended to help in parsing XML data, in situations |
174
|
|
|
|
|
|
|
where the schema states that some aspects of characters are not |
175
|
|
|
|
|
|
|
entirely significant. I must be a reference to either an |
176
|
|
|
|
|
|
|
L object or a twine array. The function processes its |
177
|
|
|
|
|
|
|
top-level character content in the same way as L, |
178
|
|
|
|
|
|
|
and returns the resulting modified version of the content in the same |
179
|
|
|
|
|
|
|
form that the input supplied. |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
Any element inside the content chunk acts like a special character that |
182
|
|
|
|
|
|
|
will not be modified. It interrupts any character sequence of interest. |
183
|
|
|
|
|
|
|
Elements are not processed recursively: they are treated as atomic. |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
=cut |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
sub _canonise_chars_twine($$) { |
188
|
60
|
|
|
60
|
|
496
|
my($twine, $options) = @_; |
189
|
60
|
100
|
|
|
|
165
|
return [ _canonise_chars($twine->[0], $options) ] |
190
|
|
|
|
|
|
|
if @$twine == 1; |
191
|
20
|
|
|
|
|
60
|
my $leading_options = {%$options}; |
192
|
20
|
|
|
|
|
45
|
my $intermediate_options = {%$options}; |
193
|
20
|
|
|
|
|
36
|
my $trailing_options = {%$options}; |
194
|
|
|
|
|
|
|
$leading_options->{trailing_wsp} = |
195
|
|
|
|
|
|
|
$intermediate_options->{leading_wsp} = |
196
|
|
|
|
|
|
|
$intermediate_options->{trailing_wsp} = |
197
|
|
|
|
|
|
|
$trailing_options->{leading_wsp} = |
198
|
|
|
|
|
|
|
exists($options->{intermediate_wsp}) ? |
199
|
20
|
100
|
|
|
|
59
|
$options->{intermediate_wsp} : "PRESERVE"; |
200
|
20
|
|
|
|
|
51
|
my @output = @$twine; |
201
|
20
|
|
|
|
|
41
|
$output[0] = _canonise_chars($output[0], $leading_options); |
202
|
20
|
|
|
|
|
76
|
$output[-1] = _canonise_chars($output[-1], $trailing_options); |
203
|
20
|
|
|
|
|
57
|
for(my $i = @output - 3; $i != 0; $i--) { |
204
|
40
|
|
|
|
|
72
|
$output[$i] = |
205
|
|
|
|
|
|
|
_canonise_chars($output[$i], $intermediate_options); |
206
|
|
|
|
|
|
|
} |
207
|
20
|
|
|
|
|
103
|
return \@output; |
208
|
|
|
|
|
|
|
} |
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
sub xml_c_canonise_chars($$) { |
211
|
60
|
100
|
|
60
|
1
|
27473
|
if(is_ref($_[0], "ARRAY")) { |
212
|
30
|
|
|
|
|
105
|
check_xml_content_twine($_[0]); |
213
|
30
|
|
|
|
|
710
|
return xml_content_twine(&_canonise_chars_twine); |
214
|
|
|
|
|
|
|
} else { |
215
|
30
|
|
|
|
|
88
|
return xml_content_object(_canonise_chars_twine( |
216
|
|
|
|
|
|
|
xml_c_content_twine($_[0]), $_[1])); |
217
|
|
|
|
|
|
|
} |
218
|
|
|
|
|
|
|
} |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
*xc_charcanon = \&xml_c_canonise_chars; |
221
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
=item xml_c_subelements(CONTENT, ALLOW_WSP) |
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
=item xc_subelems(CONTENT, ALLOW_WSP) |
225
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
This function is intended to help in parsing XML data, in situations |
227
|
|
|
|
|
|
|
where the schema calls for an element to contain only subelements, |
228
|
|
|
|
|
|
|
possibly with optional whitespace around and between them. |
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
I must be a reference to either an L object |
231
|
|
|
|
|
|
|
or a twine array. The function checks whether the content includes |
232
|
|
|
|
|
|
|
any unpermitted characters at the top level, and Cs if it does. |
233
|
|
|
|
|
|
|
If the content is of permitted form, the function returns a reference |
234
|
|
|
|
|
|
|
to an array listing all the subelements. |
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
I is a truth value controlling whether whitespace is permitted |
237
|
|
|
|
|
|
|
around and between the subelements. The characters recognised as |
238
|
|
|
|
|
|
|
whitespace are the same as those for XML syntax. Allowing whitespace in |
239
|
|
|
|
|
|
|
this way is easier (and slightly more efficient) than first filtering |
240
|
|
|
|
|
|
|
it out via L. Non-whitespace characters are |
241
|
|
|
|
|
|
|
never permitted. |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
=cut |
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
sub xml_c_subelements($$) { |
246
|
58
|
|
|
58
|
1
|
33735
|
my($content, $allow_wsp) = @_; |
247
|
58
|
|
|
|
|
187
|
$content = xml_c_content_twine($content); |
248
|
56
|
|
|
|
|
876
|
my $clen = @$content; |
249
|
56
|
|
|
|
|
216
|
for(my $i = $clen-1; $i >= 0; $i -= 2) { |
250
|
112
|
100
|
|
|
|
233
|
if($allow_wsp) { |
251
|
62
|
100
|
|
|
|
350
|
_throw_schema_error("non-whitespace characters ". |
252
|
|
|
|
|
|
|
"where not permitted") |
253
|
|
|
|
|
|
|
unless $content->[$i] =~ /\A$xml10_s_rx?\z/o; |
254
|
|
|
|
|
|
|
} else { |
255
|
50
|
100
|
|
|
|
174
|
_throw_schema_error("characters where not permitted") |
256
|
|
|
|
|
|
|
unless $content->[$i] eq ""; |
257
|
|
|
|
|
|
|
} |
258
|
|
|
|
|
|
|
} |
259
|
22
|
|
|
|
|
48
|
my @subelem; |
260
|
22
|
|
|
|
|
60
|
for(my $i = 1; $i < $clen; $i += 2) { |
261
|
28
|
|
|
|
|
79
|
push @subelem, $content->[$i]; |
262
|
|
|
|
|
|
|
} |
263
|
22
|
|
|
|
|
112
|
return \@subelem; |
264
|
|
|
|
|
|
|
} |
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
*xc_subelems = \&xml_c_subelements; |
267
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
=item xml_c_chardata(CONTENT) |
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
=item xc_chars(CONTENT) |
271
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
This function is intended to help in parsing XML data, in situations |
273
|
|
|
|
|
|
|
where the schema calls for an element to contain only character data. |
274
|
|
|
|
|
|
|
I must be a reference to either an L object |
275
|
|
|
|
|
|
|
or a twine array. The function Cs if it contains any subelements. |
276
|
|
|
|
|
|
|
If the content is of permitted form, the function returns a string |
277
|
|
|
|
|
|
|
containing all the character content. |
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
=cut |
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
sub xml_c_chardata($) { |
282
|
26
|
|
|
26
|
1
|
13033
|
my($content) = @_; |
283
|
26
|
|
|
|
|
58
|
$content = xml_c_content_twine($content); |
284
|
24
|
100
|
|
|
|
289
|
_throw_schema_error("subelement where not permitted") |
285
|
|
|
|
|
|
|
unless @$content == 1; |
286
|
14
|
|
|
|
|
45
|
return $content->[0]; |
287
|
|
|
|
|
|
|
} |
288
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
*xc_chars = \&xml_c_chardata; |
290
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
=back |
292
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
=head1 SEE ALSO |
294
|
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
L |
296
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
=head1 AUTHOR |
298
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
Andrew Main (Zefram) |
300
|
|
|
|
|
|
|
|
301
|
|
|
|
|
|
|
=head1 COPYRIGHT |
302
|
|
|
|
|
|
|
|
303
|
|
|
|
|
|
|
Copyright (C) 2010 PhotoBox Ltd |
304
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
Copyright (C) 2011, 2017 Andrew Main (Zefram) |
306
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
=head1 LICENSE |
308
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
This module is free software; you can redistribute it and/or modify it |
310
|
|
|
|
|
|
|
under the same terms as Perl itself. |
311
|
|
|
|
|
|
|
|
312
|
|
|
|
|
|
|
=cut |
313
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
1; |