File Coverage

blib/lib/XML/Easy/SimpleSchemaUtil.pm

Criterion	Covered	Total	%
statement	73	78	93.5
branch	35	38	92.1
condition			n/a
subroutine	15	16	93.7
pod	4	4	100.0
total	127	136	93.3

line	stmt	bran	sub	pod	time	code
1						=head1 NAME
2
3						XML::Easy::SimpleSchemaUtil - help with simple kinds of XML schema
4
5						=head1 SYNOPSIS
6
7						use XML::Easy::SimpleSchemaUtil qw(
8						xml_s_canonise_chars xml_c_canonise_chars
9						xml_c_subelements xml_c_chardata
10						);
11
12						$chardata = xml_s_canonise_chars($chardata);
13						$content = xml_c_canonise_chars($content);
14						$subelements = xml_c_subelements($content);
15						$chars = xml_c_chardata($content);
16
17						=head1 DESCRIPTION
18
19						The rules by which some class of thing is encoded in XML constitute a
20						schema. (A schema does not need to be codified in a formal language such
21						as Schematron: a natural-language specification can also be a schema.
22						Even if there is no explicit specification at all, the behaviour of
23						the interoperating processors of related XML documents constitutes a de
24						facto schema.) Certain kinds of rule are commonly used in all manner
25						of schemata. This module supplies functions that help to implement such
26						common kinds of rule, regardless of how a schema is specified.
27
28						This module processes XML data in the form used by L,
29						consisting of C and C objects
30						and twine arrays. In this form, character data are stored fully decoded,
31						so they can be manipulated with no knowledge of XML syntax.
32
33						=cut
34
35						package XML::Easy::SimpleSchemaUtil;
36
37	4		4		202279	{ use 5.006; }
	4				15
	4				195
38	4		4		24	use warnings;
	4				7
	4				138
39	4		4		22	use strict;
	4				18
	4				189
40
41	4		4		1234	use Params::Classify 0.000 qw(is_ref);
	4				2803
	4				259
42	4		4		833	use XML::Easy::Classify 0.006 qw(check_xml_chardata check_xml_content_twine);
	4				42368
	4				291
43						use XML::Easy::NodeBasics 0.007
44	4		4		969	qw(xml_content_object xml_content_twine xml_c_content_twine);
	4				6287
	4				291
45	4		4		27	use XML::Easy::Syntax 0.000 qw($xml10_s_rx);
	4				67
	4				601
46
47						our $VERSION = "0.002";
48
49	4		4		23	use parent "Exporter";
	4				6
	4				22
50						our @EXPORT_OK = qw(
51						xml_s_canonise_chars xs_charcanon xml_c_canonise_chars xc_charcanon
52						xml_c_subelements xc_subelems xml_c_chardata xc_chars
53						);
54
55						sub _throw_data_error($) {
56	0		0		0	my($msg) = @_;
57	0				0	die "invalid XML data: $msg\n";
58						}
59
60						sub _throw_schema_error($) {
61	44		44		74	my($msg) = @_;
62	44				318	die "XML schema error: $msg\n";
63						}
64
65						=head1 FUNCTIONS
66
67						Each function has two names. There is a longer descriptive name, and
68						a shorter name to spare screen space and the programmer's fingers.
69
70						=over
71
72						=item xml_s_canonise_chars(STRING, OPTIONS)
73
74						=item xs_charcanon(STRING, OPTIONS)
75
76						This function is intended to help in parsing XML data, in situations
77						where the schema states that some aspects of characters are not
78						entirely significant. I must be a plain Perl string consisting
79						of character data that is valid for XML. The function examines the
80						characters, processes them as specified in the I, and returns
81						a modified version of the string.
82						I must be a reference to a hash, in which the permitted keys are:
83
84						=over
85
86						=item B
87
88						=item B
89
90						=item B
91
92						Controls handling of sequences of whitespace characters. The three
93						keys control, respectively, whitespace at the beginning of the string,
94						whitespace that is at neither the beginning nor the end, and whitespace at
95						the end of the string. If the entire content of the string is whitespace,
96						it is treated as both leading and trailing.
97
98						The whitespace characters, for this purpose, are tab, linefeed/newline,
99						carriage return, and space. This is the same set of characters that
100						are whitespace for the purposes of the XML syntax.
101
102						The value for each key may be:
103
104						=over
105
106						=item B
107
108						Completely remove the whitespace. For situations where the whitespace is
109						of no significance at all. (Common for leading and trailing whitespace,
110						but rare for intermediate whitespace.)
111
112						=item B
113
114						Replace the whitespace sequence with a single space character.
115						For situations where the presence of whitespace is significant but the
116						length and type are not. (Common for intermediate whitespace.)
117
118						=item B (default)
119
120						Leave the whitespace unchanged. For situations where the exact type of
121						whitespace is significant.
122
123						=back
124
125						=back
126
127						=cut
128
129						sub _canonise_chars($$) {
130	140		140		260	my($string, $options) = @_;
131	140	100			565	my $leading_wsp = exists($options->{leading_wsp}) ?
132						$options->{leading_wsp} : "PRESERVE";
133	140	100			440	if($leading_wsp eq "DELETE") {
		100
		50
134	14				96	$string =~ s/\A$xml10_s_rx//o;
135						} elsif($leading_wsp eq "COMPRESS") {
136	14				91	$string =~ s/\A$xml10_s_rx/ /o;
137						} elsif($leading_wsp ne "PRESERVE") {
138	0				0	_throw_data_error("bad character canonicalisation option");
139						}
140	140	100			292	my $intermediate_wsp = exists($options->{intermediate_wsp}) ?
141						$options->{intermediate_wsp} : "PRESERVE";
142	140	100			741	if($intermediate_wsp eq "DELETE") {
		100
		50
143	14				188	$string =~ s/(?!$xml10_s_rx)(.)$xml10_s_rx(?!$xml10_s_rx\|\z)
144						/$1/xsog;
145						} elsif($intermediate_wsp eq "COMPRESS") {
146	14				284	$string =~ s/(?!$xml10_s_rx)(.)$xml10_s_rx(?!$xml10_s_rx\|\z)
147						/$1 /xsog;
148						} elsif($intermediate_wsp ne "PRESERVE") {
149	0				0	_throw_data_error("bad character canonicalisation option");
150						}
151	140	100			333	my $trailing_wsp = exists($options->{trailing_wsp}) ?
152						$options->{trailing_wsp} : "PRESERVE";
153	140	100			1610	if($trailing_wsp eq "DELETE") {
		100
		50
154	14				104	$string =~ s/$xml10_s_rx\z//o;
155						} elsif($trailing_wsp eq "COMPRESS") {
156	14				108	$string =~ s/$xml10_s_rx\z/ /o;
157						} elsif($trailing_wsp ne "PRESERVE") {
158	0				0	_throw_data_error("bad character canonicalisation option");
159						}
160	140				585	return $string;
161						}
162
163						sub xml_s_canonise_chars($$) {
164	22		22	1	26896	check_xml_chardata($_[0]);
165	20				162	return &_canonise_chars;
166						}
167
168						*xs_charcanon = \&xml_s_canonise_chars;
169
170						=item xml_c_canonise_chars(CONTENT, OPTIONS)
171
172						=item xc_charcanon(CONTENT, OPTIONS)
173
174						This function is intended to help in parsing XML data, in situations
175						where the schema states that some aspects of characters are not
176						entirely significant. I must be a reference to either an
177						L object or a twine array. The function processes its
178						top-level character content in the same way as L,
179						and returns the resulting modified version of the content in the same
180						form that the input supplied.
181
182						Any element inside the content chunk acts like a special character that
183						will not be modified. It interrupts any character sequence of interest.
184						Elements are not processed recursively: they are treated as atomic.
185
186						=cut
187
188						sub _canonise_chars_twine($$) {
189	60		60		624	my($twine, $options) = @_;
190	60	100			252	return [ _canonise_chars($twine->[0], $options) ]
191						if @$twine == 1;
192	20				82	my $leading_options = {%$options};
193	20				57	my $intermediate_options = {%$options};
194	20				42	my $trailing_options = {%$options};
195	20	100			91	$leading_options->{trailing_wsp} =
196						$intermediate_options->{leading_wsp} =
197						$intermediate_options->{trailing_wsp} =
198						$trailing_options->{leading_wsp} =
199						exists($options->{intermediate_wsp}) ?
200						$options->{intermediate_wsp} : "PRESERVE";
201	20				65	my @output = @$twine;
202	20				48	$output[0] = _canonise_chars($output[0], $leading_options);
203	20				46	$output[-1] = _canonise_chars($output[-1], $trailing_options);
204	20				63	for(my $i = @output - 3; $i != 0; $i--) {
205	40				80	$output[$i] =
206						_canonise_chars($output[$i], $intermediate_options);
207						}
208	20				130	return \@output;
209						}
210
211						sub xml_c_canonise_chars($$) {
212	60	100	60	1	45447	if(is_ref($_[0], "ARRAY")) {
213	30				108	check_xml_content_twine($_[0]);
214	30				740	return xml_content_twine(&_canonise_chars_twine);
215						} else {
216	30				322	return xml_content_object(_canonise_chars_twine(
217						xml_c_content_twine($_[0]), $_[1]));
218						}
219						}
220
221						*xc_charcanon = \&xml_c_canonise_chars;
222
223						=item xml_c_subelements(CONTENT, ALLOW_WSP)
224
225						=item xc_subelems(CONTENT, ALLOW_WSP)
226
227						This function is intended to help in parsing XML data, in situations
228						where the schema calls for an element to contain only subelements,
229						possibly with optional whitespace around and between them.
230
231						I must be a reference to either an L object
232						or a twine array. The function checks whether the content includes
233						any unpermitted characters at the top level, and Cs if it does.
234						If the content is of permitted form, the function returns a reference
235						to an array listing all the subelements.
236
237						I is a truth value controlling whether whitespace is permitted
238						around and between the subelements. The characters recognised as
239						whitespace are the same as those for XML syntax. Allowing whitespace in
240						this way is easier (and slightly more efficient) than first filtering
241						it out via L. Non-whitespace characters are
242						never permitted.
243
244						=cut
245
246						sub xml_c_subelements($$) {
247	58		58	1	51803	my($content, $allow_wsp) = @_;
248	58				353	$content = xml_c_content_twine($content);
249	56				1082	my $clen = @$content;
250	56				192	for(my $i = $clen-1; $i >= 0; $i -= 2) {
251	112	100			375	if($allow_wsp) {
252	62	100			381	_throw_schema_error("non-whitespace characters ".
253						"where not permitted")
254						unless $content->[$i] =~ /\A$xml10_s_rx?\z/o;
255						} else {
256	50	100			356	_throw_schema_error("characters where not permitted")
257						unless $content->[$i] eq "";
258						}
259						}
260	22				207	my @subelem;
261	22				225	for(my $i = 1; $i < $clen; $i += 2) {
262	28				70	push @subelem, $content->[$i];
263						}
264	22				284	return \@subelem;
265						}
266
267						*xc_subelems = \&xml_c_subelements;
268
269						=item xml_c_chardata(CONTENT)
270
271						=item xc_chars(CONTENT)
272
273						This function is intended to help in parsing XML data, in situations
274						where the schema calls for an element to contain only character data.
275						I must be a reference to either an L object
276						or a twine array. The function Cs if it contains any subelements.
277						If the content is of permitted form, the function returns a string
278						containing all the character content.
279
280						=cut
281
282						sub xml_c_chardata($) {
283	26		26	1	15570	my($content) = @_;
284	26				75	$content = xml_c_content_twine($content);
285	24	100			322	_throw_schema_error("subelement where not permitted")
286						unless @$content == 1;
287	14				59	return $content->[0];
288						}
289
290						*xc_chars = \&xml_c_chardata;
291
292						=back
293
294						=head1 SEE ALSO
295
296						L
297
298						=head1 AUTHOR
299
300						Andrew Main (Zefram)
301
302						=head1 COPYRIGHT
303
304						Copyright (C) 2010 PhotoBox Ltd
305
306						Copyright (C) 2011 Andrew Main (Zefram)
307
308						=head1 LICENSE
309
310						This module is free software; you can redistribute it and/or modify it
311						under the same terms as Perl itself.
312
313						=cut
314
315						1;