File Coverage

blib/lib/XML/Easy/SimpleSchemaUtil.pm

Criterion	Covered	Total	%
statement	72	77	93.5
branch	35	38	92.1
condition			n/a
subroutine	15	16	93.7
pod	4	4	100.0
total	126	135	93.3

line	stmt	bran	sub	pod	time	code
1						=head1 NAME
2
3						XML::Easy::SimpleSchemaUtil - help with simple kinds of XML schema
4
5						=head1 SYNOPSIS
6
7						use XML::Easy::SimpleSchemaUtil qw(
8						xml_s_canonise_chars xml_c_canonise_chars
9						xml_c_subelements xml_c_chardata);
10
11						$chardata = xml_s_canonise_chars($chardata);
12						$content = xml_c_canonise_chars($content);
13						$subelements = xml_c_subelements($content);
14						$chars = xml_c_chardata($content);
15
16						=head1 DESCRIPTION
17
18						The rules by which some class of thing is encoded in XML constitute a
19						schema. (A schema does not need to be codified in a formal language such
20						as Schematron: a natural-language specification can also be a schema.
21						Even if there is no explicit specification at all, the behaviour of
22						the interoperating processors of related XML documents constitutes a de
23						facto schema.) Certain kinds of rule are commonly used in all manner
24						of schemata. This module supplies functions that help to implement such
25						common kinds of rule, regardless of how a schema is specified.
26
27						This module processes XML data in the form used by L,
28						consisting of C and C objects
29						and twine arrays. In this form, character data are stored fully decoded,
30						so they can be manipulated with no knowledge of XML syntax.
31
32						=cut
33
34						package XML::Easy::SimpleSchemaUtil;
35
36	4		4		243108	{ use 5.006; }
	4				13
37	4		4		22	use warnings;
	4				7
	4				111
38	4		4		21	use strict;
	4				13
	4				113
39
40	4		4		292	use Params::Classify 0.000 qw(is_ref);
	4				2166
	4				207
41	4		4		271	use XML::Easy::Classify 0.006 qw(check_xml_chardata check_xml_content_twine);
	4				16277
	4				204
42						use XML::Easy::NodeBasics 0.007
43	4		4		291	qw(xml_content_object xml_content_twine xml_c_content_twine);
	4				3537
	4				216
44	4		4		25	use XML::Easy::Syntax 0.000 qw($xml10_s_rx);
	4				45
	4				461
45
46						our $VERSION = "0.003";
47
48	4		4		26	use parent "Exporter";
	4				11
	4				25
49						our @EXPORT_OK = qw(
50						xml_s_canonise_chars xs_charcanon xml_c_canonise_chars xc_charcanon
51						xml_c_subelements xc_subelems xml_c_chardata xc_chars
52						);
53
54						sub _throw_data_error($) {
55	0		0		0	my($msg) = @_;
56	0				0	die "invalid XML data: $msg\n";
57						}
58
59						sub _throw_schema_error($) {
60	44		44		92	my($msg) = @_;
61	44				272	die "XML schema error: $msg\n";
62						}
63
64						=head1 FUNCTIONS
65
66						Each function has two names. There is a longer descriptive name, and
67						a shorter name to spare screen space and the programmer's fingers.
68
69						=over
70
71						=item xml_s_canonise_chars(STRING, OPTIONS)
72
73						=item xs_charcanon(STRING, OPTIONS)
74
75						This function is intended to help in parsing XML data, in situations
76						where the schema states that some aspects of characters are not
77						entirely significant. I must be a plain Perl string consisting
78						of character data that is valid for XML. The function examines the
79						characters, processes them as specified in the I, and returns
80						a modified version of the string.
81						I must be a reference to a hash, in which the permitted keys are:
82
83						=over
84
85						=item B
86
87						=item B
88
89						=item B
90
91						Controls handling of sequences of whitespace characters. The three
92						keys control, respectively, whitespace at the beginning of the string,
93						whitespace that is at neither the beginning nor the end, and whitespace at
94						the end of the string. If the entire content of the string is whitespace,
95						it is treated as both leading and trailing.
96
97						The whitespace characters, for this purpose, are tab, linefeed/newline,
98						carriage return, and space. This is the same set of characters that
99						are whitespace for the purposes of the XML syntax.
100
101						The value for each key may be:
102
103						=over
104
105						=item B
106
107						Completely remove the whitespace. For situations where the whitespace is
108						of no significance at all. (Common for leading and trailing whitespace,
109						but rare for intermediate whitespace.)
110
111						=item B
112
113						Replace the whitespace sequence with a single space character.
114						For situations where the presence of whitespace is significant but the
115						length and type are not. (Common for intermediate whitespace.)
116
117						=item B (default)
118
119						Leave the whitespace unchanged. For situations where the exact type of
120						whitespace is significant.
121
122						=back
123
124						=back
125
126						=cut
127
128						sub _canonise_chars($$) {
129	140		140		263	my($string, $options) = @_;
130						my $leading_wsp = exists($options->{leading_wsp}) ?
131	140	100			290	$options->{leading_wsp} : "PRESERVE";
132	140	100			360	if($leading_wsp eq "DELETE") {
		100
		50
133	14				89	$string =~ s/\A$xml10_s_rx//o;
134						} elsif($leading_wsp eq "COMPRESS") {
135	14				74	$string =~ s/\A$xml10_s_rx/ /o;
136						} elsif($leading_wsp ne "PRESERVE") {
137	0				0	_throw_data_error("bad character canonicalisation option");
138						}
139						my $intermediate_wsp = exists($options->{intermediate_wsp}) ?
140	140	100			270	$options->{intermediate_wsp} : "PRESERVE";
141	140	100			304	if($intermediate_wsp eq "DELETE") {
		100
		50
142	14				156	$string =~ s/(?!$xml10_s_rx)(.)$xml10_s_rx(?!$xml10_s_rx\|\z)
143						/$1/xsog;
144						} elsif($intermediate_wsp eq "COMPRESS") {
145	14				167	$string =~ s/(?!$xml10_s_rx)(.)$xml10_s_rx(?!$xml10_s_rx\|\z)
146						/$1 /xsog;
147						} elsif($intermediate_wsp ne "PRESERVE") {
148	0				0	_throw_data_error("bad character canonicalisation option");
149						}
150						my $trailing_wsp = exists($options->{trailing_wsp}) ?
151	140	100			278	$options->{trailing_wsp} : "PRESERVE";
152	140	100			366	if($trailing_wsp eq "DELETE") {
		100
		50
153	14				102	$string =~ s/$xml10_s_rx\z//o;
154						} elsif($trailing_wsp eq "COMPRESS") {
155	14				93	$string =~ s/$xml10_s_rx\z/ /o;
156						} elsif($trailing_wsp ne "PRESERVE") {
157	0				0	_throw_data_error("bad character canonicalisation option");
158						}
159	140				520	return $string;
160						}
161
162						sub xml_s_canonise_chars($$) {
163	22		22	1	15421	check_xml_chardata($_[0]);
164	20				189	return &_canonise_chars;
165						}
166
167						*xs_charcanon = \&xml_s_canonise_chars;
168
169						=item xml_c_canonise_chars(CONTENT, OPTIONS)
170
171						=item xc_charcanon(CONTENT, OPTIONS)
172
173						This function is intended to help in parsing XML data, in situations
174						where the schema states that some aspects of characters are not
175						entirely significant. I must be a reference to either an
176						L object or a twine array. The function processes its
177						top-level character content in the same way as L,
178						and returns the resulting modified version of the content in the same
179						form that the input supplied.
180
181						Any element inside the content chunk acts like a special character that
182						will not be modified. It interrupts any character sequence of interest.
183						Elements are not processed recursively: they are treated as atomic.
184
185						=cut
186
187						sub _canonise_chars_twine($$) {
188	60		60		496	my($twine, $options) = @_;
189	60	100			165	return [ _canonise_chars($twine->[0], $options) ]
190						if @$twine == 1;
191	20				60	my $leading_options = {%$options};
192	20				45	my $intermediate_options = {%$options};
193	20				36	my $trailing_options = {%$options};
194						$leading_options->{trailing_wsp} =
195						$intermediate_options->{leading_wsp} =
196						$intermediate_options->{trailing_wsp} =
197						$trailing_options->{leading_wsp} =
198						exists($options->{intermediate_wsp}) ?
199	20	100			59	$options->{intermediate_wsp} : "PRESERVE";
200	20				51	my @output = @$twine;
201	20				41	$output[0] = _canonise_chars($output[0], $leading_options);
202	20				76	$output[-1] = _canonise_chars($output[-1], $trailing_options);
203	20				57	for(my $i = @output - 3; $i != 0; $i--) {
204	40				72	$output[$i] =
205						_canonise_chars($output[$i], $intermediate_options);
206						}
207	20				103	return \@output;
208						}
209
210						sub xml_c_canonise_chars($$) {
211	60	100	60	1	27473	if(is_ref($_[0], "ARRAY")) {
212	30				105	check_xml_content_twine($_[0]);
213	30				710	return xml_content_twine(&_canonise_chars_twine);
214						} else {
215	30				88	return xml_content_object(_canonise_chars_twine(
216						xml_c_content_twine($_[0]), $_[1]));
217						}
218						}
219
220						*xc_charcanon = \&xml_c_canonise_chars;
221
222						=item xml_c_subelements(CONTENT, ALLOW_WSP)
223
224						=item xc_subelems(CONTENT, ALLOW_WSP)
225
226						This function is intended to help in parsing XML data, in situations
227						where the schema calls for an element to contain only subelements,
228						possibly with optional whitespace around and between them.
229
230						I must be a reference to either an L object
231						or a twine array. The function checks whether the content includes
232						any unpermitted characters at the top level, and Cs if it does.
233						If the content is of permitted form, the function returns a reference
234						to an array listing all the subelements.
235
236						I is a truth value controlling whether whitespace is permitted
237						around and between the subelements. The characters recognised as
238						whitespace are the same as those for XML syntax. Allowing whitespace in
239						this way is easier (and slightly more efficient) than first filtering
240						it out via L. Non-whitespace characters are
241						never permitted.
242
243						=cut
244
245						sub xml_c_subelements($$) {
246	58		58	1	33735	my($content, $allow_wsp) = @_;
247	58				187	$content = xml_c_content_twine($content);
248	56				876	my $clen = @$content;
249	56				216	for(my $i = $clen-1; $i >= 0; $i -= 2) {
250	112	100			233	if($allow_wsp) {
251	62	100			350	_throw_schema_error("non-whitespace characters ".
252						"where not permitted")
253						unless $content->[$i] =~ /\A$xml10_s_rx?\z/o;
254						} else {
255	50	100			174	_throw_schema_error("characters where not permitted")
256						unless $content->[$i] eq "";
257						}
258						}
259	22				48	my @subelem;
260	22				60	for(my $i = 1; $i < $clen; $i += 2) {
261	28				79	push @subelem, $content->[$i];
262						}
263	22				112	return \@subelem;
264						}
265
266						*xc_subelems = \&xml_c_subelements;
267
268						=item xml_c_chardata(CONTENT)
269
270						=item xc_chars(CONTENT)
271
272						This function is intended to help in parsing XML data, in situations
273						where the schema calls for an element to contain only character data.
274						I must be a reference to either an L object
275						or a twine array. The function Cs if it contains any subelements.
276						If the content is of permitted form, the function returns a string
277						containing all the character content.
278
279						=cut
280
281						sub xml_c_chardata($) {
282	26		26	1	13033	my($content) = @_;
283	26				58	$content = xml_c_content_twine($content);
284	24	100			289	_throw_schema_error("subelement where not permitted")
285						unless @$content == 1;
286	14				45	return $content->[0];
287						}
288
289						*xc_chars = \&xml_c_chardata;
290
291						=back
292
293						=head1 SEE ALSO
294
295						L
296
297						=head1 AUTHOR
298
299						Andrew Main (Zefram)
300
301						=head1 COPYRIGHT
302
303						Copyright (C) 2010 PhotoBox Ltd
304
305						Copyright (C) 2011, 2017 Andrew Main (Zefram)
306
307						=head1 LICENSE
308
309						This module is free software; you can redistribute it and/or modify it
310						under the same terms as Perl itself.
311
312						=cut
313
314						1;