File Coverage

blib/lib/XML/Invisible.pm

Criterion	Covered	Total	%
statement	100	119	84.0
branch	48	68	70.5
condition	11	16	68.7
subroutine	14	15	93.3
pod	3	3	100.0
total	176	221	79.6

line	stmt	bran	cond	sub	pod	time	code
1							package XML::Invisible;
2
3	2			2		153433	use strict;
	2					18
	2					49
4	2			2		8	use warnings;
	2					3
	2					43
5	2			2		8	use Exporter qw(import);
	2					3
	2					39
6	2			2		719	use Pegex::Grammar;
	2					8230
	2					9
7	2			2		818	use Pegex::Parser;
	2					17042
	2					48
8	2			2		716	use XML::Invisible::Receiver;
	2					5
	2					96
9
10							our $VERSION = '0.07';
11							our @EXPORT_OK = qw(make_parser ast2xml make_canonicaliser);
12
13	2			2		11	use constant DEBUG => $ENV{XML_INVISIBLE_DEBUG};
	2					4
	2					2218
14
15							sub make_parser {
16	9			9	1	7459	my ($grammar) = @_;
17	9	50				73	$grammar = Pegex::Grammar->new(text => $grammar) if !ref $grammar;
18	9					393	my $parser = Pegex::Parser->new(
19							grammar => $grammar,
20							receiver => XML::Invisible::Receiver->new,
21							debug => DEBUG,
22							);
23							sub {
24	15			15		3498	my ($ixml_text) = @_;
25	15					41	$parser->parse($ixml_text);
26	9					883	};
27							}
28
29							sub make_canonicaliser {
30	6			6	1	134	my ($grammar_text) = @_;
31	6					36	require Pegex::Compiler;
32	6					15	require Pegex::Grammar::Atoms;
33	6					18	my $grammar_tree = Pegex::Compiler->new->parse($grammar_text)->tree;
34	6					100331	my $toprule = $grammar_tree->{'+toprule'};
35	6					17	my $atoms = _atoms2canonical(Pegex::Grammar::Atoms->atoms);
36							sub {
37	6			6		13	my ($ast) = @_;
38	6					15	my @results = _extract_canonical($atoms, $ast, $grammar_tree, $toprule);
39	6	50				16	return undef if grep !defined, @results;
40	6					22	join '', @results;
41	6					33	};
42							}
43
44							my %ATOM2SPECIAL = (
45							ALL => "",
46							BLANK => " ",
47							BREAK => "\n",
48							BS => "\x08",
49							CONTROL => "\x00",
50							CR => "\r",
51							DOS => "\r\n",
52							EOL => "\n",
53							EOS => "",
54							FF => "\x0C",
55							HICHAR => "\x7f",
56							NL => "\n",
57							TAB => "\t",
58							WORD => "a",
59							WS => " ",
60							_ => "",
61							__ => " ",
62							ws => "",
63							ws1 => "",
64							ws2 => " ",
65							);
66							sub _atoms2canonical {
67	6			6		23	my ($atoms) = @_;
68	6					8	my %lookup;
69	6					46	for my $atom (keys %$atoms) {
70	462					496	my $c = $atoms->{$atom};
71	462	100				892	if (exists $ATOM2SPECIAL{$atom}) {
		100
		100
72	120					128	$c = $ATOM2SPECIAL{$atom};
73							} elsif ($c =~ s/^\\//) {
74							# all good
75							} elsif ($c =~ s/^\[//) {
76	42					56	$c = substr $c, 0, 1;
77							}
78	462					744	$lookup{$atom} = $c;
79							}
80	6					23	\%lookup;
81							}
82
83							sub _extract_canonical {
84	62			62		94	my ($atoms, $elt, $grammar_tree, $elt_sought, $grammar_frag, $attrs) = @_;
85	62		66			97	$grammar_frag \|\|= $grammar_tree->{$elt_sought};
86	62		100			98	$attrs \|\|= {};
87	62	100				102	$attrs = { %$attrs, %{ $elt->{attributes} \|\| {} } } if ref $elt;
	32	100				87
88	62	100				91	if (defined($elt)) {
89	35	100				59	return $elt if !ref $elt; # just text node - trust here for good reason
90	32	50	66			67	return undef if defined($elt_sought) and $elt_sought ne $elt->{nodename};
91							} else {
92	27	100	100			56	if (defined($elt_sought) and defined(my $value = $attrs->{$elt_sought})) {
93	4					15	return $value;
94							}
95							}
96	55	100				87	if (my $rgx = $grammar_frag->{'.rgx'}) {
97							# RE, so parent of text nodes
98	6	100				10	if (defined $elt) {
99	5	50				10	return join('', @{$elt->{children}}) if $elt->{children};
	5					23
100	0					0	return $elt->{nodename};
101							}
102							# or just a bare regex, which is a literal "canonical" representation
103	1	50				4	return undef if $rgx =~ /^\(/; # out of our league
104	1					4	$rgx =~ s#\\##g;
105	1					3	return $rgx;
106							}
107	49	100				87	if (my $all = $grammar_frag->{'.all'}) {
		100
		50
108							# sequence of productions
109	11					17	my ($childcount, @results) = (0);
110	11					23	for my $i (0..$#$all) {
111	33					48	my $child = $elt->{children}[$childcount];
112	33					34	my $all_frag = $all->[$i];
113	33					35	my $new_elt_sought = undef;
114	33	100				55	if ($all_frag->{'-skip'}) {
		100
115	5					6	$child = undef;
116							} elsif ($all_frag->{'-wrap'}) {
117	1					2	$child = undef;
118	1					2	$new_elt_sought = $all_frag->{'.ref'};
119							} else {
120	27					27	$childcount++;
121							}
122	33					47	my @partial = _extract_canonical(
123							$atoms, $child, $grammar_tree, $new_elt_sought, $all_frag, $attrs,
124							);
125	33	50				71	return undef if grep !defined, @partial; # any non-match
126	33					68	push @results, @partial;
127							}
128	11					36	return @results;
129							} elsif (my $ref = $grammar_frag->{'.ref'}) {
130	37	100				72	return $atoms->{$ref} if exists $atoms->{$ref};
131	22	50	66			53	return undef if defined($elt) and $elt->{nodename} ne $ref;
132	22					26	my $new_frag = $grammar_tree->{$ref};
133	22	100				37	if (my $new_ref = $new_frag->{'.ref'}) {
134	11					12	my $child;
135	11					19	my $new_attrs = { %$attrs };
136	11	50				28	if (!defined($elt)) {
		100
		50
137	0					0	$child = undef;
138							} elsif ($elt->{children}) {
139	8	50				10	return undef if @{$elt->{children}} != 1;
	8					15
140	8					13	$child = $elt->{children}[0];
141							} elsif ($new_frag->{'-wrap'}) {
142	3					5	$new_attrs = { %$new_attrs, %{ $elt->{attributes} } };
	3					9
143							}
144	11					20	return _extract_canonical(
145							$atoms, $child, $grammar_tree, $new_ref, $new_frag, $new_attrs,
146							);
147							}
148							# treat ourselves as if we're the ref-ed to thing
149	11					18	return _extract_canonical(
150							$atoms, $elt, $grammar_tree, $ref, $new_frag, $attrs,
151							);
152							} elsif (my $any = $grammar_frag->{'.any'}) {
153							# choice, pick first successful
154	1					3	for my $i (0..$#$any) {
155	1					2	my $any_frag = $any->[$i];
156							my @partial = _extract_canonical(
157							$atoms, $elt, $grammar_tree,
158	1	50				5	(defined($elt) ? $elt->{nodename} : $elt),
159							$any_frag, $attrs,
160							);
161	1	50				5	next if grep !defined, @partial; # any non-match
162	1					3	return @partial;
163							}
164	0					0	return undef;
165							}
166							}
167
168							my $xml_loaded = 0;
169							sub ast2xml {
170	3	50		3	1	2473	do { require XML::LibXML; $xml_loaded = 1 } unless $xml_loaded;
	3					472
	0
171	0						my ($ast) = @_;
172	0						my $doc = XML::LibXML->createDocument("1.0", "UTF-8");
173	0						$doc->addChild(_item2elt($ast));
174	0						$doc;
175							}
176
177							sub _item2elt {
178	0			0			my ($item) = @_;
179	0	0					die "Unknown item '$item' passed" if ref $item ne 'HASH';
180	0						my $elt = XML::LibXML::Element->new($item->{nodename});
181	0		0				my $attrs = $item->{attributes} \|\| {};
182	0						$elt->setAttribute($_, $attrs->{$_}) for keys %$attrs;
183	0	0					for (@{ $item->{children} \|\| [] }) {
	0
184	0	0					if (!ref) {
185	0						$elt->appendTextNode($_);
186							} else {
187	0						$elt->addChild(_item2elt($_));
188							}
189							}
190	0						$elt;
191							}
192
193							1;
194
195							__END__