File Coverage

blib/lib/XML/PP.pm
Criterion Covered Total %
statement 117 138 84.7
branch 50 72 69.4
condition 19 28 67.8
subroutine 11 11 100.0
pod 3 3 100.0
total 200 252 79.3


line stmt bran cond sub pod time code
1             package XML::PP;
2              
3 4     4   941580 use strict;
  4         10  
  4         156  
4 4     4   21 use warnings;
  4         8  
  4         288  
5              
6 4     4   1741 use Params::Get 0.13;
  4         45241  
  4         233  
7 4     4   33 use Scalar::Util;
  4         24  
  4         151  
8 4     4   1588 use Return::Set;
  4         137591  
  4         9606  
9              
10             =head1 NAME
11              
12             XML::PP - A simple XML parser
13              
14             =head1 VERSION
15              
16             Version 0.07
17              
18             =cut
19              
20             our $VERSION = '0.07';
21              
22             =head1 SYNOPSIS
23              
24             use XML::PP;
25              
26             my $parser = XML::PP->new();
27             my $xml = 'ToveJaniReminderDon\'t forget me this weekend!';
28             my $tree = $parser->parse($xml);
29              
30             print $tree->{name}; # 'note'
31             print $tree->{children}[0]->{name}; # 'to'
32              
33             =head1 DESCRIPTION
34              
35             You almost certainly do not need this module.
36             For most tasks,
37             use L or L.
38             C exists only for the most lightweight scenarios where you can't get one of the above modules to install,
39             for example,
40             CI/CD machines running Windows that get stuck with L.
41              
42             C is a simple, lightweight XML parser written in pure Perl.
43             It does not rely on external libraries like C and is suitable for small XML parsing tasks.
44             This module supports basic XML document parsing, including namespace handling, attributes, and text nodes.
45              
46             =head1 METHODS
47              
48             =head2 new
49              
50             my $parser = XML::PP->new();
51             my $parser = XML::PP->new(strict => 1);
52             my $parser = XML::PP->new(warn_on_error => 1);
53              
54             Creates a new C object.
55             It can take several optional arguments:
56              
57             =over 4
58              
59             =item * C - If set to true, the parser dies when it encounters unknown entities or unescaped ampersands.
60              
61             =item * C - If true, the parser emits warnings for unknown or malformed XML entities. This is enabled automatically if C is enabled.
62              
63             =item * C
64              
65             Used for warnings and traces.
66             It can be an object that understands warn() and trace() messages,
67             such as a L or L object,
68             a reference to code,
69             a reference to an array,
70             or a filename.
71              
72             =back
73              
74             =cut
75              
76             # Constructor for creating a new XML::PP object
77             sub new
78             {
79 8     8 1 696340 my $class = shift;
80 8   100     39 my $params = Params::Get::get_params(undef, @_) || {};
81              
82             # strict implies warn_on_error
83 8 100       197 if($params->{strict}) {
84 2         6 $params->{warn_on_error} = 1;
85             }
86              
87 8         13 my $self = bless { %{$params} }, $class;
  8         37  
88              
89 8 50       31 if(my $logger = $self->{'logger'}) {
90 0 0       0 if(!Scalar::Util::blessed($logger)) {
91             # Don't "use" at the top, because of circular dependancy:
92             # Log::Abstraction->Config::Abstraction->XML::PP
93 0         0 eval { require Log::Abstraction };
  0         0  
94 0 0       0 if($@) {
95 0         0 die $@;
96             }
97 0         0 Log::Abstraction->import();
98 0         0 $self->{'logger'} = Log::Abstraction->new($logger);
99             }
100             }
101              
102 8         24 return $self;
103             }
104              
105             =head2 parse
106              
107             my $tree = $parser->parse($xml_string);
108              
109             Parses the XML string and returns a tree structure representing the XML content.
110             The returned structure is a hash reference with the following fields:
111              
112             =over 4
113              
114             =item * C - The tag name of the node.
115              
116             =item * C - The namespace prefix (if any).
117              
118             =item * C - The namespace URI (if any).
119              
120             =item * C - A hash reference of attributes.
121              
122             =item * C - An array reference of child nodes (either text nodes or further elements).
123              
124             =back
125              
126             =cut
127              
128             # Parse the given XML string and return the root node
129             sub parse
130             {
131 11     11 1 29633 my $self = shift;
132 11         42 my $params = Params::Get::get_params('xml', \@_);
133 11         245 my $xml_string = $params->{'xml'};
134              
135 11 50       32 if(ref($xml_string) eq 'SCALAR') {
136 0         0 $xml_string = ${$xml_string};
  0         0  
137             }
138             # Check if the XML string is empty
139             # if (!$xml_string || $xml_string !~ /<\?xml/) {
140             # $self->_handle_error("Invalid or empty XML document provided");
141 11 50       22 if (!$xml_string) {
142             # $self->_handle_error("Empty XML document provided");
143 0         0 return {};
144             }
145              
146 11         34 $xml_string =~ s///sg; # Ignore comments
147 11         23 $xml_string =~ s/<\?xml.+\?>//; # Ignore the header
148              
149 11         199 $xml_string =~ s/^\s+|\s+$//g; # Trim whitespace
150             # Check if the XML string is empty
151 11         41 return $self->_parse_node(\$xml_string, {});
152             }
153              
154             =head2 collapse_structure
155              
156             Collapse an XML-like structure into a simplified hash (like L).
157              
158             use XML::PP;
159              
160             my $input = {
161             name => 'note',
162             children => [
163             { name => 'to', children => [ { text => 'Tove' } ] },
164             { name => 'from', children => [ { text => 'Jani' } ] },
165             { name => 'heading', children => [ { text => 'Reminder' } ] },
166             { name => 'body', children => [ { text => 'Don\'t forget me this weekend!' } ] },
167             ],
168             attributes => { id => 'n1' },
169             };
170              
171             my $result = collapse_structure($input);
172              
173             # Output:
174             # {
175             # note => {
176             # to => 'Tove',
177             # from => 'Jani',
178             # heading => 'Reminder',
179             # body => 'Don\'t forget me this weekend!',
180             # }
181             # }
182              
183             The C subroutine takes a nested hash structure (representing an XML-like data structure) and collapses it into a simplified hash where each child element is mapped to its name as the key, and the text content is mapped as the corresponding value. The final result is wrapped in a C key, which contains a hash of all child elements.
184              
185             This subroutine is particularly useful for flattening XML-like data into a more manageable hash format, suitable for further processing or display.
186              
187             C accepts a single argument:
188              
189             =over 4
190              
191             =item * C<$node> (Required)
192              
193             A hash reference representing a node with the following structure:
194              
195             {
196             name => 'element_name', # Name of the element (e.g., 'note', 'to', etc.)
197             children => [ # List of child elements
198             { name => 'child_name', children => [{ text => 'value' }] },
199             ...
200             ],
201             attributes => { ... }, # Optional attributes for the element
202             ns_uri => ... , # Optional namespace URI
203             ns => ... , # Optional namespace
204             }
205              
206             The C key holds an array of child elements. Each child element may have its own C and C, and the function will collapse all text values into key-value pairs.
207              
208             =back
209              
210             The subroutine returns a hash reference that represents the collapsed structure, where the top-level key is C and its value is another hash containing the child elements' names as keys and their corresponding text values as values.
211              
212             For example:
213              
214             {
215             note => {
216             to => 'Tove',
217             from => 'Jani',
218             heading => 'Reminder',
219             body => 'Don\'t forget me this weekend!',
220             }
221             }
222              
223             =over 4
224              
225             =item Basic Example:
226              
227             Given the following input structure:
228              
229             my $input = {
230             name => 'note',
231             children => [
232             { name => 'to', children => [ { text => 'Tove' } ] },
233             { name => 'from', children => [ { text => 'Jani' } ] },
234             { name => 'heading', children => [ { text => 'Reminder' } ] },
235             { name => 'body', children => [ { text => 'Don\'t forget me this weekend!' } ] },
236             ],
237             };
238              
239             Calling C will return:
240              
241             {
242             note => {
243             to => 'Tove',
244             from => 'Jani',
245             heading => 'Reminder',
246             body => 'Don\'t forget me this weekend!',
247             }
248             }
249              
250             =back
251              
252             =cut
253              
254             sub collapse_structure {
255 8     8 1 1492 my ($self, $node) = @_;
256             # my $self = shift;
257             # my $params = Params::Get::get_params('node', \@_);
258             # my $node = $params->{'node'};
259              
260 8 50 33     47 return {} unless ref $node eq 'HASH' && $node->{children};
261              
262 8         14 my %result;
263 8         11 for my $child (@{ $node->{children} }) {
  8         18  
264 19 50       81 my $name = $child->{name} or next;
265 19         21 my $value;
266              
267 19 100 66     46 if ($child->{children} && @{ $child->{children} }) {
  19         50  
268 18 100 100     27 if (@{ $child->{children} } == 1 && exists $child->{children}[0]{text}) {
  18         88  
269 15         31 $value = $child->{children}[0]{text};
270             } else {
271 3         19 $value = $self->collapse_structure($child)->{$name};
272             }
273             }
274              
275 19 100 100     74 next unless defined $value && $value ne '';
276              
277             # Handle multiple same-name children as an array
278 17 100       61 if (exists $result{$name}) {
279 2 100       20 $result{$name} = [ $result{$name} ] unless ref $result{$name} eq 'ARRAY';
280 2         4 push @{ $result{$name} }, $value;
  2         7  
281             } else {
282 15         43 $result{$name} = $value;
283             }
284             }
285 8         83 return { $node->{name} => \%result };
286             }
287              
288             =head2 _parse_node
289              
290             my $node = $self->_parse_node($xml_ref, $nsmap);
291              
292             Recursively parses an individual XML node.
293             This method is used internally by the C method.
294             It handles the parsing of tags, attributes, text nodes, and child elements.
295             It also manages namespaces and handles self-closing tags.
296              
297             =cut
298              
299             # Internal method to parse an individual XML node
300             sub _parse_node {
301 24     24   66 my ($self, $xml_ref, $nsmap) = @_;
302              
303 24 50       51 if(!defined($xml_ref)) {
304 0 0       0 if($self->{'logger'}) {
305 0         0 $self->{'logger'}->fatal('BUG: _parse_node, xml_ref not defined');
306             }
307 0         0 die 'BUG: _parse_node, xml_ref not defined';
308             }
309              
310             # Match the start of a tag (self-closing or regular)
311 24 50       189 $$xml_ref =~ s{^\s*<([^\s/>]+)([^>]*)\s*(/?)>}{}s or do {
312 0         0 $self->_handle_error('Expected a valid XML tag, but none found at position: ' . pos($$xml_ref));
313 0         0 return;
314             };
315              
316 24   100     180 my ($raw_tag, $attr_string, $self_close) = ($1, $2 || '', $3);
317              
318             # Check for malformed self-closing tags
319 24 50 33     63 if($self_close && $$xml_ref !~ /^\s*<\/(?:\w+:)?$raw_tag\s*>/) {
320 0         0 $self->_handle_error("Malformed self-closing tag for <$raw_tag>");
321 0         0 return;
322             }
323              
324             # Handle possible trailing slash like
325 24 100       61 if($attr_string =~ s{/\s*$}{}) {
326 1         3 $self_close = 1;
327             }
328              
329 24 100       96 my ($ns, $tag) = $raw_tag =~ /^([^:]+):(.+)$/
330             ? ($1, $2)
331             : (undef, $raw_tag);
332              
333 24         55 my %local_nsmap = (%$nsmap);
334              
335             # XMLNS declarations
336 24         106 while ($attr_string =~ /(\w+)(?::(\w+))?="([^"]*)"/g) {
337 12         41 my ($k1, $k2, $v) = ($1, $2, $3);
338 12 50 66     96 if ($k1 eq 'xmlns' && !defined $k2) {
    100 66        
339 0         0 $local_nsmap{''} = $v;
340             } elsif ($k1 eq 'xmlns' && defined $k2) {
341 1         5 $local_nsmap{$k2} = $v;
342             }
343             }
344              
345             # Normalize whitespace between attributes but not inside quotes
346             # - Collapse run of whitespace to one space
347             # - Remove leading/trailing whitespace
348             # - Preserve quoted attribute values
349             {
350 24         47 my $tmp = $attr_string;
  24         51  
351              
352             # Replace all whitespace sequences outside of quotes with a single space
353             # This works because it alternates: quoted | non-quoted
354 24         103 my @parts = $tmp =~ /"[^"]*"|'[^']*'|[^\s"'']+/g;
355              
356             # Rejoin non-quoted segments with a single space
357 24         131 $attr_string = join(' ', @parts);
358             }
359              
360 24         46 my %attributes;
361 24         67 pos($attr_string) = 0;
362              
363             # Accept name="value" and name='value' (value captured lazily, same quote used to open/close)
364             # Attribute name follows XML Name-ish rules: start with letter/underscore/colon, then letters/digits/._:-
365 24         115 while ($attr_string =~ /([A-Za-z_:][-A-Za-z0-9_.:]*)\s*=\s*(['"])(.*?)\2/g) {
366 12         46 my ($attr, $quote, $v) = ($1, $2, $3);
367              
368             # Skip xmlns declarations (already handled)
369 12 100       32 next if $attr =~ /^xmlns(?::|$)/;
370              
371             # Decode XML entities inside attribute values
372 11         53 $attributes{$attr} = $self->_decode_entities($v);
373             }
374              
375             my $node = {
376             name => $tag,
377             ns => $ns,
378 22 100       113 ns_uri => defined $ns ? $local_nsmap{$ns} : undef,
379             attributes => \%attributes,
380             children => [],
381             };
382              
383             # Return immediately if self-closing tag
384 22 100       54 return $node if $self_close;
385              
386             # Capture text
387 21 50       99 if ($$xml_ref =~ s{^([^<]+)}{}s) {
388 21         49 my $text = $self->_decode_entities($1);
389 21         87 $text =~ s/^\s+|\s+$//g;
390 21 100       57 push @{ $node->{children} }, { text => $text } if $text ne '';
  15         52  
391             }
392              
393             # Recursively parse children
394 21         87 while ($$xml_ref =~ /^\s*<([^\/>"][^>]*)>/) {
395 13         67 my $child = $self->_parse_node($xml_ref, \%local_nsmap);
396 13 50       2623 push @{ $node->{children} }, $child if $child;
  13         116  
397             }
398              
399             # Consume closing tag
400 21         1812 $$xml_ref =~ s{^\s*}{}s;
401              
402 21         152 return Return::Set::set_return($node, { 'type' => 'hashref', 'min' => 1 });
403             }
404              
405             # Internal helper to decode XML entities
406             sub _decode_entities {
407 32     32   86 my ($self, $text) = @_;
408              
409 32 50       111 return undef unless defined $text;
410              
411             # Decode known named entities
412 32         54 $text =~ s/</
413 32         45 $text =~ s/>/>/g;
414 32         41 $text =~ s/&/&/g;
415 32         46 $text =~ s/"/"/g;
416 32         37 $text =~ s/'/'/g;
417              
418             # Decode decimal numeric entities
419 32         53 $text =~ s/&#(\d+);/chr($1)/eg;
  3         11  
420              
421             # Decode hex numeric entities
422 32         44 $text =~ s/&#x([0-9a-fA-F]+);/chr(hex($1))/eg;
  1         3  
423              
424 32 100       63 if ($text =~ /&([^;]*);/) {
425 3         5 my $entity = $1;
426 3 50 33     16 unless ($entity =~ /^(lt|gt|amp|quot|apos)$/ || $entity =~ /^#(?:x[0-9a-fA-F]+|\d+)$/) {
427 3         4 my $msg = "Unknown or malformed XML entity: &$entity;";
428 3         5 $self->_handle_error($msg);
429             }
430             }
431              
432 31 100       63 if ($text =~ /&/) {
433 7         10 my $msg = "Unescaped ampersand detected: $text";
434 7         12 $self->_handle_error($msg);
435             }
436              
437 30         100 return $text;
438             }
439              
440             sub _handle_error {
441 10     10   12 my ($self, $message) = @_;
442              
443 10         12 my $error_message = __PACKAGE__ . ": XML Parsing Error: $message";
444              
445 10 100       19 if($self->{strict}) {
    100          
446             # Throws an error if strict mode is enabled
447 2 50       4 if($self->{'logger'}) {
448 0         0 $self->{'logger'}->fatal($error_message);
449             }
450 2         25 die $error_message;
451             } elsif ($self->{warn_on_error}) {
452             # Otherwise, just warn
453 3 50       6 if($self->{'logger'}) {
454 0         0 $self->{'logger'}->warn($error_message);
455             } else {
456 3         18 warn $error_message;
457             }
458             } else {
459 5 50       17 if($self->{'logger'}) {
460 0         0 $self->{'logger'}->notice($error_message);
461             } else {
462 5         45 print STDERR "Warning: $error_message\n";
463             }
464             }
465             }
466              
467             =head1 AUTHOR
468              
469             Nigel Horne, C<< >>
470              
471             =head1 SEE ALSO
472              
473             =over 4
474              
475             =item * Test coverage report: L
476              
477             =item * L
478              
479             =item * L
480              
481             =back
482              
483             =head1 SUPPORT
484              
485             This module is provided as-is without any warranty.
486              
487             =head1 LICENSE AND COPYRIGHT
488              
489             Copyright 2025 Nigel Horne.
490              
491             Usage is subject to licence terms.
492              
493             The licence terms of this software are as follows:
494              
495             =over 4
496              
497             =item * Personal single user, single computer use: GPL2
498              
499             =item * All other users (including Commercial, Charity, Educational, Government)
500             must apply in writing for a licence for use from Nigel Horne at the
501             above e-mail.
502              
503             =back
504              
505             =cut
506              
507             1;