File Coverage

blib/lib/ETL/Pipeline/Input/Xml.pm
Criterion Covered Total %
statement 18 20 90.0
branch n/a
condition n/a
subroutine 7 7 100.0
pod n/a
total 25 27 92.5


line stmt bran cond sub pod time code
1             =pod
2              
3             =head1 NAME
4              
5             ETL::Pipeline::Input::Xml - Records from an XML file
6              
7             =head1 SYNOPSIS
8              
9             use ETL::Pipeline;
10             ETL::Pipeline->new( {
11             input => ['Xml', matching => 'Data.xml', root => '/Root'],
12             mapping => {Name => 'Name', Address => 'Address'},
13             output => ['UnitTest']
14             } )->process;
15              
16             =head1 DESCRIPTION
17              
18             B<ETL::Pipeline::Input::Xml> defines an input source that reads records from an
19             XML file. Individual records are found under the L</root> node. Fields are
20             accessed with a relative XML path.
21              
22             =cut
23              
24             package ETL::Pipeline::Input::Xml;
25 1     1   3 use Moose;
  1         1  
  1         6  
26              
27 1     1   4635 use 5.014000;
  1         2  
28 1     1   3 use warnings;
  1         1  
  1         27  
29              
30 1     1   3 use Carp;
  1         1  
  1         56  
31 1     1   3 use List::Util qw/first/;
  1         1  
  1         51  
32 1     1   4 use String::Util qw/hascontent trim/;
  1         1  
  1         36  
33 1     1   479 use XML::XPath;
  0            
  0            
34              
35              
36             our $VERSION = '2.00';
37              
38              
39             =head1 METHODS & ATTRIBUTES
40              
41             =head2 Arguments for L<ETL::Pipeline/input>
42              
43             =head3 root
44              
45             The B<root> attribute holds the XPath that selects the record nodes.
46             L</next_record> iterates over the nodes matching B<root>.
47              
48             =cut
49              
50             has 'root' => (  # XPath string that configure() passes to findnodes()
51             is => 'ro',  # read-only accessor
52             isa => 'Str',
53             );
54              
55              
56             =head2 Called from L<ETL::Pipeline/process>
57              
58             =head3 get
59              
60             B<get> returns a list of values from matching nodes. The field name is an
61             I<XPath>, relative to L</root>. See
62             L<http://www.w3schools.com/xpath/xpath_functions.asp> for more information on
63             XPaths.
64              
65             XML lends itself to recursive records. What happens when you need two fields
66             under the same subnode? For example, a I<person involved> can have both a
67             I<name> and a I<role>. The names and roles go together. How do you B<get> them
68             together?
69              
70             B<get> supports subnodes as additional parameters. Pass the top node as the
71             first parameter. Pass the subnode names in subsequent parameters. The values
72             are returned in the same order as the parameters. B<get> returns C<undef> for
73             any non-existent subnodes.
74              
75             Here are some examples...
76              
77             # Return a single value from a single field.
78             $etl->get( 'Name' );
79             'John Doe'
80            
81             # Return a list from multiple fields with the same name.
82             $etl->get( 'PersonInvolved/Name' );
83             ('John Doe', 'Jane Doe')
84            
85             # Return a list from subnodes.
86             $etl->get( 'PersonInvolved', 'Name' );
87             ('John Doe', 'Jane Doe')
88            
89             # Return a list of related fields from subnodes.
90             $etl->get( 'PersonInvolved', 'Name', 'Role' );
91             (['John Doe', 'Husband'], ['Jane Doe', 'Wife'])
92              
93             In the L<ETL::Pipeline/mapping>, those examples look like this...
94              
95             {Name => 'Name'}
96             {Name => 'PersonInvolved/Name'}
97             {Name => ['PersonInvolved', 'Name']}
98             {Name => ['PersonInvolved', 'Name', 'Role']}
99              
100             =cut
101              
102             sub get {  # Return the values found at XPath $top, optionally broken out by @subnodes for each match.
103             my ($self, $top, @subnodes) = @_;
104             my $xpath = $self->xpath;
105              
106             my $match = $xpath->find( $top, $self->current );  # evaluate $top with the current record as context
107             if ($match->isa( 'XML::XPath::NodeSet' )) {
108             if (scalar( @subnodes ) == 0) {  # no subnodes: one string per matching node
109             return map { $_->string_value } $match->get_nodelist;
110             } elsif (scalar( @subnodes ) == 1) {  # one subnode: flat list, one string per matching node
111             my @values;
112             foreach my $node ($match->get_nodelist) {
113             my $data = $xpath->find( $subnodes[0], $node );  # subnode path is relative to this match
114             push @values, $data->string_value;  # NOTE(review): POD promises undef for a missing subnode - confirm string_value does that on an empty result
115             }
116             return @values;
117             } else {  # several subnodes: one array reference per matching node
118             my @values;
119             foreach my $node ($match->get_nodelist) {
120             my @current;  # values for this node, in the same order as @subnodes
121             foreach my $path (@subnodes) {
122             my $data = $xpath->find( $path, $node );
123             push @current, $data->string_value;
124             }
125             push @values, \@current;
126             }
127             return @values;
128             }
129             } else { return $match->value; }  # result is not a node set: return its value() instead
130             }
131              
132              
133             =head3 next_record
134              
135             This method advances to the next record in the L</node_set>.
136              
137             L</configure> loads the node set when it opens the XML file. B<next_record>
138             shifts nodes off that set, skips anything that is not an element, and stores
139             the next element in L</current>. It returns false once the set is empty.
140              
141             =cut
142              
143             sub next_record {  # Advance current to the next element in node_set; returns 1 on success, 0 when the set is empty.
144             my ($self) = @_;
145              
146             my $return = undef;
147             until (defined $return) {  # keep shifting until an element is found or the set runs out
148             my $next = $self->node_set->shift();  # removes the node from the set
149             if (not defined $next) {
150             $return = 0;  # nothing left in the node set
151             } elsif ($next->isa( 'XML::XPath::Node::Element' )) {
152             $self->_set_current( $next );
153             $return = 1;
154             }  # any other kind of node is skipped and the loop tries again
155             }
156             return $return;
157             }
158              
159              
160             =head3 configure
161              
162             B<configure> opens the XML file and extracts the node set. L</next_record> then
163             iterates over the node set.
164              
165             =cut
166              
167             sub configure {  # Parse the XML file, then store the parser and the node set that matches root.
168             my ($self) = @_;
169              
170             my $file = $self->file;  # file() is not defined in this package; presumably supplied by a consumed role - confirm
171             my $root = $self->root;
172              
173             my $parser = XML::XPath->new( filename => "$file" );  # the quotes force $file to a plain string
174             my $node_set = $parser->findnodes( $root );  # scalar context
175             croak "Cannot find $root in $file" unless defined $node_set;  # NOTE(review): verify findnodes can return undef in scalar context; if it always returns a node set, this check never fires
176              
177             $self->_set_xpath( $parser );  # kept so get() can evaluate relative paths later
178             $self->_set_node_set( $node_set );  # consumed by next_record()
179             }
180              
181              
182             =head3 finish
183              
184             B<finish> doesn't actually do anything. But it is required by
185             L<ETL::Pipeline/process>.
186              
187             =cut
188              
189             sub finish { }  # deliberate no-op: takes no action and returns nothing
190              
191              
192             =head2 Other Methods & Attributes
193              
194             =head3 attribute
195              
196             The B<attribute> method returns the value of an attribute on the current node.
197             For example, deleted records may have an attribute like C<ACTION="DELETE">.
198             L<ETL::Pipeline::Input/skip_if> can use B<attribute> and bypass these records.
199              
200             $etl->input( 'Xml',
201             bypass_if => sub { $_->input->attribute( 'ACTION' ) eq 'DELETE' },
202             matching => 'Data.xml',
203             root => '/File'
204             );
205              
206             =cut
207              
208             sub attribute {  # Return the value of the attribute called $name on the current node.
209             my ($self, $name) = @_;
210             return $self->current->getAttribute( $name );  # current is set by next_record()
211             }
212              
213              
214             =head3 current
215              
216             The B<current> attribute holds the currently selected node (record).
217             L</next_record> automatically sets B<current>.
218              
219             =cut
220              
221             has 'current' => (  # the element node that get() and attribute() read from
222             init_arg => undef,  # cannot be supplied to the constructor
223             is => 'ro',
224             isa => 'XML::XPath::Node::Element',
225             writer => '_set_current',  # private setter, called by next_record()
226             );
227              
228              
229             =head3 node_set
230              
231             The B<node_set> attribute holds the node set of records. It is the list of
232             records in this file. L</configure> automatically sets B<node_set>.
233              
234             =cut
235              
236             has 'node_set' => (  # nodes still waiting to be handed out by next_record()
237             init_arg => undef,  # cannot be supplied to the constructor
238             is => 'ro',
239             isa => 'XML::XPath::NodeSet',
240             writer => '_set_node_set',  # private setter, called by configure()
241             );
242              
243              
244             =head3 xpath
245              
246             The B<xpath> attribute holds the current L<XML::XPath> object. It is
247             automatically set by the L</configure> method.
248              
249             =cut
250              
251             has 'xpath' => (  # parser object that get() uses to evaluate paths
252             init_arg => undef,  # cannot be supplied to the constructor
253             is => 'ro',
254             isa => 'XML::XPath',
255             writer => '_set_xpath',  # private setter, called by configure()
256             );
257              
258              
259             =head1 SEE ALSO
260              
261             L<ETL::Pipeline>, L<ETL::Pipeline::Input>, L<ETL::Pipeline::Input::File>,
262             L<XML::XPath>
263              
264             =cut
265              
266             with 'ETL::Pipeline::Input::File';  # NOTE(review): presumably supplies file(), which configure() calls - confirm
267             with 'ETL::Pipeline::Input';  # roles are consumed after the subs above; presumably so their required methods already exist - confirm
268              
269              
270             =head1 AUTHOR
271              
272             Robert Wohlfarth <robert.j.wohlfarth@vanderbilt.edu>
273              
274             =head1 LICENSE
275              
276             Copyright 2016 (c) Vanderbilt University Medical Center
277              
278             This program is free software; you can redistribute it and/or modify it under
279             the same terms as Perl itself.
280              
281             =cut
282              
283             no Moose;  # unimport the Moose keywords (has, with, ...) from this package
284             __PACKAGE__->meta->make_immutable;  # finalize the class; no further changes to its definition are expected