File Coverage

blib/lib/ETL/Pipeline/Input/XmlFiles.pm
Criterion Covered Total %
statement 30 30 100.0
branch 1 2 50.0
condition 1 3 33.3
subroutine 8 8 100.0
pod 1 1 100.0
total 41 44 93.1


line stmt bran cond sub pod time code
1             =pod
2              
3             =head1 NAME
4              
5             ETL::Pipeline::Input::XmlFiles - Process XML content from individual files
6              
7             =head1 SYNOPSIS
8              
9             use ETL::Pipeline;
10             ETL::Pipeline->new( {
11             input => ['XmlFiles', iname => qr/\.xlsx$/i, records_at => '/Xml'],
12             mapping => {First => '/File/A', Second => '/File/Patient'},
13             output => ['UnitTest']
14             } )->process;
15              
16             =head1 DESCRIPTION
17              
18             B<ETL::Pipeline::Input::XmlFiles> defines an input source that reads one or more
19             records from one or more XML files. Most of the time, there should be one record
20             per file. But the class handles multiple records per file too.
21              
22             =cut
23              
24             package ETL::Pipeline::Input::XmlFiles;
25              
26 1     1   33 use 5.014000;
  1         3  
27 1     1   6 use warnings;
  1         1  
  1         37  
28              
29 1     1   6 use Carp;
  1         1  
  1         87  
30 1     1   6 use Data::DPath qw/dpath/;
  1         2  
  1         29  
31 1     1   977 use XML::Bare;
  1         7022  
  1         82  
32 1     1   14 use Moose;
  1         2  
  1         14  
33              
34              
35             our $VERSION = '2.00';
36              
37              
38             =head1 METHODS & ATTRIBUTES
39              
40             =head2 Arguments for L<ETL::Pipeline/input>
41              
42             =head3 records_at
43              
44             Optional. The path to the record nodes, such as C</XMLDATA/Root/Record>. The
45             last item in the list is the name of the root for each individual record. The
46             default is B</> - one record in the file.
47              
48             You might use this attribute in two cases...
49              
50             =over
51              
52             =item 1. Multiple records per file. This is the top of each record, like in L<ETL::Pipeline::Input::Xml>.
53              
54             =item 2. Shorthand to leave off extra nodes from every path. One record per file, but you don't want extra path parts on the beginning of every field.
55              
56             =back
57              
58             This can be any value accepted by L<Data::DPath>. Fortunately, L<Data::Dpath>
59             takes paths that look like XPath for XML.
60              
61             =cut
62              
63             has 'records_at' => (
64             default => '/',
65             is => 'ro',
66             isa => 'Str',
67             );
68              
69              
70             =head3 skipping
71              
72             Not used. This attribute is ignored. XML files must follow specific formatting
73             rules. Extra rows are parsed as data. There's nothing to skip.
74              
75             =head2 Methods
76              
77             =head3 run
78              
79             This is the main loop. It opens the file, reads records, and closes it when
80             done. This is the place to look if there are problems.
81              
82             L<ETL::Pipeline> automatically calls this method.
83              
84             =cut
85              
86             sub run {
87 3     3 1 10 my ($self, $etl) = @_;
88              
89 3         19 while (my $path = $self->next_path( $etl )) {
90             # Load the XML file and turn it into a Perl hash.
91 6         47 my $parser = XML::Bare->new( file => "$path" );
92 6         2519 my $xml = $parser->parse;
93              
94             # Find the node that is an array of records. dpath should return a list
95             # with one array reference. And that array has the actual records. But I
96             # check, just in case your XML is structured a little differently.
97             #
98             # XML should generate hashes - field/value pairs. In theory, there might
99             # be an XML file that sends back a single record as an array reference.
100             # Not likely when transfering data.
101 6         3275 my @matches = dpath( $self->records_at )->match( $xml );
102 6 50 33     2454 my $list = (scalar( @matches ) == 1 && ref( $matches[0] ) eq 'ARRAY') ? $matches[0] : \@matches;
103              
104             # Process each record. And that's it.
105 6         201 my $source = $self->source;
106 6         26 foreach my $record (@$list) {
107 6         206 $self->source( sprintf( '%s character %d', $source, $record->{_pos} ) );
108 6         50 $etl->record( $record );
109             }
110             }
111             }
112              
113              
114             =head1 SEE ALSO
115              
116             L<ETL::Pipeline>, L<ETL::Pipeline::Input>, L<ETL::Pipeline::Input::File::List>,
117             L<XML::Bare>
118              
119             =cut
120              
121             with 'ETL::Pipeline::Input';
122             with 'ETL::Pipeline::Input::File::List';
123              
124              
125             =head1 AUTHOR
126              
127             Robert Wohlfarth <robert.j.wohlfarth@vumc.org>
128              
129             =head1 LICENSE
130              
131             Copyright 2021 (c) Vanderbilt University Medical Center
132              
133             This program is free software; you can redistribute it and/or modify it under
134             the same terms as Perl itself.
135              
136             =cut
137              
138 1     1   7448 no Moose;
  1         3  
  1         8  
139             __PACKAGE__->meta->make_immutable;