File Coverage

blib/lib/ETL/Pipeline/Input/File/List.pm
Criterion Covered Total %
statement 34 34 100.0
branch 5 6 83.3
condition n/a
subroutine 8 8 100.0
pod 1 2 50.0
total 48 50 96.0


line stmt bran cond sub pod time code
1             =pod
2              
3             =head1 NAME
4              
5             ETL::Pipeline::Input::File::List - Role for input sources with multiple files
6              
7             =head1 SYNOPSIS
8              
9             # In the input source...
10             use Moose;
11             with 'ETL::Pipeline::Input';
12             with 'ETL::Pipeline::Input::File::List';
13             ...
14             sub run {
15             my ($self, $etl) = @_;
16             ...
17             while (my $path = $self->next_path( $etl )) {
18             ...
19             }
20             }
21              
22             =head1 DESCRIPTION
23              
24             This is a role used by input sources. It defines everything you need to process
25             multiple input files of the same format. The role uses L<Path::Class::Rule> to
26             locate matching files.
27              
28             Your input source calls the L</next_path> method in a loop. That's it. The role
29             automatically processes constructor arguments that match L<Path::Class::Rule>
30             criteria. It then builds a list of matching files the first time your code calls
31             L</next_path>.
32              
33             =cut
34              
35             package ETL::Pipeline::Input::File::List;
36              
37 4     4   4718 use 5.014000;
  4         13  
38              
39 4     4   32 use Carp;
  4         9  
  4         282  
40 4     4   21 use Moose::Role;
  4         9  
  4         45  
41 4     4   19783 use MooseX::Types::Path::Class;
  4         11  
  4         76  
42 4     4   3725 use Path::Class::Rule;
  4         9  
  4         1981  
43              
44              
45             our $VERSION = '3.00';
46              
47              
48             =head1 METHODS & ATTRIBUTES
49              
50             =head2 Arguments for L<ETL::Pipeline/input>
51              
52             B<ETL::Pipeline::Input::File::List> accepts any of the tests provided by
53             L<Path::Iterator::Rule>. The value of the argument is passed directly into the
54             test. For boolean tests (e.g. readable, exists, etc.), pass an C<undef> value.
55              
56             B<ETL::Pipeline::Input::File> automatically applies the C<file> filter. Do not
57             pass C<file> through L<ETL::Pipeline/input>.
58              
59             C<iname> is the most common one that I use. It matches the file name, supports
60             wildcards and regular expressions, and is case insensitive.
61              
62             # Search using a regular expression...
63             $etl->input( 'XmlFiles', iname => qr/\.xml$/ );
64              
65             # Search using a file glob...
66             $etl->input( 'XmlFiles', iname => '*.xml' );
67              
68             =cut
69              
70             # BUILD in the consuming class will override this one. I add a fake BUILD in
71             # case the class doesn't have one. The method modifier then runs the code to
72             # extract search criteria from the constructor arguments. The modifier will
73             # run even if the consuming class has its own BUILD.
74             # https://www.perlmonks.org/?node_id=837369
75       16 0   sub BUILD {}
76              
77             after 'BUILD' => sub {
78             my $self = shift;
79             my $arguments = shift;
80              
81             while (my ($name, $value) = each %$arguments) {
82             $self->_add_criteria( $name, $value )
83             if $name ne 'file' && Path::Class::Rule->can( $name );
84             }
85             };
86              
87              
88             =head3 path
89              
90             L<Path::Class::File> object for the currently selected file. This is first file
91             that matches the criteria. When you call L</next_path>, it finds the next match
92             and sets B<path>.
93              
94             So B<path> always points to the current file. It should be used by your input
95             source class as the file name.
96              
97             # Inside the input source class...
98             while ($self->next_path( $etl )) {
99             open my $io, '<', $self->path;
100             ...
101             }
102              
103             C<undef> means no more matches.
104              
105             =cut
106              
107             has 'path' => (
108             coerce => 1,
109             is => 'ro',
110             isa => 'Path::Class::File|Undef',
111             writer => '_set_path',
112             );
113              
114              
115             =head2 Methods
116              
117             =head3 next_path
118              
119             Looks for the next match in the list and sets the L</path> attribute. It also
120             returns the matching path. Your input source class should setup a loop calling
121             this method. Inside the loop, process each file.
122              
123             B<next_path> takes one parameter - the L<ETL::Pipeline> object. The method
124             matches files in L<ETL::Pipeline/data_in>.
125              
126             =cut
127              
128             sub next_path {
129 72     72 1 840 my ($self, $etl) = @_;
130              
131 72 100       1838 if ($self->_list_built) {
132             # Get the next file from the list. We'll return "undef" if you query
133             # beyond the end of the list.
134 57         1734 $self->_next_file;
135             } else {
136             # Build the list the first time through.
137 15         150 my $rule = Path::Class::Rule->new->file;
138 15         1091 foreach my $pair ($self->_search_criteria) {
139 14         36 my $name = $pair->[0];
140 14         26 my $value = $pair->[1];
141              
142 14         948 eval "\$rule = \$rule->$name( \$value )";
143 14 50       2704 croak $@ unless $@ eq '';
144             }
145 15         75 $self->_matches( $rule->all( $etl->data_in ) );
146 15         596 $self->_list_built( 1 );
147             }
148              
149             # Set "position" to something more readable.
150 72         1697 my $file = $self->_set_path( $self->_file( $self->_file_index ) );
151              
152 72 100       182 if (defined $file) {
153 57         233 $self->source( $file->relative( $etl->work_in )->stringify );
154 57         295 $etl->status( 'INFO', 'Next file' );
155 15         341 } else { $self->source( '' ); }
156              
157 72         369 return $file;
158             }
159              
160              
161             #-------------------------------------------------------------------------------
162             # Internal methods and attributes
163              
164             # Search criteria for the file list. I capture the criteria from the constructor
165             # but don't build the iterator until the loop kicks off. Since the search
166             # depends on "data_in", this allows the user to setup the pipeline in whatever
167             # order they want and it will do the right thing.
168             has '_criteria' => (
169             default => sub { {} },
170             handles => {_add_criteria => 'set', _search_criteria => 'kv'},
171             is => 'ro',
172             isa => 'HashRef[Any]',
173             traits => [qw/Hash/],
174             );
175              
176              
177             # Index into "_file_list" for the current file. This counter is used to loop
178             # through the list by calling "next_path".
179             has '_file_index' => (
180             default => 0,
181             handles => {_next_file => 'inc'},
182             is => 'ro',
183             isa => 'Int',
184             traits => [qw/Counter/],
185             );
186              
187              
188             # List of files that match the search criteria. The list is built at the
189             # beginning of the pipeline. So your pipeline can't add files on the fly.
190             has '_file_list' => (
191             default => sub { [] },
192             handles => {_file => 'get', _matches => 'push'},
193             is => 'ro',
194             isa => 'ArrayRef[Any]',
195             traits => [qw/Array/],
196             );
197              
198              
199             # Since the list always exists, I needed a way to tell the difference between
200             # "no matches" and "not built yet". That way, "next_record" can build the list
201             # on the first pass.
202             has '_list_built' => (
203             default => 0,
204             is => 'rw',
205             isa => 'Bool',
206             );
207              
208              
209             =head1 SEE ALSO
210              
211             L<ETL::Pipeline>, L<ETL::Pipeline::Input>, L<Path::Class::File>,
212             L<Path::Class::Rule>, L<Path::Iterator::Rule>
213              
214             =cut
215              
216             =head1 AUTHOR
217              
218             Robert Wohlfarth <robert.j.wohlfarth@vumc.org>
219              
220             =head1 LICENSE
221              
222             Copyright 2021 (c) Vanderbilt University Medical Center
223              
224             This program is free software; you can redistribute it and/or modify it under
225             the same terms as Perl itself.
226              
227             =cut
228              
229 4     4   33 no Moose;
  4         9  
  4         36  
230              
231             # Required by Perl to load the module.
232             1;