File Coverage

blib/lib/ETL/Pipeline/Input/File.pm

Criterion	Covered	Total	%
statement	17	17	100.0
branch			n/a
condition			n/a
subroutine	7	7	100.0
pod	0	1	0.0
total	24	25	96.0

line	stmt	sub	pod	time	code
1					=pod
2
3					=head1 NAME
4
5					ETL::Pipeline::Input::File - Role for file based input sources
6
7					=head1 SYNOPSIS
8
9					# In the input source...
10					use Moose;
11					with 'ETL::Pipeline::Input';
12					with 'ETL::Pipeline::Input::File';
13					...
14
15					# In the ETL::Pipeline script...
16					ETL::Pipeline->new( {
17					work_in => {root => 'C:\Data', iname => qr/Ficticious/},
18					input => ['Excel', iname => qr/\.xlsx?$/ ],
19					mapping => {Name => 'A', Address => 'B', ID => 'C' },
20					constants => {Type => 1, Information => 'Demographic' },
21					output => ['SQL', table => 'NewData' ],
22					} )->process;
23
24					# Or with a specific file...
25					ETL::Pipeline->new( {
26					work_in => {root => 'C:\Data', iname => qr/Ficticious/},
27					input => ['Excel', iname => 'ExportedData.xlsx' ],
28					mapping => {Name => 'A', Address => 'B', ID => 'C' },
29					constants => {Type => 1, Information => 'Demographic' },
30					output => ['SQL', table => 'NewData' ],
31					} )->process;
32
33					=head1 DESCRIPTION
34
35					This role adds functionality and attributes common to all file based input
36					sources. It is a quick and easy way to create new sources with the ability
37					to search directories. Useful when the file name changes.
38
39					B<ETL::Pipeline::Input::File> works with a single source file. To process an
40					entire directory of files, use L<ETL::Pipeline::Input::File::List> instead.
41
42					=cut
43
44					package ETL::Pipeline::Input::File;
45
46	4	4		3888	use 5.014000;
	4			19
47
48	4	4		26	use Carp;
	4			10
	4			298
49	4	4		26	use Moose::Role;
	4			9
	4			40
50	4	4		24026	use MooseX::Types::Path::Class qw/File/;
	4			12
	4			84
51	4	4		5614	use Path::Class::Rule;
	4			10
	4			2194
52
53
54					our $VERSION = '3.00';
55
56
57					=head1 METHODS & ATTRIBUTES
58
59					=head2 Arguments for L<ETL::Pipeline/input>
60
61					B<ETL::Pipeline::Input::File> accepts any of the tests provided by
62					L<Path::Iterator::Rule>. The value of the argument is passed directly into the
63					test. For boolean tests (e.g. readable, exists, etc.), pass an C<undef> value.
64
65					B<ETL::Pipeline::Input::File> automatically applies the C<file> filter. Do not
66					pass C<file> through L<ETL::Pipeline/input>.
67
68					C<iname> is the most common one that I use. It matches the file name, supports
69					wildcards and regular expressions, and is case insensitive.
70
71					# Search using a regular expression...
72					$etl->input( 'Excel', iname => qr/\.xlsx$/ );
73
74					# Search using a file glob...
75					$etl->input( 'Excel', iname => '*.xlsx' );
76
77					The code throws an error if no files match the criteria. Only the first match
78					is used. If you want to match more than one file, use
79					L<ETL::Pipeline::Input::File::List> instead.
80
81					=cut
82
83					# A BUILD defined by the consuming class overrides this one; the empty stub
84					# only exists in case the class has none. Either way, the "after" modifier
85					# below runs and copies the search criteria out of the constructor arguments.
86					# "file" is left out on purpose - that filter is always applied by this role.
87					# https://www.perlmonks.org/?node_id=837369
88		9	0		sub BUILD {}
89
90					after 'BUILD' => sub {
91					    my ($self, $arguments) = @_;
92
93					    # Iterate with "keys": it resets the hash iterator that "each" would share.
94					    foreach my $name (keys %$arguments) {
95					        next if $name eq 'file' || !Path::Class::Rule->can( $name );
96					        $self->_add_criteria( $name, $arguments->{$name} );
97					    }
98					};
99
100
101					# Resolve the input file right before "run", after everything is set in stone.
102					# This lets a script create the input source before it calls "work_in".
103					before 'run' => sub {
104					    my ($self, $etl) = @_;
105					    # An explicit "path" skips the search; relative paths start at "data_in".
106					    if (defined $self->path) {
107					        $self->_set_path( $self->path->absolute( $etl->data_in ) )
108					            if $self->path->is_relative;
109					    } else {
110					        # Build the search rule from the criteria passed to the constructor.
111					        my $rule = Path::Class::Rule->new->file;
112					        foreach my $pair ($self->_search_criteria) {
113					            my ($name, $value) = @$pair;
114
115					            # Call the test by name. A block "eval" traps its errors without
116					            # compiling the test name as Perl source code.
117					            eval { $rule = $rule->$name( $value ); 1 } or croak $@;
118					        }
119					        my @matches = $rule->all( $etl->data_in );
120
121					        # Use the first file that matches all of the criteria.
122					        if (scalar( @matches ) < 1) {
123					            croak 'No files matched the search criteria';
124					        } elsif (!-r $matches[0]) {
125					            croak "You do not have permission to read '$matches[0]'";
126					        } else {
127					            $self->_set_path( $matches[0] );
128					            $self->source( $matches[0]->relative( $etl->work_in )->stringify );
129					            $etl->status( 'INFO', 'File name' );
130					        }
131					    }
132					};
133
134
135					=head3 path
136
137					Optional. When passed to L<ETL::Pipeline/input>, this file becomes the input
138					source. No search or matching is performed. If you specify a relative path, it
139					is relative to L<ETL::Pipeline/data_in>.
140
141					Once the object has been created, this attribute holds the file that matched
142					search criteria. It should be used by your input source class as the file name.
143
144					# File inside of "data_in"...
145					$etl->input( 'Excel', path => 'Data.xlsx' );
146
147					# Absolute path name...
148					$etl->input( 'Excel', path => 'C:\Data.xlsx' );
149
150					# Inside the input source class...
151					open my $io, '<', $self->path;
152
153					=cut
154
155					has 'path' => (
156					    is     => 'ro',
157					    isa    => File,
158					    coerce => 1,              # lets callers pass a plain string
159					    writer => '_set_path',    # private setter used by the "run" modifier
160					);
161
162
163					=head3 skipping
164
165					Optional. B<skipping> jumps over a certain number of rows/lines at the beginning
166					of the file. Report formats often contain extra headers - even before the column
167					names. B<skipping> ignores those and starts processing at the data.
168
169					B<Note:> B<skipping> is applied I<before> reading column names.
170
171					B<skipping> accepts either an integer or code reference. An integer represents
172					the number of rows/records to ignore. For a code reference, the code discards
173					records until the subroutine returns a I<true> value.
174
175					# Bypass the first three rows.
176					$etl->input( 'Excel', skipping => 3 );
177
178					# Bypass until we find something in column 'C'.
179					$etl->input( 'Excel', skipping => sub { hascontent( $_->get( 'C' ) ) } );
180
181					The exact nature of the I<record> depends on the input file. For example,
182					Excel files will send a data row as a hash. But a CSV file would send a single
183					line of plain text with no parsing. See the input source to find out exactly
184					what it sends.
185
186					If your input source implements B<skipping>, you can pass whatever parameters
187					you want. For consistency, I recommend passing the raw data. If you are jumping
188					over report headers, they may not be formatted.
189
190					=cut
191
192					has 'skipping' => (
193					    is      => 'ro',
194					    isa     => 'CodeRef|Int',    # union type: a callback or a row count
195					    default => 0,                # skip nothing unless told otherwise
196					);
197
198
199					#-------------------------------------------------------------------------------
200					# Internal methods and attributes
201
202					# Holds the file search criteria captured from the constructor arguments. The
203					# search rule itself is only built when the run starts, because the search
204					# depends on "data_in". That way a script can set up the pipeline in whatever
205					# order it wants and the search still looks in the right place.
206					has '_criteria' => (
207					    is      => 'ro',
208					    isa     => 'HashRef[Any]',
209					    traits  => [qw/Hash/],
210					    default => sub { {} },
211					    handles => {_add_criteria => 'set', _search_criteria => 'kv'},
212					);
213
214
215					=head1 SEE ALSO
216
217					L<ETL::Pipeline>, L<ETL::Pipeline::Input>, L<ETL::Pipeline::Input::File::List>,
218					L<Path::Iterator::Rule>
219
220					=head1 AUTHOR
221
222					Robert Wohlfarth <robert.j.wohlfarth@vumc.org>
223
224					=head1 LICENSE
225
226					Copyright 2021 (c) Vanderbilt University Medical Center
227
228					This program is free software; you can redistribute it and/or modify it under
229					the same terms as Perl itself.
230
231					=cut
232
233	4	4		53	no Moose::Role;    # this file uses Moose::Role, so unimport that sugar
	4			11
	4			45
234
235					# Required by Perl to load the module.
236					1;