File Coverage

blib/lib/ETL/Pipeline/Input/Tabular.pm

Criterion	Covered	Total	%
statement	24	28	85.7
branch	4	6	66.6
condition			n/a
subroutine	7	8	87.5
pod	2	2	100.0
total	37	44	84.0

line	stmt	bran	sub	pod	time	code
1						=pod
2
3						=head1 NAME
4
5						ETL::Pipeline::Input::Tabular - Sequential input in rows and columns
6
7						=head1 SYNOPSIS
8
9						# In the input source...
10						use Moose;
11						with 'ETL::Pipeline::Input::Tabular';
12						...
13
14						=head1 DESCRIPTION
15
16						B<ETL::Pipeline::Input::Tabular> provides a common interface where the data
17						is in a table or columns. Spreadsheets and CSV files are considered I<tabular>.
18
19						While B<ETL::Pipeline::Input::Tabular> works with any sequential input source,
20						L<ETL::Pipeline::Input::File>s would be the most common.
21
22						=cut
23
24						package ETL::Pipeline::Input::Tabular;
25	3		3		2924	use Moose::Role;
	3				5
	3				19
26
27	3		3		10738	use 5.014000;
	3				8
28	3		3		1392	use List::AllUtils qw/indexes/;
	3				14086
	3				215
29	3		3		18	use String::Util qw/hascontent trim/;
	3				4
	3				1665
30
31
32						our $VERSION = '1.00';
33
34
35						=head1 METHODS & ATTRIBUTES
36
37						=head2 Arguments for L<ETL::Pipeline/input>
38
39						=head3 no_column_names
40
41						By default, B<ETL::Pipeline::Input::Tabular> assumes that the first data row
42						has column names (headers) and not real data. If your data does not have column
43						names, set this boolean flag to B<true>.
44
45						$etl->input( 'Excel', no_column_names => 1 );
46
47						=cut
48
49						has 'no_column_names' => (
50						default => 0,
51						is => 'ro',
52						isa => 'Bool',
53						);
54
55
56						=head3 skipping
57
58						B<skipping> jumps over a certain number of records in the beginning of the
59						file. Report formats often contain extra headers - even before the column
60						names. B<skipping> ignores those and starts processing at the data.
61
62						B<skipping> accepts either an integer or code reference. An integer represents
63						the number of rows/records to ignore. For a code reference, the code discards
64						records until the subroutine returns a I<true> value.
65
66						# Bypass the first three rows.
67						$etl->input( 'Excel', skipping => 3 );
68
69						# Bypass until we find something in column 'C'.
70						$etl->input( 'Excel', skipping => sub { hascontent( $_->get( 'C' ) ) } );
71
72						=cut
73
74						has 'skipping' => (
75						default => 0,
76						is => 'ro',
77						isa => 'CodeRef\|Int',
78						);
79
80
81						# This block of code implements both "skipping" and "no_column_names".
82						after 'configure' => sub {
83						my $self = shift @_;
84
85						# "skipping"
86						my $headers = $self->skipping;
87						if (ref( $headers ) eq 'CODE') {
88						do {
89						$self->next_record;
90						} until $self->pipeline->execute_code_ref( $headers );
91						$self->_cached( 1 );
92						} else { $self->next_record foreach (1 .. $headers); }
93
94						# "no_column_names"
95						$self->get_column_names unless $self->no_column_names;
96						};
97
98
99						# This attribute indicates if the next record has been cached in memory. When
100						# processing variable length report headers, I can't tell they end until I read
101						# the next line. If the next line is where your data starts, then I can't just
102						# throw it away. This attribute tells the code to process the current record in
103						# memory instead of reading one from disk.
104						#
105						# The code automatically adjusts the record count down, so that we don't count
106						# this record twice.
107						has '_cached' => (
108						default => 0,
109						is => 'rw',
110						isa => 'Bool',
111						trigger => \&_trigger_cached,
112						);
113
114
115						around 'next_record' => sub {
116						my ($original, $self, @arguments) = @_;
117
118						if ($self->_cached) {
119						$self->_cached( 0 );
120						return 1;
121						} else { return $original->( $self, @arguments ); }
122						};
123
124
125						sub _trigger_cached {
126	1		1		2	my ($self, $new, $old) = @_;
127	1	50			37	$self->decrement_record_number if $new;
128						}
129
130
131						=head2 Other Methods & Attributes
132
133						=head3 get_column_names
134
135						This method reads the column name row, parses it, and sets L</column_names>.
136						B<ETL::Pipeline::Input::TabularFile> knows nothing about the internal storage
137						of individual records. It relies on the implementing class for that ability.
138						That's where B<get_column_names> comes into play.
139
140						B<get_column_names> should call L</add_column> for each column name.
141
142						sub get_column_names {
143						my ($self) = @_;
144						$self->next_record;
145						# Loop through all of the fields...
146						$self->add_column( $value, $field );
147						}
148
149						=cut
150
151						requires 'get_column_names';
152
153
154						=head3 column_names
155
156						B<column_names> holds a list of the column names as read from the file. The
157						list is kept in file order. Duplicate names are allowed. B<column_names> is
158						filled when L</get_column_names> calls the L</add_column> method.
159
160						When L<ETL::Pipeline/mapping> calls L<ETL::Pipeline::Input/get>, this role
161						intercepts the call. The role translates column names or regular expressions
162						into actual field names. L<ETL::Pipeline::Input/get> returns a list of values
163						from all fields that match.
164
165						=cut
166
167						has 'column_names' => (
168						default => sub { [] },
169						handles => {
170						_add_column_name => 'push',
171						_clear_column_names => 'clear',
172						columns => 'elements',
173						_get_column_name => 'get',
174						number_of_columns => 'count',
175						},
176						init_arg => undef,
177						is => 'ro',
178						isa => 'ArrayRef[Str]',
179						traits => [qw/Array/],
180						);
181
182
183						# This private hash is used for non-numeric field names.
184						has '_column_mapping' => (
185						default => sub { {} },
186						handles => {
187						_clear_column_mapping => 'clear',
188						column_mapped => 'exists',
189						_get_field_names => 'get',
190						_set_column_mapping => 'set',
191						},
192						init_arg => undef,
193						is => 'ro',
194						isa => 'HashRef[ArrayRef[Any]]',
195						traits => [qw/Hash/],
196						);
197
198
199						around 'get' => sub {
200						my ($original, $self, $field, @arguments) = @_;
201
202						# Find the first match based on order fields appear in the file.
203						my @matches;
204						if (ref( $field ) eq 'Regexp') {
205						@matches = indexes { m/$field/ } $self->columns;
206						} else {
207						@matches = indexes { $_ eq $field } $self->columns;
208						}
209
210						# See if this column name maps to a field. If it doesn't, the index
211						# number is the real field name.
212						my @real_field;
213						foreach my $index (@matches) {
214						my $column = $self->_get_column_name( $index );
215						if ($self->column_mapped( $column )) {
216						push @real_field, @{$self->_get_field_names( $column )};
217						} else {
218						push @real_field, $index;
219						}
220						}
221
222						# Call the real "get" method with the translated field name.
223						if (scalar( @real_field ) == 0) {
224						if (ref( $field ) eq 'Regexp') {
225						return ();
226						} else {
227						return $original->( $self, $field, @arguments );
228						}
229						} else {
230						return map { $original->( $self, $_, @arguments ) } @real_field;
231						}
232						};
233
234
235						=head3 add_column
236
237						L</get_column_names> calls this method once for every column name.
238						B<add_column> puts the column name into L</column_names>.
239
240						L</get_column_names> passes in the column name as the first parameter and the
241						field name as the second. The field name is optional.
242						L<ETL::Pipeline::Input/get> will use the L</column_names> index as the field
243						name by default.
244
245						# Add column names for fields 0 and 1. No field name means that "get" uses
246						# the index numbers - 0 and 1.
247						$self->add_column( 'First' );
248						$self->add_column( 'Second' );
249
250						# Add column names for fields 'A' and 'B'. Always pass the field name if
251						# it's a string.
252						$self->add_column( 'First', 'A' );
253						$self->add_column( 'Second', 'B' );
254
255						B<Note:> B<add_column> trims leading and trailing whitespace from column names.
256
257						=cut
258
259						sub add_column {
260	32		32	1	261	my $self = shift;
261	32				66	my $name = trim( shift );
262
263	32				1317	$self->_add_column_name( $name );
264
265						# Always return the first field with a given name.
266	32	100			74	if (scalar( @_ ) > 0) {
267	26				29	my $field = shift;
268	26				848	my $mapping = $self->_get_field_names( $name );
269	26	50			40	if (defined $mapping) {
270	0				0	push @$mapping, $field;
271						} else {
272	26				833	$self->_set_column_mapping( $name, [$field] );
273						}
274						}
275						}
276
277
278						=head3 reset_column_names
279
280						This method wipes out the existing column names. It can be used from
281						L</get_column_names>.
282
283						$self->reset_column_names;
284
285						=cut
286
287						sub reset_column_names {
288	0		0	1		my ($self) = @_;
289	0					$self->_clear_column_mapping;
290	0					$self->_clear_column_names;
291						}
292
293
294						=head1 SEE ALSO
295
296						L<ETL::Pipeline>, L<ETL::Pipeline::Input>, L<ETL::Pipeline::Input::File>
297
298						=head1 AUTHOR
299
300						Robert Wohlfarth <robert.j.wohlfarth@vanderbilt.edu>
301
302						=head1 LICENSE
303
304						Copyright 2016 (c) Vanderbilt University Medical Center
305
306						This program is free software; you can redistribute it and/or modify it under
307						the same terms as Perl itself.
308
309						=cut
310
311	3		3		15	no Moose;
	3				3
	3				21
312
313						# Required by Perl to load the module.
314						1;