line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
=pod |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
=head1 NAME |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
ETL::Pipeline::Input::Excel - Input source for Microsoft Excel spreadsheets |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=head1 SYNOPSIS |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
use ETL::Pipeline; |
10
|
|
|
|
|
|
|
ETL::Pipeline->new( { |
11
|
|
|
|
|
|
|
input => ['Excel', iname => qr/\.xlsx$/i], |
12
|
|
|
|
|
|
|
mapping => {First => 'A', Second => qr/ID\s*Num/i}, |
13
|
|
|
|
|
|
|
output => ['UnitTest'] |
14
|
|
|
|
|
|
|
} )->process; |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
=head1 DESCRIPTION |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
B<ETL::Pipeline::Input::Excel> defines an input source for reading MS Excel |
19
|
|
|
|
|
|
|
spreadsheets. It uses L<Spreadsheet::XLSX> or L<Spreadsheet::ParseExcel>, |
20
|
|
|
|
|
|
|
depending on the file type (XLSX or XLS). |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
=cut |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
package ETL::Pipeline::Input::Excel; |
25
|
|
|
|
|
|
|
|
26
|
1
|
|
|
1
|
|
33
|
use 5.014000; |
|
1
|
|
|
|
|
3
|
|
27
|
1
|
|
|
1
|
|
8
|
use warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
39
|
|
28
|
|
|
|
|
|
|
|
29
|
1
|
|
|
1
|
|
5
|
use Carp; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
93
|
|
30
|
1
|
|
|
1
|
|
8
|
use List::AllUtils qw/first none/; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
65
|
|
31
|
1
|
|
|
1
|
|
7
|
use Moose; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
12
|
|
32
|
1
|
|
|
1
|
|
10062
|
use Spreadsheet::ParseExcel; |
|
1
|
|
|
|
|
54654
|
|
|
1
|
|
|
|
|
58
|
|
33
|
1
|
|
|
1
|
|
15
|
use Spreadsheet::ParseExcel::Utility qw/int2col/; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
83
|
|
34
|
1
|
|
|
1
|
|
766
|
use Spreadsheet::XLSX; |
|
1
|
|
|
|
|
86686
|
|
|
1
|
|
|
|
|
50
|
|
35
|
1
|
|
|
1
|
|
15
|
use String::Util qw/hascontent/; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
766
|
|
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
our $VERSION = '3.00'; |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
=head1 METHODS & ATTRIBUTES |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
=head2 Arguments for L<ETL::Pipeline/input> |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
B<ETL::Pipeline::Input::Excel> implements the L<ETL::Pipeline::Input::File> |
46
|
|
|
|
|
|
|
and L<ETL::Pipeline::Input::File::Table> roles. It supports all of the |
47
|
|
|
|
|
|
|
attributes from these roles. |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
=head3 worksheet |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
Optional. B<worksheet> reads data from a specific worksheet. By default, |
52
|
|
|
|
|
|
|
B<ETL::Pipeline::Input::Excel> uses the first worksheet. |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
B<worksheet> accepts a string or regular expression. As a string, B<worksheet> |
55
|
|
|
|
|
|
|
looks for an exact match. As a regular expression, B<worksheet> finds the first |
56
|
|
|
|
|
|
|
worksheet whose name matches the regular expression. Note that B<worksheet> |
57
|
|
|
|
|
|
|
stops looking once it finds the first mach. |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
B<ETL::Pipeline::Input::Excel> throws an error if it cannot find a worksheet |
60
|
|
|
|
|
|
|
with a matching name. |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
=cut |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
has 'worksheet' => ( |
65
|
|
|
|
|
|
|
is => 'ro', |
66
|
|
|
|
|
|
|
isa => 'Maybe[RegexpRef|Str]', |
67
|
|
|
|
|
|
|
); |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
=head3 password |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
Optional. B<password> works with encrypted files. B<ETL::Pipeline::Input::Excel> |
73
|
|
|
|
|
|
|
decrypts the file automatically. |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
B<Warning:> B<password> only works with Excel 2003 file (XLS). Encrypted XLSX |
76
|
|
|
|
|
|
|
files always fail. L<Spreadsheet::XLSX> does not support encryption. |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
=cut |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
has 'password' => ( |
81
|
|
|
|
|
|
|
is => 'ro', |
82
|
|
|
|
|
|
|
isa => 'Maybe[Str]', |
83
|
|
|
|
|
|
|
); |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
=head3 skipping |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
Optional. If you use a code reference for B<skipping>, this input source sends a |
89
|
|
|
|
|
|
|
hash reference. You can access the columns by number or letter. |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
If you pass an integer, the input source completely skips over that many lines. |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
=head2 Methods |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
=head3 run |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
This is the main loop. It opens the file, reads records, and closes it when |
98
|
|
|
|
|
|
|
done. This is the place to look if there are problems. |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
L<ETL::Pipeline> automatically calls this method. |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
=cut |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
sub run { |
105
|
|
|
|
|
|
|
my ($self, $etl) = @_; |
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
#---------------------------------------------------------------------- |
108
|
|
|
|
|
|
|
# Open the file. |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
# Create the correct workbook object based on the file format. |
111
|
|
|
|
|
|
|
my $path = $self->path; |
112
|
|
|
|
|
|
|
my $workbook; |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
if ("$path" =~ m/\.xls$/i) { |
115
|
|
|
|
|
|
|
my $excel = Spreadsheet::ParseExcel->new( Password => $self->password ); |
116
|
|
|
|
|
|
|
$workbook = $excel->parse( "$path" ); |
117
|
|
|
|
|
|
|
croak "Unable to open the Excel file '$path'" unless defined $workbook; |
118
|
|
|
|
|
|
|
} else { |
119
|
|
|
|
|
|
|
$workbook = Spreadsheet::XLSX->new( "$path" ); |
120
|
|
|
|
|
|
|
croak "Unable to open the Excel file '$path'" unless defined $workbook; |
121
|
|
|
|
|
|
|
} |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
# Find the worksheet with data... |
124
|
|
|
|
|
|
|
my $name = $self->worksheet; |
125
|
|
|
|
|
|
|
my $worksheet; |
126
|
|
|
|
|
|
|
if (hascontent( $name )) { |
127
|
|
|
|
|
|
|
if (ref( $name ) eq 'Regexp') { |
128
|
|
|
|
|
|
|
$worksheet = first { $_->get_name() =~ m/$name/ } $workbook->worksheets(); |
129
|
|
|
|
|
|
|
} else { |
130
|
|
|
|
|
|
|
$worksheet = $workbook->worksheet( $name ); |
131
|
|
|
|
|
|
|
} |
132
|
|
|
|
|
|
|
croak "No worksheets match '$name'" unless defined $worksheet; |
133
|
|
|
|
|
|
|
} else { |
134
|
|
|
|
|
|
|
$worksheet = $workbook->worksheet( 0 ); |
135
|
|
|
|
|
|
|
croak "'$path' has no worksheets" unless defined $worksheet; |
136
|
|
|
|
|
|
|
} |
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
# Convert the column numbers into their letter designations. |
139
|
|
|
|
|
|
|
my %names; |
140
|
|
|
|
|
|
|
$names{int2col( $_ )} = $_ foreach ($worksheet->{MinCol} .. $worksheet->{MaxCol}); |
141
|
|
|
|
|
|
|
$etl->aliases( \%names ); |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
#---------------------------------------------------------------------- |
144
|
|
|
|
|
|
|
# Read the records. |
145
|
|
|
|
|
|
|
my $cells = $worksheet->{Cells}; |
146
|
|
|
|
|
|
|
my $start = $worksheet->{MinRow}; |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
# Skip over report headers. These are not data. They are extra rows put |
149
|
|
|
|
|
|
|
# there for report formats. The data starts after these rows. |
150
|
|
|
|
|
|
|
my $skip = $self->skipping; |
151
|
|
|
|
|
|
|
if (ref( $skip ) eq 'CODE') { |
152
|
|
|
|
|
|
|
while ($start <= $worksheet->{MaxRow}) { |
153
|
|
|
|
|
|
|
my %record; |
154
|
|
|
|
|
|
|
foreach my $column ($worksheet->{MinCol} .. $worksheet->{MaxCol}) { |
155
|
|
|
|
|
|
|
$record{$column} = $cells->[$start][$column]->value; |
156
|
|
|
|
|
|
|
} |
157
|
|
|
|
|
|
|
foreach my $alias ($etl->aliases) { |
158
|
|
|
|
|
|
|
while (my ($name, $column) = each %$alias) { |
159
|
|
|
|
|
|
|
$record{$name} = $record{$column} if |
160
|
|
|
|
|
|
|
exists( $record{$column} ) |
161
|
|
|
|
|
|
|
&& !exists( $record{$name} ) |
162
|
|
|
|
|
|
|
; |
163
|
|
|
|
|
|
|
} |
164
|
|
|
|
|
|
|
} |
165
|
|
|
|
|
|
|
last if !$skip->( \%record ); |
166
|
|
|
|
|
|
|
$start++; |
167
|
|
|
|
|
|
|
} |
168
|
|
|
|
|
|
|
} elsif ($skip > 0) { |
169
|
|
|
|
|
|
|
$start += $skip; |
170
|
|
|
|
|
|
|
} |
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
# Load field names. |
173
|
|
|
|
|
|
|
unless ($self->no_column_names) { |
174
|
|
|
|
|
|
|
my @names; |
175
|
|
|
|
|
|
|
push( @names, {$cells->[$start][$_]->value => $_} ) |
176
|
|
|
|
|
|
|
foreach ($worksheet->{MinCol} .. $worksheet->{MaxCol}); |
177
|
|
|
|
|
|
|
$etl->aliases( @names ); |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
$start++; |
180
|
|
|
|
|
|
|
} |
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
# Load the data. |
183
|
|
|
|
|
|
|
my $source = $self->source; |
184
|
|
|
|
|
|
|
foreach my $row ($start .. $worksheet->{MaxRow}) { |
185
|
|
|
|
|
|
|
$self->source( sprintf( '%s at row %d', $source, $row + 1 ) ); |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
my $record = []; |
188
|
|
|
|
|
|
|
push( @$record, $cells->[$row][$_]->value ) |
189
|
|
|
|
|
|
|
foreach ($worksheet->{MinCol} .. $worksheet->{MaxCol}); |
190
|
|
|
|
|
|
|
$etl->record( $record ); |
191
|
|
|
|
|
|
|
} |
192
|
|
|
|
|
|
|
} |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
=head1 SEE ALSO |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
L<ETL::Pipeline>, L<ETL::Pipeline::Input>, L<ETL::Pipeline::Input::File>, |
198
|
|
|
|
|
|
|
L<ETL::Pipeline::Input::File::Table> |
199
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
=cut |
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
with 'ETL::Pipeline::Input'; |
203
|
|
|
|
|
|
|
with 'ETL::Pipeline::Input::File'; |
204
|
|
|
|
|
|
|
with 'ETL::Pipeline::Input::File::Table'; |
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
=head1 AUTHOR |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
Robert Wohlfarth <robert.j.wohlfarth@vumc.org> |
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
=head1 LICENSE |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
Copyright 2021 (c) Vanderbilt University Medical Center |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it under |
216
|
|
|
|
|
|
|
the same terms as Perl itself. |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
=cut |
219
|
|
|
|
|
|
|
|
220
|
1
|
|
|
1
|
|
10
|
no Moose; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
11
|
|
221
|
|
|
|
|
|
|
__PACKAGE__->meta->make_immutable; |