File Coverage

blib/lib/ETL/Pipeline/Input/Excel.pm
Criterion Covered Total %
statement 29 29 100.0
branch n/a
condition n/a
subroutine 10 10 100.0
pod n/a
total 39 39 100.0


line stmt bran cond sub pod time code
1             =pod
2              
3             =head1 NAME
4              
5             ETL::Pipeline::Input::Excel - Input source for Microsoft Excel spreadsheets
6              
7             =head1 SYNOPSIS
8              
9             use ETL::Pipeline;
10             ETL::Pipeline->new( {
11             input => ['Excel', iname => qr/\.xlsx$/i],
12             mapping => {First => 'A', Second => qr/ID\s*Num/i},
13             output => ['UnitTest']
14             } )->process;
15              
16             =head1 DESCRIPTION
17              
18             B<ETL::Pipeline::Input::Excel> defines an input source for reading MS Excel
19             spreadsheets. It uses L<Spreadsheet::XLSX> or L<Spreadsheet::ParseExcel>,
20             depending on the file type (XLSX or XLS).
21              
22             =cut
23              
24             package ETL::Pipeline::Input::Excel;
25              
26 1     1   33 use 5.014000;
  1         3  
27 1     1   8 use warnings;
  1         2  
  1         39  
28              
29 1     1   5 use Carp;
  1         2  
  1         93  
30 1     1   8 use List::AllUtils qw/first none/;
  1         3  
  1         65  
31 1     1   7 use Moose;
  1         3  
  1         12  
32 1     1   10062 use Spreadsheet::ParseExcel;
  1         54654  
  1         58  
33 1     1   15 use Spreadsheet::ParseExcel::Utility qw/int2col/;
  1         4  
  1         83  
34 1     1   766 use Spreadsheet::XLSX;
  1         86686  
  1         50  
35 1     1   15 use String::Util qw/hascontent/;
  1         3  
  1         766  
36              
37              
38             our $VERSION = '3.00';
39              
40              
41             =head1 METHODS & ATTRIBUTES
42              
43             =head2 Arguments for L<ETL::Pipeline/input>
44              
45             B<ETL::Pipeline::Input::Excel> implements the L<ETL::Pipeline::Input::File>
46             and L<ETL::Pipeline::Input::File::Table> roles. It supports all of the
47             attributes from these roles.
48              
49             =head3 worksheet
50              
51             Optional. B<worksheet> reads data from a specific worksheet. By default,
52             B<ETL::Pipeline::Input::Excel> uses the first worksheet.
53              
54             B<worksheet> accepts a string or regular expression. As a string, B<worksheet>
55             looks for an exact match. As a regular expression, B<worksheet> finds the first
56             worksheet whose name matches the regular expression. Note that B<worksheet>
57             stops looking once it finds the first mach.
58              
59             B<ETL::Pipeline::Input::Excel> throws an error if it cannot find a worksheet
60             with a matching name.
61              
62             =cut
63              
64             has 'worksheet' => (
65             is => 'ro',
66             isa => 'Maybe[RegexpRef|Str]',
67             );
68              
69              
70             =head3 password
71              
72             Optional. B<password> works with encrypted files. B<ETL::Pipeline::Input::Excel>
73             decrypts the file automatically.
74              
75             B<Warning:> B<password> only works with Excel 2003 file (XLS). Encrypted XLSX
76             files always fail. L<Spreadsheet::XLSX> does not support encryption.
77              
78             =cut
79              
80             has 'password' => (
81             is => 'ro',
82             isa => 'Maybe[Str]',
83             );
84              
85              
86             =head3 skipping
87              
88             Optional. If you use a code reference for B<skipping>, this input source sends a
89             hash reference. You can access the columns by number or letter.
90              
91             If you pass an integer, the input source completely skips over that many lines.
92              
93             =head2 Methods
94              
95             =head3 run
96              
97             This is the main loop. It opens the file, reads records, and closes it when
98             done. This is the place to look if there are problems.
99              
100             L<ETL::Pipeline> automatically calls this method.
101              
102             =cut
103              
104             sub run {
105             my ($self, $etl) = @_;
106              
107             #----------------------------------------------------------------------
108             # Open the file.
109              
110             # Create the correct workbook object based on the file format.
111             my $path = $self->path;
112             my $workbook;
113              
114             if ("$path" =~ m/\.xls$/i) {
115             my $excel = Spreadsheet::ParseExcel->new( Password => $self->password );
116             $workbook = $excel->parse( "$path" );
117             croak "Unable to open the Excel file '$path'" unless defined $workbook;
118             } else {
119             $workbook = Spreadsheet::XLSX->new( "$path" );
120             croak "Unable to open the Excel file '$path'" unless defined $workbook;
121             }
122              
123             # Find the worksheet with data...
124             my $name = $self->worksheet;
125             my $worksheet;
126             if (hascontent( $name )) {
127             if (ref( $name ) eq 'Regexp') {
128             $worksheet = first { $_->get_name() =~ m/$name/ } $workbook->worksheets();
129             } else {
130             $worksheet = $workbook->worksheet( $name );
131             }
132             croak "No worksheets match '$name'" unless defined $worksheet;
133             } else {
134             $worksheet = $workbook->worksheet( 0 );
135             croak "'$path' has no worksheets" unless defined $worksheet;
136             }
137              
138             # Convert the column numbers into their letter designations.
139             my %names;
140             $names{int2col( $_ )} = $_ foreach ($worksheet->{MinCol} .. $worksheet->{MaxCol});
141             $etl->aliases( \%names );
142              
143             #----------------------------------------------------------------------
144             # Read the records.
145             my $cells = $worksheet->{Cells};
146             my $start = $worksheet->{MinRow};
147              
148             # Skip over report headers. These are not data. They are extra rows put
149             # there for report formats. The data starts after these rows.
150             my $skip = $self->skipping;
151             if (ref( $skip ) eq 'CODE') {
152             while ($start <= $worksheet->{MaxRow}) {
153             my %record;
154             foreach my $column ($worksheet->{MinCol} .. $worksheet->{MaxCol}) {
155             $record{$column} = $cells->[$start][$column]->value;
156             }
157             foreach my $alias ($etl->aliases) {
158             while (my ($name, $column) = each %$alias) {
159             $record{$name} = $record{$column} if
160             exists( $record{$column} )
161             && !exists( $record{$name} )
162             ;
163             }
164             }
165             last if !$skip->( \%record );
166             $start++;
167             }
168             } elsif ($skip > 0) {
169             $start += $skip;
170             }
171              
172             # Load field names.
173             unless ($self->no_column_names) {
174             my @names;
175             push( @names, {$cells->[$start][$_]->value => $_} )
176             foreach ($worksheet->{MinCol} .. $worksheet->{MaxCol});
177             $etl->aliases( @names );
178              
179             $start++;
180             }
181              
182             # Load the data.
183             my $source = $self->source;
184             foreach my $row ($start .. $worksheet->{MaxRow}) {
185             $self->source( sprintf( '%s at row %d', $source, $row + 1 ) );
186              
187             my $record = [];
188             push( @$record, $cells->[$row][$_]->value )
189             foreach ($worksheet->{MinCol} .. $worksheet->{MaxCol});
190             $etl->record( $record );
191             }
192             }
193              
194              
195             =head1 SEE ALSO
196              
197             L<ETL::Pipeline>, L<ETL::Pipeline::Input>, L<ETL::Pipeline::Input::File>,
198             L<ETL::Pipeline::Input::File::Table>
199              
200             =cut
201              
202             with 'ETL::Pipeline::Input';
203             with 'ETL::Pipeline::Input::File';
204             with 'ETL::Pipeline::Input::File::Table';
205              
206              
207             =head1 AUTHOR
208              
209             Robert Wohlfarth <robert.j.wohlfarth@vumc.org>
210              
211             =head1 LICENSE
212              
213             Copyright 2021 (c) Vanderbilt University Medical Center
214              
215             This program is free software; you can redistribute it and/or modify it under
216             the same terms as Perl itself.
217              
218             =cut
219              
220 1     1   10 no Moose;
  1         2  
  1         11  
221             __PACKAGE__->meta->make_immutable;