| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
=pod |
|
2
|
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
=head1 NAME |
|
4
|
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
ETL::Pipeline - Extract-Transform-Load pattern for data file conversions |
|
6
|
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
8
|
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
use ETL::Pipeline; |
|
10
|
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
# The object oriented interface... |
|
12
|
|
|
|
|
|
|
ETL::Pipeline->new( { |
|
13
|
|
|
|
|
|
|
work_in => {search => 'C:\Data', iname => qr/Ficticious/}, |
|
14
|
|
|
|
|
|
|
input => ['Excel', iname => qr/\.xlsx?$/ ], |
|
15
|
|
|
|
|
|
|
mapping => {Name => 'A', Address => 'B', ID => 'C' }, |
|
16
|
|
|
|
|
|
|
constants => {Type => 1, Information => 'Demographic' }, |
|
17
|
|
|
|
|
|
|
output => ['Memory', key => 'ID' ], |
|
18
|
|
|
|
|
|
|
} )->process; |
|
19
|
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
# Or using method calls... |
|
21
|
|
|
|
|
|
|
my $etl = ETL::Pipeline->new; |
|
22
|
|
|
|
|
|
|
$etl->work_in ( search => 'C:\Data', iname => qr/Ficticious/ ); |
|
23
|
|
|
|
|
|
|
$etl->input ( 'Excel', iname => qr/\.xlsx?$/i ); |
|
24
|
|
|
|
|
|
|
$etl->mapping ( Name => 'A', Address => 'B', ID => 'C' ); |
|
25
|
|
|
|
|
|
|
$etl->constants( Type => 1, Information => 'Demographic' ); |
|
26
|
|
|
|
|
|
|
$etl->output ( 'Memory', key => 'ID' ); |
|
27
|
|
|
|
|
|
|
$etl->process; |
|
28
|
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
=cut |
|
30
|
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
package ETL::Pipeline; |
|
32
|
|
|
|
|
|
|
|
|
33
|
14
|
|
|
14
|
|
5829
|
use 5.021000; # Required for "no warnings 'redundant'". |
|
|
14
|
|
|
|
|
21402
|
|
|
34
|
14
|
|
|
14
|
|
184
|
use warnings; |
|
|
14
|
|
|
|
|
538
|
|
|
|
14
|
|
|
|
|
12525
|
|
|
35
|
|
|
|
|
|
|
|
|
36
|
14
|
|
|
14
|
|
159
|
use Carp; |
|
|
14
|
|
|
|
|
49
|
|
|
|
14
|
|
|
|
|
901
|
|
|
37
|
14
|
|
|
14
|
|
5037
|
use Data::DPath qw/dpath/; |
|
|
14
|
|
|
|
|
1057442
|
|
|
|
14
|
|
|
|
|
82
|
|
|
38
|
14
|
|
|
14
|
|
6702
|
use Data::Traverse qw/traverse/; |
|
|
14
|
|
|
|
|
6337
|
|
|
|
14
|
|
|
|
|
548
|
|
|
39
|
14
|
|
|
14
|
|
5565
|
use List::AllUtils qw/any first/; |
|
|
14
|
|
|
|
|
154478
|
|
|
|
14
|
|
|
|
|
931
|
|
|
40
|
14
|
|
|
14
|
|
5566
|
use Moose; |
|
|
14
|
|
|
|
|
4142069
|
|
|
|
14
|
|
|
|
|
91
|
|
|
41
|
14
|
|
|
14
|
|
131335
|
use MooseX::Types::Path::Class qw/Dir/; |
|
|
14
|
|
|
|
|
1337804
|
|
|
|
14
|
|
|
|
|
98
|
|
|
42
|
14
|
|
|
11
|
|
18371
|
use Path::Class::Rule; |
|
|
11
|
|
|
|
|
119201
|
|
|
|
11
|
|
|
|
|
383
|
|
|
43
|
11
|
|
|
11
|
|
154
|
use Scalar::Util qw/blessed/; |
|
|
11
|
|
|
|
|
23
|
|
|
|
11
|
|
|
|
|
494
|
|
|
44
|
11
|
|
|
11
|
|
5849
|
use String::Util qw/hascontent trim/; |
|
|
11
|
|
|
|
|
33485
|
|
|
|
11
|
|
|
|
|
32003
|
|
|
45
|
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
our $VERSION = '3.10'; |
|
48
|
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
51
|
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
B<ETL> stands for I<Extract-Transform-Load>. ETL isn't just for Data |
|
53
|
|
|
|
|
|
|
Warehousing. ETL works on almost any type of data conversion. You read the |
|
54
|
|
|
|
|
|
|
source, translate the data for your target, and store the result. |
|
55
|
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
By dividing a conversion into 3 steps, we isolate the input from the output... |
|
57
|
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
=over |
|
59
|
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
=item * Centralizes data formatting and validation. |
|
61
|
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
=item * Makes new input formats a breeze. |
|
63
|
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
=item * Makes new outputs just as easy. |
|
65
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
=back |
|
67
|
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
B<ETL::Pipeline> takes your data files from extract to load. It reads an input |
|
69
|
|
|
|
|
|
|
source, translates the data, and writes it to an output destination. For |
|
70
|
|
|
|
|
|
|
example, this pipeline reads an Excel spread sheet (input) and saves the |
|
71
|
|
|
|
|
|
|
information in a Perl hash (output). |
|
72
|
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
use ETL::Pipeline; |
|
74
|
|
|
|
|
|
|
ETL::Pipeline->new( { |
|
75
|
|
|
|
|
|
|
work_in => {search => 'C:\Data', find => qr/Ficticious/}, |
|
76
|
|
|
|
|
|
|
input => ['Excel', find => qr/\.xlsx?$/], |
|
77
|
|
|
|
|
|
|
mapping => {Name => 'A', Complaint => 'B', ID => 'C'}, |
|
78
|
|
|
|
|
|
|
constants => {Client => 1, Type => 'Complaint'} |
|
79
|
|
|
|
|
|
|
output => ['Memory', key => 'ID'] |
|
80
|
|
|
|
|
|
|
} )->process; |
|
81
|
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
Or like this, calling the methods instead of through the constructor... |
|
83
|
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
use ETL::Pipeline; |
|
85
|
|
|
|
|
|
|
my $etl = ETL::Pipeline->new; |
|
86
|
|
|
|
|
|
|
$etl->work_in ( search => 'C:\Data', find => qr/Ficticious/ ); |
|
87
|
|
|
|
|
|
|
$etl->input ( 'Excel', find => qr/\.xlsx?$/ ); |
|
88
|
|
|
|
|
|
|
$etl->mapping ( Name => 'A', Complaint => 'B', ID => 'C' ); |
|
89
|
|
|
|
|
|
|
$etl->constants( Client => 1, Type => 'Complaint' ); |
|
90
|
|
|
|
|
|
|
$etl->output ( 'Memory', key => 'ID' ); |
|
91
|
|
|
|
|
|
|
$etl->process; |
|
92
|
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
These are equivalent. They do exactly the same thing. You can pick whichever |
|
94
|
|
|
|
|
|
|
best suits your style. |
|
95
|
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
=head2 What is a pipeline? |
|
97
|
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
The term I<pipeline> describes a complete ETL process - extract, transform, |
|
99
|
|
|
|
|
|
|
and load. Or more accurately - input, mapping, output. Raw data enters one end |
|
100
|
|
|
|
|
|
|
of the pipe (input) and useful information comes out the other (output). An |
|
101
|
|
|
|
|
|
|
B<ETL::Pipeline> object represents a complete pipeline. |
|
102
|
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
=head2 Upgrade Warning |
|
104
|
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
B<WARNING:> The API for input sources has changed in version 3.00. Custom input |
|
106
|
|
|
|
|
|
|
sources written for an earlier version will not work with version 3.00 and |
|
107
|
|
|
|
|
|
|
later. You will need to re-write your custom input sources. |
|
108
|
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
See L<ETL::Pipeline::Input> for more details. |
|
110
|
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
=head1 METHODS & ATTRIBUTES |
|
112
|
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
=head2 Managing the pipeline |
|
114
|
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
=head3 new |
|
116
|
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
Create a new ETL pipeline. The constructor accepts a hash reference whose keys |
|
118
|
|
|
|
|
|
|
are B<ETL::Pipeline> attributes. See the corresponding attribute documentation |
|
119
|
|
|
|
|
|
|
for details about acceptable values. |
|
120
|
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
=over |
|
122
|
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
=item aliases |
|
124
|
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
=item constants |
|
126
|
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
=item data_in |
|
128
|
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
=item input |
|
130
|
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
=item mapping |
|
132
|
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
=item on_record |
|
134
|
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
=item output |
|
136
|
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
=item session |
|
138
|
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
=item work_in |
|
140
|
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
=back |
|
142
|
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
=cut |
|
144
|
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
sub BUILD { |
|
146
|
73
|
|
|
73
|
0
|
192
|
my $self = shift; |
|
147
|
73
|
|
|
|
|
139
|
my $arguments = shift; |
|
148
|
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
# "_chain" is a special argument to the constructor that implements the |
|
150
|
|
|
|
|
|
|
# "chain" method. It copies information from an existing object. This allows |
|
151
|
|
|
|
|
|
|
# pipelines to share settings. |
|
152
|
|
|
|
|
|
|
# |
|
153
|
|
|
|
|
|
|
# Always handle "_chain" first. That way "work_in" and "data_in" arguments |
|
154
|
|
|
|
|
|
|
# can override the defaults. |
|
155
|
73
|
100
|
|
|
|
340
|
if (defined $arguments->{_chain}) { |
|
156
|
4
|
|
|
|
|
29
|
my $object = $arguments->{_chain}; |
|
157
|
4
|
50
|
33
|
|
|
42
|
croak '"chain" requires an ETL::Pipeline object' unless |
|
158
|
|
|
|
|
|
|
defined( blessed( $object ) ) |
|
159
|
|
|
|
|
|
|
&& $object->isa( 'ETL::Pipeline' ) |
|
160
|
|
|
|
|
|
|
; |
|
161
|
4
|
100
|
|
|
|
86
|
$self->_work_in( $object->_work_in ) if defined $object->_work_in; |
|
162
|
4
|
100
|
|
|
|
96
|
$self->_data_in( $object->_data_in ) if defined $object->_data_in; |
|
163
|
4
|
|
|
|
|
82
|
$self->_session( $object->_session ); |
|
164
|
|
|
|
|
|
|
} |
|
165
|
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
# The order of these two is important. "work_in" resets "data_in" with a |
|
167
|
|
|
|
|
|
|
# trigger. "work_in" must be set first so that we don't lose the value |
|
168
|
|
|
|
|
|
|
# from "data_in". |
|
169
|
73
|
100
|
|
|
|
256
|
if (defined $arguments->{work_in}) { |
|
170
|
65
|
|
|
|
|
196
|
my $values = $arguments->{work_in}; |
|
171
|
65
|
50
|
|
|
|
390
|
$self->work_in( ref( $values ) eq '' ? $values : @$values ); |
|
172
|
|
|
|
|
|
|
} |
|
173
|
73
|
100
|
|
|
|
295
|
if (defined $arguments->{data_in}) { |
|
174
|
3
|
|
|
|
|
47
|
my $values = $arguments->{data_in}; |
|
175
|
3
|
50
|
|
|
|
21
|
$self->data_in( ref( $values ) eq '' ? $values : @$values ); |
|
176
|
|
|
|
|
|
|
} |
|
177
|
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
# The input and output configurations may be single string values or an |
|
179
|
|
|
|
|
|
|
# array of arguments. It depends on what each source or destination expects. |
|
180
|
73
|
100
|
|
|
|
351
|
if (defined $arguments->{input}) { |
|
181
|
65
|
|
|
|
|
190
|
my $values = $arguments->{input}; |
|
182
|
65
|
100
|
|
|
|
390
|
$self->input( ref( $values ) eq '' ? $values : @$values ); |
|
183
|
|
|
|
|
|
|
} |
|
184
|
73
|
100
|
|
|
|
292
|
if (defined $arguments->{output}) { |
|
185
|
65
|
|
|
|
|
202
|
my $values = $arguments->{output}; |
|
186
|
65
|
100
|
|
|
|
369
|
$self->output( ref( $values ) eq '' ? $values : @$values ); |
|
187
|
|
|
|
|
|
|
} |
|
188
|
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
# Save any alias definition for use in "record". |
|
190
|
73
|
50
|
|
|
|
1726
|
if (defined $arguments->{aliases}) { |
|
191
|
1
|
0
|
|
|
|
30
|
if (ref( $arguments->{aliases} ) eq 'ARRAY') { |
|
192
|
1
|
|
|
|
|
9
|
$self->aliases( @{$arguments->{aliases}} ); |
|
|
1
|
|
|
|
|
2
|
|
|
193
|
|
|
|
|
|
|
} else { |
|
194
|
1
|
|
|
|
|
40
|
$self->aliases( $arguments->{aliases} ); |
|
195
|
|
|
|
|
|
|
} |
|
196
|
|
|
|
|
|
|
} |
|
197
|
|
|
|
|
|
|
} |
|
198
|
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
=head3 aliases |
|
201
|
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
B<aliases> defines alternate names for input fields. This is how column headers |
|
203
|
|
|
|
|
|
|
work, for example. You can define your own shortcuts using this method or |
|
204
|
|
|
|
|
|
|
declaring B<aliases> in L</new>. Aliases can make complex field names more |
|
205
|
|
|
|
|
|
|
readable. |
|
206
|
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
B<aliases> accepts a list of hash references. Each hash reference has one or |
|
208
|
|
|
|
|
|
|
more alias-to-field definitions. The hash key is the alias name. The value is |
|
209
|
|
|
|
|
|
|
any field name recognized by L</get>. |
|
210
|
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
Aliases are resolved in the order they are added. That way, your pipelines know |
|
212
|
|
|
|
|
|
|
where each value came from, if that's important. Aliases set by the input source |
|
213
|
|
|
|
|
|
|
always sort before aliases set by the script. Within a hash, all definitions are |
|
214
|
|
|
|
|
|
|
considered equal and may sort in any order. |
|
215
|
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
# Array definitions to force sorting. |
|
217
|
|
|
|
|
|
|
my $etl = ETL::Pipeline->new( {aliases => [{A => '0'}, {B => '1'}], ...} ); |
|
218
|
|
|
|
|
|
|
$etl->aliases( {C => '2'}, {D => '3'} ); |
|
219
|
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
# Hash where it can sort either way. |
|
221
|
|
|
|
|
|
|
my $etl = ETL::Pipeline->new( {aliases => {A => '0', B => '1'}, ...} ); |
|
222
|
|
|
|
|
|
|
$etl->aliases( {C => '2', D => '3'} ); |
|
223
|
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
B<aliases> returns a sorted list of all aliases for fields in this input source. |
|
225
|
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
I recommend using the hash, unless order matters. In that case, use the array |
|
227
|
|
|
|
|
|
|
form instead. |
|
228
|
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
B<Special Note:> Custom input sources call B<aliases> to add their own |
|
230
|
|
|
|
|
|
|
shortcuts, such as column headers. These aliases are always evaluated I<before> |
|
231
|
|
|
|
|
|
|
those set by L</new> or calls to this method by the script. |
|
232
|
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
=cut |
|
234
|
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
sub aliases { |
|
236
|
85
|
|
|
85
|
1
|
167
|
my $self = shift; |
|
237
|
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
# Add any new aliases first. |
|
239
|
85
|
|
|
|
|
1888
|
my $list = $self->_alias->{$self->_alias_type}; |
|
240
|
85
|
|
|
|
|
356
|
push( @$list, $_ ) foreach (@_); |
|
241
|
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
# Update the cache, if it already exists. This should be VERY, VERY rare. |
|
243
|
|
|
|
|
|
|
# But I wanted to plan for it so that things behave as expected. |
|
244
|
85
|
100
|
|
|
|
2430
|
if ($self->_alias_cache_built) { |
|
245
|
2
|
|
|
|
|
23
|
my $cache = $self->_alias_cache; |
|
246
|
2
|
|
|
|
|
72
|
foreach my $item (@_) { |
|
247
|
1
|
|
|
|
|
7
|
while (my ($alias, $location) = each %$item) { |
|
248
|
1
|
0
|
|
|
|
138
|
$cache->{$alias} = [] unless exists $cache->{alias}; |
|
249
|
1
|
|
|
|
|
28
|
push @{$cache->{alias}}, $self->_as_dpath( $location ); |
|
|
1
|
|
|
|
|
9
|
|
|
250
|
|
|
|
|
|
|
} |
|
251
|
|
|
|
|
|
|
} |
|
252
|
|
|
|
|
|
|
} |
|
253
|
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
# Return a flattened list of aliases. Input source defined aliases first. |
|
255
|
|
|
|
|
|
|
# Then user defined aliases. |
|
256
|
85
|
|
|
|
|
142
|
my @all; |
|
257
|
85
|
|
|
|
|
289
|
push( @all, @{$self->_alias->{$_}} ) foreach (qw/input pipeline/); |
|
|
169
|
|
|
|
|
3143
|
|
|
258
|
85
|
|
|
|
|
234
|
return @all; |
|
259
|
|
|
|
|
|
|
} |
|
260
|
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
=head3 chain |
|
263
|
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
This method creates a new pipeline using the same L</work_in>, L</data_in>, and |
|
265
|
|
|
|
|
|
|
L</session> as the current pipeline. It returns a new instance of |
|
266
|
|
|
|
|
|
|
B<ETL::Pipeline>. |
|
267
|
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
B<chain> takes the same arguments as L</new>. It passes those arguments through |
|
269
|
|
|
|
|
|
|
to the constructor of the new object. |
|
270
|
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
See the section on L</Multiple input sources> for examples of chaining. |
|
272
|
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
=cut |
|
274
|
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
sub chain { |
|
276
|
4
|
|
|
4
|
1
|
58
|
my ($self, $arguments) = @_; |
|
277
|
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
# Create the new object. Use the internal "_chain" argument to do the |
|
279
|
|
|
|
|
|
|
# actual work of chaining. |
|
280
|
4
|
50
|
|
|
|
21
|
if (defined $arguments) { $arguments->{_chain} = $self ; } |
|
|
1
|
|
|
|
|
3
|
|
|
281
|
4
|
|
|
|
|
64
|
else { $arguments = {_chain => $self}; } |
|
282
|
|
|
|
|
|
|
|
|
283
|
4
|
|
|
|
|
94
|
return ETL::Pipeline->new( $arguments ); |
|
284
|
|
|
|
|
|
|
} |
|
285
|
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
=head3 constants |
|
288
|
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
B<constants> sets output fields to literal values. L</mapping> accepts input |
|
290
|
|
|
|
|
|
|
field names as strings. Instead of obtuse Perl tricks for marking literals, |
|
291
|
|
|
|
|
|
|
B<constants> explicitly handles them. |
|
292
|
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
Hash keys are output field names. The L</output> class defines acceptable |
|
294
|
|
|
|
|
|
|
field names. The hash values are literals. |
|
295
|
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
# Set the output field "Name" to the string "John Doe"... |
|
297
|
|
|
|
|
|
|
$etl->constants( Name => 'John Doe' ); |
|
298
|
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
# Get the current list of constants... |
|
300
|
|
|
|
|
|
|
my $transformation = $etl->constants; |
|
301
|
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
B<Note:> B<constants> does not accept code references, array references, or hash |
|
303
|
|
|
|
|
|
|
references. It only works with literal values. Use L</mapping> instead for |
|
304
|
|
|
|
|
|
|
calculated items. |
|
305
|
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
With no parameters, B<constants> returns the current hash reference. If you pass |
|
307
|
|
|
|
|
|
|
in a hash reference, B<constants> replaces the current hash with this new one. |
|
308
|
|
|
|
|
|
|
If you pass in a list of key value pairs, B<constants> adds them to the current |
|
309
|
|
|
|
|
|
|
hash. |
|
310
|
|
|
|
|
|
|
|
|
311
|
|
|
|
|
|
|
=cut |
|
312
|
|
|
|
|
|
|
|
|
313
|
|
|
|
|
|
|
has '_constants' => ( |
|
314
|
|
|
|
|
|
|
handles => {_add_constants => 'set', _has_constants => 'count'}, |
|
315
|
|
|
|
|
|
|
init_arg => 'constants', |
|
316
|
|
|
|
|
|
|
is => 'rw', |
|
317
|
|
|
|
|
|
|
isa => 'HashRef[Maybe[Str]]', |
|
318
|
|
|
|
|
|
|
traits => [qw/Hash/], |
|
319
|
|
|
|
|
|
|
); |
|
320
|
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
|
|
322
|
|
|
|
|
|
|
sub constants { |
|
323
|
1
|
|
|
1
|
1
|
2
|
my $self = shift; |
|
324
|
1
|
|
|
|
|
23
|
my @pairs = @_; |
|
325
|
|
|
|
|
|
|
|
|
326
|
1
|
0
|
0
|
|
|
9
|
if (scalar( @pairs ) == 1 && ref( $pairs[0] ) eq 'HASH') { |
|
|
|
0
|
|
|
|
|
|
|
327
|
1
|
|
|
|
|
2
|
return $self->_constants( $pairs[0] ); |
|
328
|
|
|
|
|
|
|
} elsif (scalar @pairs) { |
|
329
|
1
|
|
|
|
|
62
|
return $self->_add_constants( @pairs ); |
|
330
|
|
|
|
|
|
|
} else { |
|
331
|
1
|
|
|
|
|
7
|
return $self->_constants; |
|
332
|
|
|
|
|
|
|
} |
|
333
|
|
|
|
|
|
|
} |
|
334
|
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
=head3 data_in |
|
337
|
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
The working directory (L</work_in>) usually contains the raw data files. In |
|
339
|
|
|
|
|
|
|
some cases, though, the actual data sits in a subdirectory underneath |
|
340
|
|
|
|
|
|
|
L</work_in>. B<data_in> tells the pipeline where to find the input file. |
|
341
|
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
B<data_in> accepts a search pattern - name, glob, or regular expression. It |
|
343
|
|
|
|
|
|
|
searches L</work_in> for the first matching directory. The search is case |
|
344
|
|
|
|
|
|
|
insensitive. |
|
345
|
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
If you pass an empty string to B<data_in>, the pipeline resets B<data_in> to |
|
347
|
|
|
|
|
|
|
the L</work_in> directory. This is useful when chaining pipelines. If one |
|
348
|
|
|
|
|
|
|
changes the data directory, the next in line can change back. |
|
349
|
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
=cut |
|
351
|
|
|
|
|
|
|
|
|
352
|
|
|
|
|
|
|
has '_data_in' => ( |
|
353
|
|
|
|
|
|
|
coerce => 1, |
|
354
|
|
|
|
|
|
|
init_arg => undef, |
|
355
|
|
|
|
|
|
|
is => 'rw', |
|
356
|
|
|
|
|
|
|
isa => Dir, |
|
357
|
|
|
|
|
|
|
); |
|
358
|
|
|
|
|
|
|
|
|
359
|
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
sub data_in { |
|
361
|
88
|
|
|
88
|
1
|
3941
|
my $self = shift; |
|
362
|
|
|
|
|
|
|
|
|
363
|
88
|
100
|
|
|
|
314
|
if (scalar @_) { |
|
364
|
5
|
50
|
|
|
|
106
|
croak 'The working folder was not set' unless defined $self->_work_in; |
|
365
|
|
|
|
|
|
|
|
|
366
|
5
|
|
|
|
|
16
|
my $name = shift; |
|
367
|
5
|
50
|
|
|
|
72
|
if (hascontent( $name )) { |
|
368
|
5
|
|
|
|
|
117
|
my $next = Path::Class::Rule |
|
369
|
|
|
|
|
|
|
->new |
|
370
|
|
|
|
|
|
|
->min_depth( 1 ) |
|
371
|
|
|
|
|
|
|
->iname( $name ) |
|
372
|
|
|
|
|
|
|
->directory |
|
373
|
|
|
|
|
|
|
->iter( $self->_work_in ) |
|
374
|
|
|
|
|
|
|
; |
|
375
|
5
|
|
|
|
|
1140
|
my $match = $next->(); |
|
376
|
5
|
50
|
|
|
|
11096
|
croak 'No matching directories' unless defined $match; |
|
377
|
5
|
|
|
|
|
223
|
return $self->_data_in( $match ); |
|
378
|
1
|
|
|
|
|
3
|
} else { return $self->_data_in( $self->_work_in ); } |
|
379
|
84
|
|
|
|
|
2269
|
} else { return $self->_data_in; } |
|
380
|
|
|
|
|
|
|
} |
|
381
|
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
=head3 input |
|
384
|
|
|
|
|
|
|
|
|
385
|
|
|
|
|
|
|
B<input> sets and returns the L<ETL::Pipeline::Input> object. This object reads |
|
386
|
|
|
|
|
|
|
the data. With no parameters, B<input> returns the current |
|
387
|
|
|
|
|
|
|
L<ETL::Pipeline::Input> object. |
|
388
|
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
my $source = $etl->input(); |
|
390
|
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
Set the input source by calling B<input> with parameters... |
|
392
|
|
|
|
|
|
|
|
|
393
|
|
|
|
|
|
|
$etl->input( 'Excel', find => qr/\.xlsx/i ); |
|
394
|
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
The first parameter is a class name. B<input> looks for a Perl module matching |
|
396
|
|
|
|
|
|
|
this name in the C<ETL::Pipeline::Input> namespace. In this example, the actual |
|
397
|
|
|
|
|
|
|
class name becomes C<ETL::Pipeline::Input::Excel>. |
|
398
|
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
The rest of the parameters are passed directly to the C<new> method of that |
|
400
|
|
|
|
|
|
|
class. |
|
401
|
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
B<Technical Note:> Want to use a custom class from B<Local> instead of |
|
403
|
|
|
|
|
|
|
B<ETL::Pipeline::Input>? Put a B<+> (plus sign) in front of the class name. |
|
404
|
|
|
|
|
|
|
For example, this command uses the input class B<Local::CustomExtract>. |
|
405
|
|
|
|
|
|
|
|
|
406
|
|
|
|
|
|
|
$etl->input( '+Local::CustomExtract' ); |
|
407
|
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
=cut |
|
409
|
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
has '_input' => ( |
|
411
|
|
|
|
|
|
|
does => 'ETL::Pipeline::Input', |
|
412
|
|
|
|
|
|
|
init_arg => undef, |
|
413
|
|
|
|
|
|
|
is => 'rw', |
|
414
|
|
|
|
|
|
|
); |
|
415
|
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
|
|
417
|
|
|
|
|
|
|
sub input { |
|
418
|
208
|
|
|
208
|
1
|
436
|
my $self = shift; |
|
419
|
|
|
|
|
|
|
|
|
420
|
208
|
100
|
|
|
|
794
|
return $self->_input( $self->_object_of_class( 'Input', @_ ) ) if scalar @_; |
|
421
|
143
|
|
|
|
|
3456
|
return $self->_input; |
|
422
|
|
|
|
|
|
|
} |
|
423
|
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
=head3 mapping |
|
426
|
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
B<mapping> ties the input fields with the output fields. Hash keys are output |
|
428
|
|
|
|
|
|
|
field names. The L</output> class defines acceptable field names. The hash |
|
429
|
|
|
|
|
|
|
values can be anything accepted by the L</get> method. See L</get> for more |
|
430
|
|
|
|
|
|
|
information. |
|
431
|
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
# Add the output field "Name" with data from input column "A"... |
|
433
|
|
|
|
|
|
|
$etl->mapping( Name => 'A' ); |
|
434
|
|
|
|
|
|
|
|
|
435
|
|
|
|
|
|
|
# Change "Name" to get data from "Full Name" or "FullName"... |
|
436
|
|
|
|
|
|
|
$etl->mapping( Name => qr/Full\s*Name/i ); |
|
437
|
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
# "Name" gets the lower case of input column "A"... |
|
439
|
|
|
|
|
|
|
$etl->mapping( Name => sub { |
|
440
|
|
|
|
|
|
|
my ($etl, $record) = @_; |
|
441
|
|
|
|
|
|
|
return lc $record{A}; |
|
442
|
|
|
|
|
|
|
} ); |
|
443
|
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
If L</get> returns an ARRAY reference (aka multiple values), they will be |
|
445
|
|
|
|
|
|
|
concatenated in the output with a semi-colon between values - B<; >. You can |
|
446
|
|
|
|
|
|
|
override the seperator by setting the value to an ARRAY reference. The first |
|
447
|
|
|
|
|
|
|
element is a regular field name for L</get>. The second element is a new |
|
448
|
|
|
|
|
|
|
seperator string. |
|
449
|
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
# Slashes between multiple names. |
|
451
|
|
|
|
|
|
|
$etl->mapping( Name => [qr/Name/i, ' / '] ); |
|
452
|
|
|
|
|
|
|
|
|
453
|
|
|
|
|
|
|
# These will do the same thing - semi-colon between multiple names. |
|
454
|
|
|
|
|
|
|
$etl->mapping( Name => [qr/Name/i, '; '] ); |
|
455
|
|
|
|
|
|
|
$etl->mapping( Name => qr/Name/i ); |
|
456
|
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
With no parameters, B<mapping> returns the current hash reference. If you pass |
|
458
|
|
|
|
|
|
|
in a hash reference, B<mapping> replaces the current hash with this new one. If |
|
459
|
|
|
|
|
|
|
you pass in a list of key value pairs, B<mapping> adds them to the current hash. |
|
460
|
|
|
|
|
|
|
|
|
461
|
|
|
|
|
|
|
# Get the current mapping... |
|
462
|
|
|
|
|
|
|
my $transformation = $etl->mapping; |
|
463
|
|
|
|
|
|
|
|
|
464
|
|
|
|
|
|
|
# Add the output field "Name" with data from input column "A"... |
|
465
|
|
|
|
|
|
|
$etl->mapping( Name => 'A' ); |
|
466
|
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
# Replace the entire mapping so only "Name" is output... |
|
468
|
|
|
|
|
|
|
$etl->mapping( {Name => 'C'} ); |
|
469
|
|
|
|
|
|
|
|
|
470
|
|
|
|
|
|
|
Want to save a literal value? Use L</constants> instead. |
|
471
|
|
|
|
|
|
|
|
|
472
|
|
|
|
|
|
|
=head4 Complex data structures |
|
473
|
|
|
|
|
|
|
|
|
474
|
|
|
|
|
|
|
B<mapping> only sets scalar values. If the matching fields contain sub-records, |
|
475
|
|
|
|
|
|
|
L</record> throws an error message and sets the output field to C<undef>. |
|
476
|
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
=head4 Fully customized mapping |
|
478
|
|
|
|
|
|
|
|
|
479
|
|
|
|
|
|
|
B<mapping> accepts a CODE reference in place of the hash. In this case, |
|
480
|
|
|
|
|
|
|
L</record> executes the code and uses the return value as the record to send |
|
481
|
|
|
|
|
|
|
L</output>. The CODE should return a hash reference for success or C<undef> if |
|
482
|
|
|
|
|
|
|
there is an error. |
|
483
|
|
|
|
|
|
|
|
|
484
|
|
|
|
|
|
|
# Execute code instead of defining the output fields. |
|
485
|
|
|
|
|
|
|
$etl->mapping( sub { ... } ); |
|
486
|
|
|
|
|
|
|
|
|
487
|
|
|
|
|
|
|
# These are the same. |
|
488
|
|
|
|
|
|
|
$etl->mapping( {Name => 'A'} ); |
|
489
|
|
|
|
|
|
|
$etl->mapping( sub { |
|
490
|
|
|
|
|
|
|
my $etl = shift; |
|
491
|
|
|
|
|
|
|
return {Name => $etl->get( 'A' )}; |
|
492
|
|
|
|
|
|
|
} ); |
|
493
|
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
C<undef> saves an empty record. To print an error message, have your code call |
|
495
|
|
|
|
|
|
|
L</status> with a type of B<ERROR>. |
|
496
|
|
|
|
|
|
|
|
|
497
|
|
|
|
|
|
|
# Return an enpty record. |
|
498
|
|
|
|
|
|
|
$etl->mapping( sub { undef; } ); |
|
499
|
|
|
|
|
|
|
|
|
500
|
|
|
|
|
|
|
# Print an error message. |
|
501
|
|
|
|
|
|
|
$etl->mapping( sub { |
|
502
|
|
|
|
|
|
|
... |
|
503
|
|
|
|
|
|
|
$etl->status( 'ERROR', 'There is no data!' ); |
|
504
|
|
|
|
|
|
|
return undef; |
|
505
|
|
|
|
|
|
|
}); |
|
506
|
|
|
|
|
|
|
|
|
507
|
|
|
|
|
|
|
The results of L</constants> are folded into the resulting hash reference. |
|
508
|
|
|
|
|
|
|
Fields set by B<mapping> override constants. |
|
509
|
|
|
|
|
|
|
|
|
510
|
|
|
|
|
|
|
# Output record has two fields - "Extra" and "Name". |
|
511
|
|
|
|
|
|
|
$etl->constants( Extra => 'ABC' ); |
|
512
|
|
|
|
|
|
|
$etl->mapping( sub { {Name => shift->get( 'A' )} } ); |
|
513
|
|
|
|
|
|
|
|
|
514
|
|
|
|
|
|
|
# Output record has only one field, with the value from the input record. |
|
515
|
|
|
|
|
|
|
$etl->constants( Name => 'ABC' ); |
|
516
|
|
|
|
|
|
|
$etl->mapping( sub { {Name => shift->get( 'A' )} } ); |
|
517
|
|
|
|
|
|
|
|
|
518
|
|
|
|
|
|
|
L</record> passes two parameters into the CODE reference - the B<ETL::Pipeline> |
|
519
|
|
|
|
|
|
|
object and L<the raw data record|/this>. |
|
520
|
|
|
|
|
|
|
|
|
521
|
|
|
|
|
|
|
$etl->mapping( sub { |
|
522
|
|
|
|
|
|
|
my ($etl, $record) = @_; |
|
523
|
|
|
|
|
|
|
... |
|
524
|
|
|
|
|
|
|
} ); |
|
525
|
|
|
|
|
|
|
|
|
526
|
|
|
|
|
|
|
B<WARNING:> This is considered an I<advanced> feature and should be used |
|
527
|
|
|
|
|
|
|
sparingly. You will find the I<< name => field >> format easier to maintain. |
|
528
|
|
|
|
|
|
|
|
|
529
|
|
|
|
|
|
|
=cut |
|
530
|
|
|
|
|
|
|
|
|
531
|
|
|
|
|
|
|
has '_mapping' => ( |
|
532
|
|
|
|
|
|
|
init_arg => 'mapping', |
|
533
|
|
|
|
|
|
|
is => 'rw', |
|
534
|
|
|
|
|
|
|
isa => 'HashRef|CodeRef', |
|
535
|
|
|
|
|
|
|
); |
|
536
|
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
|
|
538
|
|
|
|
|
|
|
sub mapping { |
|
539
|
127
|
|
|
127
|
1
|
221
|
my $self = shift; |
|
540
|
127
|
|
|
|
|
259
|
my @pairs = @_; |
|
541
|
|
|
|
|
|
|
|
|
542
|
127
|
50
|
|
|
|
427
|
if (scalar( @pairs) <= 0) { |
|
|
|
0
|
|
|
|
|
|
|
543
|
127
|
|
|
|
|
2770
|
return $self->_mapping; |
|
544
|
|
|
|
|
|
|
} elsif (scalar( @pairs ) == 1) { |
|
545
|
1
|
|
|
|
|
3
|
return $self->_mapping( $pairs[0] ); |
|
546
|
|
|
|
|
|
|
} else { |
|
547
|
1
|
0
|
|
|
|
22
|
$self->_mapping( {} ) if ref( $self->_mapping) ne 'HASH'; |
|
548
|
1
|
|
|
|
|
10
|
my $reference = $self->_mapping; |
|
549
|
1
|
|
|
|
|
2
|
$reference = {%$reference, @pairs}; |
|
550
|
|
|
|
|
|
|
} |
|
551
|
|
|
|
|
|
|
} |
|
552
|
|
|
|
|
|
|
|
|
553
|
|
|
|
|
|
|
|
|
554
|
|
|
|
|
|
|
=head3 on_record |
|
555
|
|
|
|
|
|
|
|
|
556
|
|
|
|
|
|
|
Executes a customized subroutine on every record before any mapping. The code |
|
557
|
|
|
|
|
|
|
can modify the record and your changes will feed into the mapping. You can use |
|
558
|
|
|
|
|
|
|
B<on_record> for filtering, debugging, or just about anything. |
|
559
|
|
|
|
|
|
|
|
|
560
|
|
|
|
|
|
|
B<on_record> accepts a code reference. L</record> executes this code for every |
|
561
|
|
|
|
|
|
|
input record. |
|
562
|
|
|
|
|
|
|
|
|
563
|
|
|
|
|
|
|
The code reference receives two parameters - the C<ETL::Pipeline> object and the |
|
564
|
|
|
|
|
|
|
input record. The record is passed as a hash reference. If B<on_record> returns |
|
565
|
|
|
|
|
|
|
a false value, L</record> will never send this record to the output destination. |
|
566
|
|
|
|
|
|
|
It's as if this record never existed. |
|
567
|
|
|
|
|
|
|
|
|
568
|
|
|
|
|
|
|
ETL::Pipeline->new( { |
|
569
|
|
|
|
|
|
|
... |
|
570
|
|
|
|
|
|
|
on_record => sub { |
|
571
|
|
|
|
|
|
|
my ($etl, $record) = @_; |
|
572
|
|
|
|
|
|
|
foreach my $field (keys %$record) { |
|
573
|
|
|
|
|
|
|
my $value = $record->{$field}; |
|
574
|
|
|
|
|
|
|
$record->{$field} = ($value eq 'NA' ? '' : $value); |
|
575
|
|
|
|
|
|
|
} |
|
576
|
|
|
|
|
|
|
}, |
|
577
|
|
|
|
|
|
|
... |
|
578
|
|
|
|
|
|
|
} )->process; |
|
579
|
|
|
|
|
|
|
|
|
580
|
|
|
|
|
|
|
# -- OR -- |
|
581
|
|
|
|
|
|
|
$etl->on_record( sub { |
|
582
|
|
|
|
|
|
|
my ($etl, $record) = @_; |
|
583
|
|
|
|
|
|
|
foreach my $field (keys %$record) { |
|
584
|
|
|
|
|
|
|
my $value = $record->{$field}; |
|
585
|
|
|
|
|
|
|
$record->{$field} = ($value eq 'NA' ? '' : $value); |
|
586
|
|
|
|
|
|
|
} |
|
587
|
|
|
|
|
|
|
} ); |
|
588
|
|
|
|
|
|
|
|
|
589
|
|
|
|
|
|
|
B<Note:> L</record> automatically removes leading and trailing whitespace. You |
|
590
|
|
|
|
|
|
|
do not need B<on_record> for that. |
|
591
|
|
|
|
|
|
|
|
|
592
|
|
|
|
|
|
|
=cut |
|
593
|
|
|
|
|
|
|
|
|
594
|
|
|
|
|
|
|
has 'on_record' => ( |
|
595
|
|
|
|
|
|
|
is => 'rw', |
|
596
|
|
|
|
|
|
|
isa => 'Maybe[CodeRef]', |
|
597
|
|
|
|
|
|
|
); |
|
598
|
|
|
|
|
|
|
|
|
599
|
|
|
|
|
|
|
|
|
600
|
|
|
|
|
|
|
=head3 output |
|
601
|
|
|
|
|
|
|
|
|
602
|
|
|
|
|
|
|
B<output> sets and returns the L<ETL::Pipeline::Output> object. This object |
|
603
|
|
|
|
|
|
|
writes records to their final destination. With no parameters, B<output> returns |
|
604
|
|
|
|
|
|
|
the current L<ETL::Pipeline::Output> object. |
|
605
|
|
|
|
|
|
|
|
|
606
|
|
|
|
|
|
|
Set the output destination by calling B<output> with parameters... |
|
607
|
|
|
|
|
|
|
|
|
608
|
|
|
|
|
|
|
$etl->output( 'SQL', table => 'NewData' ); |
|
609
|
|
|
|
|
|
|
|
|
610
|
|
|
|
|
|
|
The first parameter is a class name. B<output> looks for a Perl module |
|
611
|
|
|
|
|
|
|
matching this name in the C<ETL::Pipeline::Output> namespace. In this example, |
|
612
|
|
|
|
|
|
|
the actual class name becomes C<ETL::Pipeline::Output::SQL>. |
|
613
|
|
|
|
|
|
|
|
|
614
|
|
|
|
|
|
|
The rest of the parameters are passed directly to the C<new> method of that |
|
615
|
|
|
|
|
|
|
class. |
|
616
|
|
|
|
|
|
|
|
|
617
|
|
|
|
|
|
|
B<Technical Note:> Want to use a custom class from B<Local> instead of |
|
618
|
|
|
|
|
|
|
B<ETL::Pipeline::Output>? Put a B<+> (plus sign) in front of the class name. |
|
619
|
|
|
|
|
|
|
For example, this command uses the input class B<Local::CustomLoad>. |
|
620
|
|
|
|
|
|
|
|
|
621
|
|
|
|
|
|
|
$etl->output( '+Local::CustomLoad' ); |
|
622
|
|
|
|
|
|
|
|
|
623
|
|
|
|
|
|
|
=cut |
|
624
|
|
|
|
|
|
|
|
|
625
|
|
|
|
|
|
|
has '_output' => ( |
|
626
|
|
|
|
|
|
|
does => 'ETL::Pipeline::Output', |
|
627
|
|
|
|
|
|
|
init_arg => undef, |
|
628
|
|
|
|
|
|
|
is => 'rw', |
|
629
|
|
|
|
|
|
|
); |
|
630
|
|
|
|
|
|
|
|
|
631
|
|
|
|
|
|
|
|
|
632
|
|
|
|
|
|
|
sub output { |
|
633
|
144
|
|
|
144
|
1
|
11079
|
my $self = shift; |
|
634
|
|
|
|
|
|
|
|
|
635
|
144
|
100
|
|
|
|
497
|
return $self->_output( $self->_object_of_class( 'Output', @_ ) ) if scalar @_; |
|
636
|
79
|
|
|
|
|
1914
|
return $self->_output; |
|
637
|
|
|
|
|
|
|
} |
|
638
|
|
|
|
|
|
|
|
|
639
|
|
|
|
|
|
|
|
|
640
|
|
|
|
|
|
|
=head3 process |
|
641
|
|
|
|
|
|
|
|
|
642
|
|
|
|
|
|
|
B<process> kicks off the entire data conversion process. It takes no |
|
643
|
|
|
|
|
|
|
parameters. All of the setup is done by the other methods. |
|
644
|
|
|
|
|
|
|
|
|
645
|
|
|
|
|
|
|
B<process> returns the B<ETL::Pipeline> object so you can do things like |
|
646
|
|
|
|
|
|
|
this... |
|
647
|
|
|
|
|
|
|
|
|
648
|
|
|
|
|
|
|
ETL::Pipeline->new( {...} )->process->chain( ... )->process; |
|
649
|
|
|
|
|
|
|
|
|
650
|
|
|
|
|
|
|
=cut |
|
651
|
|
|
|
|
|
|
|
|
652
|
|
|
|
|
|
|
sub process { |
|
653
|
57
|
|
|
57
|
1
|
218
|
my $self = shift; |
|
654
|
|
|
|
|
|
|
|
|
655
|
|
|
|
|
|
|
# Make sure we have all the required information. |
|
656
|
56
|
|
|
|
|
271
|
my ($success, $error) = $self->is_valid; |
|
657
|
56
|
50
|
|
|
|
150
|
croak $error unless $success; |
|
658
|
|
|
|
|
|
|
|
|
659
|
|
|
|
|
|
|
# Sort aliases from the input source before any that were set in the object. |
|
660
|
56
|
|
|
|
|
1412
|
$self->_alias_type( 'input' ); |
|
661
|
|
|
|
|
|
|
|
|
662
|
|
|
|
|
|
|
# Kick off the process. The input source loops over the records. It calls |
|
663
|
|
|
|
|
|
|
# the "record" method, described below. |
|
664
|
56
|
|
|
|
|
1181
|
$self->_output->open( $self ); |
|
665
|
56
|
|
|
|
|
254
|
$self->status( 'START' ); |
|
666
|
56
|
|
|
|
|
329
|
$self->input->run( $self ); |
|
667
|
55
|
|
|
|
|
2093
|
$self->_decrement_count; # "record" adds 1 at the end, so this goes one past the last record. |
|
668
|
55
|
|
|
|
|
167
|
$self->status( 'END' ); |
|
669
|
55
|
|
|
|
|
1773
|
$self->_output->close( $self ); |
|
670
|
|
|
|
|
|
|
|
|
671
|
|
|
|
|
|
|
# Return the pipeline object so that we can chain calls. Useful shorthand |
|
672
|
|
|
|
|
|
|
# when running multiple pipelines. |
|
673
|
55
|
|
|
|
|
201
|
return $self; |
|
674
|
|
|
|
|
|
|
} |
|
675
|
|
|
|
|
|
|
|
|
676
|
|
|
|
|
|
|
|
|
677
|
|
|
|
|
|
|
=head3 session |
|
678
|
|
|
|
|
|
|
|
|
679
|
|
|
|
|
|
|
B<ETL::Pipeline> supports sessions. A session allows input and output objects |
|
680
|
|
|
|
|
|
|
to share information along a chain. For example, imagine 3 Excel files being |
|
681
|
|
|
|
|
|
|
loaded into an Access database. All 3 files go into the same Access database. |
|
682
|
|
|
|
|
|
|
The first pipeline creates the database and saves its path in the session. That |
|
683
|
|
|
|
|
|
|
pipeline chains with a second pipeline. The second pipeline retrieves the |
|
684
|
|
|
|
|
|
|
Access filename from the session. |
|
685
|
|
|
|
|
|
|
|
|
686
|
|
|
|
|
|
|
The B<session> method provides access to session level variables. As you write |
|
687
|
|
|
|
|
|
|
your own L<ETL::Pipeline::Output> classes, they can use session variables for |
|
688
|
|
|
|
|
|
|
sharing information. |
|
689
|
|
|
|
|
|
|
|
|
690
|
|
|
|
|
|
|
The first parameter is the variable name. If you pass only the variable name, |
|
691
|
|
|
|
|
|
|
B<session> returns the value. |
|
692
|
|
|
|
|
|
|
|
|
693
|
|
|
|
|
|
|
my $database = $etl->session( 'access_file' ); |
|
694
|
|
|
|
|
|
|
my $identifier = $etl->session( 'session_identifier' ); |
|
695
|
|
|
|
|
|
|
|
|
696
|
|
|
|
|
|
|
A second parameter is the value. |
|
697
|
|
|
|
|
|
|
|
|
698
|
|
|
|
|
|
|
$etl->session( access_file => 'C:\ExcelData.accdb' ); |
|
699
|
|
|
|
|
|
|
|
|
700
|
|
|
|
|
|
|
You can set multiple variables in one call. |
|
701
|
|
|
|
|
|
|
|
|
702
|
|
|
|
|
|
|
$etl->session( access_file => 'C:\ExcelData.accdb', name => 'Abe' ); |
|
703
|
|
|
|
|
|
|
|
|
704
|
|
|
|
|
|
|
If you pass in a hash referece, it completely replaces the current session with |
|
705
|
|
|
|
|
|
|
the new values. |
|
706
|
|
|
|
|
|
|
|
|
707
|
|
|
|
|
|
|
When retrieving an array or hash reference, B<session> automatically |
|
708
|
|
|
|
|
|
|
derefernces it if called in a list context. In a scalar context, B<session> |
|
709
|
|
|
|
|
|
|
returns the reference. |
|
710
|
|
|
|
|
|
|
|
|
711
|
|
|
|
|
|
|
# Returns the list of names as a list. |
|
712
|
|
|
|
|
|
|
foreach my $name ($etl->session( 'name_list' )) { ... } |
|
713
|
|
|
|
|
|
|
|
|
714
|
|
|
|
|
|
|
# Returns a list reference instead of a list. |
|
715
|
|
|
|
|
|
|
my $reference = $etl->session( 'name_list' ); |
|
716
|
|
|
|
|
|
|
|
|
717
|
|
|
|
|
|
|
=head3 session_has |
|
718
|
|
|
|
|
|
|
|
|
719
|
|
|
|
|
|
|
B<session_has> checks for a specific session variable. It returns I<true> if |
|
720
|
|
|
|
|
|
|
the variable exists and I<false> if it doesn't. |
|
721
|
|
|
|
|
|
|
|
|
722
|
|
|
|
|
|
|
B<session_has> only checks existence. It does not tell you if the value is |
|
723
|
|
|
|
|
|
|
defined. |
|
724
|
|
|
|
|
|
|
|
|
725
|
|
|
|
|
|
|
if ($etl->session_has( 'access_file' )) { ... } |
|
726
|
|
|
|
|
|
|
|
|
727
|
|
|
|
|
|
|
=cut |
|
728
|
|
|
|
|
|
|
|
|
729
|
|
|
|
|
|
|
# Alternate design: Use attributes for session level information. |
|
730
|
|
|
|
|
|
|
# Result: Discarded |
|
731
|
|
|
|
|
|
|
# |
|
732
|
|
|
|
|
|
|
# Instead of keeping session variables in a hash, the class would have an |
|
733
|
|
|
|
|
|
|
# attribute corresponding to the session data it can keep. Since |
|
734
|
|
|
|
|
|
|
# ETL::Pipeline::Input and ETL::Pipeline::Output objects have access to the |
|
735
|
|
|
|
|
|
|
# the pipeline, they can share data through the attributes. |
|
736
|
|
|
|
|
|
|
# |
|
737
|
|
|
|
|
|
|
# For any session information, the developer must subclass ETL::Pipeline. The |
|
738
|
|
|
|
|
|
|
# ETL::Pipeline::Input or ETL::Pipeline::Output classes would be tied to that |
|
739
|
|
|
|
|
|
|
# specific subclass. And if you needed to combine two sets of session |
|
740
|
|
|
|
|
|
|
# variables, well that just means another class type. That's very confusing. |
|
741
|
|
|
|
|
|
|
# |
|
742
|
|
|
|
|
|
|
# Attributes make development of new input and output classes very difficult. |
|
743
|
|
|
|
|
|
|
# The hash is simple. It decouples the input/output classes from pipeline. |
|
744
|
|
|
|
|
|
|
# That keeps customization simpler. |
|
745
|
|
|
|
|
|
|
|
|
746
|
|
|
|
|
|
|
|
|
747
|
|
|
|
|
|
|
has '_session' => ( |
|
748
|
|
|
|
|
|
|
default => sub { {} }, |
|
749
|
|
|
|
|
|
|
handles => { |
|
750
|
|
|
|
|
|
|
_add_session => 'set', |
|
751
|
|
|
|
|
|
|
_get_session => 'get', |
|
752
|
|
|
|
|
|
|
session_has => 'exists', |
|
753
|
|
|
|
|
|
|
}, |
|
754
|
|
|
|
|
|
|
init_arg => undef, |
|
755
|
|
|
|
|
|
|
is => 'rw', |
|
756
|
|
|
|
|
|
|
isa => 'HashRef[Any]', |
|
757
|
|
|
|
|
|
|
traits => [qw/Hash/], |
|
758
|
|
|
|
|
|
|
); |
|
759
|
|
|
|
|
|
|
|
|
760
|
|
|
|
|
|
|
|
|
761
|
|
|
|
|
|
|
sub session { |
|
762
|
15
|
|
|
16
|
1
|
16699
|
my $self = shift; |
|
763
|
|
|
|
|
|
|
|
|
764
|
15
|
100
|
|
|
|
85
|
if (scalar( @_ ) > 1) { |
|
|
|
50
|
|
|
|
|
|
|
765
|
6
|
|
|
|
|
24
|
my %parameters = @_; |
|
766
|
6
|
|
|
|
|
40
|
while (my ($key, $value) = each %parameters) { |
|
767
|
7
|
|
|
|
|
334
|
$self->_add_session( $key, $value ); |
|
768
|
|
|
|
|
|
|
} |
|
769
|
6
|
|
|
|
|
31
|
return $_[1]; |
|
770
|
|
|
|
|
|
|
} elsif (scalar( @_ ) == 1) { |
|
771
|
9
|
|
|
|
|
21
|
my $key = shift; |
|
772
|
9
|
50
|
|
|
|
48
|
if (ref( $key ) eq 'HASH') { |
|
|
|
100
|
|
|
|
|
|
|
773
|
0
|
|
|
|
|
0
|
return $self->_session( $key ); |
|
774
|
|
|
|
|
|
|
} elsif (wantarray) { |
|
775
|
1
|
|
|
|
|
61
|
my $result = $self->_get_session( $key ); |
|
776
|
1
|
50
|
|
|
|
6
|
if (ref( $result ) eq 'ARRAY') { return @$result; } |
|
|
1
|
0
|
|
|
|
6
|
|
|
777
|
0
|
|
|
|
|
0
|
elsif (ref( $result ) eq 'HASH' ) { return %$result; } |
|
778
|
0
|
|
|
|
|
0
|
else { return $result; } |
|
779
|
8
|
|
|
|
|
358
|
} else { return $self->_get_session( $key ); } |
|
780
|
0
|
|
|
|
|
0
|
} else { return undef; } |
|
781
|
|
|
|
|
|
|
} |
|
782
|
|
|
|
|
|
|
|
|
783
|
|
|
|
|
|
|
|
|
784
|
|
|
|
|
|
|
=head3 work_in |
|
785
|
|
|
|
|
|
|
|
|
786
|
|
|
|
|
|
|
The working directory sets the default place for finding files. All searches |
|
787
|
|
|
|
|
|
|
start here and only descend subdirectories. Temporary or output files go into |
|
788
|
|
|
|
|
|
|
this directory as well. |
|
789
|
|
|
|
|
|
|
|
|
790
|
|
|
|
|
|
|
B<work_in> has two forms: C<work_in( 'C:\Data' );> or |
|
791
|
|
|
|
|
|
|
C<< work_in( root => 'C:\Data', iname => 'Ficticious' ); >>. |
|
792
|
|
|
|
|
|
|
|
|
793
|
|
|
|
|
|
|
The first form specifies the exact directory path. In our example, the working |
|
794
|
|
|
|
|
|
|
directory is F<C:\Data>. |
|
795
|
|
|
|
|
|
|
|
|
796
|
|
|
|
|
|
|
The second form searches the file system for a matching directory. Take this |
|
797
|
|
|
|
|
|
|
example... |
|
798
|
|
|
|
|
|
|
|
|
799
|
|
|
|
|
|
|
$etl->work_in( root => 'C:\Data', iname => 'Ficticious' ); |
|
800
|
|
|
|
|
|
|
|
|
801
|
|
|
|
|
|
|
It scans the F<C:\Data> directory for a subdirectory named F<Fictious>, like |
|
802
|
|
|
|
|
|
|
this: F<C:\Data\Ficticious>. The search is B<not> recursive. It locates files |
|
803
|
|
|
|
|
|
|
in the B<root> folder. |
|
804
|
|
|
|
|
|
|
|
|
805
|
|
|
|
|
|
|
B<work_in> accepts any of the tests provided by L<Path::Iterator::Rule>. The |
|
806
|
|
|
|
|
|
|
values of these arguments are passed directly into the test. For boolean tests |
|
807
|
|
|
|
|
|
|
(e.g. readable, exists, etc.), pass an C<undef> value. |
|
808
|
|
|
|
|
|
|
|
|
809
|
|
|
|
|
|
|
B<work_in> automatically applies the C<directory> filter. Do not set it |
|
810
|
|
|
|
|
|
|
yourself. |
|
811
|
|
|
|
|
|
|
|
|
812
|
|
|
|
|
|
|
C<iname> is the most common one that I use. It matches the file name, supports |
|
813
|
|
|
|
|
|
|
wildcards and regular expressions, and is case insensitive. |
|
814
|
|
|
|
|
|
|
|
|
815
|
|
|
|
|
|
|
# Search using a regular expression... |
|
816
|
|
|
|
|
|
|
$etl->work_in( iname => qr/\.xlsx$/, root => 'C:\Data' ); |
|
817
|
|
|
|
|
|
|
|
|
818
|
|
|
|
|
|
|
# Search using a file glob... |
|
819
|
|
|
|
|
|
|
$etl->work_in( iname => '*.xlsx', root => 'C:\Data' ); |
|
820
|
|
|
|
|
|
|
|
|
821
|
|
|
|
|
|
|
The code throws an error if no directory matches the criteria. Only the first |
|
822
|
|
|
|
|
|
|
match is used. |
|
823
|
|
|
|
|
|
|
|
|
824
|
|
|
|
|
|
|
B<work_in> automatically resets L</data_in>. |
|
825
|
|
|
|
|
|
|
|
|
826
|
|
|
|
|
|
|
=cut |
|
827
|
|
|
|
|
|
|
|
|
828
|
|
|
|
|
|
|
has '_work_in' => ( |
|
829
|
|
|
|
|
|
|
coerce => 1, |
|
830
|
|
|
|
|
|
|
init_arg => undef, |
|
831
|
|
|
|
|
|
|
is => 'rw', |
|
832
|
|
|
|
|
|
|
isa => Dir, |
|
833
|
|
|
|
|
|
|
trigger => \&_trigger_work_in, |
|
834
|
|
|
|
|
|
|
); |
|
835
|
|
|
|
|
|
|
|
|
836
|
|
|
|
|
|
|
|
|
837
|
|
|
|
|
|
|
sub work_in { |
|
838
|
143
|
|
|
144
|
1
|
5122
|
my $self = shift; |
|
839
|
|
|
|
|
|
|
|
|
840
|
143
|
100
|
|
|
|
580
|
if (scalar( @_ ) == 1) { |
|
|
|
100
|
|
|
|
|
|
|
841
|
65
|
|
|
|
|
1775
|
return $self->_work_in( shift ); |
|
842
|
|
|
|
|
|
|
} elsif(scalar( @_ ) > 1) { |
|
843
|
5
|
|
|
|
|
35
|
my %options = @_; |
|
844
|
|
|
|
|
|
|
|
|
845
|
5
|
|
100
|
|
|
28
|
my $root = $options{root} // '.'; |
|
846
|
5
|
|
|
|
|
15
|
delete $options{root}; |
|
847
|
|
|
|
|
|
|
|
|
848
|
5
|
50
|
|
|
|
17
|
if (scalar %options) { |
|
849
|
5
|
|
|
|
|
86
|
my $rule = Path::Class::Rule->new->directory; |
|
850
|
5
|
|
|
|
|
293
|
while (my ($name, $value) = each %options) { |
|
851
|
5
|
|
|
|
|
374
|
eval "\$rule = \$rule->$name( \$value )"; |
|
852
|
5
|
50
|
|
|
|
1458
|
confess $@ unless $@ eq ''; |
|
853
|
|
|
|
|
|
|
} |
|
854
|
5
|
|
|
|
|
31
|
my $next = $rule->iter( $root ); |
|
855
|
5
|
|
|
|
|
1458
|
my $match = $next->(); |
|
856
|
5
|
50
|
|
|
|
18032
|
croak 'No matching directories' unless defined $match; |
|
857
|
5
|
|
|
|
|
364
|
return $self->_work_in( $match ); |
|
858
|
0
|
|
|
|
|
0
|
} else { return $self->_work_in( $root ); } |
|
859
|
73
|
|
|
|
|
1736
|
} else { return $self->_work_in; } |
|
860
|
|
|
|
|
|
|
} |
|
861
|
|
|
|
|
|
|
|
|
862
|
|
|
|
|
|
|
|
|
863
|
|
|
|
|
|
|
sub _trigger_work_in { |
|
864
|
142
|
|
|
143
|
|
266
|
my $self = shift; |
|
865
|
142
|
|
|
|
|
197
|
my $new = shift; |
|
866
|
|
|
|
|
|
|
|
|
867
|
|
|
|
|
|
|
# Force absolute paths. Changing the value will fire this trigger again. |
|
868
|
|
|
|
|
|
|
# I only want to change "_data_in" once. |
|
869
|
142
|
100
|
66
|
|
|
649
|
if (defined( $new ) && $new->is_relative) { |
|
870
|
70
|
|
|
|
|
3091
|
$self->_work_in( $new->cleanup->absolute ); |
|
871
|
|
|
|
|
|
|
} else { |
|
872
|
72
|
|
|
|
|
3714
|
$self->_data_in( $new ); |
|
873
|
|
|
|
|
|
|
} |
|
874
|
|
|
|
|
|
|
} |
|
875
|
|
|
|
|
|
|
|
|
876
|
|
|
|
|
|
|
|
|
877
|
|
|
|
|
|
|
=head2 Used in mapping |
|
878
|
|
|
|
|
|
|
|
|
879
|
|
|
|
|
|
|
These methods may be used by code references in the L</mapping> attribute. They |
|
880
|
|
|
|
|
|
|
will return information about/from the current record. |
|
881
|
|
|
|
|
|
|
|
|
882
|
|
|
|
|
|
|
=head3 count |
|
883
|
|
|
|
|
|
|
|
|
884
|
|
|
|
|
|
|
This attribute tells you how many records have been read from the input source. |
|
885
|
|
|
|
|
|
|
This value is incremented before any filtering. So it even counts records that |
|
886
|
|
|
|
|
|
|
are bypassed by L</on_record>. |
|
887
|
|
|
|
|
|
|
|
|
888
|
|
|
|
|
|
|
The first record is always number B<1>. |
|
889
|
|
|
|
|
|
|
|
|
890
|
|
|
|
|
|
|
=cut |
|
891
|
|
|
|
|
|
|
|
|
892
|
|
|
|
|
|
|
has 'count' => ( |
|
893
|
|
|
|
|
|
|
default => '1', |
|
894
|
|
|
|
|
|
|
handles => {_decrement_count => 'dec', _increment_count => 'inc'}, |
|
895
|
|
|
|
|
|
|
is => 'ro', |
|
896
|
|
|
|
|
|
|
isa => 'Int', |
|
897
|
|
|
|
|
|
|
traits => [qw/Counter/], |
|
898
|
|
|
|
|
|
|
); |
|
899
|
|
|
|
|
|
|
|
|
900
|
|
|
|
|
|
|
|
|
901
|
|
|
|
|
|
|
=head3 get |
|
902
|
|
|
|
|
|
|
|
|
903
|
|
|
|
|
|
|
Retrieve the value from the record for the given field name. This method accepts |
|
904
|
|
|
|
|
|
|
two parameters - a field name and the current record. It returns the exact value |
|
905
|
|
|
|
|
|
|
found at the matching node. |
|
906
|
|
|
|
|
|
|
|
|
907
|
|
|
|
|
|
|
=head4 Field names |
|
908
|
|
|
|
|
|
|
|
|
909
|
|
|
|
|
|
|
The field name can be... |
|
910
|
|
|
|
|
|
|
|
|
911
|
|
|
|
|
|
|
=over |
|
912
|
|
|
|
|
|
|
|
|
913
|
|
|
|
|
|
|
=item A string containing a hash key |
|
914
|
|
|
|
|
|
|
|
|
915
|
|
|
|
|
|
|
=item An array index (all digits) |
|
916
|
|
|
|
|
|
|
|
|
917
|
|
|
|
|
|
|
=item A string containing a L<Data::DPath> path (starts with B</>) |
|
918
|
|
|
|
|
|
|
|
|
919
|
|
|
|
|
|
|
=item A regular expression reference |
|
920
|
|
|
|
|
|
|
|
|
921
|
|
|
|
|
|
|
=item A code reference |
|
922
|
|
|
|
|
|
|
|
|
923
|
|
|
|
|
|
|
=back |
|
924
|
|
|
|
|
|
|
|
|
925
|
|
|
|
|
|
|
Hash keys and regular expressions match both field names and aliases. These are |
|
926
|
|
|
|
|
|
|
the only types that match aliases. Hash keys cannot be all digits and cannot |
|
927
|
|
|
|
|
|
|
begin with the B</> character. Otherwise B<get> mis-identifies them. |
|
928
|
|
|
|
|
|
|
|
|
929
|
|
|
|
|
|
|
B<get> interprets strings of all digits as array indexes (numbers). Excel files, |
|
930
|
|
|
|
|
|
|
for example, return an array instead of a hash. And this is an easy way to |
|
931
|
|
|
|
|
|
|
reference columns in order. |
|
932
|
|
|
|
|
|
|
|
|
933
|
|
|
|
|
|
|
B<get> treats a string beginning with a B</> (slash) as a L<Data::DPath> path. |
|
934
|
|
|
|
|
|
|
This lets you very specifically traverse a complex data sturcture, such as those |
|
935
|
|
|
|
|
|
|
from XML or JSON. |
|
936
|
|
|
|
|
|
|
|
|
937
|
|
|
|
|
|
|
For a regular expression, B<get> matches hash keys at the top level of the data |
|
938
|
|
|
|
|
|
|
structure plus aliases. |
|
939
|
|
|
|
|
|
|
|
|
940
|
|
|
|
|
|
|
And with a a code reference, B<get> executes the subroutine. The return value |
|
941
|
|
|
|
|
|
|
becomes the field value. The code reference is called in a scalar context. If |
|
942
|
|
|
|
|
|
|
you need to return multiple values, then return an ARRAY or HASH reference. |
|
943
|
|
|
|
|
|
|
|
|
944
|
|
|
|
|
|
|
A code reference receives two parameters - the B<ETL::Pipeline> object and the |
|
945
|
|
|
|
|
|
|
current record. |
|
946
|
|
|
|
|
|
|
|
|
947
|
|
|
|
|
|
|
=head4 Current record |
|
948
|
|
|
|
|
|
|
|
|
949
|
|
|
|
|
|
|
The current record is optional. B<get> will use L</this> if you do not pass in a |
|
950
|
|
|
|
|
|
|
record. By accepting a record, you can use B<get> on sub-records. So by default, |
|
951
|
|
|
|
|
|
|
B<get> returns a value from the top record. Use the second parameter to retrieve |
|
952
|
|
|
|
|
|
|
values from a sub-record. |
|
953
|
|
|
|
|
|
|
|
|
954
|
|
|
|
|
|
|
B<get> only applies aliases when using L</this>. Aliases do not apply to |
|
955
|
|
|
|
|
|
|
sub-records. |
|
956
|
|
|
|
|
|
|
|
|
957
|
|
|
|
|
|
|
=head4 Return value |
|
958
|
|
|
|
|
|
|
|
|
959
|
|
|
|
|
|
|
B<get> always returns a scalar value, but not always a string. The return value |
|
960
|
|
|
|
|
|
|
might be a string, ARRAY reference, or HASH reference. |
|
961
|
|
|
|
|
|
|
|
|
962
|
|
|
|
|
|
|
B<get> does not flatten out the nodes that it finds. It merely returns a |
|
963
|
|
|
|
|
|
|
reference to whatever is in the data structure at the named point. The calling |
|
964
|
|
|
|
|
|
|
code must account for the possibility of finding an array or hash or string. |
|
965
|
|
|
|
|
|
|
|
|
966
|
|
|
|
|
|
|
=cut |
|
967
|
|
|
|
|
|
|
|
|
968
|
|
|
|
|
|
|
sub get { |
|
969
|
460
|
|
|
461
|
1
|
903
|
my ($self, $field, $record) = @_; |
|
970
|
|
|
|
|
|
|
|
|
971
|
|
|
|
|
|
|
# Because the reference may be stored, I want to force a new copy every |
|
972
|
|
|
|
|
|
|
# time. Otherwise scripts may get invalid values from previous records. |
|
973
|
460
|
|
|
|
|
648
|
my $found = []; |
|
974
|
|
|
|
|
|
|
|
|
975
|
|
|
|
|
|
|
# Use the current record from the attribute unless the programmer explicilty |
|
976
|
|
|
|
|
|
|
# sent in a record. By sending in a record, "get" works on sub-records. But |
|
977
|
|
|
|
|
|
|
# the default behaviour is what you would expect. |
|
978
|
460
|
|
|
|
|
566
|
my $full = 0; |
|
979
|
460
|
50
|
|
|
|
811
|
unless (defined $record) { |
|
980
|
460
|
|
|
|
|
10125
|
$record = $self->this; |
|
981
|
460
|
|
|
|
|
612
|
$full = 1; |
|
982
|
|
|
|
|
|
|
} |
|
983
|
|
|
|
|
|
|
|
|
984
|
|
|
|
|
|
|
# Execute code reference. This is all there is to do. We send back whatever |
|
985
|
|
|
|
|
|
|
# the code returns. |
|
986
|
460
|
100
|
|
|
|
864
|
if (ref( $field ) eq 'CODE') { |
|
987
|
100
|
|
|
|
|
370
|
@$found = $field->( $self, $record ); |
|
988
|
|
|
|
|
|
|
} |
|
989
|
|
|
|
|
|
|
|
|
990
|
|
|
|
|
|
|
# Anything else we match - either a field name or an alias. The sequence is |
|
991
|
|
|
|
|
|
|
# the same for both. |
|
992
|
|
|
|
|
|
|
else { |
|
993
|
|
|
|
|
|
|
# Match field names first. |
|
994
|
360
|
|
|
|
|
461
|
my $check_alias = 0; |
|
995
|
360
|
|
|
|
|
796
|
$field = $self->_as_dpath( $field, \$check_alias ); |
|
996
|
360
|
|
|
|
|
1178
|
@$found = dpath( $field )->match( $record ); |
|
997
|
|
|
|
|
|
|
|
|
998
|
360
|
100
|
66
|
|
|
106930
|
if ($check_alias && $full) { |
|
999
|
|
|
|
|
|
|
# Build the cache first time through. Re-use it later to save time. |
|
1000
|
89
|
100
|
|
|
|
2945
|
unless ($self->_alias_cache_built) { |
|
1001
|
41
|
|
|
|
|
867
|
my $cache = $self->_alias_cache; |
|
1002
|
41
|
|
|
|
|
125
|
foreach my $item ($self->aliases) { |
|
1003
|
46
|
|
|
|
|
141
|
while (my ($alias, $location) = each %$item) { |
|
1004
|
70
|
100
|
|
|
|
191
|
$cache->{$alias} = [] unless exists $cache->{$alias}; |
|
1005
|
70
|
|
|
|
|
84
|
push @{$cache->{$alias}}, $self->_as_dpath( $location ); |
|
|
70
|
|
|
|
|
119
|
|
|
1006
|
|
|
|
|
|
|
} |
|
1007
|
|
|
|
|
|
|
} |
|
1008
|
|
|
|
|
|
|
} |
|
1009
|
|
|
|
|
|
|
|
|
1010
|
|
|
|
|
|
|
# Search the actual data in all of the fields from matching aliases. |
|
1011
|
89
|
|
|
|
|
258
|
my @search = dpath( $field )->match( $self->_alias_cache ); |
|
1012
|
89
|
|
|
|
|
18475
|
foreach my $list (@search) { |
|
1013
|
51
|
|
|
|
|
112
|
foreach my $location (@$list) { |
|
1014
|
55
|
|
|
|
|
144
|
my @values = dpath( $location )->match( $record ); |
|
1015
|
55
|
|
|
|
|
11590
|
push @$found, @values; |
|
1016
|
|
|
|
|
|
|
} |
|
1017
|
|
|
|
|
|
|
} |
|
1018
|
|
|
|
|
|
|
} |
|
1019
|
|
|
|
|
|
|
} |
|
1020
|
|
|
|
|
|
|
|
|
1021
|
|
|
|
|
|
|
# Send back the final value. |
|
1022
|
460
|
100
|
|
|
|
2078
|
return (scalar( @$found ) <= 1 ? $found->[0] : $found); |
|
1023
|
|
|
|
|
|
|
} |
|
1024
|
|
|
|
|
|
|
|
|
1025
|
|
|
|
|
|
|
|
|
1026
|
|
|
|
|
|
|
# Format the match string for Data::DPath. I allow scripts to use shortcut |
|
1027
|
|
|
|
|
|
|
# formatting so they are easier to read. This method translates those into a |
|
1028
|
|
|
|
|
|
|
# correct Data::DPath path. |
|
1029
|
|
|
|
|
|
|
sub _as_dpath { |
|
1030
|
430
|
|
|
431
|
|
721
|
my ($self, $field, $alias) = @_; |
|
1031
|
|
|
|
|
|
|
|
|
1032
|
430
|
100
|
|
|
|
1675
|
if (ref( $field ) eq 'Regexp') { |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
1033
|
4
|
50
|
|
|
|
19
|
$$alias = 1 if ref( $alias ) eq 'SCALAR'; |
|
1034
|
4
|
|
|
|
|
18
|
return "/*[key =~ /$field/]"; |
|
1035
|
|
|
|
|
|
|
} elsif ($field =~ m/^\d+$/) { |
|
1036
|
239
|
100
|
|
|
|
502
|
$$alias = 0 if ref( $alias ) eq 'SCALAR'; |
|
1037
|
239
|
|
|
|
|
652
|
return "/*[$field]"; |
|
1038
|
|
|
|
|
|
|
} elsif ($field !~ m|^/|) { |
|
1039
|
85
|
50
|
|
|
|
232
|
$$alias = 1 if ref( $alias ) eq 'SCALAR'; |
|
1040
|
85
|
|
|
|
|
215
|
return "/$field"; |
|
1041
|
|
|
|
|
|
|
} else { |
|
1042
|
102
|
50
|
|
|
|
331
|
$$alias = 0 if ref( $alias ) eq 'SCALAR'; |
|
1043
|
102
|
|
|
|
|
223
|
return $field; |
|
1044
|
|
|
|
|
|
|
} |
|
1045
|
|
|
|
|
|
|
} |
|
1046
|
|
|
|
|
|
|
|
|
1047
|
|
|
|
|
|
|
|
|
1048
|
|
|
|
|
|
|
# Alternate designs... |
|
1049
|
|
|
|
|
|
|
# |
|
1050
|
|
|
|
|
|
|
# I considered building a temporary hash keyed by the alias names. Then I could |
|
1051
|
|
|
|
|
|
|
# apply a full Data::DPath to retrieve aliased fields. But a path like "/*[0]" |
|
1052
|
|
|
|
|
|
|
# would always match both the main record and the aliases. I would always be |
|
1053
|
|
|
|
|
|
|
# returning multiple values when the user clearly expected one. It makes aliases |
|
1054
|
|
|
|
|
|
|
# pretty much useless. |
|
1055
|
|
|
|
|
|
|
|
|
1056
|
|
|
|
|
|
|
|
|
1057
|
|
|
|
|
|
|
=head3 this |
|
1058
|
|
|
|
|
|
|
|
|
1059
|
|
|
|
|
|
|
The current record. The L</record> method sets B<this> before it does anything |
|
1060
|
|
|
|
|
|
|
else. L</get> will use B<this> if you don't pass in a record. It makes a |
|
1061
|
|
|
|
|
|
|
convenient shortcut so you don't have to pass the record into every call. |
|
1062
|
|
|
|
|
|
|
|
|
1063
|
|
|
|
|
|
|
B<this> can be any valid Perl data. Usually a hash reference or array reference. |
|
1064
|
|
|
|
|
|
|
The input source controls the type. |
|
1065
|
|
|
|
|
|
|
|
|
1066
|
|
|
|
|
|
|
=cut |
|
1067
|
|
|
|
|
|
|
|
|
1068
|
|
|
|
|
|
|
has 'this' => ( |
|
1069
|
|
|
|
|
|
|
is => 'ro', |
|
1070
|
|
|
|
|
|
|
isa => 'Maybe[Any]', |
|
1071
|
|
|
|
|
|
|
writer => '_set_this', |
|
1072
|
|
|
|
|
|
|
); |
|
1073
|
|
|
|
|
|
|
|
|
1074
|
|
|
|
|
|
|
|
|
1075
|
|
|
|
|
|
|
=head2 Used by input sources |
|
1076
|
|
|
|
|
|
|
|
|
1077
|
|
|
|
|
|
|
=head3 aliases (see above) |
|
1078
|
|
|
|
|
|
|
|
|
1079
|
|
|
|
|
|
|
Your input source can use the L</aliases> method documented above to set |
|
1080
|
|
|
|
|
|
|
column headers as field names. Excel files, for example, would call L</aliases> |
|
1081
|
|
|
|
|
|
|
to assign letters to column numbers, like a real spreadsheet. |
|
1082
|
|
|
|
|
|
|
|
|
1083
|
|
|
|
|
|
|
=head3 record |
|
1084
|
|
|
|
|
|
|
|
|
1085
|
|
|
|
|
|
|
The input source calls this method for each data record. This is where |
|
1086
|
|
|
|
|
|
|
L<ETL::Pipeline> applies the mapping, constants, and sends the results on to the |
|
1087
|
|
|
|
|
|
|
L<ETL::Pipeline> applies the mapping, constants, and sends the results on to the |
|
1088
|
|
|
|
|
|
|
output destination. |
|
1089
|
|
|
|
|
|
|
|
|
1090
|
|
|
|
|
|
|
B<record> takes one parameter - he current record. The record can be any Perl |
|
1091
|
|
|
|
|
|
|
data structure - hash, array, or scalar. B<record> uses L<Data::DPath> to |
|
1092
|
|
|
|
|
|
|
traverse the structure. |
|
1093
|
|
|
|
|
|
|
|
|
1094
|
|
|
|
|
|
|
B<record> calls L</get> on each field in L</mapping>. B</get> traverses the |
|
1095
|
|
|
|
|
|
|
data structure retrieving the correct values. B<record> concatenates multiple |
|
1096
|
|
|
|
|
|
|
matches into a single, scalar value for the output. |
|
1097
|
|
|
|
|
|
|
|
|
1098
|
|
|
|
|
|
|
=cut |
|
1099
|
|
|
|
|
|
|
|
|
1100
|
|
|
|
|
|
|
sub record { |
|
1101
|
127
|
|
|
128
|
1
|
6753
|
my ($self, $record) = @_; |
|
1102
|
|
|
|
|
|
|
|
|
1103
|
|
|
|
|
|
|
# Save the current record so that other methods and helper functions can |
|
1104
|
|
|
|
|
|
|
# access it without the programmer passing it around. |
|
1105
|
127
|
|
|
|
|
3377
|
$self->_set_this( $record ); |
|
1106
|
|
|
|
|
|
|
|
|
1107
|
|
|
|
|
|
|
# Remove leading and trailing whitespace from all fields. We always want to |
|
1108
|
|
|
|
|
|
|
# do this. Otherwise we end up with weird looking text. I do this first so |
|
1109
|
|
|
|
|
|
|
# that all the customized code sees is the filtered data. |
|
1110
|
127
|
100
|
|
10783
|
|
1237
|
traverse { trim( m/HASH/ ? $b : $a ) } $record; |
|
|
10782
|
|
|
|
|
231338
|
|
|
1111
|
|
|
|
|
|
|
|
|
1112
|
|
|
|
|
|
|
# Run the custom record filter, if there is one. If the filter returns |
|
1113
|
|
|
|
|
|
|
# "false", then we bypass this entire record. |
|
1114
|
127
|
|
|
|
|
4927
|
my $code = $self->on_record; |
|
1115
|
127
|
|
|
|
|
256
|
my $continue = 1; |
|
1116
|
|
|
|
|
|
|
|
|
1117
|
127
|
100
|
|
|
|
354
|
$continue = $code->( $self, $record ) if defined $code; |
|
1118
|
127
|
100
|
|
|
|
345
|
unless ($continue) { |
|
1119
|
1
|
|
|
|
|
28
|
$self->_increment_count; # Record processed. |
|
1120
|
1
|
|
|
|
|
4
|
return; |
|
1121
|
|
|
|
|
|
|
} |
|
1122
|
|
|
|
|
|
|
|
|
1123
|
|
|
|
|
|
|
# Insert constants into the output. Do this before the mapping. The mapping |
|
1124
|
|
|
|
|
|
|
# will always override constants. I want data from the input. |
|
1125
|
|
|
|
|
|
|
# |
|
1126
|
|
|
|
|
|
|
# I had used a regular hash. Perl kept re-using the same memory location. |
|
1127
|
|
|
|
|
|
|
# The records were overwriting each other. Switched to a hash reference so I |
|
1128
|
|
|
|
|
|
|
# can force Perl to allocate new memory for every record. |
|
1129
|
126
|
|
|
|
|
249
|
my $save = {}; |
|
1130
|
126
|
100
|
|
|
|
3621
|
if ($self->_has_constants) { |
|
1131
|
15
|
|
|
|
|
364
|
my $constants = $self->_constants; |
|
1132
|
15
|
|
|
|
|
77
|
%$save = %$constants; |
|
1133
|
|
|
|
|
|
|
} |
|
1134
|
|
|
|
|
|
|
|
|
1135
|
|
|
|
|
|
|
# This is the transform step. It converts the input record into an output |
|
1136
|
|
|
|
|
|
|
# record. The mapping can be either a hash reference of transformations or |
|
1137
|
|
|
|
|
|
|
# a code reference that does all of the transformations. |
|
1138
|
126
|
|
|
|
|
513
|
my $mapping = $self->mapping; |
|
1139
|
126
|
100
|
|
|
|
455
|
if (ref( $mapping ) eq 'CODE') { |
|
1140
|
8
|
|
|
|
|
32
|
my $return = $mapping->( $self, $record ); |
|
1141
|
|
|
|
|
|
|
|
|
1142
|
|
|
|
|
|
|
# Merge with constants that might have been set above. |
|
1143
|
8
|
100
|
|
|
|
54
|
$save = {%$save, %$return} if defined $return; |
|
1144
|
|
|
|
|
|
|
} else { |
|
1145
|
118
|
|
|
|
|
527
|
while (my ($to, $from) = each %$mapping) { |
|
1146
|
278
|
|
|
|
|
411
|
my $seperator = '; '; |
|
1147
|
278
|
50
|
|
|
|
587
|
if (ref( $from ) eq 'ARRAY') { |
|
1148
|
0
|
|
|
|
|
0
|
$seperator = $from->[1]; |
|
1149
|
0
|
|
|
|
|
0
|
$from = $from->[0]; # Do this LAST! |
|
1150
|
|
|
|
|
|
|
} |
|
1151
|
|
|
|
|
|
|
|
|
1152
|
278
|
|
|
|
|
651
|
my $values = $self->get( $from ); |
|
1153
|
278
|
100
|
|
|
|
648
|
if (ref( $values ) eq '' ) { $save->{$to} = $values; } |
|
|
261
|
50
|
|
|
|
1280
|
|
|
1154
|
|
|
|
|
|
|
elsif (ref( $values ) eq 'ARRAY') { |
|
1155
|
17
|
50
|
|
22
|
|
169
|
my $invalid = first { defined( $_ ) && ref( $_ ) ne '' } @$values; |
|
|
21
|
|
|
|
|
121
|
|
|
1156
|
17
|
100
|
|
|
|
89
|
if (defined $invalid) { |
|
1157
|
13
|
|
|
|
|
26
|
my $type = ref( $invalid ); |
|
1158
|
13
|
|
|
|
|
81
|
$self->status( 'ERROR', "Data structure of type $type found by mapping '$from' to '$to'" ); |
|
1159
|
13
|
|
|
|
|
115
|
$save->{$to} = undef; |
|
1160
|
|
|
|
|
|
|
} else { |
|
1161
|
4
|
|
|
|
|
13
|
my @usable = grep { hascontent( $_ ) } @$values; |
|
|
8
|
|
|
|
|
80
|
|
|
1162
|
4
|
50
|
|
|
|
45
|
if(scalar @usable) { $save->{$to} = join( $seperator, @usable ); } |
|
|
4
|
|
|
|
|
52
|
|
|
1163
|
0
|
|
|
|
|
0
|
else { $save->{$to} = undef; } |
|
1164
|
|
|
|
|
|
|
} |
|
1165
|
0
|
|
|
|
|
0
|
} else { $save->{$to} = undef; } |
|
1166
|
|
|
|
|
|
|
} |
|
1167
|
|
|
|
|
|
|
} |
|
1168
|
|
|
|
|
|
|
|
|
1169
|
|
|
|
|
|
|
# We're done with this record. Finish up. |
|
1170
|
126
|
|
|
|
|
3305
|
$self->_output->write( $self, $save ); |
|
1171
|
126
|
|
|
|
|
487
|
$self->status( 'STATUS' ); |
|
1172
|
|
|
|
|
|
|
|
|
1173
|
|
|
|
|
|
|
# Increase the record count. Do this last so that any status messages from |
|
1174
|
|
|
|
|
|
|
# the input source reflect the correct record number. |
|
1175
|
126
|
|
|
|
|
3502
|
$self->_increment_count; |
|
1176
|
|
|
|
|
|
|
} |
|
1177
|
|
|
|
|
|
|
|
|
1178
|
|
|
|
|
|
|
|
|
1179
|
|
|
|
|
|
|
=head3 status |
|
1180
|
|
|
|
|
|
|
|
|
1181
|
|
|
|
|
|
|
This method displays a status message. B<ETL::Pipeline> calls this method to |
|
1182
|
|
|
|
|
|
|
report on the progress of pipeline. It takes one or two parameters - the message |
|
1183
|
|
|
|
|
|
|
type (required) and the message itself (optional). |
|
1184
|
|
|
|
|
|
|
|
|
1185
|
|
|
|
|
|
|
The type can be anything. These are the ones that B<ETL::Pipeline> uses... |
|
1186
|
|
|
|
|
|
|
|
|
1187
|
|
|
|
|
|
|
=over |
|
1188
|
|
|
|
|
|
|
|
|
1189
|
|
|
|
|
|
|
=item DEBUG |
|
1190
|
|
|
|
|
|
|
|
|
1191
|
|
|
|
|
|
|
Messages used for debugging problems. You should only use these temporarily to |
|
1192
|
|
|
|
|
|
|
look for specific issues. Otherwise they clog up the display for the end user. |
|
1193
|
|
|
|
|
|
|
|
|
1194
|
|
|
|
|
|
|
=item END |
|
1195
|
|
|
|
|
|
|
|
|
1196
|
|
|
|
|
|
|
The pipeline has finished. The input source is closed. The output destination |
|
1197
|
|
|
|
|
|
|
is still open. It will be closed immediately after. There is no message text. |
|
1198
|
|
|
|
|
|
|
|
|
1199
|
|
|
|
|
|
|
=item ERROR |
|
1200
|
|
|
|
|
|
|
|
|
1201
|
|
|
|
|
|
|
Report an error message to the user. These are not necessarily fatal errors. |
|
1202
|
|
|
|
|
|
|
|
|
1203
|
|
|
|
|
|
|
=item INFO |
|
1204
|
|
|
|
|
|
|
|
|
1205
|
|
|
|
|
|
|
An informational message to the user. |
|
1206
|
|
|
|
|
|
|
|
|
1207
|
|
|
|
|
|
|
=item START |
|
1208
|
|
|
|
|
|
|
|
|
1209
|
|
|
|
|
|
|
The pipeline is just starting. The output destination is open. But the input |
|
1210
|
|
|
|
|
|
|
source is not. There is no message text. |
|
1211
|
|
|
|
|
|
|
|
|
1212
|
|
|
|
|
|
|
=item STATUS |
|
1213
|
|
|
|
|
|
|
|
|
1214
|
|
|
|
|
|
|
Progress update. This is sent every after every input record. |
|
1215
|
|
|
|
|
|
|
|
|
1216
|
|
|
|
|
|
|
=back |
|
1217
|
|
|
|
|
|
|
|
|
1218
|
|
|
|
|
|
|
See L</Custom logging> for information about adding your own log method. |
|
1219
|
|
|
|
|
|
|
|
|
1220
|
|
|
|
|
|
|
=cut |
|
1221
|
|
|
|
|
|
|
|
|
1222
|
|
|
|
|
|
|
sub status { |
|
1223
|
314
|
|
|
315
|
1
|
755
|
my ($self, $type, $message) = @_; |
|
1224
|
314
|
|
|
|
|
727
|
$type = uc( $type ); |
|
1225
|
|
|
|
|
|
|
|
|
1226
|
314
|
100
|
|
|
|
1189
|
if ($type eq 'START') { |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
1227
|
56
|
|
|
|
|
82
|
my $name; |
|
1228
|
56
|
|
|
|
|
4963
|
say 'Processing...'; |
|
1229
|
|
|
|
|
|
|
} elsif ($type eq 'END') { |
|
1230
|
55
|
|
|
|
|
97
|
my $name; |
|
1231
|
55
|
|
|
|
|
2569
|
say 'Finished!'; |
|
1232
|
|
|
|
|
|
|
} elsif ($type eq 'STATUS') { |
|
1233
|
126
|
|
|
|
|
3005
|
my $count = $self->count; |
|
1234
|
126
|
50
|
|
|
|
488
|
say "Processed record #$count..." unless $count % 50; |
|
1235
|
|
|
|
|
|
|
} else { |
|
1236
|
77
|
|
|
|
|
1779
|
my $count = $self->count; |
|
1237
|
77
|
|
|
|
|
273
|
my $source = $self->input->source; |
|
1238
|
|
|
|
|
|
|
|
|
1239
|
77
|
50
|
|
|
|
311
|
if (hascontent( $source )) { |
|
1240
|
77
|
|
|
|
|
4381
|
say "$type [record #$count at $source] $message"; |
|
1241
|
|
|
|
|
|
|
} else { |
|
1242
|
0
|
|
|
|
|
0
|
say "$type [record #$count] $message"; |
|
1243
|
|
|
|
|
|
|
} |
|
1244
|
|
|
|
|
|
|
} |
|
1245
|
|
|
|
|
|
|
} |
|
1246
|
|
|
|
|
|
|
|
|
1247
|
|
|
|
|
|
|
|
|
1248
|
|
|
|
|
|
|
=head2 Utility Functions |
|
1249
|
|
|
|
|
|
|
|
|
1250
|
|
|
|
|
|
|
These methods can be used inside L</mapping> code references. Unless otherwise |
|
1251
|
|
|
|
|
|
|
noted, these all work on L<the current record|/this>. |
|
1252
|
|
|
|
|
|
|
|
|
1253
|
|
|
|
|
|
|
my $etl = ETL::Pipeline->new( { |
|
1254
|
|
|
|
|
|
|
... |
|
1255
|
|
|
|
|
|
|
mapping => {A => sub { shift->function( ... ) }}, |
|
1256
|
|
|
|
|
|
|
... |
|
1257
|
|
|
|
|
|
|
} ); |
|
1258
|
|
|
|
|
|
|
|
|
1259
|
|
|
|
|
|
|
=head3 coalesce |
|
1260
|
|
|
|
|
|
|
|
|
1261
|
|
|
|
|
|
|
Emulates the SQL Server C<COALESCE> command. It takes a list of field names for |
|
1262
|
|
|
|
|
|
|
L</get> and returns the value of the first non-blank field. |
|
1263
|
|
|
|
|
|
|
|
|
1264
|
|
|
|
|
|
|
# First non-blank field |
|
1265
|
|
|
|
|
|
|
$etl->coalesce( 'Patient', 'Complainant', 'From' ); |
|
1266
|
|
|
|
|
|
|
|
|
1267
|
|
|
|
|
|
|
# Actual value if no non-blank fields |
|
1268
|
|
|
|
|
|
|
$etl->coalesce( 'Date', \$today ); |
|
1269
|
|
|
|
|
|
|
|
|
1270
|
|
|
|
|
|
|
In the first example, B<coalesce> looks at the B<Patient> field first. If it's |
|
1271
|
|
|
|
|
|
|
blank, then B<coalesce> looks at the B<Complainant> field. Same thing - if it's |
|
1272
|
|
|
|
|
|
|
blank, B<coalesce> returns the B<From> field. |
|
1273
|
|
|
|
|
|
|
|
|
1274
|
|
|
|
|
|
|
I<Blank> means C<undef>, empty string, or all whitespace. This is different |
|
1275
|
|
|
|
|
|
|
than the SQL version. |
|
1276
|
|
|
|
|
|
|
|
|
1277
|
|
|
|
|
|
|
The second examples shows an actual value passed as a scalar reference. Because |
|
1278
|
|
|
|
|
|
|
it's a reference, B<coalesce> recognizes that it is not a field name for |
|
1279
|
|
|
|
|
|
|
L</get>. B<coalesce> uses the value in C<$today> if the B<Date> field is blank. |
|
1280
|
|
|
|
|
|
|
|
|
1281
|
|
|
|
|
|
|
B<coalesce> returns an empty string if all of the fields are blank. |
|
1282
|
|
|
|
|
|
|
|
|
1283
|
|
|
|
|
|
|
=cut |
|
1284
|
|
|
|
|
|
|
|
|
1285
|
|
|
|
|
|
|
sub coalesce { |
|
1286
|
8
|
|
|
9
|
1
|
27
|
my $self = shift; |
|
1287
|
|
|
|
|
|
|
|
|
1288
|
8
|
|
|
|
|
13
|
my $result = ''; |
|
1289
|
8
|
|
|
|
|
21
|
foreach my $field (@_) { |
|
1290
|
14
|
100
|
|
|
|
76
|
my $value = (ref( $field ) eq 'SCALAR') ? $$field : $self->get( $field ); |
|
1291
|
14
|
100
|
|
|
|
44
|
if (hascontent( $value )) { |
|
1292
|
8
|
|
|
|
|
62
|
$result = $value; |
|
1293
|
8
|
|
|
|
|
13
|
last; |
|
1294
|
|
|
|
|
|
|
} |
|
1295
|
|
|
|
|
|
|
} |
|
1296
|
8
|
|
|
|
|
18
|
return $result; |
|
1297
|
|
|
|
|
|
|
} |
|
1298
|
|
|
|
|
|
|
|
|
1299
|
|
|
|
|
|
|
|
|
1300
|
|
|
|
|
|
|
=head3 foreach |
|
1301
|
|
|
|
|
|
|
|
|
1302
|
|
|
|
|
|
|
Executes a CODE reference against repeating sub-records. XML files, for example, |
|
1303
|
|
|
|
|
|
|
have repeating nodes. B<foreach> allows you to format multiple fields from the |
|
1304
|
|
|
|
|
|
|
same record. It looks like this... |
|
1305
|
|
|
|
|
|
|
|
|
1306
|
|
|
|
|
|
|
# Capture the resulting strings. |
|
1307
|
|
|
|
|
|
|
my @results = $etl->foreach( sub { ... }, '/File/People' ); |
|
1308
|
|
|
|
|
|
|
|
|
1309
|
|
|
|
|
|
|
# Combine the resulting strings. |
|
1310
|
|
|
|
|
|
|
join( '; ', $etl->foreach( sub { ... }, '/File/People' ) ); |
|
1311
|
|
|
|
|
|
|
|
|
1312
|
|
|
|
|
|
|
B<foreach> calls L</get> to retrieve a list of sub-records. It replaces L</this> |
|
1313
|
|
|
|
|
|
|
with each sub-record in turn and executes the code reference. You can use any of |
|
1314
|
|
|
|
|
|
|
the standard unitlity functions inside the code reference. They will operate |
|
1315
|
|
|
|
|
|
|
only on the current sub-record. |
|
1316
|
|
|
|
|
|
|
|
|
1317
|
|
|
|
|
|
|
B<foreach> returns a single string per sub-record. Blank strings are discarded. |
|
1318
|
|
|
|
|
|
|
I<Blank> means C<undef>, empty strings, or all whitespace. You can filter |
|
1319
|
|
|
|
|
|
|
sub-records by returning C<undef> from the code reference. |
|
1320
|
|
|
|
|
|
|
|
|
1321
|
|
|
|
|
|
|
For example, you might do something like this to format names from XML... |
|
1322
|
|
|
|
|
|
|
|
|
1323
|
|
|
|
|
|
|
# Format names "last, first" and put a semi-colon between multiple names. |
|
1324
|
|
|
|
|
|
|
$etl->format( '; ', $etl->foreach( |
|
1325
|
|
|
|
|
|
|
sub { $etl->format( ', ', '/Last', '/First' ) }, |
|
1326
|
|
|
|
|
|
|
'/File/People' |
|
1327
|
|
|
|
|
|
|
) ); |
|
1328
|
|
|
|
|
|
|
|
|
1329
|
|
|
|
|
|
|
# Same thing, but using parameters. |
|
1330
|
|
|
|
|
|
|
$etl->format( '; ', $etl->foreach( |
|
1331
|
|
|
|
|
|
|
sub { |
|
1332
|
|
|
|
|
|
|
my ($object, $record) = @_; |
|
1333
|
|
|
|
|
|
|
$object->format( ', ', '/Last', '/First' ) |
|
1334
|
|
|
|
|
|
|
}, |
|
1335
|
|
|
|
|
|
|
'/File/People' |
|
1336
|
|
|
|
|
|
|
) ); |
|
1337
|
|
|
|
|
|
|
|
|
1338
|
|
|
|
|
|
|
B<foreach> passed two parameters to the code reference... |
|
1339
|
|
|
|
|
|
|
|
|
1340
|
|
|
|
|
|
|
=over |
|
1341
|
|
|
|
|
|
|
|
|
1342
|
|
|
|
|
|
|
=item The current B<ETL::Pipeline> object. |
|
1343
|
|
|
|
|
|
|
|
|
1344
|
|
|
|
|
|
|
=item The current sub-record. This will be the same value as L</this>. |
|
1345
|
|
|
|
|
|
|
|
|
1346
|
|
|
|
|
|
|
=back |
|
1347
|
|
|
|
|
|
|
|
|
1348
|
|
|
|
|
|
|
The code reference should return a string. If it returns an ARRAY reference, |
|
1349
|
|
|
|
|
|
|
B<foreach> flattens it, discarding any blank elements. So if you have to return |
|
1350
|
|
|
|
|
|
|
multiple values, B<foreach> tries to do something intelligent. |
|
1351
|
|
|
|
|
|
|
|
|
1352
|
|
|
|
|
|
|
B<foreach> sets L</this> before executing the CODE reference. The code can call |
|
1353
|
|
|
|
|
|
|
any of the other utility functions with field names relative to the sub-record. |
|
1354
|
|
|
|
|
|
|
I<Please note, the code cannot access fields outside of the sub-record>. |
|
1355
|
|
|
|
|
|
|
Instead, cache these in a local variable before called B<foreach>. |
|
1356
|
|
|
|
|
|
|
|
|
1357
|
|
|
|
|
|
|
my $x = $etl->get( '/File/MainPerson' ); |
|
1358
|
|
|
|
|
|
|
join( '; ', $etl->foreach( sub { |
|
1359
|
|
|
|
|
|
|
my $y = $etl->format( ', ', '/Last', '/First' ); |
|
1360
|
|
|
|
|
|
|
"$y is with $x"; |
|
1361
|
|
|
|
|
|
|
}, '/File/People' ); |
|
1362
|
|
|
|
|
|
|
|
|
1363
|
|
|
|
|
|
|
=head4 Calling foreach |
|
1364
|
|
|
|
|
|
|
|
|
1365
|
|
|
|
|
|
|
B<foreach> accepts the code reference as the first parameter. All remaining |
|
1366
|
|
|
|
|
|
|
parameters are field names. B<foreach> passes them through L</get> one at a |
|
1367
|
|
|
|
|
|
|
time. Each field should resolve to a repeating node. |
|
1368
|
|
|
|
|
|
|
|
|
1369
|
|
|
|
|
|
|
B<foreach> returns a list. The list may be empty or have one element. But it is |
|
1370
|
|
|
|
|
|
|
always a list. You can use Perl functions such as C<join> to convert the list |
|
1371
|
|
|
|
|
|
|
into a single value. |
|
1372
|
|
|
|
|
|
|
|
|
1373
|
|
|
|
|
|
|
=cut |
|
1374
|
|
|
|
|
|
|
|
|
1375
|
|
|
|
|
|
|
sub foreach { |
|
1376
|
29
|
|
|
30
|
1
|
230
|
my $self = shift; |
|
1377
|
29
|
|
|
|
|
38
|
my $code = shift; |
|
1378
|
|
|
|
|
|
|
|
|
1379
|
|
|
|
|
|
|
# Cache the current record. I need to restore this later so other function |
|
1380
|
|
|
|
|
|
|
# calls work normally. |
|
1381
|
29
|
|
|
|
|
577
|
my $this = $self->this; |
|
1382
|
|
|
|
|
|
|
|
|
1383
|
|
|
|
|
|
|
# Retrieve the repeating sub-records. |
|
1384
|
29
|
|
|
|
|
62
|
my $all = []; |
|
1385
|
29
|
|
|
|
|
75
|
foreach my $item (@_) { |
|
1386
|
29
|
|
|
|
|
86
|
my $current = $self->get( $item ); |
|
1387
|
29
|
50
|
|
|
|
169
|
if (ref( $current ) eq 'ARRAY') { push @$all, @$current; } |
|
|
0
|
|
|
|
|
0
|
|
|
1388
|
29
|
|
|
|
|
75
|
else { push @$all, $current; } |
|
1389
|
|
|
|
|
|
|
} |
|
1390
|
|
|
|
|
|
|
|
|
1391
|
|
|
|
|
|
|
# Execute the code reference against each sub-record. |
|
1392
|
29
|
|
|
|
|
37
|
my @results; |
|
1393
|
29
|
|
|
|
|
64
|
foreach my $record (@$all) { |
|
1394
|
29
|
|
|
|
|
876
|
$self->_set_this( $record ); |
|
1395
|
29
|
|
|
|
|
62
|
local $_ = $record; |
|
1396
|
29
|
|
|
|
|
83
|
my @values = $code->( $self, $_ ); |
|
1397
|
|
|
|
|
|
|
|
|
1398
|
29
|
50
|
33
|
|
|
160
|
if (scalar( @values ) == 1 && ref( $values[0] ) eq 'ARRAY') { |
|
1399
|
0
|
|
|
|
|
0
|
push @results, @{$values[0]}; |
|
|
0
|
|
|
|
|
0
|
|
|
1400
|
29
|
|
|
|
|
90
|
} else { push @results, @values; } |
|
1401
|
|
|
|
|
|
|
} |
|
1402
|
|
|
|
|
|
|
|
|
1403
|
|
|
|
|
|
|
# Restore the current record and return all of the results. |
|
1404
|
29
|
|
|
|
|
816
|
$self->_set_this( $this ); |
|
1405
|
29
|
50
|
|
|
|
60
|
return grep { ref( $_ ) eq '' && hascontent( $_ ) } @results; |
|
|
29
|
|
|
|
|
137
|
|
|
1406
|
|
|
|
|
|
|
} |
|
1407
|
|
|
|
|
|
|
|
|
1408
|
|
|
|
|
|
|
|
|
1409
|
|
|
|
|
|
|
=head3 format |
|
1410
|
|
|
|
|
|
|
|
|
1411
|
|
|
|
|
|
|
Builds a string from a list of fields, discarding blank fields. That's the main |
|
1412
|
|
|
|
|
|
|
purpose of the function - don't use entirely blank strings. This prevents things |
|
1413
|
|
|
|
|
|
|
like orphanded commas from showing up in your data. |
|
1414
|
|
|
|
|
|
|
|
|
1415
|
|
|
|
|
|
|
B<format> can both concateneate (C<join>) fields or format them (C<sprintf>). |
|
1416
|
|
|
|
|
|
|
A SCALAR reference signifies a format. A regular string indicates concatenation. |
|
1417
|
|
|
|
|
|
|
|
|
1418
|
|
|
|
|
|
|
# Concatenate fields (aka join) |
|
1419
|
|
|
|
|
|
|
$etl->format( "\n\n", 'D', 'E', 'F' ); |
|
1420
|
|
|
|
|
|
|
|
|
1421
|
|
|
|
|
|
|
# Format fields (aka sprintf) |
|
1422
|
|
|
|
|
|
|
$etl->format( \'%s, %s (%s)', 'D', 'E', 'F' ); |
|
1423
|
|
|
|
|
|
|
|
|
1424
|
|
|
|
|
|
|
You can nest constructs with an ARRAY reference. The seperator or format string |
|
1425
|
|
|
|
|
|
|
is the first element. The remaining elements are more fields (or other nested |
|
1426
|
|
|
|
|
|
|
ARRAY references). Basically, B<format> recursively calls itself passing the |
|
1427
|
|
|
|
|
|
|
array as parameters. |
|
1428
|
|
|
|
|
|
|
|
|
1429
|
|
|
|
|
|
|
# Blank lines between. Third line is two fields seperated by a space. |
|
1430
|
|
|
|
|
|
|
$etl->format( "\n\n", 'D', 'E', [' ', 'F', 'G'] ); |
|
1431
|
|
|
|
|
|
|
|
|
1432
|
|
|
|
|
|
|
# Blank lines between. Third line is formatted. |
|
1433
|
|
|
|
|
|
|
$etl->format( "\n\n", 'D', 'E', [\'-- from %s %s', 'F', 'G'] ); |
|
1434
|
|
|
|
|
|
|
|
|
1435
|
|
|
|
|
|
|
I<Blank> means C<undef>, empty string, or all whitespace. B<format> returns an |
|
1436
|
|
|
|
|
|
|
empty string if all of fields are blank. |
|
1437
|
|
|
|
|
|
|
|
|
1438
|
|
|
|
|
|
|
=head4 Format until |
|
1439
|
|
|
|
|
|
|
|
|
1440
|
|
|
|
|
|
|
B<format> optionally accepts a CODE reference to stop processing early. |
|
1441
|
|
|
|
|
|
|
B<format> passes each value into the code reference. If the code returns |
|
1442
|
|
|
|
|
|
|
B<true>, then B<format> stops processing fields and returns. The code reference |
|
1443
|
|
|
|
|
|
|
comes before the seperator/format. |
|
1444
|
|
|
|
|
|
|
|
|
1445
|
|
|
|
|
|
|
# Concantenate fields until one of them is the word "END". |
|
1446
|
|
|
|
|
|
|
$etl->format( sub { $_ eq 'END' }, "\n\n", '/*[idx > 8]' ); |
|
1447
|
|
|
|
|
|
|
|
|
1448
|
|
|
|
|
|
|
B<format> sets C<$_> to the field value. It also passes the value as the first |
|
1449
|
|
|
|
|
|
|
and only parameter. Your code can use either C<$_> or C<shift> to access the |
|
1450
|
|
|
|
|
|
|
value. |
|
1451
|
|
|
|
|
|
|
|
|
1452
|
|
|
|
|
|
|
You can include code references inside an ARRAY reference too. The code only |
|
1453
|
|
|
|
|
|
|
stops processing inside that substring. It continues processing the outer set of |
|
1454
|
|
|
|
|
|
|
fields after the ARRAY. |
|
1455
|
|
|
|
|
|
|
|
|
1456
|
|
|
|
|
|
|
# The last line concatenates fields until one of them is the word "END". |
|
1457
|
|
|
|
|
|
|
$etl->format( "\n\n", 'A', 'B', [sub { $_ eq 'END' }, ' ', '/*[idx > 8]'] ); |
|
1458
|
|
|
|
|
|
|
|
|
1459
|
|
|
|
|
|
|
# Do the conditional concatenate in the middle. Results in 3 lines. |
|
1460
|
|
|
|
|
|
|
$etl->format( "\n\n", 'A', [sub { $_ eq 'END' }, ' ', '/*[idx > 8]'], 'B' ); |
|
1461
|
|
|
|
|
|
|
|
|
1462
|
|
|
|
|
|
|
What happens if you have a CODE reference and an ARRAY reference, like this? |
|
1463
|
|
|
|
|
|
|
|
|
1464
|
|
|
|
|
|
|
$etl->format( sub { $_ eq 'END' }, "\n\n", 'A', [' ', 'B', 'C'], 'D' ); |
|
1465
|
|
|
|
|
|
|
|
|
1466
|
|
|
|
|
|
|
B<format> retrieves the ARRAY reference as a single string. It then sends that |
|
1467
|
|
|
|
|
|
|
entire string through the CODE reference. If the code returns B<true>, |
|
1468
|
|
|
|
|
|
|
processing stops. In other words, B<format> treats the results of an ARRAY |
|
1469
|
|
|
|
|
|
|
reference just like any other field. |
|
1470
|
|
|
|
|
|
|
|
|
1471
|
|
|
|
|
|
|
=cut |
|
1472
|
|
|
|
|
|
|
|
|
1473
|
|
|
|
|
|
|
sub format { |
|
1474
|
86
|
|
|
87
|
1
|
180
|
my $self = shift; |
|
1475
|
86
|
|
|
|
|
98
|
my $conditional = shift; |
|
1476
|
86
|
|
|
|
|
93
|
my $seperator; |
|
1477
|
|
|
|
|
|
|
|
|
1478
|
|
|
|
|
|
|
# Process the fixed parameters. |
|
1479
|
86
|
100
|
|
|
|
119
|
if (ref( $conditional ) eq 'CODE') { |
|
1480
|
4
|
|
|
|
|
6
|
$seperator = shift; |
|
1481
|
|
|
|
|
|
|
} else { |
|
1482
|
82
|
|
|
|
|
89
|
$seperator = $conditional; |
|
1483
|
82
|
|
|
|
|
101
|
$conditional = undef ; |
|
1484
|
|
|
|
|
|
|
} |
|
1485
|
|
|
|
|
|
|
|
|
1486
|
|
|
|
|
|
|
# Retrieve the fields. |
|
1487
|
86
|
|
|
|
|
92
|
my @results; |
|
1488
|
86
|
|
|
|
|
90
|
my $stop = 0; |
|
1489
|
|
|
|
|
|
|
|
|
1490
|
86
|
|
|
|
|
115
|
foreach my $name (@_) { |
|
1491
|
|
|
|
|
|
|
# Retrieve the value for this field. |
|
1492
|
146
|
|
|
|
|
172
|
my $values; |
|
1493
|
146
|
100
|
|
|
|
227
|
if (ref( $name ) eq 'ARRAY') { |
|
1494
|
56
|
|
|
|
|
139
|
$values = $self->format( @$name ); |
|
1495
|
|
|
|
|
|
|
} else { |
|
1496
|
90
|
|
|
|
|
185
|
$values = $self->get( $name ); |
|
1497
|
|
|
|
|
|
|
} |
|
1498
|
|
|
|
|
|
|
|
|
1499
|
|
|
|
|
|
|
# Check the results. |
|
1500
|
146
|
50
|
|
|
|
626
|
$values = [$values] unless ref( $values ) eq 'ARRAY'; |
|
1501
|
146
|
100
|
|
|
|
204
|
if (defined $conditional) { |
|
1502
|
12
|
|
|
|
|
18
|
foreach my $item (@$values) { |
|
1503
|
12
|
|
|
|
|
17
|
local $_ = $item; |
|
1504
|
12
|
100
|
|
|
|
27
|
if ($conditional->( $_ )) { |
|
1505
|
2
|
|
|
|
|
9
|
$stop = 1; |
|
1506
|
2
|
|
|
|
|
7
|
last; |
|
1507
|
10
|
|
|
|
|
53
|
} else { push @results, $item; } |
|
1508
|
|
|
|
|
|
|
} |
|
1509
|
134
|
|
|
|
|
193
|
} else { push @results, @$values; } |
|
1510
|
|
|
|
|
|
|
|
|
1511
|
|
|
|
|
|
|
# Terminate the loop early. |
|
1512
|
146
|
100
|
|
|
|
438
|
last if $stop; |
|
1513
|
|
|
|
|
|
|
} |
|
1514
|
|
|
|
|
|
|
|
|
1515
|
|
|
|
|
|
|
# Return the formatted results. |
|
1516
|
86
|
100
|
|
|
|
133
|
if (ref( $seperator ) eq 'SCALAR') { |
|
1517
|
22
|
100
|
|
15
|
|
140
|
if (any { hascontent( $_ ) } @results) { |
|
|
14
|
|
|
|
|
41
|
|
|
1518
|
11
|
|
|
11
|
|
257
|
no warnings 'redundant'; |
|
|
11
|
|
|
|
|
39
|
|
|
|
11
|
|
|
|
|
11487
|
|
|
1519
|
14
|
|
|
|
|
214
|
return sprintf( $$seperator, @results ); |
|
1520
|
8
|
|
|
|
|
18
|
} else { return ''; } |
|
1521
|
64
|
|
|
|
|
93
|
} else { return join( $seperator, grep { hascontent( $_ ) } @results ); } |
|
|
120
|
|
|
|
|
611
|
|
|
1522
|
|
|
|
|
|
|
} |
|
1523
|
|
|
|
|
|
|
|
|
1524
|
|
|
|
|
|
|
|
|
1525
|
|
|
|
|
|
|
=head3 from |
|
1526
|
|
|
|
|
|
|
|
|
1527
|
|
|
|
|
|
|
Return data from a hash, like the one from L<ETL::Pipeline::Output::Memory>. The |
|
1528
|
|
|
|
|
|
|
first parameter is the hash reference. The remaining parameters are field names |
|
1529
|
|
|
|
|
|
|
whose values become the hash keys. It's a convenient shorthand for accessing |
|
1530
|
|
|
|
|
|
|
a hash, with all of the error checking built in. |
|
1531
|
|
|
|
|
|
|
|
|
1532
|
|
|
|
|
|
|
$etl->from( $etl->output->hash, qr/myID/i, qr/Site/i ); |
|
1533
|
|
|
|
|
|
|
|
|
1534
|
|
|
|
|
|
|
To pass a string literal, use a scalar reference. |
|
1535
|
|
|
|
|
|
|
|
|
1536
|
|
|
|
|
|
|
$etl->from( \%hash, qr/myID/i, \'Date' ); |
|
1537
|
|
|
|
|
|
|
|
|
1538
|
|
|
|
|
|
|
This is equivalent to... |
|
1539
|
|
|
|
|
|
|
|
|
1540
|
|
|
|
|
|
|
$hash{$etl->get( qr/myID/i )}->{'Date'} |
|
1541
|
|
|
|
|
|
|
|
|
1542
|
|
|
|
|
|
|
B<from> returns C<undef> is any one key does not exist. |
|
1543
|
|
|
|
|
|
|
|
|
1544
|
|
|
|
|
|
|
B<from> automatically dereferences arrays. So if you store multiple values, the |
|
1545
|
|
|
|
|
|
|
function returns them as a list instead of the list reference. Scalar values and |
|
1546
|
|
|
|
|
|
|
hash references are returned as-is. |
|
1547
|
|
|
|
|
|
|
|
|
1548
|
|
|
|
|
|
|
=cut |
|
1549
|
|
|
|
|
|
|
|
|
1550
|
|
|
|
|
|
|
sub from { |
|
1551
|
6
|
|
|
7
|
1
|
25
|
my $self = shift; |
|
1552
|
6
|
|
|
|
|
9
|
my $value = shift; |
|
1553
|
|
|
|
|
|
|
|
|
1554
|
6
|
|
|
|
|
21
|
foreach my $field (@_) { |
|
1555
|
8
|
100
|
|
|
|
37
|
if (ref( $value ) ne 'HASH' ) { return undef ; } |
|
|
1
|
50
|
|
|
|
4
|
|
|
|
|
100
|
|
|
|
|
|
|
1556
|
0
|
|
|
|
|
0
|
elsif (!defined( $field ) ) { return undef ; } |
|
1557
|
1
|
|
|
|
|
5
|
elsif (ref( $field ) eq 'SCALAR' ) { $value = $value->{$$field}; } |
|
1558
|
|
|
|
|
|
|
else { |
|
1559
|
6
|
|
|
|
|
19
|
my $key = $self->get( $field ); |
|
1560
|
6
|
50
|
|
|
|
21
|
if (hascontent( $key )) { $value = $value->{$key}; } |
|
|
6
|
|
|
|
|
59
|
|
|
1561
|
0
|
|
|
|
|
0
|
else { return undef ; } |
|
1562
|
|
|
|
|
|
|
} |
|
1563
|
|
|
|
|
|
|
} |
|
1564
|
5
|
100
|
|
|
|
21
|
return (ref( $value ) eq 'ARRAY' ? @$value : $value); |
|
1565
|
|
|
|
|
|
|
} |
|
1566
|
|
|
|
|
|
|
|
|
1567
|
|
|
|
|
|
|
|
|
1568
|
|
|
|
|
|
|
=head3 name |
|
1569
|
|
|
|
|
|
|
|
|
1570
|
|
|
|
|
|
|
Format fields as a person's name. Names are common data elements. This function |
|
1571
|
|
|
|
|
|
|
provides a common format. Yet is flexible enough to handle customization. |
|
1572
|
|
|
|
|
|
|
|
|
1573
|
|
|
|
|
|
|
# Simple name formatted as "last, first". |
|
1574
|
|
|
|
|
|
|
$etl->name( 'Last', 'First' ); |
|
1575
|
|
|
|
|
|
|
|
|
1576
|
|
|
|
|
|
|
# Simple name formatted "first last". The format is the first element. |
|
1577
|
|
|
|
|
|
|
$etl->name( \'%s %s', 'First', 'Last' ); |
|
1578
|
|
|
|
|
|
|
|
|
1579
|
|
|
|
|
|
|
# Add a role or description in parenthesis, if it's there. |
|
1580
|
|
|
|
|
|
|
$etl->name( 'Last', 'First', ['Role'] ); |
|
1581
|
|
|
|
|
|
|
|
|
1582
|
|
|
|
|
|
|
# Add two fields with a custom format if at least one exists. |
|
1583
|
|
|
|
|
|
|
$etl->name( 'Last', 'First', [\'(%s; %s)', 'Role', 'Type'] ); |
|
1584
|
|
|
|
|
|
|
|
|
1585
|
|
|
|
|
|
|
# Same thing, but only adds the semi-colon if both values are there. |
|
1586
|
|
|
|
|
|
|
$etl->name( 'Last', 'First', [['; ', 'Role', 'Type']] ); |
|
1587
|
|
|
|
|
|
|
|
|
1588
|
|
|
|
|
|
|
# Long hand way of writing the above. |
|
1589
|
|
|
|
|
|
|
$etl->name( 'Last', 'First', [\'(%s)', ['; ', 'Role', 'Type']] ); |
|
1590
|
|
|
|
|
|
|
|
|
1591
|
|
|
|
|
|
|
If B<name> doesn't do what you want, try L</build>. L</build> is more flexible. |
|
1592
|
|
|
|
|
|
|
As a matter of fact, B<name> calls L</build> internally. |
|
1593
|
|
|
|
|
|
|
|
|
1594
|
|
|
|
|
|
|
=cut |
|
1595
|
|
|
|
|
|
|
|
|
1596
|
|
|
|
|
|
|
sub name { |
|
1597
|
16
|
|
|
17
|
1
|
73
|
my $self = shift; |
|
1598
|
|
|
|
|
|
|
# Initialize name format. |
|
1599
|
16
|
100
|
|
|
|
32
|
my $name_format = ref( $_[0] ) eq 'SCALAR' ? shift : ', '; |
|
1600
|
16
|
|
|
|
|
19
|
my @name_fields; |
|
1601
|
|
|
|
|
|
|
|
|
1602
|
16
|
|
|
|
|
21
|
my $role_format = \'(%s)'; |
|
1603
|
16
|
|
|
|
|
20
|
my @role_fields; |
|
1604
|
|
|
|
|
|
|
|
|
1605
|
|
|
|
|
|
|
# Process name and role fields. Anything after that is just extra text |
|
1606
|
|
|
|
|
|
|
# appended to the result. |
|
1607
|
16
|
|
|
|
|
46
|
for (my $item = shift; defined $item; $item = shift) { |
|
1608
|
38
|
100
|
|
|
|
65
|
if (ref( $item ) eq 'ARRAY') { |
|
1609
|
8
|
100
|
|
|
|
46
|
$role_format = shift( @$item ) if ref( $item->[0] ) eq 'SCALAR'; |
|
1610
|
8
|
|
|
|
|
21
|
@role_fields = @$item; |
|
1611
|
8
|
|
|
|
|
17
|
last; |
|
1612
|
30
|
|
|
|
|
53
|
} else { push @name_fields, $item; } |
|
1613
|
|
|
|
|
|
|
} |
|
1614
|
16
|
|
|
|
|
25
|
my $last_name = shift @name_fields; |
|
1615
|
|
|
|
|
|
|
|
|
1616
|
|
|
|
|
|
|
# Build the string using the "build" method. Elements are concatenated with |
|
1617
|
|
|
|
|
|
|
# a single space between them. This properly leaves out any blank elements. |
|
1618
|
16
|
|
|
|
|
55
|
return $self->format( ' ', |
|
1619
|
|
|
|
|
|
|
[$name_format, $last_name, [' ', @name_fields]], |
|
1620
|
|
|
|
|
|
|
[$role_format, @role_fields], |
|
1621
|
|
|
|
|
|
|
@_ |
|
1622
|
|
|
|
|
|
|
); |
|
1623
|
|
|
|
|
|
|
} |
|
1624
|
|
|
|
|
|
|
|
|
1625
|
|
|
|
|
|
|
|
|
1626
|
|
|
|
|
|
|
=head3 piece |
|
1627
|
|
|
|
|
|
|
|
|
1628
|
|
|
|
|
|
|
Split a string and extract one or more of the individual pieces. This can come |
|
1629
|
|
|
|
|
|
|
in handy with file names, for example. A file split on the period has two pieces |
|
1630
|
|
|
|
|
|
|
- the name and the extension, piece 1 and piece 2 respectively. Here are some |
|
1631
|
|
|
|
|
|
|
examples... |
|
1632
|
|
|
|
|
|
|
|
|
1633
|
|
|
|
|
|
|
# File name: Example.JPG |
|
1634
|
|
|
|
|
|
|
# Returns: Example |
|
1635
|
|
|
|
|
|
|
$etl->piece( 'Filename', qr|\.|, 1 ); |
|
1636
|
|
|
|
|
|
|
|
|
1637
|
|
|
|
|
|
|
# Returns: JPG |
|
1638
|
|
|
|
|
|
|
$etl->piece( 'Filename', qr|\.|, 2 ); |
|
1639
|
|
|
|
|
|
|
|
|
1640
|
|
|
|
|
|
|
B<piece> takes a minimum of 3 parameters... |
|
1641
|
|
|
|
|
|
|
|
|
1642
|
|
|
|
|
|
|
=over |
|
1643
|
|
|
|
|
|
|
|
|
1644
|
|
|
|
|
|
|
=item 1. Any field name valid for L</get> |
|
1645
|
|
|
|
|
|
|
|
|
1646
|
|
|
|
|
|
|
=item 2. Regular expression for splitting the field |
|
1647
|
|
|
|
|
|
|
|
|
1648
|
|
|
|
|
|
|
=item 3. Piece number to extract (the first piece is B<1>, not B<0>) |
|
1649
|
|
|
|
|
|
|
|
|
1650
|
|
|
|
|
|
|
=back |
|
1651
|
|
|
|
|
|
|
|
|
1652
|
|
|
|
|
|
|
B<piece> accepts any field name valid with L</get>. Multiple values are |
|
1653
|
|
|
|
|
|
|
concatenated with a single space. You can specify a different seperator using |
|
1654
|
|
|
|
|
|
|
the same syntax as L</mapping> - an array reference. In that array, the first |
|
1655
|
|
|
|
|
|
|
element is the field name and the second is the seperator string. |
|
1656
|
|
|
|
|
|
|
|
|
1657
|
|
|
|
|
|
|
The second parameter for B<piece> is a regular expression. B<piece> passes this |
|
1658
|
|
|
|
|
|
|
to C<split> and breaks apart the field value. |
|
1659
|
|
|
|
|
|
|
|
|
1660
|
|
|
|
|
|
|
The third parameter returns one or more pieces from the split string. In the |
|
1661
|
|
|
|
|
|
|
simplest form, this is a single number. And B<piece> returns that piece from the |
|
1662
|
|
|
|
|
|
|
split string. Note that pieces start at number 1, not 0 like array indexes. |
|
1663
|
|
|
|
|
|
|
|
|
1664
|
|
|
|
|
|
|
A negative piece number starts from the end of the string. For example, B<-2> |
|
1665
|
|
|
|
|
|
|
returns the second to last piece. You can also include a length - number of |
|
1666
|
|
|
|
|
|
|
pieces to return starting at the given position. The default length is B<1>. |
|
1667
|
|
|
|
|
|
|
|
|
1668
|
|
|
|
|
|
|
# Filename: abc_def_ghi_jkl_mno_pqr |
|
1669
|
|
|
|
|
|
|
# Returns: abc def |
|
1670
|
|
|
|
|
|
|
$etl->piece( 'Filename', qr/_/, '1,2' ); |
|
1671
|
|
|
|
|
|
|
|
|
1672
|
|
|
|
|
|
|
# Returns: ghi jkl mno |
|
1673
|
|
|
|
|
|
|
$etl->piece( 'Filename', qr/_/, '3,3' ); |
|
1674
|
|
|
|
|
|
|
|
|
1675
|
|
|
|
|
|
|
# Returns: mno pqr |
|
1676
|
|
|
|
|
|
|
$etl->piece( 'Filename', qr/_/, '-2,2' ); |
|
1677
|
|
|
|
|
|
|
|
|
1678
|
|
|
|
|
|
|
Notice that the multiple pieces are re-joined using a space. You can specify the |
|
1679
|
|
|
|
|
|
|
seperator string after the length. Do not put spaces after the commas. B<piece> |
|
1680
|
|
|
|
|
|
|
will mistakenly use it as part of the seperator. |
|
1681
|
|
|
|
|
|
|
|
|
1682
|
|
|
|
|
|
|
# Filename: abc_def_ghi_jkl_mno_pqr |
|
1683
|
|
|
|
|
|
|
# Returns: abc+def |
|
1684
|
|
|
|
|
|
|
$etl->piece( 'Filename', qr/_/, '1,2,+' ); |
|
1685
|
|
|
|
|
|
|
|
|
1686
|
|
|
|
|
|
|
# Returns: ghi,jkl,mno |
|
1687
|
|
|
|
|
|
|
$etl->piece( 'Filename', qr/_/, '3,3,,' ); |
|
1688
|
|
|
|
|
|
|
|
|
1689
|
|
|
|
|
|
|
# Returns: ghi -jkl -mno |
|
1690
|
|
|
|
|
|
|
$etl->piece( 'Filename', qr/_/, '3,3, -' ); |
|
1691
|
|
|
|
|
|
|
|
|
1692
|
|
|
|
|
|
|
A blank length returns all pieces from the start position to the end, just like |
|
1693
|
|
|
|
|
|
|
the Perl C<splice> function. |
|
1694
|
|
|
|
|
|
|
|
|
1695
|
|
|
|
|
|
|
# Filename: abc_def_ghi_jkl_mno_pqr |
|
1696
|
|
|
|
|
|
|
# Returns: ghi jkl mno pqr |
|
1697
|
|
|
|
|
|
|
$etl->piece( 'Filename', qr/_/, '3,' ); |
|
1698
|
|
|
|
|
|
|
|
|
1699
|
|
|
|
|
|
|
# Returns: ghi+jkl+mno+pqr |
|
1700
|
|
|
|
|
|
|
$etl->piece( 'Filename', qr/_/, '3,,+' ); |
|
1701
|
|
|
|
|
|
|
|
|
1702
|
|
|
|
|
|
|
=head4 Recursive pieces |
|
1703
|
|
|
|
|
|
|
|
|
1704
|
|
|
|
|
|
|
Imagine a name like I<Public, John Q., MD>. How would you parse out the middle |
|
1705
|
|
|
|
|
|
|
initial by hand? First, you piece the string by comma. Next you split the |
|
1706
|
|
|
|
|
|
|
second piece of that by a space. B<piece> lets you do the same thing. |
|
1707
|
|
|
|
|
|
|
|
|
1708
|
|
|
|
|
|
|
# Name: Public, John Q., MD |
|
1709
|
|
|
|
|
|
|
# Returns: Q. |
|
1710
|
|
|
|
|
|
|
$etl->piece( 'Name', qr/,/, 2, qr/ /, 2 ); |
|
1711
|
|
|
|
|
|
|
|
|
1712
|
|
|
|
|
|
|
# Returns: John |
|
1713
|
|
|
|
|
|
|
$etl->piece( 'Name', qr/,/, 2, qr/ /, 1 ); |
|
1714
|
|
|
|
|
|
|
|
|
1715
|
|
|
|
|
|
|
B<piece> will take the results from the first split and use it as the input to |
|
1716
|
|
|
|
|
|
|
the second split. It will continue to do this for as many pairs of expressions |
|
1717
|
|
|
|
|
|
|
and piece numbers as you send. |
|
1718
|
|
|
|
|
|
|
|
|
1719
|
|
|
|
|
|
|
=cut |
|
1720
|
|
|
|
|
|
|
|
|
1721
|
|
|
|
|
|
|
sub piece { |
|
1722
|
16
|
|
|
17
|
1
|
94
|
my $self = shift; |
|
1723
|
16
|
|
|
|
|
21
|
my $field = shift; |
|
1724
|
|
|
|
|
|
|
|
|
1725
|
|
|
|
|
|
|
# Retrieve the initial value from the field. |
|
1726
|
16
|
|
|
|
|
20
|
my $seperator = ' '; |
|
1727
|
16
|
50
|
|
|
|
50
|
if (ref( $field ) eq 'ARRAY') { |
|
1728
|
0
|
|
0
|
|
|
0
|
$seperator = $field->[1] // ' '; |
|
1729
|
0
|
|
|
|
|
0
|
$field = $field->[0]; |
|
1730
|
|
|
|
|
|
|
} |
|
1731
|
16
|
|
|
|
|
31
|
my $value = $self->get( $field ); |
|
1732
|
16
|
50
|
|
|
|
35
|
$value = trim( join( $seperator, @$value ) ) if ref( $value ) eq 'ARRAY'; |
|
1733
|
|
|
|
|
|
|
|
|
1734
|
|
|
|
|
|
|
# Recursively split the string. |
|
1735
|
16
|
|
|
|
|
33
|
while (scalar @_) { |
|
1736
|
18
|
|
|
|
|
41
|
my $split = shift; |
|
1737
|
18
|
|
|
|
|
48
|
my @location = split /,/, shift, 3; |
|
1738
|
|
|
|
|
|
|
|
|
1739
|
18
|
|
|
|
|
60
|
my @pieces = split( $split, $value ); |
|
1740
|
18
|
50
|
|
|
|
50
|
if (scalar( @location ) == 0) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
1741
|
0
|
|
|
|
|
0
|
$value = $pieces[0]; |
|
1742
|
|
|
|
|
|
|
} elsif (scalar( @location ) == 1) { |
|
1743
|
12
|
100
|
|
|
|
36
|
my $index = $location[0] > 0 ? $location[0] - 1 : $location[0]; |
|
1744
|
12
|
|
|
|
|
20
|
$value = $pieces[$index]; |
|
1745
|
|
|
|
|
|
|
} elsif (scalar( @location ) == 2) { |
|
1746
|
2
|
|
|
|
|
5
|
my @parts; |
|
1747
|
2
|
50
|
|
|
|
8
|
if (hascontent( $location[1] )) { |
|
1748
|
2
|
|
|
|
|
23
|
@parts = splice @pieces, $location[0] - 1, $location[1]; |
|
1749
|
|
|
|
|
|
|
} else { |
|
1750
|
0
|
|
|
|
|
0
|
@parts = splice @pieces, $location[0] - 1; |
|
1751
|
|
|
|
|
|
|
} |
|
1752
|
2
|
|
|
|
|
7
|
$value = join( ' ', @parts ); |
|
1753
|
|
|
|
|
|
|
} else { |
|
1754
|
4
|
|
|
|
|
8
|
my @parts; |
|
1755
|
4
|
100
|
|
|
|
16
|
if (hascontent( $location[1] )) { |
|
1756
|
2
|
|
|
|
|
32
|
@parts = splice @pieces, $location[0] - 1, $location[1]; |
|
1757
|
|
|
|
|
|
|
} else { |
|
1758
|
2
|
|
|
|
|
19
|
@parts = splice @pieces, $location[0] - 1; |
|
1759
|
|
|
|
|
|
|
} |
|
1760
|
4
|
|
|
|
|
11
|
$value = join( $location[2], @parts ); |
|
1761
|
|
|
|
|
|
|
} |
|
1762
|
18
|
|
|
|
|
47
|
$value = trim( $value ); |
|
1763
|
|
|
|
|
|
|
} |
|
1764
|
|
|
|
|
|
|
|
|
1765
|
|
|
|
|
|
|
# Return the value extracted from the last split. |
|
1766
|
16
|
|
100
|
|
|
190
|
return $value // ''; |
|
1767
|
|
|
|
|
|
|
} |
|
1768
|
|
|
|
|
|
|
|
|
1769
|
|
|
|
|
|
|
|
|
1770
|
|
|
|
|
|
|
=head3 replace |
|
1771
|
|
|
|
|
|
|
|
|
1772
|
|
|
|
|
|
|
Substitute one string for another. This function uses the C<s///> operator and |
|
1773
|
|
|
|
|
|
|
returns the modified string. B<replace> accepts a field name for L</get>. A |
|
1774
|
|
|
|
|
|
|
little more convenient that calling L</get> and applying C<s///> yourself. |
|
1775
|
|
|
|
|
|
|
|
|
1776
|
|
|
|
|
|
|
B<replace> takes three parameters... |
|
1777
|
|
|
|
|
|
|
|
|
1778
|
|
|
|
|
|
|
=over |
|
1779
|
|
|
|
|
|
|
|
|
1780
|
|
|
|
|
|
|
=item The field to change |
|
1781
|
|
|
|
|
|
|
|
|
1782
|
|
|
|
|
|
|
=item The regular expression to match against |
|
1783
|
|
|
|
|
|
|
|
|
1784
|
|
|
|
|
|
|
=item The string to replace the match with |
|
1785
|
|
|
|
|
|
|
|
|
1786
|
|
|
|
|
|
|
=back |
|
1787
|
|
|
|
|
|
|
|
|
1788
|
|
|
|
|
|
|
All instances of the matching pattern are replaced. For the patterns, you can |
|
1789
|
|
|
|
|
|
|
use strings or regular expression references. |
|
1790
|
|
|
|
|
|
|
|
|
1791
|
|
|
|
|
|
|
=cut |
|
1792
|
|
|
|
|
|
|
|
|
1793
|
|
|
|
|
|
|
sub replace { |
|
1794
|
4
|
|
|
4
|
1
|
27
|
my ($self, $field, $match, $change) = @_; |
|
1795
|
|
|
|
|
|
|
|
|
1796
|
4
|
|
|
|
|
13
|
my $string = $self->get( $field ); |
|
1797
|
4
|
|
|
|
|
51
|
$string =~ s/$match/$change/g; |
|
1798
|
4
|
|
|
|
|
15
|
return $string; |
|
1799
|
|
|
|
|
|
|
} |
|
1800
|
|
|
|
|
|
|
|
|
1801
|
|
|
|
|
|
|
|
|
1802
|
|
|
|
|
|
|
=head2 Other |
|
1803
|
|
|
|
|
|
|
|
|
1804
|
|
|
|
|
|
|
=head3 is_valid |
|
1805
|
|
|
|
|
|
|
|
|
1806
|
|
|
|
|
|
|
This method returns true or false. True means that the pipeline is ready to |
|
1807
|
|
|
|
|
|
|
go. False, of course, means that there's a problem. In a list context, |
|
1808
|
|
|
|
|
|
|
B<is_invalid> returns the false value and an error message. On success, the |
|
1809
|
|
|
|
|
|
|
error message is C<undef>. |
|
1810
|
|
|
|
|
|
|
|
|
1811
|
|
|
|
|
|
|
=cut |
|
1812
|
|
|
|
|
|
|
|
|
1813
|
|
|
|
|
|
|
sub is_valid { |
|
1814
|
71
|
|
|
71
|
1
|
189
|
my $self = shift; |
|
1815
|
71
|
|
|
|
|
157
|
my $error = undef; |
|
1816
|
|
|
|
|
|
|
|
|
1817
|
71
|
100
|
|
|
|
1679
|
if (!defined $self->_work_in) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
1818
|
3
|
|
|
|
|
7
|
$error = 'The working folder was not set'; |
|
1819
|
|
|
|
|
|
|
} elsif (!defined $self->_input) { |
|
1820
|
4
|
|
|
|
|
11
|
$error = 'The "input" object was not set'; |
|
1821
|
|
|
|
|
|
|
} elsif (!defined $self->_output) { |
|
1822
|
2
|
|
|
|
|
4
|
$error = 'The "output" object was not set'; |
|
1823
|
|
|
|
|
|
|
} else { |
|
1824
|
62
|
|
|
|
|
1827
|
my $found = $self->_has_constants; |
|
1825
|
|
|
|
|
|
|
|
|
1826
|
62
|
|
|
|
|
1358
|
my $mapping = $self->_mapping; |
|
1827
|
62
|
100
|
66
|
|
|
426
|
if (ref( $mapping ) eq 'CODE' ) { $found++; } |
|
|
4
|
100
|
|
|
|
8
|
|
|
1828
|
51
|
|
|
|
|
99
|
elsif (ref( $mapping ) eq 'HASH' && scalar( $mapping ) > 0) { $found++; } |
|
1829
|
|
|
|
|
|
|
|
|
1830
|
62
|
100
|
|
|
|
198
|
$error = 'The mapping was not set' unless $found; |
|
1831
|
|
|
|
|
|
|
} |
|
1832
|
|
|
|
|
|
|
|
|
1833
|
71
|
100
|
|
|
|
280
|
if (wantarray) { |
|
1834
|
62
|
100
|
|
|
|
291
|
return ((defined( $error ) ? 0 : 1), $error); |
|
1835
|
|
|
|
|
|
|
} else { |
|
1836
|
9
|
100
|
|
|
|
103
|
return (defined( $error ) ? 0 : 1); |
|
1837
|
|
|
|
|
|
|
} |
|
1838
|
|
|
|
|
|
|
} |
|
1839
|
|
|
|
|
|
|
|
|
1840
|
|
|
|
|
|
|
|
|
1841
|
|
|
|
|
|
|
#---------------------------------------------------------------------- |
|
1842
|
|
|
|
|
|
|
# Internal methods and attributes. |
|
1843
|
|
|
|
|
|
|
|
|
1844
|
|
|
|
|
|
|
# These attributes define field aliases. This is how column names work for Excel |
|
1845
|
|
|
|
|
|
|
# and CSV. The script may also define aliases to shortcut long names. |
|
1846
|
|
|
|
|
|
|
|
|
1847
|
|
|
|
|
|
|
has '_alias' => ( |
|
1848
|
|
|
|
|
|
|
default => sub { {input => [], pipeline => []} }, |
|
1849
|
|
|
|
|
|
|
init_arg => undef, |
|
1850
|
|
|
|
|
|
|
is => 'ro', |
|
1851
|
|
|
|
|
|
|
isa => 'HashRef[ArrayRef[HashRef[Str]]]', |
|
1852
|
|
|
|
|
|
|
); |
|
1853
|
|
|
|
|
|
|
|
|
1854
|
|
|
|
|
|
|
has '_alias_cache' => ( |
|
1855
|
|
|
|
|
|
|
default => sub { {} }, |
|
1856
|
|
|
|
|
|
|
handles => {_alias_cache_built => 'count'}, |
|
1857
|
|
|
|
|
|
|
is => 'ro', |
|
1858
|
|
|
|
|
|
|
isa => 'HashRef[ArrayRef[Str]]', |
|
1859
|
|
|
|
|
|
|
traits => [qw/Hash/], |
|
1860
|
|
|
|
|
|
|
); |
|
1861
|
|
|
|
|
|
|
|
|
1862
|
|
|
|
|
|
|
has '_alias_type' => ( |
|
1863
|
|
|
|
|
|
|
default => 'pipeline', |
|
1864
|
|
|
|
|
|
|
init_arg => undef, |
|
1865
|
|
|
|
|
|
|
is => 'rw', |
|
1866
|
|
|
|
|
|
|
isa => 'Str', |
|
1867
|
|
|
|
|
|
|
); |
|
1868
|
|
|
|
|
|
|
|
|
1869
|
|
|
|
|
|
|
|
|
1870
|
|
|
|
|
|
|
# This private method creates the ETL::Pipeline::Input and ETL::Pipeline::Output |
|
1871
|
|
|
|
|
|
|
# objects. It allows me to centralize the error handling. The program dies if |
|
1872
|
|
|
|
|
|
|
# there's an error. It means that something is wrong with the corresponding |
|
1873
|
|
|
|
|
|
|
# class. And I don't want to hide those errors. You can only fix errors if you |
|
1874
|
|
|
|
|
|
|
# know about them. |
|
1875
|
|
|
|
|
|
|
# |
|
1876
|
|
|
|
|
|
|
# Override or modify this method if you want to perform extra checks. |
|
1877
|
|
|
|
|
|
|
# |
|
1878
|
|
|
|
|
|
|
# The first parameter is a string with either "Input" or "Output". |
|
1879
|
|
|
|
|
|
|
# The method appends this value onto "ETL::Pipeline". For example, "Input" |
|
1880
|
|
|
|
|
|
|
# becomes "ETL::Pipeline::Input". |
|
1881
|
|
|
|
|
|
|
# |
|
1882
|
|
|
|
|
|
|
# The rest of the parameters are passed directly into the constructor for the |
|
1883
|
|
|
|
|
|
|
# class this method instantiates. |
|
1884
|
|
|
|
|
|
|
sub _object_of_class { |
|
1885
|
130
|
|
|
130
|
|
218
|
my $self = shift; |
|
1886
|
130
|
|
|
|
|
243
|
my $action = shift; |
|
1887
|
|
|
|
|
|
|
|
|
1888
|
130
|
|
|
|
|
338
|
my @arguments = @_; |
|
1889
|
130
|
50
|
66
|
|
|
669
|
@arguments = @{$arguments[0]} if |
|
|
0
|
|
|
|
|
0
|
|
|
1890
|
|
|
|
|
|
|
scalar( @arguments ) == 1 |
|
1891
|
|
|
|
|
|
|
&& ref( $arguments[0] ) eq 'ARRAY' |
|
1892
|
|
|
|
|
|
|
; |
|
1893
|
|
|
|
|
|
|
|
|
1894
|
130
|
|
|
|
|
290
|
my $class = shift @arguments; |
|
1895
|
130
|
100
|
|
|
|
452
|
if (substr( $class, 0, 1 ) eq '+') { |
|
1896
|
11
|
|
|
|
|
23
|
$class = substr( $class, 1 ); |
|
1897
|
|
|
|
|
|
|
} else { |
|
1898
|
119
|
|
|
|
|
311
|
my $base = "ETL::Pipeline::$action"; |
|
1899
|
119
|
50
|
|
|
|
459
|
$class = "${base}::$class" if substr( $class, 0, length( $base ) ) ne $base; |
|
1900
|
|
|
|
|
|
|
} |
|
1901
|
|
|
|
|
|
|
|
|
1902
|
130
|
|
|
|
|
304
|
my %attributes = @arguments; |
|
1903
|
130
|
|
|
|
|
268
|
$attributes{pipeline} = $self; |
|
1904
|
|
|
|
|
|
|
|
|
1905
|
11
|
|
|
11
|
|
5789
|
my $object = eval "use $class; $class->new( \%attributes )"; |
|
|
11
|
|
|
11
|
|
23640
|
|
|
|
11
|
|
|
9
|
|
369
|
|
|
|
11
|
|
|
9
|
|
6513
|
|
|
|
11
|
|
|
8
|
|
4214
|
|
|
|
11
|
|
|
8
|
|
351
|
|
|
|
9
|
|
|
6
|
|
95
|
|
|
|
9
|
|
|
6
|
|
32
|
|
|
|
9
|
|
|
|
|
285
|
|
|
|
9
|
|
|
|
|
87
|
|
|
|
9
|
|
|
|
|
23
|
|
|
|
9
|
|
|
|
|
195
|
|
|
|
8
|
|
|
|
|
105
|
|
|
|
8
|
|
|
|
|
26
|
|
|
|
8
|
|
|
|
|
204
|
|
|
|
8
|
|
|
|
|
77
|
|
|
|
8
|
|
|
|
|
25
|
|
|
|
8
|
|
|
|
|
151
|
|
|
|
6
|
|
|
|
|
79
|
|
|
|
6
|
|
|
|
|
19
|
|
|
|
6
|
|
|
|
|
138
|
|
|
|
6
|
|
|
|
|
63
|
|
|
|
6
|
|
|
|
|
22
|
|
|
|
6
|
|
|
|
|
121
|
|
|
|
130
|
|
|
|
|
11066
|
|
|
1906
|
130
|
50
|
|
|
|
696
|
croak "Error creating $class...\n$@\n" unless defined $object; |
|
1907
|
130
|
|
|
|
|
3317
|
return $object; |
|
1908
|
|
|
|
|
|
|
} |
|
1909
|
|
|
|
|
|
|
|
|
1910
|
|
|
|
|
|
|
|
|
1911
|
|
|
|
|
|
|
=head1 ADVANCED TOPICS |
|
1912
|
|
|
|
|
|
|
|
|
1913
|
|
|
|
|
|
|
=head2 Multiple input sources |
|
1914
|
|
|
|
|
|
|
|
|
1915
|
|
|
|
|
|
|
It is not uncommon to receive your data spread across more than one file. How |
|
1916
|
|
|
|
|
|
|
do you guarantee that each pipeline pulls files from the same working directory |
|
1917
|
|
|
|
|
|
|
(L</work_in>)? You L</chain> the pipelines together. |
|
1918
|
|
|
|
|
|
|
|
|
1919
|
|
|
|
|
|
|
The L</chain> method works like this... |
|
1920
|
|
|
|
|
|
|
|
|
1921
|
|
|
|
|
|
|
ETL::Pipeline->new( { |
|
1922
|
|
|
|
|
|
|
work_in => {search => 'C:\Data', find => qr/Ficticious/}, |
|
1923
|
|
|
|
|
|
|
input => ['Excel', iname => 'main.xlsx' ], |
|
1924
|
|
|
|
|
|
|
mapping => {Name => 'A', Address => 'B', ID => 'C' }, |
|
1925
|
|
|
|
|
|
|
constants => {Type => 1, Information => 'Demographic' }, |
|
1926
|
|
|
|
|
|
|
output => ['SQL', table => 'NewData' ], |
|
1927
|
|
|
|
|
|
|
} )->process->chain( { |
|
1928
|
|
|
|
|
|
|
input => ['Excel', iname => 'notes.xlsx' ], |
|
1929
|
|
|
|
|
|
|
mapping => {User => 'A', Text => 'B', Date => 'C' }, |
|
1930
|
|
|
|
|
|
|
constants => {Type => 2, Information => 'Note' }, |
|
1931
|
|
|
|
|
|
|
output => ['SQL', table => 'OtherData' ], |
|
1932
|
|
|
|
|
|
|
} )->process; |
|
1933
|
|
|
|
|
|
|
|
|
1934
|
|
|
|
|
|
|
When the first pipeline finishes, it creates a new object with the same |
|
1935
|
|
|
|
|
|
|
L</work_in>. The code then calls L</process> on the new object. The second |
|
1936
|
|
|
|
|
|
|
pipeline copies L</work_in> from the first pipeline. |
|
1937
|
|
|
|
|
|
|
|
|
1938
|
|
|
|
|
|
|
=head2 Writing an input source |
|
1939
|
|
|
|
|
|
|
|
|
1940
|
|
|
|
|
|
|
B<ETL::Pipeline> provides some basic, generic input sources. Inevitably, you |
|
1941
|
|
|
|
|
|
|
will come across data that doesn't fit one of these. No problem. |
|
1942
|
|
|
|
|
|
|
B<ETL::Pipeline> lets you create your own input sources. |
|
1943
|
|
|
|
|
|
|
|
|
1944
|
|
|
|
|
|
|
An input source is a L<Moose> class that implements the L<ETL::Pipeline::Input> |
|
1945
|
|
|
|
|
|
|
role. The role requires that you define the L<ETL::Pipeline::Input/run> method. |
|
1946
|
|
|
|
|
|
|
B<ETL::Pipeline> calls that method. Name your class B<ETL::Pipeline::Input::*> |
|
1947
|
|
|
|
|
|
|
and the L</input> method can find it automatically. |
|
1948
|
|
|
|
|
|
|
|
|
1949
|
|
|
|
|
|
|
See L<ETL::Pipeline::Input> for more details. |
|
1950
|
|
|
|
|
|
|
|
|
1951
|
|
|
|
|
|
|
=head2 Writing an output destination |
|
1952
|
|
|
|
|
|
|
|
|
1953
|
|
|
|
|
|
|
B<ETL::Pipeline> does not have any default output destinations. Output |
|
1954
|
|
|
|
|
|
|
destinations are customized. You have something you want done with the data. |
|
1955
|
|
|
|
|
|
|
And that something intimately ties into your specific business. You will have |
|
1956
|
|
|
|
|
|
|
to write at least one output destination to do anything useful. |
|
1957
|
|
|
|
|
|
|
|
|
1958
|
|
|
|
|
|
|
An output destination is a L<Moose> class that implements the |
|
1959
|
|
|
|
|
|
|
L<ETL::Pipeline::Output> role. The role defines required methods. |
|
1960
|
|
|
|
|
|
|
B<ETL::Pipeline> calls those methods. Name your class |
|
1961
|
|
|
|
|
|
|
B<ETL::Pipeline::Output::*> and the L</output> method can find it automatically. |
|
1962
|
|
|
|
|
|
|
|
|
1963
|
|
|
|
|
|
|
See L<ETL::Pipeline::Output> for more details. |
|
1964
|
|
|
|
|
|
|
|
|
1965
|
|
|
|
|
|
|
=head2 Why are the inputs and outputs separate? |
|
1966
|
|
|
|
|
|
|
|
|
1967
|
|
|
|
|
|
|
Wouldn't it make sense to have an input source for Excel and an output |
|
1968
|
|
|
|
|
|
|
destination for Excel? |
|
1969
|
|
|
|
|
|
|
|
|
1970
|
|
|
|
|
|
|
Input sources are generic. It takes the same code to read from one Excel file |
|
1971
|
|
|
|
|
|
|
as another. Output destinations, on the other hand, are customized for your |
|
1972
|
|
|
|
|
|
|
business - with data validation and business logic. |
|
1973
|
|
|
|
|
|
|
|
|
1974
|
|
|
|
|
|
|
B<ETL::Pipeline> assumes that you have multiple input sources. Different |
|
1975
|
|
|
|
|
|
|
feeds use different formats. But output destinations will be much fewer. |
|
1976
|
|
|
|
|
|
|
You're writing data into a centralized place. |
|
1977
|
|
|
|
|
|
|
|
|
1978
|
|
|
|
|
|
|
For these reasons, it makes sense to keep the input sources and output |
|
1979
|
|
|
|
|
|
|
destinations separate. You can easily add more inputs without affecting the |
|
1980
|
|
|
|
|
|
|
outputs. |
|
1981
|
|
|
|
|
|
|
|
|
1982
|
|
|
|
|
|
|
=head2 Custom logging |
|
1983
|
|
|
|
|
|
|
|
|
1984
|
|
|
|
|
|
|
The default L<status> method send updates to STDOUT. If you want to add log |
|
1985
|
|
|
|
|
|
|
files or integrate with a GUI, then subclass B<ETL::Pipeline> and |
|
1986
|
|
|
|
|
|
|
L<override|Moose::Manual::MethodModifiers/OVERRIDE-AND-SUPER> the L</status> |
|
1987
|
|
|
|
|
|
|
method. |
|
1988
|
|
|
|
|
|
|
|
|
1989
|
|
|
|
|
|
|
=head1 SEE ALSO |
|
1990
|
|
|
|
|
|
|
|
|
1991
|
|
|
|
|
|
|
L<ETL::Pipeline::Input>, L<ETL::Pipeline::Output> |
|
1992
|
|
|
|
|
|
|
|
|
1993
|
|
|
|
|
|
|
=head2 Input Source Formats |
|
1994
|
|
|
|
|
|
|
|
|
1995
|
|
|
|
|
|
|
L<ETL::Pipeline::Input::Excel>, L<ETL::Pipeline::Input::DelimitedText>, |
|
1996
|
|
|
|
|
|
|
L<ETL::Pipeline::Input::JsonFiles>, L<ETL::Pipeline::Input::Xml>, |
|
1997
|
|
|
|
|
|
|
L<ETL::Pipeline::Input::XmlFiles> |
|
1998
|
|
|
|
|
|
|
|
|
1999
|
|
|
|
|
|
|
=head1 REPOSITORY |
|
2000
|
|
|
|
|
|
|
|
|
2001
|
|
|
|
|
|
|
L<https://github.com/rbwohlfarth/ETL-Pipeline> |
|
2002
|
|
|
|
|
|
|
|
|
2003
|
|
|
|
|
|
|
=head1 AUTHOR |
|
2004
|
|
|
|
|
|
|
|
|
2005
|
|
|
|
|
|
|
Robert Wohlfarth <robert.j.wohlfarth@vumc.org> |
|
2006
|
|
|
|
|
|
|
|
|
2007
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
|
2008
|
|
|
|
|
|
|
|
|
2009
|
|
|
|
|
|
|
Copyright (c) 2021 Robert Wohlfarth |
|
2010
|
|
|
|
|
|
|
|
|
2011
|
|
|
|
|
|
|
This module is free software; you can redistribute it and/or modify it |
|
2012
|
|
|
|
|
|
|
under the same terms as Perl 5.10.0. For more details, see the full text |
|
2013
|
|
|
|
|
|
|
of the licenses in the directory LICENSES. |
|
2014
|
|
|
|
|
|
|
|
|
2015
|
|
|
|
|
|
|
This program is distributed in the hope that it will be useful, but |
|
2016
|
|
|
|
|
|
|
without any warranty; without even the implied |
|
2017
|
|
|
|
|
|
|
|
|
2018
|
|
|
|
|
|
|
=cut |
|
2019
|
|
|
|
|
|
|
|
|
2020
|
11
|
|
|
11
|
|
134
|
no Moose; |
|
|
11
|
|
|
|
|
66
|
|
|
|
11
|
|
|
|
|
145
|
|
|
2021
|
|
|
|
|
|
|
__PACKAGE__->meta->make_immutable; |