line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Text::CSV::Merge; |
2
|
|
|
|
|
|
|
# ABSTRACT: Merge two CSV files |
3
|
|
|
|
|
|
|
|
4
|
4
|
|
|
4
|
|
162647
|
use Modern::Perl '2010'; |
|
4
|
|
|
|
|
29202
|
|
|
4
|
|
|
|
|
22
|
|
5
|
4
|
|
|
4
|
|
2454
|
use Moo 1.001000; |
|
4
|
|
|
|
|
37516
|
|
|
4
|
|
|
|
|
23
|
|
6
|
4
|
|
|
4
|
|
4283
|
use IO::File; |
|
4
|
|
|
|
|
11
|
|
|
4
|
|
|
|
|
483
|
|
7
|
4
|
|
|
4
|
|
3000
|
use Text::CSV_XS; |
|
4
|
|
|
|
|
30387
|
|
|
4
|
|
|
|
|
229
|
|
8
|
4
|
|
|
4
|
|
4974
|
use DBI; # for DBD::CSV |
|
4
|
|
|
|
|
49575
|
|
|
4
|
|
|
|
|
255
|
|
9
|
4
|
|
|
4
|
|
2263
|
use Log::Dispatch; |
|
4
|
|
|
|
|
691052
|
|
|
4
|
|
|
|
|
133
|
|
10
|
4
|
|
|
4
|
|
1954
|
use autodie; |
|
4
|
|
|
|
|
50307
|
|
|
4
|
|
|
|
|
22
|
|
11
|
4
|
|
|
4
|
|
18628
|
use utf8; |
|
4
|
|
|
|
|
6
|
|
|
4
|
|
|
|
|
32
|
|
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
has +logger => ( |
15
|
|
|
|
|
|
|
is => 'lazy', |
16
|
|
|
|
|
|
|
builder => sub { |
17
|
0
|
|
|
0
|
|
0
|
Log::Dispatch->new( |
18
|
|
|
|
|
|
|
outputs => [ |
19
|
|
|
|
|
|
|
[ 'File', autoflush => 1, min_level => 'debug', filename => 'merge.log', newline => 1, mode => '>>' ], |
20
|
|
|
|
|
|
|
#[ 'Screen', min_level => 'info', newline => 1 ], |
21
|
|
|
|
|
|
|
], |
22
|
|
|
|
|
|
|
); |
23
|
|
|
|
|
|
|
} |
24
|
|
|
|
|
|
|
); |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
has +csv_parser => ( |
27
|
|
|
|
|
|
|
is => 'lazy', |
28
|
|
|
|
|
|
|
builder => sub { |
29
|
0
|
0
|
|
0
|
|
0
|
Text::CSV_XS->new({ binary => 1, eol => $/ }) |
30
|
|
|
|
|
|
|
or die "Cannot use CSV: " . Text::CSV_XS->error_diag(); |
31
|
|
|
|
|
|
|
} |
32
|
|
|
|
|
|
|
); |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
has +dbh => ( |
35
|
|
|
|
|
|
|
is => 'lazy', |
36
|
|
|
|
|
|
|
# must return only a code ref |
37
|
|
|
|
|
|
|
builder => sub { |
38
|
0
|
0
|
|
0
|
|
0
|
DBI->connect("dbi:CSV:", undef, undef, { |
39
|
|
|
|
|
|
|
RaiseError => 1, |
40
|
|
|
|
|
|
|
PrintError => 1, |
41
|
|
|
|
|
|
|
f_ext => ".csv/r", |
42
|
|
|
|
|
|
|
# Better performance with XS |
43
|
|
|
|
|
|
|
csv_class => "Text::CSV_XS", |
44
|
|
|
|
|
|
|
# csv_null => 1, |
45
|
|
|
|
|
|
|
}) or die "Cannot connect: $DBI::errstr"; |
46
|
|
|
|
|
|
|
} |
47
|
|
|
|
|
|
|
); |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
has base_file => ( |
50
|
|
|
|
|
|
|
is => 'rw', |
51
|
|
|
|
|
|
|
required => 1, |
52
|
|
|
|
|
|
|
#allow external names to be different from class attribute |
53
|
|
|
|
|
|
|
init_arg => 'base', |
54
|
|
|
|
|
|
|
#validate it |
55
|
|
|
|
|
|
|
#isa => sub {}, |
56
|
|
|
|
|
|
|
coerce => sub { |
57
|
|
|
|
|
|
|
my $base_fh = IO::File->new( $_[0], '<' ) or die "$_[0]: $!"; |
58
|
|
|
|
|
|
|
$base_fh->binmode(":utf8"); |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
return $base_fh; |
61
|
|
|
|
|
|
|
} |
62
|
|
|
|
|
|
|
); |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
has merge_file => ( |
65
|
|
|
|
|
|
|
# We use only the raw file name/path and do not create a FH here, unlike base_file(). |
66
|
|
|
|
|
|
|
is => 'rw', |
67
|
|
|
|
|
|
|
init_arg => 'merge', |
68
|
|
|
|
|
|
|
required => 1 |
69
|
|
|
|
|
|
|
); |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
has output_file => ( |
72
|
|
|
|
|
|
|
is => 'rw', |
73
|
|
|
|
|
|
|
init_arg => 'output', |
74
|
|
|
|
|
|
|
# an output file name is NOT required |
75
|
|
|
|
|
|
|
required => 0, |
76
|
|
|
|
|
|
|
default => 'merged_output.csv', |
77
|
|
|
|
|
|
|
coerce => sub { |
78
|
|
|
|
|
|
|
my $output_fh = IO::File->new( $_[0], '>' ) or die "$_[0]: $!"; |
79
|
|
|
|
|
|
|
$output_fh->binmode(":utf8"); |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
return $output_fh; |
82
|
|
|
|
|
|
|
} |
83
|
|
|
|
|
|
|
); |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
has columns=> ( |
86
|
|
|
|
|
|
|
is => 'rw', |
87
|
|
|
|
|
|
|
required => 1, |
88
|
|
|
|
|
|
|
); |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
has search_field => ( |
91
|
|
|
|
|
|
|
is => 'rw', |
92
|
|
|
|
|
|
|
required => 1, |
93
|
|
|
|
|
|
|
init_arg => 'search' |
94
|
|
|
|
|
|
|
); |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
has first_row_is_headers => ( |
97
|
|
|
|
|
|
|
is => 'rw', |
98
|
|
|
|
|
|
|
required => 1, |
99
|
|
|
|
|
|
|
#validate it |
100
|
|
|
|
|
|
|
isa => sub { |
101
|
|
|
|
|
|
|
# @TODO: there's got to be a better way to do this! |
102
|
|
|
|
|
|
|
die "Must be 1 or 0" unless $_[0] =~ /'1'|'0'/ || $_[0] == 1 || $_[0] == 0; |
103
|
|
|
|
|
|
|
}, |
104
|
|
|
|
|
|
|
); |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
#=method BUILD |
107
|
|
|
|
|
|
|
#Constructor. |
108
|
|
|
|
|
|
|
#=cut |
109
|
|
|
|
|
|
|
#sub BUILD { |
110
|
|
|
|
|
|
|
# my $self = shift; |
111
|
|
|
|
|
|
|
#} |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
sub merge { |
114
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
115
|
|
|
|
|
|
|
|
116
|
0
|
|
|
|
|
0
|
$self->csv_parser->column_names( $self->columns ); |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
# Loop through the base file to find missing data |
119
|
|
|
|
|
|
|
#@TODO: make into $self->rows? |
120
|
0
|
|
|
|
|
0
|
my @rows; |
121
|
|
|
|
|
|
|
|
122
|
0
|
|
|
|
|
0
|
while ( my $row = $self->csv_parser->getline_hr( $self->base_file ) ) { |
123
|
|
|
|
|
|
|
# skip column names |
124
|
0
|
0
|
0
|
|
|
0
|
next if ($. == 1 and $self->first_row_is_headers); |
125
|
|
|
|
|
|
|
|
126
|
0
|
0
|
|
|
|
0
|
if ( $self->csv_parser->parse($row) ) { |
127
|
|
|
|
|
|
|
# keep a list of null columns in this row |
128
|
0
|
|
|
|
|
0
|
my @nulls; |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
# might be slightly more efficient to use while() |
131
|
0
|
|
|
|
|
0
|
foreach my $key ( keys %{$row} ) { |
|
0
|
|
|
|
|
0
|
|
132
|
|
|
|
|
|
|
# which fields is this row missing? |
133
|
0
|
0
|
0
|
|
|
0
|
if ( $row->{$key} eq 'NULL' or $row->{$key} eq "" ) { |
134
|
0
|
|
|
|
|
0
|
push @nulls, $key; |
135
|
|
|
|
|
|
|
|
136
|
0
|
|
|
|
|
0
|
$self->logger->info("Missing data in column: $key for '$row->{$self->search_field}'"); |
137
|
|
|
|
|
|
|
} |
138
|
|
|
|
|
|
|
} |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
# make a hash of arrays |
141
|
0
|
0
|
|
|
|
0
|
if ( @nulls ) { |
142
|
|
|
|
|
|
|
# search $merge_file for the missing data's row |
143
|
0
|
|
|
|
|
0
|
$" = ','; # reset the list separator for array interpolation to suit SQL |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
# To get the original case for the columns, specify the column |
146
|
|
|
|
|
|
|
# names rather than using SELECT *, since it normalizes to |
147
|
|
|
|
|
|
|
# lowercase, per: |
148
|
|
|
|
|
|
|
# http://stackoverflow.com/questions/3350775/dbdcsv-returns-header-in-lower-case |
149
|
0
|
0
|
|
|
|
0
|
my $sth = $self->dbh->prepare( |
150
|
0
|
|
|
|
|
0
|
"select @{$self->columns} from $self->{merge_file} where $self->{search_field} = ?" |
151
|
|
|
|
|
|
|
) or die "Cannot prepare: " . $self->dbh->errstr (); |
152
|
|
|
|
|
|
|
|
153
|
0
|
|
|
|
|
0
|
$sth->execute($row->{$self->search_field}); |
154
|
|
|
|
|
|
|
|
155
|
0
|
|
|
|
|
0
|
while ( my $filler = $sth->fetchrow_hashref() ) { |
156
|
0
|
|
|
|
|
0
|
foreach my $gap ( @nulls ) { |
157
|
0
|
0
|
0
|
|
|
0
|
if (exists $filler->{$gap} and defined $filler->{$gap} and $filler->{$gap} ne "") { |
|
|
|
0
|
|
|
|
|
158
|
0
|
|
|
|
|
0
|
$self->logger->info( |
159
|
|
|
|
|
|
|
"Found Data: '$gap' = '$filler->{$gap}' for '$row->{$self->search_field}'" |
160
|
|
|
|
|
|
|
); |
161
|
|
|
|
|
|
|
|
162
|
0
|
|
|
|
|
0
|
$row->{$gap} = $filler->{$gap}; |
163
|
|
|
|
|
|
|
} else { |
164
|
0
|
|
|
|
|
0
|
$self->logger->info( |
165
|
|
|
|
|
|
|
"Data not Found for column: '$gap' for '$row->{$self->search_field}' $self->{merge_file}" |
166
|
|
|
|
|
|
|
); |
167
|
|
|
|
|
|
|
} |
168
|
|
|
|
|
|
|
} |
169
|
|
|
|
|
|
|
} |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
# Be efficient and neat! |
172
|
0
|
|
|
|
|
0
|
$sth->finish(); |
173
|
|
|
|
|
|
|
} |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
# insert the updated row as a reference; even if not processed, the |
176
|
|
|
|
|
|
|
# row will still appear in the final output. |
177
|
0
|
|
|
|
|
0
|
push @rows, $row; |
178
|
|
|
|
|
|
|
} else { |
179
|
0
|
|
|
|
|
0
|
my $err = $self->csv_parser->error_input; |
180
|
0
|
|
|
|
|
0
|
$self->logger->error("Failed to parse line: $err"); |
181
|
|
|
|
|
|
|
} |
182
|
|
|
|
|
|
|
} |
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
# Ensure we've processed to the end of the file |
185
|
0
|
0
|
|
|
|
0
|
$self->csv_parser->eof or $self->csv_parser->error_diag(); |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
# print does NOT want an actual array! Use a hash slice, instead: |
188
|
|
|
|
|
|
|
#$self->csv_parser->print($output_fh, [ @$_{@columns} ]) for @rows; |
189
|
|
|
|
|
|
|
# |
190
|
|
|
|
|
|
|
# Or, here I've switched to Text::CSV_XS's specific print_hr(), which |
191
|
|
|
|
|
|
|
# is simply missing from the PP (Pure Perl) version. |
192
|
0
|
|
|
|
|
0
|
$self->csv_parser->print_hr($self->output_file, $_) for @rows; |
193
|
|
|
|
|
|
|
}; |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
sub DEMOLISH { |
196
|
1
|
|
|
1
|
1
|
1867
|
my $self = shift; |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
## Clean up! |
199
|
1
|
|
|
|
|
17
|
$self->base_file->close(); |
200
|
1
|
50
|
|
|
|
45
|
$self->output_file->close() or die "output.csv: $!"; |
201
|
|
|
|
|
|
|
} |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
1; |
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
__END__ |
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
=pod |
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
=head1 NAME |
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
Text::CSV::Merge - Merge two CSV files |
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
=head1 VERSION |
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
version 0.002 |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
=head1 SYNOPSIS |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
my $merger = Text::CSV::Merge->new({ |
221
|
|
|
|
|
|
|
base => 'into.csv', |
222
|
|
|
|
|
|
|
merge => 'from.csv', |
223
|
|
|
|
|
|
|
output => 'output.csv', # optional |
224
|
|
|
|
|
|
|
columns => [q/email name age/], |
225
|
|
|
|
|
|
|
search => 'email', |
226
|
|
|
|
|
|
|
first_row_is_headers => 1 # optional |
227
|
|
|
|
|
|
|
}); |
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
$merger->merge(); |
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
=head1 DESCRIPTION |
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
The use case for this module is when one has two CSV files of largely the same structure. Yet, the 'from.csv' has data which 'into.csv' lacks. |
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
In this initial release, Text::CSV::Merge only fills in empty cells; it does not overwrite data in 'into.csv' which also exists in 'from.csv'. |
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
=head2 Subclassing |
238
|
|
|
|
|
|
|
Text::CSV::Merge may be subclassed. In the subclass, the following attributes may be overridden: |
239
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
=over 4 |
241
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
=item * |
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
C<csv_parser> |
245
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
=item * |
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
C<dbh> |
249
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
=item * |
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
C<logger> |
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
=back |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
=head1 ATTRIBUTES |
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
=head2 logger |
259
|
|
|
|
|
|
|
The logger for all operations in this module. |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
The logger records data gaps in the base CSV file, and records which data from the merge CSV file was used fill the gaps in the base CSV file. |
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
=head2 csv_parser |
264
|
|
|
|
|
|
|
The CSV parser used internally is an immutable class property. |
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
The internal CSV parser is the XS version of Text::CSV: Text::CSV_XS. You may use Text::CSV::PP if you wish, but using any other parser which does not duplicate Text::CSV's API will probably not work without modifying the source of this module. |
267
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
Text::CSV_XS is also used, hard-coded, as the parser for DBD::CSV. This is configurable, however, and may be made configurable by the end-user in a future release. It can be overridden in a subclass. |
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
=head2 dbh |
271
|
|
|
|
|
|
|
Create reusable DBI connection to the CSV data to be merged in to base file. |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
This method is overridable in a subclass. A good use of this would be to merge data into an existing CSV file from a database, or XML file. It must conform to the DBI's API, however. |
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
DBD::CSV is a base requirement for this module. |
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
=head2 base_file |
278
|
|
|
|
|
|
|
The CSV file into which new data will be merged. |
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
The base file is readonly, not read-write. This prevents accidental trashing of the original data. |
281
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
=head2 merge_file |
283
|
|
|
|
|
|
|
The CSV file used to find data to merge into C<base_file>. |
284
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
=head2 output_file |
286
|
|
|
|
|
|
|
The output file into which the merge results are written. |
287
|
|
|
|
|
|
|
|
288
|
|
|
|
|
|
|
I felt it imperative not to alter the original data files. I may make this a configurable option in the future, but wold likely set its default to 'false'. |
289
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
=head2 columns |
291
|
|
|
|
|
|
|
The columns to be merged. |
292
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
A column to be merged must exist in both C<base_file> and C<merge_file>. Other than that requirement, each file may have other columns which do not exist in the other. |
294
|
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
=head2 search_field |
296
|
|
|
|
|
|
|
The column/field to match rows in C<merge_file>. |
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
This column must exist in both files and be identially cased. |
299
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
=head2 first_row_is_headers |
301
|
|
|
|
|
|
|
1 if the CSV files' first row are its headers; 0 if not. |
302
|
|
|
|
|
|
|
|
303
|
|
|
|
|
|
|
If there are no headers, then the column names supplied by the C<columns> argument/property are applied to the columns in each file virtually, in numerical orders as they were passed in the list. |
304
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
=head1 METHODS |
306
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
=head2 merge() |
308
|
|
|
|
|
|
|
Main method and is public. |
309
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
C<merge()> performs the actual merge of the two CSV files. |
311
|
|
|
|
|
|
|
|
312
|
|
|
|
|
|
|
=head2 DEMOLISH() |
313
|
|
|
|
|
|
|
This method locally overrides a Moo built-in. |
314
|
|
|
|
|
|
|
|
315
|
|
|
|
|
|
|
It close out all file handles, which will only occur after a call to C<merge()>. |
316
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
=head1 SEE ALSO |
318
|
|
|
|
|
|
|
|
319
|
|
|
|
|
|
|
=over 4 |
320
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
=item * |
322
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
L<Text::CSV_XS> |
324
|
|
|
|
|
|
|
|
325
|
|
|
|
|
|
|
=back |
326
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
=head1 AUTHOR |
328
|
|
|
|
|
|
|
|
329
|
|
|
|
|
|
|
Michael Gatto <mgatto@lisantra.com> |
330
|
|
|
|
|
|
|
|
331
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
332
|
|
|
|
|
|
|
|
333
|
|
|
|
|
|
|
This software is copyright (c) 2013 by Michael Gatto. |
334
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
This is free software; you can redistribute it and/or modify it under |
336
|
|
|
|
|
|
|
the same terms as the Perl 5 programming language system itself. |
337
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
=cut |