File Coverage

blib/lib/Text/CSV/Merge.pm

Criterion	Covered	Total	%
statement	27	59	45.7
branch	1	20	5.0
condition	0	12	0.0
subroutine	9	13	69.2
pod	2	2	100.0
total	39	106	36.7

line	stmt	bran	cond	sub	pod	time	code
1							package Text::CSV::Merge;
2							# ABSTRACT: Merge two CSV files
3
4	4			4		162647	use Modern::Perl '2010';
	4					29202
	4					22
5	4			4		2454	use Moo 1.001000;
	4					37516
	4					23
6	4			4		4283	use IO::File;
	4					11
	4					483
7	4			4		3000	use Text::CSV_XS;
	4					30387
	4					229
8	4			4		4974	use DBI; # for DBD::CSV
	4					49575
	4					255
9	4			4		2263	use Log::Dispatch;
	4					691052
	4					133
10	4			4		1954	use autodie;
	4					50307
	4					22
11	4			4		18628	use utf8;
	4					6
	4					32
12
13
14							has +logger => (
15							is => 'lazy',
16							builder => sub {
17	0			0		0	Log::Dispatch->new(
18							outputs => [
19							[ 'File', autoflush => 1, min_level => 'debug', filename => 'merge.log', newline => 1, mode => '>>' ],
20							#[ 'Screen', min_level => 'info', newline => 1 ],
21							],
22							);
23							}
24							);
25
26							has +csv_parser => (
27							is => 'lazy',
28							builder => sub {
29	0	0		0		0	Text::CSV_XS->new({ binary => 1, eol => $/ })
30							or die "Cannot use CSV: " . Text::CSV_XS->error_diag();
31							}
32							);
33
34							has +dbh => (
35							is => 'lazy',
36							# must return only a code ref
37							builder => sub {
38	0	0		0		0	DBI->connect("dbi:CSV:", undef, undef, {
39							RaiseError => 1,
40							PrintError => 1,
41							f_ext => ".csv/r",
42							# Better performance with XS
43							csv_class => "Text::CSV_XS",
44							# csv_null => 1,
45							}) or die "Cannot connect: $DBI::errstr";
46							}
47							);
48
49							has base_file => (
50							is => 'rw',
51							required => 1,
52							#allow external names to be different from class attribute
53							init_arg => 'base',
54							#validate it
55							#isa => sub {},
56							coerce => sub {
57							my $base_fh = IO::File->new( $_[0], '<' ) or die "$_[0]: $!";
58							$base_fh->binmode(":utf8");
59
60							return $base_fh;
61							}
62							);
63
64							has merge_file => (
65							# We use only the raw file name/path and do not create a FH here, unlike base_file().
66							is => 'rw',
67							init_arg => 'merge',
68							required => 1
69							);
70
71							has output_file => (
72							is => 'rw',
73							init_arg => 'output',
74							# an output file name is NOT required
75							required => 0,
76							default => 'merged_output.csv',
77							coerce => sub {
78							my $output_fh = IO::File->new( $_[0], '>' ) or die "$_[0]: $!";
79							$output_fh->binmode(":utf8");
80
81							return $output_fh;
82							}
83							);
84
85							has columns=> (
86							is => 'rw',
87							required => 1,
88							);
89
90							has search_field => (
91							is => 'rw',
92							required => 1,
93							init_arg => 'search'
94							);
95
96							has first_row_is_headers => (
97							is => 'rw',
98							required => 1,
99							#validate it
100							isa => sub {
101							# @TODO: there's got to be a better way to do this!
102							die "Must be 1 or 0" unless $_[0] =~ /'1'\|'0'/ \|\| $_[0] == 1 \|\| $_[0] == 0;
103							},
104							);
105
106							#=method BUILD
107							#Constructor.
108							#=cut
109							#sub BUILD {
110							# my $self = shift;
111							#}
112
113							sub merge {
114	0			0	1	0	my $self = shift;
115
116	0					0	$self->csv_parser->column_names( $self->columns );
117
118							# Loop through the base file to find missing data
119							#@TODO: make into $self->rows?
120	0					0	my @rows;
121
122	0					0	while ( my $row = $self->csv_parser->getline_hr( $self->base_file ) ) {
123							# skip column names
124	0	0	0			0	next if ($. == 1 and $self->first_row_is_headers);
125
126	0	0				0	if ( $self->csv_parser->parse($row) ) {
127							# keep a list of null columns in this row
128	0					0	my @nulls;
129
130							# might be slightly more efficient to use while()
131	0					0	foreach my $key ( keys %{$row} ) {
	0					0
132							# which fields is this row missing?
133	0	0	0			0	if ( $row->{$key} eq 'NULL' or $row->{$key} eq "" ) {
134	0					0	push @nulls, $key;
135
136	0					0	$self->logger->info("Missing data in column: $key for '$row->{$self->search_field}'");
137							}
138							}
139
140							# make a hash of arrays
141	0	0				0	if ( @nulls ) {
142							# search $merge_file for the missing data's row
143	0					0	$" = ','; # reset the list separator for array interpolation to suit SQL
144
145							# To get the original case for the columns, specify the column
146							# names rather than using SELECT *, since it normalizes to
147							# lowercase, per:
148							# http://stackoverflow.com/questions/3350775/dbdcsv-returns-header-in-lower-case
149	0	0				0	my $sth = $self->dbh->prepare(
150	0					0	"select @{$self->columns} from $self->{merge_file} where $self->{search_field} = ?"
151							) or die "Cannot prepare: " . $self->dbh->errstr ();
152
153	0					0	$sth->execute($row->{$self->search_field});
154
155	0					0	while ( my $filler = $sth->fetchrow_hashref() ) {
156	0					0	foreach my $gap ( @nulls ) {
157	0	0	0			0	if (exists $filler->{$gap} and defined $filler->{$gap} and $filler->{$gap} ne "") {
			0
158	0					0	$self->logger->info(
159							"Found Data: '$gap' = '$filler->{$gap}' for '$row->{$self->search_field}'"
160							);
161
162	0					0	$row->{$gap} = $filler->{$gap};
163							} else {
164	0					0	$self->logger->info(
165							"Data not Found for column: '$gap' for '$row->{$self->search_field}' $self->{merge_file}"
166							);
167							}
168							}
169							}
170
171							# Be efficient and neat!
172	0					0	$sth->finish();
173							}
174
175							# insert the updated row as a reference; even if not processed, the
176							# row will still appear in the final output.
177	0					0	push @rows, $row;
178							} else {
179	0					0	my $err = $self->csv_parser->error_input;
180	0					0	$self->logger->error("Failed to parse line: $err");
181							}
182							}
183
184							# Ensure we've processed to the end of the file
185	0	0				0	$self->csv_parser->eof or $self->csv_parser->error_diag();
186
187							# print does NOT want an actual array! Use a hash slice, instead:
188							#$self->csv_parser->print($output_fh, [ @$_{@columns} ]) for @rows;
189							#
190							# Or, here I've switched to Text::CSV_XS's specific print_hr(), which
191							# is simply missing from the PP (Pure Perl) version.
192	0					0	$self->csv_parser->print_hr($self->output_file, $_) for @rows;
193							};
194
195							sub DEMOLISH {
196	1			1	1	1867	my $self = shift;
197
198							## Clean up!
199	1					17	$self->base_file->close();
200	1	50				45	$self->output_file->close() or die "output.csv: $!";
201							}
202
203
204							1;
205
206							__END__
207
208							=pod
209
210							=head1 NAME
211
212							Text::CSV::Merge - Merge two CSV files
213
214							=head1 VERSION
215
216							version 0.002
217
218							=head1 SYNOPSIS
219
220							my $merger = Text::CSV::Merge->new({
221							base => 'into.csv',
222							merge => 'from.csv',
223							output => 'output.csv', # optional
224							columns => [q/email name age/],
225							search => 'email',
226							first_row_is_headers => 1 # optional
227							});
228
229							$merger->merge();
230
231							=head1 DESCRIPTION
232
233							The use case for this module is when one has two CSV files of largely the same structure. Yet, the 'from.csv' has data which 'into.csv' lacks.
234
235							In this initial release, Text::CSV::Merge only fills in empty cells; it does not overwrite data in 'into.csv' which also exists in 'from.csv'.
236
237							=head2 Subclassing
238							Text::CSV::Merge may be subclassed. In the subclass, the following attributes may be overridden:
239
240							=over 4
241
242							=item *
243
244							C<csv_parser>
245
246							=item *
247
248							C<dbh>
249
250							=item *
251
252							C<logger>
253
254							=back
255
256							=head1 ATTRIBUTES
257
258							=head2 logger
259							The logger for all operations in this module.
260
261							The logger records data gaps in the base CSV file, and records which data from the merge CSV file was used fill the gaps in the base CSV file.
262
263							=head2 csv_parser
264							The CSV parser used internally is an immutable class property.
265
266							The internal CSV parser is the XS version of Text::CSV: Text::CSV_XS. You may use Text::CSV::PP if you wish, but using any other parser which does not duplicate Text::CSV's API will probably not work without modifying the source of this module.
267
268							Text::CSV_XS is also used, hard-coded, as the parser for DBD::CSV. This is configurable, however, and may be made configurable by the end-user in a future release. It can be overridden in a subclass.
269
270							=head2 dbh
271							Create reusable DBI connection to the CSV data to be merged in to base file.
272
273							This method is overridable in a subclass. A good use of this would be to merge data into an existing CSV file from a database, or XML file. It must conform to the DBI's API, however.
274
275							DBD::CSV is a base requirement for this module.
276
277							=head2 base_file
278							The CSV file into which new data will be merged.
279
280							The base file is readonly, not read-write. This prevents accidental trashing of the original data.
281
282							=head2 merge_file
283							The CSV file used to find data to merge into C<base_file>.
284
285							=head2 output_file
286							The output file into which the merge results are written.
287
288							I felt it imperative not to alter the original data files. I may make this a configurable option in the future, but wold likely set its default to 'false'.
289
290							=head2 columns
291							The columns to be merged.
292
293							A column to be merged must exist in both C<base_file> and C<merge_file>. Other than that requirement, each file may have other columns which do not exist in the other.
294
295							=head2 search_field
296							The column/field to match rows in C<merge_file>.
297
298							This column must exist in both files and be identially cased.
299
300							=head2 first_row_is_headers
301							1 if the CSV files' first row are its headers; 0 if not.
302
303							If there are no headers, then the column names supplied by the C<columns> argument/property are applied to the columns in each file virtually, in numerical orders as they were passed in the list.
304
305							=head1 METHODS
306
307							=head2 merge()
308							Main method and is public.
309
310							C<merge()> performs the actual merge of the two CSV files.
311
312							=head2 DEMOLISH()
313							This method locally overrides a Moo built-in.
314
315							It close out all file handles, which will only occur after a call to C<merge()>.
316
317							=head1 SEE ALSO
318
319							=over 4
320
321							=item *
322
323							L<Text::CSV_XS>
324
325							=back
326
327							=head1 AUTHOR
328
329							Michael Gatto <mgatto@lisantra.com>
330
331							=head1 COPYRIGHT AND LICENSE
332
333							This software is copyright (c) 2013 by Michael Gatto.
334
335							This is free software; you can redistribute it and/or modify it under
336							the same terms as the Perl 5 programming language system itself.
337
338							=cut