File Coverage

blib/lib/Text/CSV/Hashify.pm

Criterion	Covered	Total	%
statement	111	111	100.0
branch	53	56	94.6
condition	23	23	100.0
subroutine	18	18	100.0
pod	6	7	85.7
total	211	215	98.1

line	stmt	bran	cond	sub	pod	time	code
1							package Text::CSV::Hashify;
2	6			6		4631	use strict;
	6					33
	6					126
3	6			6		105	use 5.8.0;
	6					18
4	6			6		21	use Carp;
	6					9
	6					223
5	6			6		2273	use IO::File;
	6					40959
	6					559
6	6			6		2624	use IO::Zlib;
	6					297826
	6					36
7	6			6		283	use Scalar::Util qw( reftype looks_like_number );
	6					10
	6					254
8	6			6		3427	use Text::CSV;
	6					63433
	6					233
9	6			6		2095	use open qw( :encoding(UTF-8) :std );
	6					5473
	6					43
10
11							BEGIN {
12	6			6		86629	use Exporter ();
	6					12
	6					107
13	6			6		24	use vars qw($VERSION @ISA @EXPORT);
	6					21
	6					334
14	6			6		18	$VERSION = '0.10';
15	6					114	@ISA = qw(Exporter);
16	6					6963	@EXPORT = qw( hashify );
17							}
18
19							=head1 NAME
20
21							Text::CSV::Hashify - Turn a CSV file into a Perl hash
22
23							=head1 VERSION
24
25							This document refers to version 0.10 of Text::CSV::Hashify. This version was
26							released January 21 2018.
27
28							=head1 SYNOPSIS
29
30							# Simple functional interface
31							use Text::CSV::Hashify;
32							$hash_ref = hashify('/path/to/file.csv', 'primary_key');
33
34							# Object-oriented interface
35							use Text::CSV::Hashify;
36							$obj = Text::CSV::Hashify->new( {
37							file => '/path/to/file.csv',
38							format => 'hoh', # hash of hashes, which is default
39							key => 'id', # needed except when format is 'aoh'
40							max_rows => 20, # number of records to read; defaults to all
41							... # other key-value pairs possible for Text::CSV
42							} );
43
44							# all records requested
45							$hash_ref = $obj->all;
46
47							# arrayref of fields input
48							$fields_ref = $obj->fields;
49
50							# hashref of specified record
51							$record_ref = $obj->record('value_of_key');
52
53							# value of one field in one record
54							$datum = $obj->datum('value_of_key', 'field');
55
56							# arrayref of all unique keys seen
57							$keys_ref = $obj->keys;
58
59							=head1 DESCRIPTION
60
61							The Comma-Separated-Value ('CSV') format is the most common way to store
62							spreadsheets or the output of relational database queries in plain-text
63							format. However, since commas (or other designated field-separator
64							characters) may be embedded within data entries, the parsing of delimited
65							records is non-trivial. Fortunately, in Perl this parsing is well handled by
66							CPAN distribution L<Text::CSV>. This
67							permits us to address more specific data manipulation problems by building
68							modules on top of F<Text::CSV>.
69
70							B<Note:> In this document we will use I<CSV> as a catch-all for tab-delimited
71							files, pipe-delimited files, and so forth. Please refer to the documentation
72							for Text::CSV to learn how to handle field separator characters other than the
73							comma.
74
75							F<Text::CSV::Hashify> is designed for the case where you simply want to turn a
76							CSV file into a Perl hash. In particular, it is designed for the case where:
77
78							=over 4
79
80							=item *
81
82							the CSV file's first record is a list of fields in the
83							ancestral database table; and
84
85							=item *
86
87							one field (column) functions as a primary key, I<i.e.,> each record's entry in
88							that field is non-null and is distinct from every other record's entry
89							therein.
90
91							=back
92
93							F<Text::CSV::Hashify> turns that kind of CSV file into one big hash of hashes.
94
95							F<Text::CSV::Hashify> can now take gzip-compressed (F<.gz>) files as input as
96							well as uncompressed files.
97
98							=head2 Primary Case: CSV (with primary key) to Hash of Hashes
99
100							Text::CSV::Hashify is designed for the case where you simply want to turn a
101							CSV file into a Perl hash. In particular, it is designed for the case where
102							(a) the CSV file's first record is a list of fields in the ancestral database
103							table and (b) one field (column) functions as a B<primary key>, I<i.e.,> each
104							record's entry in that field is non-null and is distinct from every other
105							record's entry therein.
106
107							Text::CSV::Hashify turns that kind of CSV file into one big hash of hashes.
108							Elements of this hash are keyed on the entries in the designated primary key
109							field and the value for each element is a hash reference of all the data in a
110							particular database record (including the primary key field and its value).
111
112							=head2 Secondary Case: CSV (lacking primary key) to Array of Hashes
113
114							You may, however, encounter cases where a CSV file's header row contains the
115							list of database fields but no field is capable of serving as a primary key,
116							I<i.e.,> there is no field in which the entry for that field in any record is
117							guaranteed to be distinct from the entries in that field for all other
118							records.
119
120							In this case, while an individual record can be turned into a hash,
121							the CSV file as a whole cannot accurately be turned into a hash of hashes. As
122							a fallback, Text::CSV::Hashify can, upon request, turn this into an array of
123							hashes. In this case, you will not be able to look up a particular record by
124							its primary key. You will instead have to know its index position within the
125							array (which is equivalent to knowing its record number in the original CSV
126							file minus C<1>).
127
128							=head2 Interfaces
129
130							Text::CSV::Hashify provides two interfaces: one functional, one
131							object-oriented.
132
133							Use the functional interface when all you want is to turn a CSV file with a
134							primary key field into a hash of hashes.
135
136							Use the object-oriented interface for any more sophisticated manipulation of
137							the CSV file. This includes:
138
139							=over 4
140
141							=item * Text::CSV options
142
143							Access to any of the options available to Text::CSV, such as use of a
144							separator character other than a comma.
145
146							=item * Limit number of records
147
148							Selection of a limited number of records from the CSV file, rather than
149							slurping the whole file into your in-memory hash.
150
151							=item * Array of hash references format
152
153							Probably better than the default hash of hash references format when the CSV
154							file has no field able to serve as a primary key.
155
156							=item * Metadata
157
158							Access to the list of fields, the list of all primary key values, the values
159							in an individual record, or the value of an individual field in an individual
160							record.
161
162							=back
163
164							B<Note:> On the recommendation of the authors/maintainers of Text::CSV,
165							Text::CSV::Hashify will internally always set Text::CSV's C<< binary => 1 >>
166							option.
167
168							=head1 FUNCTIONAL INTERFACE
169
170							Text::CSV::Hashify by default exports one function: C<hashify()>.
171
172							$hash_ref = hashify('/path/to/file.csv', 'primary_key');
173
174							or
175
176							$hash_ref = hashify('/path/to/file.csv.gz', 'primary_key');
177
178							Function takes two arguments: path to CSV file; field in that file which
179							serves as primary key. If the path to the input file ends in F<.gz>, it is
180							assumed to be compressed by F<gzip>. If the file name ends in F<.psv> (or
181							F<.psv.gz>), the separator character is assumed to be a pipe (C<|>). If the
182							file name ends in F<.tsv> (or F<.tsv.gz>), the separator character is assumed
183							to be a tab (C<\t>). Otherwise, the separator character will be assumed to be
184							a comma (C<,>).
185
186							Returns a reference to a hash of hash references.
187
188							=cut
189
190							sub hashify {
191	10	100		10	0	20228	croak "'hashify()' must have two arguments"
192							unless @_ == 2;
193	9					21	my @args = @_;
194	9					29	for (my $i=0;$i<=$#args;$i++) {
195	18	100				140	croak "'hashify()' argument at index '$i' not true" unless $args[$i];
196							}
197	8					29	my %obj_args = (
198							file => $args[0],
199							key => $args[1],
200							);
201							$obj_args{sep_char} =
202							($obj_args{file} =~ m/\.psv(\.gz)?$/)
203							? '\|'
204	8	100				59	: ($obj_args{file} =~ m/\.tsv(\.gz)?$/)
		100
205							? "\t"
206							: ',';
207	8					29	my $obj = Text::CSV::Hashify->new( \%obj_args );
208	8					25	return $obj->all();
209							}
210
211							=head1 OBJECT-ORIENTED INTERFACE
212
213							=head2 C<new()>
214
215							=over 4
216
217							=item * Purpose
218
219							Text::CSV::Hashify constructor.
220
221							=item * Arguments
222
223							$obj = Text::CSV::Hashify->new( {
224							file => '/path/to/file.csv',
225							format => 'hoh', # hash of hashes, which is default
226							key => 'id', # needed except when format is 'aoh'
227							max_rows => 20, # number of records to read; defaults to all
228							... # other key-value pairs possible for Text::CSV
229							} );
230
231							Single hash reference. Required element is:
232
233							=over 4
234
235							=item * C<file>
236
237							String: path to CSV file serving as input. If the path to the input file ends
238							in F<.gz>, it is assumed to be compressed by F<gzip>.
239
240							=back
241
242							Element usually needed:
243
244							=over 4
245
246							=item * C<key>
247
248							String: name of field in CSV file serving as unique key. Needed except when
249							optional element C<format> is C<aoh>.
250
251							=back
252
253							Optional elements are:
254
255							=over 4
256
257							=item * C<format>
258
259							String: possible values are C<hoh> and C<aoh>. Defaults to C<hoh> (hash of
260							hashes). C<new()> will fail if the same value is encountered in more than one
261							record's entry in the C<key> column. So if you know in advance that your data
262							cannot meet this condition, explicitly select C<< format => 'aoh' >>.
263
264							=item * C<max_rows>
265
266							Number: provide this if you do not wish to populate the hash with all data
267							records from the CSV file. (Will have no effect if the number provided is
268							greater than or equal to the number of data records in the CSV file.)
269
270							=item * Any option available to Text::CSV
271
272							See documentation for either Text::CSV or Text::CSV_XS.
273
274							=back
275
276							=item * Return Value
277
278							Text::CSV::Hashify object.
279
280							=item * Comment
281
282							=back
283
284							=cut
285
286							sub new {
287	34			34	1	22614	my ($class, $args) = @_;
288	34					50	my %data;
289
290	34	100	100			507	croak "Argument to 'new()' must be hashref"
291							unless (ref($args) and reftype($args) eq 'HASH');
292	32	100				311	croak "Argument to 'new()' must have 'file' element" unless $args->{file};
293							croak "Cannot locate file '$args->{file}'"
294	31	100				642	unless (-f $args->{file});
295	30					112	$data{file} = delete $args->{file};
296
297	30	100	100			111	if ($args->{format} and ($args->{format} !~ m/^(?:h\|a)oh$/i) ) {
298	1					86	croak "Entry '$args->{format}' for format is invalid'";
299							}
300	29		100			106	$data{format} = delete $args->{format} \|\| 'hoh';
301
302	29	100	100			86	if (! exists $args->{key} and $data{format} ne 'aoh') {
303	1					80	croak "Argument to 'new()' must have 'key' element unless 'format' element is 'aoh'";
304							}
305	28					53	$data{key} = delete $args->{key};
306
307	28	100				60	if (defined($args->{max_rows})) {
308	8	100				100	if ($args->{max_rows} !~ m/^[0-9]+$/) {
309	3					243	croak "'max_rows' option, if defined, must be numeric";
310							}
311							else {
312	5					14	$data{max_rows} = delete $args->{max_rows};
313							}
314							}
315							# We've now handled all the Text::CSV::Hashify::new-specific options.
316							# Any remaining options are assumed to be intended for Text::CSV::new().
317
318	25					51	$args->{binary} = 1;
319	25	50				136	my $csv = Text::CSV->new ( $args )
320							or croak "Cannot use CSV: ".Text::CSV->error_diag ();
321	25					2836	my $IN;
322	25	100				85	if ($data{file} =~ m/\.gz$/) {
323	8					36	$IN = IO::Zlib->new($data{file}, "rb");
324							}
325							else {
326	17					77	$IN = IO::File->new($data{file}, "r");
327							}
328	25	50				24066	croak "Unable to open '$data{file}' for reading"
329							unless defined $IN;
330	25					696	my $header_ref = $csv->getline($IN);
331	25					1965	my %header_fields_seen;
332	25					41	for (@{$header_ref}) {
	25					59
333	514	100				637	if (exists $header_fields_seen{$_}) {
334	1					100	croak "Duplicate field '$_' observed in '$data{file}'";
335							}
336							else {
337	513					725	$header_fields_seen{$_}++;
338							}
339							}
340	24					51	$data{fields} = $header_ref;
341	24					33	$csv->column_names(@{$header_ref});
	24					109
342
343							# 'hoh format
344	24					1763	my %keys_seen;
345	24					52	my @keys_list = ();
346	24					42	my %parsed_data;
347							# 'aoh' format
348							my @parsed_data;
349
350	24					87	PARSE_FILE: while (my $record = $csv->getline_hr($IN)) {
351	4239	100				620747	if ($data{format} eq 'hoh') {
352	4229					7523	my $kk = $record->{$data{key}};
353	4229	100				7058	if ($keys_seen{$kk}) {
354	1					87	croak "Key '$kk' already seen";
355							}
356							else {
357	4228					7191	$keys_seen{$kk}++;
358	4228					5871	push @keys_list, $kk;
359	4228					6830	$parsed_data{$kk} = $record;
360							last PARSE_FILE if (
361							defined $data{max_rows} and
362							scalar(keys %parsed_data) == $data{max_rows}
363	4228	100	100			13439	);
364							}
365							}
366							else { # format: 'aoh'
367	10					16	push @parsed_data, $record;
368							last PARSE_FILE if (
369							defined $data{max_rows} and
370							scalar(@parsed_data) == $data{max_rows}
371	10	100	100			53	);
372							}
373							}
374	23	50				1750	$IN->close or croak "Unable to close $data{file} after reading";
375	23	100				1579	$data{all} = ($data{format} eq 'aoh') ? \@parsed_data : \%parsed_data;
376	23	100				84	$data{keys} = \@keys_list if $data{format} eq 'hoh';
377	23					58	$data{csv} = $csv;
378	23					37	while (my ($k,$v) = each %{$args}) {
	60					164
379	37					94	$data{$k} = $v;
380							}
381	23					12791	return bless \%data, $class;
382							}
383
384							=head2 C<all()>
385
386							=over 4
387
388							=item * Purpose
389
390							Get a representation of all data found in a CSV input file.
391
392							=item * Arguments
393
394							$hash_ref = $obj->all; # when format is default or 'hoh'
395							$array_ref = $obj->all; # when format is 'aoh'
396
397							=item * Return Value
398
399							Reference representing all data records in the CSV input file. In the default
400							case, or if you have specifically requested C<< format => 'hoh' >>, the return
401							value is a hash reference. When you have requested C<< format => 'aoh' >>, the
402							return value is an array reference.
403
404							=item * Comment
405
406							In the default (C<hoh>) case, the return value is equivalent to that of
407							C<hashify()>.
408
409							=back
410
411							=cut
412
413							sub all {
414	12			12	1	2172	my ($self) = @_;
415	12					313	return $self->{all};
416							}
417
418							=head2 C<fields()>
419
420							=over 4
421
422							=item * Purpose
423
424							Get a list of the fields in the CSV source.
425
426							=item * Arguments
427
428							$fields_ref = $obj->fields;
429
430							=item * Return Value
431
432							Array reference.
433
434							=item * Comment
435
436							If any field names are duplicate, you will not get this far, as C<new()> would
437							have died.
438
439							=back
440
441							=cut
442
443							sub fields {
444	3			3	1	1343	my ($self) = @_;
445	3					6	return $self->{fields};
446							}
447
448							=head2 C<record()>
449
450							=over 4
451
452							=item * Purpose
453
454							Get a hash representing one record in the CSV input file.
455
456							=item * Arguments
457
458							$record_ref = $obj->record('value_of_key');
459
460							One argument. In the default case (C<< format => 'hoh' >>), this argument is the value in the record in the column serving as unique key.
461
462							In the C<< format => 'aoh' >> case, this will be index position of the data record
463							in the array. (The header row will be at index C<0>.)
464
465							=item * Return Value
466
467							Hash reference.
468
469							=back
470
471							=cut
472
473							sub record {
474	15			15	1	8358	my ($self, $key) = @_;
475	15	100	100			664	croak "Argument to 'record()' either not defined or non-empty"
476							unless (defined $key and $key ne '');
477							($self->{format} eq 'aoh')
478							? return $self->{all}->[$key]
479	9	100				33	: return $self->{all}->{$key};
480							}
481
482							=head2 C<datum()>
483
484							=over 4
485
486							=item * Purpose
487
488							Get value of one field in one record.
489
490							=item * Arguments
491
492							$datum = $obj->datum('value_of_key', 'field');
493
494							List of two arguments: the value in the record in the column serving as unique
495							key; the name of the field.
496
497							=item * Return Value
498
499							Scalar.
500
501							=back
502
503							=cut
504
505							sub datum {
506	14			14	1	6672	my ($self, @args) = @_;
507	14	100				231	croak "'datum()' needs two arguments" unless @args == 2;
508	11					32	for (my $i=0;$i<=$#args;$i++) {
509	19	100	100			465	croak "Argument to 'datum()' at index '$i' either not defined or non-empty"
510							unless ((defined($args[$i])) and ($args[$i] ne ''));
511							}
512							($self->{format} eq 'aoh')
513							? return $self->{all}->[$args[0]]->{$args[1]}
514	5	100				27	: return $self->{all}->{$args[0]}->{$args[1]};
515							}
516
517							=head2 C<keys()>
518
519							=over 4
520
521							=item * Purpose
522
523							Get a list of all unique keys found in the input file.
524
525							=item * Arguments
526
527							$keys_ref = $obj->keys;
528
529							=item * Return Value
530
531							Array reference.
532
533							=item * Comment
534
535							If you have selected C<< format => 'aoh' >> in the options to C<new()>, the
536							C<keys()> method is inappropriate and will cause your program to die.
537
538							=back
539
540							=cut
541
542							sub keys {
543	4			4	1	2555	my ($self) = @_;
544	4	100				16	if (exists $self->{keys}) {
545	3					9	return $self->{keys};
546							}
547							else {
548	1					68	croak "'keys()' method not appropriate when 'format' is 'aoh'";
549							}
550							}
551
552							=head1 AUTHOR
553
554							James E Keenan
555							CPAN ID: jkeenan
556							jkeenan@cpan.org
557							http://thenceforward.net/perl/modules/Text-CSV-Hashify
558
559							=head1 COPYRIGHT
560
561							This program is free software; you can redistribute
562							it and/or modify it under the same terms as Perl itself.
563
564							The full text of the license can be found in the
565							LICENSE file included with this module.
566
567							Copyright 2012-2018, James E Keenan. All rights reserved.
568
569							=head1 BUGS
570
571							There are no bug reports outstanding on Text::CSV::Hashify as of the most recent
572							CPAN upload date of this distribution.
573
574							=head1 SUPPORT
575
576							To report any bugs or make any feature requests, please send mail to
577							C<bug-Text-CSV-Hashify@rt.cpan.org> or use the web interface at
578							L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Text-CSV-Hashify>.
579
580							=head1 ACKNOWLEDGEMENTS
581
582							Thanks to Christine Shieh for serving as the alpha consumer of this
583							library's output.
584
585							=head1 OTHER CPAN DISTRIBUTIONS
586
587							=head2 Text-CSV and Text-CSV_XS
588
589							These distributions underlie Text-CSV-Hashify and provide all of its
590							file-parsing functionality. Where possible, install both. That will enable
591							you to process a file with a single, shared interface but have access to the
592							faster processing speeds of XS where available.
593
594							=head2 Text-CSV-Slurp
595
596							Like Text-CSV-Hashify, Text-CSV-Slurp slurps an entire CSV file into memory,
597							but stores it as an array of hashes instead.
598
599							=head2 Text-CSV-Auto
600
601							This distribution inspired the C<max_rows> option to C<new()>.
602
603							=cut
604
605							1;
606