File Coverage

lib/Parse/File/Taxonomy.pm

Criterion	Covered	Total	%
statement	172	172	100.0
branch	58	62	93.5
condition	11	11	100.0
subroutine	17	17	100.0
pod	12	13	92.3
total	270	275	98.1

line	stmt	bran	cond	sub	pod	time	code
1							package Parse::File::Taxonomy;
2	4			4		12497	use strict;
	4					9
	4					171
3	4			4		24	use Carp;
	4					7
	4					336
4	4			4		3000	use Text::CSV;
	4					98487
	4					28
5	4			4		200	use Scalar::Util qw( reftype );
	4					7
	4					15617
6							our $VERSION = '0.02';
7							#use Data::Dump;
8
9							=head1 NAME
10
11							Parse::File::Taxonomy - Validate a file for use as a taxonomy
12
13							=head1 SYNOPSIS
14
15							use Parse::File::Taxonomy;
16
17							$source = "./t/data/alpha.csv";
18							$obj = Parse::File::Taxonomy->new( {
19							file => $source,
20							} );
21
22							$hashified_taxonomy = $obj->hashify_taxonomy();
23
24							=head1 DESCRIPTION
25
26							This module takes as input a plain-text file, verifies that it can be used as
27							a taxonomy, then creates a Perl data structure representing that taxonomy.
28
29							B
30
31							=head2 Taxonomy: definition
32
33							For the purpose of this module, a B is defined as a tree-like data
34							structure with a root node, zero or more branch (child) nodes, and one or more
35							leaf nodes. The root node and each branch node must have at least one child
36							node, but leaf nodes have no child nodes. The number of branches
37							between a leaf node and the root node is variable.
38
39							B
40
41							Root
42							\|
43							----------------------------------------------------
44							\| \| \| \|
45							Branch Branch Branch Leaf
46							\| \| \|
47							------------------------- ------------ \|
48							\| \| \| \| \|
49							Branch Branch Leaf Leaf Branch
50							\| \| \|
51							\| ------------ \|
52							\| \| \| \|
53							Leaf Leaf Leaf Leaf
54
55							=head2 Taxonomy File: definition
56
57							For the purpose of this module, a B is (a) a CSV file in which
58							one column holds data on the position of each record within the taxonomy and
59							(b) in which each node in the tree other than the root node is uniquely
60							represented by a record within the file.
61
62							=head3 CSV
63
64							B<"CSV">, strictly speaking, refers to B:
65
66							path,nationality,gender,age,income,id_no
67
68							For the purpose of this module, however, column separators in a taxonomy file
69							may be any user-specified character handled by the F library.
70							Formats frequently observed are B:
71
72							path nationality gender age income id_no
73
74							and B:
75
76							path\|nationality\|gender\|age\|income\|id_no
77
78							The documentation for F comments that the CSV format could
79							perhaps better [be] called ASV (anything separated values)">, but we shall for
80							convenience use "CSV" herein regardless of the specific delimiter.
81
82							Since it is often the case that the characters used as column separators may
83							occur within the data recorded in the columns as well, it is customary to
84							quote either all columns:
85
86							"path","nationality","gender","age","income","id_no"
87
88							or, at the very least, all columns which can hold
89							data other than pure integers or floating-point numbers:
90
91							"path","nationality","gender",age,income,id_no
92
93							=head3 Tree structure
94
95							To qualify as a taxonomy file, it is not sufficient for a file to be in CSV
96							format. In each non-header record in that file, one column must hold data
97							capable of exactly specifying the record's position in the taxonomy, I
98							the route or B from the root node to the node being represented by that
99							record. That data must itself be in delimiter-separated format. Each
100							non-root node in the taxonomy must have exactly one record holding its path
101							data. Within that path column the value corresponding to the root node need
102							not be specified, I may be represented by an empty string.
103
104							Let's rewrite Diagram 1 with values to make this clear.
105
106							B
107
108							""
109							\|
110							----------------------------------------------------
111							\| \| \| \|
112							Alpha Beta Gamma Delta
113							\| \| \|
114							------------------------- ------------ \|
115							\| \| \| \| \|
116							Epsilon Zeta Eta Theta Iota
117							\| \| \|
118							\| ------------ \|
119							\| \| \| \|
120							Kappa Lambda Mu Nu
121
122							Let us suppose that our taxonomy file held comma-separated, quoted records.
123							Let us further supposed that the column holding taxonomy paths was, not
124							surprisingly, called C and that the separator within the C column
125							was a pipe (C<\|>) character. Let us further suppose that for now we are not
126							concerned with the data in any columns other than C so that, for purpose
127							of illustration, they will hold empty (albeit quoted) string.
128
129							Then the taxonomy file describing the tree in
130							Diagram 2 would look like this:
131
132							"path","nationality","gender","age","income","id_no"
133							"\|Alpha","","","","",""
134							"\|Alpha\|Epsilon","","","","",""
135							"\|Alpha\|Epsilon\|Kappa","","","","",""
136							"\|Alpha\|Zeta","","","","",""
137							"\|Alpha\|Zeta\|Lambda","","","","",""
138							"\|Alpha\|Zeta\|Mu","","","","",""
139							"\|Beta","","","","",""
140							"\|Beta\|Eta","","","","",""
141							"\|Beta\|Theta","","","","",""
142							"\|Gamma","","","","",""
143							"\|Gamma\|Iota","","","","",""
144							"\|Gamma\|Iota\|Nu","","","","",""
145							"\|Delta","","","","",""
146
147							Note that while in the C<\|Gamma> branch we ultimately have only one leaf node,
148							C<\|Gamma\|Iota\|Nu>, we require separate records in the taxonomy file for
149							C<\|Gamma> and C<\|Gamma\|Iota>. To put this another way, the existence of a
150							C leaf must not be taken to "auto-vivify" C<\|Gamma> and
151							C<\|Gamma\|Iota> nodes. Each non-root node must be explicitly represented in
152							the taxonomy file for the file to be considered valid.
153
154							Note further that there is no restriction on the values of the B of
155							the C across records. It only the B path that must be unique.
156							Let us illustrate that by modifying the data in Diagram 2:
157
158							B
159
160							""
161							\|
162							----------------------------------------------------
163							\| \| \| \|
164							Alpha Beta Gamma Delta
165							\| \| \|
166							------------------------- ------------ \|
167							\| \| \| \| \|
168							Epsilon Zeta Eta Theta Iota
169							\| \| \|
170							\| ------------ \|
171							\| \| \| \|
172							Kappa Lambda Mu Delta
173
174							Here we have two leaf nodes each named C. However, we follow different
175							paths from the root node to get to each of them. The taxonomy file
176							representing this tree would look like this:
177
178							"path","nationality","gender","age","income","id_no"
179							"\|Alpha","","","","",""
180							"\|Alpha\|Epsilon","","","","",""
181							"\|Alpha\|Epsilon\|Kappa","","","","",""
182							"\|Alpha\|Zeta","","","","",""
183							"\|Alpha\|Zeta\|Lambda","","","","",""
184							"\|Alpha\|Zeta\|Mu","","","","",""
185							"\|Beta","","","","",""
186							"\|Beta\|Eta","","","","",""
187							"\|Beta\|Theta","","","","",""
188							"\|Gamma","","","","",""
189							"\|Gamma\|Iota","","","","",""
190							"\|Gamma\|Iota\|Delta","","","","",""
191							"\|Delta","","","","",""
192
193							=head2 Taxonomy Validation
194
195							The C constructor, C, will probe a taxonomy file
196							provided to it as an argument to determine whether it can be considered a
197							valid taxonomy according to the description provided above.
198
199							TODO: Cnew() should also be able to accept a
200							reference to an array of CSV records already held in memory.
201
202							TODO: What would it mean for Cnew() to accept a
203							filehandle as an argument, rather than a file? Would that be difficult to
204							implement?
205
206							TODO: The user of this library, however, must be permitted to write
207							B which will be applied to a
208							taxonomy by means of a C method called on a
209							Parse::File::Taxonomy object. Should the file fail to meet those rules, the
210							user may choose not to proceed further even though the taxonomy meets the
211							basic validation criteria implemented in the constructor. This method will
212							take a reference to an array of subroutines references as its argument. Each
213							such code reference will be a user-defined rule which the taxonomy must obey.
214							The method will apply each code reference to the taxonomy in sequence and will
215							return with a true value if and only if all the individual criteria return
216							true as well.
217
218							=head1 METHODS
219
220							=head2 C
221
222							=over 4
223
224							=item * Purpose
225
226							Parse::File::Taxonomy constructor.
227
228							=item * Arguments
229
230							$source = "./t/data/alpha.csv";
231							$obj = Parse::File::Taxonomy->new( {
232							file => $source,
233							} );
234
235							Single hash reference. Elements in that hash are keyed on:
236
237							=over 4
238
239							=item * C
240
241							Absolute or relative path to the incoming taxonomy file. Currently
242							B (but this may change if we implement ability to use a list of CSV
243							strings instead of a file).
244
245							=item * C
246
247							If the column to be used as the "path" column in the incoming taxonomy file is
248							B the first column, this option must be set to the integer representing
249							the "path" column's index position (count starts at 0). Optional; defaults to C<0>.
250
251							=item * C
252
253							If the string used to distinguish components of the path in the path column in
254							the incoming taxonomy file is not a pipe (C<\|>), this option must be set.
255							Optional; defaults to C<\|>.
256
257							=item * Text::CSV options
258
259							Any other options which could normally be passed to Cnew()> will
260							be passed through to that module's constructor. On the recommendation of the
261							Text::CSV documentation, C is always set to a true value.
262
263							=back
264
265							=item * Return Value
266
267							Parse::File::Taxonomy object.
268
269							=item * Comment
270
271							C will throw an exception under any of the following conditions:
272
273							=over 4
274
275							=item * Argument to C is not a reference.
276
277							=item * Argument to C is not a hash reference.
278
279							=item * Unable to locate the file which is the value of the C element.
280
281							=item * Argument to C element is not an integer.
282
283							=item * Argument to C is greater than the index number of the
284							last element in the header row of the incoming taxonomy file, I the
285							C is wrong.
286
287							=item * The same field is found more than once in the header row of the
288							incoming taxonomy file.
289
290							=item * Unable to open or close the incoming taxonomy file for reading.
291
292							=item * In the column designated as the "path" column, the same value is
293							observed more than once.
294
295							=item * A non-parent node's parent node cannot be located in the incoming taxonomy file.
296
297							=item * A data row has a number of fields different from the number of fields
298							in the header row.
299
300							=back
301
302							=back
303
304							=cut
305
306							sub new {
307	17			17	1	16912	my ($class, $args) = @_;
308	17					38	my %data;
309
310	17	100	100			762	croak "Argument to 'new()' must be hashref"
311							unless (ref($args) and reftype($args) eq 'HASH');
312	15	100				242	croak "Argument to 'new()' must have 'file' element" unless $args->{file};
313	14	100				550	croak "Cannot locate file '$args->{file}'"
314							unless (-f $args->{file});
315	13					61	$data{file} = delete $args->{file};
316
317	13	100				59	if (exists $args->{path_col_idx}) {
318	2	100				168	croak "Argument to 'path_col_idx' must be integer"
319							unless $args->{path_col_idx} =~ m/^\d+$/;
320							}
321	12		100			89	$data{path_col_idx} = delete $args->{path_col_idx} \|\| 0;
322	12	100				62	$data{path_col_sep} = exists $args->{path_col_sep}
323							? $args->{path_col_sep}
324							: '\|';
325	12	100				38	if (exists $args->{path_col_sep}) {
326	2					6	$data{path_col_sep} = $args->{path_col_sep};
327	2					6	delete $args->{path_col_sep};
328							}
329							else {
330	10					28	$data{path_col_sep} = '\|';
331							}
332
333							# We've now handled all the Parse::File::Taxonomy-specific options.
334							# Any remaining options are assumed to be intended for Text::CSV::new().
335
336	12					32	$args->{binary} = 1;
337	12	50				100	my $csv = Text::CSV->new ( $args )
338							or croak "Cannot use CSV: ".Text::CSV->error_diag ();
339	12	50				1938	open my $IN, "<", $data{file}
340							or croak "Unable to open '$data{file}' for reading";
341	12					70	my $header_ref = $csv->getline($IN);
342
343	12					302	croak "Argument to 'path_col_idx' exceeds index of last field in header row in '$data{file}'"
344	12	100				42134	if $data{path_col_idx} > $#{$header_ref};
345
346	11					19	my %header_fields_seen;
347	11					19	for (@{$header_ref}) {
	11					396
348	65	100				109	if (exists $header_fields_seen{$_}) {
349	1					123	croak "Duplicate field '$_' observed in '$data{file}'";
350							}
351							else {
352	64					230	$header_fields_seen{$_}++;
353							}
354							}
355	10					190	$data{fields} = $header_ref;
356	10					22	my $field_count = scalar(@{$data{fields}});
	10					56
357	10					40	$data{path_col} = $data{fields}->[$data{path_col_idx}];
358
359	10					131	my $data_records = $csv->getline_all($IN);
360	10	50				47546	close $IN or croak "Unable to close after reading";
361
362
363							# Confirm no duplicate entries in column holding path:
364							# Confirm all rows have same number of columns as header:
365	10					33	my @bad_count_records = ();
366	10					30	my %paths_seen = ();
367	10					17	for my $rec (@{$data_records}) {
	10					31
368	131					250	$paths_seen{$rec->[$data{path_col_idx}]}++;
369	131					111	my $this_row_count = scalar(@{$rec});
	131					138
370	131	100				245	if ($this_row_count != $field_count) {
371	2					9	push @bad_count_records,
372							[ $rec->[$data{path_col_idx}], $this_row_count ];
373							}
374							}
375	10					28	my @dupe_paths = ();
376	10					117	for my $path (sort keys %paths_seen) {
377	128	100				222	push @dupe_paths, $path if $paths_seen{$path} > 1;
378							}
379	10					36	my $error_msg = <
380							No duplicate entries are permitted in column designated as path.
381							The following entries appear the number of times shown:
382							ERROR_MSG_DUPE
383	10					26	for my $path (@dupe_paths) {
384	2					10	$error_msg .= " $path:" . sprintf(" %6s\n" => $paths_seen{$path});
385							}
386	10	100				246	croak $error_msg if @dupe_paths;
387
388	9					39	$error_msg = <
389							Header row had $field_count records. The following records had different counts:
390							ERROR_MSG_WRONG_COUNT
391	9					20	for my $rec (@bad_count_records) {
392	2					10	$error_msg .= " $rec->[0]: $rec->[1]\n";
393							}
394	9	100				334	croak $error_msg if @bad_count_records;
395
396							# Confirm each node appears in taxonomy:
397	8					16	my $path_args = { map { $_ => $args->{$_} } keys %{$args} };
	8					42
	8					30
398	8					30	$path_args->{sep_char} = $data{path_col_sep};
399	8	50				70	my $path_csv = Text::CSV->new ( $path_args )
400							or croak "Cannot use CSV: ".Text::CSV->error_diag ();
401	8					872	my %missing_parents = ();
402	8					62	for my $path (sort keys %paths_seen) {
403	102					241	my $status = $path_csv->parse($path);
404	102					15620	my @columns = $path_csv->fields();
405	102	100				811	if (@columns > 2) {
406	70					262	my $parent =
407							join($path_args->{sep_char} => @columns[0 .. ($#columns - 1)]);
408	70	100				267	unless (exists $paths_seen{$parent}) {
409	2					6	$missing_parents{$path} = $parent;
410							}
411							}
412							}
413	8					30	$error_msg = <
414							Each node in the taxonomy must have a parent.
415							The following nodes lack the expected parent:
416							ERROR_MSG_ORPHAN
417	8					45	for my $path (sort keys %missing_parents) {
418	2					5	$error_msg .= " $path: $missing_parents{$path}\n";
419							}
420	8	100				211	croak $error_msg if scalar(keys %missing_parents);
421	7					23	$data{data_records} = $data_records;
422
423
424	7					13	while (my ($k,$v) = each %{$args}) {
	14					85
425	7					22	$data{$k} = $v;
426							}
427	7					435	return bless \%data, $class;
428							}
429
430							=head2 C
431
432							=over 4
433
434							=item * Purpose
435
436							Identify the names of the columns in the taxonomy.
437
438							=item * Arguments
439
440							my $fields = $self->fields();
441
442							No arguments; the information is already inside the object.
443
444							=item * Return Value
445
446							Reference to an array holding a list of the columns as they appear in the
447							header row of the incoming taxonomy file.
448
449							=item * Comment
450
451							Read-only.
452
453							=back
454
455							=cut
456
457							sub fields {
458	4			4	1	2737	my $self = shift;
459	4					21	return $self->{fields};
460							}
461
462							=head2 C
463
464							=over 4
465
466							=item * Purpose
467
468							Identify the index position (count starts at 0) of the column in the incoming
469							taxonomy file which serves as the path column.
470
471							=item * Arguments
472
473							my $path_col_idx = $self->path_col_idx;
474
475							No arguments; the information is already inside the object.
476
477							=item * Return Value
478
479							Integer in the range from 0 to 1 less than the number of columns in the header
480							row.
481
482							=item * Comment
483
484							Read-only.
485
486							=back
487
488							=cut
489
490							sub path_col_idx {
491	2			2	1	3042	my $self = shift;
492	2					10	return $self->{path_col_idx};
493							}
494
495							=head2 C
496
497							=over 4
498
499							=item * Purpose
500
501							Identify the name of the column in the incoming taxonomy which serves as the
502							path column.
503
504							=item * Arguments
505
506							my $path_col = $self->path_col;
507
508							No arguments; the information is already inside the object.
509
510							=item * Return Value
511
512							String.
513
514							=item * Comment
515
516							Read-only.
517
518							=back
519
520							=cut
521
522							sub path_col {
523	2			2	1	1347	my $self = shift;
524	2					12	return $self->{path_col};
525							}
526
527							=head2 C
528
529							=over 4
530
531							=item * Purpose
532
533							Identify the string used to separate path components once the taxonomy has
534							been created. This is just a "getter" and is logically distinct from the
535							option to C which is, in effect, a "setter."
536
537							=item * Arguments
538
539							my $path_col_sep = $self->path_col_sep;
540
541							No arguments; the information is already inside the object.
542
543							=item * Return Value
544
545							String.
546
547							=item * Comment
548
549							Read-only.
550
551							=back
552
553							=cut
554
555							sub path_col_sep {
556	2			2	1	1235	my $self = shift;
557	2					9	return $self->{path_col_sep};
558							}
559
560							=head2 C
561
562							=over 4
563
564							=item * Purpose
565
566							Once the taxonomy has been validated, get a list of its data rows as a Perl
567							data structure.
568
569							=item * Arguments
570
571							$data_records = $self->data_records;
572
573							None.
574
575							=item * Return Value
576
577							Reference to array of array references. The array will hold the data records
578							found in the incoming taxonomy file in their order in that file.
579
580							=item * Comment
581
582							Does not contain any information about the fields in the taxonomy, so you
583							should probably either (a) use in conjunction with C method above;
584							or (b) use C.
585
586							=back
587
588							=cut
589
590							sub data_records {
591	2			2	1	603	my $self = shift;
592	2					9	return $self->{data_records};
593							}
594
595							=head2 C
596
597							=over 4
598
599							=item * Purpose
600
601							Once the taxonomy has been validated, get a list of its header and data rows as a Perl
602							data structure.
603
604							=item * Arguments
605
606							$data_records = $self->fields_and_data_records;
607
608							None.
609
610							=item * Return Value
611
612							Reference to array of array references. The first element in the array will
613							hold the header row (same as output of C). The remaining elements
614							will hold the data records found in the incoming taxonomy file in their order
615							in that file.
616
617							=back
618
619							=cut
620
621							sub fields_and_data_records {
622	1			1	1	1452	my $self = shift;
623	1					5	my @all_rows = $self->fields;
624	1					3	for my $row (@{$self->data_records}) {
	1					4
625	13					18	push @all_rows, $row;
626							}
627	1					4	return \@all_rows;
628							}
629
630							=head2 C
631
632							=over 4
633
634							=item * Purpose
635
636							Once the taxonomy has been validated, get a list of its data rows as a Perl
637							data structure. In each element of this list, the path is now represented as
638							an array reference rather than a string.
639
640							=item * Arguments
641
642							$data_records_path_components = $self->data_records_path_components;
643
644							None.
645
646							=item * Return Value
647
648							Reference to array of array references. The array will hold the data records
649							found in the incoming taxonomy file in their order in that file.
650
651							=item * Comment
652
653							Does not contain any information about the fields in the taxonomy, so you
654							should probably either (a) use in conjunction with C method above;
655							or (b) use C.
656
657							=back
658
659							=cut
660
661							sub data_records_path_components {
662	1			1	1	1585	my $self = shift;
663	1					3	my @all_rows = ();
664	1					2	for my $row (@{$self->{data_records}}) {
	1					4
665	13					18	my $path_col = $row->[$self->{path_col_idx}];
666	13					55	my @path_components = split(/\Q$self->{path_col_sep}\E/, $path_col);
667	13					15	my @rewritten = ();
668	13					16	for (my $i = 0; $i <= $#{$row}; $i++) {
	91					155
669	78	100				109	if ($i != $self->{path_col_idx}) {
670	65					115	push @rewritten, $row->[$i];
671							}
672							else {
673	13					24	push @rewritten, \@path_components;
674							}
675							}
676	13					35	push @all_rows, \@rewritten;
677							}
678	1					5	return \@all_rows;
679							}
680
681							=head2 C
682
683							=over 4
684
685							=item * Purpose
686
687							Once the taxonomy has been validated, get a list of its data rows as a Perl
688							data structure. The first element in this list is an array reference holding
689							the header row. In each data element of this list, the path is now represented as
690							an array reference rather than a string.
691
692							=item * Arguments
693
694							$fields_and_data_records_path_components = $self->fields_and_data_records_path_components;
695
696							None.
697
698							=item * Return Value
699
700							Reference to array of array references. The array will hold the data records
701							found in the incoming taxonomy file in their order in that file.
702
703							=back
704
705							=cut
706
707							sub fields_and_data_records_path_components {
708	1			1	1	1447	my $self = shift;
709	1					4	my @all_rows = $self->fields;
710	1					2	for my $row (@{$self->{data_records}}) {
	1					3
711	13					18	my $path_col = $row->[$self->{path_col_idx}];
712	13					47	my @path_components = split(/\Q$self->{path_col_sep}\E/, $path_col);
713	13					17	my @rewritten = ();
714	13					15	for (my $i = 0; $i <= $#{$row}; $i++) {
	91					160
715	78	100				110	if ($i != $self->{path_col_idx}) {
716	65					107	push @rewritten, $row->[$i];
717							}
718							else {
719	13					22	push @rewritten, \@path_components;
720							}
721							}
722	13					46	push @all_rows, \@rewritten;
723							}
724	1					6	return \@all_rows;
725							}
726
727							=head2 C
728
729							=over 4
730
731							=item * Purpose
732
733							Display the number of descendant (multi-generational) nodes each node in the
734							taxonomy has.
735
736							=item * Arguments
737
738							$child_counts = $self->child_counts();
739
740							None.
741
742							=item * Return Value
743
744							Reference to hash in which each element is keyed on the value of the path
745							column in the incoming taxonomy file.
746
747							=back
748
749							=cut
750
751							sub child_counts {
752	4			4	1	2334	my $self = shift;
753	4					7	my %child_counts = map { $_->[$self->{path_col_idx}] => 0 } @{$self->{data_records}};
	52					111
	4					21
754	4					22	for my $node (keys %child_counts) {
755	52					145	for my $other_node ( grep { ! m/^\Q$node\E$/ } keys %child_counts) {
	676					2116
756	624	100				2438	$child_counts{$node}++
757							if $other_node =~ m/^\Q$node$self->{path_col_sep}\E/;
758							}
759							}
760	4					20	return \%child_counts;
761							}
762
763							=head2 C
764
765							=over 4
766
767							=item * Purpose
768
769							Get the total number of descendant nodes for one specific node in a validated
770							taxonomy.
771
772							=item * Arguments
773
774							$child_count = $self->get_child_count('\|Path\|To\|Node');
775
776							String containing node's path as spelled in the taxonomy.
777
778							=item * Return Value
779
780							Unsigned integer >= 0. Any node whose child count is C<0> is by definition a
781							leaf node.
782
783							=item * Comment
784
785							Will throw an exception if the node does not exist or is misspelled.
786
787							=back
788
789							=cut
790
791							sub get_child_count {
792	3			3	1	2162	my ($self, $node) = @_;
793	3					9	my $child_counts = $self->child_counts();
794	3	100				201	croak "Node '$node' not found" unless exists $child_counts->{$node};
795	2					12	return $child_counts->{$node};
796							}
797
798							=head2 C
799
800							=over 4
801
802							=item * Purpose
803
804							Turn a validated taxonomy into a Perl hash keyed on the column designated as
805							the path column.
806
807							=item * Arguments
808
809							$hashref = $self->hashify_taxonomy();
810
811							Takes an optional hashref holding a list of any of the following elements:
812
813							=over 4
814
815							=item * C
816
817							Boolean, defaulting to C<0>. By default, C will spell the
818							key of the hash exactly as the value of the path column is spelled in the
819							taxonomy -- which in turn is the way it was spelled in the incoming file.
820							That is, a path in the taxonomy spelled C<\|Alpha\|Beta\|Gamma> will be spelled
821							as a key in exactly the same way.
822
823							However, since in many cases (including the example above) the root node of the taxonomy will be empty, the
824							user may wish to remove the first instance of C. The user would
825							do so by setting C to a true value.
826
827							$hashref = $self->hashify_taxonomy( {
828							remove_leading_path_col_sep => 1,
829							} );
830
831							In that case they key would now be spelled: C.
832
833							Note further that if the C switch is set to a true value, any
834							setting to C will be ignored.
835
836							=item * C
837
838							A string which will be used in composing the key of the hashref returned by
839							this method. The user may select this key if she does not want to use the
840							value found in the incoming CSV file (which by default will be the pipe
841							character (C<\|>) and which may be overridden with the C argument
842							to C.
843
844							$hashref = $self->hashify_taxonomy( {
845							key_delim => q{ - },
846							} );
847
848							In the above variant, a path that in the incoming taxonomy file was
849							represented by C<\|Alpha\|Beta\|Gamma> will in C<$hashref> be represented by
850							C< - Alpha - Beta - Gamma>.
851
852							=item * C
853
854							A string which will be used in composing the key of the hashref returned by
855							this method. The user will set this switch if she wishes to have the root
856							note explicitly represented. Using this switch will automatically cause
857							C to be ignored.
858
859							Suppose the user wished to have C be the text for the root
860							node. Suppose further that the user wanted to use the string C< - > as the
861							delimiter within the key.
862
863							$hashref = $self->hashify_taxonomy( {
864							root_str => q{All Suppliers},
865							key_delim => q{ - },
866							} );
867
868							Then incoming path C<\|Alpha\|Beta\|Gamma> would be keyed as:
869
870							All Suppliers - Alpha - Beta - Gamma
871
872							=back
873
874							=item * Return Value
875
876							Hash reference. The number of elements in this hash should be equal to the
877							number of non-header records in the taxonomy.
878
879							=back
880
881							=cut
882
883							sub hashify_taxonomy {
884	8			8	1	19623	my ($self, $args) = @_;
885	8	100				40	if (defined $args) {
886	7	100	100			380	croak "Argument to 'new()' must be hashref"
887							unless (ref($args) and reftype($args) eq 'HASH');
888							}
889	6					12	my %hashified = ();
890	6					21	my $fields = $self->{fields};
891	6					15	my %idx2col = map { $_ => $fields->[$_] } (0 .. $#{$fields});
	36					117
	6					17
892	6					17	for my $rec (@{$self->{data_records}}) {
	6					20
893	78					56	my $rowkey;
894	78	100				108	if ($args->{root_str}) {
895	26					44	$rowkey = $args->{root_str} . $rec->[$self->{path_col_idx}];
896							}
897							else {
898	52	100				65	if ($args->{remove_leading_path_col_sep}) {
899	26					180	($rowkey = $rec->[$self->{path_col_idx}]) =~ s/^\Q$self->{path_col_sep}\E(.*)/$1/;
900							}
901							else {
902	26					36	$rowkey = $rec->[$self->{path_col_idx}];
903							}
904							}
905	78	100				138	if ($args->{key_delim}) {
906	39					158	$rowkey =~ s/\Q$self->{path_col_sep}\E/$args->{key_delim}/g;
907							}
908	78					69	my $rowdata = { map { $idx2col{$_} => $rec->[$_] } (0 .. $#{$fields}) };
	468					867
	78					113
909	78					211	$hashified{$rowkey} = $rowdata;
910							}
911	6					32	return \%hashified;
912							}
913
914							sub local_validate {
915	4			4	0	3587	my ($self, $args) = @_;
916
917	4	100	100			351	croak "Argument to local_validate() must be an array ref"
918							unless defined $args and ref($args) eq 'ARRAY';
919	2					3	foreach my $rule (@{$args}) {
	2					6
920	4	100				154	croak "Each element in arrayref of arguments to local_validate() must be a code ref"
921							unless ref($rule) eq 'CODE';
922							}
923							# TODO: implementation; documentation
924
925	1					6	return 1;
926							}
927
928							1;
929
930							# vim: formatoptions=crqot