File Coverage

blib/lib/App/combinesheets.pm

Criterion	Covered	Total	%
statement	16	18	88.8
branch			n/a
condition			n/a
subroutine	6	6	100.0
pod			n/a
total	22	24	91.6

line	stmt	sub	time	code
1				#-----------------------------------------------------------------
2				# App::combinesheets
3				# Author: Martin Senger
4				# For copyright and disclaimer see below.
5				#
6				# ABSTRACT: command-line tool merging CSV and TSV spreadsheets
7				# PODNAME: App::combinesheets
8				#-----------------------------------------------------------------
9	7	7	231286	use warnings;
	7		20
	7		226
10	7	7	34	use strict;
	7		16
	7		436
11
12				package App::combinesheets;
13
14				our $VERSION = '0.2.14'; # VERSION
15
16	7	7	44	use base 'App::Cmd::Simple';
	7		12
	7		7576
17
18	7	7	669179	use Pod::Usage;
	7		643404
	7		1275
19	7	7	83	use Pod::Find qw(pod_where);
	7		18
	7		413
20
21	7	7	12925	use Text::CSV::Simple;
	0
	0
22				use Text::CSV_XS;
23				use File::Spec;
24				use File::Temp;
25				use File::Which;
26				use File::BOM qw( :all );
27				use Algorithm::Loops qw( NestedLoops );
28				use autouse 'IO::CaptureOutput' => qw(capture_exec);
29
30				# reserved keywords in the configuration
				# (execute() upper-cases the first field of every configuration line
				# and compares it with these values)
31				use constant {
32				CFG_MATCH => 'MATCH',
33				CFG_PROG => 'PROG',
34				CFG_PROGS => 'PROGS',
35				CFG_PERL => 'PERL',
36				};
37
38				# types of input files
				# (execute() picks TYPE_CSV for file names ending with ".csv",
				# case-insensitively, and TYPE_TSV for everything else)
39				use constant {
40				TYPE_CSV => 'csv',
41				TYPE_TSV => 'tsv',
42				# TYPE_XSL => 'xsl', # not-yet-supported
43				};
44
45				# hash keys describing an input ($inputs)
46				use constant {
47				INPUT_FILE => 'file',
48				INPUT_TYPE => 'type',
49				INPUT_MATCHED_BY => 'matched_by',
50				INPUT_MATCHED_BY_INDEX => 'matched_by_index',
51				INPUT_HEADERS => 'headers',
52				INPUT_CONTENT => 'content',
53				};
54
55				# hash keys describing wanted fields ($wanted_columns)
56				use constant {
57				CFG_TYPE => 'type', # what kind of input (MATCH, PROG, PROGS or PERL)
58				CFG_OUT_COL => 'ocol', # a name for this column used in the output
59				# keys used for the normal (MATCH) columns
60				CFG_ID => 'id', # which input
61				CFG_IN_COL => 'icol', # a column in such input
62				# keys used for the calculated columns (i.e. of type PROG, PROGS or PERL)
				# NOTE: CFG_EXT has the same value ('id') as CFG_ID, so both
				# constants address the same hash slot of a wanted column.
63				CFG_EXT => 'id', # name of the external program or Perl external subroutine
64				PERL_DETAILS => '_perl_details_', # added during the config processing
65				};
66
67				# ----------------------------------------------------------------
68				# Command-line arguments and script usage
69				# ----------------------------------------------------------------
70				sub usage_desc {
				# Returns the usage format string for this command. Presumably a
				# hook called by the base class (App::Cmd::Simple), which is
				# expected to expand "%c" - TODO confirm against App::Cmd docs.
				# NOTE(review): "otions" in the returned string looks like a typo
				# for "options".
71				my $self = shift;
72				return "%c -config -inputs [other otions...]";
73				}
74				sub opt_spec {
				# Returns the list of command-line option specifications: each
				# arrayref is [ option-spec => description ], the empty arrayref
				# presumably acts as a spacer in the usage text, and the trailing
				# hashref carries the Getopt configuration.
				# NOTE(review): the "\|" sequences and the empty or truncated
				# descriptions (e.g. "=[,=...]") look like artifacts of how this
				# report was rendered; the real specs presumably read 'man|m',
				# 'config|cfg=s', etc. - confirm against the real source.
75				return (
76				[ 'h' => "display a short usage message" ],
77				[ 'help' => "display a full usage message" ],
78				[ 'man\|m' => "display a full manual page" ],
79				[ 'version\|v' => "display a version" ],
80				[],
81				[ 'config\|cfg=s' => "" ],
82				[ 'inputs\|i=s@{1,}' => " in the form: =[,=...] (e.g. PERSON=,CAR=)" ],
83				[ 'outfile\|o=s' => "" ],
84				[ 'check\|c' => "only check the configuration" ],
85
86				{ getopt_conf => ['no_bundling', 'no_ignore_case', 'auto_abbrev'] }
87				);
88				}
89				sub validate_args {
				# Handles the informational options (-h, -help, -man, -version),
				# each of which prints its output and terminates, and then
				# verifies that the required options -config and -inputs were
				# given. Returns nothing on success; a missing required option is
				# reported through usage_error().
90				my ($self, $opt, $args) = @_;
91
92				# show various levels of help and exit
93				my $pod_where = pod_where ({-inc => 1}, __PACKAGE__);
94				if ($opt->h) {
95				print "Usage: " . $self->usage();
				# $^S is true while running inside an eval: in that case raise
				# an exception instead of terminating the whole process
96				if ($^S) { die "Okay\n" } else { exit (0) };
97				}
98				pod2usage (-input => $pod_where, -verbose => 1, -exitval => 0) if $opt->help;
99				pod2usage (-input => $pod_where, -verbose => 2, -exitval => 0) if $opt->man;
100
101				# show version and exit
102				if ($opt->version) {
103				## no critic
104				no strict; # because the $VERSION will be added only when
105				no warnings; # the distribution is fully built up
106				print "$VERSION\n";
107				if ($^S) { die "Okay\n" } else { exit (0) };
108				}
109
110				# check required command-line arguments
111				$self->usage_error ("Parameter '-config' is required.")
112				unless $opt->config;
113				$self->usage_error ("Parameter '-inputs' is required.")
114				unless $opt->inputs;
115
116				return;
117				}
118				sub usage_error {
				# Dies with "Error: <message>" followed by the usage text.
				# Presumably overrides a method of the same name in the base
				# class - TODO confirm.
119				my ( $self, $error ) = @_;
120				die "Error: $error\nUsage: " . $self->usage->text;
121				}
122
123				# ----------------------------------------------------------------
124				# The main part
125				# ----------------------------------------------------------------
126				my $inputs; # keys are input IDs
127				sub execute {
				# Main entry point of the command. In order, it:
				#   1. opens the output (the -outfile, or STDOUT),
				#   2. parses the configuration file into $wanted_cols (columns to
				#      output), $known_inputs and $matches (matching column per input),
				#   3. locates external programs and loads Perl modules needed for
				#      the calculated columns,
				#   4. maps the -inputs arguments (ID=file) to input descriptions,
				#   5. reads the headers of all inputs and checks them against the
				#      configuration (exits here when -check was given),
				#   6. reads all inputs into memory and prints one TAB-joined header
				#      line plus one line per combination of matching input lines,
				#      iterating over the sorted matching values of the primary
				#      (first valid) input.
				# Problems are reported with warn ([WRnn], processing continues) or
				# die ([ERnn]).
128				my ($self, $opt, $args) = @_;
129
130				$inputs = {}; # just in case somebody calls execute() twice
131
132				my @opt_inputs = split (m{,}, join (',', @{ $opt->inputs }));
133				my $opt_outfile = $opt->outfile;
134				my $opt_cfgfile = $opt->config;
135
136				# prepare output handler
137				my $combined;
138				if ($opt_outfile and not $opt->check) {
139				open ($combined, '>', $opt_outfile)
140				or die "[ER00] Cannot open file $opt_outfile for writing: $!\n";
141				} else {
142				$combined = *STDOUT;
143				}
144
145				# read configuration
146				my $wanted_cols = []; # each element: { CFG_TYPE, CFG_ID, CFG_IN_COL, CFG_OUT_COL... }
147				my $known_inputs = {}; # input ID => 1 ... the same input IDs as in $wanted_cols (for speed)
148				my $matches = {}; # input ID => matching column/header
149				my $config;
150				open ($config, '<', $opt_cfgfile)
151				or die "[ER00] Cannot read configuration file $opt_cfgfile: $!\n";
152				my $line_count = 0;
153				while (<$config>) {
154				$line_count++;
155				chomp;
156				next if m{^\s*$}; # ignore empty lines
157				next if m{^\s*#}; # ignore comment lines
				# NOTE(review): the patterns on the next two lines look damaged
				# by the rendering of this report ("\|" for "|", and presumably
				# "\s*\t\s*" for the field separator) - confirm against the real
				# source before relying on them.
158				s{^\s+\|\s+$}{}g; # trim whitespaces
159				my ($input_id, $input_col, $output_col) = split (m{\s\t\s}, $_, 3);
160				unless ($input_id and defined $input_col) {
161				warn "[WR01] Configuration line $line_count ignored: '$_'\n";
162				next;
163				}
164				$input_id = uc ($input_id); # make config keys upper-case
165				if ($input_id eq CFG_MATCH) {
				# a MATCH line: its second field has the form ID=column
				# NOTE(review): if that field contains no '=', $column is
				# undef and the test below matches against undef.
166				my ($input_id, $column) = split (m{\s=\s}, $input_col, 2);
167				unless ($input_id and $column !~ m{^\s*$}) {
168				warn "[WR02] Bad format in configuration line $line_count: '$input_col'. Ignored.\n";
169				next;
170				}
171				$matches->{ uc ($input_id) } = $column;
172				next;
173				}
174				my $wanted_col = {};
175				if ($input_id eq CFG_PROG or $input_id eq CFG_PROGS or $input_id eq CFG_PERL) {
				# a calculated column: the second field names the program or
				# the Perl subroutine, the third one the output column
176				$wanted_col->{CFG_TYPE()} = $input_id;
177				$wanted_col->{CFG_EXT()} = $input_col;
178				if (defined $output_col) {
179				$wanted_col->{CFG_OUT_COL()} = $output_col;
180				} else {
181				warn "[WR10] Missing output column name in configuration line $line_count: '$input_col'.\n";
182				$wanted_col->{CFG_OUT_COL()} = '';
183				}
184				} else {
				# a normal column: input ID, input column and an optional
				# output column name (defaults to the input column name)
185				$wanted_col->{CFG_TYPE()} = CFG_MATCH;
186				$wanted_col->{CFG_ID()} = $input_id;
187				$wanted_col->{CFG_IN_COL()} = $input_col;
188				$wanted_col->{CFG_OUT_COL()} = (defined $output_col ? $output_col : $input_col);
189				$known_inputs->{$input_id} = 1;
190				}
191				push (@$wanted_cols, $wanted_col);
192				}
193				close $config;
194
195				# prepare for calculated columns
196				foreach my $col (@$wanted_cols) {
197				if ($col->{CFG_TYPE()} eq CFG_PROG or $col->{CFG_TYPE()} eq CFG_PROGS) {
198
199				# locate the external program
200				$col->{CFG_EXT()} = find_prog ($col->{CFG_EXT()});
201
202				} elsif ($col->{CFG_TYPE()} eq CFG_PERL) {
203
204				# load the wanted Perl module
				# (the configured value is split into module, separator - '::'
				# or '->' - and subroutine name)
205				my $call = $col->{CFG_EXT()};
206				$call =~ m{^(.+)((::)\|(->))(.*)$};
207				my $module = $1;
208				my $subroutine = $5;
209				my $how_to_call = $2; # can be '::' or '->'
210				unless ($module) {
211				warn "[WR11] Missing module name in '[PERL] " . $col->{CFG_OUT_COL()} . "'. Column ignored.\n";
212				$col->{ignored} = 1;
213				next;
214				}
215				unless ($subroutine) {
216				warn "[WR12] Missing subroutine name in '[PERL] " . $col->{CFG_OUT_COL()} . "'. Column ignored.\n";
217				$col->{ignored} = 1;
218				next;
219				}
220				if ($module =~ m{^:+}) {
221				warn "[WR13] Uncomplete module name in '[PERL] " . $col->{CFG_OUT_COL()} . "'. Column ignored.\n";
222				$col->{ignored} = 1;
223				next;
224				}
				# NOTE(review): string eval of a name taken from the
				# configuration file - the configuration must be trusted.
225				eval "require $module"; ## no critic
226				if ($@) {
				# NOTE(review): the message below lacks the closing quote
				# after the column name.
227				warn "[WR14] Cannot load module '$module': $@. Column '" . $col->{CFG_OUT_COL()} . " ignored\n";
228				$col->{ignored} = 1;
229				next;
230				}
231				$module->import();
232
233				# remember what we just parsed and checked
234				$col->{PERL_DETAILS()} = {};
235				$col->{PERL_DETAILS()}->{what_to_call} = $module . $how_to_call . $subroutine;
236				$col->{PERL_DETAILS()}->{module} = $module;
237				$col->{PERL_DETAILS()}->{subroutine} = $subroutine;
238				$col->{PERL_DETAILS()}->{how_to_call} = $how_to_call;
239				}
240				}
241				$wanted_cols = [ grep { not $_->{ignored} } @$wanted_cols ];
242
243				# locate expected inputs
244				my $primary_input; # ID of the first input
245				foreach my $opt_input (@opt_inputs) {
				# NOTE(review): m{\s=\s} presumably read m{\s*=\s*} in the real
				# source (see the note on the configuration patterns above).
246				my ($key, $value) = split (m{\s=\s}, $opt_input, 2);
247				next unless $key;
248				next unless $value;
249				$key = uc ($key);
250				unless (exists $known_inputs->{$key}) {
251				warn "[WR03] Configuration does not recognize the input named '$key'. Input ignored.\n";
252				next;
253				}
254				unless (exists $matches->{$key}) {
255				warn "[WR04] Input named '$key' does not have any MATCH column defined in configuration. Input ignored.\n";
256				next;
257				}
258				$primary_input = $key unless $primary_input; # remember which input came first
259				my $input = { INPUT_FILE() => $value,
260				INPUT_MATCHED_BY() => $matches->{$key} };
				# the input type is decided by the file name extension only
261				if ($value =~ m{\.csv$}i) {
262				$input->{INPUT_TYPE()} = TYPE_CSV;
263				} else {
264				$input->{INPUT_TYPE()} = TYPE_TSV;
265				}
266				$inputs->{$key} = $input;
267				}
268				die "[ER01] No valid inputs specified. Exiting.\n"
269				unless scalar keys (%$inputs) > 0;
270
271				# read headers from all inputs
272				my $headers_by_id = {}; # used for re-using the same headers once read, and for some checks
273				foreach my $input_id (keys %$inputs) {
274				my $input = $inputs->{$input_id};
275				my $headers;
276				if (exists $headers_by_id->{$input_id}) {
277				$headers = $headers_by_id->{$input_id}; # copy already known headers
278				} else {
279				$headers = read_headers ($input);
280				}
281
282				# add new properties to $input
				# (an input without its matching header is dropped from $inputs
				# and never gets into $headers_by_id)
283				unless (exists $headers->{ $input->{INPUT_MATCHED_BY()} }) {
284				warn ("[WR05] Input '$input_id' does not contain the matching header '" . $input->{INPUT_MATCHED_BY()} .
285				"'. Input ignored\n");
286				delete $inputs->{$input_id};
287				next;
288				}
289				$headers_by_id->{$input_id} = $headers
290				unless exists $headers_by_id->{$input_id};
291				$input->{INPUT_HEADERS()} = $headers;
292				$input->{INPUT_MATCHED_BY_INDEX()} = $headers->{ $input->{INPUT_MATCHED_BY()} };
293				}
294
295				# check real headers vs. headers as defined in configuration
296				my $already_reported = {};
297				foreach my $col (@$wanted_cols) {
298				next if $col->{CFG_TYPE()} ne CFG_MATCH; # check is done only for normal columns
299				my $input_id = $col->{CFG_ID()};
300				if (exists $headers_by_id->{$input_id}) {
301				# does the requested column exist in this input's headers?
302				unless (column_exists ($input_id, $col->{CFG_IN_COL()})) {
303				warn "[WR06] Column '$col->{CFG_IN_COL()}' not found in the input '$input_id'. Column will be ignored.\n";
304				$col->{ignored} = 1;
305				}
306				next;
307
308				} elsif (!exists $already_reported->{$input_id}) {
309				$already_reported->{$input_id} = 1;
310				warn "[WR07] Configuration defines columns from an input '$input_id' but no such input given (or was ignored). These columns will be ignored.\n";
311				}
				# reached only for columns whose input is missing (warned once
				# per input above)
312				$col->{ignored} = 1;
313				}
314				$wanted_cols = [ grep { not $_->{ignored} } @$wanted_cols ];
315
316				foreach my $input_id (keys %$matches) {
317				next unless exists $inputs->{$input_id}; # ignoring matches whose inputs are already ignored
318				# does the matching column exist in this input's headers?
319				unless (column_exists ($input_id, $matches->{$input_id})) {
320				die "[ER02] Matching column '$matches->{$input_id}' not found in the input '$input_id'. Must exit.\n";
321				}
322				}
323
324				# do we still have a primary input?
325				unless (exists $inputs->{$primary_input}) {
326				die "[ER03] Due to errors, the primary input '$primary_input' is now ignored. Must exit.\n";
327				}
328
329				# end of checking
				# (in check mode the process terminates here, unconditionally)
330				exit (0) if $opt->check;
331
332				# read all inputs into memory
333				foreach my $input_id (keys %$inputs) {
334				my $input = $inputs->{$input_id};
335				my $content = read_content ($input);
336				$input->{INPUT_CONTENT()} = $content;
337				}
338
339				# output combined headers
340				my @header_line = ();
341				foreach my $col (@$wanted_cols) {
342				push (@header_line, $col->{CFG_OUT_COL()});
343				}
344				print $combined join ("\t", @header_line) . "\n"
345				unless scalar @header_line == 0;
346
347				# combine all inputs and make output lines
348				foreach my $matching_content (sort keys %{ $inputs->{$primary_input}->{INPUT_CONTENT()} }) {
349				# $matching_content is, for example, a publication title ("An Atlas of....")
350
351				# inputs may have more lines with the same value in the matching columns
352				# therefore, extract first the matching lines from all inputs
353				my $lines_to_combine = [];
354				my $inputs_to_combine = {}; # keys are inputs' CFG_IDs, values are indices into $lines_to_combine
355
356				foreach my $col (@$wanted_cols) {
357				if ($col->{CFG_TYPE()} eq CFG_MATCH) {
358				unless (exists $inputs_to_combine->{ $col->{CFG_ID()} }) {
359				# remember the same lines (from the same input) only once
				# (an input without any matching line contributes a single
				# undef "line", so its columns come out empty)
360				my $input = $inputs->{ $col->{CFG_ID()} };
361				push (@$lines_to_combine, $input->{INPUT_CONTENT()}->{$matching_content} \|\| [undef]);
362				$inputs_to_combine->{ $col->{CFG_ID()} } = $#$lines_to_combine;
363				}
364				}
365				}
366
367				# make all combinations of matching lines
368
369				# let's have 3 inputs, identified by K, L and M
370				# there are three matching lines in K, two in L and one in M:
371				# my $lines_to_combine = [
372				# [ "line1", "line2", "line3", ], # from input K
373				# [ "lineX", "lineY", ], # from input L
374				# [ "lineQ", ], # from input M
375				# );
376				# my $inputs_to_combine = { K => 0, L => 1, M => 2 };
377				#
378				# the anonymous subroutine below will be called 6 times
379				# with the following arguments:
380				# line1, lineX, lineQ
381				# line1, lineY, lineQ
382				# line2, lineX, lineQ
383				# line2, lineY, lineQ
384				# line3, lineX, lineQ
385				# line3, lineY, lineQ
386
387				NestedLoops ($lines_to_combine,
388				sub {
389				my @input_lines = @_;
390				my @output_line = ();
391				my @calculated = (); # indices of the yet-to-be-calculated elements
392				my $column_count = -1;
393				foreach my $col (@$wanted_cols) { # $col defines what data to push into @output_line
394				$column_count++;
395				if ($col->{CFG_TYPE()} eq CFG_MATCH) {
396				my $input = $inputs->{ $col->{CFG_ID()} };
				# NOTE(review): a one-element array slice; $input_lines[...]
				# would be the conventional form.
397				my $input_line = @input_lines[$inputs_to_combine->{ $col->{CFG_ID()} }];
398				# use Data::Dumper;
399				# print Dumper (\@input_lines);
400				# print Dumper ($inputs_to_combine);
401				my $idx = $input->{INPUT_HEADERS()}->{ $col->{CFG_IN_COL()} };
				# NOTE(review): the "or" default below also replaces a cell
				# containing 0 by an empty string - confirm that is intended.
402				my $value = $input_line->[$idx] \|\| '';
403				push (@output_line, $value);
404				} else {
				# calculated column: reserve its place, fill it in below
405				push (@calculated, $column_count);
406				push (@output_line, '');
407				}
408				}
409				# insert the calculated columns
410				foreach my $idx (@calculated) {
411				if ($wanted_cols->[$idx]->{CFG_TYPE()} eq CFG_PROG) {
412				$output_line[$idx] = call_prog ($wanted_cols->[$idx], \@header_line, \@output_line);
413				} elsif ($wanted_cols->[$idx]->{CFG_TYPE()} eq CFG_PROGS) {
414				$output_line[$idx] = call_prog_simple ($wanted_cols->[$idx]);
415				} elsif ($wanted_cols->[$idx]->{CFG_TYPE()} eq CFG_PERL) {
416				$output_line[$idx] = call_perl ($wanted_cols->[$idx], \@header_line, \@output_line);
417				}
418				}
419
420				print $combined join ("\t", @output_line) . "\n"
421				unless scalar @output_line == 0;
422				});
423				}
				# NOTE(review): the result of close is not checked, so a failed
				# flush of the output file would go unnoticed.
424				close $combined if $opt_outfile;
425				}
426
427				# ----------------------------------------------------------------
428				# Call a Perl subroutine (from any module) in order to get a value for
429				# a "calculated" column. $column defines which column to fill,
430				# $header_line is an arrayref with column headers and the
431				# $data_line is another arrayref with the values for the current row.
432				#
433				# $column->{PERL_DETAILS} contains all details needed for the call
				# (what_to_call, how_to_call, module and subroutine - filled in by
				# execute()). Returns whatever the called subroutine returns.
434				# ----------------------------------------------------------------
435				sub call_perl {
436				my ($column, $header_line, $data_line) = @_;
437
				# strictures are off because the '::' variant below calls the
				# subroutine symbolically, by its name held in a string
438				no strict; ## no critic
439				my $what_to_call = $column->{PERL_DETAILS()}->{what_to_call};
440				my $how_to_call = $column->{PERL_DETAILS()}->{how_to_call};
441				my $module = $column->{PERL_DETAILS()}->{module};
442				my $subroutine = $column->{PERL_DETAILS()}->{subroutine};
443
444				if ($how_to_call eq '->') {
				# class-method call: the module name becomes the invocant
445				return $module->$subroutine ($column, $header_line, $data_line);
446				} else {
				# plain function call of "Module::subroutine"
447				return &$what_to_call ($column, $header_line, $data_line);
448				}
449				}
450
451				# ----------------------------------------------------------------
452				# Call an external program in order to get a value for a "calculated"
453				# column. $column defines which column to fill, $header_line is an
454				# arrayref with column headers and the $data_line is
455				# another arrayref with the values for the current row.
456				#
457				# $column->{CFG_EXT} contains a program name to call
				# Returns the value produced by _call_it().
458				# ----------------------------------------------------------------
459				sub call_prog {
460				my ($column, $header_line, $data_line) = @_;
461
462				# prepare an input file for the external program
				# (one "header<TAB>value" line per output column)
463				my $tmp = File::Temp->new();
464				for (my $i = 0; $i < @$header_line;$i++) {
465				print $tmp $header_line->[$i] . "\t" . $data_line->[$i] . "\n";
466				}
467
468				# call it
				# (the File::Temp object itself is passed as the only argument;
				# presumably it stringifies to the file name - TODO confirm)
469				return _call_it ($column->{CFG_EXT()}, $tmp);
470				}
471
472				# ----------------------------------------------------------------
473				# Call an external program (without any command-line arguments) in
474				# order to get a value for a "calculated" column.
475				#
476				# $column->{CFG_EXT} contains a program name to call
				# Returns the value produced by _call_it().
477				# ----------------------------------------------------------------
478				sub call_prog_simple {
479				my ($column) = @_;
480				return _call_it ($column->{CFG_EXT()});
481				}
482
483				# ----------------------------------------------------------------
484				# Run the command given in @_ (program name plus optional arguments)
				# through capture_exec() and return its standard output as one line:
				# the trailing newline is removed and any other newline is replaced
				# by a space. Dies with an [ER05] message if the exit code is not
				# zero or if anything was written to the standard error.
485				# ----------------------------------------------------------------
486				sub _call_it {
487				my @command = @_;
488				my ($stdout, $stderr, $success, $exit_code) = capture_exec (@command);
489				if ($exit_code != 0 or $stderr) {
490				my $errmsg = '[ER05] Failed command: ' . join (' ', map {"'$_'"} @command) . "\n";
491				$errmsg .= "STDERR: $stderr\n" if $stderr;
492				$errmsg .= "EXIT CODE: $exit_code\n";
493				die $errmsg;
494				}
495				chomp $stdout; # remove the last newline
496				$stdout =~ s{\n}{ }g; # better to replace newlines
497				return $stdout;
498				}
499
500				# ----------------------------------------------------------------
501				# Locate given $prgname and return it, usually with an added path. Or
502				# die if such program cannot be found or it is not executable.
				#
				# The places are tried in the order of the numbered comments below.
				# Note that when COMBINE_SHEETS_EXT_PATH is set, step 3 either
				# returns or dies, so the current directory (step 4) is not tried.
503				# ----------------------------------------------------------------
504				sub find_prog {
505				my $prgname = shift;
506				my $full_name;
507
508				# 1) try the name as it is (e.g. the ones with an absolute path)
				# (accepted only if it exists, is executable and is absolute)
509				if (-e $prgname and -x $prgname and
510				File::Spec->file_name_is_absolute ($prgname)) {
511				return $prgname;
512				}
513
514				# 2) try to find it on system PATH
515				$full_name = which ($prgname);
516				if ($full_name ) {
517				chomp $full_name;
518				return $full_name;
519				}
520
521				# 3) try the environment variable with a path
522				if (exists $ENV{COMBINE_SHEETS_EXT_PATH}) {
523				$full_name = File::Spec->catfile ($ENV{COMBINE_SHEETS_EXT_PATH}, $prgname);
524				return maybe_die ($full_name);
525				}
526
527				# 4) try to find it in the current directory
528				$full_name = File::Spec->catfile ('./', $prgname);
529				return maybe_die ($full_name);
530				}
531				sub maybe_die {
				# Returns $prg unchanged if it exists and is executable;
				# otherwise dies with an [ER04] message.
532				my $prg = shift;
533				die "[ER04] '$prg' not found or is not executable.\n"
534				unless -e $prg and -x $prg;
535				return $prg;
536				}
537
538				# ----------------------------------------------------------------
539				# Does the requested $column exist in the given input's headers?
				# The headers are looked up in the file-level $inputs hashref.
				# NOTE(review): the nested lookup autovivifies $inputs->{$input_id}
				# (and its headers hash) when the input is not there; the visible
				# callers only pass IDs of existing inputs.
540				# ----------------------------------------------------------------
541				sub column_exists {
542				my ($input_id, $column) = @_;
543				return exists $inputs->{$input_id}->{INPUT_HEADERS()}->{$column};
544				}
545
546				# ----------------------------------------------------------------
547				# Read the headers (the first line) from an input file (given in
548				# hashref $input) and return them as a new hashref: each header name
549				# is a key and its value is the zero-based index of the column as it
550				# appears in the read file. The input type decides the parser used.
551				# ----------------------------------------------------------------
552				sub read_headers {
553				my ($input) = @_;
554
555				my $headers;
556				if ($input->{INPUT_TYPE()} eq TYPE_CSV) {
557				$headers = read_csv_headers ($input->{INPUT_FILE()});
558				} else {
559				$headers = read_tsv_headers ($input->{INPUT_FILE()});
560				}
				# NOTE: if a header name is repeated, the last index wins
561				my $new_headers = {};
562				my $column_index = 0;
563				foreach my $column (@$headers) {
564				$new_headers->{$column} = $column_index++;
565				}
566				return $new_headers;
567				}
568
569				# ----------------------------------------------------------------
570				# Return an arrayref with the fields of the first line of the CSV
				# $file, parsed by Text::CSV_XS. Dies with an [ER04] message if the
				# line cannot be parsed.
571				# ----------------------------------------------------------------
572				sub read_csv_headers {
573				my ($file) = @_;
574				my $line = read_first_line ($file);
575
576				my $parser = Text::CSV_XS->new ({
577				allow_loose_quotes => 1,
578				escape_char => "\\",
579				});
580				if ($parser->parse ($line)) {
581				return [ $parser->fields ];
582				} else {
583				die "[ER04] Parsing CSV file $file failed: " .
584				$parser->error_input . "\n" .
585				$parser->error_diag() . "\n";
586				}
587				}
588
589				# ----------------------------------------------------------------
590				# Return an arrayref with the TAB-separated fields of the first line
				# of the $file.
591				# ----------------------------------------------------------------
592				sub read_tsv_headers {
593				my ($file) = @_;
594				my $line = read_first_line ($file);
595				return [ split (m{\t}, $line) ];
596				}
597
598				# ----------------------------------------------------------------
599				# Return the first line of the $file, without its trailing newline
				# characters. The file is opened with open_bom() (imported from
				# File::BOM).
				# NOTE(review): the result of open_bom() is not checked here, and
				# an empty file leaves $line undefined - confirm how open_bom()
				# reports an unreadable file.
600				# ----------------------------------------------------------------
601				sub read_first_line {
602				my ($file) = @_;
603				my $fh;
604				open_bom ($fh, $file); # or open ($fh, '<', $file)
605				# or die "[ER00] Cannot read input file $file: $!\n";
606				my $line = <$fh>; # read just one line
607				close $fh;
				# NOTE(review): "\|" below is presumably a rendering artifact
				# for "|" (i.e. CR or LF) - confirm against the real source.
608				$line =~ s{(\r\|\n)+$}{}; # remove newlines of any kind
609				return $line;
610				}
611
612				# ----------------------------------------------------------------
613				# Stringify a hashref
				# Returns "key => value" pairs joined by commas, in the hash's own
				# (unspecified) order. Not called anywhere in the visible code.
614				# ----------------------------------------------------------------
615				sub ph {
616				my $hashref = shift;
617				my $result = '';
618				my ($key, $value);
619				while (($key, $value) = each (%$hashref)) {
620				$result .= "$key => $value,";
621				}
				# drop the trailing comma
622				return substr ($result, 0, -1);
623				}
624
625				# ----------------------------------------------------------------
626				# Read the whole content of the input described by hashref $input,
				# using the CSV or the TSV reader according to the input type.
				# Returns a hashref: value found in the matching column => arrayref
				# of the lines (each line an arrayref of fields) with that value.
627				# ----------------------------------------------------------------
628				sub read_content {
629				my ($input) = @_;
630				my $content;
631				if ($input->{INPUT_TYPE()} eq TYPE_CSV) {
632				$content = read_csv_content ($input->{INPUT_FILE()}, $input->{INPUT_MATCHED_BY_INDEX()});
633				} else {
634				$content = read_tsv_content ($input->{INPUT_FILE()}, $input->{INPUT_MATCHED_BY_INDEX()});
635				}
636				return $content;
637				}
638
639				# ----------------------------------------------------------------
640				# Read the TAB-separated $file, skipping its first (header) line and
				# all empty lines. Returns a hashref whose keys are the values found
				# in the column with the index $matched_index, and whose values are
				# arrayrefs of the lines (arrayrefs of fields) sharing that value.
				# NOTE(review): a line with fewer fields than $matched_index + 1
				# yields an undefined key - confirm how such lines should be handled.
641				# ----------------------------------------------------------------
642				sub read_tsv_content {
643				my ($file, $matched_index) = @_;
644				my $fh;
645				open_bom ($fh, $file); # or open ($fh, '<', $file)
646				# or die "[ER00] Cannot read input file $file: $!\n";
647				my $content = {};
648				my $line_count = 0;
649				while (my $line = <$fh>) {
650				next if $line_count++ == 0; # skip header line
651				next if $line =~ m{^\s*$}; # ignore empty lines
652				$line =~ s{(\r\|\n)+$}{}; # remove newlines of any kind
653				my @data = split (m{\t}, $line);
654				$content->{ $data[$matched_index] } = [] unless $content->{ $data[$matched_index] };
655				push (@{ $content->{ $data[$matched_index] } }, [@data]);
656				}
657				close $fh;
658				return $content;
659				}
660
661				# ----------------------------------------------------------------
662				# Read the CSV $file, ignoring its first (header) line. Returns a
				# hashref whose keys are the values found in the column with the
				# index $matched_index, and whose values are arrayrefs of the lines
				# (arrayrefs of fields) sharing that value. A line that cannot be
				# parsed produces a [WR09] warning and terminates the process with
				# the exit code 1.
				# NOTE(review): the file is parsed one physical line at a time, so
				# a quoted field containing a newline presumably cannot be read;
				# the file handle is also never closed explicitly.
663				# ----------------------------------------------------------------
664				sub read_csv_content {
665				my ($file, $matched_index) = @_;
666				my $count_lines = 0;
667				my $content = {};
668
669				# create a CSV parser; any error in reading input will be fatal
670				my $csv = Text::CSV_XS->new ({
671				allow_loose_quotes => 1,
672				escape_char => "\\",
673				auto_diag => 1,
674				});
675
676				# read the CSV input
677				open_bom (my $fh, $file);
678				while (<$fh>) {
679				if ($csv->parse ($_)) {
680				next if $count_lines++ == 0; # headers are ignored
681				my @data = $csv->fields;
682				if (@data) {
683				push (@{ $content->{ $data[$matched_index] } }, \@data);
684				}
685				} else {
686				my $err = $csv->error_input;
687				warn "[WR09] Possible a wrong or not-readable input file '$file': $err\n";
688				exit (1);
689				}
690				}
691
				# (what follows is an older, disabled implementation that
				# relied on parser triggers)
692				# $parser->add_trigger (after_parse => sub {
693				# my ($self, $data) = @_;
694				# return if $count_lines++ == 0; # headers are ignored
695				# $content->{ $data->[$matched_index] } = [] unless $content->{ $data->[$matched_index] };
696				# push (@{ $content->{ $data->[$matched_index] } }, $data);
697				# });
698				# read CSV input (the result is not used here; everything is done in triggers)
699				# $parser->read_file ($file);
700
701				return $content;
702				}
703				1;
704
705
706
707				=pod
708
709				=head1 NAME
710
711				App::combinesheets - command-line tool merging CSV and TSV spreadsheets
712
713				=head1 VERSION
714
715				version 0.2.14
716
717				=head1 SYNOPSIS
718
719				combinesheets -h
720				combinesheets -help
721				combinesheets -man
722				combinesheets -version
723
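724				combinesheets -config <config-file> -inputs <inputs> [<options>] [-outfile <output-file>]
725
726				where <inputs> has the form: <input-ID>=<filename> [<input-ID>=<filename>...]
727				where <options> are: -check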
728
729				=head1 DESCRIPTION
730
731				B<combinesheets> is a command-line tool merging together two or more
732				spreadsheets. The spreadsheets can be COMMA-separated or TAB-separated
733				files, each of them having the first line with column headers. Data in
734				one of the columns (it can be a different column in each input
735				spreadsheet) serve to match lines. For example, having two spreadsheets,
736				PERSON and CAR, with the following contents:
737
738				persons.tsv:
739
740				Surname First name Sex Age Nickname
741				Novak Jan M 52 Honza
742				Gudernova Jitka F 56
743				Senger Martin M 61 Tulak
744
745				cars.tsv:
746
747				Model Year Owned by
748				Praga 1936 Someone else
749				Mini 1968 Gudernova
750				Skoda 2002 Senger
751
752				we want to merge these spreadsheets by C<Surname> in persons.tsv and by
753				C<Owned by> in cars.tsv. There are two possible results, depending
754				which spreadsheet is used as the first one (a primary one). If the
755				persons.tsv is the first, the result will be (which columns are
756				included in the result will be described later in this document):
757
758				combinesheets -cfg config.cfg -in PERSON=persons.tsv CAR=cars.tsv
759
760				First name Surname Model Sex Nickname Age Year Owned by
761				Jitka Gudernova Mini F 56 1968 Gudernova
762				Jan Novak M Honza 52
763				Martin Senger Skoda M Tulak 61 2002 Senger
764
765				Or, if the cars.tsv is the first, the result will be:
766
767				combinesheets -cfg config.cfg -in CAR=cars.tsv PERSON=persons.tsv
768
769				First name Surname Model Sex Nickname Age Year Owned by
770				Jitka Gudernova Mini F 56 1968 Gudernova
771				Martin Senger Skoda M Tulak 61 2002 Senger
772				Praga 1936 Someone else
773
774				Of course, if both input spreadsheets have only the matching lines,
775				both results will be the same (it will not matter which one of them is
776				considered the primary one).
777
778				The rows in the resulting spreadsheet are sorted by values in the
779				column that was used as a matching column in the primary input.
780
781				The information which columns should be used to match the input
782				spreadsheets and which columns should appear in the resulting
783				spreadsheet is read from a configuration file (see the C<-config>
784				- or C<-cfg> - argument).
785
786				The command-line arguments and options can be specified with a single or
787				a double dash. Most of them can be abbreviated to their shortest
788				unambiguous prefix. They are case-sensitive.
789
790				=head2 Duplicated values in the matching columns
791
792				If there are repeated (the same) values in the column that serves as
793				the matching criterion then the resulting spreadsheet will have as many
794				output lines (for a particular matching value) as is the number of all
795				combinations of the lines with that matching value in all input
796				spreadsheets. For example, let's have C<books.tsv> and C<authors.tsv>,
797				assuming that a book can have more authors and any author can
798				contribute to any number of books:
799
800				books.tsv:
801				Title Note Author
802				Book 1 from B1-a Kim
803				Book 2 from B2-b Kim
804				Book 3 from B3-c Katrin
805				Book 1 from B1-d Blanka
806				Book 2 from B2-e Katrin
807
808				authors.tsv:
809				Age Name
810				28 Kim
811				20 Katrin
812				30 Blanka
813				50 Lazy author
814
815				The output (again, depending on which input is considered a primary
816				input) will be (the list of included columns is defined in the
817				configuration file - see later):
818
819				combinesheets -cfg books_to_authors.cfg -in BOOK=books.tsv AUTHOR=authors.tsv
820
821				Name Title Age Note
822				Blanka Book 1 30 from B1-d
823				Katrin Book 3 20 from B3-c
824				Katrin Book 2 20 from B2-e
825				Kim Book 1 28 from B1-a
826				Kim Book 2 28 from B2-b
827
828				combinesheets -cfg books_to_authors.cfg -in AUTHOR=authors.tsv BOOK=books.tsv
829
830				Name Title Age Note
831				Blanka Book 1 30 from B1-d
832				Katrin Book 3 20 from B3-c
833				Katrin Book 2 20 from B2-e
834				Kim Book 1 28 from B1-a
835				Kim Book 2 28 from B2-b
836				Lazy author 50
837
838				=head1 ADVANCED USAGE
839
840				In addition to merging columns from one or more spreadsheets,
841				this script can also add completely new columns to the resulting
842				spreadsheet, the columns that do not exist in any of the input
843				spreadsheets. Such columns are called C<calculated columns>.
844
845				Each C<calculated column> is created either by an external,
846				command-line driven, program, or by a Perl subroutine. In both cases,
847				the user must create (write) such external program or such Perl
848				subroutine. Therefore, this usage is meant more for developers than
849				for the end users.
850
851				Note that this advanced feature is meant only for new columns, not for
852				new rows. Therefore, it cannot be used, for example, to create rows
853				with totals of columns.
854
855				=head2 Calculated columns by external programs
856
857				If specified, an external program is invoked for each row. It can be
858				specified either by a keyword B<PROG> or by a keyword B<PROGS> - see
859				syntax in the I<ARGUMENTS and OPTIONS> section. In both cases, the
860				value of the standard output of these programs becomes the value of the
861				calculated column (a trailing newline of this standard output is
862				removed and other newlines are replaced by spaces).
863
864				A program defined by the B<PROGS> is called without any arguments
865				(C<S> in I<PROGS> stands for I<Simple>). That's why it does not have
866				any knowledge for which row it has been invoked. Its usage is,
867				therefore, for column values that are not dependent on other values
868				from the spreadsheet. For example, for the C<cars.tsv> shown above,
869				you can add a column C<Last updated> by calling a UNIX program C<date>
870				- again, see an example in the I<ARGUMENTS and OPTIONS>
871				section.
872
873				A program defined by the B<PROG> is called with one argument which is
874				a filename. This file contains the current row; each of its lines has
875				two, TAB-separated, fields. The first field is the column name and the
876				second field is the column value. For example, when processing the
877				last row of the C<cars.tsv> given above, the file will have the
878				following content:
879
880				Model Skoda
881				Year 2002
882				Owned by Senger
883
884				The files are only temporary and will be removed when
885				C<combinesheets> finishes.
886
887				=head2 Calculated columns by a Perl subroutine
888
889				If specified by the keyword B<PERL>, a Perl subroutine is called for
890				each row with three arguments:
891
892				=over
893
894				=item 1
895
896				A hashref with information about the current column. Not often used
897				but may be handy if the same subroutine deals with more columns and,
898				therefore, needs to know for which column it was invoked. See the
899				I<Airport> example in the I<ARGUMENTS and OPTIONS> section.
900
901				=item 2
902
903				An arrayref with all column names.
904
905				=item 3
906
907				An arrayref with all column values - in the same order as the column
908				names.
909
910				=back
911
912				Actually, depending how the subroutine is defined in the
913				configuration, it may get as the first argument the module/class name
914				where it belongs to. If you define it like this:
915
916				PERL Module::Example::test
917
918				the C<test> subroutine is called, indeed, with the three arguments as
919				described above. However, if your definition is rather:
920
921				PERL Module::Example->test
922
923				then the C<test> subroutine is considered a Perl method and its first
924				argument is the module/class name. It is up to you to decide how you
925				want/need to write your functions. Again, an example is available in
926				the I<ARGUMENTS and OPTIONS> section.
927
928				The return value of the subroutine will become a new value in the
929				calculated column. Do not return undef but rather an empty string if
930				the value cannot be created.
931
932				What is an advantage of writing my own module/package if I can simply
933				write an external program (perhaps also in Perl) doing exactly the
934				same? The Perl module stays in the memory for the whole time of
935				processing all input rows and, therefore, you can re-use some
936				calculations done for the previous rows. An example about it
937				(C<Airport.pm>) is given in the I<ARGUMENTS and OPTIONS>
938				section.
939
940				=head1 ARGUMENTS and OPTIONS
941
942				=over 4
943
944				=item B<< -config <config-file> >>
945
946				A filename with a configuration file. This is a mandatory
947				parameter. The configuration file describes:
948
949				=over
950
951				=item *
952
953				which columns in individual input spreadsheets should be
954				included in the resulting spreadsheet,
955
956				=item *
957
958				what names should be given to the resulting columns
959
960				=item *
961
962				in which order should be the columns in the resulting
963				spreadsheet
964
965				=item *
966
967				which columns should be used to match individual lines,
968
969				=back
970
971				The configuration file is a TAB-separated file (with no header
972				line). Empty lines and lines starting with a "#" character are
973				ignored. Each line has two columns, in some cases there is an optional
974				third column. Here is a configuration file used in the example above:
975
976				# Columns to match records from individual inputs
977				MATCH PERSON=Surname
978				MATCH CAR=Owned by
979				MATCH CHILD=Parent
980
981				# Columns - how they be in rows
982				PERSON First name
983				PERSON Surname
984				CAR Model
985				PERSON Sex
986				CHILD Name
987				CHILD Born
988				PERSON Nickname
989				PERSON Age
990				CAR Year
991				CAR Owned by
992
993				The first column is either a reserved word C<MATCH>, or an identifier
994				of an input spreadsheet. There are also a few other reserved words - see
995				more about them a bit later.
996
997				The identifier can be almost anything (and it does not appear in the
998				input spreadsheet itself). It is also used in the command-line
999				argument C<-inputs> where it corresponds to a real file name of the
1000				input. The lines with identifiers define what columns will be in the
1001				result: the second column is the header of the wanted columns and an
1002				optional third column (not used in the example above) is the header
1003				used in the result. The resulting columns will be in the same order as
1004				are these lines in the configuration file.
1005
1006				The reserved word C<MATCH> is used to define how to match lines in the
1007				input spreadsheets. The format of its second column is:
1008
1009				<input-ID>=<column-header>
1010
1011				There should be one MATCH line for each input spreadsheet. The data in
1012				the column defined by the "column-header" will be used to find the
1013				corresponding lines. In our example, the data in the column I<Surname>
1014				in the C<persons.tsv> will be matched with the data in the column
1015				I<Owned by> in the C<cars.tsv> (the rows having the same values in
1016				these two columns will be merged into one resulting row).
1017
1018				B<Advanced configuration>
1019
1020				If you want to add so-called I<calculated columns> as described in the
1021				L</ADVANCED USAGE> section, you need to use a few additional reserved words in the
1022				configuration file. These words are B<PROG>, B<PROGS> and/or
1023				B<PERL>. They are used in the place where the new calculated column
1024				should be placed. Their lines have the program name or the Perl
1025				subroutine name in the second column, and they have mandatory third
1026				column with the resulting name of the calculated column.
1027
1028				For example, we wish to add two columns to the input spreadsheet
1029				C<cars.tsv>. The input file (the same as in the introduction) is:
1030
1031				Model Year Owned by
1032				Praga 1936 Someone else
1033				Mini 1968 Gudernova
1034				Skoda 2002 Senger
1035
1036				We wish to add a column I<Car age> that shows the difference between
1037				the actual year and the value from the I<Year> column. We have a
1038				shell script C<age.sh> doing it:
1039
1040				#!/bin/bash
1041				YEAR=`grep Year $1 | cut -f2`
1042				NOW=`date +%Y`
1043				echo $(($NOW-$YEAR))
1044
1045				The configuration file C<cars.cfg> (assuming that we want the other
1046				columns to remain the same) is:
1047
1048				MATCH CAR=Owned by
1049
1050				CAR Owned by
1051				CAR Model
1052				CAR Year
1053				PROG age.sh Car age
1054
1055				When we run:
1056
1057				combinesheets -config cars.cfg -inputs CAR=cars.tsv
1058
1059				we get this result:
1060
1061				Owned by Model Year Car age
1062				Gudernova Mini 1968 44
1063				Senger Skoda 2002 10
1064				Someone else Praga 1936 76
1065
1066				You can see that there is no need to use C<combinesheets> for really
1067				combining I<more> sheets; an input can be just one sheet.
1068
1069				Another example adds a I<simple> column to the same input, a column
1070				named I<Last updated> that gets its value from a UNIX command
1071				C<date>. This program does not get any information which row it has
1072				been invoked for. The configuration file is now (note the new line
1073				with the B<PROGS>):
1074
1075				MATCH CAR=Owned by
1076
1077				CAR Owned by
1078				CAR Model
1079				CAR Year
1080				PROG age.sh Car age
1081				PROGS date Last updated
1082
1083				and the result is now:
1084
1085				Owned by Model Year Car age Last updated
1086				Gudernova Mini 1968 44 Mon Feb 27 12:32:04 AST 2012
1087				Senger Skoda 2002 10 Mon Feb 27 12:32:04 AST 2012
1088				Someone else Praga 1936 76 Mon Feb 27 12:32:04 AST 2012
1089
1090				The last possibility is to call a Perl subroutine - using the reserved
1091				word B<PERL> in the configuration file. Let's have an input
1092				spreadsheet (C<flights.tsv>) with data about flights:
1093
1094				Date Flight Airport From Airport To
1095				2009-01-18 AY838 London LHR Helsinki Vantaa
1096				2009-01-22 AY839 Helsinki Vantaa London LHR
1097				2009-03-15 NW2 Manila Tokyo Narita
1098				2009-03-21 NW1 Tokyo Narita Manila
1099				2011-05-06 SV326 Sharm El Sheik Jeddah
1100				2011-07-31 RJ700 Amman Jeddah
1101				2011-09-21 ME369 Jeddah Beirut
1102				2011-09-24 ME368 Beirut Jeddah
1103				2011-12-02 EZY3064 Prague London Stansted
1104				2011-12-09 EZY3067 London Stansted Prague
1105				2012-01-26 MS663 Cairo Jeddah
1106
1107				We want to add columns with the international airport codes for both
1108				I<Airport From> and I<Airport To>. The new columns will be named
1109				I<Code From> and I<Code To>. The Perl subroutine will use a web
1110				service to find the code. The subroutine will use a closure that will
1111				remember already fetched codes so the web service does not need to be
1112				called several times for the same airport name.
1113
1114				The configuration file C<flights.cfg> is:
1115
1116				MATCH FLY=Date
1117
1118				FLY Date
1119				FLY Flight
1120				FLY Airport From
1121				PERL Airport->find_code Code From
1122				FLY Airport To
1123				PERL Airport->find_code Code To
1124
1125				The name of the subroutine is attached to the module where it comes
1126				from by either B<::> or B<-E> notation.
1127
1128				The invocation is:
1129
1130				combinesheets -config flights.cfg -inputs FLY=flights.tsv
1131
1132				The full code for the module C<Airport>, the file C<Airport.pm> is
1133				here:
1134
1135				package Airport;
1136				use warnings;
1137				use strict;
1138
1139				use LWP::Simple;
1140				use JSON;
1141
1142				# preparing a closure in order not to fetch the same airport code again and again
1143				my $already_found = make_already_found();
1144				sub make_already_found {
1145				my $already_found = {};
1146				return sub {
1147				my ($airport_name, $airport_code) = @_;
1148				if (exists $already_found->{$airport_name}) {
1149				if ($airport_code) {
1150				$already_found->{$airport_name} = $airport_code;
1151				}
1152				return $already_found->{$airport_name};
1153				} else {
1154				$already_found->{$airport_name} = ($airport_code ? $airport_code : 1);
1155				return 0;
1156				}
1157				}
1158				}
1159
1160				sub find_code {
1161				my ($class, $column, $header_line, $data_line) = @_;
1162
1163				my $column_with_airport_name = $column->{ocol};
1164				$column_with_airport_name =~ s{Code}{Airport};
1165
1166				my $airport_name;
1167				for (my $i = 0; $i < @$header_line; $i++) {
1168				if ($header_line->[$i] eq $column_with_airport_name) {
1169				$airport_name = $data_line->[$i];
1170				last;
1171				}
1172				}
1173				return '' unless $airport_name;
1174
1175				# now we have an airport name...
1176				my $airport_code = $already_found->($airport_name);
1177				return $airport_code if $airport_code;
1178
1179				#... go and find its airport code
1180				$airport_code = '';
1181				my $escaped_airport_name = $airport_name;
1182				$escaped_airport_name =~ tr{ }{+};
1183				my $url = "http://airportcode.riobard.com/search?q=$escaped_airport_name&fmt=json";
1184				my $content = get ($url);
1185				warn "Cannot get a response for '$url'\n"
1186				unless defined $content;
1187				my $json = JSON->new->allow_nonref;
1188				my $data = $json->decode ($content);
1189				foreach my $code (@$data) {
1190				$airport_code .= $code->{code} . ",";
1191				}
1192				chop ($airport_code) if $airport_code; # removing the trailing comma
1193
1194				$already_found->($airport_name, $airport_code);
1195				return $airport_code;
1196				}
1197				1;
1198
1199				When run it creates the following output. Note that some airports have
1200				more than one code because the name was ambiguous. Well, this is just
1201				an example, isn't it?
1202
1203				Date Flight Airport From Code From Airport To Code To
1204				2009-01-18 AY838 London LHR LHR Helsinki Vantaa HEL
1205				2009-01-22 AY839 Helsinki Vantaa HEL London LHR LHR
1206				2009-03-15 NW2 Manila MXA,MNL Tokyo Narita NRT
1207				2009-03-21 NW1 Tokyo Narita NRT Manila MXA,MNL
1208				2011-05-06 SV326 Sharm El Sheik SSH Jeddah JED
1209				2011-07-31 RJ700 Amman ADJ,AMM Jeddah JED
1210				2011-09-21 ME369 Jeddah JED Beirut BEY
1211				2011-09-24 ME368 Beirut BEY Jeddah JED
1212				2011-12-02 EZY3064 Prague PRG London Stansted STN
1213				2011-12-09 EZY3067 London Stansted STN Prague PRG
1214				2012-01-26 MS663 Cairo CAI,CIR Jeddah JED
1215
1216				=item B<< -inputs <input-ID>=<filename> [<input-ID>=<filename>...] >>
1217
1218				Each C<-inputs> can have one or more file names, and there can be one
1219				or more C<-inputs> arguments. It defines what are the input
1220				spreadsheets and how they are identified in the configuration file
1221				(see the C<-config> argument). For example, the inputs for our example
1222				above can be specified in any of these ways:
1223
1224				-inputs PERSON=persons.tsv -inputs CAR=cars.tsv
1225				-inputs PERSON=persons.tsv CAR=cars.tsv
1226				-inputs PERSON=persons.tsv,CAR=cars.tsv
1227
1228				The first file name is considered to be the C<primary> input (see the
1229				description above): the resulting spreadsheet will have the same
1230				number of lines as the primary input.
1231
1232				The file names ending with the C<.csv> are considered to be in the
1233				COMMA-separated formats, all others are considered to be
1234				TAB-separated.
1235
1236				This is a mandatory parameter.
1237
1238				=item B<< -outfile <output-file> >>
1239
1240				An optional parameter specifying a filename of the combined result. By
1241				default, it is created on STDOUT. It is always in the TAB-separated
1242				format.
1243
1244				=item B<-check>
1245
1246				This option causes that the configuration file and the input files
1247				(only their header lines will be read) will be checked for errors but
1248				no resulting spreadsheet will be created.
1249
1250				=item B<-ignorecases>
1251
1252				Not yet implemented.
1253
1254				=item B<General options>
1255
1256				=over 8
1257
1258				=item B<-h>
1259
1260				Print a brief usage message and exit.
1261
1262				=item B<-help>
1263
1264				Print a brief usage message with options and exit.
1265
1266				=item B<-man>
1267
1268				Print a full usage message and exit.
1269
1270				=item B<-version>
1271
1272				Print the version and exit.
1273
1274				=back
1275
1276				=back
1277
1278				=head1 ENVIRONMENT VARIABLES
1279
1280				=head3 COMBINE_SHEETS_EXT_PATH
1281
1282				It contains a path that is used when looking for external programs
1283				(when the reserved words PROG or PROGS are used). For example, the
1284				C<examples> directory in the source distribution of this package has an
1285				external program C<age.sh>. The full invocation can be done by:
1286
1287				COMBINE_SHEETS_EXT_PATH=examples bin/combinesheets -cfg examples/cars.cfg --inputs CAR=examples/cars.csv
1288
1289				=head1 DEPENDENCIES
1290
1291				In order to run this tool you need Perl and the following Perl modules
1292				to be installed:
1293
1294				App::Cmd::Simple
1295				Text::CSV::Simple
1296				Text::CSV_XS
1297				File::BOM
1298				Getopt::Long::Descriptive
1299				Pod::Usage
1300				Algorithm::Loops
1301
1302				Optionally (if your configuration file uses the reserved word PROG or
1303				PROGS for calculated columns):
1304
1305				IO::CaptureOutput
1306
1307				=head1 KNOWN BUGS, MISSING FEATURES
1308
1309				=over
1310
1311				=item *
1312
1313				Columns are identified by their header names. There is no way
1314				to identify them simply by their order (column number).
1315
1316				=item *
1317
1318				The input spreadsheets are first read into memory, which may be
1319				a problem with really huge spreadsheets.
1320
1321				=item *
1322
1323				The inputs can be COMMA-separated or TAB-separated. It would
1324				be perhaps nice to allow also the Excel spreadsheets.
1325
1326				=item *
1327
1328				Comparing header names and rows is case-sensitive only. There
1329				is a plan to implement the option C<-ignorecases>.
1330
1331				=back
1332
1333				Some of these missing features may be implemented later.
1334
1335				=head1 SUPPORT
1336
1337				You can find documentation for this module with the perldoc command.
1338
1339				perldoc App::combinesheets
1340
1341				You can also look for information at:
1342
1343				=over 4
1344
1345				=item * RT: CPAN's request tracker
1346
1347				L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=App-combinesheets>
1348
1349				=item * AnnoCPAN: Annotated CPAN documentation
1350
1351				L<http://annocpan.org/dist/App-combinesheets>
1352
1353				=item * CPAN Ratings
1354
1355				L<http://cpanratings.perl.org/d/App-combinesheets>
1356
1357				=item * Search CPAN
1358
1359				L<http://search.cpan.org/dist/App-combinesheets/>
1360
1361				=back
1362
1363				=head1 AUTHOR
1364
1365				Martin Senger
1366
1367				=head1 COPYRIGHT AND LICENSE
1368
1369				This software is copyright (c) 2013 by Martin Senger, CBRC - KAUST (Computational Biology Research Center - King Abdullah University of Science and Technology) All Rights Reserved..
1370
1371				This is free software; you can redistribute it and/or modify it under
1372				the same terms as the Perl 5 programming language system itself.
1373
1374				=cut
1375
1376
1377				__END__