line
stmt
bran
cond
sub
pod
time
code
1
#-----------------------------------------------------------------
2
# App::combinesheets
3
# Author: Martin Senger
4
# For copyright and disclaimer se below.
5
#
6
# ABSTRACT: command-line tool merging CSV and TSV spreadsheets
7
# PODNAME: App::combinesheets
8
#-----------------------------------------------------------------
9
7
7
231286
use warnings;
7
20
7
226
10
7
7
34
use strict;
7
16
7
436
11
12
package App::combinesheets;
13
14
our $VERSION = '0.2.14'; # VERSION
15
16
7
7
44
use base 'App::Cmd::Simple';
7
12
7
7576
17
18
7
7
669179
use Pod::Usage;
7
643404
7
1275
19
7
7
83
use Pod::Find qw(pod_where);
7
18
7
413
20
21
7
7
12925
use Text::CSV::Simple;
0
0
22
use Text::CSV_XS;
23
use File::Spec;
24
use File::Temp;
25
use File::Which;
26
use File::BOM qw( :all );
27
use Algorithm::Loops qw( NestedLoops );
28
use autouse 'IO::CaptureOutput' => qw(capture_exec);
29
30
# reserved keywords in the configuration
31
use constant {
32
CFG_MATCH => 'MATCH',
33
CFG_PROG => 'PROG',
34
CFG_PROGS => 'PROGS',
35
CFG_PERL => 'PERL',
36
};
37
38
# types of input files
39
use constant {
40
TYPE_CSV => 'csv',
41
TYPE_TSV => 'tsv',
42
# TYPE_XSL => 'xsl', # not-yet-supported
43
};
44
45
# hash keys describing an input ($inputs)
46
use constant {
47
INPUT_FILE => 'file',
48
INPUT_TYPE => 'type',
49
INPUT_MATCHED_BY => 'matched_by',
50
INPUT_MATCHED_BY_INDEX => 'matched_by_index',
51
INPUT_HEADERS => 'headers',
52
INPUT_CONTENT => 'content',
53
};
54
55
# hash keys describing wanted fields ($wanted_columns)
56
use constant {
57
CFG_TYPE => 'type', # what kind of input (MATCH, PROG, PROGS or PERL)
58
CFG_OUT_COL => 'ocol', # a name for this column used in the output
59
# keys used for the normal (MATCH) columns
60
CFG_ID => 'id', # which input
61
CFG_IN_COL => 'icol', # a column in such input
62
# keys used for the calculated columns (i.e. of type PROG, PROGS or PERL)
63
CFG_EXT => 'id', # name of the external program or Perl external subroutine
64
PERL_DETAILS => '_perl_details_', # added during the config processing
65
};
66
67
# ----------------------------------------------------------------
68
# Command-line arguments and script usage
69
# ----------------------------------------------------------------
70
sub usage_desc {
71
my $self = shift;
72
return "%c -config -inputs [other otions...]";
73
}
74
sub opt_spec {
75
return (
76
[ 'h' => "display a short usage message" ],
77
[ 'help' => "display a full usage message" ],
78
[ 'man|m' => "display a full manual page" ],
79
[ 'version|v' => "display a version" ],
80
[],
81
[ 'config|cfg=s' => "" ],
82
[ 'inputs|i=s@{1,}' => " in the form: =[,=...] (e.g. PERSON=,CAR=)" ],
83
[ 'outfile|o=s' => "" ],
84
[ 'check|c' => "only check the configuration" ],
85
86
{ getopt_conf => ['no_bundling', 'no_ignore_case', 'auto_abbrev'] }
87
);
88
}
89
sub validate_args {
90
my ($self, $opt, $args) = @_;
91
92
# show various levels of help and exit
93
my $pod_where = pod_where ({-inc => 1}, __PACKAGE__);
94
if ($opt->h) {
95
print "Usage: " . $self->usage();
96
if ($^S) { die "Okay\n" } else { exit (0) };
97
}
98
pod2usage (-input => $pod_where, -verbose => 1, -exitval => 0) if $opt->help;
99
pod2usage (-input => $pod_where, -verbose => 2, -exitval => 0) if $opt->man;
100
101
# show version and exit
102
if ($opt->version) {
103
## no critic
104
no strict; # because the $VERSION will be added only when
105
no warnings; # the distribution is fully built up
106
print "$VERSION\n";
107
if ($^S) { die "Okay\n" } else { exit (0) };
108
}
109
110
# check required command-line arguments
111
$self->usage_error ("Parameter '-config' is required.")
112
unless $opt->config;
113
$self->usage_error ("Parameter '-inputs' is required.")
114
unless $opt->inputs;
115
116
return;
117
}
118
sub usage_error {
119
my ( $self, $error ) = @_;
120
die "Error: $error\nUsage: " . $self->usage->text;
121
}
122
123
# ----------------------------------------------------------------
124
# The main part
125
# ----------------------------------------------------------------
126
my $inputs; # keys are input IDs
127
sub execute {
128
my ($self, $opt, $args) = @_;
129
130
$inputs = {}; # just in case somebody calls execute() twice
131
132
my @opt_inputs = split (m{,}, join (',', @{ $opt->inputs }));
133
my $opt_outfile = $opt->outfile;
134
my $opt_cfgfile = $opt->config;
135
136
# prepare output handler
137
my $combined;
138
if ($opt_outfile and not $opt->check) {
139
open ($combined, '>', $opt_outfile)
140
or die "[ER00] Cannot open file $opt_outfile for writing: $!\n";
141
} else {
142
$combined = *STDOUT;
143
}
144
145
# read configuration
146
my $wanted_cols = []; # each element: { CFG_TYPE, CFG_ID, CFG_IN_COL, CFG_OUT_COL... }
147
my $known_inputs = {}; # input ID => 1 ... the same input IDs as in $wanted_cols (for speed)
148
my $matches = {}; # input ID => matching column/header
149
my $config;
150
open ($config, '<', $opt_cfgfile)
151
or die "[ER00] Cannot read configuration file $opt_cfgfile: $!\n";
152
my $line_count = 0;
153
while (<$config>) {
154
$line_count++;
155
chomp;
156
next if m{^\s*$}; # ignore empty lines
157
next if m{^\s*#}; # ignore comment lines
158
s{^\s+|\s+$}{}g; # trim whitespaces
159
my ($input_id, $input_col, $output_col) = split (m{\s*\t\s*}, $_, 3);
160
unless ($input_id and defined $input_col) {
161
warn "[WR01] Configuration line $line_count ignored: '$_'\n";
162
next;
163
}
164
$input_id = uc ($input_id); # make config keys upper-case
165
if ($input_id eq CFG_MATCH) {
166
my ($input_id, $column) = split (m{\s*=\s*}, $input_col, 2);
167
unless ($input_id and $column !~ m{^\s*$}) {
168
warn "[WR02] Bad format in configuration line $line_count: '$input_col'. Ignored.\n";
169
next;
170
}
171
$matches->{ uc ($input_id) } = $column;
172
next;
173
}
174
my $wanted_col = {};
175
if ($input_id eq CFG_PROG or $input_id eq CFG_PROGS or $input_id eq CFG_PERL) {
176
$wanted_col->{CFG_TYPE()} = $input_id;
177
$wanted_col->{CFG_EXT()} = $input_col;
178
if (defined $output_col) {
179
$wanted_col->{CFG_OUT_COL()} = $output_col;
180
} else {
181
warn "[WR10] Missing output column name in configuration line $line_count: '$input_col'.\n";
182
$wanted_col->{CFG_OUT_COL()} = '';
183
}
184
} else {
185
$wanted_col->{CFG_TYPE()} = CFG_MATCH;
186
$wanted_col->{CFG_ID()} = $input_id;
187
$wanted_col->{CFG_IN_COL()} = $input_col;
188
$wanted_col->{CFG_OUT_COL()} = (defined $output_col ? $output_col : $input_col);
189
$known_inputs->{$input_id} = 1;
190
}
191
push (@$wanted_cols, $wanted_col);
192
}
193
close $config;
194
195
# prepare for calculated columns
196
foreach my $col (@$wanted_cols) {
197
if ($col->{CFG_TYPE()} eq CFG_PROG or $col->{CFG_TYPE()} eq CFG_PROGS) {
198
199
# locate the external program
200
$col->{CFG_EXT()} = find_prog ($col->{CFG_EXT()});
201
202
} elsif ($col->{CFG_TYPE()} eq CFG_PERL) {
203
204
# load the wanted Perl module
205
my $call = $col->{CFG_EXT()};
206
$call =~ m{^(.+)((::)|(->))(.*)$};
207
my $module = $1;
208
my $subroutine = $5;
209
my $how_to_call = $2; # can be '::' or '->'
210
unless ($module) {
211
warn "[WR11] Missing module name in '[PERL] " . $col->{CFG_OUT_COL()} . "'. Column ignored.\n";
212
$col->{ignored} = 1;
213
next;
214
}
215
unless ($subroutine) {
216
warn "[WR12] Missing subroutine name in '[PERL] " . $col->{CFG_OUT_COL()} . "'. Column ignored.\n";
217
$col->{ignored} = 1;
218
next;
219
}
220
if ($module =~ m{^:+}) {
221
warn "[WR13] Uncomplete module name in '[PERL] " . $col->{CFG_OUT_COL()} . "'. Column ignored.\n";
222
$col->{ignored} = 1;
223
next;
224
}
225
eval "require $module"; ## no critic
226
if ($@) {
227
warn "[WR14] Cannot load module '$module': $@. Column '" . $col->{CFG_OUT_COL()} . " ignored\n";
228
$col->{ignored} = 1;
229
next;
230
}
231
$module->import();
232
233
# remember what we just parsed and checked
234
$col->{PERL_DETAILS()} = {};
235
$col->{PERL_DETAILS()}->{what_to_call} = $module . $how_to_call . $subroutine;
236
$col->{PERL_DETAILS()}->{module} = $module;
237
$col->{PERL_DETAILS()}->{subroutine} = $subroutine;
238
$col->{PERL_DETAILS()}->{how_to_call} = $how_to_call;
239
}
240
}
241
$wanted_cols = [ grep { not $_->{ignored} } @$wanted_cols ];
242
243
# locate expected inputs
244
my $primary_input; # ID of the first input
245
foreach my $opt_input (@opt_inputs) {
246
my ($key, $value) = split (m{\s*=\s*}, $opt_input, 2);
247
next unless $key;
248
next unless $value;
249
$key = uc ($key);
250
unless (exists $known_inputs->{$key}) {
251
warn "[WR03] Configuration does not recognize the input named '$key'. Input ignored.\n";
252
next;
253
}
254
unless (exists $matches->{$key}) {
255
warn "[WR04] Input named '$key' does not have any MATCH column defined in configuration. Input ignored.\n";
256
next;
257
}
258
$primary_input = $key unless $primary_input; # remember which input came first
259
my $input = { INPUT_FILE() => $value,
260
INPUT_MATCHED_BY() => $matches->{$key} };
261
if ($value =~ m{\.csv$}i) {
262
$input->{INPUT_TYPE()} = TYPE_CSV;
263
} else {
264
$input->{INPUT_TYPE()} = TYPE_TSV;
265
}
266
$inputs->{$key} = $input;
267
}
268
die "[ER01] No valid inputs specified. Exiting.\n"
269
unless scalar keys (%$inputs) > 0;
270
271
# read headers from all inputs
272
my $headers_by_id = {}; # used for re-using the same headers once read, and for some checks
273
foreach my $input_id (keys %$inputs) {
274
my $input = $inputs->{$input_id};
275
my $headers;
276
if (exists $headers_by_id->{$input_id}) {
277
$headers = $headers_by_id->{$input_id}; # copy already known headers
278
} else {
279
$headers = read_headers ($input);
280
}
281
282
# add new properties to $input
283
unless (exists $headers->{ $input->{INPUT_MATCHED_BY()} }) {
284
warn ("[WR05] Input '$input_id' does not contain the matching header '" . $input->{INPUT_MATCHED_BY()} .
285
"'. Input ignored\n");
286
delete $inputs->{$input_id};
287
next;
288
}
289
$headers_by_id->{$input_id} = $headers
290
unless exists $headers_by_id->{$input_id};
291
$input->{INPUT_HEADERS()} = $headers;
292
$input->{INPUT_MATCHED_BY_INDEX()} = $headers->{ $input->{INPUT_MATCHED_BY()} };
293
}
294
295
# check real headers vs. headers as defined in configuration
296
my $already_reported = {};
297
foreach my $col (@$wanted_cols) {
298
next if $col->{CFG_TYPE()} ne CFG_MATCH; # check is done only for normal columns
299
my $input_id = $col->{CFG_ID()};
300
if (exists $headers_by_id->{$input_id}) {
301
# does the requested column exist in this input's headers?
302
unless (column_exists ($input_id, $col->{CFG_IN_COL()})) {
303
warn "[WR06] Column '$col->{CFG_IN_COL()}' not found in the input '$input_id'. Column will be ignored.\n";
304
$col->{ignored} = 1;
305
}
306
next;
307
308
} elsif (!exists $already_reported->{$input_id}) {
309
$already_reported->{$input_id} = 1;
310
warn "[WR07] Configuration defines columns from an input '$input_id' but no such input given (or was ignored). These columns will be ignored.\n";
311
}
312
$col->{ignored} = 1;
313
}
314
$wanted_cols = [ grep { not $_->{ignored} } @$wanted_cols ];
315
316
foreach my $input_id (keys %$matches) {
317
next unless exists $inputs->{$input_id}; # ignoring matches whose inputs are already ignored
318
# does the matching column exist in this input's headers?
319
unless (column_exists ($input_id, $matches->{$input_id})) {
320
die "[ER02] Matching column '$matches->{$input_id}' not found in the input '$input_id'. Must exit.\n";
321
}
322
}
323
324
# do we still have a primary input?
325
unless (exists $inputs->{$primary_input}) {
326
die "[ER03] Due to errors, the primary input '$primary_input' is now ignored. Must exit.\n";
327
}
328
329
# end of checking
330
exit (0) if $opt->check;
331
332
# read all inputs into memory
333
foreach my $input_id (keys %$inputs) {
334
my $input = $inputs->{$input_id};
335
my $content = read_content ($input);
336
$input->{INPUT_CONTENT()} = $content;
337
}
338
339
# output combined headers
340
my @header_line = ();
341
foreach my $col (@$wanted_cols) {
342
push (@header_line, $col->{CFG_OUT_COL()});
343
}
344
print $combined join ("\t", @header_line) . "\n"
345
unless scalar @header_line == 0;
346
347
# combine all inputs and make output lines
348
foreach my $matching_content (sort keys %{ $inputs->{$primary_input}->{INPUT_CONTENT()} }) {
349
# $matching_content is, for example, a publication title ("An Atlas of....")
350
351
# inputs may have more lines with the same value in the matching columns
352
# therefore, extract first the matching lines from all inputs
353
my $lines_to_combine = [];
354
my $inputs_to_combine = {}; # keys are inputs' CFG_IDs, values are indeces into $lines_to_combine
355
356
foreach my $col (@$wanted_cols) {
357
if ($col->{CFG_TYPE()} eq CFG_MATCH) {
358
unless (exists $inputs_to_combine->{ $col->{CFG_ID()} }) {
359
# remember the same lines (from the same input) only once
360
my $input = $inputs->{ $col->{CFG_ID()} };
361
push (@$lines_to_combine, $input->{INPUT_CONTENT()}->{$matching_content} || [undef]);
362
$inputs_to_combine->{ $col->{CFG_ID()} } = $#$lines_to_combine;
363
}
364
}
365
}
366
367
# make all combinantions of matching lines
368
369
# let's have 3 inputs, identified by K, L and M
370
# there are three matching lines in K, two in L and one in M:
371
# my $lines_to_combine = [
372
# [ "line1", "line2", "line3", ], # from input K
373
# [ "lineX", "lineY", ], # from input L
374
# [ "lineQ", ], # from input M
375
# );
376
# my $inputs_to_combine = { K => 0, L => 1, M => 2 };
377
#
378
# the subroutine create_output_line() will be called 6 times
379
# with the following arguments:
380
# line1, lineX, lineQ
381
# line1, lineY, lineQ
382
# line2, lineX, lineQ
383
# line2, lineY, lineQ
384
# line3, lineX, lineQ
385
# line3, lineY, lineQ
386
387
NestedLoops ($lines_to_combine,
388
sub {
389
my @input_lines = @_;
390
my @output_line = ();
391
my @calculated = (); # indeces of the yet-to-be-calculated elements
392
my $column_count = -1;
393
foreach my $col (@$wanted_cols) { # $col defines what data to push into @output_line
394
$column_count++;
395
if ($col->{CFG_TYPE()} eq CFG_MATCH) {
396
my $input = $inputs->{ $col->{CFG_ID()} };
397
my $input_line = @input_lines[$inputs_to_combine->{ $col->{CFG_ID()} }];
398
# use Data::Dumper;
399
# print Dumper (\@input_lines);
400
# print Dumper ($inputs_to_combine);
401
my $idx = $input->{INPUT_HEADERS()}->{ $col->{CFG_IN_COL()} };
402
my $value = $input_line->[$idx] || '';
403
push (@output_line, $value);
404
} else {
405
push (@calculated, $column_count);
406
push (@output_line, '');
407
}
408
}
409
# insert the calculated columns
410
foreach my $idx (@calculated) {
411
if ($wanted_cols->[$idx]->{CFG_TYPE()} eq CFG_PROG) {
412
$output_line[$idx] = call_prog ($wanted_cols->[$idx], \@header_line, \@output_line);
413
} elsif ($wanted_cols->[$idx]->{CFG_TYPE()} eq CFG_PROGS) {
414
$output_line[$idx] = call_prog_simple ($wanted_cols->[$idx]);
415
} elsif ($wanted_cols->[$idx]->{CFG_TYPE()} eq CFG_PERL) {
416
$output_line[$idx] = call_perl ($wanted_cols->[$idx], \@header_line, \@output_line);
417
}
418
}
419
420
print $combined join ("\t", @output_line) . "\n"
421
unless scalar @output_line == 0;
422
});
423
}
424
close $combined if $opt_outfile;
425
}
426
427
# ----------------------------------------------------------------
428
# Call a Perl subroutine (from any module) in order to get a value for
429
# a "calculated" column. $column defines which column to fill,
430
# $header_line is an arra is an arrayref with column headers and the
431
# $data_line is another arrayref with the values for the current row.
432
#
433
# $column->{PERL_DETAILS} contains all details needed for the call
434
# ----------------------------------------------------------------
435
sub call_perl {
436
my ($column, $header_line, $data_line) = @_;
437
438
no strict; ## no critic
439
my $what_to_call = $column->{PERL_DETAILS()}->{what_to_call};
440
my $how_to_call = $column->{PERL_DETAILS()}->{how_to_call};
441
my $module = $column->{PERL_DETAILS()}->{module};
442
my $subroutine = $column->{PERL_DETAILS()}->{subroutine};
443
444
if ($how_to_call eq '->') {
445
return $module->$subroutine ($column, $header_line, $data_line);
446
} else {
447
return &$what_to_call ($column, $header_line, $data_line);
448
}
449
}
450
451
# ----------------------------------------------------------------
452
# Call an external program in order to get a value for a "calculated"
453
# column. $column defines which column to fill, $header_line is an
454
# arra is an arrayref with column headers and the $data_line is
455
# another arrayref with the values for the current row.
456
#
457
# $column->{CFG_EXT} contains a program name to call
458
# ----------------------------------------------------------------
459
sub call_prog {
460
my ($column, $header_line, $data_line) = @_;
461
462
# prepare an input file for the external program
463
my $tmp = File::Temp->new();
464
for (my $i = 0; $i < @$header_line;$i++) {
465
print $tmp $header_line->[$i] . "\t" . $data_line->[$i] . "\n";
466
}
467
468
# call it
469
return _call_it ($column->{CFG_EXT()}, $tmp);
470
}
471
472
# ----------------------------------------------------------------
473
# Call an external program (without any command-line arguments) in
474
# order to get a value for a "calculated" column.
475
#
476
# $column->{CFG_EXT} contains a program name to call
477
# ----------------------------------------------------------------
478
sub call_prog_simple {
479
my ($column) = @_;
480
return _call_it ($column->{CFG_EXT()});
481
}
482
483
# ----------------------------------------------------------------
484
#
485
# ----------------------------------------------------------------
486
sub _call_it {
487
my @command = @_;
488
my ($stdout, $stderr, $success, $exit_code) = capture_exec (@command);
489
if ($exit_code != 0 or $stderr) {
490
my $errmsg = '[ER05] Failed command: ' . join (' ', map {"'$_'"} @command) . "\n";
491
$errmsg .= "STDERR: $stderr\n" if $stderr;
492
$errmsg .= "EXIT CODE: $exit_code\n";
493
die $errmsg;
494
}
495
chomp $stdout; # remove the last newline
496
$stdout =~ s{\n}{ }g; # better to replace newlines
497
return $stdout;
498
}
499
500
# ----------------------------------------------------------------
501
# Locate given $prgname and return it, usually with an added path. Or
502
# die if such program cannot be found or it is not executable.
503
# ----------------------------------------------------------------
504
sub find_prog {
505
my $prgname = shift;
506
my $full_name;
507
508
# 1) try the name as it is (e.g. the ones with an absolute path)
509
if (-e $prgname and -x $prgname and
510
File::Spec->file_name_is_absolute ($prgname)) {
511
return $prgname;
512
}
513
514
# 2) try to find it on system PATH
515
$full_name = which ($prgname);
516
if ($full_name ) {
517
chomp $full_name;
518
return $full_name;
519
}
520
521
# 3) try the environment variable with a path
522
if (exists $ENV{COMBINE_SHEETS_EXT_PATH}) {
523
$full_name = File::Spec->catfile ($ENV{COMBINE_SHEETS_EXT_PATH}, $prgname);
524
return maybe_die ($full_name);
525
}
526
527
# 4) try to find it in the current directory
528
$full_name = File::Spec->catfile ('./', $prgname);
529
return maybe_die ($full_name);
530
}
531
sub maybe_die {
532
my $prg = shift;
533
die "[ER04] '$prg' not found or is not executable.\n"
534
unless -e $prg and -x $prg;
535
return $prg;
536
}
537
538
# ----------------------------------------------------------------
539
# Does the requested $column exist in the given input's headers?
540
# ----------------------------------------------------------------
541
sub column_exists {
542
my ($input_id, $column) = @_;
543
return exists $inputs->{$input_id}->{INPUT_HEADERS()}->{$column};
544
}
545
546
# ----------------------------------------------------------------
547
# Read the headers (the first line) form an input file (given in
548
# hashref $input) and store them in the hashref $headers, each od them
549
# with its index as it appears in the read file. Do nothing if
550
# $headers already contains headers from the same input identifier.
551
# ----------------------------------------------------------------
552
sub read_headers {
553
my ($input) = @_;
554
555
my $headers;
556
if ($input->{INPUT_TYPE()} eq TYPE_CSV) {
557
$headers = read_csv_headers ($input->{INPUT_FILE()});
558
} else {
559
$headers = read_tsv_headers ($input->{INPUT_FILE()});
560
}
561
my $new_headers = {};
562
my $column_index = 0;
563
foreach my $column (@$headers) {
564
$new_headers->{$column} = $column_index++;
565
}
566
return $new_headers;
567
}
568
569
# ----------------------------------------------------------------
570
#
571
# ----------------------------------------------------------------
572
sub read_csv_headers {
573
my ($file) = @_;
574
my $line = read_first_line ($file);
575
576
my $parser = Text::CSV_XS->new ({
577
allow_loose_quotes => 1,
578
escape_char => "\\",
579
});
580
if ($parser->parse ($line)) {
581
return [ $parser->fields ];
582
} else {
583
die "[ER04] Parsing CSV file $file failed: " .
584
$parser->error_input . "\n" .
585
$parser->error_diag() . "\n";
586
}
587
}
588
589
# ----------------------------------------------------------------
590
#
591
# ----------------------------------------------------------------
592
sub read_tsv_headers {
593
my ($file) = @_;
594
my $line = read_first_line ($file);
595
return [ split (m{\t}, $line) ];
596
}
597
598
# ----------------------------------------------------------------
599
#
600
# ----------------------------------------------------------------
601
sub read_first_line {
602
my ($file) = @_;
603
my $fh;
604
open_bom ($fh, $file); # or open ($fh, '<', $file)
605
# or die "[ER00] Cannot read input file $file: $!\n";
606
my $line = <$fh>; # read just one line
607
close $fh;
608
$line =~ s{(\r|\n)+$}{}; # remove newlines of any kind
609
return $line;
610
}
611
612
# ----------------------------------------------------------------
613
# Stringify a hashref
614
# ----------------------------------------------------------------
615
sub ph {
616
my $hashref = shift;
617
my $result = '';
618
my ($key, $value);
619
while (($key, $value) = each (%$hashref)) {
620
$result .= "$key => $value,";
621
}
622
return substr ($result, 0, -1);
623
}
624
625
# ----------------------------------------------------------------
626
# Read contents...
627
# ----------------------------------------------------------------
628
sub read_content {
629
my ($input) = @_;
630
my $content;
631
if ($input->{INPUT_TYPE()} eq TYPE_CSV) {
632
$content = read_csv_content ($input->{INPUT_FILE()}, $input->{INPUT_MATCHED_BY_INDEX()});
633
} else {
634
$content = read_tsv_content ($input->{INPUT_FILE()}, $input->{INPUT_MATCHED_BY_INDEX()});
635
}
636
return $content;
637
}
638
639
# ----------------------------------------------------------------
640
#
641
# ----------------------------------------------------------------
642
sub read_tsv_content {
643
my ($file, $matched_index) = @_;
644
my $fh;
645
open_bom ($fh, $file); # or open ($fh, '<', $file)
646
# or die "[ER00] Cannot read input file $file: $!\n";
647
my $content = {};
648
my $line_count = 0;
649
while (my $line = <$fh>) {
650
next if $line_count++ == 0; # skip header line
651
next if $line =~ m{^\s*$}; # ignore empty lines
652
$line =~ s{(\r|\n)+$}{}; # remove newlines of any kind
653
my @data = split (m{\t}, $line);
654
$content->{ $data[$matched_index] } = [] unless $content->{ $data[$matched_index] };
655
push (@{ $content->{ $data[$matched_index] } }, [@data]);
656
}
657
close $fh;
658
return $content;
659
}
660
661
# ----------------------------------------------------------------
662
#
663
# ----------------------------------------------------------------
664
sub read_csv_content {
665
my ($file, $matched_index) = @_;
666
my $count_lines = 0;
667
my $content = {};
668
669
# create a CSV parser; any error in reading input will be fatal
670
my $csv = Text::CSV_XS->new ({
671
allow_loose_quotes => 1,
672
escape_char => "\\",
673
auto_diag => 1,
674
});
675
676
# read the CSV input
677
open_bom (my $fh, $file);
678
while (<$fh>) {
679
if ($csv->parse ($_)) {
680
next if $count_lines++ == 0; # headers are ignored
681
my @data = $csv->fields;
682
if (@data) {
683
push (@{ $content->{ $data[$matched_index] } }, \@data);
684
}
685
} else {
686
my $err = $csv->error_input;
687
warn "[WR09] Possible a wrong or not-readable input file '$file': $err\n";
688
exit (1);
689
}
690
}
691
692
# $parser->add_trigger (after_parse => sub {
693
# my ($self, $data) = @_;
694
# return if $count_lines++ == 0; # headers are ignored
695
# $content->{ $data->[$matched_index] } = [] unless $content->{ $data->[$matched_index] };
696
# push (@{ $content->{ $data->[$matched_index] } }, $data);
697
# });
698
# read CSV input (the result is not used here; everything is done in triggers)
699
# $parser->read_file ($file);
700
701
return $content;
702
}
703
1;
704
705
706
707
=pod
708
709
=head1 NAME
710
711
App::combinesheets - command-line tool merging CSV and TSV spreadsheets
712
713
=head1 VERSION
714
715
version 0.2.14
716
717
=head1 SYNOPSIS
718
719
combinesheets -h
720
combinesheets -help
721
combinesheets -man
722
combinesheets -version
723
724
combinesheets -config -inputs [] [-outfile ]
725
726
where has the form: = [=...]
727
where are: -check
728
729
=head1 DESCRIPTION
730
731
B is a command-line tool merging together two or more
732
spreadsheets. The spreadsheets can be COMMA-separated or TAB-separated
733
files, each of them having the first line with column headers. Data in
734
one of the column (it can be a different column in each input
735
spreadsheet) serve to match lines. For example, having two spreadsheets,
736
PERSON and CAR, with the following contents:
737
738
persons.tsv:
739
740
Surname First name Sex Age Nickname
741
Novak Jan M 52 Honza
742
Gudernova Jitka F 56
743
Senger Martin M 61 Tulak
744
745
cars.tsv:
746
747
Model Year Owned by
748
Praga 1936 Someone else
749
Mini 1968 Gudernova
750
Skoda 2002 Senger
751
752
we want to merge these spreadsheet by C in persons.tsv and by
753
C in cars.tsv. There are two possible results, depending
754
which spreadsheet is used as the first one (a primary one). If the
755
persons.tsv is the first, the result will be (which columns are
756
included in the result will be described later in this document):
757
758
combinesheets -cfg config.cfg -in PERSON=persons.tsv CAR=cars.csv
759
760
First name Surname Model Sex Nickname Age Year Owned by
761
Jitka Gudernova Mini F 56 1968 Gudernova
762
Jan Novak M Honza 52
763
Martin Senger Skoda M Tulak 61 2002 Senger
764
765
Or, if the cars.tsv is the first, the result will be:
766
767
combinesheets -cfg config.cfg -in CAR=cars.csv PERSON=persons.tsv
768
769
First name Surname Model Sex Nickname Age Year Owned by
770
Jitka Gudernova Mini F 56 1968 Gudernova
771
Martin Senger Skoda M Tulak 61 2002 Senger
772
Praga 1936 Someone else
773
774
Of course, if both input spreadsheets have only the matching lines,
775
both results will be the same (it will not matter which one of them is
776
considered the primary one).
777
778
The rows in the resulting spreadsheet are sorted by values in the
779
column that was used as a matching column in the primary input.
780
781
The information which columns should be used to match the input
782
spreadsheets and which columns should appear in the resulting
783
spreadsheet is read from a configuration file (see the C<-config>
784
- or C<-cfg> - argument).
785
786
The command-line arguments and options can be specified with single or
787
double dash. Most of them can be abbreviated to the nearest non-biased
788
length. They are case-sensitive.
789
790
=head2 Duplicated values in the matching columns
791
792
If there are repeated (the same) values in the column that serves as
793
matching criterion then the resulting spreadsheet will have as many
794
output lines (for a particular matching value) as is the number of all
795
combinations of the lines with that matching values in all input
796
spreadsheets. For example, let's have C and C,
797
assuming that a book can have more authors and any author can
798
contribute to any number of books:
799
800
books.tsv:
801
Title Note Author
802
Book 1 from B1-a Kim
803
Book 2 from B2-b Kim
804
Book 3 from B3-c Katrin
805
Book 1 from B1-d Blanka
806
Book 2 from B2-e Katrin
807
808
authors.tsv:
809
Age Name
810
28 Kim
811
20 Katrin
812
30 Blanka
813
50 Lazy author
814
815
The output (again, depending on which input is considered a primary
816
input) will be (a list of included column is defined in the
817
configuration file - see later):
818
819
combinesheets -cfg books_to_authors.cfg -in BOOK=books.tsv AUTHOR=authors.tsv
820
821
Name Title Age Note
822
Blanka Book 1 30 from B1-d
823
Katrin Book 3 20 from B3-c
824
Katrin Book 2 20 from B2-e
825
Kim Book 1 28 from B1-a
826
Kim Book 2 28 from B2-b
827
828
combinesheets -cfg books_to_authors.cfg -in AUTHOR=authors.tsv BOOK=books.tsv
829
830
Name Title Age Note
831
Blanka Book 1 30 from B1-d
832
Katrin Book 3 20 from B3-c
833
Katrin Book 2 20 from B2-e
834
Kim Book 1 28 from B1-a
835
Kim Book 2 28 from B2-b
836
Lazy author 50
837
838
=head1 ADVANCED USAGE
839
840
Additionally to the merging columns from one or more spreadsheets,
841
this script can also add completely new columns to the resulting
842
spreadsheet, the columns that do not exist in any of the input
843
spreadsheet. Such columns are called C.
844
845
Each C is created either by an external,
846
command-line driven, program, or by a Perl subroutine. In both cases,
847
the user must create (write) such external program or such Perl
848
subroutine. Therefore, this usage is meant more for developers than
849
for the end users.
850
851
Note that this advanced feature is meant only for new columns, not for
852
new rows. Therefore, it cannot be used, for example, to create rows
853
with totals of columns.
854
855
=head2 Calculated columns by external programs
856
857
If specified, an external program is invoked for each row. It can be
858
specified either by a keyword B or by a keyword B - see
859
syntax in the I section. In both cases, the
860
value of the standard output of these programs become the value of the
861
calculated column (a trailing newline of this standard output is
862
removed and other newlines are replaced by spaces).
863
864
A program defined by the B is called without any arguments
865
(C in I stands for a I). That's why it does not have
866
any knowledge for which row it has been invoked. Its usage is,
867
therefore, for column values that are not dependent on other values
868
from the spreadsheet. For example, for the C shown above,
869
you can add a column C by calling a UNIX program C
870
- again, see an example the I
871
section.
872
873
A program defined by the B is called with one argument which is
874
a filename. This file contains the current row; each of its lines has
875
two, TAB-separated, fields. The first field is the column name and the
876
second field is the column value. For example, when processing the
877
last row of the C given above, the file will have the
878
following content:
879
880
Model Skoda
881
Year 2002
882
Owned by Senger
883
884
The files are only temporary and will be removed when
885
C finishes.
886
887
=head2 Calculated columns by a Perl subroutine
888
889
If specified by the keyword B, a Perl subroutine is called for
890
each row with the three arguments:
891
892
=over
893
894
=item 1
895
896
A hashref with information about the current column. Not often used
897
but may be handy if the same subroutine deals with more columns and,
898
therefore, needs to know for which column it was invoked. See the
899
I example in the I section.
900
901
=item 2
902
903
An arrayref with all column names.
904
905
=item 3
906
907
An arrayref with all column values - in the same order as the column
908
names.
909
910
=back
911
912
Actually, depending how the subroutine is defined in the
913
configuration, it may get as the first argument the module/class name
914
where it belongs to. If you define it like this:
915
916
PERL Module::Example::test
917
918
the C subroutine is called, indeed, with the three arguments as
919
described above. However, if your definition is rather:
920
921
PERL Module::Example->test
922
923
then the C subroutine is considered a Perl method and its first
924
argument is the module/class name. It is up to you to decide how you
925
want/need to write your functions. Again, an example is available in
926
the I section.
927
928
The return value of the subroutine will become a new value in the
929
calculated column. Do not return undef but rather an empty string if
930
the value cannot be created.
931
932
What is an advantage of writing my own module/package if I can simply
933
write an external program (perhaps also in Perl) doing exactly the
934
same? The Perl module stays in the memory for the whole time of
935
processing all input rows and, therefore, you can re-use some
936
calculations done for the previous rows. An example about it
937
(C) is given in the I
938
section.
939
940
=head1 ARGUMENTS and OPTIONS
941
942
=over 4
943
944
=item B<-config >
945
946
A filename with a configuration file. This is a mandatory
947
parameter. The configuration file describes:
948
949
=over
950
951
=item *
952
953
which columns in individual input spreadsheets should be
954
included in the resulting spreadsheet,
955
956
=item *
957
958
what names should be given to the resulting columns
959
960
=item *
961
962
in which order should be the columns in the resulting
963
spreadsheet
964
965
=item *
966
967
which columns should be used to match individual lines,
968
969
=back
970
971
The configuration file is a TAB-separated file (with no header
972
line). Empty lines and lines starting with a "#" character are
973
ignored. Each line has two columns, in some cases there is an optional
974
third column. Here is a configuration file used in the example above:
975
976
# Columns to match records from individual inputs
977
MATCH PERSON=Surname
978
MATCH CAR=Owned by
979
MATCH CHILD=Parent
980
981
# Columns - how they be in rows
982
PERSON First name
983
PERSON Surname
984
CAR Model
985
PERSON Sex
986
CHILD Name
987
CHILD Born
988
PERSON Nickname
989
PERSON Age
990
CAR Year
991
CAR Owned by
992
993
The first column is either a reserved word C, or an identifier
994
of an input spreadsheet. There are also few other reserved words - see
995
more about them a bit later.
996
997
The identifier can be almost anything (and it does not appear in the
998
input spreadsheet itself). It is also used in the command-line
999
argument C<-inputs> where it corresponds to a real file name of the
1000
input. The lines with identifiers define what columns will be in the
1001
result: the second column is the header of the wanted columns and an
1002
optional third column (not used in the example above) is the header
1003
used in the result. The resulting columns will be in the same order as
1004
are these lines in the configuration file.
1005
1006
The reserved word C is used to define how to match lines in the
1007
input spreadsheets. The format of its second column is:
1008
1009
=
1010
1011
There should be one MATCH line for each input spreadsheet. The data in
1012
the column defined by the "column-header" will be used to find the
1013
corresponding lines. In our example, the data in the column I
1014
in the C will be matched with the data in the column
1015
I in the C (the rows having the same values in
1016
these two columns will be merged into one resulting row).
1017
1018
B
1019
1020
If you want to add so-called I as described in the
1021
L"ADVANCED USAGE"> you need to use few additional reserved words in the
1022
configuration file. These words are B, B and/or
1023
B. They are used in the place where the new calculated column
1024
should be placed. Their lines have the program name or the Perl
1025
subroutine name in the second column, and they have mandatory third
1026
column with the resulting name of the calculated column.
1027
1028
For example, we wish to add two columns to the input spreadsheet
1029
C. The input file (the same as in the introduction) is:
1030
1031
Model Year Owned by
1032
Praga 1936 Someone else
1033
Mini 1968 Gudernova
1034
Skoda 2002 Senger
1035
1036
We wish to add a column I that shows the difference between
1037
the actual year and the value from the I column. We have a
1038
shell script C doing it:
1039
1040
#!/bin/bash
1041
YEAR=`grep Year $1 | cut -f2`
1042
NOW=`date +%Y`
1043
echo $(($NOW-$YEAR))
1044
1045
The configuration file C (assuming that we want the other
1046
columns to remain the same) is:
1047
1048
MATCH CAR=Owned by
1049
1050
CAR Owned by
1051
CAR Model
1052
CAR Year
1053
PROG age.sh Car age
1054
1055
When we run:
1056
1057
combinesheets -config cars.cfg -inputs CAR=cars.tsv
1058
1059
we get this result:
1060
1061
Owned by Model Year Car age
1062
Gudernova Mini 1968 44
1063
Senger Skoda 2002 10
1064
Someone else Praga 1936 76
1065
1066
You can see that there is no need to use C for really
1067
combining I sheets, an input can be just one sheet.
1068
1069
Another example adds a I column to the same input, a column
1070
named I that gets its value from a UNIX command
1071
C. This program does not get any information which row it has
1072
been invoked for. The configuration file is now (note the new line
1073
with the B):
1074
1075
MATCH CAR=Owned by
1076
1077
CAR Owned by
1078
CAR Model
1079
CAR Year
1080
PROG age.sh Car age
1081
PROGS date Last updated
1082
1083
and the result is now:
1084
1085
Owned by Model Year Car age Last updated
1086
Gudernova Mini 1968 44 Mon Feb 27 12:32:04 AST 2012
1087
Senger Skoda 2002 10 Mon Feb 27 12:32:04 AST 2012
1088
Someone else Praga 1936 76 Mon Feb 27 12:32:04 AST 2012
1089
1090
The last possibility is to call a Perl subroutine - using the reserved
1091
word B in the configuration file. Let's have an input
1092
spreadsheet (C) with data about flights:
1093
1094
Date Flight Airport From Airport To
1095
2009-01-18 AY838 London LHR Helsinki Vantaa
1096
2009-01-22 AY839 Helsinki Vantaa London LHR
1097
2009-03-15 NW2 Manila Tokyo Narita
1098
2009-03-21 NW1 Tokyo Narita Manila
1099
2011-05-06 SV326 Sharm El Sheik Jeddah
1100
2011-07-31 RJ700 Amman Jeddah
1101
2011-09-21 ME369 Jeddah Beirut
1102
2011-09-24 ME368 Beirut Jeddah
1103
2011-12-02 EZY3064 Prague London Stansted
1104
2011-12-09 EZY3067 London Stansted Prague
1105
2012-01-26 MS663 Cairo Jeddah
1106
1107
We want to add columns with the international airport codes for both
1108
I and I. The new columns will be named
1109
I and I. The Perl subroutine will use a web
1110
service to find the code. The subroutine will use a closure that will
1111
remember already fetched codes so the web service does not need to be
1112
called several times for the same airport name.
1113
1114
The configuration file C is:
1115
1116
MATCH FLY=Date
1117
1118
FLY Date
1119
FLY Flight
1120
FLY Airport From
1121
PERL Airport->find_code Code From
1122
FLY Airport To
1123
PERL Airport->find_code Code To
1124
1125
The name of the subroutine is attached to the module where it comes
1126
from by either B<::> or B<-E> notation.
1127
1128
The invocation is:
1129
1130
combinesheets -config flights.cfg -inputs FLY=flights.tsv
1131
1132
The full code for the module C, the file C is
1133
here:
1134
1135
package Airport;
1136
use warnings;
1137
use strict;
1138
1139
use LWP::Simple;
1140
use JSON;
1141
1142
# preparing a closure in order not to fetch the same airport code again and again
1143
my $already_found = make_already_found();
1144
sub make_already_found {
1145
my $already_found = {};
1146
return sub {
1147
my ($airport_name, $airport_code) = @_;
1148
if (exists $already_found->{$airport_name}) {
1149
if ($airport_code) {
1150
$already_found->{$airport_name} = $airport_code;
1151
}
1152
return $already_found->{$airport_name};
1153
} else {
1154
$already_found->{$airport_name} = ($airport_code ? $airport_code : 1);
1155
return 0;
1156
}
1157
}
1158
}
1159
1160
sub find_code {
1161
my ($class, $column, $header_line, $data_line) = @_;
1162
1163
my $column_with_airport_name = $column->{ocol};
1164
$column_with_airport_name =~ s{Code}{Airport};
1165
1166
my $airport_name;
1167
for (my $i = 0; $i < @$header_line; $i++) {
1168
if ($header_line->[$i] eq $column_with_airport_name) {
1169
$airport_name = $data_line->[$i];
1170
last;
1171
}
1172
}
1173
return '' unless $airport_name;
1174
1175
# now we have an airport name...
1176
my $airport_code = $already_found->($airport_name);
1177
return $airport_code if $airport_code;
1178
1179
#... go and find its airport code
1180
$airport_code = '';
1181
my $escaped_airport_name = $airport_name;
1182
$escaped_airport_name =~ tr{ }{+};
1183
my $url = "http://airportcode.riobard.com/search?q=$escaped_airport_name&fmt=json";
1184
my $content = get ($url);
1185
warn "Cannot get a response for '$url'\n"
1186
unless defined $content;
1187
my $json = JSON->new->allow_nonref;
1188
my $data = $json->decode ($content);
1189
foreach my $code (@$data) {
1190
$airport_code .= $code->{code} . ",";
1191
}
1192
chop ($airport_code) if $airport_code; # removing the trailing comma
1193
1194
$already_found->($airport_name, $airport_code);
1195
return $airport_code;
1196
}
1197
1;
1198
1199
When run it creates the following output. Note that some airports have
1200
more than one code because the name was ambiguous. Well, this is just
1201
an example, isn't it?
1202
1203
Date Flight Airport From Code From Airport To Code To
1204
2009-01-18 AY838 London LHR LHR Helsinki Vantaa HEL
1205
2009-01-22 AY839 Helsinki Vantaa HEL London LHR LHR
1206
2009-03-15 NW2 Manila MXA,MNL Tokyo Narita NRT
1207
2009-03-21 NW1 Tokyo Narita NRT Manila MXA,MNL
1208
2011-05-06 SV326 Sharm El Sheik SSH Jeddah JED
1209
2011-07-31 RJ700 Amman ADJ,AMM Jeddah JED
1210
2011-09-21 ME369 Jeddah JED Beirut BEY
1211
2011-09-24 ME368 Beirut BEY Jeddah JED
1212
2011-12-02 EZY3064 Prague PRG London Stansted STN
1213
2011-12-09 EZY3067 London Stansted STN Prague PRG
1214
2012-01-26 MS663 Cairo CAI,CIR Jeddah JED
1215
1216
=item B<-inputs = [=...]>
1217
1218
Each C<-inputs> can have one or more file names, and there can be one
1219
or more C<-inputs> arguments. It defines what are the input
1220
spreadsheets and how they are identified in the configuration file
1221
(see the C<-config> argument). For example, the inputs for our example
1222
above can be specified in any of these ways:
1223
1224
-inputs PERSON=persons.tsv -inputs CAR=cars.tsv
1225
-inputs PERSON=persons.tsv CAR=cars.tsv
1226
-inputs PERSON=persons.tsv,CAR=cars.tsv
1227
1228
The first file name is considered to be the C input (see the
1229
description above): the resulting spreadsheet will have the same
1230
number of lines as the primary input.
1231
1232
The file names ending with the C<.csv> are considered to be in the
1233
COMMA-separated formats, all others are considered to be
1234
TAB-separated.
1235
1236
This is a mandatory parameter.
1237
1238
=item B<-outfile >
1239
1240
An optional parameter specifying a filename of the combined result. By
1241
default, it is created on STDOUT. It is always in the TAB-separated
1242
format.
1243
1244
=item B<-check>
1245
1246
This option causes that the configuration file and the input files
1247
(only their header lines will be read) will be checked for errors but
1248
no resulting spreadsheet will be created.
1249
1250
=item B<-ignorecases>
1251
1252
Not yet implemented.
1253
1254
=item B
1255
1256
=over 8
1257
1258
=item B<-h>
1259
1260
Print a brief usage message and exits.
1261
1262
=item B<-help>
1263
1264
Print a brief usage message with options and exit.
1265
1266
=item B<-man>
1267
1268
Print a full usage message and exit.
1269
1270
=item B<-version>
1271
1272
Print the version and exit.
1273
1274
=back
1275
1276
=back
1277
1278
=head1 ENVIRONMENT VARIABLES
1279
1280
=head3 COMBINE_SHEETS_EXT_PATH
1281
1282
It contains a path that is used when looking for external programs
1283
(when the reserved words PROG or PROGS are used). For example, the
1284
C directory in the source distribution of this package has an
1285
external program C. The full invocation can be done by:
1286
1287
COMBINE_SHEETS_EXT_PATH=examples bin/combinesheets -cfg examples/cars.cfg --inputs CAR=examples/cars.csv
1288
1289
=head1 DEPENDENCIES
1290
1291
In order to run this tool you need Perl and the following Perl modules
1292
to be installed:
1293
1294
App::Cmd::Simple
1295
Text::CSV::Simple
1296
Text::CSV_XS
1297
File::BOM
1298
Getopt::Long::Descriptive
1299
Pod::Usage
1300
Algorithm::Loops
1301
1302
Optionally (if your configuration file uses the reserved word PROG or
1303
PROGS for calculated columns):
1304
1305
IO::CaptureOutput
1306
1307
=head1 KNOWN BUGS, MISSING FEATURES
1308
1309
=over
1310
1311
=item *
1312
1313
Columns are identified by their header names. There is no way
1314
to identify them simply by their order (column number).
1315
1316
=item *
1317
1318
The input spreadsheet are read first into memory. Which may be
1319
a problem with really huge spreadsheets.
1320
1321
=item *
1322
1323
The inputs can be COMMA-separated or TAB-separated. It would
1324
be perhaps nice to allow also the Excel spreadsheets.
1325
1326
=item *
1327
1328
Comparing header names and rows is case-sensitive only. There
1329
is a plan to implement the option C<-ignorecases>,
1330
1331
=back
1332
1333
Some of these missing features may be implemented later.
1334
1335
=head1 SUPPORT
1336
1337
You can find documentation for this module with the perldoc command.
1338
1339
perldoc App::combinesheets
1340
1341
You can also look for information at:
1342
1343
=over 4
1344
1345
=item * RT: CPAN's request tracker
1346
1347
L
1348
1349
=item * AnnoCPAN: Annotated CPAN documentation
1350
1351
L
1352
1353
=item * CPAN Ratings
1354
1355
L
1356
1357
=item * Search CPAN
1358
1359
L
1360
1361
=back
1362
1363
=head1 AUTHOR
1364
1365
Martin Senger
1366
1367
=head1 COPYRIGHT AND LICENSE
1368
1369
This software is copyright (c) 2013 by Martin Senger, CBRC - KAUST (Computational Biology Research Center - King Abdullah University of Science and Technology) All Rights Reserved..
1370
1371
This is free software; you can redistribute it and/or modify it under
1372
the same terms as the Perl 5 programming language system itself.
1373
1374
=cut
1375
1376
1377
__END__