Coverage Summary
File Coverage
blib/lib/Convert/MRC.pm
Criterion
Covered
Total
%
statement
57
457
12.4
branch
6
238
2.5
condition
0
91
0.0
subroutine
16
31
51.6
pod
6
6
100.0
total
85
823
10.3
line
stmt
bran
cond
sub
pod
time
code
1
#
2
# This file is part of Convert-MRC
3
#
4
# This software is copyright (c) 2013 by Alan K. Melby.
5
#
6
# This is free software; you can redistribute it and/or modify it under
7
# the same terms as the Perl 5 programming language system itself.
8
#
9
# MRC to TBX converter
10
# written June-Nov 2008 by Nathan E. Rasmussen
11
# Modified 2013 by Nathan G. Glenn
12
13
# Example input data follows:
14
15
# TEST DATA HERE
16
17
package Convert::MRC;
18
5
5
445899
use strict;
5
17
5
179
19
5
5
28
use warnings;
5
8
5
134
20
5
5
2422
use Data::Dumper;
5
6796
5
386
21
5
5
32
use Carp;
5
9
5
320
22
5
5
12265
use English qw(-no_match_vars);
5
31921
5
42
23
24
5
5
7358
use Log::Message::Simple qw (:STD);
5
391518
5
1117
25
26
#import global constants used in processing
27
5
5
4030
use Convert::MRC::Variables;
5
17
5
1527
28
29
# ABSTRACT: CONVERT MRC TO TBX-BASIC
30
our $VERSION = '4.03'; # VERSION
31
32
5
5
6633
use open ':encoding(utf8)', ':std'; # incoming/outgoing data will be UTF-8
5
8819
5
38
33
34
our @origARGV = @ARGV;
35
local @ARGV = (q{-}) unless @ARGV; # if no filenames given, take std input
36
37
#use batch() if called as a script
38
__PACKAGE__->new->batch(@ARGV) unless caller;
39
40
#allows us to get some kind of version string during development, when $VERSION is undefined
41
#($VERSION is inserted by a dzil plugin at build time)
42
sub _version {
43
## no critic (ProhibitNoStrict)
44
5
5
131448
no strict 'vars';
5
17
5
7446
45
0
0
0
0
return $VERSION || q{??};
46
}
47
48
49
sub new {
50
5
5
1
40348
my ($class) = @_;
51
5
28
my $self = bless {}, $class;
52
5
64
$self->_init;
53
5
20
return $self;
54
}
55
56
sub _init {
57
5
5
14
my ($self) = @_;
58
5
36
$self->input_fh( \*STDIN );
59
5
35
$self->tbx_fh( \*STDOUT );
60
5
26
$self->log_fh( \*STDERR );
61
5
9
return;
62
}
63
64
65
sub tbx_fh {
66
## no critic (RequireBriefOpen)
67
5
5
1
15
my ( $application, $fh ) = @_;
68
5
50
23
if ($fh) {
69
5
50
25
if ( ref($fh) eq 'GLOB' ) {
70
5
15
$application->{tbx_fh} = $fh;
71
}
72
else {
73
0
0
0
open my $fh2, '>', $fh or die "Couldn't open $fh";
74
0
0
$application->{tbx_fh} = $fh2;
75
}
76
}
77
5
16
return $application->{tbx_fh};
78
}
79
80
81
sub log_fh {
82
## no critic (RequireBriefOpen)
83
5
5
1
18
my ( $application, $fh ) = @_;
84
5
50
93
if ($fh) {
85
5
50
25
if ( ref($fh) eq 'GLOB' ) {
86
5
17
$application->{log_fh} = $fh;
87
}
88
else {
89
0
0
0
open my $fh2, '>', $fh or die "Couldn't open $fh";
90
0
0
$application->{log_fh} = $fh2;
91
}
92
}
93
5
15
return $application->{log_fh};
94
}
95
96
#same thing as Log::Message::Simple::error, but verbose is always off.
97
sub _error {
98
0
0
0
my ($msg) = @_;
99
0
0
error $msg, 0;
100
0
0
return;
101
}
102
103
#prints the given message to the current log file handle.
104
sub _log {
105
0
0
0
my ( $self, $message ) = @_;
106
0
0
print { $self->{log_fh} } $message;
0
0
107
0
0
return;
108
}
109
110
111
sub input_fh {
112
## no critic (RequireBriefOpen)
113
5
5
1
13
my ( $application, $fh ) = @_;
114
5
50
41
if ($fh) {
115
5
50
46
if ( ref($fh) eq 'GLOB' ) {
0
116
5
43
$application->{input_fh} = $fh;
117
}
118
#emulate diamond operator
119
elsif ($fh eq q{-}){
120
0
0
$application->{input_fh} = \*STDIN;
121
}
122
else {
123
0
0
0
open my $fh2, '<', $fh or die "Couldn't open $fh";
124
0
0
$application->{input_fh} = $fh2;
125
}
126
}
127
5
17
return $application->{input_fh};
128
}
129
130
131
sub batch {
132
0
0
1
my ( $self, @mrc_files ) = @_;
133
## no critic (ProhibitOneArgSelect)
134
0
for my $mrc (@mrc_files) {
135
136
# find an appropriate name for output and warning files
137
0
my $suffix = _get_suffix($mrc);
138
139
#set output, error and input files
140
0
my $outTBX = "$mrc$suffix.tbx";
141
0
my $outWarn = "$mrc$suffix.log";
142
143
# print STDERR "See $outTBX and $outWarn for output.\n";
144
0
$self->input_fh($mrc);
145
0
$self->log_fh($outWarn);
146
0
$self->tbx_fh($outTBX);
147
148
#convert the input file, sending output to appropriate file handles
149
0
$self->convert;
150
151
# close these so that they are written.
152
0
close $self->log_fh();
153
0
close $self->tbx_fh();
154
155
# close input too, since it's been exhausted.
156
0
close $self->input_fh();
157
158
# print STDERR "Finished processing $mrc.\n";
159
}
160
0
return;
161
}
162
163
# return a file suffix to ensure nothing is overwritten
164
sub _get_suffix {
165
0
0
my ($file_name) = @_;
166
0
my $suffix = q{};
167
0
0
$suffix--
168
while ( -e "$file_name$suffix.tbx" or -e "$file_name$suffix.log" );
169
0
return $suffix;
170
}
171
172
173
sub convert {
174
## no critic (ProhibitOneArgSelect)
175
0
0
1
my ($self) = @_;
176
0
my $select = select $self->{tbx_fh};
177
178
# informative header for the log file
179
0
my $version = _version();
180
0
msg("MRC2TBX converter version $version");
181
182
#if called as a script, output this
183
# if ( not caller ) {
184
# msg( "Called with "
185
# . scalar @origARGV
186
# . " argument"
187
# . ( @origARGV == 1 ? '' : 's' ) . ":\n\t"
188
# . ( join "\t", @origARGV ) );
189
# }
190
191
# set up per-file status flags
192
0
my %header; # contains the header information
193
0
my $segment = 'header'; # header, body, back
194
195
# what's open; need to be accessible in all methods
196
0
$self->{concept} = undef;
197
0
$self->{langSet} = undef;
198
0
$self->{term} = undef;
199
0
$self->{party} = undef;
200
0
$self->{langSetDefined} = 0;
201
202
#array containing all rows for an ID
203
0
$self->{unsortedTerm} = undef;
204
205
0
my @party; # collect all rows for a responsible party
206
my %responsible; # accumulate parties by type
207
0
my ( @idsUsed, @linksMade ); # track these
208
0
my $started = 0; # flag for MRCTermTable line (start-of-processing)
209
0
my $aborted = 0; # flag for early end-of-processing
210
# process the file
211
0
while ( readline( $self->{input_fh} ) ) {
212
213
# eliminate a (totally superfluous) byte-order mark
214
0
0
s/^(?:\xef\xbb\xbf|\x{feff})// if $INPUT_LINE_NUMBER == 1;
215
216
#check for =MRCtermTable at the beginning of the file to begin processing
217
0
0
if (/^=MRCtermTable/i) { # start processing
218
0
$started = 1;
219
0
next;
220
}
221
0
0
next unless $started;
222
223
0
0
next if (/^\s*$/); # if it's only whitespace
224
0
my $row;
225
0
0
next unless $row = $self->_parseRow($_);
226
227
# (if the row won't parse, _parseRow() returns undef)
228
229
# if in header, A row?
230
231
# print STDOUT $segment;
232
# print STDOUT Dumper $row;
233
# A-row: build header
234
0
0
0
if ( $segment eq 'header' && $row->{'ID'} eq 'A' ) {
235
0
0
$self->_buildHeader( $self->_parseRow($_), \%header )
236
or _error "Could not interpret header line $INPUT_LINE_NUMBER, skipped.";
237
}
238
239
# not A-row: print header, segment = body
240
0
0
0
if ( $segment eq 'header' && $row->{'ID'} ne 'A' ) {
241
242
# better have enough for a header!
243
0
0
unless ( $self->_printHeader( \%header ) ) {
244
0
_error
245
"TBX header could not be completed because a required A-row is missing or malformed.";
246
0
$aborted = 1;
247
0
last;
248
}
249
0
$segment = 'body';
250
}
251
252
# if in body, C row?
253
254
# C-row: lots to do
255
0
0
0
if ( $segment eq 'body' && exists $row->{'Concept'} ) {
256
257
# catch a misordered-input problem
258
259
# The next 3 if tests are one action in principle.
260
# Each depends on the preceding, and all depend on the
261
# closeX() subs being no-ops if it's already closed,
262
# and on the fact that nothing follows terms in langSet
263
# or follows langSet in termEntry. Meddle not, blah blah.
264
265
{
266
## no critic (ProhibitNoWarnings)
267
5
5
47
no warnings 'uninitialized';
5
10
5
37092
0
268
## use critic
269
270
# concept, langSet, term might be undef
271
# if new concept, close old and open new
272
0
0
if ( $row->{'Concept'} ne $self->{concept} ) {
273
0
$self->_closeTerm();
274
0
$self->_closeLangSet();
275
0
$self->_closeConcept();
276
277
# open concept
278
0
$self->{concept} = $row->{'Concept'};
279
0
print '\n";
280
281
# (not row ID, which may go further)
282
0
push @idsUsed, 'C' . $self->{concept};
283
}
284
285
# if new langSet ...
286
0
0
0
if ( exists $row->{'LangSet'}
287
&& $row->{'LangSet'} ne $self->{langSet} )
288
{
289
0
$self->_closeTerm();
290
0
$self->_closeLangSet();
291
292
# open langSet
293
0
$self->{langSet} = $row->{'LangSet'};
294
0
print '\n";
295
}
296
297
# if new term ...
298
0
0
0
if ( exists $row->{'Term'}
299
&& $row->{'Term'} ne $self->{term} )
300
{
301
0
$self->_closeTerm();
302
303
# open term
304
0
$self->{term} = $row->{'Term'};
305
0
undef $self->{unsortedTerm}; # redundant
306
0
push @idsUsed,
307
'C' . $self->{concept} . $self->{langSet} . $self->{term};
308
}
309
} # resume warnings on uninitialized values
310
311
# verify legal insertion
312
0
my $level; # determine where we are from row ID
313
0
0
if ( defined $row->{'Term'} ) {
0
0
314
0
$level = 'Term';
315
}
316
elsif ( defined $row->{'LangSet'} ) {
317
0
0
if ( defined $self->{term} ) {
318
0
_error
319
"LangSet-level row out of order in line $INPUT_LINE_NUMBER, skipped.";
320
0
next;
321
}
322
0
$level = 'LangSet';
323
}
324
elsif ( defined $row->{'Concept'} ) {
325
0
0
if ( defined $self->{langSet} ) {
326
0
_error
327
"Concept-level row out of order in line $INPUT_LINE_NUMBER, skipped.";
328
0
next;
329
}
330
0
$level = 'Concept';
331
}
332
else {
333
#this should never happen; missing level is found when reading the row in
334
0
croak "Can't find level in row $INPUT_LINE_NUMBER, stopped";
335
}
336
337
# (can't happen)
338
339
# is the datcat allowed at the level of the ID?
340
0
0
unless ( $legalIn{$level}{ $row->{'DatCat'} } ) {
341
0
_error
342
"Data category '$row->{'DatCat'}' not allowed at the $level level in line $INPUT_LINE_NUMBER, skipped.";
343
0
next;
344
}
345
346
# set langSetDefined if definition (legal only at langSet level)
347
0
0
$self->{langSetDefined} = 1 if ( $row->{'DatCat'} eq 'definition' );
348
349
# bookkeeping: record links made
350
0
0
push @linksMade, $row->{'Link'}->{'Value'}
351
if ( defined $row->{'Link'} );
352
353
# print item, or push into pre-tig list, depending
354
0
0
if ( $level eq 'Term' ) {
355
0
push @{ $self->{unsortedTerm} }, $row;
0
356
}
357
else {
358
0
$self->_printRow($row);
359
}
360
361
} # end if (in body, reading C-row)
362
363
# not C-row: close any structures, segment = back
364
0
0
0
if ( $segment eq 'body' && !exists $row->{'Concept'} ) {
365
0
$self->_closeTerm();
366
0
$self->_closeLangSet();
367
0
$self->_closeConcept();
368
0
print "
\n";
369
0
$segment = 'back';
370
0
print "\n";
371
}
372
373
# if in back, R row?
374
# R-row: separate parties, verify legality, stack it up
375
0
0
0
if ( $segment eq 'back' && exists $row->{'Party'} ) {
376
377
# have we changed parties?
378
0
0
0
if ( defined $self->{party} && $row->{'Party'} ne $self->{party} ) {
379
380
# change parties
381
0
my $type;
382
383
# what kind of party is the old one?
384
0
my $topRow = shift @party;
385
0
0
if ( $topRow->{'DatCat'} eq 'type' ) {
386
0
$type = $topRow->{'Value'};
387
}
388
else {
389
0
unshift @party, $topRow;
390
0
$type = 'unknown';
391
}
392
393
# file its info under its type and clean it out
394
0
push @{ $responsible{$type} }, [@party];
0
395
0
undef @party;
396
}
397
398
# no? OK, add it to the current party.
399
0
$self->{party} = $row->{'Party'}; # the party don't stop!
400
# article says the first row must be type, but we can sort:
401
0
0
if ( $row->{'DatCat'} eq 'type' ) {
402
0
unshift @party, $row;
403
}
404
else {
405
0
push @party, $row;
406
}
407
} # end if (in back and reading R-row)
408
409
# not R-row: warn file is misordered, last line
410
# this code only runs if the A C R order is broken
411
0
0
0
if ( $segment eq 'back' && !exists $row->{'Party'} ) {
412
0
_error
413
"Don't know what to do with line $INPUT_LINE_NUMBER, processing stopped. The rows in your file are not in proper A C R order.";
414
0
$aborted = 1;
415
0
last;
416
}
417
418
} # end while (<$self->input_fh>)
419
420
# finish up
421
422
# if in body, close structures, body
423
0
0
if ( $segment eq 'body' ) {
424
0
$self->_closeTerm();
425
0
$self->_closeLangSet();
426
0
$self->_closeConcept();
427
0
print "\n";
428
}
429
430
# if in back, sort and print parties, close back
431
0
0
if ( $segment eq 'back' ) {
432
433
# file the last party under its type
434
0
my $type;
435
0
my $topRow = shift @party;
436
0
0
if ( $topRow->{'DatCat'} eq 'type' ) {
437
0
$type = $topRow->{'Value'};
438
}
439
else {
440
0
unshift @party, $topRow;
441
0
$type = 'unknown';
442
}
443
0
push @{ $responsible{$type} }, [@party];
0
444
445
# print a refObjectList for each type of party,
446
# within which each arrayref gets noted and _printRow()ed.
447
0
0
if ( exists $responsible{'person'} ) {
448
0
print "\n";
449
0
push @idsUsed, $_->[0]->{'ID'} foreach @{ $responsible{'person'} };
0
450
0
$self->_printRow($_) foreach @{ $responsible{'person'} };
0
451
0
print "\n";
452
}
453
0
0
if ( exists $responsible{'organization'} ) {
454
0
print "\n";
455
0
push @idsUsed, $_->[0]->{'ID'}
456
0
foreach @{ $responsible{'organization'} };
457
0
$self->_printRow($_) foreach @{ $responsible{'organization'} };
0
458
0
print "\n";
459
}
460
0
0
if ( exists $responsible{'unknown'} ) {
461
0
_error
462
"At least one of your responsible parties has no type (person, organization, etc.) and has been provisionally printed as a respParty. To conform to TBX-Basic, you must list each party as either a person or an organization.";
463
0
print "\n";
464
0
push @idsUsed, $_->[0]->{'ID'} foreach @{ $responsible{'unknown'} };
0
465
0
$self->_printRow($_) foreach @{ $responsible{'unknown'} };
0
466
0
print "\n";
467
}
468
0
print "\n";
469
}
470
471
# closing formalities
472
0
0
if ( not $started ) {
473
0
my $err =
474
"The input MRC is missing a line beginning with =MRCTermTable. You must include such a line to switch on the TBX converter -- all preceding material is ignored.";
475
476
0
carp $err;
477
0
_error $err;
478
479
0
$self->_finish_processing($select);
480
0
return;
481
}
482
483
#in case the file was header only
484
0
0
0
if ( $segment eq 'header' and not $aborted ) {
485
486
#check and print header
487
0
0
unless ( $self->_printHeader( \%header ) ) {
488
0
_error
489
"TBX header could not be completed because a required A-row is missing or malformed.";
490
0
$aborted = 1;
491
}
492
493
#alert user to lack of content
494
0
_error('The file contained no concepts or parties.');
495
496
#close the opened, and empty, body element
497
0
print "\n";
498
}
499
500
0
0
if ($aborted) {
501
0
carp "See log -- processing could not be completed.\n";
502
0
$self->_finish_processing($select);
503
0
return;
504
}
505
506
0
print "\n\n";
507
0
0
msg( "File includes links to:\n\t" . ( join "\n\t", @linksMade ) )
508
if @linksMade;
509
510
0
0
msg "File includes IDs:\n\t" . ( join "\n\t", @idsUsed )
511
if @idsUsed;
512
513
# TODO: is this necessary? also look for tbx_fh and input_fh
514
# next open would close implicitly but not reset $INPUT_LINE_NUMBER
515
0
$self->_finish_processing($select);
516
0
return;
517
}
518
519
sub _finish_processing {
520
## no critic (ProhibitOneArgSelect)
521
0
0
my ( $self, $select ) = @_;
522
523
#clear all processing data
524
0
delete $self->{concept};
525
0
delete $self->{langSet};
526
0
delete $self->{term};
527
0
delete $self->{party};
528
0
delete $self->{unsortedTerm};
529
0
delete $self->{party};
530
0
delete $self->{langSetDefined};
531
532
#print all messages to the object's log
533
0
$self->_log( Log::Message::Simple->stack_as_string() );
534
0
Log::Message::Simple->flush();
535
536
0
select $select;
537
538
# user's responsibility to close the various filehandles
539
0
return;
540
}
541
542
543
# do nothing if no term level is open
544
sub _closeTerm {
545
0
0
my ($self) = @_;
546
0
0
if ( defined $self->{term} ) {
547
548
# print STDOUT Dumper $self->{unsortedTerm} ;
549
# print STDOUT Dumper $self;
550
0
0
my $id = ${ $self->{unsortedTerm} }[0]->{'ID'} ||
551
552
#necessary for error reporting; $ID might be undef
553
'C' . $self->{concept} . $self->{langSet} . $self->{term};
554
0
my $tig = $self->_sortRefs( @{ $self->{unsortedTerm} } );
0
555
0
my $posContext = pop @$tig;
556
0
0
0
unless ( $posContext || $self->{langSetDefined} ) {
557
0
_error
558
0
"Term $id (see line @{[$INPUT_LINE_NUMBER - 1]}) is lacking an element necessary for TBX-Basic.\n\tTo make it valid for human use only, add one of:\n\t\ta definition (at the language level)\n\t\tan example of use in context (at the term level).\n\tTo make it valid for human or machine processing, add its part of speech (at the term level).";
559
}
560
0
$self->_printRow($tig);
561
0
undef $self->{term};
562
0
undef $self->{unsortedTerm};
563
}
564
0
return;
565
}
566
567
# nothing if no lang level is open
568
sub _closeLangSet {
569
0
0
my ($self) = @_;
570
0
0
if ( defined $self->{langSet} ) {
571
0
print "\n";
572
0
undef $self->{langSet};
573
0
undef $self->{langSetDefined};
574
}
575
0
return;
576
}
577
578
# nothing if no concept level is open
579
sub _closeConcept {
580
0
0
my ($self) = @_;
581
0
0
if ( defined $self->{concept} ) {
582
0
print "\n";
583
0
undef $self->{concept};
584
}
585
0
return;
586
}
587
588
589
my $NUM_MONTHS = 12;
590
sub _parseRow {
591
0
0
my ( $self, $row_text ) = @_;
592
0
$row_text =~ s/\s*$//; # super-chomp: cut off any trailing whitespace at all
593
# later, split will eliminate between-field whitespace
594
# and the keyword and langtag parsers will eliminate other space
595
# outside of values
596
597
# fields are delimited by at least one tab and possibly spaces
598
0
my @field = split / *(?:\t *)+/, $row_text;
599
600
# grab the three mandatory fields
601
0
my %row;
602
0
$row{'ID'} = shift @field;
603
0
$row{'DatCat'} = shift @field;
604
0
$row{'Value'} = shift @field;
605
606
# verify essential completeness
607
0
0
0
unless ( $row{'ID'} && $row{'DatCat'} && $row{'Value'} ) {
0
608
0
_error "Incomplete row in line $INPUT_LINE_NUMBER, skipped.";
609
0
return;
610
}
611
612
# verify well-formed ID and extract its semantics
613
0
0
if ( $row{'ID'} =~ /^[Cc] *(\d{3}) *($langCode)? *(\d*)$/ ) {
0
0
614
0
0
0
if ( $3 && !$2 ) {
615
0
_error
616
"Bad ID '$row{'ID'}' (no language section) in line $INPUT_LINE_NUMBER, skipped.";
617
0
return;
618
}
619
0
$row{'Concept'} = $1;
620
0
0
$row{'LangSet'} = "\L$2" if ($2); # smash to lowercase
621
0
0
0
$row{'Term'} = 0 + $3 if ( $2 && $3 ne q{} ); # cast to int
622
# clean up the ID itself
623
0
$row{'ID'} = "C$row{'Concept'}";
624
0
0
$row{'ID'} .= $row{'LangSet'} if $row{'LangSet'};
625
0
0
$row{'ID'} .= $row{'Term'} if defined $row{'Term'};
626
}
627
elsif ( $row{'ID'} =~ /^[Rr] *(\d{3})$/ ) {
628
0
$row{'Party'} = $1;
629
0
$row{'ID'} = "R$1";
630
}
631
elsif ( $row{'ID'} =~ /^[Aa]$/ ) {
632
633
# this is a header line and okey-dokey
634
0
$row{'ID'} = 'A';
635
}
636
else {
637
0
_error
638
"Bad ID '$row{'ID'}' (format not recognized) in line $INPUT_LINE_NUMBER, skipped.";
639
0
return;
640
}
641
642
# correct case of the datcat, or warn and skip if can't match
643
0
0
if ( my $correct = $correctCaps{'DatCat'}{ lc( $row{'DatCat'} ) } ) {
644
645
# the datcat is recognized
646
0
0
unless ( $row{'DatCat'} eq $correct ) {
647
0
_error "Correcting '$row{'DatCat'}' to '$correct' in line $INPUT_LINE_NUMBER.";
648
0
$row{'DatCat'} = $correct;
649
}
650
}
651
else {
652
0
_error "Unknown data category '$row{'DatCat'}' in line $INPUT_LINE_NUMBER, skipped.";
653
0
return;
654
}
655
656
# parse off any local language override in Value
657
0
0
if ( $row{'Value'} =~ /^\[($langCode)] *(.*)$/ ) {
658
0
$row{'RowLang'} = " xml:lang=\"\L$1\""; # lower case
659
0
$row{'Value'} = $2;
660
} # otherwise RowLang will (warn and) print nothing when asked
661
662
# check certain Values against picklists and case-correct
663
0
0
if ( $row{'DatCat'} eq 'termLocation' ) {
0
664
0
0
if ( my $correct = $correctCaps{'termLocation'}{ lc( $row{'Value'} ) } )
665
{
666
# the value is a recognized termLocation
667
0
0
unless ( $row{'Value'} eq $correct ) {
668
0
_error "Correcting '$row{'Value'}' to '$correct' in line $INPUT_LINE_NUMBER.";
669
0
$row{'Value'} = $correct;
670
}
671
}
672
else {
673
0
_error
674
"Unfamiliar termLocation '$row{'Value'}' in line $INPUT_LINE_NUMBER. If this is a location in a user interface, consult the suggested values in the TBX spec.";
675
676
# but DON'T return undef, because this should not
677
# lead to skipping the row, unlike other picklists
678
}
679
}
680
elsif ( $correctCaps{ $row{'DatCat'} } ) {
681
0
my %caps = %{ $correctCaps{ $row{'DatCat'} } };
0
682
683
# grab a correction hash appropriate to DatCat,
684
# if one exists
685
0
0
if ( my $correct = $caps{ lc( $row{'Value'} ) } ) {
686
0
0
unless ( $row{'Value'} eq $correct ) {
687
0
_error "Correcting '$row{'Value'}' to '$correct' in line $INPUT_LINE_NUMBER.";
688
0
$row{'Value'} = $correct;
689
}
690
}
691
else {
692
0
_error
693
"'$row{'Value'}' not a valid $row{'DatCat'} in line $INPUT_LINE_NUMBER, skipped. See picklist for valid values.";
694
0
return;
695
}
696
} # else it's not a correctible datcat, so let it be
697
698
# get additional fields and language tags off of the row
699
# forcing the keyword to one initial cap and prewriting the XMLattr
700
0
foreach (@field) {
701
0
my $keyword;
702
0
0
if (/^([^:]+): *(?:\[($langCode)])? *(.+)$/) {
703
0
$keyword = "\u\L$1";
704
0
$row{$keyword}{'Value'} = $3;
705
0
0
$row{$keyword}{'FieldLang'} = " xml:lang=\"\L$2\"" if $2;
706
}
707
else {
708
0
_error "Can't parse additional field '$_' in line $INPUT_LINE_NUMBER, ignored.";
709
0
next;
710
}
711
712
# check if a FieldLang makes sense
713
0
0
0
if ( $row{$keyword}{'FieldLang'} && !$allowed{$keyword}{'FieldLang'} ) {
714
0
_error
715
"Language tag makes no sense with keyword '$keyword' in line $INPUT_LINE_NUMBER, ignored.";
716
0
delete $row{$keyword}{'FieldLang'};
717
}
718
719
# check if this datcat can have this keyword
720
# this bit might be better done in the controller?
721
# heh. Too late now.
722
0
0
unless ( $allowed{ $row{'DatCat'} }{$keyword} ) {
723
0
_error
724
"Data category $row{'DatCat'} does not allow keyword '$keyword', ignored in line $INPUT_LINE_NUMBER.";
725
0
0
0
if ( $keyword eq 'Source' or $keyword eq 'Note' ) {
726
0
_error
727
"You may attach a source or note to an entire term entry (or a language section or concept entry) by placing it on its own line with the appropriate ID, like this: \n\t$row{ 'ID' }\t\l$keyword\t$row{ $keyword }{ 'Value' }";
728
}
729
0
delete $row{$keyword};
730
}
731
}
732
# check for malformed Date
733
0
0
if ( $row{'Date'} ) {
734
0
0
if ( $row{'Date'}{'Value'} =~ /^(\d{4})-(\d{2})-(\d{2})$/ ) {
735
0
0
0
if ( $1 eq '0000' || $2 eq '00' || $3 eq '00' ) {
0
0
0
0
736
0
_error
737
"Consider correcting: Zeroes in date '$row{'Date'}{'Value'}', line $INPUT_LINE_NUMBER.";
738
}
739
elsif ( $2 <= $NUM_MONTHS && $3 <= $NUM_MONTHS ) {
740
0
_error
741
"Consider double-checking: Month and day are ambiguous in '$row{'Date'}{'Value'}', line $INPUT_LINE_NUMBER.";
742
}
743
elsif ( $2 > $NUM_MONTHS ) {
744
0
_error "Consider correcting: Month $2 is nonsense in line $INPUT_LINE_NUMBER.";
745
}
746
}
747
else {
748
0
_error
749
"Date '$row{'Date'}{'Value'}' not in ISO format (yyyy-mm-dd) in line $INPUT_LINE_NUMBER, ignored.";
750
0
delete $row{'Date'};
751
}
752
}
753
754
# check for Link where it's needed
755
0
0
0
if ( $row{'DatCat'} eq 'transactionType' ) {
0
756
0
0
_error
757
"Consider adding information: No responsible party linked in line $INPUT_LINE_NUMBER."
758
unless $row{'Link'};
759
}
760
elsif (
761
$row{'DatCat'} =~ /^(?:crossReference|externalCrossReference|xGraphic)$/
762
&& !$row{'Link'} )
763
{
764
0
_error "$row{'DatCat'} without Link in line $INPUT_LINE_NUMBER, skipped.";
765
0
return;
766
}
767
768
0
return \%row;
769
}
770
771
sub _buildHeader {
772
0
0
my ( $self, $srcRef, $destRef ) = @_;
773
0
my $destKey;
774
0
0
return unless ( $destKey = $corresp{ $srcRef->{'DatCat'} } );
775
776
# print STDOUT "$destKey\n" . Dumper ($destRef) . "\n" . Dumper ($srcRef) . "\n";
777
# a validity check, not just a pointless translation
778
0
0
0
if ( $destKey eq 'Language' and defined $destRef->{'Language'} ) {
779
0
_error "Duplicate workingLanguage ignored in line $INPUT_LINE_NUMBER.";
780
0
return;
781
}
782
0
push @{ $destRef->{$destKey} }, $srcRef->{'Value'};
0
783
0
return 1;
784
}
785
786
sub _printHeader {
787
0
0
my ( $self, $info ) = @_;
788
789
# my $info = %{shift}; # that's a copy, but the hash is small
790
0
0
0
return unless ( defined $info->{'Language'} && defined $info->{'Source'} );
791
0
print <<"REQUIRED1";
792
793
794
795
796
797
798
termbase from MRC file
799
REQUIRED1
800
801
# print termbase-wide subjects, if such there be
802
0
_error
803
"Termbase-wide subject fields are recorded in the element of the TBX header."
804
0
0
0
if ( exists $info->{'Subject'} and scalar @{ $info->{'Subject'} } );
805
0
my $sbj;
806
0
print <<"SUBJECT" while $sbj = shift @{ $info->{'Subject'} };
0
807
entire termbase concerns subject: $sbj
808
SUBJECT
809
0
my $version = _version();
810
0
print <<"REQUIRED2";
811
812
813
generated by Convert::MRC version $version
814
815
REQUIRED2
816
0
while ( my $src = shift @{ $info->{'Source'} } ) {
0
817
0
print <<"SOURCE";
818
819
$src
820
821
SOURCE
822
}
823
824
0
print <<'REQUIRED3';
825
826
827
TBXBasicXCSV02.xcs
828
REQUIRED3
829
830
# my $sbj;
831
# print <{'Subject'}};
832
#$sbj
833
#SUBJECT
834
835
0
print <<'REQUIRED3';
836
837
838
839
840
REQUIRED3
841
842
0
return 1;
843
}
844
845
# structure a term's worth of data rows for printing
846
sub _sortRefs {## no critic (RequireArgUnpacking)
847
0
0
my ( $self, @rows ) = @_;
848
0
my ( @termGrp, @auxInfo, $term, $pos, $context, $ID );
849
850
0
0
$ID = $_[0]->{'ID'}
851
852
#this is necessary for printing diagnostics when something has gone wrong ($ID would be undef otherwise)
853
|| 'C' . $self->{concept} . $self->{langSet} . $self->{term};
854
855
# print STDOUT Dumper $_[0];
856
# print STDOUT Dumper \@rows;
857
# print STDOUT Dumper $self;
858
0
for my $row (@rows) {
859
0
0
if ( not defined $row->{'DatCat'} ) {
860
861
#this should never happen; it should be caught during row parsing.
862
0
next;
863
}
864
0
my $datCat = $row->{'DatCat'};
865
0
0
if ( $datCat eq 'term' ) {
0
866
0
unshift @termGrp, $row; # stick it on the front
867
0
$term = 1;
868
}
869
elsif ( my $position = $position{$datCat} ) {
870
0
0
if ( 'termGrp' eq $position ) {
0
871
0
push @termGrp, $row; # stick it on the back
872
0
0
$pos = 1 if $datCat eq 'partOfSpeech';
873
}
874
elsif ( 'auxInfo' eq $position ) {
875
0
push @auxInfo, $row;
876
0
0
$context = 1 if $datCat eq 'context';
877
}
878
}
879
else {
880
#should never happen; should be caught during row parsing
881
0
_error "Data category '$datCat' is not allowed at the term level.";
882
}
883
}
884
885
0
0
if ( not $term ) {
886
0
_error
887
0
"There is no term row for '$ID', although other data categories describe such a term. See line @{[$INPUT_LINE_NUMBER - 1]}.";
888
}
889
890
0
0
if ( not $pos ) {
891
0
_error
892
0
"Term $ID lacks a partOfSpeech row. This TBX file may not be machine processed. See line @{[$INPUT_LINE_NUMBER - 1]}.";
893
}
894
895
0
unshift @auxInfo, \@termGrp;
896
0
0
push @auxInfo, ( $pos || $context ); # 1 or undef
897
0
return \@auxInfo;
898
}
899
900
sub _printRow {
901
0
0
my ( $self, $item ) = @_;
902
## no critic (ProhibitNoWarnings)
903
5
5
69
no warnings 'uninitialized'; # for undefined language attributes
5
12
5
6623
904
## use critic
905
0
0
if ( ref $item eq 'HASH' ) { # printing a single item
0
906
# print as appropriate
907
0
my $datCat;
908
0
$datCat = $item->{'DatCat'};
909
0
0
if ( not defined $datCat ) {
910
911
#should never happen; rows with undefined datcats are skipped.
912
0
_error "Data category undefined. Cannot print row at $INPUT_LINE_NUMBER";
913
0
return;
914
}
915
916
# sort by datcat
917
0
0
0
if ( $datCat eq 'term' ) {
0
0
0
0
0
0
0
918
0
print "$item->{'Value'} \n";
919
920
# we deliberately ignore RowLang, because LangSet
921
# should give the language of a term entry
922
}
923
924
# note and source as standalones, not keyword-fields
925
elsif ( $datCat eq 'note' ) {
926
0
print "{'RowLang'}>$item->{'Value'} \n";
927
}
928
929
elsif ( $datCat =~ /^(?:source|customerSubset|projectSubset)$/ ) {
930
0
print
931
"{'RowLang'}>$item->{'Value'} \n";
932
}
933
934
# sorry this one's so gross, but it is
935
elsif ( $datCat eq 'transactionType' ) {
936
0
print "\n";
937
0
print
938
"\t$item->{'Value'} \n";
939
0
0
print "\t$item->{'Date'}->{'Value'} \n"
940
if $item->{'Date'};
941
942
#I don't think Note is allowed in transationType (Nate G)
943
0
0
print
944
"\t{'Note'}->{'FieldLang'}>$item->{'Note'}->{'Value'} \n"
945
if $item->{'Note'};
946
0
0
0
if ( $item->{'Responsibility'} || $item->{'Link'} ) {
947
0
print "\t
948
0
0
print " target=\"$item->{'Link'}->{'Value'}\""
949
if $item->{'Link'};
950
0
print
951
"$item->{'Responsibility'}->{'FieldLang'}>$item->{'Responsibility'}->{'Value'}";
952
0
0
print "Responsible Party"
953
unless $item->{'Responsibility'}->{'Value'};
954
0
print "\n";
955
}
956
0
print "\n";
957
}
958
959
elsif ( $datCat eq 'crossReference' ) {
960
0
print
961
"[{'Link'}->{'Value'}\"$item->{'RowLang'}>$item->{'Value'}](\"$item-)\n";
962
}
963
964
elsif ($datCat eq 'externalCrossReference'
965
|| $datCat eq 'xGraphic' )
966
{
967
0
print
968
"{'Link'}->{'Value'}\"$item->{'RowLang'}>$item->{'Value'} \n";
969
}
970
971
elsif ( $datCat =~ /^(?:email|title|role|org|uid|tel|adr|fn)$/ ) {
972
0
print "\t- $item->{'Value'}
\n";
973
974
# RowLang is ignored here too -- attr not allowed
975
}
976
977
elsif ( $meta{$datCat} eq 'termNote' ) {
978
0
print
979
"{'RowLang'}>$item->{'Value'} \n"
980
; # using tigs means no termNoteGrp
981
}
982
983
else { # everything else is easy
984
0
my $meta;
985
0
0
$meta = $meta{$datCat}
986
or die "_printRow() can't print a $datCat "; # shouldn't happen
987
0
print "<${meta}Grp>\n";
988
0
print
989
"\t<$meta type=\"$datCat\"$item->{'RowLang'}>$item->{'Value'}$meta>\n";
990
991
#I don't think Note is allowed in transationType (Nate G)
992
0
0
print
993
"\t{'Note'}->{'FieldLang'}>$item->{'Note'}->{'Value'} \n"
994
if $item->{'Note'};
995
0
0
print
996
"\t{'Source'}->{'FieldLang'}>$item->{'Source'}->{'Value'} \n"
997
if $item->{'Source'};
998
0
print "${meta}Grp>\n";
999
}
1000
1001
}
1002
elsif ( ref $item eq 'ARRAY' ) {
1003
1004
# if first item isn't arrayref, it's a resp-party
1005
0
0
if ( ref $item->[0] ne 'ARRAY' ) {
1006
0
print "[0]->{'ID'}\">\n";
1007
0
$self->_printRow($_) foreach @$item;
1008
0
print "\n";
1009
}
1010
else {
1011
# then it's a tig
1012
0
my $termGrp = shift @$item;
1013
0
my $id;
1014
0
0
if ( exists $termGrp->[0] ) {
1015
1016
# if there's a term or any termNote
1017
0
$id = $termGrp->[0]->{'ID'};
1018
}
1019
else {
1020
#should never happen (right? Nate G)
1021
# if must, get the ID from an auxInfo
1022
# (implies the input is defective)
1023
0
$id = $item->[0]->{'ID'};
1024
}
1025
0
print "\n";
1026
1027
# if this were an ntig
1028
0
$self->_printRow($_) foreach @$termGrp;
1029
1030
#
1031
0
$self->_printRow($_) foreach @$item;
1032
0
print "\n";
1033
}
1034
}
1035
else {
1036
#this should never happen
1037
0
die "_printRow() called incorrectly, stopped";
1038
}
1039
0
return;
1040
}
1041
1042
1;
1043
1044
__END__