File Coverage

blib/lib/Convert/TBX/Basic.pm
Criterion Covered Total %
statement 7 9 77.7
branch n/a
condition n/a
subroutine 3 3 100.0
pod n/a
total 10 12 83.3


line stmt bran cond sub pod time code
1             #
2             # This file is part of Convert-TBX-Basic
3             #
4             # This software is copyright (c) 2016 by Alan K. Melby.
5             #
6             # This is free software; you can redistribute it and/or modify it under
7             # the same terms as the Perl 5 programming language system itself.
8             #
9             package Convert::TBX::Basic;
10 1     1   17185 use strict;
  1         1  
  1         22  
11 1     1   3 use warnings;
  1         1  
  1         28  
12             # ABSTRACT: Convert TBX-Basic data into TBX-Min
13             our $VERSION = '0.03'; # VERSION
14 1     1   1694 use XML::Twig;
  0            
  0            
15             use autodie;
16             use Path::Tiny;
17             use Carp;
18             use Log::Any '$log';
19             use TBX::Min 0.07;
20             use Try::Tiny;
21             use Exporter::Easy (
22             OK => ['basic2min']
23             );
24             use open ':encoding(utf-8)', ':std'; #this ensures output file is UTF-8
25              
26             my %status_map = (
27             'preferredTerm-admn-sts' => 'preferred',
28             'admittedTerm-admn-sts' => 'admitted',
29             'deprecatedTerm-admn-sts' => 'notRecommended',
30             'supersededTerm-admn-st' => 'obsolete'
31             );
32              
33             sub basic2min {
34             @_ == 3 or croak 'Usage: basic2min(data, source-language, target-language)';
35             my ($data, $source, $target) = @_;
36              
37             my $fh = _get_handle($data);
38              
39             # build a twig out of the input document
40             my $twig = XML::Twig->new(
41             output_encoding => 'UTF-8',
42             do_not_chain_handlers => 1,
43             keep_spaces => 0,
44              
45             # these store new entries, langGroups and termGroups
46             start_tag_handlers => {
47             termEntry => \&_entry_start,
48             langSet => \&_langStart,
49             tig => \&_termGrpStart,
50             },
51              
52             TwigHandlers => {
53             # header attributes
54             title => \&_title,
55             sourceDesc => \&_source_desc,
56             'titleStmt/note' => \&_title_note,
57              
58             # decide whether to add a new entry
59             termEntry => \&_entry,
60              
61             # becomes part of the current TBX::Min::ConceptEntry object
62             'termEntry/descrip[@type="subjectField"]' => sub {
63             shift->{tbx_min_min_current_entry}->
64             subject_field($_->text)},
65              
66             # these become attributes of the current
67             # TBX::Min::TIG object
68             'tig/termNote[@type="administrativeStatus"]' => \&_status,
69             term => sub {shift->{tbx_min_current_term_grp}->
70             term($_->text)},
71             'tig/termNote[@type="partOfSpeech"]' => sub {
72             shift->{tbx_min_current_term_grp}->
73             part_of_speech($_->text)},
74             'tig/note' => \&_as_note,
75             'tig/admin[@type="customerSubset"]' => sub {
76             shift->{tbx_min_current_term_grp}->customer($_->text)},
77              
78             # the information which cannot be converted faithfully
79             # gets added as a note to the current TBX::Min::TIG,
80             # with its data category prepended
81             'tig/admin' => \&_as_note,
82             'tig/descrip' => \&_as_note,
83             'tig/termNote' => \&_as_note,
84             'tig/transac' => \&_as_note,
85             'tig/transacNote' => \&_as_note,
86             'tig/transacGrp/date' => \&_as_note,
87              
88             # add no-op handlers for twigs not needing conversion
89             # so that they aren't logged as being skipped
90             'sourceDesc/p' => sub {}, # treated in sourceDesc handler
91             titleStmt => sub {},
92             fileDesc => sub {},
93             martifHeader => sub {},
94             text => sub {},
95             body => sub {},
96             martif => sub {},
97             langSet => sub {},
98             tig => sub {},
99             transacGrp => sub {},
100              
101             # log anything that wasn't converted
102             _default_ => \&_log_missed,
103             }
104             );
105              
106             # provide language info to the handlers via storage in the twig
107             $twig->{tbx_languages} = [lc($source), lc($target)];
108              
109             my $min = TBX::Min->new();
110             $min->source_lang($source);
111             $min->target_lang($target);
112              
113             # use handlers to process individual tags and
114             # add information to $min
115             $twig->{tbx_min} = $min;
116             $twig->safe_parse($fh); #using safe_parse here prevents crash when encoded (the open ':encoding(utf-8)) file is passed in
117              
118             # warn if the document didn't have tig's of the given source and
119             # target language
120             if(keys %{ $twig->{tbx_found_languages} } != 2 and
121             $log->is_warn){
122             # find the difference between the expected languages
123             # and those found in the TBX document
124             my %missing;
125             @missing{ lc $min->source_lang, lc $min->target_lang() } = undef;
126             delete @missing{ keys %{$twig->{tbx_found_languages}} };
127             $log->warn('could not find langSets for language(s): ' .
128             join ', ', sort keys %missing);
129             }
130              
131             return $min;
132             }
133              
134             sub _get_handle {
135             my ($data) = @_;
136             my $fh;
137             if((ref $data) eq 'SCALAR'){
138             open $fh, '<', $data; ## no critic(RequireBriefOpen)
139             }else{
140             $fh = path($data)->filehandle('<');
141             }
142             return $fh;
143             }
144              
145             ######################
146             ### XML TWIG HANDLERS
147             ######################
148             # all of the twig handlers store state on the XML::Twig object. A bit kludgy,
149             # but it works.
150              
151             sub _title {
152             my ($twig, $node) = @_;
153             $twig->{tbx_min}->id($node->text);
154             return 0;
155             }
156              
157             sub _title_note {
158             my ($twig, $node) = @_;
159             my $description = $twig->{tbx_min}->description || '';
160             $twig->{tbx_min}->description($description . $node->text . "\n");
161             return 0;
162             }
163              
164             sub _source_desc {
165             my ($twig, $node) = @_;
166             for my $p ($node->children('p')){
167             my $description = $twig->{tbx_min}->description || '';
168             $twig->{tbx_min}->description(
169             $description . $p->text . "\n");
170             }
171             return 0;
172             }
173              
174             # remove whitespace and convert to TBX-Min picklist value
175             sub _status {
176             my ($twig, $node) = @_;
177             my $status = $node->text;
178             $status =~ s/[\s\v]//g;
179             $twig->{tbx_min_current_term_grp}->status($status_map{$status});
180             return 0;
181             }
182              
183             # turn the node info into a note labeled with the type;
184             # the type becomes a noteKey and the info becomes noteValue
185             sub _as_note {
186             my ($twig, $node) = @_;
187             my $grp = $twig->{tbx_min_current_term_grp};
188              
189             if (@{$grp->note_groups} > 0)
190             {
191             &_noteStart($twig, $node->text, $node->att('type'));
192             }
193             else
194             {
195             &_noteGrpStart($twig);
196             &_noteStart($twig, $node->text, $node->att('type'));
197             }
198              
199             return 1;
200             }
201              
202             # add a new entry to the list of those found in this file
203             sub _entry_start {
204             my ($twig, $node) = @_;
205             my $entry = TBX::Min::TermEntry->new();
206             if($node->att('id')){
207             $entry->id($node->att('id'));
208             }else{
209             carp 'found entry missing id attribute';
210             }
211             $twig->{tbx_min_min_current_entry} = $entry;
212             return 1;
213             }
214              
215             # add the entry to the TBX::Min object if it has any langGroups
216             sub _entry {
217             my ($twig, $node) = @_;
218             my $entry = $twig->{tbx_min_min_current_entry};
219             if(@{$entry->lang_groups}){
220             $twig->{tbx_min}->add_entry($entry);
221             }elsif($log->is_info){
222             $log->info('element ' . $node->xpath . ' not converted');
223             }
224             return;
225             }
226              
227             #just set the subject_field of the current entry
228             sub _subjectField {
229             my ($twig, $node) = @_;
230             $twig->{tbx_min_min_current_entry}->subject_field($node->text);
231             return 1;
232             }
233              
234             # Create a new LangGroup, add it to the current entry,
235             # and set it as the current LangGroup.
236             # This langSet is ignored if its language is different from
237             # the source and target languages specified to basic2min
238             sub _langStart {
239             my ($twig, $node) = @_;
240             my $lang_grp;
241             my $lang = $node->att('xml:lang');
242             if(!$lang){
243             # skip if missing language
244             $log->warn('skipping langSet without language: ' .
245             $node->xpath) if $log->is_warn;
246             $node->ignore;
247             return 1;
248             }elsif(!grep {$_ eq lc $lang} @{$twig->{tbx_languages}}){
249             # skip if non-applicable language
250             $node->ignore;
251             return 1;
252             }
253              
254             $lang_grp = TBX::Min::LangSet->new();
255             $lang_grp->code($lang);
256             $twig->{tbx_found_languages}{lc $lang} = undef;
257             $twig->{tbx_min_min_current_entry}->add_lang_group($lang_grp);
258             $twig->{tbx_min_current_lang_grp} = $lang_grp;
259             return 1;
260             }
261              
262             # Create a new termGroup, add it to the current langGroup,
263             # and set it as the current termGroup.
264             sub _termGrpStart {
265             my ($twig) = @_;
266             my $term = TBX::Min::TIG->new();
267             $twig->{tbx_min_current_lang_grp}->add_term_group($term);
268             $twig->{tbx_min_current_term_grp} = $term;
269             return 1;
270             }
271              
272             sub _noteGrpStart {
273             my ($twig) = @_;
274             my $group = TBX::Min::NoteGrp->new;
275             $twig->{tbx_min_current_term_grp}->add_note_group($group);
276             $twig->{tbx_min_current_note_grp} = $group;
277             return 1;
278             }
279              
280             sub _noteStart {
281             my ($twig, $value, $key) = @_;
282             my $note = TBX::Min::Note->new(noteValue => $value, noteKey => $key);
283             $twig->{tbx_min_current_note_grp}->add_note($note);
284             $twig->{tbx_min_current_note} = $note;
285             return 1;
286             }
287              
288             # log that an element was not converted
289             sub _log_missed {
290             my (undef, $node) = @_;
291             $log->info('element ' . $node->xpath . ' not converted')
292             if $log->is_info();
293             return;
294             }
295              
296             1;
297              
298             __END__