File Coverage

blib/lib/Lingua/Ogmios/DocumentRecord.pm
Criterion Covered Total %
statement 12 256 4.6
branch 0 76 0.0
condition 0 24 0.0
subroutine 4 18 22.2
pod 0 11 0.0
total 16 385 4.1


line stmt bran cond sub pod time code
1             package Lingua::Ogmios::DocumentRecord;
2              
3 16     16   74 use strict;
  16         26  
  16         340  
4 16     16   69 use warnings;
  16         25  
  16         358  
5              
6             # use XML::Simple;
7 16     16   74 use Lingua::Ogmios::Annotations;
  16         36  
  16         333  
8 16     16   8147 use Lingua::Ogmios::Annotations::LogProcessing;
  16         38  
  16         42642  
9              
10             my $debug_devel_level = 0;
11              
12             sub new {
13 0     0 0   my ($class, $document_record, $platformConfig) = @_;
14              
15 0           my $doc = {
16             'annotations' => Lingua::Ogmios::Annotations->new($platformConfig),
17             'id' => undef,
18             'attributes' => [],
19             };
20              
21 0           bless $doc, $class;
22              
23             # Parsing the file and loading into the structures
24 0           $doc->setId($document_record);
25 0           $doc->_parse($document_record, $platformConfig);
26             # Making the indexes
27              
28 0           return($doc);
29             }
30              
31             sub setId {
32 0     0 0   my ($self, $document_record) = @_;
33              
34 0           my $id;
35              
36 0 0         if (UNIVERSAL::isa($document_record, 'XML::LibXML::Element')) {
37 0           $id = $document_record->getAttribute("id");
38             } else {
39 0           $id = $document_record;
40             }
41              
42 0 0         if (defined($id)) {
43 0           $self->getAnnotations->{'id'} = $id;
44             }
45             else {
46 0           warn "No id for record for doc " . ($self->getCount + 1) . "\n";
47             }
48             }
49              
50             sub getId {
51 0     0 0   my ($self) = @_;
52              
53 0           return($self->getAnnotations->{'id'});
54             }
55              
56             sub setAttributes {
57 0     0 0   my ($self, $attributes) = @_;
58 0           my $attr;
59              
60 0           foreach $attr (@$attributes) {
61 0           push @{$self->{'attributes'}}, {'nodeName' => $attr->nodeName,
  0            
62             'value' => $attr->value,
63             };
64             }
65             }
66              
67             sub getAttributes {
68 0     0 0   my ($self) = @_;
69              
70 0           return($self->{'attributes'});
71             }
72              
73             sub _parse {
74 0     0     my ($self, $document_record, $platformConfig) = @_;
75              
76 0           my $lingAnalysisLoad = $platformConfig->linguisticAnnotationLoading;
77              
78 0           warn "Processing document " . $self->getId . "\n";
79              
80 0           my @attr = $document_record->attributes;
81 0           $self->setAttributes(\@attr);
82              
83 0           my $acquisition_section_node = $document_record->getChildrenByTagName('acquisition')->get_node(1);
84             # my $acquisition_section_node;
85 0 0         if (defined $acquisition_section_node) {
86             # $acquisition_section_node = $acquisition_section_node_orig->cloneNode(1);
87 0           $self->getAnnotations->setAcquisitionSection($acquisition_section_node);
88 0           $self->getAnnotations->setLanguageFromXMLAndProperties($acquisition_section_node);
89 0 0         if (!defined($self->getAnnotations->getLanguage)) {
90 0           $self->getAnnotations->setLanguage(uc($platformConfig->getOgmiosDefaultLanguage));
91 0           print STDERR $self->getAnnotations->getLanguage . "\n";
92             }
93 0           $self->getAnnotations->setURLs($acquisition_section_node);
94 0           $self->getAnnotations->loadCanonicalDocument($acquisition_section_node);
95             } else {
96 0           die "no acquisition node\n";
97             }
98             # for $document_record ($document_record->getChildrenByTagName('linguisticAnalysis')) {
99             # TODO
100 0 0 0       if ((defined $lingAnalysisLoad) && ($lingAnalysisLoad == 1) && (defined $document_record->getChildrenByTagName('linguisticAnalysis')->get_node(1))) {
      0        
101 0           $self->getAnnotations->loadLinguisticAnalysis($document_record->getChildrenByTagName('linguisticAnalysis')->get_node(1));
102             }
103              
104 0           my $relevance_section_node = $document_record->getChildrenByTagName('relevance')->get_node(1);
105 0 0         if (defined $relevance_section_node) {
106 0           $self->getAnnotations->setRelevanceSection($relevance_section_node);
107             }
108             }
109              
110              
111             sub setAnnotations {
112 0     0 0   my ($self) = @_;
113 0           $self->{'annotations'} = undef;
114             }
115              
116             sub getAnnotations {
117 0     0 0   my ($self) = @_;
118              
119 0           return($self->{'annotations'});
120             }
121              
122             sub _char_type_identification {
123 0     0     my ($self, $character) = @_;
124              
125              
126             # Definition of the character types
127 0           my $alpha="[A-Za-z\x{C0}-\x{D6}\x{D8}-\x{F6}\x{F8}-\x{FF}\x{0400}-\x{0482}\x{048A}-\x{04FF}]";
128 0           my $num="[0-9]";
129 0           my $sep="[ \\s\\t\\n\\r]";
130              
131 0           my $current_char_type;
132             my $current_char_type_string;
133              
134             # default is symbol
135 0           $current_char_type = 4;$current_char_type_string = "symb";
  0            
136              
137             # print STDERR "$character\n";
138              
139 0 0         if($character=~/$alpha/o){$current_char_type = 1;$current_char_type_string = "alpha";};
  0            
  0            
140 0 0         if($character=~/$num/o){$current_char_type = 2;$current_char_type_string = "num";};
  0            
  0            
141 0 0         if($character=~/$sep/o){$current_char_type = 3;$current_char_type_string = "sep";};
  0            
  0            
142              
143 0           return($current_char_type, $current_char_type_string);
144             }
145              
146             sub _tokenCreation {
147 0     0     my ($self,$current_token_string, $current_token_string_length, $previous_char_type_string, $offset) = @_;
148              
149             # code for the character types
150             # 1: alphabetic
151             # 2: numeric
152             # 3: separator
153             # 4: symbol
154             # 0 : not defined
155             # warn "add token: $current_token_string\n";
156              
157              
158             # warn "Creation of new token\n";
159 0           my $token = Lingua::Ogmios::Annotations::Token->new(
160             {'content' => $current_token_string,
161             'type' => $previous_char_type_string,
162             'from' => $offset,
163             'to' => $offset + $current_token_string_length - 1,
164             });
165             # $token->print;
166 0           return($token);
167             }
168              
169             sub tokenisation {
170 0     0 0   my ($self) = @_;
171              
172 0 0         if ($self->getAnnotations->getTokenLevel->getSize != 0) {
173 0           warn "tokens exist - no tokenisation required\n";
174 0           return(0);
175             }
176              
177 0           warn "[LOG] Tokenisation (" . $self->getId . ")\n";
178              
179 0           my $canonicalDocument = $self->getAnnotations->getCanonicalDocument;
180              
181             # warn $self->getId . "\n";
182             # warn $canonicalDocument . "\n";
183              
184 0           my @characters = split //, $canonicalDocument;
185              
186 0 0         if (scalar @characters) {
187              
188 0           my $character;
189              
190 0           my $current_token_string = "";
191 0           my $current_token_string_length = 0;
192 0           my $current_token_type = 0;
193 0           my $current_char_type = 0;
194 0           my $current_char_type_string = "";
195 0           my $previous_char_type = 0;
196 0           my $previous_char_type_string = "";
197 0           my $offset = 0;
198              
199 0           my $current_id;
200             my $current_token;
201              
202 0           $character = $characters[0];
203 0           ($current_char_type, $current_char_type_string) = $self->_char_type_identification($character);
204 0           $current_token_string_length = 1;
205 0           $current_token_string = $character;
206 0           $previous_char_type = $current_char_type;
207 0           $previous_char_type_string = $current_char_type_string;
208              
209 0 0         if ($current_char_type == 4) {
210 0           $current_token = $self->_tokenCreation($character, 1, $current_char_type_string, $offset);
211 0           $current_id = $self->getAnnotations->addToken($current_token);
212 0           $current_token_string = "$character";
213 0           $current_token_string_length = 1;
214 0           $previous_char_type = $current_char_type;
215 0           $previous_char_type_string = $current_char_type_string;
216 0           $offset += $current_token_string_length;
217             }
218 0           my $i;
219 0           for($i=1;$i
220 0           $character = $characters[$i];
221             # identification of the type of the current character
222              
223 0           ($current_char_type, $current_char_type_string) = $self->_char_type_identification($character);
224              
225 0 0 0       if (($current_char_type == $previous_char_type) && ($current_char_type != 4) &&
      0        
      0        
226             (!($self->getAnnotations->getSectionLevel->existsElementFromIndex('from', $offset + $current_token_string_length ))) &&
227             (!($self->getAnnotations->getSectionLevel->existsElementFromIndex('to', $offset + $current_token_string_length - 1)))) {
228 0           $current_token_string .= $character;
229 0           $current_token_string_length++;
230             } else {
231 0 0         if ($previous_char_type != 4) {
232 0           $current_token = $self->_tokenCreation($current_token_string, $current_token_string_length, $previous_char_type_string, $offset);
233 0           $current_id = $self->getAnnotations->addToken($current_token);
234 0           $offset += $current_token_string_length;
235             }
236 0 0         if ($current_char_type == 4) {
237 0           $current_token_string = $character;
238 0           $current_token_string_length = 1;
239 0           $current_token = $self->_tokenCreation($current_token_string, $current_token_string_length, $current_char_type_string, $offset);
240 0           $current_id = $self->getAnnotations->addToken($current_token);
241 0           $offset += $current_token_string_length;
242             }
243             # and create a new token string
244 0           $previous_char_type = $current_char_type;
245 0           $previous_char_type_string = $current_char_type_string;
246 0           $current_token_string = $character;
247 0           $current_token_string_length = 1;
248             }
249             }
250 0 0         if ($current_char_type != 4) {
251 0           $current_token = $self->_tokenCreation($current_token_string, $current_token_string_length, $previous_char_type_string, $offset);
252 0           $current_id = $self->getAnnotations->addToken($current_token);
253             }
254             } else {
255 0           $self->getAnnotations
256             }
257            
258 0           $self->getAnnotations->getSectionLevel->rebuildIndex();
259            
260 0           $self->getAnnotations->addLogProcessing(
261             Lingua::Ogmios::Annotations::LogProcessing->new(
262             { 'comments' => 'Found ' . $self->getAnnotations->getTokenLevel->getSize . ' tokens\n',
263             'list_modified_level' => ["token_level"],
264             }
265             )
266             );
267             # $self->getAnnotations->addLogProcessing(
268             # Lingua::Ogmios::Annotations::LogProcessing->new(
269             # { 'comments' => 'Found ' . $self->getAnnotations->getSectionLevel->getSize . ' sections\n',
270             # }
271             # )
272             # );
273 0           $self->getAnnotations->addLogProcessing(Lingua::Ogmios::Annotations::LogProcessing->new(
274             { 'software_name' => 'internal processing',
275             'comments' => 'Tokenisation. Can not be change\n',
276             'list_modified_level' => ["token_level"],
277             }));
278 0           warn "[LOG] Check merging identification of the end and start position (1)\n";
279             }
280              
281             sub tokenisation2 {
282 0     0 0   my ($self) = @_;
283              
284 0           warn "[LOG] Tokenisation2 (" . $self->getId . ")\n";
285              
286 0           my $canonicalDocument = $self->getAnnotations->getCanonicalDocument;
287              
288 0           my @characters = split //, $canonicalDocument;
289              
290 0           my $character;
291              
292 0           my $current_token_string = "";
293 0           my $current_token_string_length = 0;
294 0           my $current_token_type = 0;
295 0           my $current_char_type = 0;
296 0           my $current_char_type_string = "";
297 0           my $previous_char_type = 0;
298 0           my $previous_char_type_string = "";
299 0           my $offset = 0;
300              
301 0           my $current_id;
302             my $current_token;
303              
304 0           $self->getAnnotations->addLogProcessing(Lingua::Ogmios::Annotations::LogProcessing->new(
305             { 'software_name' => 'internal processing',
306             'comments' => 'Tokenisation. Can not be change\n',
307             }));
308 0           $character = $characters[0];
309 0           ($current_char_type, $current_char_type_string) = $self->_char_type_identification($character);
310 0           $current_token_string_length = 1;
311 0           $current_token_string = $character;
312 0           $previous_char_type = $current_char_type;
313 0           $previous_char_type_string = $current_char_type_string;
314              
315 0 0         if ($current_char_type == 4) {
316 0           $current_token = $self->_tokenCreation($character, 1, $current_char_type_string, $offset);
317 0           $current_id = $self->getAnnotations->addToken($current_token);
318 0           $current_token_string = "$character";
319 0           $current_token_string_length = 1;
320 0           $previous_char_type = $current_char_type;
321 0           $previous_char_type_string = $current_char_type_string;
322 0 0         if ($self->getAnnotations->getSectionLevel->existsElementFromIndex('from', $offset)) {
323 0           $self->getAnnotations->getSectionLevel->changeRefFromIndexField('from', $offset, $current_token);
324             }
325 0           $offset += $current_token_string_length;
326 0 0         if ($self->getAnnotations->getSectionLevel->existsElementFromIndex('to', $offset)) {
327 0           $self->getAnnotations->getSectionLevel->changeRefFromIndexField('to', $offset, $current_token);
328             }
329             }
330 0           my $i;
331 0           for($i=1;$i
332 0           $character = $characters[$i];
333             # identification of the type of the current character
334              
335 0           ($current_char_type, $current_char_type_string) = $self->_char_type_identification($character);
336              
337 0 0 0       if (($current_char_type == $previous_char_type) && ($current_char_type != 4) &&
      0        
      0        
338             (!($self->getAnnotations->getSectionLevel->existsElementFromIndex('from', $offset + $current_token_string_length ))) &&
339             (!($self->getAnnotations->getSectionLevel->existsElementFromIndex('to', $offset + $current_token_string_length )))) {
340 0           $current_token_string .= $character;
341 0           $current_token_string_length++;
342             } else {
343 0 0         if ($previous_char_type != 4) {
344 0           $current_token = $self->_tokenCreation($current_token_string, $current_token_string_length, $previous_char_type_string, $offset);
345 0           $current_id = $self->getAnnotations->addToken($current_token);
346 0 0         if ($self->getAnnotations->getSectionLevel->existsElementFromIndex('from', $offset)) {
347 0           $self->getAnnotations->getSectionLevel->changeRefFromIndexField('from', $offset, $current_token);
348             }
349 0 0         if ($self->getAnnotations->getSectionLevel->existsElementFromIndex('to', $offset + $current_token_string_length)) {
350 0           $self->getAnnotations->getSectionLevel->changeRefFromIndexField('to', $offset + $current_token_string_length, $current_token);
351             }
352 0           $offset += $current_token_string_length;
353             }
354 0 0         if ($current_char_type == 4) {
355 0           $current_token_string = $character;
356 0           $current_token_string_length = 1;
357 0           $current_token = $self->_tokenCreation($current_token_string, $current_token_string_length, $current_char_type_string, $offset);
358 0           $current_id = $self->getAnnotations->addToken($current_token);
359 0 0         if ($self->getAnnotations->getSectionLevel->existsElementFromIndex('from', $offset)) {
360 0           $self->getAnnotations->getSectionLevel->changeRefFromIndexField('from', $offset, $current_token);
361             }
362 0 0         if ($self->getAnnotations->getSectionLevel->existsElementFromIndex('to', $offset + $current_token_string_length)) {
363 0           $self->getAnnotations->getSectionLevel->changeRefFromIndexField('to', $offset + $current_token_string_length, $current_token);
364             }
365 0           $offset += $current_token_string_length;
366             }
367             # and create a new token string
368 0           $previous_char_type = $current_char_type;
369 0           $previous_char_type_string = $current_char_type_string;
370 0           $current_token_string = $character;
371 0           $current_token_string_length = 1;
372             }
373             }
374 0 0         if ($current_char_type != 4) {
375 0           $current_token = $self->_tokenCreation($current_token_string, $current_token_string_length, $previous_char_type_string, $offset);
376 0           $current_id = $self->getAnnotations->addToken($current_token);
377             }
378 0 0         if ($self->getAnnotations->getSectionLevel->existsElementFromIndex('from', $offset + $current_token_string_length)) {
379 0           $self->getAnnotations->getSectionLevel->changeRefFromIndexField('from', $offset + $current_token_string_length, $current_token);
380             }
381 0 0         if ($self->getAnnotations->getSectionLevel->existsElementFromIndex('to', $offset + $current_token_string_length)) {
382 0           $self->getAnnotations->getSectionLevel->changeRefFromIndexField('to', $offset + $current_token_string_length, $current_token);
383             }
384              
385 0           $self->getAnnotations->getSectionLevel->rebuildIndex();
386              
387 0 0         if ($self->getAnnotations->getTokenLevel->getSize == 0) {
388              
389 0           $self->getAnnotations->addLogProcessing(
390             Lingua::Ogmios::Annotations::LogProcessing->new(
391             { 'comments' => 'Found ' . $self->getAnnotations->getTokenLevel->getSize . ' tokens\n',
392             }
393             )
394             );
395 0           $self->getAnnotations->addLogProcessing(
396             Lingua::Ogmios::Annotations::LogProcessing->new(
397             { 'comments' => 'Found ' . $self->getAnnotations->getSectionLevel->getSize . ' sections\n',
398             }
399             )
400             );
401             }
402 0           warn "[LOG] Check merging identification of the end and start position (2)\n";
403             }
404              
405             sub computeSectionFromToken {
406 0     0 0   my ($self, $record_log) = @_;
407              
408 0           warn "[LOG] Compute Section Ref From Tokens (" . $self->getId . ")\n";
409              
410 0           my $token;
411 0           my $lasttoken = $self->getAnnotations->getTokenLevel->getLastElement;
412 0           my $section;
413 0           foreach $section (@{$self->getAnnotations->getSectionLevel->getElements}) {
  0            
414             # warn "check for section " . $section->getId . " (" . $section->getFrom . " - " . $section->getTo . ")\n";
415 0 0         if ($self->getAnnotations->getTokenLevel->existsElementFromIndex('from', $section->getFrom)) {
416 0           $token = $self->getAnnotations->getTokenLevel->getElementFromIndex('from', $section->getFrom)->[0];
417 0           $self->getAnnotations->getSectionLevel->changeRefFromIndexField('from', $token->getFrom, $token);
418             # } else {
419             # die "==> " . $section->getFrom . "\n";
420             }
421 0 0         if ($self->getAnnotations->getTokenLevel->existsElementFromIndex('to', $section->getTo)) {
422 0           $token = $self->getAnnotations->getTokenLevel->getElementFromIndex('to', $section->getTo)->[0];
423             #warn "in to\n";
424 0           $self->getAnnotations->getSectionLevel->changeRefFromIndexField('to', $token->getTo, $token);
425             } else {
426 0 0         if ($self->getAnnotations->getTokenLevel->existsElementFromIndex('from', $section->getTo)) {
427 0           $token = $self->getAnnotations->getTokenLevel->getElementFromIndex('from', $section->getTo)->[0];
428 0 0         if (defined $token->previous) {
429 0           $self->getAnnotations->getSectionLevel->changeRefFromIndexField('to', $token->getFrom, $token->previous);
430             } else {
431 0           $self->getAnnotations->getSectionLevel->changeRefFromIndexField('to', $token->getFrom, $token);
432             }
433 0           warn "ok\n";
434             } else {
435 0 0         if (ref($section->getTo) eq "Lingua::Ogmios::Annotations::Token") {
436             # $self->getAnnotations->getSectionLevel->addElementToIndex($section->getTo,'to');
437             } else {
438 0 0         if ($self->getAnnotations->getTokenLevel->existsElementFromIndex('to', $section->getTo - 1)) {
    0          
439 0           $token = $self->getAnnotations->getTokenLevel->getElementFromIndex('to', $section->getTo - 1)->[0];
440 0           $self->getAnnotations->getSectionLevel->changeRefFromIndexField('to', $section->getTo, $token);
441             # } elsif ($self->getAnnotations->getTokenLevel->existsElementFromIndex('to', $section->getTo + 1)) {
442             # $token = $self->getAnnotations->getTokenLevel->getElementFromIndex('to', $section->getTo + 1)->[0];
443             # $self->getAnnotations->getSectionLevel->changeRefFromIndexField('to', $section->getTo, $token);
444 0           warn "ok2\n";
445             } elsif ($self->getAnnotations->getTokenLevel->existsElementFromIndex('to', $section->getTo + 1)) {
446 0           $token = $self->getAnnotations->getTokenLevel->getElementFromIndex('to', $section->getTo + 1)->[0];
447 0           $self->getAnnotations->getSectionLevel->changeRefFromIndexField('to', $section->getTo, $token);
448             } else {
449 0           warn "set to last token\n";
450 0           $self->getAnnotations->getSectionLevel->changeRefFromIndexField('to', $section->getTo, $lasttoken);
451             # die "==> " . $section->getTo->getContent . "\n";
452             }
453             }
454             }
455             # } else {
456             # warn "not defined\n";
457             # }
458             }
459             # warn "Check for corrected section " . $section->getId . " (" . $section->getFrom . " - " . $section->getTo . ")\n";
460              
461             # warn "Last token " . $lasttoken->getId . " (" . $lasttoken->getFrom . " - " . $lasttoken->getTo . ")\n";
462             # warn "Check for corrected section " . $section->getId . " (" . $section->getFrom . " - " . $section->getTo . ")\n";
463             # warn ".\n";
464             }
465             # exit;
466             # warn "===\n";
467 0           $self->getAnnotations->getSectionLevel->rebuildIndex();
468             # warn "+++\n";
469 0 0         if ($record_log) {
470 0           $self->getAnnotations->addLogProcessing(
471             Lingua::Ogmios::Annotations::LogProcessing->new(
472             { 'comments' => 'Found ' . $self->getAnnotations->getSectionLevel->getSize . ' sections\n',
473             'list_modified_level' => ["section_level"],
474             }
475             )
476             );
477             }
478 0           warn "[LOG] Check merging identification of the end and start position (3)\n";
479             }
480              
481              
482             sub XMLout {
483 0     0 0   my ($self) = @_;
484              
485 0           my $str;
486             my $attr;
487              
488 0           $str = '
489 0           foreach $attr (@{$self->getAttributes}) {
  0            
490 0           $str .= " " . $attr->{'nodeName'} . '="' . $attr->{'value'} . '"';
491             }
492 0           $str .= ">\n";
493              
494 0           $str .= $self->getAnnotations->XMLout;
495 0           $str .= " \n";
496              
497 0           return($str);
498             }
499              
500              
501             1;
502              
503             __END__