File Coverage

blib/lib/Lingua/Ogmios/DocumentRecord.pm

Criterion	Covered	Total	%
statement	12	256	4.6
branch	0	76	0.0
condition	0	24	0.0
subroutine	4	18	22.2
pod	0	11	0.0
total	16	385	4.1

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::Ogmios::DocumentRecord;
2
3	16			16		74	use strict;
	16					26
	16					340
4	16			16		69	use warnings;
	16					25
	16					358
5
6							# use XML::Simple;
7	16			16		74	use Lingua::Ogmios::Annotations;
	16					36
	16					333
8	16			16		8147	use Lingua::Ogmios::Annotations::LogProcessing;
	16					38
	16					42642
9
10							my $debug_devel_level = 0;
11
12							sub new {
13	0			0	0		my ($class, $document_record, $platformConfig) = @_;
14
15	0						my $doc = {
16							'annotations' => Lingua::Ogmios::Annotations->new($platformConfig),
17							'id' => undef,
18							'attributes' => [],
19							};
20
21	0						bless $doc, $class;
22
23							# Parsing the file and loading into the structures
24	0						$doc->setId($document_record);
25	0						$doc->_parse($document_record, $platformConfig);
26							# Making the indexes
27
28	0						return($doc);
29							}
30
31							sub setId {
32	0			0	0		my ($self, $document_record) = @_;
33
34	0						my $id;
35
36	0	0					if (UNIVERSAL::isa($document_record, 'XML::LibXML::Element')) {
37	0						$id = $document_record->getAttribute("id");
38							} else {
39	0						$id = $document_record;
40							}
41
42	0	0					if (defined($id)) {
43	0						$self->getAnnotations->{'id'} = $id;
44							}
45							else {
46	0						warn "No id for record for doc " . ($self->getCount + 1) . "\n";
47							}
48							}
49
50							sub getId {
51	0			0	0		my ($self) = @_;
52
53	0						return($self->getAnnotations->{'id'});
54							}
55
56							sub setAttributes {
57	0			0	0		my ($self, $attributes) = @_;
58	0						my $attr;
59
60	0						foreach $attr (@$attributes) {
61	0						push @{$self->{'attributes'}}, {'nodeName' => $attr->nodeName,
	0
62							'value' => $attr->value,
63							};
64							}
65							}
66
67							sub getAttributes {
68	0			0	0		my ($self) = @_;
69
70	0						return($self->{'attributes'});
71							}
72
73							sub _parse {
74	0			0			my ($self, $document_record, $platformConfig) = @_;
75
76	0						my $lingAnalysisLoad = $platformConfig->linguisticAnnotationLoading;
77
78	0						warn "Processing document " . $self->getId . "\n";
79
80	0						my @attr = $document_record->attributes;
81	0						$self->setAttributes(\@attr);
82
83	0						my $acquisition_section_node = $document_record->getChildrenByTagName('acquisition')->get_node(1);
84							# my $acquisition_section_node;
85	0	0					if (defined $acquisition_section_node) {
86							# $acquisition_section_node = $acquisition_section_node_orig->cloneNode(1);
87	0						$self->getAnnotations->setAcquisitionSection($acquisition_section_node);
88	0						$self->getAnnotations->setLanguageFromXMLAndProperties($acquisition_section_node);
89	0	0					if (!defined($self->getAnnotations->getLanguage)) {
90	0						$self->getAnnotations->setLanguage(uc($platformConfig->getOgmiosDefaultLanguage));
91	0						print STDERR $self->getAnnotations->getLanguage . "\n";
92							}
93	0						$self->getAnnotations->setURLs($acquisition_section_node);
94	0						$self->getAnnotations->loadCanonicalDocument($acquisition_section_node);
95							} else {
96	0						die "no acquisition node\n";
97							}
98							# for $document_record ($document_record->getChildrenByTagName('linguisticAnalysis')) {
99							# TODO
100	0	0	0				if ((defined $lingAnalysisLoad) && ($lingAnalysisLoad == 1) && (defined $document_record->getChildrenByTagName('linguisticAnalysis')->get_node(1))) {
			0
101	0						$self->getAnnotations->loadLinguisticAnalysis($document_record->getChildrenByTagName('linguisticAnalysis')->get_node(1));
102							}
103
104	0						my $relevance_section_node = $document_record->getChildrenByTagName('relevance')->get_node(1);
105	0	0					if (defined $relevance_section_node) {
106	0						$self->getAnnotations->setRelevanceSection($relevance_section_node);
107							}
108							}
109
110
111							sub setAnnotations {
112	0			0	0		my ($self) = @_;
113	0						$self->{'annotations'} = undef;
114							}
115
116							sub getAnnotations {
117	0			0	0		my ($self) = @_;
118
119	0						return($self->{'annotations'});
120							}
121
122							sub _char_type_identification {
123	0			0			my ($self, $character) = @_;
124
125
126							# Definition of the character types
127	0						my $alpha="[A-Za-z\x{C0}-\x{D6}\x{D8}-\x{F6}\x{F8}-\x{FF}\x{0400}-\x{0482}\x{048A}-\x{04FF}]";
128	0						my $num="[0-9]";
129	0						my $sep="[ \\s\\t\\n\\r]";
130
131	0						my $current_char_type;
132							my $current_char_type_string;
133
134							# default is symbol
135	0						$current_char_type = 4;$current_char_type_string = "symb";
	0
136
137							# print STDERR "$character\n";
138
139	0	0					if($character=~/$alpha/o){$current_char_type = 1;$current_char_type_string = "alpha";};
	0
	0
140	0	0					if($character=~/$num/o){$current_char_type = 2;$current_char_type_string = "num";};
	0
	0
141	0	0					if($character=~/$sep/o){$current_char_type = 3;$current_char_type_string = "sep";};
	0
	0
142
143	0						return($current_char_type, $current_char_type_string);
144							}
145
146							sub _tokenCreation {
147	0			0			my ($self,$current_token_string, $current_token_string_length, $previous_char_type_string, $offset) = @_;
148
149							# code for the character types
150							# 1: alphabetic
151							# 2: numeric
152							# 3: separator
153							# 4: symbol
154							# 0 : not defined
155							# warn "add token: $current_token_string\n";
156
157
158							# warn "Creation of new token\n";
159	0						my $token = Lingua::Ogmios::Annotations::Token->new(
160							{'content' => $current_token_string,
161							'type' => $previous_char_type_string,
162							'from' => $offset,
163							'to' => $offset + $current_token_string_length - 1,
164							});
165							# $token->print;
166	0						return($token);
167							}
168
169							sub tokenisation {
170	0			0	0		my ($self) = @_;
171
172	0	0					if ($self->getAnnotations->getTokenLevel->getSize != 0) {
173	0						warn "tokens exist - no tokenisation required\n";
174	0						return(0);
175							}
176
177	0						warn "[LOG] Tokenisation (" . $self->getId . ")\n";
178
179	0						my $canonicalDocument = $self->getAnnotations->getCanonicalDocument;
180
181							# warn $self->getId . "\n";
182							# warn $canonicalDocument . "\n";
183
184	0						my @characters = split //, $canonicalDocument;
185
186	0	0					if (scalar @characters) {
187
188	0						my $character;
189
190	0						my $current_token_string = "";
191	0						my $current_token_string_length = 0;
192	0						my $current_token_type = 0;
193	0						my $current_char_type = 0;
194	0						my $current_char_type_string = "";
195	0						my $previous_char_type = 0;
196	0						my $previous_char_type_string = "";
197	0						my $offset = 0;
198
199	0						my $current_id;
200							my $current_token;
201
202	0						$character = $characters[0];
203	0						($current_char_type, $current_char_type_string) = $self->_char_type_identification($character);
204	0						$current_token_string_length = 1;
205	0						$current_token_string = $character;
206	0						$previous_char_type = $current_char_type;
207	0						$previous_char_type_string = $current_char_type_string;
208
209	0	0					if ($current_char_type == 4) {
210	0						$current_token = $self->_tokenCreation($character, 1, $current_char_type_string, $offset);
211	0						$current_id = $self->getAnnotations->addToken($current_token);
212	0						$current_token_string = "$character";
213	0						$current_token_string_length = 1;
214	0						$previous_char_type = $current_char_type;
215	0						$previous_char_type_string = $current_char_type_string;
216	0						$offset += $current_token_string_length;
217							}
218	0						my $i;
219	0						for($i=1;$i
220	0						$character = $characters[$i];
221							# identification of the type of the current character
222
223	0						($current_char_type, $current_char_type_string) = $self->_char_type_identification($character);
224
225	0	0	0				if (($current_char_type == $previous_char_type) && ($current_char_type != 4) &&
			0
			0
226							(!($self->getAnnotations->getSectionLevel->existsElementFromIndex('from', $offset + $current_token_string_length ))) &&
227							(!($self->getAnnotations->getSectionLevel->existsElementFromIndex('to', $offset + $current_token_string_length - 1)))) {
228	0						$current_token_string .= $character;
229	0						$current_token_string_length++;
230							} else {
231	0	0					if ($previous_char_type != 4) {
232	0						$current_token = $self->_tokenCreation($current_token_string, $current_token_string_length, $previous_char_type_string, $offset);
233	0						$current_id = $self->getAnnotations->addToken($current_token);
234	0						$offset += $current_token_string_length;
235							}
236	0	0					if ($current_char_type == 4) {
237	0						$current_token_string = $character;
238	0						$current_token_string_length = 1;
239	0						$current_token = $self->_tokenCreation($current_token_string, $current_token_string_length, $current_char_type_string, $offset);
240	0						$current_id = $self->getAnnotations->addToken($current_token);
241	0						$offset += $current_token_string_length;
242							}
243							# and create a new token string
244	0						$previous_char_type = $current_char_type;
245	0						$previous_char_type_string = $current_char_type_string;
246	0						$current_token_string = $character;
247	0						$current_token_string_length = 1;
248							}
249							}
250	0	0					if ($current_char_type != 4) {
251	0						$current_token = $self->_tokenCreation($current_token_string, $current_token_string_length, $previous_char_type_string, $offset);
252	0						$current_id = $self->getAnnotations->addToken($current_token);
253							}
254							} else {
255	0						$self->getAnnotations
256							}
257
258	0						$self->getAnnotations->getSectionLevel->rebuildIndex();
259
260	0						$self->getAnnotations->addLogProcessing(
261							Lingua::Ogmios::Annotations::LogProcessing->new(
262							{ 'comments' => 'Found ' . $self->getAnnotations->getTokenLevel->getSize . ' tokens\n',
263							'list_modified_level' => ["token_level"],
264							}
265							)
266							);
267							# $self->getAnnotations->addLogProcessing(
268							# Lingua::Ogmios::Annotations::LogProcessing->new(
269							# { 'comments' => 'Found ' . $self->getAnnotations->getSectionLevel->getSize . ' sections\n',
270							# }
271							# )
272							# );
273	0						$self->getAnnotations->addLogProcessing(Lingua::Ogmios::Annotations::LogProcessing->new(
274							{ 'software_name' => 'internal processing',
275							'comments' => 'Tokenisation. Can not be change\n',
276							'list_modified_level' => ["token_level"],
277							}));
278	0						warn "[LOG] Check merging identification of the end and start position (1)\n";
279							}
280
281							sub tokenisation2 {
282	0			0	0		my ($self) = @_;
283
284	0						warn "[LOG] Tokenisation2 (" . $self->getId . ")\n";
285
286	0						my $canonicalDocument = $self->getAnnotations->getCanonicalDocument;
287
288	0						my @characters = split //, $canonicalDocument;
289
290	0						my $character;
291
292	0						my $current_token_string = "";
293	0						my $current_token_string_length = 0;
294	0						my $current_token_type = 0;
295	0						my $current_char_type = 0;
296	0						my $current_char_type_string = "";
297	0						my $previous_char_type = 0;
298	0						my $previous_char_type_string = "";
299	0						my $offset = 0;
300
301	0						my $current_id;
302							my $current_token;
303
304	0						$self->getAnnotations->addLogProcessing(Lingua::Ogmios::Annotations::LogProcessing->new(
305							{ 'software_name' => 'internal processing',
306							'comments' => 'Tokenisation. Can not be change\n',
307							}));
308	0						$character = $characters[0];
309	0						($current_char_type, $current_char_type_string) = $self->_char_type_identification($character);
310	0						$current_token_string_length = 1;
311	0						$current_token_string = $character;
312	0						$previous_char_type = $current_char_type;
313	0						$previous_char_type_string = $current_char_type_string;
314
315	0	0					if ($current_char_type == 4) {
316	0						$current_token = $self->_tokenCreation($character, 1, $current_char_type_string, $offset);
317	0						$current_id = $self->getAnnotations->addToken($current_token);
318	0						$current_token_string = "$character";
319	0						$current_token_string_length = 1;
320	0						$previous_char_type = $current_char_type;
321	0						$previous_char_type_string = $current_char_type_string;
322	0	0					if ($self->getAnnotations->getSectionLevel->existsElementFromIndex('from', $offset)) {
323	0						$self->getAnnotations->getSectionLevel->changeRefFromIndexField('from', $offset, $current_token);
324							}
325	0						$offset += $current_token_string_length;
326	0	0					if ($self->getAnnotations->getSectionLevel->existsElementFromIndex('to', $offset)) {
327	0						$self->getAnnotations->getSectionLevel->changeRefFromIndexField('to', $offset, $current_token);
328							}
329							}
330	0						my $i;
331	0						for($i=1;$i
332	0						$character = $characters[$i];
333							# identification of the type of the current character
334
335	0						($current_char_type, $current_char_type_string) = $self->_char_type_identification($character);
336
337	0	0	0				if (($current_char_type == $previous_char_type) && ($current_char_type != 4) &&
			0
			0
338							(!($self->getAnnotations->getSectionLevel->existsElementFromIndex('from', $offset + $current_token_string_length ))) &&
339							(!($self->getAnnotations->getSectionLevel->existsElementFromIndex('to', $offset + $current_token_string_length )))) {
340	0						$current_token_string .= $character;
341	0						$current_token_string_length++;
342							} else {
343	0	0					if ($previous_char_type != 4) {
344	0						$current_token = $self->_tokenCreation($current_token_string, $current_token_string_length, $previous_char_type_string, $offset);
345	0						$current_id = $self->getAnnotations->addToken($current_token);
346	0	0					if ($self->getAnnotations->getSectionLevel->existsElementFromIndex('from', $offset)) {
347	0						$self->getAnnotations->getSectionLevel->changeRefFromIndexField('from', $offset, $current_token);
348							}
349	0	0					if ($self->getAnnotations->getSectionLevel->existsElementFromIndex('to', $offset + $current_token_string_length)) {
350	0						$self->getAnnotations->getSectionLevel->changeRefFromIndexField('to', $offset + $current_token_string_length, $current_token);
351							}
352	0						$offset += $current_token_string_length;
353							}
354	0	0					if ($current_char_type == 4) {
355	0						$current_token_string = $character;
356	0						$current_token_string_length = 1;
357	0						$current_token = $self->_tokenCreation($current_token_string, $current_token_string_length, $current_char_type_string, $offset);
358	0						$current_id = $self->getAnnotations->addToken($current_token);
359	0	0					if ($self->getAnnotations->getSectionLevel->existsElementFromIndex('from', $offset)) {
360	0						$self->getAnnotations->getSectionLevel->changeRefFromIndexField('from', $offset, $current_token);
361							}
362	0	0					if ($self->getAnnotations->getSectionLevel->existsElementFromIndex('to', $offset + $current_token_string_length)) {
363	0						$self->getAnnotations->getSectionLevel->changeRefFromIndexField('to', $offset + $current_token_string_length, $current_token);
364							}
365	0						$offset += $current_token_string_length;
366							}
367							# and create a new token string
368	0						$previous_char_type = $current_char_type;
369	0						$previous_char_type_string = $current_char_type_string;
370	0						$current_token_string = $character;
371	0						$current_token_string_length = 1;
372							}
373							}
374	0	0					if ($current_char_type != 4) {
375	0						$current_token = $self->_tokenCreation($current_token_string, $current_token_string_length, $previous_char_type_string, $offset);
376	0						$current_id = $self->getAnnotations->addToken($current_token);
377							}
378	0	0					if ($self->getAnnotations->getSectionLevel->existsElementFromIndex('from', $offset + $current_token_string_length)) {
379	0						$self->getAnnotations->getSectionLevel->changeRefFromIndexField('from', $offset + $current_token_string_length, $current_token);
380							}
381	0	0					if ($self->getAnnotations->getSectionLevel->existsElementFromIndex('to', $offset + $current_token_string_length)) {
382	0						$self->getAnnotations->getSectionLevel->changeRefFromIndexField('to', $offset + $current_token_string_length, $current_token);
383							}
384
385	0						$self->getAnnotations->getSectionLevel->rebuildIndex();
386
387	0	0					if ($self->getAnnotations->getTokenLevel->getSize == 0) {
388
389	0						$self->getAnnotations->addLogProcessing(
390							Lingua::Ogmios::Annotations::LogProcessing->new(
391							{ 'comments' => 'Found ' . $self->getAnnotations->getTokenLevel->getSize . ' tokens\n',
392							}
393							)
394							);
395	0						$self->getAnnotations->addLogProcessing(
396							Lingua::Ogmios::Annotations::LogProcessing->new(
397							{ 'comments' => 'Found ' . $self->getAnnotations->getSectionLevel->getSize . ' sections\n',
398							}
399							)
400							);
401							}
402	0						warn "[LOG] Check merging identification of the end and start position (2)\n";
403							}
404
405							sub computeSectionFromToken {
406	0			0	0		my ($self, $record_log) = @_;
407
408	0						warn "[LOG] Compute Section Ref From Tokens (" . $self->getId . ")\n";
409
410	0						my $token;
411	0						my $lasttoken = $self->getAnnotations->getTokenLevel->getLastElement;
412	0						my $section;
413	0						foreach $section (@{$self->getAnnotations->getSectionLevel->getElements}) {
	0
414							# warn "check for section " . $section->getId . " (" . $section->getFrom . " - " . $section->getTo . ")\n";
415	0	0					if ($self->getAnnotations->getTokenLevel->existsElementFromIndex('from', $section->getFrom)) {
416	0						$token = $self->getAnnotations->getTokenLevel->getElementFromIndex('from', $section->getFrom)->[0];
417	0						$self->getAnnotations->getSectionLevel->changeRefFromIndexField('from', $token->getFrom, $token);
418							# } else {
419							# die "==> " . $section->getFrom . "\n";
420							}
421	0	0					if ($self->getAnnotations->getTokenLevel->existsElementFromIndex('to', $section->getTo)) {
422	0						$token = $self->getAnnotations->getTokenLevel->getElementFromIndex('to', $section->getTo)->[0];
423							#warn "in to\n";
424	0						$self->getAnnotations->getSectionLevel->changeRefFromIndexField('to', $token->getTo, $token);
425							} else {
426	0	0					if ($self->getAnnotations->getTokenLevel->existsElementFromIndex('from', $section->getTo)) {
427	0						$token = $self->getAnnotations->getTokenLevel->getElementFromIndex('from', $section->getTo)->[0];
428	0	0					if (defined $token->previous) {
429	0						$self->getAnnotations->getSectionLevel->changeRefFromIndexField('to', $token->getFrom, $token->previous);
430							} else {
431	0						$self->getAnnotations->getSectionLevel->changeRefFromIndexField('to', $token->getFrom, $token);
432							}
433	0						warn "ok\n";
434							} else {
435	0	0					if (ref($section->getTo) eq "Lingua::Ogmios::Annotations::Token") {
436							# $self->getAnnotations->getSectionLevel->addElementToIndex($section->getTo,'to');
437							} else {
438	0	0					if ($self->getAnnotations->getTokenLevel->existsElementFromIndex('to', $section->getTo - 1)) {
		0
439	0						$token = $self->getAnnotations->getTokenLevel->getElementFromIndex('to', $section->getTo - 1)->[0];
440	0						$self->getAnnotations->getSectionLevel->changeRefFromIndexField('to', $section->getTo, $token);
441							# } elsif ($self->getAnnotations->getTokenLevel->existsElementFromIndex('to', $section->getTo + 1)) {
442							# $token = $self->getAnnotations->getTokenLevel->getElementFromIndex('to', $section->getTo + 1)->[0];
443							# $self->getAnnotations->getSectionLevel->changeRefFromIndexField('to', $section->getTo, $token);
444	0						warn "ok2\n";
445							} elsif ($self->getAnnotations->getTokenLevel->existsElementFromIndex('to', $section->getTo + 1)) {
446	0						$token = $self->getAnnotations->getTokenLevel->getElementFromIndex('to', $section->getTo + 1)->[0];
447	0						$self->getAnnotations->getSectionLevel->changeRefFromIndexField('to', $section->getTo, $token);
448							} else {
449	0						warn "set to last token\n";
450	0						$self->getAnnotations->getSectionLevel->changeRefFromIndexField('to', $section->getTo, $lasttoken);
451							# die "==> " . $section->getTo->getContent . "\n";
452							}
453							}
454							}
455							# } else {
456							# warn "not defined\n";
457							# }
458							}
459							# warn "Check for corrected section " . $section->getId . " (" . $section->getFrom . " - " . $section->getTo . ")\n";
460
461							# warn "Last token " . $lasttoken->getId . " (" . $lasttoken->getFrom . " - " . $lasttoken->getTo . ")\n";
462							# warn "Check for corrected section " . $section->getId . " (" . $section->getFrom . " - " . $section->getTo . ")\n";
463							# warn ".\n";
464							}
465							# exit;
466							# warn "===\n";
467	0						$self->getAnnotations->getSectionLevel->rebuildIndex();
468							# warn "+++\n";
469	0	0					if ($record_log) {
470	0						$self->getAnnotations->addLogProcessing(
471							Lingua::Ogmios::Annotations::LogProcessing->new(
472							{ 'comments' => 'Found ' . $self->getAnnotations->getSectionLevel->getSize . ' sections\n',
473							'list_modified_level' => ["section_level"],
474							}
475							)
476							);
477							}
478	0						warn "[LOG] Check merging identification of the end and start position (3)\n";
479							}
480
481
482							sub XMLout {
483	0			0	0		my ($self) = @_;
484
485	0						my $str;
486							my $attr;
487
488	0						$str = '
489	0						foreach $attr (@{$self->getAttributes}) {
	0
490	0						$str .= " " . $attr->{'nodeName'} . '="' . $attr->{'value'} . '"';
491							}
492	0						$str .= ">\n";
493
494	0						$str .= $self->getAnnotations->XMLout;
495	0						$str .= " \n";
496
497	0						return($str);
498							}
499
500
501							1;
502
503							__END__