File Coverage

blib/lib/Alvis/NLPPlatform/Annotation.pm

Criterion	Covered	Total	%
statement	7	9	77.7
branch			n/a
condition			n/a
subroutine	3	3	100.0
pod			n/a
total	10	12	83.3

line	stmt	sub	time	code
1				#!/usr/bin/perl -w
2
3
4				###
5				### Package Annotation
6				###
7				### Last updated:
8				### Thursday, August 31st, 2006
9				### Julien DERIVIERE, Thierry Hamon
10				### e-mail: julien.deriviere@lipn.univ-paris13.fr, thierry.hamon@lipn.univ-paris13.fr
11
12				package Alvis::NLPPlatform::Annotation;
13
14	3	3	15	use strict;
	3		3
	3		92
15	3	3	14	use warnings;
	3		5
	3		74
16
17	3	3	1586	use Alvis::NLPPlatform::MyReceiver;
	0
	0
18				use Time::HiRes qw(gettimeofday tv_interval);
19
20				use Encode;
21
22				our $VERSION=$Alvis::NLPPlatform::VERSION;
23
24				our $xmlhead = "\n\n";
25				our $xmlfoot = "\n";
26
27
28				our $phrase_idx;
29				our $syntactic_relation_idx;
30
31				our $document_record_head;
32				our $document_record_id;
33				our $canonicalDocument;
34				our $acquisitionData;
35				our $originalDocument;
36				our $metaData;
37				our $links;
38				our $analysis;
39				our $relevance;
40				our $documenturl;
41				our $ALVISLANGUAGE;
42				our $nb_max_tokens;
43
44				my $is_in_canonical;
45				my $is_in_acquisition;
46				my $is_in_original;
47				my $is_in_meta;
48				my $is_in_links;
49				my $is_in_analysis;
50				my $is_in_relevance;
51
52
53				my $header="";
54
55				my $end_layer;
56
57				# Only for sorting xml id
58
59
60				sub read_key_id{
61				my $id=$_[0];
62				$id=~m/^(token\|word)([0-9]+)/o;
63				return $2;
64				}
65
66				sub sort_keys {
67
68				my $key1;
69				my $key2;
70				$a=~m/^(token\|word\|lemma\|phrase\|morphosyntactic_features\|sentence\|semantic_unit\|syntactic_relation\|log_processing\|semantic_features)([0-9]+)/;
71				$key1=$2;
72				$b=~m/^(token\|word\|lemma\|phrase\|morphosyntactic_features\|sentence\|semantic_unit\|syntactic_relation\|log_processing\|semantic_features)([0-9]+)/;
73				$key2=$2;
74				return -1 if ($key1 < $key2);
75				return 0 if ($key1 == $key2);
76				return 1 if ($key1 > $key2);
77
78				}
79
80
81				sub sort{
82				my ($ref_hashtable) = @_;
83
84
85				return(sort sort_keys keys %$ref_hashtable);
86
87
88				}
89
90				sub sort_keys_lex{
91
92				my $key1;
93				my $key2;
94
95
96				my $type1;
97				my $type2;
98				if ($a=~m/^(token\|word\|lemma\|phrase\|morphosyntactic_features\|sentence\|semantic_unit\|syntactic_relation\|log_processing)([0-9]+)/) {
99				$type1 = $1;
100				$key1=$2;
101				if ($b=~m/^(token\|word\|lemma\|phrase\|morphosyntactic_features\|sentence\|semantic_unit\|syntactic_relation\|log_processing)([0-9]+)/) {
102				$type2 = $1;
103				$key2=$2;
104				if ($type1 eq $type2) {
105				return ($key1 <=> $key2);
106				# return $a cmp $b;
107				} else {
108				return $a cmp $b;
109				}
110				} else {
111				return $a cmp $b;
112				}
113				} else {
114				return $a cmp $b;
115				}
116
117				}
118
119
120				sub render{
121				my $annot=$_[0];
122				my $descriptor = $_[1];
123
124
125				my $markup_name = "";
126				my $key = "";
127				my $content = "";
128				my $tmp = "";
129				my $element = "";
130				my $was_list_tab = "";
131				my $was_list_hash = "";
132				my $list_of = "";
133				my $index = "";
134
135				my $layer_tokens=0;
136				my $layer_words=0;
137				my $layer_sentences=0;
138				my $layer_tag=0;
139				my $layer_lemma=0;
140				my $layer_semantic=0;
141				my $layer_semantic_features=0;
142				my $layer_phrase=0;
143				my $layer_log=0;
144				my $layer_parsing=0;
145
146				my $indent=" "; # for indent
147
148
149
150				# print STDERR "--\n";
151				# Hash table scaning
152				foreach $key(sort sort_keys_lex keys %$annot){
153				# print STDERR "$key\n";
154
155				# my $hash_key = $annot->{"$key"};
156
157				# Get the key content (it is a reference)
158				$content=$annot->{$key};
159
160				if(ref($content) eq "HASH"){
161				# It's a reference to a hash table
162				$markup_name=$content->{"datatype"};
163
164
165				# Print in the descriptor the information related to the level
166				if(($markup_name=~/^token/) && ($layer_tokens==0)){
167				if($end_layer ne ""){
168				print_Annotation($descriptor, "$indent$end_layer");
169				}
170				$layer_tokens=1;
171				$end_layer="\n";
172				print_Annotation($descriptor, "\n");
173				print_Annotation($descriptor, "$indent\n");
174				print_Annotation($descriptor, "$indent\n");
175				print_Annotation($descriptor, "$indent\n");
176				print_Annotation($descriptor, "\n");
177				print_Annotation($descriptor, "$indent\n");
178				}
179
180				if(($markup_name=~/^word/) && ($layer_words==0)){
181				if($end_layer ne ""){
182				print_Annotation($descriptor, "$indent$end_layer");
183				}
184				$layer_words=1;
185				$end_layer="\n";
186				print_Annotation($descriptor, "\n");
187				print_Annotation($descriptor, "$indent\n");
188				print_Annotation($descriptor, "$indent\n");
189				print_Annotation($descriptor, "$indent\n");
190				print_Annotation($descriptor, "\n");
191				print_Annotation($descriptor, "$indent\n");
192				}
193
194				if(($markup_name=~/^sentence/) && ($layer_sentences==0)){
195				if($end_layer ne ""){
196				print_Annotation($descriptor, "$indent$end_layer");
197				}
198				$layer_sentences=1;
199				$end_layer="\n";
200				print_Annotation($descriptor, "\n");
201				print_Annotation($descriptor, "$indent\n");
202				print_Annotation($descriptor, "$indent\n");
203				print_Annotation($descriptor, "$indent\n");
204				print_Annotation($descriptor, "\n");
205				print_Annotation($descriptor, "$indent\n");
206				}
207
208				if(($markup_name=~/^morphosyntactic/) && ($layer_tag==0)){
209				if($end_layer ne ""){
210				print_Annotation($descriptor, "$indent$end_layer");
211				}
212				$layer_tag=1;
213				$end_layer="\n";
214				print_Annotation($descriptor, "\n");
215				print_Annotation($descriptor, "$indent\n");
216				print_Annotation($descriptor, "$indent\n");
217				print_Annotation($descriptor, "$indent\n");
218				print_Annotation($descriptor, "\n");
219				print_Annotation($descriptor, "$indent\n");
220				}
221
222				if(($markup_name=~/^syntactic/) && ($layer_parsing==0)){
223				if($end_layer ne ""){
224				print_Annotation($descriptor, "$indent$end_layer");
225				}
226				$layer_parsing=1;
227				$end_layer="\n";
228				print_Annotation($descriptor, "\n");
229				print_Annotation($descriptor, "$indent\n");
230				print_Annotation($descriptor, "$indent\n");
231				print_Annotation($descriptor, "$indent\n");
232				print_Annotation($descriptor, "\n");
233				print_Annotation($descriptor, "$indent\n");
234				}
235
236				if(($markup_name=~/^lemma/) && ($layer_lemma==0)){
237				if($end_layer ne ""){
238				print_Annotation($descriptor, "$indent$end_layer");
239				}
240				$layer_lemma=1;
241				$end_layer="\n";
242				print_Annotation($descriptor, "\n");
243				print_Annotation($descriptor, "$indent\n");
244				print_Annotation($descriptor, "$indent\n");
245				print_Annotation($descriptor, "$indent\n");
246				print_Annotation($descriptor, "\n");
247				print_Annotation($descriptor, "$indent\n");
248				}
249
250				if(($markup_name=~/^semantic_unit/) && ($layer_semantic==0)){
251				if($end_layer ne ""){
252				print_Annotation($descriptor, "$indent$end_layer");
253				}
254				$layer_semantic=1;
255				$end_layer="\n";
256				print_Annotation($descriptor, "\n");
257				print_Annotation($descriptor, "$indent\n");
258				print_Annotation($descriptor, "$indent\n");
259				print_Annotation($descriptor, "$indent\n");
260				print_Annotation($descriptor, "\n");
261				print_Annotation($descriptor, "$indent\n");
262				}
263
264				if(($markup_name=~/^semantic_features/) && ($layer_semantic_features==0)){
265				if($end_layer ne ""){
266				print_Annotation($descriptor, "$indent$end_layer");
267				}
268				$layer_semantic_features=1;
269				$end_layer="\n";
270				print_Annotation($descriptor, "\n");
271				print_Annotation($descriptor, "$indent\n");
272				print_Annotation($descriptor, "$indent\n");
273				print_Annotation($descriptor, "$indent\n");
274				print_Annotation($descriptor, "\n");
275				print_Annotation($descriptor, "$indent\n");
276				}
277
278				if(($markup_name=~/^phrase/) && ($layer_phrase==0)){
279				if($end_layer ne ""){
280				print_Annotation($descriptor, "$indent$end_layer");
281				}
282				$layer_phrase=1;
283				$end_layer="\n";
284				print_Annotation($descriptor, "\n");
285				print_Annotation($descriptor, "$indent\n");
286				print_Annotation($descriptor, "$indent\n");
287				print_Annotation($descriptor, "$indent\n");
288				print_Annotation($descriptor, "\n");
289				print_Annotation($descriptor, "$indent\n");
290				}
291
292				if(($markup_name=~/^log/) && ($layer_log==0)){
293				if($end_layer ne ""){
294				print_Annotation($descriptor, "$indent$end_layer");
295				}
296				$layer_log=1;
297				$end_layer="\n";
298				print_Annotation($descriptor, "\n");
299				print_Annotation($descriptor, "$indent\n");
300				print_Annotation($descriptor, "$indent\n");
301				print_Annotation($descriptor, "$indent\n");
302				print_Annotation($descriptor, "\n");
303				print_Annotation($descriptor, "$indent\n");
304				}
305
306				# Recursively scan the the mark
307				print_Annotation($descriptor, "$indent<$markup_name>\n");
308				$tmp=$indent;
309				$indent=$indent." ";
310				render($content, $descriptor);
311				$indent=$tmp;
312				print_Annotation($descriptor, "$indent\n");
313				}
314				if(ref($content) eq "ARRAY"){
315				# It's a reference to an array
316				print_Annotation($descriptor, "$indent<$key>");
317				$was_list_tab=0;
318				$was_list_hash=0;
319				$index=0;
320
321				# Scan the elements of the array
322				foreach $element (sort sort_keys_lex @$content){ ###########
323				$index++;
324
325				if(ref($element) eq "HASH"){
326				# the array contains has table references
327
328				# Add a carriage return but ignore rendondante ones
329				if(!$was_list_hash){
330				print_Annotation($descriptor, "\n");
331				}
332				# list mark
333				if(($key=~/^list\-(.+) *\n?/)){
334				if(!$was_list_hash){
335				$list_of=$1;
336				print_Annotation($descriptor, "$indent <$list_of>\n");
337				$was_list_hash=1;
338				}else{
339				print_Annotation($descriptor, "$indent \n");
340				print_Annotation($descriptor, "$indent <$list_of>\n");
341				}
342				}
343
344				# Recursive call since it is a hash table reference
345				$tmp=$indent;
346				$indent=$indent." ";
347				render($element, $descriptor);
348				$indent=$tmp;
349				}
350
351				# Process the element of the list
352				if(!ref($element)){
353
354				if($key=~/^list\-(.+) *\n?/){
355				# mark "list-"
356				$element =~ s/\\n/\n/g;
357				$element =~ s/\\r/\r/g;
358				$element =~ s/\\t/\t/g;
359				print_Annotation($descriptor, "\n$indent <$1>$element");
360				$was_list_tab=1;
361				}else{
362				# mark which is not of the type "list-"
363				print_Annotation($descriptor, $element);
364				if($index
365				print_Annotation($descriptor, "\n$indent<$key>");
366				}
367				}
368				}
369				}
370
371				if($was_list_tab){
372				print_Annotation($descriptor, "\n$indent");
373				}
374				if(($key=~/^list\-(.+) *\n?/) && ($was_list_hash)){
375				print_Annotation($descriptor, "$indent \n$indent");
376				}
377				print_Annotation($descriptor, "\n");
378				}
379
380				if(!ref($content)){
381				# Don't print the datatype field
382				# as it's only an internal use
383				# for generate mark name
384				if($key ne "datatype"){
385				Alvis::NLPPlatform::XMLEntities::encode($content);
386				$content =~ s/\\n/\n/g;
387				$content =~ s/\\r/\r/g;
388				$content =~ s/\\t/\t/g;
389				print_Annotation($descriptor, "$indent<$key>$content\n");
390				}
391				}
392				}
393				# print STDERR "++\n";
394				return(0);
395				}
396
397				sub print_documentCollectionHeader {
398				my $descriptor = $_[0];
399
400				print_Annotation($descriptor, "\n");
401				print_Annotation($descriptor, "\n");
402				}
403
404				sub print_documentCollectionFooter {
405				my $descriptor = $_[0];
406
407				print_Annotation($descriptor, "\n");
408				}
409
410				sub render_xml{
411				my $doc_xml_hash = $_[0];
412				my $descriptor = $_[1];
413				my $printCollectionHeaderFooter = $_[2];
414				my $h_config = $_[3];
415
416
417				# then section has to be removed as soon as possible !!
418				if (0 && ((defined $h_config) && ($h_config->{'linguistic_annotation'}->{'ENABLE_SEMANTIC_TAG'}))) {
419				# TH: Very bad thing : just a hack for the end of the project :-(
420
421				open XMLOUTFILE, $h_config->{'TMPFILE'} . ".ast.out" or die "No such file or directory\n";
422
423				my @tab_xmlout = ;
424
425				close XMLOUTFILE;
426
427				shift @tab_xmlout;
428				shift @tab_xmlout;
429
430				pop @tab_xmlout;
431
432				print_Annotation($descriptor, @tab_xmlout);
433
434
435				} else {
436				my $indent=" "; # for indent
437				$end_layer="";
438
439				if ($printCollectionHeaderFooter) {
440				&print_documentCollectionHeader($descriptor);
441				}
442				print_Annotation($descriptor, $Alvis::NLPPlatform::Annotation::document_record_head);
443				print_Annotation($descriptor, " \n");
444				print_Annotation($descriptor, $Alvis::NLPPlatform::Annotation::acquisitionData);
445				print_Annotation($descriptor, $Alvis::NLPPlatform::Annotation::originalDocument);
446				print_Annotation($descriptor, $Alvis::NLPPlatform::Annotation::canonicalDocument);
447				print_Annotation($descriptor, $Alvis::NLPPlatform::Annotation::metaData);
448				print_Annotation($descriptor, $Alvis::NLPPlatform::Annotation::links);
449				print_Annotation($descriptor, $Alvis::NLPPlatform::Annotation::analysis);
450				print_Annotation($descriptor, " \n");
451				print_Annotation($descriptor, " \n");
452
453				# call to the recursive function dunmping annotations
454				render($doc_xml_hash, $descriptor);
455
456
457				if($end_layer ne ""){
458				print_Annotation($descriptor, "$indent$end_layer");
459				$end_layer="";
460				}
461
462				print_Annotation($descriptor, " \n");
463				print_Annotation($descriptor, $Alvis::NLPPlatform::Annotation::relevance);
464				print_Annotation($descriptor, "\n");
465				if ($printCollectionHeaderFooter) {
466				&print_documentCollectionFooter($descriptor);
467				}
468				}
469				return(0);
470				}
471
472
473				sub load_xml
474				{
475				my $doc_xml_in = $_[0];
476				my $h_config = $_[1];
477				my @doc_xml;
478				@doc_xml = split /\n/, $doc_xml_in;
479				my $i;
480
481
482				my $myreceiver = Alvis::NLPPlatform::MyReceiver->new();
483				my $parser = XML::Parser::PerlSAX->new(Handler => $myreceiver);
484
485				my $line;
486				$canonicalDocument=""; $is_in_canonical=0;
487				$acquisitionData=""; $is_in_acquisition=0;
488				$originalDocument=""; $is_in_original=0;
489				$metaData=""; $is_in_meta=0;
490				$links=""; $is_in_links=0;
491				$analysis=""; $is_in_analysis=0;
492				$relevance=""; $is_in_relevance=0;
493
494				my $enter="";
495				my $n_line;
496				$n_line=1;
497
498				$i=0;
499				$ALVISLANGUAGE="EN"; # default language is English
500				while ($i < scalar(@doc_xml)) {
501				$line=$doc_xml[$i];
502				$line .= "\n";
503				$i++;
504				$n_line++;
505				$enter .= $line;
506				#
507				if($line=~/([^<]+)<\/property>/i)
508				{
509				$ALVISLANGUAGE=uc($1);
510				}
511				# Get the document id
512				if($line=~//){
513				$document_record_head = $line;
514				$document_record_id=$1;
515				}
516				# canonicalDocument
517				if($line=~/]*>/i){$is_in_canonical=1;}
518				if($is_in_canonical==1){$canonicalDocument.=$line;}
519				if($line=~/<\/canonicalDocument>/i){$is_in_canonical=0;}
520				# acquisitionData
521				if($line=~/]*>/i){$is_in_acquisition=1;}
522				if($is_in_acquisition==1){$acquisitionData.=$line;}
523				if($line=~/<\/acquisitionData>/i){$is_in_acquisition=0;}
524				# originalDocument
525				if($line=~/\/]*>/i){$is_in_original=1;}
526				if($line=~//i){$is_in_original=0;$originalDocument=$line;}
527				if($is_in_original==1){$originalDocument.=$line;}
528				if($line=~/<\/originalDocument>/i){$is_in_original=0;}
529				# metaData
530				if($line=~/\/]*>/i){$is_in_meta=1;}
531				if($line=~//i){$is_in_meta=0;$metaData=$line;}
532				if($is_in_meta==1){$metaData.=$line;}
533				if($line=~/<\/metaData>/i){$is_in_meta=0;}
534				# links
535				if($line=~/\/]*>/i){$is_in_links=1;}
536				if($line=~//i){$is_in_links=0;$links=$line;}
537				if($is_in_links==1){$links.=$line;}
538				if($line=~/<\/links>/i){$is_in_links=0;}
539				# analysis
540				if($line=~/\/]*>/i){$is_in_analysis=1;}
541				if($line=~//i){$is_in_analysis=0; $analysis=$line;}
542				if($is_in_analysis==1){$analysis.=$line;}
543				if($line=~/<\/analysis>/i){$is_in_analysis=0;}
544				# relevance
545				if($line=~/\/]*>/i){$is_in_relevance=1;}
546				if($line=~//i){$is_in_relevance=0;$relevance=$line;}
547				if($is_in_relevance==1){$relevance.=$line;}
548				if($line=~/<\/relevance>/i){$is_in_relevance=0;}
549
550				# Stop analysis when the first Go out "" is encountered
551				if($line=~/<\/documentRecord>/i){
552				if (defined $doc_xml[$i]) {
553				$enter .= $doc_xml[$i];
554				}
555				last;
556				}
557				}
558
559				# print STDERR "$enter\n";
560
561				$enter=~s/ encoding = \"([^\"]*)\"/ encoding=\"UTF-8\"/;
562				if($enter=~/(<\?xml version="[0-9\.]+")(.?)([ \s\t]
563				$header=$1.$2;
564				}else{
565				$enter=$header.$enter;
566				}
567				$acquisitionData=~/([^<]+)<\/url>/g;
568				$documenturl=$1;
569
570				my $string_parse;
571				if ((!exists $h_config->{"XML_INPUT"}->{"LINGUISTIC_ANNOTATION_LOADING"}) \|\| ($h_config->{"XML_INPUT"}->{"LINGUISTIC_ANNOTATION_LOADING"} != 0)) {
572				warn " Loading existing linguistic annotations if necessary\n";
573				$parser->parse(Source=>{String=>$enter});
574
575
576				# Caveat !!! we assume that there is only named entities in the loaded documents
577				$Alvis::NLPPlatform::last_semantic_unit = $myreceiver->{"counter_id"};
578
579
580
581				}
582				$string_parse = $myreceiver->{"tab_object"};
583				return($string_parse);
584
585				}
586
587
588				sub print_Annotation
589				{
590				my ($descriptor, $string) = @_;
591
592				# print STDERR "ref : " . ref($descriptor) . "\n";
593
594
595
596				if (ref($descriptor) eq "IO::Socket::INET") {
597				print $descriptor Encode::decode_utf8($string);
598				# print $descriptor $string;
599				# print STDERR "Descriptor is a SOCKET\n";
600				}
601				if (ref($descriptor) eq "GLOB") {
602				print $descriptor Encode::decode_utf8($string);
603				# print $descriptor $string;
604				# print STDERR "Descriptor is a STREAM (GLOB)\n";
605				}
606				if (ref($descriptor) eq "SCALAR") {
607				$$descriptor .= Encode::decode_utf8($string);
608				# $$descriptor .= $string;
609				# print STDERR "Descriptor is a SCALAR\n";
610				}
611				unless (ref($descriptor)) {
612				print STDERR "Critical error: descriptor is not a reference at all.\n";
613				exit(-1);
614				}
615				# print STDERR "$string\n";
616
617				# print STDERR Encode::decode_utf8($string);
618
619				return(1);
620				}
621
622				1;
623
624				__END__