|  line  | 
 stmt  | 
 bran  | 
 cond  | 
 sub  | 
 pod  | 
 time  | 
 code  | 
| 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 package Lingua::NL::FactoidExtractor;  | 
| 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
3
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
 
 | 
20744
 | 
 use 5.008007;  | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3
 | 
    | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
30
 | 
    | 
| 
4
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
 
 | 
5
 | 
 use strict;  | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1
 | 
    | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2886
 | 
    | 
| 
5
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 require Exporter;  | 
| 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 our @ISA = qw(Exporter);  | 
| 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 our @EXPORT = qw(extract);  | 
| 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 our $VERSION = '1.4';  | 
| 
10
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
11
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #Declare global variables  | 
| 
12
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 our @factoids;  | 
| 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
14
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 my @functionwords=("alle","alles","andere","anderen","beide","dat","deze","dezelfde","die","dingen","dit","een","geen","hem","hen","het","hij","ieder","iedereen","iemand","iets","ik","je","jij","meer","men","mensen","niemand","niets","ons","sommige","sommigen","u","veel","vele","velen","waaraan","waaronder","wat","we","weinig","welke","wie","wij","ze","zich","zichzelf","zij","zijn","zo","zoveel");  | 
| 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 my %functionwords = map { $_ => 1 } @functionwords;  | 
| 
16
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # We do not save factoids of which the subject is only a pronoun   | 
| 
17
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
18
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Package variables needed for reading the xml input  | 
| 
19
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 my($level, %rel, %word, %root, %level, %frame, %cat, %lcat, %head, %sc, %index, %begin, %wh, %ids_for_index, @ids, %clauses_done);  | 
| 
20
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 my $sentence_initial; # boolean  | 
| 
21
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 my $doctitle;  | 
| 
22
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
23
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub extract ($$) {  | 
| 
24
 | 
0
 | 
 
 | 
 
 | 
  
0
  
 | 
  
0
  
 | 
 
 | 
   my ($inputfile,$verbose) = @_;  | 
| 
25
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
26
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   undef @factoids;   | 
| 
27
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   undef %rel; undef %word; undef %root; undef %level; undef %frame; undef %cat; undef %lcat; undef %head; undef %sc; undef %index; undef %begin; undef %wh;  | 
| 
 
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
28
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   undef %ids_for_index; undef @ids; undef %clauses_done;  | 
| 
 
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
29
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
30
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   $level = 0;  | 
| 
31
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   $sentence_initial = 0;  | 
| 
32
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   $doctitle = "";  | 
| 
33
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   print STDERR "Parsing $inputfile...\n";  | 
| 
34
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     | 
| 
35
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   open (ALP,"< $inputfile") or die "$! $inputfile\n";  | 
| 
36
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
37
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   while (my $line=) {  | 
| 
38
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     if ($line =~ /
 | 
| 
39
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	$level++;  | 
| 
40
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }   | 
| 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       | 
| 
42
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $id="";   | 
| 
43
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     if ($line =~ / id=\"([0-9]+)\" /) {  | 
| 
44
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	$id=$1;  | 
| 
45
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
46
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     if ($line =~ / begin=\"([0-9]+)\" /) {  | 
| 
47
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	my $begin = $1;  | 
| 
48
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	$begin{$id} = $begin;  | 
| 
49
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
50
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     if ($line =~ / rel=\"([^\"]+)\"/) {  | 
| 
51
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	my $rel=$1;  | 
| 
52
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	$rel{$id} = $rel;  | 
| 
53
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
54
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     if ($line =~ / frame=\"([^\"]+)\"/) {  | 
| 
55
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         my $frame=$1;  | 
| 
56
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $frame{$id} = $frame;  | 
| 
57
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
58
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
59
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     if ($line =~ / cat=\"([^\"]+)\"/) {  | 
| 
60
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         my $cat=$1;  | 
| 
61
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $cat{$id} = $cat;  | 
| 
62
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
63
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     if ($line =~ / lcat=\"([^\"]+)\"/) {  | 
| 
64
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         my $lcat=$1;  | 
| 
65
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $lcat{$id} = $lcat;  | 
| 
66
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
67
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     if ($line =~ / sc=\"([^\"]+)\"/) {  | 
| 
68
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         my $sc=$1;  | 
| 
69
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $sc{$id} = $sc;  | 
| 
70
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # can be copula, passive, etc.  | 
| 
71
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
72
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     if ($line =~ / word=\"([^\"]+?)[\.\,]?\"/) {  | 
| 
73
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	my $word=$1;  | 
| 
74
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	$word{$id} = $word;  | 
| 
75
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
76
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     if ($line =~ / root=\"([^\"]+?)[\.\,]?\"/) {  | 
| 
77
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	my $root=$1;  | 
| 
78
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	$root{$id} = $root;  | 
| 
79
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }    | 
| 
80
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     if ($line =~ / wh=\"([^\"]+)\"/) {  | 
| 
81
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         my $wh=$1;  | 
| 
82
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $wh{$id} = $wh;  | 
| 
83
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }      | 
| 
84
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     if ($line =~ / index=\"([^\"]+)\"/) {  | 
| 
85
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	my $index=$1;  | 
| 
86
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	$index{$id} = $index;  | 
| 
87
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	push(@{$ids_for_index{$index}},$id);  | 
| 
 
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
88
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
89
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
90
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $level{$id} = $level;  | 
| 
91
 | 
0
 | 
  
  0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
     if ($line =~ /<\/node>/ or $line =~ /\/>/) {  | 
| 
92
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     	$level--;  | 
| 
93
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
94
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       | 
| 
95
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     if ($line =~ /(.*)<\/sentence>/) {  | 
| 
96
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	print "\# $1\n" if ($verbose);   | 
| 
97
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
98
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
99
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   close(ALP);  | 
| 
100
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
101
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   @ids = sort {$a <=> $b} keys %rel;  | 
| 
 
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
102
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
103
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   my $sentence_initial = 0; # boolean  | 
| 
104
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
105
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   foreach my $id (sort {$a <=> $b} keys %rel) {  | 
| 
 
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
106
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
107
 | 
0
 | 
  
  0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
     if (defined $cat{$id} && $cat{$id} =~ /^(smain|ssub|sv1)$/) {  | 
| 
108
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	if (not defined $clauses_done{$id}) {  | 
| 
109
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	    my ($new_head_id,$subj_id,$subject,$voice) = &_generate_factoid($id,$1,"","","");  | 
| 
110
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	    # return subject (and its id) of main clause because we need it in embedded clause vc  | 
| 
111
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	    # (either as subject or as object in the case of a passive main clause)  | 
| 
112
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	    while (defined $new_head_id) {  | 
| 
113
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		if (not defined $clauses_done{$new_head_id}) {  | 
| 
114
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		    $id = $new_head_id;  | 
| 
115
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		    if ($voice eq "passive") {  | 
| 
116
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 			($new_head_id,$subj_id,$subject,$voice) = &_generate_factoid($id,"vc/body",$subj_id,"",$subject);  | 
| 
117
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 			# if passive, then store the subject of the main clause in the object slot of the vc  | 
| 
118
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		    } else {  | 
| 
119
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 			($new_head_id,$subj_id,$subject,$voice) = &_generate_factoid($id,"vc/body",$subj_id,$subject,"");  | 
| 
120
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		    }  | 
| 
121
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		}  | 
| 
122
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	    }  | 
| 
123
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	}  | 
| 
124
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
125
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
126
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
127
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   my $factoids = join("\n",@factoids);  | 
| 
128
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   return $factoids;  | 
| 
129
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 };  | 
| 
130
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
131
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _generate_factoid($$) {  | 
| 
132
 | 
0
 | 
 
 | 
 
 | 
  
0
  
 | 
 
 | 
 
 | 
     my ($clause_id,$clausetype,$subj_id,$subject,$object) = @_;  | 
| 
133
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       | 
| 
134
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $clauses_done{$clause_id} = 1;  | 
| 
135
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
136
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $verb="";  | 
| 
137
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my @modifiers;  | 
| 
138
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       | 
| 
139
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $new_head_id;   | 
| 
140
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # if there is a vc or body in the clause then this is an embedded factoid  | 
| 
141
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # with the same subject as the main clause  | 
| 
142
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
143
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my @headed_ids = _get_headed_ids($clause_id);  | 
| 
144
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     #print STDERR "headed ids for clause $clause_id: @headed_ids\n";  | 
| 
145
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $voice="active";  | 
| 
146
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $verb_type="";  | 
| 
147
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $tuple_type="factoid";  | 
| 
148
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $obj_id;  | 
| 
149
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       | 
| 
150
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $info = "";  | 
| 
151
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
152
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     foreach my $id (@headed_ids) {  | 
| 
153
 | 
0
 | 
  
  0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 	if (defined $sc{$id} && $sc{$id} eq "passive") {  | 
| 
154
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	    $voice = "passive";  | 
| 
155
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	}  | 
| 
156
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	my $rel = $rel{$id};  | 
| 
157
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	my $frame="";  | 
| 
158
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	if (defined $frame{$id}) {  | 
| 
159
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	    $frame = $frame{$id};  | 
| 
160
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	}  | 
| 
161
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	  | 
| 
162
 | 
0
 | 
  
  0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 	if ($rel eq "hd" && $verb eq "" && $frame =~ /verb/) {  | 
| 
 
 | 
 
 | 
  
  0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
 
 | 
  
  0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
 
 | 
  
  0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
 
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
163
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	    # if the verb slot was not yet filled with a main verb  | 
| 
164
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	    #$verb = "hd:".$word{$id};  | 
| 
165
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	    #print STDERR "hd: $id\n";  | 
| 
166
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	    $verb = "hd:".$root{$id};   | 
| 
167
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	    $verb_type = $sc{$id};  | 
| 
168
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	    # use root (lemma) of verb  | 
| 
169
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	} elsif ($rel eq "vc" or $rel eq "body") {  | 
| 
170
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	    # get the underlying factoid recursively by returning the current id as new head id  | 
| 
171
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	    $new_head_id = $id;  | 
| 
172
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	} elsif ($rel eq "su" && $subject eq "") {  | 
| 
173
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	    # if the subject slot was not yet filled with the subject of the main clause  | 
| 
174
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	    $subject = "su:".&_get_constituent($id);  | 
| 
175
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	    $subj_id = $id;  | 
| 
176
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             if ($begin{$id} eq "0") {  | 
| 
177
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		$sentence_initial = 1;   | 
| 
178
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             }     | 
| 
179
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	} elsif ($rel =~ /^(obj1|obj2|predc)$/) {  | 
| 
180
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	    my $rel = $1;  | 
| 
181
 | 
0
 | 
  
  0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 	    if ($object =~ /su:/ && $rel eq "obj1") {  | 
| 
182
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		# if the object slot already contains the subject of the main clause (in case of passive voice) don't add it again  | 
| 
183
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	    } else {  | 
| 
184
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		$object .= "$rel:".&_get_constituent($id);  | 
| 
185
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		$obj_id = $id;  | 
| 
186
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	    }  | 
| 
187
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	} elsif ($rel =~ /^(mod|pc|predm|ld)$/) {  | 
| 
188
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	    my $modifier = "$1:".&_get_constituent($id);  | 
| 
189
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	    #print STDERR "Mod: $modifier\n";  | 
| 
190
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	    push (@modifiers,$modifier);  | 
| 
191
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	}   | 
| 
192
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	  | 
| 
193
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
194
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       | 
| 
195
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # transform passive clauses  | 
| 
196
 | 
0
 | 
  
  0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
     if ($subject eq "" && $object =~ /su:/) {  | 
| 
197
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	my $m=0;  | 
| 
198
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	$info .= "passive-to-active ";  | 
| 
199
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	foreach my $modifier (@modifiers) {  | 
| 
200
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	    if ($modifier =~ /door (.+)$/) {  | 
| 
201
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		$subject = $1;  | 
| 
202
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		splice(@modifiers,$m,1);  | 
| 
203
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		$info .= "modifier-to-subject ";  | 
| 
204
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	    }   | 
| 
205
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	    $m++;  | 
| 
206
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	}  | 
| 
207
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	if ($subject eq "") {  | 
| 
208
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	    # if none of the modifiers starts with 'door'  | 
| 
209
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	    $subject = "MEN";  | 
| 
210
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	}  | 
| 
211
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
212
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       | 
| 
213
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # transform double object constructions to a factoid and a definition  | 
| 
214
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     if ($object =~ s/([a-z0-9]+):(.+) ([a-z0-9]+):(.+)/$1:$2|$3:$4/) {  | 
| 
215
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	# double object construction, e.g. "het wordt het Silicon Valley van India genoemd"  | 
| 
216
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	$info .= "double-object-to-definition ";  | 
| 
217
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	$tuple_type = "definition";  | 
| 
218
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	my $definition = "<$tuple_type id='$clause_id' subj='$1:$2' verb='IS' obj='$3:$4' mods='' topic='$doctitle'> # $info";  | 
| 
219
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	$definition = &_clean_up($definition);  | 
| 
220
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	push (@factoids,$definition);  | 
| 
221
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	$tuple_type = "factoid";  | 
| 
222
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
223
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       | 
| 
224
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # transform copular constructions without modifiers to definitions  | 
| 
225
 | 
0
 | 
  
  0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
     if ($verb_type eq "copula" && $subject =~ /\S/ && $object =~ /\S/){  | 
| 
 
 | 
 
 | 
 
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
226
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	$verb = "IS";  | 
| 
227
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	$tuple_type = "definition";  | 
| 
228
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	$info .= "copula-to-definition ";  | 
| 
229
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
230
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
231
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # resolve relative pronouns: replace die/dat/wat by the most recent NP.  | 
| 
232
 | 
0
 | 
  
  0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
     if ($subject =~ /:(die|dat|wat) *$/i && defined($subj_id)) {  | 
| 
233
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	#print STDERR "Get recent cat id for subject id $subj_id ($subject)\n";  | 
| 
234
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         my $head_id = &_get_recent_cat_id($subj_id,"np");  | 
| 
235
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $subject = "su:".&_get_constituent($head_id);  | 
| 
236
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $info .= "pron-to-np ";  | 
| 
237
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
238
 | 
0
 | 
  
  0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
     if ($object =~ /:(die|dat|wat) *$/i && defined($obj_id)) {  | 
| 
239
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         my $head_id = &_get_recent_cat_id($obj_id,"np");  | 
| 
240
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $object = "obj:".&_get_constituent($head_id);  | 
| 
241
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $info .= "pron-to-np ";  | 
| 
242
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }      | 
| 
243
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       | 
| 
244
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
245
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $subject = &_clean_up($subject);  | 
| 
246
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $object = &_clean_up($object);  | 
| 
247
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       | 
| 
248
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     if ($object =~ s/ ([0-9]{4})$//) {  | 
| 
249
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	push(@modifiers,$1);  | 
| 
250
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	# if the object ends in a year then move it to the modifiers  | 
| 
251
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
252
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $modifiers = join("|",@modifiers);  | 
| 
253
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
254
 | 
0
 | 
  
  0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
     if ($sentence_initial &&  $subject =~ /^(de|het|een|die|dat|deze|dit|alle|andere|dezelfde|geen|ieder|meer|veel|vele|weinig|welke|zoveel) /i) {  | 
| 
255
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $subject = lcfirst($subject);  | 
| 
256
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # at the beginning of a sentence, lowercase determiners  | 
| 
257
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
258
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $factoid = "<$tuple_type id='$clause_id' subj='$subject' verb='$verb' obj='$object' mods='$modifiers' topic='$doctitle'> # $info";  | 
| 
259
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $factoid = &_clean_up($factoid);  | 
| 
260
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       | 
| 
261
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     #print STDERR "Verb type: $verb_type\n";  | 
| 
262
 | 
0
 | 
  
  0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
     if ($verb eq "" or (($object eq "") && (not defined $modifiers[0]) && ($verb_type =~ /(aux|passive)/))) {  | 
| 
 
 | 
 
 | 
 
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
 
 | 
 
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
263
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	# throw away empty passives for which the sub clause has been raised, e.g. ("Dit rijk wordt")  | 
| 
264
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     } else {  | 
| 
265
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	push(@factoids,$factoid);  | 
| 
266
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
267
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     return ($new_head_id,$subj_id,"su:$subject",$voice);  | 
| 
268
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
269
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
270
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _get_constituent($) {  | 
| 
271
 | 
0
 | 
 
 | 
 
 | 
  
0
  
 | 
 
 | 
 
 | 
     my ($start_id) = @_;  | 
| 
272
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $constituent = "";  | 
| 
273
 | 
0
 | 
  
  0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
     if (not defined $cat{$start_id} && not defined $lcat{$start_id} && defined $index{$start_id}) {  | 
| 
 
 | 
 
 | 
 
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
274
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	  | 
| 
275
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	# find the constituent that has the same index  | 
| 
276
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	if (defined $index{$start_id}) {  | 
| 
277
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	  my $index = $index{$start_id};  | 
| 
278
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     	  foreach my $index_id (@{$ids_for_index{$index}}) {  | 
| 
 
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
279
 | 
0
 | 
  
  0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
     	    if (defined $cat{$index_id} or defined $lcat{$index_id}) {      | 
| 
280
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		$start_id = $index_id;  | 
| 
281
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		last;  | 
| 
282
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	    }  | 
| 
283
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	  }  | 
| 
284
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	}  | 
| 
285
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
286
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $rellevel = $level{$start_id};  | 
| 
287
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $constituent .= "$word{$start_id} " if (defined $word{$start_id});  | 
| 
288
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $id=$start_id;  | 
| 
289
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $id++;  | 
| 
290
 | 
0
 | 
 
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
     while ($id <= $ids[-1] && $level{$id} > $rellevel) {  | 
| 
291
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	last if ($rel{$id} eq "rhd");  | 
| 
292
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	$constituent .= "$word{$id} " if (defined $word{$id});	      | 
| 
293
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	$id++;  | 
| 
294
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
295
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     return $constituent;  | 
| 
296
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
297
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
298
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _get_recent_cat_id($$) {  | 
| 
299
 | 
0
 | 
 
 | 
 
 | 
  
0
  
 | 
 
 | 
 
 | 
     my ($id,$search_cat) = @_;  | 
| 
300
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $head_id = $id;  | 
| 
301
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $cat_of_head_id="";  | 
| 
302
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     if (defined $cat{$head_id}) {  | 
| 
 
 | 
 
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
303
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	$cat_of_head_id = $cat{$head_id};  | 
| 
304
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     } elsif (defined $lcat{$head_id}) {  | 
| 
305
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	$cat_of_head_id = $lcat{$head_id};  | 
| 
306
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
307
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     #print STDERR "Head id: $head_id, Search cat: $search_cat\n";  | 
| 
308
 | 
0
 | 
 
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
     while ($cat_of_head_id ne $search_cat && $head_id > 0) {  | 
| 
309
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	$head_id--;  | 
| 
310
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         #print STDERR "Head id: $head_id\n";  | 
| 
311
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
312
 | 
0
 | 
  
  0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
     if ($head_id == 0 && $cat{$head_id} ne $search_cat){  | 
| 
313
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	# if no note of type search_cat was found before then the original id is returned  | 
| 
314
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	# (for example, when a sentence starts with a pronoun, there is no preceding NP)  | 
| 
315
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	return $id;  | 
| 
316
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
317
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     return $head_id;  | 
| 
318
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
319
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
320
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _get_headed_ids($) {  | 
| 
321
 | 
0
 | 
 
 | 
 
 | 
  
0
  
 | 
 
 | 
 
 | 
     my ($head_id) = @_;  | 
| 
322
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $headlevel = $level{$head_id};  | 
| 
323
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my @headed_ids;  | 
| 
324
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $id = $head_id;  | 
| 
325
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $id++;  | 
| 
326
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     push(@headed_ids,$id) if ($level{$id} == $headlevel+1);  | 
| 
327
 | 
0
 | 
 
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
     while ($id < $ids[-1] && $level{$id} > $headlevel) {  | 
| 
328
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $id++;  | 
| 
329
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         push(@headed_ids,$id) if ($level{$id} == $headlevel+1);  | 
| 
330
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
331
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     return @headed_ids;  | 
| 
332
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
333
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
334
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _clean_up($) {  | 
| 
335
 | 
0
 | 
 
 | 
 
 | 
  
0
  
 | 
 
 | 
 
 | 
     my ($string) = @_;  | 
| 
336
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $string =~ s/[a-z0-9]+: *//g;  | 
| 
337
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $string =~ s/ +/ /g;  | 
| 
338
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $string =~ s/^ //;  | 
| 
339
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $string =~ s/[,.] *$//;  | 
| 
340
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $string =~ s/=\' /=\'/g;  | 
| 
341
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $string =~ s/ \' /\' /g;  | 
| 
342
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $string =~ s/ \'>/\'>/g;  | 
| 
343
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     return $string;  | 
| 
344
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
345
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
346
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
347
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 1;  | 
| 
348
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 __END__  |