line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Lingua::NL::FactoidExtractor; |
2
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
20744
|
use 5.008007; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
30
|
|
4
|
1
|
|
|
1
|
|
5
|
use strict; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
2886
|
|
5
|
|
|
|
|
|
|
require Exporter; |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
our @ISA = qw(Exporter); |
8
|
|
|
|
|
|
|
our @EXPORT = qw(extract); |
9
|
|
|
|
|
|
|
our $VERSION = '1.4'; |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
#Declare global variables |
12
|
|
|
|
|
|
|
our @factoids; |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
my @functionwords=("alle","alles","andere","anderen","beide","dat","deze","dezelfde","die","dingen","dit","een","geen","hem","hen","het","hij","ieder","iedereen","iemand","iets","ik","je","jij","meer","men","mensen","niemand","niets","ons","sommige","sommigen","u","veel","vele","velen","waaraan","waaronder","wat","we","weinig","welke","wie","wij","ze","zich","zichzelf","zij","zijn","zo","zoveel"); |
15
|
|
|
|
|
|
|
my %functionwords = map { $_ => 1 } @functionwords; |
16
|
|
|
|
|
|
|
# We do not save factoids of which the subject is only a pronoun |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
# Package variables needed for reading the xml input |
19
|
|
|
|
|
|
|
my($level, %rel, %word, %root, %level, %frame, %cat, %lcat, %head, %sc, %index, %begin, %wh, %ids_for_index, @ids, %clauses_done); |
20
|
|
|
|
|
|
|
my $sentence_initial; # boolean |
21
|
|
|
|
|
|
|
my $doctitle; |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
sub extract ($$) { |
24
|
0
|
|
|
0
|
0
|
|
my ($inputfile,$verbose) = @_; |
25
|
|
|
|
|
|
|
|
26
|
0
|
|
|
|
|
|
undef @factoids; |
27
|
0
|
|
|
|
|
|
undef %rel; undef %word; undef %root; undef %level; undef %frame; undef %cat; undef %lcat; undef %head; undef %sc; undef %index; undef %begin; undef %wh; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
28
|
0
|
|
|
|
|
|
undef %ids_for_index; undef @ids; undef %clauses_done; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
|
30
|
0
|
|
|
|
|
|
$level = 0; |
31
|
0
|
|
|
|
|
|
$sentence_initial = 0; |
32
|
0
|
|
|
|
|
|
$doctitle = ""; |
33
|
0
|
|
|
|
|
|
print STDERR "Parsing $inputfile...\n"; |
34
|
|
|
|
|
|
|
|
35
|
0
|
0
|
|
|
|
|
open (ALP,"< $inputfile") or die "$! $inputfile\n"; |
36
|
|
|
|
|
|
|
|
37
|
0
|
|
|
|
|
|
while (my $line=) { |
38
|
0
|
0
|
|
|
|
|
if ($line =~ /
|
39
|
0
|
|
|
|
|
|
$level++; |
40
|
|
|
|
|
|
|
} |
41
|
|
|
|
|
|
|
|
42
|
0
|
|
|
|
|
|
my $id=""; |
43
|
0
|
0
|
|
|
|
|
if ($line =~ / id=\"([0-9]+)\" /) { |
44
|
0
|
|
|
|
|
|
$id=$1; |
45
|
|
|
|
|
|
|
} |
46
|
0
|
0
|
|
|
|
|
if ($line =~ / begin=\"([0-9]+)\" /) { |
47
|
0
|
|
|
|
|
|
my $begin = $1; |
48
|
0
|
|
|
|
|
|
$begin{$id} = $begin; |
49
|
|
|
|
|
|
|
} |
50
|
0
|
0
|
|
|
|
|
if ($line =~ / rel=\"([^\"]+)\"/) { |
51
|
0
|
|
|
|
|
|
my $rel=$1; |
52
|
0
|
|
|
|
|
|
$rel{$id} = $rel; |
53
|
|
|
|
|
|
|
} |
54
|
0
|
0
|
|
|
|
|
if ($line =~ / frame=\"([^\"]+)\"/) { |
55
|
0
|
|
|
|
|
|
my $frame=$1; |
56
|
0
|
|
|
|
|
|
$frame{$id} = $frame; |
57
|
|
|
|
|
|
|
} |
58
|
|
|
|
|
|
|
|
59
|
0
|
0
|
|
|
|
|
if ($line =~ / cat=\"([^\"]+)\"/) { |
60
|
0
|
|
|
|
|
|
my $cat=$1; |
61
|
0
|
|
|
|
|
|
$cat{$id} = $cat; |
62
|
|
|
|
|
|
|
} |
63
|
0
|
0
|
|
|
|
|
if ($line =~ / lcat=\"([^\"]+)\"/) { |
64
|
0
|
|
|
|
|
|
my $lcat=$1; |
65
|
0
|
|
|
|
|
|
$lcat{$id} = $lcat; |
66
|
|
|
|
|
|
|
} |
67
|
0
|
0
|
|
|
|
|
if ($line =~ / sc=\"([^\"]+)\"/) { |
68
|
0
|
|
|
|
|
|
my $sc=$1; |
69
|
0
|
|
|
|
|
|
$sc{$id} = $sc; |
70
|
|
|
|
|
|
|
# can be copula, passive, etc. |
71
|
|
|
|
|
|
|
} |
72
|
0
|
0
|
|
|
|
|
if ($line =~ / word=\"([^\"]+?)[\.\,]?\"/) { |
73
|
0
|
|
|
|
|
|
my $word=$1; |
74
|
0
|
|
|
|
|
|
$word{$id} = $word; |
75
|
|
|
|
|
|
|
} |
76
|
0
|
0
|
|
|
|
|
if ($line =~ / root=\"([^\"]+?)[\.\,]?\"/) { |
77
|
0
|
|
|
|
|
|
my $root=$1; |
78
|
0
|
|
|
|
|
|
$root{$id} = $root; |
79
|
|
|
|
|
|
|
} |
80
|
0
|
0
|
|
|
|
|
if ($line =~ / wh=\"([^\"]+)\"/) { |
81
|
0
|
|
|
|
|
|
my $wh=$1; |
82
|
0
|
|
|
|
|
|
$wh{$id} = $wh; |
83
|
|
|
|
|
|
|
} |
84
|
0
|
0
|
|
|
|
|
if ($line =~ / index=\"([^\"]+)\"/) { |
85
|
0
|
|
|
|
|
|
my $index=$1; |
86
|
0
|
|
|
|
|
|
$index{$id} = $index; |
87
|
0
|
|
|
|
|
|
push(@{$ids_for_index{$index}},$id); |
|
0
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
} |
89
|
|
|
|
|
|
|
|
90
|
0
|
|
|
|
|
|
$level{$id} = $level; |
91
|
0
|
0
|
0
|
|
|
|
if ($line =~ /<\/node>/ or $line =~ /\/>/) { |
92
|
0
|
|
|
|
|
|
$level--; |
93
|
|
|
|
|
|
|
} |
94
|
|
|
|
|
|
|
|
95
|
0
|
0
|
|
|
|
|
if ($line =~ /(.*)<\/sentence>/) { |
96
|
0
|
0
|
|
|
|
|
print "\# $1\n" if ($verbose); |
97
|
|
|
|
|
|
|
} |
98
|
|
|
|
|
|
|
} |
99
|
0
|
|
|
|
|
|
close(ALP); |
100
|
|
|
|
|
|
|
|
101
|
0
|
|
|
|
|
|
@ids = sort {$a <=> $b} keys %rel; |
|
0
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
|
103
|
0
|
|
|
|
|
|
my $sentence_initial = 0; # boolean |
104
|
|
|
|
|
|
|
|
105
|
0
|
|
|
|
|
|
foreach my $id (sort {$a <=> $b} keys %rel) { |
|
0
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
|
107
|
0
|
0
|
0
|
|
|
|
if (defined $cat{$id} && $cat{$id} =~ /^(smain|ssub|sv1)$/) { |
108
|
0
|
0
|
|
|
|
|
if (not defined $clauses_done{$id}) { |
109
|
0
|
|
|
|
|
|
my ($new_head_id,$subj_id,$subject,$voice) = &_generate_factoid($id,$1,"","",""); |
110
|
|
|
|
|
|
|
# return subject (and its id) of main clause because we need it in embedded clause vc |
111
|
|
|
|
|
|
|
# (either as subject or as object in the case of a passive main clause) |
112
|
0
|
|
|
|
|
|
while (defined $new_head_id) { |
113
|
0
|
0
|
|
|
|
|
if (not defined $clauses_done{$new_head_id}) { |
114
|
0
|
|
|
|
|
|
$id = $new_head_id; |
115
|
0
|
0
|
|
|
|
|
if ($voice eq "passive") { |
116
|
0
|
|
|
|
|
|
($new_head_id,$subj_id,$subject,$voice) = &_generate_factoid($id,"vc/body",$subj_id,"",$subject); |
117
|
|
|
|
|
|
|
# if passive, then store the subject of the main clause in the object slot of the vc |
118
|
|
|
|
|
|
|
} else { |
119
|
0
|
|
|
|
|
|
($new_head_id,$subj_id,$subject,$voice) = &_generate_factoid($id,"vc/body",$subj_id,$subject,""); |
120
|
|
|
|
|
|
|
} |
121
|
|
|
|
|
|
|
} |
122
|
|
|
|
|
|
|
} |
123
|
|
|
|
|
|
|
} |
124
|
|
|
|
|
|
|
} |
125
|
|
|
|
|
|
|
} |
126
|
|
|
|
|
|
|
|
127
|
0
|
|
|
|
|
|
my $factoids = join("\n",@factoids); |
128
|
0
|
|
|
|
|
|
return $factoids; |
129
|
|
|
|
|
|
|
}; |
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
sub _generate_factoid($$) { |
132
|
0
|
|
|
0
|
|
|
my ($clause_id,$clausetype,$subj_id,$subject,$object) = @_; |
133
|
|
|
|
|
|
|
|
134
|
0
|
|
|
|
|
|
$clauses_done{$clause_id} = 1; |
135
|
|
|
|
|
|
|
|
136
|
0
|
|
|
|
|
|
my $verb=""; |
137
|
0
|
|
|
|
|
|
my @modifiers; |
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
my $new_head_id; |
140
|
|
|
|
|
|
|
# if there is a vc or body in the clause then this is an embedded factoid |
141
|
|
|
|
|
|
|
# with the same subject as the main clause |
142
|
|
|
|
|
|
|
|
143
|
0
|
|
|
|
|
|
my @headed_ids = _get_headed_ids($clause_id); |
144
|
|
|
|
|
|
|
#print STDERR "headed ids for clause $clause_id: @headed_ids\n"; |
145
|
0
|
|
|
|
|
|
my $voice="active"; |
146
|
0
|
|
|
|
|
|
my $verb_type=""; |
147
|
0
|
|
|
|
|
|
my $tuple_type="factoid"; |
148
|
0
|
|
|
|
|
|
my $obj_id; |
149
|
|
|
|
|
|
|
|
150
|
0
|
|
|
|
|
|
my $info = ""; |
151
|
|
|
|
|
|
|
|
152
|
0
|
|
|
|
|
|
foreach my $id (@headed_ids) { |
153
|
0
|
0
|
0
|
|
|
|
if (defined $sc{$id} && $sc{$id} eq "passive") { |
154
|
0
|
|
|
|
|
|
$voice = "passive"; |
155
|
|
|
|
|
|
|
} |
156
|
0
|
|
|
|
|
|
my $rel = $rel{$id}; |
157
|
0
|
|
|
|
|
|
my $frame=""; |
158
|
0
|
0
|
|
|
|
|
if (defined $frame{$id}) { |
159
|
0
|
|
|
|
|
|
$frame = $frame{$id}; |
160
|
|
|
|
|
|
|
} |
161
|
|
|
|
|
|
|
|
162
|
0
|
0
|
0
|
|
|
|
if ($rel eq "hd" && $verb eq "" && $frame =~ /verb/) { |
|
|
0
|
0
|
|
|
|
|
|
|
0
|
0
|
|
|
|
|
|
|
0
|
0
|
|
|
|
|
|
|
0
|
|
|
|
|
|
163
|
|
|
|
|
|
|
# if the verb slot was not yet filled with a main verb |
164
|
|
|
|
|
|
|
#$verb = "hd:".$word{$id}; |
165
|
|
|
|
|
|
|
#print STDERR "hd: $id\n"; |
166
|
0
|
|
|
|
|
|
$verb = "hd:".$root{$id}; |
167
|
0
|
|
|
|
|
|
$verb_type = $sc{$id}; |
168
|
|
|
|
|
|
|
# use root (lemma) of verb |
169
|
|
|
|
|
|
|
} elsif ($rel eq "vc" or $rel eq "body") { |
170
|
|
|
|
|
|
|
# get the underlying factoid recursively by returning the current id as new head id |
171
|
0
|
|
|
|
|
|
$new_head_id = $id; |
172
|
|
|
|
|
|
|
} elsif ($rel eq "su" && $subject eq "") { |
173
|
|
|
|
|
|
|
# if the subject slot was not yet filled with the subject of the main clause |
174
|
0
|
|
|
|
|
|
$subject = "su:".&_get_constituent($id); |
175
|
0
|
|
|
|
|
|
$subj_id = $id; |
176
|
0
|
0
|
|
|
|
|
if ($begin{$id} eq "0") { |
177
|
0
|
|
|
|
|
|
$sentence_initial = 1; |
178
|
|
|
|
|
|
|
} |
179
|
|
|
|
|
|
|
} elsif ($rel =~ /^(obj1|obj2|predc)$/) { |
180
|
0
|
|
|
|
|
|
my $rel = $1; |
181
|
0
|
0
|
0
|
|
|
|
if ($object =~ /su:/ && $rel eq "obj1") { |
182
|
|
|
|
|
|
|
# if the object slot already contains the subject of the main clause (in case of passive voice) don't add it again |
183
|
|
|
|
|
|
|
} else { |
184
|
0
|
|
|
|
|
|
$object .= "$rel:".&_get_constituent($id); |
185
|
0
|
|
|
|
|
|
$obj_id = $id; |
186
|
|
|
|
|
|
|
} |
187
|
|
|
|
|
|
|
} elsif ($rel =~ /^(mod|pc|predm|ld)$/) { |
188
|
0
|
|
|
|
|
|
my $modifier = "$1:".&_get_constituent($id); |
189
|
|
|
|
|
|
|
#print STDERR "Mod: $modifier\n"; |
190
|
0
|
|
|
|
|
|
push (@modifiers,$modifier); |
191
|
|
|
|
|
|
|
} |
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
} |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
# transform passive clauses |
196
|
0
|
0
|
0
|
|
|
|
if ($subject eq "" && $object =~ /su:/) { |
197
|
0
|
|
|
|
|
|
my $m=0; |
198
|
0
|
|
|
|
|
|
$info .= "passive-to-active "; |
199
|
0
|
|
|
|
|
|
foreach my $modifier (@modifiers) { |
200
|
0
|
0
|
|
|
|
|
if ($modifier =~ /door (.+)$/) { |
201
|
0
|
|
|
|
|
|
$subject = $1; |
202
|
0
|
|
|
|
|
|
splice(@modifiers,$m,1); |
203
|
0
|
|
|
|
|
|
$info .= "modifier-to-subject "; |
204
|
|
|
|
|
|
|
} |
205
|
0
|
|
|
|
|
|
$m++; |
206
|
|
|
|
|
|
|
} |
207
|
0
|
0
|
|
|
|
|
if ($subject eq "") { |
208
|
|
|
|
|
|
|
# if none of the modifiers starts with 'door' |
209
|
0
|
|
|
|
|
|
$subject = "MEN"; |
210
|
|
|
|
|
|
|
} |
211
|
|
|
|
|
|
|
} |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
# transform double object constructions to a factoid and a definition |
214
|
0
|
0
|
|
|
|
|
if ($object =~ s/([a-z0-9]+):(.+) ([a-z0-9]+):(.+)/$1:$2|$3:$4/) { |
215
|
|
|
|
|
|
|
# double object construction, e.g. "het wordt het Silicon Valley van India genoemd" |
216
|
0
|
|
|
|
|
|
$info .= "double-object-to-definition "; |
217
|
0
|
|
|
|
|
|
$tuple_type = "definition"; |
218
|
0
|
|
|
|
|
|
my $definition = "<$tuple_type id='$clause_id' subj='$1:$2' verb='IS' obj='$3:$4' mods='' topic='$doctitle'> # $info"; |
219
|
0
|
|
|
|
|
|
$definition = &_clean_up($definition); |
220
|
0
|
|
|
|
|
|
push (@factoids,$definition); |
221
|
0
|
|
|
|
|
|
$tuple_type = "factoid"; |
222
|
|
|
|
|
|
|
} |
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
# transform copular constructions without modifiers to definitions |
225
|
0
|
0
|
0
|
|
|
|
if ($verb_type eq "copula" && $subject =~ /\S/ && $object =~ /\S/){ |
|
|
|
0
|
|
|
|
|
226
|
0
|
|
|
|
|
|
$verb = "IS"; |
227
|
0
|
|
|
|
|
|
$tuple_type = "definition"; |
228
|
0
|
|
|
|
|
|
$info .= "copula-to-definition "; |
229
|
|
|
|
|
|
|
} |
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
# resolve relative pronouns: replace die/dat/wat by the most recent NP. |
232
|
0
|
0
|
0
|
|
|
|
if ($subject =~ /:(die|dat|wat) *$/i && defined($subj_id)) { |
233
|
|
|
|
|
|
|
#print STDERR "Get recent cat id for subject id $subj_id ($subject)\n"; |
234
|
0
|
|
|
|
|
|
my $head_id = &_get_recent_cat_id($subj_id,"np"); |
235
|
0
|
|
|
|
|
|
$subject = "su:".&_get_constituent($head_id); |
236
|
0
|
|
|
|
|
|
$info .= "pron-to-np "; |
237
|
|
|
|
|
|
|
} |
238
|
0
|
0
|
0
|
|
|
|
if ($object =~ /:(die|dat|wat) *$/i && defined($obj_id)) { |
239
|
0
|
|
|
|
|
|
my $head_id = &_get_recent_cat_id($obj_id,"np"); |
240
|
0
|
|
|
|
|
|
$object = "obj:".&_get_constituent($head_id); |
241
|
0
|
|
|
|
|
|
$info .= "pron-to-np "; |
242
|
|
|
|
|
|
|
} |
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
|
245
|
0
|
|
|
|
|
|
$subject = &_clean_up($subject); |
246
|
0
|
|
|
|
|
|
$object = &_clean_up($object); |
247
|
|
|
|
|
|
|
|
248
|
0
|
0
|
|
|
|
|
if ($object =~ s/ ([0-9]{4})$//) { |
249
|
0
|
|
|
|
|
|
push(@modifiers,$1); |
250
|
|
|
|
|
|
|
# if the object ends in a year then move it to the modifiers |
251
|
|
|
|
|
|
|
} |
252
|
0
|
|
|
|
|
|
my $modifiers = join("|",@modifiers); |
253
|
|
|
|
|
|
|
|
254
|
0
|
0
|
0
|
|
|
|
if ($sentence_initial && $subject =~ /^(de|het|een|die|dat|deze|dit|alle|andere|dezelfde|geen|ieder|meer|veel|vele|weinig|welke|zoveel) /i) { |
255
|
0
|
|
|
|
|
|
$subject = lcfirst($subject); |
256
|
|
|
|
|
|
|
# at the beginning of a sentence, lowercase determiners |
257
|
|
|
|
|
|
|
} |
258
|
0
|
|
|
|
|
|
my $factoid = "<$tuple_type id='$clause_id' subj='$subject' verb='$verb' obj='$object' mods='$modifiers' topic='$doctitle'> # $info"; |
259
|
0
|
|
|
|
|
|
$factoid = &_clean_up($factoid); |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
#print STDERR "Verb type: $verb_type\n"; |
262
|
0
|
0
|
0
|
|
|
|
if ($verb eq "" or (($object eq "") && (not defined $modifiers[0]) && ($verb_type =~ /(aux|passive)/))) { |
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
263
|
|
|
|
|
|
|
# throw away empty passives for which the sub clause has been raised, e.g. ("Dit rijk wordt") |
264
|
|
|
|
|
|
|
} else { |
265
|
0
|
|
|
|
|
|
push(@factoids,$factoid); |
266
|
|
|
|
|
|
|
} |
267
|
0
|
|
|
|
|
|
return ($new_head_id,$subj_id,"su:$subject",$voice); |
268
|
|
|
|
|
|
|
} |
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
sub _get_constituent($) { |
271
|
0
|
|
|
0
|
|
|
my ($start_id) = @_; |
272
|
0
|
|
|
|
|
|
my $constituent = ""; |
273
|
0
|
0
|
0
|
|
|
|
if (not defined $cat{$start_id} && not defined $lcat{$start_id} && defined $index{$start_id}) { |
|
|
|
0
|
|
|
|
|
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
# find the constituent that has the same index |
276
|
0
|
0
|
|
|
|
|
if (defined $index{$start_id}) { |
277
|
0
|
|
|
|
|
|
my $index = $index{$start_id}; |
278
|
0
|
|
|
|
|
|
foreach my $index_id (@{$ids_for_index{$index}}) { |
|
0
|
|
|
|
|
|
|
279
|
0
|
0
|
0
|
|
|
|
if (defined $cat{$index_id} or defined $lcat{$index_id}) { |
280
|
0
|
|
|
|
|
|
$start_id = $index_id; |
281
|
0
|
|
|
|
|
|
last; |
282
|
|
|
|
|
|
|
} |
283
|
|
|
|
|
|
|
} |
284
|
|
|
|
|
|
|
} |
285
|
|
|
|
|
|
|
} |
286
|
0
|
|
|
|
|
|
my $rellevel = $level{$start_id}; |
287
|
0
|
0
|
|
|
|
|
$constituent .= "$word{$start_id} " if (defined $word{$start_id}); |
288
|
0
|
|
|
|
|
|
my $id=$start_id; |
289
|
0
|
|
|
|
|
|
$id++; |
290
|
0
|
|
0
|
|
|
|
while ($id <= $ids[-1] && $level{$id} > $rellevel) { |
291
|
0
|
0
|
|
|
|
|
last if ($rel{$id} eq "rhd"); |
292
|
0
|
0
|
|
|
|
|
$constituent .= "$word{$id} " if (defined $word{$id}); |
293
|
0
|
|
|
|
|
|
$id++; |
294
|
|
|
|
|
|
|
} |
295
|
0
|
|
|
|
|
|
return $constituent; |
296
|
|
|
|
|
|
|
} |
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
sub _get_recent_cat_id($$) { |
299
|
0
|
|
|
0
|
|
|
my ($id,$search_cat) = @_; |
300
|
0
|
|
|
|
|
|
my $head_id = $id; |
301
|
0
|
|
|
|
|
|
my $cat_of_head_id=""; |
302
|
0
|
0
|
|
|
|
|
if (defined $cat{$head_id}) { |
|
|
0
|
|
|
|
|
|
303
|
0
|
|
|
|
|
|
$cat_of_head_id = $cat{$head_id}; |
304
|
|
|
|
|
|
|
} elsif (defined $lcat{$head_id}) { |
305
|
0
|
|
|
|
|
|
$cat_of_head_id = $lcat{$head_id}; |
306
|
|
|
|
|
|
|
} |
307
|
|
|
|
|
|
|
#print STDERR "Head id: $head_id, Search cat: $search_cat\n"; |
308
|
0
|
|
0
|
|
|
|
while ($cat_of_head_id ne $search_cat && $head_id > 0) { |
309
|
0
|
|
|
|
|
|
$head_id--; |
310
|
|
|
|
|
|
|
#print STDERR "Head id: $head_id\n"; |
311
|
|
|
|
|
|
|
} |
312
|
0
|
0
|
0
|
|
|
|
if ($head_id == 0 && $cat{$head_id} ne $search_cat){ |
313
|
|
|
|
|
|
|
# if no note of type search_cat was found before then the original id is returned |
314
|
|
|
|
|
|
|
# (for example, when a sentence starts with a pronoun, there is no preceding NP) |
315
|
0
|
|
|
|
|
|
return $id; |
316
|
|
|
|
|
|
|
} |
317
|
0
|
|
|
|
|
|
return $head_id; |
318
|
|
|
|
|
|
|
} |
319
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
sub _get_headed_ids($) { |
321
|
0
|
|
|
0
|
|
|
my ($head_id) = @_; |
322
|
0
|
|
|
|
|
|
my $headlevel = $level{$head_id}; |
323
|
0
|
|
|
|
|
|
my @headed_ids; |
324
|
0
|
|
|
|
|
|
my $id = $head_id; |
325
|
0
|
|
|
|
|
|
$id++; |
326
|
0
|
0
|
|
|
|
|
push(@headed_ids,$id) if ($level{$id} == $headlevel+1); |
327
|
0
|
|
0
|
|
|
|
while ($id < $ids[-1] && $level{$id} > $headlevel) { |
328
|
0
|
|
|
|
|
|
$id++; |
329
|
0
|
0
|
|
|
|
|
push(@headed_ids,$id) if ($level{$id} == $headlevel+1); |
330
|
|
|
|
|
|
|
} |
331
|
0
|
|
|
|
|
|
return @headed_ids; |
332
|
|
|
|
|
|
|
} |
333
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
sub _clean_up($) { |
335
|
0
|
|
|
0
|
|
|
my ($string) = @_; |
336
|
0
|
|
|
|
|
|
$string =~ s/[a-z0-9]+: *//g; |
337
|
0
|
|
|
|
|
|
$string =~ s/ +/ /g; |
338
|
0
|
|
|
|
|
|
$string =~ s/^ //; |
339
|
0
|
|
|
|
|
|
$string =~ s/[,.] *$//; |
340
|
0
|
|
|
|
|
|
$string =~ s/=\' /=\'/g; |
341
|
0
|
|
|
|
|
|
$string =~ s/ \' /\' /g; |
342
|
0
|
|
|
|
|
|
$string =~ s/ \'>/\'>/g; |
343
|
0
|
|
|
|
|
|
return $string; |
344
|
|
|
|
|
|
|
} |
345
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
1; |
348
|
|
|
|
|
|
|
__END__ |