File Coverage

blib/lib/Lingua/NL/FactoidExtractor.pm

Criterion	Covered	Total	%
statement	6	210	2.8
branch	0	100	0.0
condition	0	72	0.0
subroutine	2	8	25.0
pod	0	1	0.0
total	8	391	2.0

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::NL::FactoidExtractor;
2
3	1			1		20744	use 5.008007;
	1					3
	1					30
4	1			1		5	use strict;
	1					1
	1					2886
5							require Exporter;
6
7							our @ISA = qw(Exporter);
8							our @EXPORT = qw(extract);
9							our $VERSION = '1.4';
10
11							#Declare global variables
12							our @factoids;
13
14							my @functionwords=("alle","alles","andere","anderen","beide","dat","deze","dezelfde","die","dingen","dit","een","geen","hem","hen","het","hij","ieder","iedereen","iemand","iets","ik","je","jij","meer","men","mensen","niemand","niets","ons","sommige","sommigen","u","veel","vele","velen","waaraan","waaronder","wat","we","weinig","welke","wie","wij","ze","zich","zichzelf","zij","zijn","zo","zoveel");
15							my %functionwords = map { $_ => 1 } @functionwords;
16							# We do not save factoids of which the subject is only a pronoun
17
18							# Package variables needed for reading the xml input
19							my($level, %rel, %word, %root, %level, %frame, %cat, %lcat, %head, %sc, %index, %begin, %wh, %ids_for_index, @ids, %clauses_done);
20							my $sentence_initial; # boolean
21							my $doctitle;
22
23							sub extract ($$) {
24	0			0	0		my ($inputfile,$verbose) = @_;
25
26	0						undef @factoids;
27	0						undef %rel; undef %word; undef %root; undef %level; undef %frame; undef %cat; undef %lcat; undef %head; undef %sc; undef %index; undef %begin; undef %wh;
	0
	0
	0
	0
	0
	0
	0
	0
	0
	0
	0
28	0						undef %ids_for_index; undef @ids; undef %clauses_done;
	0
	0
29
30	0						$level = 0;
31	0						$sentence_initial = 0;
32	0						$doctitle = "";
33	0						print STDERR "Parsing $inputfile...\n";
34
35	0	0					open (ALP,"< $inputfile") or die "$! $inputfile\n";
36
37	0						while (my $line=) {
38	0	0					if ($line =~ /
39	0						$level++;
40							}
41
42	0						my $id="";
43	0	0					if ($line =~ / id=\"([0-9]+)\" /) {
44	0						$id=$1;
45							}
46	0	0					if ($line =~ / begin=\"([0-9]+)\" /) {
47	0						my $begin = $1;
48	0						$begin{$id} = $begin;
49							}
50	0	0					if ($line =~ / rel=\"([^\"]+)\"/) {
51	0						my $rel=$1;
52	0						$rel{$id} = $rel;
53							}
54	0	0					if ($line =~ / frame=\"([^\"]+)\"/) {
55	0						my $frame=$1;
56	0						$frame{$id} = $frame;
57							}
58
59	0	0					if ($line =~ / cat=\"([^\"]+)\"/) {
60	0						my $cat=$1;
61	0						$cat{$id} = $cat;
62							}
63	0	0					if ($line =~ / lcat=\"([^\"]+)\"/) {
64	0						my $lcat=$1;
65	0						$lcat{$id} = $lcat;
66							}
67	0	0					if ($line =~ / sc=\"([^\"]+)\"/) {
68	0						my $sc=$1;
69	0						$sc{$id} = $sc;
70							# can be copula, passive, etc.
71							}
72	0	0					if ($line =~ / word=\"([^\"]+?)[\.\,]?\"/) {
73	0						my $word=$1;
74	0						$word{$id} = $word;
75							}
76	0	0					if ($line =~ / root=\"([^\"]+?)[\.\,]?\"/) {
77	0						my $root=$1;
78	0						$root{$id} = $root;
79							}
80	0	0					if ($line =~ / wh=\"([^\"]+)\"/) {
81	0						my $wh=$1;
82	0						$wh{$id} = $wh;
83							}
84	0	0					if ($line =~ / index=\"([^\"]+)\"/) {
85	0						my $index=$1;
86	0						$index{$id} = $index;
87	0						push(@{$ids_for_index{$index}},$id);
	0
88							}
89
90	0						$level{$id} = $level;
91	0	0	0				if ($line =~ /<\/node>/ or $line =~ /\/>/) {
92	0						$level--;
93							}
94
95	0	0					if ($line =~ /(.*)<\/sentence>/) {
96	0	0					print "\# $1\n" if ($verbose);
97							}
98							}
99	0						close(ALP);
100
101	0						@ids = sort {$a <=> $b} keys %rel;
	0
102
103	0						my $sentence_initial = 0; # boolean
104
105	0						foreach my $id (sort {$a <=> $b} keys %rel) {
	0
106
107	0	0	0				if (defined $cat{$id} && $cat{$id} =~ /^(smain\|ssub\|sv1)$/) {
108	0	0					if (not defined $clauses_done{$id}) {
109	0						my ($new_head_id,$subj_id,$subject,$voice) = &_generate_factoid($id,$1,"","","");
110							# return subject (and its id) of main clause because we need it in embedded clause vc
111							# (either as subject or as object in the case of a passive main clause)
112	0						while (defined $new_head_id) {
113	0	0					if (not defined $clauses_done{$new_head_id}) {
114	0						$id = $new_head_id;
115	0	0					if ($voice eq "passive") {
116	0						($new_head_id,$subj_id,$subject,$voice) = &_generate_factoid($id,"vc/body",$subj_id,"",$subject);
117							# if passive, then store the subject of the main clause in the object slot of the vc
118							} else {
119	0						($new_head_id,$subj_id,$subject,$voice) = &_generate_factoid($id,"vc/body",$subj_id,$subject,"");
120							}
121							}
122							}
123							}
124							}
125							}
126
127	0						my $factoids = join("\n",@factoids);
128	0						return $factoids;
129							};
130
131							sub _generate_factoid($$) {
132	0			0			my ($clause_id,$clausetype,$subj_id,$subject,$object) = @_;
133
134	0						$clauses_done{$clause_id} = 1;
135
136	0						my $verb="";
137	0						my @modifiers;
138
139							my $new_head_id;
140							# if there is a vc or body in the clause then this is an embedded factoid
141							# with the same subject as the main clause
142
143	0						my @headed_ids = _get_headed_ids($clause_id);
144							#print STDERR "headed ids for clause $clause_id: @headed_ids\n";
145	0						my $voice="active";
146	0						my $verb_type="";
147	0						my $tuple_type="factoid";
148	0						my $obj_id;
149
150	0						my $info = "";
151
152	0						foreach my $id (@headed_ids) {
153	0	0	0				if (defined $sc{$id} && $sc{$id} eq "passive") {
154	0						$voice = "passive";
155							}
156	0						my $rel = $rel{$id};
157	0						my $frame="";
158	0	0					if (defined $frame{$id}) {
159	0						$frame = $frame{$id};
160							}
161
162	0	0	0				if ($rel eq "hd" && $verb eq "" && $frame =~ /verb/) {
		0	0
		0	0
		0	0
		0
163							# if the verb slot was not yet filled with a main verb
164							#$verb = "hd:".$word{$id};
165							#print STDERR "hd: $id\n";
166	0						$verb = "hd:".$root{$id};
167	0						$verb_type = $sc{$id};
168							# use root (lemma) of verb
169							} elsif ($rel eq "vc" or $rel eq "body") {
170							# get the underlying factoid recursively by returning the current id as new head id
171	0						$new_head_id = $id;
172							} elsif ($rel eq "su" && $subject eq "") {
173							# if the subject slot was not yet filled with the subject of the main clause
174	0						$subject = "su:".&_get_constituent($id);
175	0						$subj_id = $id;
176	0	0					if ($begin{$id} eq "0") {
177	0						$sentence_initial = 1;
178							}
179							} elsif ($rel =~ /^(obj1\|obj2\|predc)$/) {
180	0						my $rel = $1;
181	0	0	0				if ($object =~ /su:/ && $rel eq "obj1") {
182							# if the object slot already contains the subject of the main clause (in case of passive voice) don't add it again
183							} else {
184	0						$object .= "$rel:".&_get_constituent($id);
185	0						$obj_id = $id;
186							}
187							} elsif ($rel =~ /^(mod\|pc\|predm\|ld)$/) {
188	0						my $modifier = "$1:".&_get_constituent($id);
189							#print STDERR "Mod: $modifier\n";
190	0						push (@modifiers,$modifier);
191							}
192
193							}
194
195							# transform passive clauses
196	0	0	0				if ($subject eq "" && $object =~ /su:/) {
197	0						my $m=0;
198	0						$info .= "passive-to-active ";
199	0						foreach my $modifier (@modifiers) {
200	0	0					if ($modifier =~ /door (.+)$/) {
201	0						$subject = $1;
202	0						splice(@modifiers,$m,1);
203	0						$info .= "modifier-to-subject ";
204							}
205	0						$m++;
206							}
207	0	0					if ($subject eq "") {
208							# if none of the modifiers starts with 'door'
209	0						$subject = "MEN";
210							}
211							}
212
213							# transform double object constructions to a factoid and a definition
214	0	0					if ($object =~ s/([a-z0-9]+):(.+) ([a-z0-9]+):(.+)/$1:$2\|$3:$4/) {
215							# double object construction, e.g. "het wordt het Silicon Valley van India genoemd"
216	0						$info .= "double-object-to-definition ";
217	0						$tuple_type = "definition";
218	0						my $definition = "<$tuple_type id='$clause_id' subj='$1:$2' verb='IS' obj='$3:$4' mods='' topic='$doctitle'> # $info";
219	0						$definition = &_clean_up($definition);
220	0						push (@factoids,$definition);
221	0						$tuple_type = "factoid";
222							}
223
224							# transform copular constructions without modifiers to definitions
225	0	0	0				if ($verb_type eq "copula" && $subject =~ /\S/ && $object =~ /\S/){
			0
226	0						$verb = "IS";
227	0						$tuple_type = "definition";
228	0						$info .= "copula-to-definition ";
229							}
230
231							# resolve relative pronouns: replace die/dat/wat by the most recent NP.
232	0	0	0				if ($subject =~ /:(die\|dat\|wat) *$/i && defined($subj_id)) {
233							#print STDERR "Get recent cat id for subject id $subj_id ($subject)\n";
234	0						my $head_id = &_get_recent_cat_id($subj_id,"np");
235	0						$subject = "su:".&_get_constituent($head_id);
236	0						$info .= "pron-to-np ";
237							}
238	0	0	0				if ($object =~ /:(die\|dat\|wat) *$/i && defined($obj_id)) {
239	0						my $head_id = &_get_recent_cat_id($obj_id,"np");
240	0						$object = "obj:".&_get_constituent($head_id);
241	0						$info .= "pron-to-np ";
242							}
243
244
245	0						$subject = &_clean_up($subject);
246	0						$object = &_clean_up($object);
247
248	0	0					if ($object =~ s/ ([0-9]{4})$//) {
249	0						push(@modifiers,$1);
250							# if the object ends in a year then move it to the modifiers
251							}
252	0						my $modifiers = join("\|",@modifiers);
253
254	0	0	0				if ($sentence_initial && $subject =~ /^(de\|het\|een\|die\|dat\|deze\|dit\|alle\|andere\|dezelfde\|geen\|ieder\|meer\|veel\|vele\|weinig\|welke\|zoveel) /i) {
255	0						$subject = lcfirst($subject);
256							# at the beginning of a sentence, lowercase determiners
257							}
258	0						my $factoid = "<$tuple_type id='$clause_id' subj='$subject' verb='$verb' obj='$object' mods='$modifiers' topic='$doctitle'> # $info";
259	0						$factoid = &_clean_up($factoid);
260
261							#print STDERR "Verb type: $verb_type\n";
262	0	0	0				if ($verb eq "" or (($object eq "") && (not defined $modifiers[0]) && ($verb_type =~ /(aux\|passive)/))) {
			0
			0
263							# throw away empty passives for which the sub clause has been raised, e.g. ("Dit rijk wordt")
264							} else {
265	0						push(@factoids,$factoid);
266							}
267	0						return ($new_head_id,$subj_id,"su:$subject",$voice);
268							}
269
270							sub _get_constituent($) {
271	0			0			my ($start_id) = @_;
272	0						my $constituent = "";
273	0	0	0				if (not defined $cat{$start_id} && not defined $lcat{$start_id} && defined $index{$start_id}) {
			0
274
275							# find the constituent that has the same index
276	0	0					if (defined $index{$start_id}) {
277	0						my $index = $index{$start_id};
278	0						foreach my $index_id (@{$ids_for_index{$index}}) {
	0
279	0	0	0				if (defined $cat{$index_id} or defined $lcat{$index_id}) {
280	0						$start_id = $index_id;
281	0						last;
282							}
283							}
284							}
285							}
286	0						my $rellevel = $level{$start_id};
287	0	0					$constituent .= "$word{$start_id} " if (defined $word{$start_id});
288	0						my $id=$start_id;
289	0						$id++;
290	0		0				while ($id <= $ids[-1] && $level{$id} > $rellevel) {
291	0	0					last if ($rel{$id} eq "rhd");
292	0	0					$constituent .= "$word{$id} " if (defined $word{$id});
293	0						$id++;
294							}
295	0						return $constituent;
296							}
297
298							sub _get_recent_cat_id($$) {
299	0			0			my ($id,$search_cat) = @_;
300	0						my $head_id = $id;
301	0						my $cat_of_head_id="";
302	0	0					if (defined $cat{$head_id}) {
		0
303	0						$cat_of_head_id = $cat{$head_id};
304							} elsif (defined $lcat{$head_id}) {
305	0						$cat_of_head_id = $lcat{$head_id};
306							}
307							#print STDERR "Head id: $head_id, Search cat: $search_cat\n";
308	0		0				while ($cat_of_head_id ne $search_cat && $head_id > 0) {
309	0						$head_id--;
310							#print STDERR "Head id: $head_id\n";
311							}
312	0	0	0				if ($head_id == 0 && $cat{$head_id} ne $search_cat){
313							# if no note of type search_cat was found before then the original id is returned
314							# (for example, when a sentence starts with a pronoun, there is no preceding NP)
315	0						return $id;
316							}
317	0						return $head_id;
318							}
319
320							sub _get_headed_ids($) {
321	0			0			my ($head_id) = @_;
322	0						my $headlevel = $level{$head_id};
323	0						my @headed_ids;
324	0						my $id = $head_id;
325	0						$id++;
326	0	0					push(@headed_ids,$id) if ($level{$id} == $headlevel+1);
327	0		0				while ($id < $ids[-1] && $level{$id} > $headlevel) {
328	0						$id++;
329	0	0					push(@headed_ids,$id) if ($level{$id} == $headlevel+1);
330							}
331	0						return @headed_ids;
332							}
333
334							sub _clean_up($) {
335	0			0			my ($string) = @_;
336	0						$string =~ s/[a-z0-9]+: *//g;
337	0						$string =~ s/ +/ /g;
338	0						$string =~ s/^ //;
339	0						$string =~ s/[,.] *$//;
340	0						$string =~ s/=\' /=\'/g;
341	0						$string =~ s/ \' /\' /g;
342	0						$string =~ s/ \'>/\'>/g;
343	0						return $string;
344							}
345
346
347							1;
348							__END__