line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Lingua::EN::Semtags::Engine;
|
2
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
739
|
use strict;
|
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
32
|
|
4
|
1
|
|
|
1
|
|
5
|
use warnings;
|
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
34
|
|
5
|
|
|
|
|
|
|
#use Data::Dumper;
|
6
|
1
|
|
|
1
|
|
1714
|
use WordNet::QueryData 1.46;
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
use Lingua::EN::Tagger 0.11;
|
8
|
|
|
|
|
|
|
use List::Util qw(min max);
|
9
|
|
|
|
|
|
|
use Lingua::EN::Semtags::Sentence;
|
10
|
|
|
|
|
|
|
use Lingua::EN::Semtags::LangUnit;
|
11
|
|
|
|
|
|
|
use constant SEMTAG_ISA_INDEX => 1; # TODO May be calculated
|
12
|
|
|
|
|
|
|
use constant PHRASE_FRAME_SIZE => 3;
|
13
|
|
|
|
|
|
|
use constant MIN_ISAS => 3;
|
14
|
|
|
|
|
|
|
use constant ISAS => 'hypes'; # Hypernyms
|
15
|
|
|
|
|
|
|
use constant TRUE => 1;
|
16
|
|
|
|
|
|
|
use constant FALSE => 0;
|
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
our $VERSION = '0.01';
|
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
#============================================================
|
21
|
|
|
|
|
|
|
sub new {
|
22
|
|
|
|
|
|
|
#============================================================
|
23
|
|
|
|
|
|
|
my ($invocant, %args) = @_;
|
24
|
|
|
|
|
|
|
my $self = bless ({}, ref $invocant || $invocant);
|
25
|
|
|
|
|
|
|
$self->_init(%args);
|
26
|
|
|
|
|
|
|
return $self;
|
27
|
|
|
|
|
|
|
}
|
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
#============================================================
|
30
|
|
|
|
|
|
|
sub _init {
|
31
|
|
|
|
|
|
|
#============================================================
|
32
|
|
|
|
|
|
|
my ($self, %args) = @_;
|
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
# Initialize attributes
|
35
|
|
|
|
|
|
|
$self->{wn} = WordNet::QueryData->new;
|
36
|
|
|
|
|
|
|
$self->{tagger} = Lingua::EN::Tagger->new;
|
37
|
|
|
|
|
|
|
$self->{verbose} = FALSE;
|
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
# Set the args that came from the constructor
|
40
|
|
|
|
|
|
|
foreach my $arg (sort keys %args) {
|
41
|
|
|
|
|
|
|
die "Unknown argument: $arg!" unless exists $self->{$arg};
|
42
|
|
|
|
|
|
|
$self->{$arg} = $args{$arg};
|
43
|
|
|
|
|
|
|
}
|
44
|
|
|
|
|
|
|
}
|
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
#============================================================
|
47
|
|
|
|
|
|
|
sub semtags {
|
48
|
|
|
|
|
|
|
#============================================================
|
49
|
|
|
|
|
|
|
my ($self, $string) = @_;
|
50
|
|
|
|
|
|
|
my @semtags = ();
|
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
foreach my $lunit ($self->sentence($string)->lunits) {
|
53
|
|
|
|
|
|
|
my $semtag = ($lunit->isas)[SEMTAG_ISA_INDEX];
|
54
|
|
|
|
|
|
|
$semtag =~ s/#\w#\d+$//;
|
55
|
|
|
|
|
|
|
push @semtags, uc $semtag;
|
56
|
|
|
|
|
|
|
}
|
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
return @semtags;
|
59
|
|
|
|
|
|
|
}
|
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
#============================================================
|
62
|
|
|
|
|
|
|
sub sentence {
|
63
|
|
|
|
|
|
|
#============================================================
|
64
|
|
|
|
|
|
|
my ($self, $string) = @_;
|
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
my $sentence = Lingua::EN::Semtags::Sentence->new(string => $string);
|
67
|
|
|
|
|
|
|
$self->_detect_words($sentence);
|
68
|
|
|
|
|
|
|
$self->_detect_phrases($sentence);
|
69
|
|
|
|
|
|
|
$self->_set_lunits($sentence);
|
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
return $sentence;
|
72
|
|
|
|
|
|
|
}
|
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
# Detects the POS of every token in the string. Populates $sentence->word_tokens.
|
75
|
|
|
|
|
|
|
# Only tokens of nouns, verbs, adjectives, adverbs go into $sentence->word_tokens.
|
76
|
|
|
|
|
|
|
#============================================================
|
77
|
|
|
|
|
|
|
sub _detect_words {
|
78
|
|
|
|
|
|
|
#============================================================
|
79
|
|
|
|
|
|
|
my ($self, $sentence) = @_;
|
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
$sentence->string(&_clean_for_words($sentence->string));
|
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
my $tagged_string = $self->tagger->get_readable($sentence->string);
|
84
|
|
|
|
|
|
|
foreach my $token_pos (split /\s/, $tagged_string) {
|
85
|
|
|
|
|
|
|
my ($token, $pos) = split /\//, $token_pos;
|
86
|
|
|
|
|
|
|
# Nouns, verbs, adjectives, adverbs
|
87
|
|
|
|
|
|
|
$sentence->word_tokens->{$token} = $pos if ($pos =~ /^(NN|VB|JJ|RB)/);
|
88
|
|
|
|
|
|
|
}
|
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
# print '_detect_words: ', Dumper($sentence->word_tokens) if $self->verbose;
|
91
|
|
|
|
|
|
|
}
|
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
# Detects WordNet phrases. Updates $sentence->string: glues phrase tokens
|
94
|
|
|
|
|
|
|
# together with underscores. Populates $sentence->phrase_tokens.
|
95
|
|
|
|
|
|
|
#============================================================
|
96
|
|
|
|
|
|
|
sub _detect_phrases {
|
97
|
|
|
|
|
|
|
#============================================================
|
98
|
|
|
|
|
|
|
my ($self, $sentence) = @_;
|
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
$sentence->string(&_clean_for_phrases($sentence->string));
|
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
# Move a frame across the sentence and test the contents for a sense
|
103
|
|
|
|
|
|
|
my @tokens = split /\s/, $sentence->string;
|
104
|
|
|
|
|
|
|
for (my $i = 0; $i < @tokens; $i++) {
|
105
|
|
|
|
|
|
|
my $phrase_string = $tokens[$i];
|
106
|
|
|
|
|
|
|
my $frame = min($i + 1 + PHRASE_FRAME_SIZE, scalar @tokens);
|
107
|
|
|
|
|
|
|
for (my $j = $i + 1; $j < $frame; $j++) {
|
108
|
|
|
|
|
|
|
$phrase_string .= ' '.$tokens[$j];
|
109
|
|
|
|
|
|
|
if ($self->wn->validForms($phrase_string)) {
|
110
|
|
|
|
|
|
|
print "_detect_phrases: [$phrase_string]\n"
|
111
|
|
|
|
|
|
|
if $self->verbose;
|
112
|
|
|
|
|
|
|
my @phrase_tokens = split /\s/, $phrase_string;
|
113
|
|
|
|
|
|
|
my $phrase_string_padded = join '_', @phrase_tokens;
|
114
|
|
|
|
|
|
|
(my $string = $sentence->string) =~
|
115
|
|
|
|
|
|
|
s/$phrase_string/$phrase_string_padded/g;
|
116
|
|
|
|
|
|
|
$sentence->string($string);
|
117
|
|
|
|
|
|
|
$sentence->phrase_tokens->{$phrase_string_padded} = TRUE;
|
118
|
|
|
|
|
|
|
$i += $#phrase_tokens; # Avoid frame overlaps
|
119
|
|
|
|
|
|
|
last; # Stop growing the frame if a phrase is detected
|
120
|
|
|
|
|
|
|
}
|
121
|
|
|
|
|
|
|
}
|
122
|
|
|
|
|
|
|
}
|
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
# print '_detect_phrases: ', Dumper($sentence->phrase_tokens) if $self->verbose;
|
125
|
|
|
|
|
|
|
}
|
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
#============================================================
|
128
|
|
|
|
|
|
|
sub _set_lunits {
|
129
|
|
|
|
|
|
|
#============================================================
|
130
|
|
|
|
|
|
|
my ($self, $sentence) = @_;
|
131
|
|
|
|
|
|
|
my %word_tokens = %{$sentence->word_tokens};
|
132
|
|
|
|
|
|
|
my %phrase_tokens = %{$sentence->phrase_tokens};
|
133
|
|
|
|
|
|
|
my %seen_tokens = ();
|
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
foreach my $token (split /\s/, $sentence->string) {
|
136
|
|
|
|
|
|
|
if ((exists $word_tokens{$token} or exists $phrase_tokens{$token}) and
|
137
|
|
|
|
|
|
|
!exists $seen_tokens{$token}) {
|
138
|
|
|
|
|
|
|
$seen_tokens{$token} = TRUE;
|
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
my $pos = exists $word_tokens{$token} ? $word_tokens{$token} : undef;
|
141
|
|
|
|
|
|
|
my $is_word = exists $word_tokens{$token} ? TRUE : FALSE;
|
142
|
|
|
|
|
|
|
my $is_phrase = exists $phrase_tokens{$token} ? TRUE : FALSE;
|
143
|
|
|
|
|
|
|
my $lunit = Lingua::EN::Semtags::LangUnit->new(
|
144
|
|
|
|
|
|
|
pos => $pos,
|
145
|
|
|
|
|
|
|
token => $token,
|
146
|
|
|
|
|
|
|
is_word => $is_word,
|
147
|
|
|
|
|
|
|
is_phrase => $is_phrase
|
148
|
|
|
|
|
|
|
);
|
149
|
|
|
|
|
|
|
$self->_set_isas($lunit) if $self->_set_sense($lunit);
|
150
|
|
|
|
|
|
|
$sentence->add_lunit($lunit) if &_is_meaningful($lunit);
|
151
|
|
|
|
|
|
|
}
|
152
|
|
|
|
|
|
|
}
|
153
|
|
|
|
|
|
|
}
|
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
#============================================================
|
156
|
|
|
|
|
|
|
sub _set_sense {
|
157
|
|
|
|
|
|
|
#============================================================
|
158
|
|
|
|
|
|
|
my ($self, $lunit) = @_;
|
159
|
|
|
|
|
|
|
my $token = $lunit->token;
|
160
|
|
|
|
|
|
|
my $sense = undef;
|
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
if ($sense = $self->_sense($lunit)) {
|
163
|
|
|
|
|
|
|
print "_set_sense: [$token] is [$sense]\n" if $self->verbose;
|
164
|
|
|
|
|
|
|
$lunit->sense($sense);
|
165
|
|
|
|
|
|
|
return TRUE;
|
166
|
|
|
|
|
|
|
} else {
|
167
|
|
|
|
|
|
|
print "_set_sense: [$token] has no sense!\n" if $self->verbose;
|
168
|
|
|
|
|
|
|
return FALSE;
|
169
|
|
|
|
|
|
|
}
|
170
|
|
|
|
|
|
|
}
|
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
#============================================================
|
173
|
|
|
|
|
|
|
sub _sense {
|
174
|
|
|
|
|
|
|
#============================================================
|
175
|
|
|
|
|
|
|
my ($self, $lunit) = @_;
|
176
|
|
|
|
|
|
|
my $token = $lunit->token;
|
177
|
|
|
|
|
|
|
my $poswn = $lunit->pos ? $lunit->poswn : undef;
|
178
|
|
|
|
|
|
|
my $sense = undef;
|
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
# Query for the token without a POS
|
181
|
|
|
|
|
|
|
my @senses = $self->wn->validForms($token);
|
182
|
|
|
|
|
|
|
if (@senses == 1) {
|
183
|
|
|
|
|
|
|
$sense = $senses[0];
|
184
|
|
|
|
|
|
|
} elsif (@senses > 1) { # Requires disambiguation
|
185
|
|
|
|
|
|
|
if (defined $poswn) {
|
186
|
|
|
|
|
|
|
# Query for the token with a POS
|
187
|
|
|
|
|
|
|
my @senses_pos = $self->wn->validForms("$token#$poswn");
|
188
|
|
|
|
|
|
|
if (@senses_pos == 1) {
|
189
|
|
|
|
|
|
|
$sense = $senses_pos[0];
|
190
|
|
|
|
|
|
|
} elsif (@senses_pos > 1) {
|
191
|
|
|
|
|
|
|
$sense = $self->_disambiguate_senses(@senses_pos);
|
192
|
|
|
|
|
|
|
} else {
|
193
|
|
|
|
|
|
|
$sense = $self->_disambiguate_senses(@senses);
|
194
|
|
|
|
|
|
|
}
|
195
|
|
|
|
|
|
|
} else {
|
196
|
|
|
|
|
|
|
$sense = $self->_disambiguate_senses(@senses);
|
197
|
|
|
|
|
|
|
}
|
198
|
|
|
|
|
|
|
}
|
199
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
return $sense;
|
201
|
|
|
|
|
|
|
}
|
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
#============================================================
|
204
|
|
|
|
|
|
|
sub _disambiguate_senses {
|
205
|
|
|
|
|
|
|
#============================================================
|
206
|
|
|
|
|
|
|
my ($self, @senses) = @_;
|
207
|
|
|
|
|
|
|
my %freqs2senses = ();
|
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
foreach my $sense (@senses) {
|
210
|
|
|
|
|
|
|
my $freq = $self->wn->frequency("$sense#1");
|
211
|
|
|
|
|
|
|
$freqs2senses{$freq} = $sense;
|
212
|
|
|
|
|
|
|
}
|
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
# We are interested in the most frequently used sense
|
215
|
|
|
|
|
|
|
my $max_freq = max keys %freqs2senses;
|
216
|
|
|
|
|
|
|
my $sense = $freqs2senses{$max_freq};
|
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
print "_disambiguate_senses: [@senses]->[$sense]\n" if $self->verbose;
|
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
return $sense;
|
221
|
|
|
|
|
|
|
}
|
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
#============================================================
|
224
|
|
|
|
|
|
|
sub _set_isas {
|
225
|
|
|
|
|
|
|
#============================================================
|
226
|
|
|
|
|
|
|
my ($self, $lunit) = @_;
|
227
|
|
|
|
|
|
|
my $isa = $lunit->sense;
|
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
while ($isa = ($self->wn->querySense($isa, ISAS))[0]) {
|
230
|
|
|
|
|
|
|
$lunit->add_isa($isa);
|
231
|
|
|
|
|
|
|
}
|
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
my @isas = $lunit->isas;
|
234
|
|
|
|
|
|
|
print "_set_isas: [@isas]\n" if $self->verbose;
|
235
|
|
|
|
|
|
|
}
|
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
#============================================================
|
238
|
|
|
|
|
|
|
sub wn { $_[0]->{wn}; }
|
239
|
|
|
|
|
|
|
sub tagger { $_[0]->{tagger}; }
|
240
|
|
|
|
|
|
|
sub verbose { defined $_[1] ? $_[0]->{verbose} = $_[1] : $_[0]->{verbose}; }
|
241
|
|
|
|
|
|
|
#============================================================
|
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
#============================================================
|
244
|
|
|
|
|
|
|
sub _clean_for_words {
|
245
|
|
|
|
|
|
|
#============================================================
|
246
|
|
|
|
|
|
|
for (my $string = $_[0]) {
|
247
|
|
|
|
|
|
|
s/\// /g;
|
248
|
|
|
|
|
|
|
s/\s+/ /g; # Collapse multiple spaces into one
|
249
|
|
|
|
|
|
|
return $string;
|
250
|
|
|
|
|
|
|
}
|
251
|
|
|
|
|
|
|
}
|
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
#============================================================
|
254
|
|
|
|
|
|
|
sub _clean_for_phrases {
|
255
|
|
|
|
|
|
|
#============================================================
|
256
|
|
|
|
|
|
|
for (my $string = $_[0]) {
|
257
|
|
|
|
|
|
|
s/\W/ /g; # Remove non-word chars
|
258
|
|
|
|
|
|
|
s/\b\w\b//g; # Remove single chars
|
259
|
|
|
|
|
|
|
s/\s+/ /g; # Collapse multiple spaces into one
|
260
|
|
|
|
|
|
|
return $string;
|
261
|
|
|
|
|
|
|
}
|
262
|
|
|
|
|
|
|
}
|
263
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
#============================================================
|
265
|
|
|
|
|
|
|
sub _is_meaningful {
|
266
|
|
|
|
|
|
|
#============================================================
|
267
|
|
|
|
|
|
|
return $_[0]->isas > MIN_ISAS ? TRUE : FALSE;
|
268
|
|
|
|
|
|
|
}
|
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
TRUE;
|
271
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
__END__
|