line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# WordNet::Extend::Locate.pm version 0.041 |
2
|
|
|
|
|
|
|
# Updated: 08/06/17 |
3
|
|
|
|
|
|
|
# |
4
|
|
|
|
|
|
|
# Jon Rusert, University of Minnesota Duluth |
5
|
|
|
|
|
|
|
# ruse0008 at d.umn.edu |
6
|
|
|
|
|
|
|
# |
7
|
|
|
|
|
|
|
# Ted Pedersen, University of Minnesota Duluth |
8
|
|
|
|
|
|
|
# tpederse at d.umn.edu |
9
|
|
|
|
|
|
|
# |
10
|
|
|
|
|
|
|
# This program is free software: you can redistribute it and/or modify |
11
|
|
|
|
|
|
|
# it under the terms of the GNU General Public License as published by |
12
|
|
|
|
|
|
|
# the Free Software Foundation, either version 3 of the License, or |
13
|
|
|
|
|
|
|
# (at your option) any later version. |
14
|
|
|
|
|
|
|
# |
15
|
|
|
|
|
|
|
# This program is distributed in the hope that it will be useful, |
16
|
|
|
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
17
|
|
|
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
18
|
|
|
|
|
|
|
# GNU General Public License for more details. |
19
|
|
|
|
|
|
|
# |
20
|
|
|
|
|
|
|
# You should have received a copy of the GNU General Public License |
21
|
|
|
|
|
|
|
# along with this program. If not, see . |
22
|
|
|
|
|
|
|
# |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
package WordNet::Extend::Locate; |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
=head1 NAME |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
WordNet::Extend::Locate - Perl modules for locating where in WordNet a |
29
|
|
|
|
|
|
|
lemma should be inserted. |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
=head1 SYNOPSIS |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
=head2 Basic Usage Example |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
use WordNet::Extend::Locate; |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
my $locate = WordNet::Extend::Locate->new(); |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
$locate->stopList('(the|is|at)'); |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
$locate->setCleanUp(1); |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
$locate->preProcessing(); |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
$locate->toggleCompareGlosses(1,1,0); |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
$locate->setBonus(25); |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
$locate->toggleRefineSense(0); |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
print "Finding location for 'dog noun withdef.1 man's best friend'\n"; |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
@location = @{$locate->locate("dog\tnoun\twithdef.1\tman\'s best friend")}; |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
print "Location found: @location\n"; |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
=head1 DESCRIPTION |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
=head2 Introduction |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
WordNet is a widely used tool in NLP and other research areas. A drawback of WordNet is the amount of time between updates. WordNet was last updated and released in December, 2006, and no further updates are planned. WordNet::Extend::Locate aims to help users decide where a good place to insert new lemmas into WordNet is by presenting several different methods to run. Users can then take the suggestion from Locate and use that with WordNet::Extend::Insert or simply use it as a guiding point and choose their own location. |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
=over |
64
|
|
|
|
|
|
|
=cut |
65
|
|
|
|
|
|
|
|
66
|
1
|
|
|
1
|
|
80426
|
use WordNet::QueryData; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
#use Wiktionary::Parser; |
68
|
|
|
|
|
|
|
use Getopt::Long; |
69
|
|
|
|
|
|
|
use File::Spec; |
70
|
|
|
|
|
|
|
use Lingua::Stem; |
71
|
|
|
|
|
|
|
use Lingua::EN::Tagger; |
72
|
|
|
|
|
|
|
use WordNet::Similarity::vector; |
73
|
|
|
|
|
|
|
#use List::Util; |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
our ($VERSION, @ISA, @EXPORT, @EXPORT_OK, %EXPORT_TAGS); |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
@ISA = qw(Exporter); |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
%EXPORT_TAGS = (); |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
@EXPORT_OK = (); |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
@EXPORT = (); |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
$VERSION = '0.041'; |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
#**************Variables********************** |
88
|
|
|
|
|
|
|
$wn = WordNet::QueryData->new; #to be used to access data from wordnet |
89
|
|
|
|
|
|
|
$stemmer = Lingua::Stem->new; #used to stem words for better overlaps etc. |
90
|
|
|
|
|
|
|
$tagger = Lingua::EN::Tagger->new; #used to tag words' pos for similarity measure |
91
|
|
|
|
|
|
|
$measure = WordNet::Similarity::vector->new ($wn); #used to measure similarity for Similarity |
92
|
|
|
|
|
|
|
@wordNetNouns; #stores all words for noun sense from wordnet |
93
|
|
|
|
|
|
|
@wordNetVerbs; #stores all words for verb sense from wordnet |
94
|
|
|
|
|
|
|
%wnGlosses = (); |
95
|
|
|
|
|
|
|
@wnNounSenses; |
96
|
|
|
|
|
|
|
@wnVerbSenses; |
97
|
|
|
|
|
|
|
%wnHypes = (); |
98
|
|
|
|
|
|
|
%wnHypos = (); |
99
|
|
|
|
|
|
|
%wnSyns = (); |
100
|
|
|
|
|
|
|
%wnFreq = (); |
101
|
|
|
|
|
|
|
#our $wikParser = Wiktionary::Parser->new(); #Parses data from wiktionary pages. |
102
|
|
|
|
|
|
|
#$stopList = "(the|is|at|which|on|a|an|and|or|up|in|so)"; #default stop list. |
103
|
|
|
|
|
|
|
$stopList = "(a|about|above|after|again|against|all|am|an|and|any|are|aren't|as|at|be|because|been|before|being|below|between|both|but|by|can't|cannot|could|couldn't|did|didn't|do|does|doesn't|doing|don't|down|during|each|few|for|from|further|had|hadn't|has|hasn't|have|haven't|having|he|he'd|he'll|he's|her|here|here's|hers|herself|him|himself|his|how|how's|i|i'd|i'll|i'm|i've|if|in|into|is|isn't|it|it's|its|itself|let's|me|more|most|mustn't|my|myself|no|nor|not|of|off|on|once|only|or|other|ought|our|ours|ourselves|out|over|own|same|shan't|she|she'd|she'll|she's|should|shouldn't|so|some|such|than|that|that's|the|their|theirs|them|themselves|then|there|there's|these|they|they'd|they'll|they're|they've|this|those|through|to|too|under|until|up|very|was|wasn't|we|we'd|we'll|we're|we've|were|weren't|what|what's|when|when's|where|where's|which|while|who|who's|whom|why|why's|with|won't|would|wouldn't|you|you'd|you'll|you're|you've|your|yours|yourself|yourselves)"; |
104
|
|
|
|
|
|
|
$preProcessed = 0; #Flag to determine if preProcessing() has been called. |
105
|
|
|
|
|
|
|
$cleanUp = 1; #If cleanUp is on, glosses will be cleanedUp, can be toggled with setCleanUp(); |
106
|
|
|
|
|
|
|
$userCleanUp = ""; #Cleanup step specified by user in addCleanUp(); |
107
|
|
|
|
|
|
|
$useHypeGlosses = 1; #Toggle for use of hypernym glosses in comparisons. |
108
|
|
|
|
|
|
|
$useHypoGlosses = 1; #Toggle for use of hyponym glosses in comparisons. |
109
|
|
|
|
|
|
|
$useSynsGlosses = 1; #Toggle for use of synset glosses in comparisons. |
110
|
|
|
|
|
|
|
$bonus = 10; #Bonus to be used for lemmas that contain the new lemma. Can be set with setBonus(); |
111
|
|
|
|
|
|
|
$refineSense = 0; #Toggle for use of refineSense() method, default on. |
112
|
|
|
|
|
|
|
$help = 0; |
113
|
|
|
|
|
|
|
$scoringMethod = 'baseline'; |
114
|
|
|
|
|
|
|
@scoringMethods = ('baseline', 'BwS', 'Similarity', 'Word2Vec'); |
115
|
|
|
|
|
|
|
$stemming = 0; #Toggle for stemming on or off. |
116
|
|
|
|
|
|
|
$stemmed = 0; #flag for use in BwS |
117
|
|
|
|
|
|
|
$cValue = 0; #Confidence value for w2veccompare can be set in setConfidenceValue() |
118
|
|
|
|
|
|
|
#********************************************* |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
GetOptions('help' => \$help); |
121
|
|
|
|
|
|
|
if($help == 1) |
122
|
|
|
|
|
|
|
{ |
123
|
|
|
|
|
|
|
printHelp(); |
124
|
|
|
|
|
|
|
exit(0); |
125
|
|
|
|
|
|
|
} |
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
=head2 Methods |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
The following methods are defined in this package: |
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
=head3 Public methods |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
=over |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
=item $obj->new() |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
The constructor for WordNet::Extend::Locate objects. |
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
Parameters: none. |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
Return value: the new blessed object |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
=cut |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
sub new |
146
|
|
|
|
|
|
|
{ |
147
|
|
|
|
|
|
|
my $class = shift; |
148
|
|
|
|
|
|
|
my $self = {}; |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
$self->{errorString} = ''; |
151
|
|
|
|
|
|
|
$self->{error}=0; |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
bless $self, $class; |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
return $self; |
156
|
|
|
|
|
|
|
} |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
=item $obj->getError() |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
Allows the object to check if any errors have occurred. |
161
|
|
|
|
|
|
|
Returns an array ($error, $errorString), where $error |
162
|
|
|
|
|
|
|
value equal to 1 represents a warning and 2 represents |
163
|
|
|
|
|
|
|
an error and $errString contains the possible error. |
164
|
|
|
|
|
|
|
For example, if a user forgets to run preProcessing() before |
165
|
|
|
|
|
|
|
a method that relies on it, the error would be 2 and errorString |
166
|
|
|
|
|
|
|
would mention that preProcessing had not been run. |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
Parameter: None |
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
Returns: array of the form ($error, $errorString). |
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
=cut |
173
|
|
|
|
|
|
|
sub getError() |
174
|
|
|
|
|
|
|
{ |
175
|
|
|
|
|
|
|
my $self = shift; |
176
|
|
|
|
|
|
|
my $error = $self->{error}; |
177
|
|
|
|
|
|
|
my $errString = $self->{errorString}; |
178
|
|
|
|
|
|
|
$self->{error}=0; |
179
|
|
|
|
|
|
|
$self->{errorString} = ""; |
180
|
|
|
|
|
|
|
$errString =~ s/^[\r\n\t ]+//; |
181
|
|
|
|
|
|
|
return ($error, $errString); |
182
|
|
|
|
|
|
|
} |
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
=item $obj->locateFile($input_file, $output_file) |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
Attempts to locate best WordNet position for each word |
187
|
|
|
|
|
|
|
from input file into WordNet, outputs results to output file. |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
Parameter: location of input file and output file respectively |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
Returns: nothing |
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
=cut |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
sub locateFile() |
196
|
|
|
|
|
|
|
{ |
197
|
|
|
|
|
|
|
my $input = File::Spec->canonpath($_[1]); |
198
|
|
|
|
|
|
|
my $output = File::Spec->canonpath($_[2]); |
199
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
#Attempts to open input data |
201
|
|
|
|
|
|
|
open DATA, "$input" or die $!; |
202
|
|
|
|
|
|
|
open (OUTDATA, '>', "$output") or die $!; |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
#if preProcessing() hasn't been called, call it. |
205
|
|
|
|
|
|
|
if($preProcessed == 0) |
206
|
|
|
|
|
|
|
{ |
207
|
|
|
|
|
|
|
preProcessing(); |
208
|
|
|
|
|
|
|
} |
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
my @outLemma = ("","",""); |
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
while() #While lemmas are left in the input data |
213
|
|
|
|
|
|
|
{ |
214
|
|
|
|
|
|
|
for $tempIn (split("\n")) #processes data line by line. |
215
|
|
|
|
|
|
|
{ |
216
|
|
|
|
|
|
|
@outLemma = @{locate($tempIn)}; |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
if(scalar @outLemma > 0)#only print if ideal lemma found |
219
|
|
|
|
|
|
|
{ |
220
|
|
|
|
|
|
|
$tempOut = "$outLemma[0]\t$outLemma[1]\t$outLemma[2]\n"; |
221
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
print OUTDATA "$tempOut"; |
223
|
|
|
|
|
|
|
} |
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
} |
226
|
|
|
|
|
|
|
} |
227
|
|
|
|
|
|
|
close DATA; |
228
|
|
|
|
|
|
|
close OUTDATA; |
229
|
|
|
|
|
|
|
} |
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
=item $obj->locate($wordPosGloss) |
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
Takes in single lemma with gloss and returns location of best |
234
|
|
|
|
|
|
|
insertion point in WordNet. |
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
Parameter: Lemma string in format of 'word\tpos\titem-id\tdef' |
237
|
|
|
|
|
|
|
NOTE: String must only be separated by \t no space. |
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
Returns: Array in format of (item-id, WordNet sense, operation) |
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
=cut |
242
|
|
|
|
|
|
|
sub locate() |
243
|
|
|
|
|
|
|
{ |
244
|
|
|
|
|
|
|
my $base = 0; |
245
|
|
|
|
|
|
|
if(scalar @_ == 2)#checks if method entered by object. |
246
|
|
|
|
|
|
|
{ |
247
|
|
|
|
|
|
|
$base = 1; |
248
|
|
|
|
|
|
|
} |
249
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
#if preProcessing() hasn't been called, call it. |
251
|
|
|
|
|
|
|
if($preProcessed == 0) |
252
|
|
|
|
|
|
|
{ |
253
|
|
|
|
|
|
|
preProcessing(); |
254
|
|
|
|
|
|
|
} |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
my @inLemma = (); |
257
|
|
|
|
|
|
|
if(ref($_[$base]) eq 'ARRAY') #distinguishes between lemmas sent in as array vs string in \t format |
258
|
|
|
|
|
|
|
{ |
259
|
|
|
|
|
|
|
@inLemma =@{$_[$base]}; |
260
|
|
|
|
|
|
|
} |
261
|
|
|
|
|
|
|
else |
262
|
|
|
|
|
|
|
{ |
263
|
|
|
|
|
|
|
@inLemma = split("\t", $_[$base]); #stores lemma as formatted above |
264
|
|
|
|
|
|
|
} |
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
my @outLemma = (); |
267
|
|
|
|
|
|
|
#word2vec handles all the wordnet words at once, while the other methods handle them one at a time |
268
|
|
|
|
|
|
|
if($scoringMethod eq 'Word2Vec') |
269
|
|
|
|
|
|
|
{ |
270
|
|
|
|
|
|
|
@outLemma = @{word2VecCompare(\@inLemma)}; |
271
|
|
|
|
|
|
|
} |
272
|
|
|
|
|
|
|
else |
273
|
|
|
|
|
|
|
{ |
274
|
|
|
|
|
|
|
@outLemma = @{processLemma(\@inLemma)}; |
275
|
|
|
|
|
|
|
} |
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
return \@outLemma; |
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
} |
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
=item $obj->stopList($newStopList) |
282
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
Takes in new stop list, in regex form |
284
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
Parameter:the new stop list in regex substitution form (w1|w2|...|wn) |
286
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
Returns: nothing |
288
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
=cut |
290
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
sub stopList() |
292
|
|
|
|
|
|
|
{ |
293
|
|
|
|
|
|
|
my $base = 0; |
294
|
|
|
|
|
|
|
if(scalar @_ == 2)#checks if method entered by object. |
295
|
|
|
|
|
|
|
{ |
296
|
|
|
|
|
|
|
$base = 1; |
297
|
|
|
|
|
|
|
} |
298
|
|
|
|
|
|
|
my $tempStopList = $_[$base]; |
299
|
|
|
|
|
|
|
if($tempStopList =~ /\(.*(\|.*)?\)/g) |
300
|
|
|
|
|
|
|
{ |
301
|
|
|
|
|
|
|
$stopList = $tempStopList; |
302
|
|
|
|
|
|
|
} |
303
|
|
|
|
|
|
|
else |
304
|
|
|
|
|
|
|
{ |
305
|
|
|
|
|
|
|
my $self = shift; |
306
|
|
|
|
|
|
|
$self->{error} = 1; |
307
|
|
|
|
|
|
|
$self->{errorString} = "Proposed stop list not in regex substition form (w1|w2|...|wn), default remains"; |
308
|
|
|
|
|
|
|
} |
309
|
|
|
|
|
|
|
} |
310
|
|
|
|
|
|
|
|
311
|
|
|
|
|
|
|
=item $obj->setCleanUp($switch) |
312
|
|
|
|
|
|
|
|
313
|
|
|
|
|
|
|
Allows the user to toggle whether or not |
314
|
|
|
|
|
|
|
glosses should be cleaned up. |
315
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
Parameter: 0 or 1 to turn clean up off or on respectively |
317
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
Returns: nothing |
319
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
=cut |
321
|
|
|
|
|
|
|
|
322
|
|
|
|
|
|
|
sub setCleanUp() |
323
|
|
|
|
|
|
|
{ |
324
|
|
|
|
|
|
|
my $base = 0; |
325
|
|
|
|
|
|
|
if(scalar @_ == 2)#checks if method entered by object. |
326
|
|
|
|
|
|
|
{ |
327
|
|
|
|
|
|
|
$base = 1; |
328
|
|
|
|
|
|
|
} |
329
|
|
|
|
|
|
|
|
330
|
|
|
|
|
|
|
if($_[$base] == 0) #turns cleanUp off. |
331
|
|
|
|
|
|
|
{ |
332
|
|
|
|
|
|
|
$cleanUp = 0; |
333
|
|
|
|
|
|
|
} |
334
|
|
|
|
|
|
|
else #turns cleanUp on. |
335
|
|
|
|
|
|
|
{ |
336
|
|
|
|
|
|
|
$cleanUp = 1; |
337
|
|
|
|
|
|
|
} |
338
|
|
|
|
|
|
|
} |
339
|
|
|
|
|
|
|
|
340
|
|
|
|
|
|
|
=item $obj->addCleanUp($cleanUp) |
341
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
Allows the user to add their own |
343
|
|
|
|
|
|
|
regex for cleaning up the glosses. |
344
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
Parameter: Regex representing the cleanup |
346
|
|
|
|
|
|
|
the user wants performed. |
347
|
|
|
|
|
|
|
|
348
|
|
|
|
|
|
|
Returns: Nothing |
349
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
=cut |
351
|
|
|
|
|
|
|
|
352
|
|
|
|
|
|
|
sub addCleanUp() |
353
|
|
|
|
|
|
|
{ |
354
|
|
|
|
|
|
|
my $base = 0; |
355
|
|
|
|
|
|
|
if(scalar @_ == 2)#checks if method entered by object. |
356
|
|
|
|
|
|
|
{ |
357
|
|
|
|
|
|
|
$base = 1; |
358
|
|
|
|
|
|
|
} |
359
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
my $tempCleanUp = $_[$base]; |
361
|
|
|
|
|
|
|
if($tempCleanUp =~ /(s|t)\/.*\/g?/g) |
362
|
|
|
|
|
|
|
{ |
363
|
|
|
|
|
|
|
$userCleanUp = $tempCleanUp; |
364
|
|
|
|
|
|
|
} |
365
|
|
|
|
|
|
|
else |
366
|
|
|
|
|
|
|
{ |
367
|
|
|
|
|
|
|
my $self = shift; |
368
|
|
|
|
|
|
|
$self->{error} = 1; |
369
|
|
|
|
|
|
|
$self->{errorString} = "Clean Up not in regex format '/.../', default remains on"; |
370
|
|
|
|
|
|
|
} |
371
|
|
|
|
|
|
|
} |
372
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
=item $obj->preProcessing() |
374
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
Highly increases speed of program by making |
376
|
|
|
|
|
|
|
as many outside calls as possible and storing |
377
|
|
|
|
|
|
|
outside info to be used later. |
378
|
|
|
|
|
|
|
|
379
|
|
|
|
|
|
|
Parameter: none |
380
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
Returns: nothing |
382
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
=cut |
384
|
|
|
|
|
|
|
|
385
|
|
|
|
|
|
|
sub preProcessing() |
386
|
|
|
|
|
|
|
{ |
387
|
|
|
|
|
|
|
$preProcessed = 1; #Flag that preProcessing has been called. |
388
|
|
|
|
|
|
|
@wordNetNouns = $wn->listAllWords('noun'); #Stores all nouns from wordNet for multiple uses. |
389
|
|
|
|
|
|
|
@wordNetVerbs = $wn->listAllWords('verb'); #Stores all verbs from wordNet for multiple uses. |
390
|
|
|
|
|
|
|
#reset all glosses, senses, etc. |
391
|
|
|
|
|
|
|
%wnGlosses = (); |
392
|
|
|
|
|
|
|
@wnNounSenses; |
393
|
|
|
|
|
|
|
@wnVerbSenses; |
394
|
|
|
|
|
|
|
%wnHypes = (); |
395
|
|
|
|
|
|
|
%wnHypos = (); |
396
|
|
|
|
|
|
|
%wnSyns = (); |
397
|
|
|
|
|
|
|
%wnFreq = (); |
398
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
#Preemptively retrieves glosses, hypes, hypos, and syns for all senses as they will be used every iteration. |
401
|
|
|
|
|
|
|
foreach my $noun (@wordNetNouns) |
402
|
|
|
|
|
|
|
{ |
403
|
|
|
|
|
|
|
my @nSenses = $wn->querySense("$noun\#n"); #gets all senses for that word |
404
|
|
|
|
|
|
|
foreach my $curNSense (@nSenses) |
405
|
|
|
|
|
|
|
{ |
406
|
|
|
|
|
|
|
#stores in noun senses to differentiate from verbs. |
407
|
|
|
|
|
|
|
push(@wnNounSenses, $curNSense); |
408
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
#obtain each gloss and clean up before inserting into hash. |
410
|
|
|
|
|
|
|
my @nGlosses = $wn->querySense($curNSense, "glos"); |
411
|
|
|
|
|
|
|
my $tempSenseGloss = $nGlosses[0]; |
412
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
if($cleanUp == 1) |
414
|
|
|
|
|
|
|
{ |
415
|
|
|
|
|
|
|
#Clean up the words in the temporary sense gloss. |
416
|
|
|
|
|
|
|
$tempSenseGloss =~ s/(\(|\)|\.)//g; |
417
|
|
|
|
|
|
|
$tempSenseGloss =~ s/^a-zA-Z//g; |
418
|
|
|
|
|
|
|
$tempSenseGloss = lc $tempSenseGloss; #converts all words to lowercase. |
419
|
|
|
|
|
|
|
$tempSenseGloss =~ s/(^|\s)$stopList(\s|$)/ /g; #remove stop words |
420
|
|
|
|
|
|
|
} |
421
|
|
|
|
|
|
|
if($userCleanUp ne "\"\"") |
422
|
|
|
|
|
|
|
{ |
423
|
|
|
|
|
|
|
$tempSenseGloss =~ $userCleanUp; |
424
|
|
|
|
|
|
|
} |
425
|
|
|
|
|
|
|
|
426
|
|
|
|
|
|
|
#if stemming is on, stem each word in each gloss |
427
|
|
|
|
|
|
|
if($stemming == 1) |
428
|
|
|
|
|
|
|
{ |
429
|
|
|
|
|
|
|
my @tempStem = split(' ', $tempSenseGloss); |
430
|
|
|
|
|
|
|
my @stemmedGloss = @{$stemmer->stem(@tempStem)}; |
431
|
|
|
|
|
|
|
$tempSenseGloss = join(' ', @stemmedGloss); |
432
|
|
|
|
|
|
|
} |
433
|
|
|
|
|
|
|
|
434
|
|
|
|
|
|
|
#maps each sense to its gloss |
435
|
|
|
|
|
|
|
$wnGlosses{$curNSense} = $tempSenseGloss; |
436
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
#obtains and stores, hypes, hypos, and syns |
438
|
|
|
|
|
|
|
my @hypes = $wn->querySense($curNSense, "hype"); |
439
|
|
|
|
|
|
|
$wnHypes{$curNSense} = \@hypes; |
440
|
|
|
|
|
|
|
my @hypos = $wn->querySense($curNSense, "hypo"); |
441
|
|
|
|
|
|
|
$wnHypos{$curNSense} = \@hypos; |
442
|
|
|
|
|
|
|
my @syns = $wn->querySense($curNSense, "syns"); |
443
|
|
|
|
|
|
|
$wnSyns{$curNSense} = \@syns; |
444
|
|
|
|
|
|
|
$wnFreq{$curNSense} = $wn->frequency($curNSense); |
445
|
|
|
|
|
|
|
} |
446
|
|
|
|
|
|
|
} |
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
#stores verbs' senses' glosses, hypes, hypos, and syns. |
449
|
|
|
|
|
|
|
foreach my $verb (@wordNetVerbs) |
450
|
|
|
|
|
|
|
{ |
451
|
|
|
|
|
|
|
my @vSenses = $wn->querySense("$verb\#v"); #gets all senses for that word |
452
|
|
|
|
|
|
|
foreach my $curVSense (@vSenses) |
453
|
|
|
|
|
|
|
{ |
454
|
|
|
|
|
|
|
#stores in verb senses to differentiate later. |
455
|
|
|
|
|
|
|
push(@wnVerbSenses, $curVSense); |
456
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
#obtain each gloss and clean up before inserting into hash. |
458
|
|
|
|
|
|
|
my @vGlosses = $wn->querySense($curVSense, "glos"); |
459
|
|
|
|
|
|
|
my $tempSenseGloss = $vGlosses[0]; |
460
|
|
|
|
|
|
|
|
461
|
|
|
|
|
|
|
if($cleanUp == 1) |
462
|
|
|
|
|
|
|
{ |
463
|
|
|
|
|
|
|
#Clean up the words in the temporary sense gloss. |
464
|
|
|
|
|
|
|
$tempSenseGloss =~ s/(\(|\)|\.)//g; |
465
|
|
|
|
|
|
|
$tempSenseGloss =~ s/^a-zA-Z//g; |
466
|
|
|
|
|
|
|
$tempSenseGloss = lc $tempSenseGloss; #converts all words to lowercase. |
467
|
|
|
|
|
|
|
$tempSenseGloss =~ s/(^|\s)$stopList(\s|$)/ /g; #remove stop words |
468
|
|
|
|
|
|
|
} |
469
|
|
|
|
|
|
|
if($userCleanUp ne "\"\"") |
470
|
|
|
|
|
|
|
{ |
471
|
|
|
|
|
|
|
$tempSenseGloss =~ $userCleanUp; |
472
|
|
|
|
|
|
|
} |
473
|
|
|
|
|
|
|
|
474
|
|
|
|
|
|
|
#if stemming is on, stem each word in each gloss |
475
|
|
|
|
|
|
|
if($stemming == 1) |
476
|
|
|
|
|
|
|
{ |
477
|
|
|
|
|
|
|
my @tempStem = split(' ', $tempSenseGloss); |
478
|
|
|
|
|
|
|
my @stemmedGloss = @{$stemmer->stem(@tempStem)}; |
479
|
|
|
|
|
|
|
$tempSenseGloss = join(' ', @stemmedGloss); |
480
|
|
|
|
|
|
|
} |
481
|
|
|
|
|
|
|
|
482
|
|
|
|
|
|
|
#maps each sense to its gloss |
483
|
|
|
|
|
|
|
$wnGlosses{$curVSense} = $tempSenseGloss; |
484
|
|
|
|
|
|
|
|
485
|
|
|
|
|
|
|
#obtains and stores, hypes, hypos, and syns |
486
|
|
|
|
|
|
|
my @hypes = $wn->querySense($curVSense, "hype"); |
487
|
|
|
|
|
|
|
$wnHypes{$curVSense} = \@hypes; |
488
|
|
|
|
|
|
|
my @hypos = $wn->querySense($curVSense, "hypo"); |
489
|
|
|
|
|
|
|
$wnHypos{$curVSense} = \@hypos; |
490
|
|
|
|
|
|
|
my @syns = $wn->querySense($curVSense, "syns"); |
491
|
|
|
|
|
|
|
$wnSyns{$curVSense} = \@syns; |
492
|
|
|
|
|
|
|
$wnFreq{$curVSense} = $wn->frequency($curVSense); |
493
|
|
|
|
|
|
|
} |
494
|
|
|
|
|
|
|
} |
495
|
|
|
|
|
|
|
|
496
|
|
|
|
|
|
|
|
497
|
|
|
|
|
|
|
} |
498
|
|
|
|
|
|
|
|
499
|
|
|
|
|
|
|
=item $obj->processLemma(@inLemma) |
500
|
|
|
|
|
|
|
|
501
|
|
|
|
|
|
|
Determines where the OOV Lemma should be |
502
|
|
|
|
|
|
|
inserted into WordNet, returns the output. |
503
|
|
|
|
|
|
|
|
504
|
|
|
|
|
|
|
Parameter: the lemma to be inserted in array form |
505
|
|
|
|
|
|
|
(lemma, part-of-speech, item-id, definition, def source) |
506
|
|
|
|
|
|
|
|
507
|
|
|
|
|
|
|
Returns: chosen lemma in array form |
508
|
|
|
|
|
|
|
(item-id, WordNet sense, operation) |
509
|
|
|
|
|
|
|
|
510
|
|
|
|
|
|
|
=cut |
511
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
sub processLemma() |
513
|
|
|
|
|
|
|
{ |
514
|
|
|
|
|
|
|
my $base = 0; |
515
|
|
|
|
|
|
|
if(scalar @_ == 2)#checks if method entered by object. |
516
|
|
|
|
|
|
|
{ |
517
|
|
|
|
|
|
|
$base = 1; |
518
|
|
|
|
|
|
|
} |
519
|
|
|
|
|
|
|
|
520
|
|
|
|
|
|
|
my %senseScores = (); |
521
|
|
|
|
|
|
|
my $highSenseScore = 0; |
522
|
|
|
|
|
|
|
my $highSense = ""; |
523
|
|
|
|
|
|
|
my @inLemma = @{$_[$base]}; |
524
|
|
|
|
|
|
|
my @outLemma = ("","",""); |
525
|
|
|
|
|
|
|
my $attachMerge = ""; |
526
|
|
|
|
|
|
|
my @senses = (); |
527
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
if($preProcessed == 1) |
529
|
|
|
|
|
|
|
{ |
530
|
|
|
|
|
|
|
if($inLemma[1] =~ /noun/) |
531
|
|
|
|
|
|
|
{ |
532
|
|
|
|
|
|
|
@senses = @wnNounSenses; |
533
|
|
|
|
|
|
|
} |
534
|
|
|
|
|
|
|
else |
535
|
|
|
|
|
|
|
{ |
536
|
|
|
|
|
|
|
@senses = @wnVerbSenses; |
537
|
|
|
|
|
|
|
} |
538
|
|
|
|
|
|
|
|
539
|
|
|
|
|
|
|
foreach $curSense (@senses) #runs through each sense of current word |
540
|
|
|
|
|
|
|
{ |
541
|
|
|
|
|
|
|
my $score = scoreSense(\@inLemma, $curSense); |
542
|
|
|
|
|
|
|
|
543
|
|
|
|
|
|
|
if($score >= $highSenseScore) |
544
|
|
|
|
|
|
|
{ |
545
|
|
|
|
|
|
|
$highSenseScore = $score; |
546
|
|
|
|
|
|
|
$highSense = $curSense; |
547
|
|
|
|
|
|
|
} |
548
|
|
|
|
|
|
|
|
549
|
|
|
|
|
|
|
$senseScores{$curSense} = $score; |
550
|
|
|
|
|
|
|
} |
551
|
|
|
|
|
|
|
|
552
|
|
|
|
|
|
|
if($refineSense == 1) |
553
|
|
|
|
|
|
|
{ |
554
|
|
|
|
|
|
|
$highSense = refineSense(\@inLemma, $highSense); |
555
|
|
|
|
|
|
|
} |
556
|
|
|
|
|
|
|
|
557
|
|
|
|
|
|
|
if($wnFreq{$highSense} == 0) |
558
|
|
|
|
|
|
|
{ |
559
|
|
|
|
|
|
|
$attachMerge = "attach"; |
560
|
|
|
|
|
|
|
} |
561
|
|
|
|
|
|
|
else |
562
|
|
|
|
|
|
|
{ |
563
|
|
|
|
|
|
|
$attachMerge = "merge"; |
564
|
|
|
|
|
|
|
} |
565
|
|
|
|
|
|
|
|
566
|
|
|
|
|
|
|
$outLemma[0] = $inLemma[2]; |
567
|
|
|
|
|
|
|
$outLemma[1] = $highSense; |
568
|
|
|
|
|
|
|
$outLemma[2] = $attachMerge; |
569
|
|
|
|
|
|
|
return \@outLemma; |
570
|
|
|
|
|
|
|
} |
571
|
|
|
|
|
|
|
else |
572
|
|
|
|
|
|
|
{ |
573
|
|
|
|
|
|
|
my $self = shift; |
574
|
|
|
|
|
|
|
$self->{error} = 2; |
575
|
|
|
|
|
|
|
$self->{errorString} = "PreProcessing must be run before processLemma() is called."; |
576
|
|
|
|
|
|
|
} |
577
|
|
|
|
|
|
|
} |
578
|
|
|
|
|
|
|
|
579
|
|
|
|
|
|
|
=item $obj->toggleCompareGlosses($hype,$hypo,$syns) |
580
|
|
|
|
|
|
|
|
581
|
|
|
|
|
|
|
Toggles which glosses are used in score sense. |
582
|
|
|
|
|
|
|
by default, the sense, the sense's hypernyms' |
583
|
|
|
|
|
|
|
glosses,hyponyms' glosses, and synsets' glosses |
584
|
|
|
|
|
|
|
are turned on. This method allows for toggling |
585
|
|
|
|
|
|
|
of hypes,hypos,synsets, by passing in three |
586
|
|
|
|
|
|
|
parameters, 1 for on and 0 for off. |
587
|
|
|
|
|
|
|
Example: toggleCompareGlosses(0,0,0) toggles |
588
|
|
|
|
|
|
|
all three off. |
589
|
|
|
|
|
|
|
|
590
|
|
|
|
|
|
|
Parameters: 0 or 1 for toggling hypernyms, hyponyms, |
591
|
|
|
|
|
|
|
and synset comparisons. |
592
|
|
|
|
|
|
|
|
593
|
|
|
|
|
|
|
Returns: nothing |
594
|
|
|
|
|
|
|
|
595
|
|
|
|
|
|
|
=cut |
596
|
|
|
|
|
|
|
|
597
|
|
|
|
|
|
|
sub toggleCompareGlosses() |
598
|
|
|
|
|
|
|
{ |
599
|
|
|
|
|
|
|
my $base = 0; |
600
|
|
|
|
|
|
|
if(scalar @_ == 4)#checks if method entered by object. |
601
|
|
|
|
|
|
|
{ |
602
|
|
|
|
|
|
|
$base = 1; |
603
|
|
|
|
|
|
|
} |
604
|
|
|
|
|
|
|
|
605
|
|
|
|
|
|
|
if($_[$base] == 0) |
606
|
|
|
|
|
|
|
{ |
607
|
|
|
|
|
|
|
$useHypeGlosses = 0; |
608
|
|
|
|
|
|
|
} |
609
|
|
|
|
|
|
|
else |
610
|
|
|
|
|
|
|
{ |
611
|
|
|
|
|
|
|
$useHypeGlosses = 1; |
612
|
|
|
|
|
|
|
} |
613
|
|
|
|
|
|
|
|
614
|
|
|
|
|
|
|
$base++; |
615
|
|
|
|
|
|
|
|
616
|
|
|
|
|
|
|
if($_[$base] == 0) |
617
|
|
|
|
|
|
|
{ |
618
|
|
|
|
|
|
|
$useHypoGlosses = 0; |
619
|
|
|
|
|
|
|
} |
620
|
|
|
|
|
|
|
else |
621
|
|
|
|
|
|
|
{ |
622
|
|
|
|
|
|
|
$useHypoGlosess = 1; |
623
|
|
|
|
|
|
|
} |
624
|
|
|
|
|
|
|
|
625
|
|
|
|
|
|
|
$base++; |
626
|
|
|
|
|
|
|
|
627
|
|
|
|
|
|
|
if($_[$base] == 0) |
628
|
|
|
|
|
|
|
{ |
629
|
|
|
|
|
|
|
$useSynsGlosses = 0; |
630
|
|
|
|
|
|
|
} |
631
|
|
|
|
|
|
|
else |
632
|
|
|
|
|
|
|
{ |
633
|
|
|
|
|
|
|
$useSynsGlosses = 1; |
634
|
|
|
|
|
|
|
} |
635
|
|
|
|
|
|
|
} |
636
|
|
|
|
|
|
|
|
637
|
|
|
|
|
|
|
=item $obj->setBonus($bonus) |
638
|
|
|
|
|
|
|
|
639
|
|
|
|
|
|
|
Allows the user to set the bonus that will be |
640
|
|
|
|
|
|
|
used when scoring lemmas that contain the |
641
|
|
|
|
|
|
|
new lemma. |
642
|
|
|
|
|
|
|
|
643
|
|
|
|
|
|
|
Parameter: the multiplier that should be used in |
644
|
|
|
|
|
|
|
calculating the bonus. |
645
|
|
|
|
|
|
|
|
646
|
|
|
|
|
|
|
Returns: nothing |
647
|
|
|
|
|
|
|
|
648
|
|
|
|
|
|
|
=ctu |
649
|
|
|
|
|
|
|
|
650
|
|
|
|
|
|
|
sub setBonus() |
651
|
|
|
|
|
|
|
{ |
652
|
|
|
|
|
|
|
my $base = 0; |
653
|
|
|
|
|
|
|
if(scalar @_ == 2)#checks if method entered by object. |
654
|
|
|
|
|
|
|
{ |
655
|
|
|
|
|
|
|
$base = 1; |
656
|
|
|
|
|
|
|
} |
657
|
|
|
|
|
|
|
|
658
|
|
|
|
|
|
|
$bonus = $_[$base]; |
659
|
|
|
|
|
|
|
} |
660
|
|
|
|
|
|
|
|
661
|
|
|
|
|
|
|
=item $obj->scoreSense(@inLemma, $compareSense) |
662
|
|
|
|
|
|
|
|
663
|
|
|
|
|
|
|
Serves as a wrapper method to facilitate the |
664
|
|
|
|
|
|
|
main program by directing it to the currently |
665
|
|
|
|
|
|
|
chosen scoring method. By default the average |
666
|
|
|
|
|
|
|
highest scoring method is chosen. This can be |
667
|
|
|
|
|
|
|
changed with setScoreMethod(). |
668
|
|
|
|
|
|
|
|
669
|
|
|
|
|
|
|
Parameters: the in lemma in array form |
670
|
|
|
|
|
|
|
(lemma, part-of-speech, item-id, definition, def source) |
671
|
|
|
|
|
|
|
and the sense that the lemma is being compared to. |
672
|
|
|
|
|
|
|
|
673
|
|
|
|
|
|
|
Returns: a score of how related the in lemma is to the |
674
|
|
|
|
|
|
|
compareSense. |
675
|
|
|
|
|
|
|
|
676
|
|
|
|
|
|
|
=cut |
677
|
|
|
|
|
|
|
|
678
|
|
|
|
|
|
|
sub scoreSense() |
679
|
|
|
|
|
|
|
{ |
680
|
|
|
|
|
|
|
my $base = 0; |
681
|
|
|
|
|
|
|
if(scalar @_ == 3)#checks if method entered by object. |
682
|
|
|
|
|
|
|
{ |
683
|
|
|
|
|
|
|
$base = 1; |
684
|
|
|
|
|
|
|
} |
685
|
|
|
|
|
|
|
|
686
|
|
|
|
|
|
|
my @inLemma = @{$_[$base]}; |
687
|
|
|
|
|
|
|
$base++; |
688
|
|
|
|
|
|
|
my $curSense = $_[$base]; |
689
|
|
|
|
|
|
|
|
690
|
|
|
|
|
|
|
my $score = 0; |
691
|
|
|
|
|
|
|
if($scoringMethod eq "baseline") |
692
|
|
|
|
|
|
|
{ |
693
|
|
|
|
|
|
|
$score = baseline(\@inLemma, $curSense); |
694
|
|
|
|
|
|
|
} |
695
|
|
|
|
|
|
|
if($scoringMethod eq "BwS") |
696
|
|
|
|
|
|
|
{ |
697
|
|
|
|
|
|
|
$score = BwS(\@inLemma, $curSense); |
698
|
|
|
|
|
|
|
} |
699
|
|
|
|
|
|
|
if($scoringMethod eq "Similarity") |
700
|
|
|
|
|
|
|
{ |
701
|
|
|
|
|
|
|
$score = Similarity(\@inLemma, $curSense); |
702
|
|
|
|
|
|
|
} |
703
|
|
|
|
|
|
|
|
704
|
|
|
|
|
|
|
return $score; |
705
|
|
|
|
|
|
|
} |
706
|
|
|
|
|
|
|
|
707
|
|
|
|
|
|
|
=item $obj->setScoreMethod($scoreMethod) |
708
|
|
|
|
|
|
|
|
709
|
|
|
|
|
|
|
Allows the user to choose which scoring method |
710
|
|
|
|
|
|
|
should be used by default when running the |
711
|
|
|
|
|
|
|
program from the top. Options are: |
712
|
|
|
|
|
|
|
'baseline' |
713
|
|
|
|
|
|
|
'BwS' - baseline system with stemming and lemmitization |
714
|
|
|
|
|
|
|
--as more are added they will appear here. |
715
|
|
|
|
|
|
|
|
716
|
|
|
|
|
|
|
Parameter: the chosen scoring method |
717
|
|
|
|
|
|
|
|
718
|
|
|
|
|
|
|
Returns: nothing. |
719
|
|
|
|
|
|
|
|
720
|
|
|
|
|
|
|
=cut |
721
|
|
|
|
|
|
|
|
722
|
|
|
|
|
|
|
sub setScoreMethod() |
723
|
|
|
|
|
|
|
{ |
724
|
|
|
|
|
|
|
my $base = 0; |
725
|
|
|
|
|
|
|
|
726
|
|
|
|
|
|
|
if(scalar @_ == 2)#checks if method entered by object. |
727
|
|
|
|
|
|
|
{ |
728
|
|
|
|
|
|
|
$base = 1; |
729
|
|
|
|
|
|
|
} |
730
|
|
|
|
|
|
|
|
731
|
|
|
|
|
|
|
my $scoreMethod = $_[$base]; |
732
|
|
|
|
|
|
|
|
733
|
|
|
|
|
|
|
#check if the score method is in scoring methods. |
734
|
|
|
|
|
|
|
my @matches = grep(/$scoreMethod/, @scoringMethods); |
735
|
|
|
|
|
|
|
if(scalar @matches > 0) |
736
|
|
|
|
|
|
|
{ |
737
|
|
|
|
|
|
|
$scoringMethod = $scoreMethod; |
738
|
|
|
|
|
|
|
} |
739
|
|
|
|
|
|
|
|
740
|
|
|
|
|
|
|
} |
741
|
|
|
|
|
|
|
|
742
|
|
|
|
|
|
|
=item $obj->Similarity(@inLemma, $compareSense) |
743
|
|
|
|
|
|
|
|
744
|
|
|
|
|
|
|
Calculates a score for the passed sense and returns |
745
|
|
|
|
|
|
|
that score. |
746
|
|
|
|
|
|
|
|
747
|
|
|
|
|
|
|
Parameters: the in lemma in array form |
748
|
|
|
|
|
|
|
(lemma, part-of-speech, item-id, definition, def source) |
749
|
|
|
|
|
|
|
and the sense that the lemma is being compared to. |
750
|
|
|
|
|
|
|
|
751
|
|
|
|
|
|
|
Returns: a score of how related the im lemma is to the |
752
|
|
|
|
|
|
|
compareSense. |
753
|
|
|
|
|
|
|
|
754
|
|
|
|
|
|
|
=cut |
755
|
|
|
|
|
|
|
|
756
|
|
|
|
|
|
|
sub Similarity() |
757
|
|
|
|
|
|
|
{ |
758
|
|
|
|
|
|
|
my $base = 0; |
759
|
|
|
|
|
|
|
if(scalar @_ == 3)#checks if method entered by object. |
760
|
|
|
|
|
|
|
{ |
761
|
|
|
|
|
|
|
$base = 1; |
762
|
|
|
|
|
|
|
} |
763
|
|
|
|
|
|
|
|
764
|
|
|
|
|
|
|
my @inLemma = @{$_[$base]}; |
765
|
|
|
|
|
|
|
$base++; |
766
|
|
|
|
|
|
|
my $curSense = $_[$base]; |
767
|
|
|
|
|
|
|
|
768
|
|
|
|
|
|
|
my $def = @inLemma[3]; |
769
|
|
|
|
|
|
|
|
770
|
|
|
|
|
|
|
#split definition and stem the words |
771
|
|
|
|
|
|
|
my @listDef =split(' ', $def); |
772
|
|
|
|
|
|
|
my @defStemmed = @{$stemmer->stem(@listDef)}; |
773
|
|
|
|
|
|
|
|
774
|
|
|
|
|
|
|
#join definition back together and tag with pos |
775
|
|
|
|
|
|
|
$def = join(' ', @defStemmed); |
776
|
|
|
|
|
|
|
my $tagged = $tagger->add_tags($def); |
777
|
|
|
|
|
|
|
|
778
|
|
|
|
|
|
|
#split the tagged definition for individual word processing |
779
|
|
|
|
|
|
|
@tagArray = split(' ', $tagged); |
780
|
|
|
|
|
|
|
my @similar = (); |
781
|
|
|
|
|
|
|
|
782
|
|
|
|
|
|
|
#step through each tagged word and find the first sense in wordnet, then add that to the @similar list |
783
|
|
|
|
|
|
|
foreach my $cur (@tagArray) |
784
|
|
|
|
|
|
|
{ |
785
|
|
|
|
|
|
|
my $pos = ''; |
786
|
|
|
|
|
|
|
if($cur =~ /.*/) |
787
|
|
|
|
|
|
|
{ |
788
|
|
|
|
|
|
|
$pos = 'n'; |
789
|
|
|
|
|
|
|
} |
790
|
|
|
|
|
|
|
else |
791
|
|
|
|
|
|
|
{ |
792
|
|
|
|
|
|
|
if($cur =~ /.*/) |
793
|
|
|
|
|
|
|
{ |
794
|
|
|
|
|
|
|
$pos = 'v'; |
795
|
|
|
|
|
|
|
} |
796
|
|
|
|
|
|
|
else |
797
|
|
|
|
|
|
|
{ |
798
|
|
|
|
|
|
|
if($cur =~ /.*/) |
799
|
|
|
|
|
|
|
{ |
800
|
|
|
|
|
|
|
$pos = 'a'; |
801
|
|
|
|
|
|
|
} |
802
|
|
|
|
|
|
|
} |
803
|
|
|
|
|
|
|
} |
804
|
|
|
|
|
|
|
|
805
|
|
|
|
|
|
|
if(length $pos == 1) |
806
|
|
|
|
|
|
|
{ |
807
|
|
|
|
|
|
|
$cur =~ s/<[nvj\/].{1,3}>//g; |
808
|
|
|
|
|
|
|
@wnQuery = $wn->querySense("$cur#$pos"); |
809
|
|
|
|
|
|
|
push @similar, $wnQuery[0]; |
810
|
|
|
|
|
|
|
} |
811
|
|
|
|
|
|
|
} |
812
|
|
|
|
|
|
|
|
813
|
|
|
|
|
|
|
my $score = 0; |
814
|
|
|
|
|
|
|
|
815
|
|
|
|
|
|
|
foreach my $curSim (@similar) |
816
|
|
|
|
|
|
|
{ |
817
|
|
|
|
|
|
|
my $value = $measure->getRelatedness("$curSense", "$curSim"); |
818
|
|
|
|
|
|
|
$score = $score + $value; |
819
|
|
|
|
|
|
|
} |
820
|
|
|
|
|
|
|
|
821
|
|
|
|
|
|
|
return $score; |
822
|
|
|
|
|
|
|
} |
823
|
|
|
|
|
|
|
|
824
|
|
|
|
|
|
|
=item $obj->BwS(@inLemma, $compareSense) |
825
|
|
|
|
|
|
|
|
826
|
|
|
|
|
|
|
Calculates a score for the passed sense and returns |
827
|
|
|
|
|
|
|
that score. This is a modified baseline() method |
828
|
|
|
|
|
|
|
which adds stemming to the data. |
829
|
|
|
|
|
|
|
|
830
|
|
|
|
|
|
|
Parameters: the in lemma in array form |
831
|
|
|
|
|
|
|
(lemma, part-of-speech, item-id, definition, def source) |
832
|
|
|
|
|
|
|
and the sense that the lemma is being compared to. |
833
|
|
|
|
|
|
|
|
834
|
|
|
|
|
|
|
Returns: a score of how related the in lemma is to the |
835
|
|
|
|
|
|
|
compareSense. |
836
|
|
|
|
|
|
|
|
837
|
|
|
|
|
|
|
=cut |
838
|
|
|
|
|
|
|
|
839
|
|
|
|
|
|
|
sub BwS() |
840
|
|
|
|
|
|
|
{ |
841
|
|
|
|
|
|
|
my $base = 0; |
842
|
|
|
|
|
|
|
if(scalar @_ == 3)#checks if method entered by object. |
843
|
|
|
|
|
|
|
{ |
844
|
|
|
|
|
|
|
$base = 1; |
845
|
|
|
|
|
|
|
} |
846
|
|
|
|
|
|
|
|
847
|
|
|
|
|
|
|
my @inLemma = @{$_[$base]}; |
848
|
|
|
|
|
|
|
$base++; |
849
|
|
|
|
|
|
|
my $curSense = $_[$base]; |
850
|
|
|
|
|
|
|
|
851
|
|
|
|
|
|
|
if($stemmed == 0) |
852
|
|
|
|
|
|
|
{ |
853
|
|
|
|
|
|
|
$stemming = 1; |
854
|
|
|
|
|
|
|
preProcessing(); |
855
|
|
|
|
|
|
|
$stemmed = 1; |
856
|
|
|
|
|
|
|
} |
857
|
|
|
|
|
|
|
|
858
|
|
|
|
|
|
|
return simpleScoreSense(\@inLemma, $curSense); |
859
|
|
|
|
|
|
|
|
860
|
|
|
|
|
|
|
} |
861
|
|
|
|
|
|
|
|
862
|
|
|
|
|
|
|
=item $obj->baseline(@inLemma, $compareSense) |
863
|
|
|
|
|
|
|
|
864
|
|
|
|
|
|
|
Calculates a score for the passed sense then returns |
865
|
|
|
|
|
|
|
that score. This class is a wrapper for the |
866
|
|
|
|
|
|
|
simpleScoreSense() method as it makes sure no stemming |
867
|
|
|
|
|
|
|
or lemmatization is present in the preProcessing(). |
868
|
|
|
|
|
|
|
|
869
|
|
|
|
|
|
|
Parameters: the in lemma in array form |
870
|
|
|
|
|
|
|
(lemma, part-of-speech, item-id, definition, def source) |
871
|
|
|
|
|
|
|
and the sense that the lemma is being compared to. |
872
|
|
|
|
|
|
|
|
873
|
|
|
|
|
|
|
Returns: a score of how related the in lemma is to the |
874
|
|
|
|
|
|
|
compareSense. |
875
|
|
|
|
|
|
|
|
876
|
|
|
|
|
|
|
=cut |
877
|
|
|
|
|
|
|
|
878
|
|
|
|
|
|
|
sub baseline() |
879
|
|
|
|
|
|
|
{ |
880
|
|
|
|
|
|
|
my $base = 0; |
881
|
|
|
|
|
|
|
if(scalar @_ == 3)#checks if method entered by object. |
882
|
|
|
|
|
|
|
{ |
883
|
|
|
|
|
|
|
$base = 1; |
884
|
|
|
|
|
|
|
} |
885
|
|
|
|
|
|
|
|
886
|
|
|
|
|
|
|
my @inLemma = @{$_[$base]}; |
887
|
|
|
|
|
|
|
$base++; |
888
|
|
|
|
|
|
|
my $curSense = $_[$base]; |
889
|
|
|
|
|
|
|
|
890
|
|
|
|
|
|
|
if($stemmed == 1) |
891
|
|
|
|
|
|
|
{ |
892
|
|
|
|
|
|
|
$stemming = 0; |
893
|
|
|
|
|
|
|
preProcessing(); |
894
|
|
|
|
|
|
|
$stemmed = 0; |
895
|
|
|
|
|
|
|
} |
896
|
|
|
|
|
|
|
|
897
|
|
|
|
|
|
|
return simpleScoreSense(\@inLemma, $curSense); |
898
|
|
|
|
|
|
|
|
899
|
|
|
|
|
|
|
} |
900
|
|
|
|
|
|
|
|
901
|
|
|
|
|
|
|
=item $obj->word2VecCompare(@inLemma) |
902
|
|
|
|
|
|
|
|
903
|
|
|
|
|
|
|
Calculates a score for the passed sense by |
904
|
|
|
|
|
|
|
using the gensim Word2Vec model trained on Google |
905
|
|
|
|
|
|
|
news vectors. |
906
|
|
|
|
|
|
|
|
907
|
|
|
|
|
|
|
Parameters: the in lemma in array form |
908
|
|
|
|
|
|
|
(lemma, part-of-speech, item-id, definition, def source) |
909
|
|
|
|
|
|
|
and the sense that the lemma is being compared to. |
910
|
|
|
|
|
|
|
|
911
|
|
|
|
|
|
|
Returns: a score of how related the in lemma is to the |
912
|
|
|
|
|
|
|
compareSense. |
913
|
|
|
|
|
|
|
|
914
|
|
|
|
|
|
|
=cut |
915
|
|
|
|
|
|
|
|
916
|
|
|
|
|
|
|
sub word2VecCompare() |
917
|
|
|
|
|
|
|
{ |
918
|
|
|
|
|
|
|
my $base = 0; |
919
|
|
|
|
|
|
|
if(scalar @_ == 2)#checks if method entered by object. |
920
|
|
|
|
|
|
|
{ |
921
|
|
|
|
|
|
|
$base = 1; |
922
|
|
|
|
|
|
|
} |
923
|
|
|
|
|
|
|
|
924
|
|
|
|
|
|
|
my @inLemma = @{$_[$base]}; |
925
|
|
|
|
|
|
|
my @candidateArray = (); |
926
|
|
|
|
|
|
|
|
927
|
|
|
|
|
|
|
my $tempLemmaGloss = $inLemma[3]; |
928
|
|
|
|
|
|
|
|
929
|
|
|
|
|
|
|
if($cleanUp == 1) |
930
|
|
|
|
|
|
|
{ |
931
|
|
|
|
|
|
|
#Clean up the words in the temp lemma gloss. |
932
|
|
|
|
|
|
|
$tempLemmaGloss =~ s/(\(|\)|\.)//g; |
933
|
|
|
|
|
|
|
$tempLemmaGloss =~ s/^a-zA-Z//g; |
934
|
|
|
|
|
|
|
$tempLemmaGloss = lc $tempLemmaGloss; |
935
|
|
|
|
|
|
|
$tempLemmaGloss =~ s/(^|\s)$stopList(\s|$)/ /g; #remove stop words |
936
|
|
|
|
|
|
|
} |
937
|
|
|
|
|
|
|
|
938
|
|
|
|
|
|
|
|
939
|
|
|
|
|
|
|
if($inLemma[1] eq 'noun') |
940
|
|
|
|
|
|
|
{ |
941
|
|
|
|
|
|
|
@candidateArray = @wordNetNouns; |
942
|
|
|
|
|
|
|
} |
943
|
|
|
|
|
|
|
else |
944
|
|
|
|
|
|
|
{ |
945
|
|
|
|
|
|
|
@candidateArray = @wordNetVerbs; |
946
|
|
|
|
|
|
|
} |
947
|
|
|
|
|
|
|
|
948
|
|
|
|
|
|
|
open (WNFILE, '>', "tmpfile") or die $!; |
949
|
|
|
|
|
|
|
print WNFILE "$cValue\n"; |
950
|
|
|
|
|
|
|
print WNFILE "$inLemma[0]\n"; #print OOV Lemma first which will be handled by python |
951
|
|
|
|
|
|
|
print WNFILE "$tempLemmaGloss\n"; |
952
|
|
|
|
|
|
|
#create a file of all candidate WordNet words to be passed to python word2vec |
953
|
|
|
|
|
|
|
foreach $curW (@candidateArray) |
954
|
|
|
|
|
|
|
{ |
955
|
|
|
|
|
|
|
if($curW !~ /(^|\s)$stopList(\s|$)/g) |
956
|
|
|
|
|
|
|
{ |
957
|
|
|
|
|
|
|
print WNFILE "$curW\n"; |
958
|
|
|
|
|
|
|
} |
959
|
|
|
|
|
|
|
} |
960
|
|
|
|
|
|
|
close WNFILE; |
961
|
|
|
|
|
|
|
|
962
|
|
|
|
|
|
|
#open(my $ideal, "|-", "python ~/WordNet-Extend/word2vecSimilarity.py tmpfile $inLemma[0]") or die "Cannot run python script: $!"; |
963
|
|
|
|
|
|
|
|
964
|
|
|
|
|
|
|
$ideal =`python -W ignore ~/bin/word2vecSimilarity.py tmpfile`; |
965
|
|
|
|
|
|
|
|
966
|
|
|
|
|
|
|
chomp $ideal; |
967
|
|
|
|
|
|
|
my $attachMerge = ""; |
968
|
|
|
|
|
|
|
if($wnFreq{$ideal} == 0) |
969
|
|
|
|
|
|
|
{ |
970
|
|
|
|
|
|
|
$attachMerge = "attach"; |
971
|
|
|
|
|
|
|
} |
972
|
|
|
|
|
|
|
else |
973
|
|
|
|
|
|
|
{ |
974
|
|
|
|
|
|
|
$attachMerge = "merge"; |
975
|
|
|
|
|
|
|
} |
976
|
|
|
|
|
|
|
|
977
|
|
|
|
|
|
|
my $pos = ""; |
978
|
|
|
|
|
|
|
if($inLemma[1] eq 'noun') |
979
|
|
|
|
|
|
|
{ |
980
|
|
|
|
|
|
|
$pos = 'n'; |
981
|
|
|
|
|
|
|
} |
982
|
|
|
|
|
|
|
else |
983
|
|
|
|
|
|
|
{ |
984
|
|
|
|
|
|
|
$pos = 'v' |
985
|
|
|
|
|
|
|
} |
986
|
|
|
|
|
|
|
|
987
|
|
|
|
|
|
|
my @outLemma = (); |
988
|
|
|
|
|
|
|
if($ideal ne "") |
989
|
|
|
|
|
|
|
{ |
990
|
|
|
|
|
|
|
@outLemma = ("$inLemma[2]", "$ideal#$pos#1", "$attachMerge"); |
991
|
|
|
|
|
|
|
} |
992
|
|
|
|
|
|
|
# else |
993
|
|
|
|
|
|
|
# { |
994
|
|
|
|
|
|
|
# my $self = shift; |
995
|
|
|
|
|
|
|
# $self->{error} = 1; |
996
|
|
|
|
|
|
|
# $self->{errorString} = "No ideal found, consider changing confidence value"; |
997
|
|
|
|
|
|
|
# } |
998
|
|
|
|
|
|
|
#unlink 'tmpfile'; |
999
|
|
|
|
|
|
|
|
1000
|
|
|
|
|
|
|
return \@outLemma; |
1001
|
|
|
|
|
|
|
|
1002
|
|
|
|
|
|
|
} |
1003
|
|
|
|
|
|
|
|
1004
|
|
|
|
|
|
|
=item $obj->setConfidenceValue() |
1005
|
|
|
|
|
|
|
|
1006
|
|
|
|
|
|
|
Allows the user to set the confidence value for word2vecCompare(). |
1007
|
|
|
|
|
|
|
The confidence value is the cutoff for the similarity score. If |
1008
|
|
|
|
|
|
|
the similarity score is below the confidence value it will be dropped. |
1009
|
|
|
|
|
|
|
This aims to increase accuracy but will reduce recall. |
1010
|
|
|
|
|
|
|
|
1011
|
|
|
|
|
|
|
Parameters: the new confidence value, default is set to 0 |
1012
|
|
|
|
|
|
|
|
1013
|
|
|
|
|
|
|
Returns: Nothing |
1014
|
|
|
|
|
|
|
|
1015
|
|
|
|
|
|
|
=cut |
1016
|
|
|
|
|
|
|
|
1017
|
|
|
|
|
|
|
sub setConfidenceValue() |
1018
|
|
|
|
|
|
|
{ |
1019
|
|
|
|
|
|
|
my $base = 0; |
1020
|
|
|
|
|
|
|
|
1021
|
|
|
|
|
|
|
if(scalar @_ == 2)#checks if method entered by object |
1022
|
|
|
|
|
|
|
{ |
1023
|
|
|
|
|
|
|
$base = 1; |
1024
|
|
|
|
|
|
|
} |
1025
|
|
|
|
|
|
|
|
1026
|
|
|
|
|
|
|
my $newCValue = $_[$base]; |
1027
|
|
|
|
|
|
|
|
1028
|
|
|
|
|
|
|
$cValue = $newCValue; |
1029
|
|
|
|
|
|
|
|
1030
|
|
|
|
|
|
|
} |
1031
|
|
|
|
|
|
|
|
1032
|
|
|
|
|
|
|
|
1033
|
|
|
|
|
|
|
=item $obj->simpleScoreSense(@inLemma, $compareSense) |
1034
|
|
|
|
|
|
|
|
1035
|
|
|
|
|
|
|
Calculates a score for the passed sense then |
1036
|
|
|
|
|
|
|
returns that score. This is the baseline system which |
1037
|
|
|
|
|
|
|
was submitted for SemEval16 task 14. This algorithm |
1038
|
|
|
|
|
|
|
scores by overlapping words found in the lemma's gloss |
1039
|
|
|
|
|
|
|
and also with the lemma's hypernym and hyponyms' glosses. |
1040
|
|
|
|
|
|
|
|
1041
|
|
|
|
|
|
|
Parameters: the in lemma in array form |
1042
|
|
|
|
|
|
|
(lemma, part-of-speech, item-id, definition, def source) |
1043
|
|
|
|
|
|
|
and the sense that the lemma is being compared to. |
1044
|
|
|
|
|
|
|
|
1045
|
|
|
|
|
|
|
Returns: a score of how related the in lemma is to the |
1046
|
|
|
|
|
|
|
compareSense. |
1047
|
|
|
|
|
|
|
|
1048
|
|
|
|
|
|
|
=cut |
1049
|
|
|
|
|
|
|
|
1050
|
|
|
|
|
|
|
sub simpleScoreSense() |
1051
|
|
|
|
|
|
|
{ |
1052
|
|
|
|
|
|
|
my $base = 0; |
1053
|
|
|
|
|
|
|
if(scalar @_ == 3)#checks if method entered by object. |
1054
|
|
|
|
|
|
|
{ |
1055
|
|
|
|
|
|
|
$base = 1; |
1056
|
|
|
|
|
|
|
} |
1057
|
|
|
|
|
|
|
|
1058
|
|
|
|
|
|
|
my @inLemma = @{$_[$base]}; |
1059
|
|
|
|
|
|
|
$base++; |
1060
|
|
|
|
|
|
|
my $curSense = $_[$base]; |
1061
|
|
|
|
|
|
|
my $word = substr($curSense, 0, index($curSense, '#')); #extracts base word. |
1062
|
|
|
|
|
|
|
|
1063
|
|
|
|
|
|
|
#_________________Sense Gloss_________________________________ |
1064
|
|
|
|
|
|
|
my @curSenseGloss = split (' ', $wnGlosses{$curSense}); #initialize current sense gloss. |
1065
|
|
|
|
|
|
|
|
1066
|
|
|
|
|
|
|
my @extendedGloss = getExtendedGloss($curSense); |
1067
|
|
|
|
|
|
|
|
1068
|
|
|
|
|
|
|
#________________Lemma Gloss_________________________________ |
1069
|
|
|
|
|
|
|
my $tempLemmaGloss = $inLemma[3]; |
1070
|
|
|
|
|
|
|
|
1071
|
|
|
|
|
|
|
|
1072
|
|
|
|
|
|
|
if($cleanUp == 1) |
1073
|
|
|
|
|
|
|
{ |
1074
|
|
|
|
|
|
|
#Clean up the words in the temp lemma gloss. |
1075
|
|
|
|
|
|
|
$tempLemmaGloss =~ s/(\(|\)|\.)//g; |
1076
|
|
|
|
|
|
|
$tempLemmaGloss =~ s/^a-zA-Z//g; |
1077
|
|
|
|
|
|
|
$tempLemmaGloss = lc $tempLemmaGloss; |
1078
|
|
|
|
|
|
|
$tempLemmaGloss =~ s/(^|\s)$stopList(\s|$)/ /g; #remove stop words |
1079
|
|
|
|
|
|
|
} |
1080
|
|
|
|
|
|
|
if($userCleanUp ne "\"\"") |
1081
|
|
|
|
|
|
|
{ |
1082
|
|
|
|
|
|
|
$tempLemmaGloss =~ $userCleanUp; |
1083
|
|
|
|
|
|
|
} |
1084
|
|
|
|
|
|
|
|
1085
|
|
|
|
|
|
|
my @curLemmaGloss = split(' ', $tempLemmaGloss); |
1086
|
|
|
|
|
|
|
|
1087
|
|
|
|
|
|
|
|
1088
|
|
|
|
|
|
|
#__________________Overlaps__________________________________ |
1089
|
|
|
|
|
|
|
my $glossLength = 0; |
1090
|
|
|
|
|
|
|
my $overlaps = 0.0; #number of overlapped words. |
1091
|
|
|
|
|
|
|
|
1092
|
|
|
|
|
|
|
#scan through each word from the sense gloss and see if any overlap on the lemma gloss. |
1093
|
|
|
|
|
|
|
for my $lWord (0..$#curLemmaGloss) |
1094
|
|
|
|
|
|
|
{ |
1095
|
|
|
|
|
|
|
$glossLength = $glossLength + length $curLemmaGloss[$lWord]; |
1096
|
|
|
|
|
|
|
if($curLemmaGloss[$lWord] =~ /\b$word\b/) #if lemma contains current word from sense itself |
1097
|
|
|
|
|
|
|
{ |
1098
|
|
|
|
|
|
|
$overlaps = $overlaps + $bonus*(length $word); |
1099
|
|
|
|
|
|
|
} |
1100
|
|
|
|
|
|
|
|
1101
|
|
|
|
|
|
|
$spaceWord = $word; |
1102
|
|
|
|
|
|
|
$spaceWord =~ s/_/ /g; #substitute underscores for spaces for comparison below |
1103
|
|
|
|
|
|
|
if($spaceWord =~ /(^\w+\s\b$curLemmaGloss[$lWord]\b$)|(^\b$curLemmaGloss[$lWord]\b\s\w+$)/) |
1104
|
|
|
|
|
|
|
{ |
1105
|
|
|
|
|
|
|
$overlaps = $overlaps + $bonus*(length $curLemmaGloss[$lWord]); |
1106
|
|
|
|
|
|
|
} |
1107
|
|
|
|
|
|
|
|
1108
|
|
|
|
|
|
|
for my $sWord (0..$#curSenseGloss) |
1109
|
|
|
|
|
|
|
{ |
1110
|
|
|
|
|
|
|
if($curLemmaGloss[$lWord] =~ /\b\Q$curSenseGloss[$sWord]\E\b?/) |
1111
|
|
|
|
|
|
|
{ |
1112
|
|
|
|
|
|
|
$overlaps = $overlaps + length $curSenseGloss[$sWord]; |
1113
|
|
|
|
|
|
|
} |
1114
|
|
|
|
|
|
|
} |
1115
|
|
|
|
|
|
|
for my $extWord (0..$#extendedGloss) |
1116
|
|
|
|
|
|
|
{ |
1117
|
|
|
|
|
|
|
if($curLemmaGloss[$lWord] =~ /\b\Q$extendedGloss[$extWord]\E\b?/) |
1118
|
|
|
|
|
|
|
{ |
1119
|
|
|
|
|
|
|
$overlaps = $overlaps + length $extendedGloss[$extWord]; |
1120
|
|
|
|
|
|
|
} |
1121
|
|
|
|
|
|
|
} |
1122
|
|
|
|
|
|
|
|
1123
|
|
|
|
|
|
|
} |
1124
|
|
|
|
|
|
|
|
1125
|
|
|
|
|
|
|
$score = $overlaps/$glossLength; |
1126
|
|
|
|
|
|
|
|
1127
|
|
|
|
|
|
|
return $score; |
1128
|
|
|
|
|
|
|
} |
1129
|
|
|
|
|
|
|
|
1130
|
|
|
|
|
|
|
=item $obj->getExtendedGloss($compareSense) |
1131
|
|
|
|
|
|
|
|
1132
|
|
|
|
|
|
|
Calculates the extended gloss based on which |
1133
|
|
|
|
|
|
|
glosses are toggled and returns an array |
1134
|
|
|
|
|
|
|
|
1135
|
|
|
|
|
|
|
which contains the full glosses. |
1136
|
|
|
|
|
|
|
|
1137
|
|
|
|
|
|
|
Parameter: the sense which the extended gloss is |
1138
|
|
|
|
|
|
|
based on |
1139
|
|
|
|
|
|
|
|
1140
|
|
|
|
|
|
|
Returns: an array which contains the extended gloss |
1141
|
|
|
|
|
|
|
|
1142
|
|
|
|
|
|
|
=cut |
1143
|
|
|
|
|
|
|
|
1144
|
|
|
|
|
|
|
sub getExtendedGloss() |
1145
|
|
|
|
|
|
|
{ |
1146
|
|
|
|
|
|
|
my $base = 0; |
1147
|
|
|
|
|
|
|
if(scalar @_ == 2)#checks if method entered by object. |
1148
|
|
|
|
|
|
|
{ |
1149
|
|
|
|
|
|
|
$base = 1; |
1150
|
|
|
|
|
|
|
} |
1151
|
|
|
|
|
|
|
|
1152
|
|
|
|
|
|
|
my $curSense = $_[$base]; |
1153
|
|
|
|
|
|
|
my @extendedGloss = (); |
1154
|
|
|
|
|
|
|
|
1155
|
|
|
|
|
|
|
#__________________Hype Gloss_________________________________ |
1156
|
|
|
|
|
|
|
if($useHypeGlosses == 1) |
1157
|
|
|
|
|
|
|
{ |
1158
|
|
|
|
|
|
|
#Now expands to hypernyms glosses in overlaps |
1159
|
|
|
|
|
|
|
my @senseHypes = @{$wnHypes{$curSense}}; |
1160
|
|
|
|
|
|
|
my @senseHypeGloss = (); |
1161
|
|
|
|
|
|
|
my $tempAllHypeGloss = ""; |
1162
|
|
|
|
|
|
|
|
1163
|
|
|
|
|
|
|
for my $hype (0..$#senseHypes) |
1164
|
|
|
|
|
|
|
{ |
1165
|
|
|
|
|
|
|
my $tempHypeGloss = $wnGlosses{$hype}; |
1166
|
|
|
|
|
|
|
|
1167
|
|
|
|
|
|
|
$tempAllHypeGloss = $tempAllHypeGloss . " " . $tempHypeGloss; |
1168
|
|
|
|
|
|
|
} |
1169
|
|
|
|
|
|
|
|
1170
|
|
|
|
|
|
|
@senseHypeGloss = split(' ', $tempAllHypeGloss); |
1171
|
|
|
|
|
|
|
|
1172
|
|
|
|
|
|
|
push(@extendedGloss, @senseHypeGloss); |
1173
|
|
|
|
|
|
|
} |
1174
|
|
|
|
|
|
|
|
1175
|
|
|
|
|
|
|
#________________Hypo Gloss__________________________________ |
1176
|
|
|
|
|
|
|
if($useHypoGlosses == 1) |
1177
|
|
|
|
|
|
|
{ |
1178
|
|
|
|
|
|
|
#adds in hyponyms' glosses in overlaps |
1179
|
|
|
|
|
|
|
my @senseHypos = @{$wnHypos{$curSense}}; |
1180
|
|
|
|
|
|
|
my @senseHypoGloss = (); |
1181
|
|
|
|
|
|
|
my $tempAllHypoGloss = ""; |
1182
|
|
|
|
|
|
|
|
1183
|
|
|
|
|
|
|
for my $hypo (0..$#senseHypos) |
1184
|
|
|
|
|
|
|
{ |
1185
|
|
|
|
|
|
|
my $tempHypoGloss = $wnGlosses{$hypo}; |
1186
|
|
|
|
|
|
|
|
1187
|
|
|
|
|
|
|
$tempAllHypoGloss = $tempAllHypoGloss . " " . $tempHypoGloss; |
1188
|
|
|
|
|
|
|
} |
1189
|
|
|
|
|
|
|
|
1190
|
|
|
|
|
|
|
@senseHypoGloss = split(' ', $tempAllHypoGloss); |
1191
|
|
|
|
|
|
|
push(@extendedGloss, @senseHypoGloss); |
1192
|
|
|
|
|
|
|
} |
1193
|
|
|
|
|
|
|
|
1194
|
|
|
|
|
|
|
#_________________Syns Gloss_________________________________ |
1195
|
|
|
|
|
|
|
if($useSynsGlosses == 1) |
1196
|
|
|
|
|
|
|
{ |
1197
|
|
|
|
|
|
|
#adds in synsets' glosses in overlaps |
1198
|
|
|
|
|
|
|
my @senseSyns = @{$wnSyns{$curSense}}; |
1199
|
|
|
|
|
|
|
my @senseSynsGloss = (); |
1200
|
|
|
|
|
|
|
my $tempAllSynsGloss = ""; |
1201
|
|
|
|
|
|
|
|
1202
|
|
|
|
|
|
|
for my $syns (0..$#senseSyns) |
1203
|
|
|
|
|
|
|
{ |
1204
|
|
|
|
|
|
|
if(!($syns =~ /\b$word\b/)) #do not repeat sense |
1205
|
|
|
|
|
|
|
{ |
1206
|
|
|
|
|
|
|
my $tempSynsGloss = $wnGlosses{$syns}; |
1207
|
|
|
|
|
|
|
|
1208
|
|
|
|
|
|
|
$tempAllSynsGloss = $tempAllSynsGloss . " " . $tempSynsGloss; |
1209
|
|
|
|
|
|
|
} |
1210
|
|
|
|
|
|
|
} |
1211
|
|
|
|
|
|
|
|
1212
|
|
|
|
|
|
|
@senseSynsGloss = split(' ', $tempAllSynsGloss); |
1213
|
|
|
|
|
|
|
push(@extendedGloss, @senseSynsGloss); |
1214
|
|
|
|
|
|
|
} |
1215
|
|
|
|
|
|
|
|
1216
|
|
|
|
|
|
|
return \@extendedGloss; |
1217
|
|
|
|
|
|
|
} |
1218
|
|
|
|
|
|
|
|
1219
|
|
|
|
|
|
|
=item $obj->toggleRefineSense($toggle) |
1220
|
|
|
|
|
|
|
|
1221
|
|
|
|
|
|
|
Allows user to toggle refineSense() on/off. |
1222
|
|
|
|
|
|
|
|
1223
|
|
|
|
|
|
|
Parameter: 0 or 1 to toggle the refine sense method |
1224
|
|
|
|
|
|
|
on or off respectively in the processLemma method. |
1225
|
|
|
|
|
|
|
|
1226
|
|
|
|
|
|
|
Returns: nothing |
1227
|
|
|
|
|
|
|
|
1228
|
|
|
|
|
|
|
=cut |
1229
|
|
|
|
|
|
|
|
1230
|
|
|
|
|
|
|
sub toggleRefineSense() |
1231
|
|
|
|
|
|
|
{ |
1232
|
|
|
|
|
|
|
if($_[0] == 0) |
1233
|
|
|
|
|
|
|
{ |
1234
|
|
|
|
|
|
|
$refineSense = 0; |
1235
|
|
|
|
|
|
|
} |
1236
|
|
|
|
|
|
|
else |
1237
|
|
|
|
|
|
|
{ |
1238
|
|
|
|
|
|
|
$refineSense = 1; |
1239
|
|
|
|
|
|
|
} |
1240
|
|
|
|
|
|
|
} |
1241
|
|
|
|
|
|
|
|
1242
|
|
|
|
|
|
|
=item $obj->refineSense(@inLemma, $highSense) |
1243
|
|
|
|
|
|
|
|
1244
|
|
|
|
|
|
|
Refines chosen sense, by determing which |
1245
|
|
|
|
|
|
|
numbered sense should be chosen. |
1246
|
|
|
|
|
|
|
|
1247
|
|
|
|
|
|
|
Parameters: the in lemma in form of |
1248
|
|
|
|
|
|
|
(lemma, part-of-speech, item-id, definition, def source) |
1249
|
|
|
|
|
|
|
and the sense which currently bests matches the inlemma. |
1250
|
|
|
|
|
|
|
|
1251
|
|
|
|
|
|
|
Returns:the new highest scoring sense |
1252
|
|
|
|
|
|
|
|
1253
|
|
|
|
|
|
|
=cut |
1254
|
|
|
|
|
|
|
|
1255
|
|
|
|
|
|
|
sub refineSense() |
1256
|
|
|
|
|
|
|
{ |
1257
|
|
|
|
|
|
|
my $base = 0; |
1258
|
|
|
|
|
|
|
if(scalar @_ == 3)#checks if method entered by object. |
1259
|
|
|
|
|
|
|
{ |
1260
|
|
|
|
|
|
|
$base = 1; |
1261
|
|
|
|
|
|
|
} |
1262
|
|
|
|
|
|
|
|
1263
|
|
|
|
|
|
|
my @inLemma = @{$_[$base]}; |
1264
|
|
|
|
|
|
|
|
1265
|
|
|
|
|
|
|
$base++; |
1266
|
|
|
|
|
|
|
my $highSense = $_[$base]; |
1267
|
|
|
|
|
|
|
my $word = substr($highSense, 0, index($highSense, '#')); #extracts base word. |
1268
|
|
|
|
|
|
|
my $shortSense = substr($inLemma[1], 0, 1); |
1269
|
|
|
|
|
|
|
my $sense = $word . "#" . $shortSense; |
1270
|
|
|
|
|
|
|
my $highSenseScore = 0; |
1271
|
|
|
|
|
|
|
my $rSenseScore = 0; |
1272
|
|
|
|
|
|
|
my $refineHigh = "$sense#1"; #assume first sense. |
1273
|
|
|
|
|
|
|
my $tempLemmaGloss = $inLemma[3]; |
1274
|
|
|
|
|
|
|
|
1275
|
|
|
|
|
|
|
if($cleanUp == 1) |
1276
|
|
|
|
|
|
|
{ |
1277
|
|
|
|
|
|
|
#Clean up the words in the temp lemma gloss. |
1278
|
|
|
|
|
|
|
$tempLemmaGloss =~ s/(\(|\)|\.)//g; |
1279
|
|
|
|
|
|
|
$tempLemmaGloss =~ s/^a-zA-Z//g; |
1280
|
|
|
|
|
|
|
$tempLemmaGloss = lc $tempLemmaGloss; |
1281
|
|
|
|
|
|
|
$tempLemmaGloss =~ s/(^|\s)$stopList(\s|$)/ /g; #remove stop words |
1282
|
|
|
|
|
|
|
} |
1283
|
|
|
|
|
|
|
if($userCleanUp ne "\"\"") |
1284
|
|
|
|
|
|
|
{ |
1285
|
|
|
|
|
|
|
$tempLemmaGloss =~ $userCleanUp; |
1286
|
|
|
|
|
|
|
} |
1287
|
|
|
|
|
|
|
|
1288
|
|
|
|
|
|
|
my @refineLemmaGloss = split(' ', $tempLemmaGloss); |
1289
|
|
|
|
|
|
|
|
1290
|
|
|
|
|
|
|
my $rGlossLength = 0.0; |
1291
|
|
|
|
|
|
|
my $rOverlaps = 0.0; |
1292
|
|
|
|
|
|
|
my @refineSenses = $wn->querySense($sense); #obtains the other senses for the same word. |
1293
|
|
|
|
|
|
|
for my $rSense (0..$#refineSenses) |
1294
|
|
|
|
|
|
|
{ |
1295
|
|
|
|
|
|
|
my $tempSenseGloss = $wnGlosses{$rSense}; |
1296
|
|
|
|
|
|
|
|
1297
|
|
|
|
|
|
|
for my $rLemma (0..$#refineLemmaGloss) |
1298
|
|
|
|
|
|
|
{ |
1299
|
|
|
|
|
|
|
$rGlossLength = $rGlossLength + length $refineLemmaGloss[$rLemma]; |
1300
|
|
|
|
|
|
|
if($refineLemmaGlos[$rLemma] ne $word) |
1301
|
|
|
|
|
|
|
{ |
1302
|
|
|
|
|
|
|
if($tempSenseGloss =~ /$refineLemmaGloss[$rLemma]/) |
1303
|
|
|
|
|
|
|
{ |
1304
|
|
|
|
|
|
|
$rOverlaps = $rOverlaps + length $refineLemmaGloss[$rLemma]; |
1305
|
|
|
|
|
|
|
} |
1306
|
|
|
|
|
|
|
} |
1307
|
|
|
|
|
|
|
|
1308
|
|
|
|
|
|
|
} |
1309
|
|
|
|
|
|
|
|
1310
|
|
|
|
|
|
|
$rSenseScore = $rOverlaps/$rGlossLength; |
1311
|
|
|
|
|
|
|
if($rSenseScore > $highSenseScore) |
1312
|
|
|
|
|
|
|
{ |
1313
|
|
|
|
|
|
|
$highSenseScore = $rSenseScore; |
1314
|
|
|
|
|
|
|
$refineHigh = $rHypo; |
1315
|
|
|
|
|
|
|
} |
1316
|
|
|
|
|
|
|
} |
1317
|
|
|
|
|
|
|
|
1318
|
|
|
|
|
|
|
$highSense = $refineHigh; |
1319
|
|
|
|
|
|
|
|
1320
|
|
|
|
|
|
|
return $highSense; |
1321
|
|
|
|
|
|
|
|
1322
|
|
|
|
|
|
|
} |
1323
|
|
|
|
|
|
|
|
1324
|
|
|
|
|
|
|
|
1325
|
|
|
|
|
|
|
#**************printHelp()********************** |
1326
|
|
|
|
|
|
|
# Prints indepth help guide to screen. |
1327
|
|
|
|
|
|
|
#*********************************************** |
1328
|
|
|
|
|
|
|
sub printHelp() |
1329
|
|
|
|
|
|
|
{ |
1330
|
|
|
|
|
|
|
printUsage(); |
1331
|
|
|
|
|
|
|
print "Takes in lemmas from file and attempts to\n"; |
1332
|
|
|
|
|
|
|
print "insert them into WordNet by first finding\n"; |
1333
|
|
|
|
|
|
|
print "a hypernym, then either a) merging the \n"; |
1334
|
|
|
|
|
|
|
print "lemma with the hypernym or b) attaching \n"; |
1335
|
|
|
|
|
|
|
print "the lemma to the hypernym.\n"; |
1336
|
|
|
|
|
|
|
} |
1337
|
|
|
|
|
|
|
|
1338
|
|
|
|
|
|
|
1; |