line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# WordNet::Similarity::jcn.pm version 2.04 |
2
|
|
|
|
|
|
|
# (Last updated $Id: jcn.pm,v 1.23 2008/03/27 06:21:17 sidz1979 Exp $) |
3
|
|
|
|
|
|
|
# |
4
|
|
|
|
|
|
|
# Semantic Similarity Measure package implementing the measure |
5
|
|
|
|
|
|
|
# described by Jiang and Conrath (1997). |
6
|
|
|
|
|
|
|
# |
7
|
|
|
|
|
|
|
# Copyright (c) 2005, |
8
|
|
|
|
|
|
|
# |
9
|
|
|
|
|
|
|
# Ted Pedersen, University of Minnesota Duluth |
10
|
|
|
|
|
|
|
# tpederse at d.umn.edu |
11
|
|
|
|
|
|
|
# |
12
|
|
|
|
|
|
|
# Siddharth Patwardhan, University of Utah, Salt Lake City |
13
|
|
|
|
|
|
|
# sidd at cs.utah.edu |
14
|
|
|
|
|
|
|
# |
15
|
|
|
|
|
|
|
# Jason Michelizzi, University of Minnesota Duluth |
16
|
|
|
|
|
|
|
# mich0212 at d.umn.edu |
17
|
|
|
|
|
|
|
# |
18
|
|
|
|
|
|
|
# This program is free software; you can redistribute it and/or |
19
|
|
|
|
|
|
|
# modify it under the terms of the GNU General Public License |
20
|
|
|
|
|
|
|
# as published by the Free Software Foundation; either version 2 |
21
|
|
|
|
|
|
|
# of the License, or (at your option) any later version. |
22
|
|
|
|
|
|
|
# |
23
|
|
|
|
|
|
|
# This program is distributed in the hope that it will be useful, |
24
|
|
|
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
25
|
|
|
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
26
|
|
|
|
|
|
|
# GNU General Public License for more details. |
27
|
|
|
|
|
|
|
# |
28
|
|
|
|
|
|
|
# You should have received a copy of the GNU General Public License |
29
|
|
|
|
|
|
|
# along with this program; if not, write to |
30
|
|
|
|
|
|
|
# |
31
|
|
|
|
|
|
|
# The Free Software Foundation, Inc., |
32
|
|
|
|
|
|
|
# 59 Temple Place - Suite 330, |
33
|
|
|
|
|
|
|
# Boston, MA 02111-1307, USA. |
34
|
|
|
|
|
|
|
# |
35
|
|
|
|
|
|
|
# ------------------------------------------------------------------ |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
package WordNet::Similarity::jcn; |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
=head1 NAME |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
WordNet::Similarity::jcn - Perl module for computing semantic relatedness |
42
|
|
|
|
|
|
|
of word senses according to the method described by Jiang and Conrath |
43
|
|
|
|
|
|
|
(1997). |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
=head1 SYNOPSIS |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
use WordNet::Similarity::jcn; |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
use WordNet::QueryData; |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
my $wn = WordNet::QueryData->new(); |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
my $rel = WordNet::Similarity::jcn->new($wn); |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
my $value = $rel->getRelatedness("car#n#1", "bus#n#2"); |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
($error, $errorString) = $rel->getError(); |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
die "$errorString\n" if($error); |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
print "car (sense 1) <-> bus (sense 2) = $value\n"; |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
=head1 DESCRIPTION |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
This module computes the semantic relatedness of word senses according to |
66
|
|
|
|
|
|
|
the method described by Jiang and Conrath (1997). This measure is based on |
67
|
|
|
|
|
|
|
a combination of using edge counts in the WordNet 'is-a' hierarchy and |
68
|
|
|
|
|
|
|
using the information content values of the WordNet concepts, as described |
69
|
|
|
|
|
|
|
in the paper by Jiang and Conrath. Their measure, however, computes values |
70
|
|
|
|
|
|
|
that indicate the semantic distance between words (as opposed to their |
71
|
|
|
|
|
|
|
semantic relatedness). In this implementation of the measure we invert the |
72
|
|
|
|
|
|
|
value so as to obtain a measure of semantic relatedness. Other issues that |
73
|
|
|
|
|
|
|
arise due to this inversion (such as handling of zero values in the |
74
|
|
|
|
|
|
|
denominator) have been taken care of as special cases. |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
=over |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
=cut |
79
|
|
|
|
|
|
|
|
80
|
4
|
|
|
4
|
|
6222
|
use strict; |
|
4
|
|
|
|
|
8
|
|
|
4
|
|
|
|
|
90
|
|
81
|
4
|
|
|
4
|
|
17
|
use warnings; |
|
4
|
|
|
|
|
8
|
|
|
4
|
|
|
|
|
94
|
|
82
|
|
|
|
|
|
|
|
83
|
4
|
|
|
4
|
|
17
|
use Exporter; |
|
4
|
|
|
|
|
7
|
|
|
4
|
|
|
|
|
126
|
|
84
|
4
|
|
|
4
|
|
1702
|
use WordNet::Similarity::ICFinder; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
# Package-level configuration: the parent class, the (empty) export
# lists, and the module version.
our @ISA         = ('WordNet::Similarity::ICFinder');
our %EXPORT_TAGS = ();
our @EXPORT_OK   = ();
our @EXPORT      = ();

our $VERSION = '2.04';
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
# the 'new' method is supplied by WordNet::Similarity |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
=item $jcn->getRelatedness ($synset1, $synset2) |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
Computes the relatedness of two word senses using an information content |
103
|
|
|
|
|
|
|
scheme. See the discussion section below for detailed information on how |
104
|
|
|
|
|
|
|
the jcn measure calculates relatedness. |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
Parameters: two word senses in "word#pos#sense" format. |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
Returns: Unless a problem occurs, the return value is the relatedness |
109
|
|
|
|
|
|
|
score. If no path exists between the two word senses, then a large |
110
|
|
|
|
|
|
|
negative number is returned. If an error occurs, then the error level |
111
|
|
|
|
|
|
|
is set to non-zero and an error string is created (see the description |
112
|
|
|
|
|
|
|
of getError()). Note: the error level will also be set to 1 and an |
113
|
|
|
|
|
|
|
error string will be created if no path exists between the words. |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
=cut |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
# Computes the Jiang-Conrath relatedness of two word senses as the
# inverse of the jcn distance  IC(c1) + IC(c2) - 2 * IC(lcs).
#
# Parameters: $wps1, $wps2 - the two word senses ("word#pos#sense").
#
# Returns:
#   undef             - no WordNet::QueryData object in $self->{wn}
#                       (error level is set to 2);
#   parseWps() result - passed straight through when it is not a reference;
#   a cached score    - when caching is on and the cache holds a defined value;
#   $self->UNRELATED  - when getLCSbyIC() yields no subsumer;
#   0                 - when the root frequency is zero (a warning is
#                       recorded), when either concept has zero information
#                       content, or when the distance is zero but the root
#                       frequency is not greater than 0.01;
#   1 / distance      - otherwise; a zero distance is replaced by the
#                       finite stand-in for infinity described below.
sub getRelatedness
{
  my $self = shift;
  my $wps1 = shift;
  my $wps2 = shift;
  my $wn = $self->{wn};
  my $class = ref $self || $self;

  # Check the existence of the WordNet::QueryData object.
  unless ($wn) {
    $self->{errorString} .= "\nError (${class}::getRelatedness()) - ";
    $self->{errorString} .= "A WordNet::QueryData object is required.";
    $self->{error} = 2;
    return undef;
  }

  # Initialize traces.
  $self->{traceString} = "";

  # Input validation is delegated to parseWps().  A non-reference result
  # is handed back to the caller unchanged.
  my $ret = $self->parseWps ($wps1, $wps2);
  ref $ret or return $ret;

  # Only the part of speech of the first sense and the two offsets are
  # needed; the remaining fields of the parsed result are skipped.
  my (undef, $pos, undef, $offset1, undef, undef, undef, $offset2) = @{$ret};

  # If the value for this pair is already in the cache, return it.
  if ($self->{doCache}) {
    my $cached = $self->fetchFromCache ($wps1, $wps2);
    defined $cached and return $cached;
  }

  # Now get down to really finding the relatedness of these two.
  my $mode = 'offset';

  # Only the first entry returned by getLCSbyIC() is used.  It will not
  # be a reference if no LCS was found.
  my ($best) = $self->getLCSbyIC ($offset1, $offset2, $pos, $mode);
  unless (ref $best) {
    return $self->UNRELATED;
  }
  my ($lcs) = @{$best};

  # Check for the rare possibility of the root node having a zero
  # frequency count: return 0 relatedness, with a warning.
  my $rootFreq = $self->getFrequency (0, $pos, $mode);
  unless ($rootFreq) {
    $self->{errorString} .= "\nWarning (${class}::getRelatedness()) - ";
    $self->{errorString} .= "Root node has a zero frequency count.";
    # Raise the error level to at least 1.  Written so that an error
    # level that was never set does not trigger an "uninitialized" warning.
    $self->{error} = 1 unless $self->{error} && $self->{error} >= 1;
    return 0;
  }

  my $ic1 = $self->IC($offset1, $pos);
  my $ic2 = $self->IC($offset2, $pos);

  if ($self->{trace}) {
    # NOTE(review): the return value of printSet() is discarded here;
    # presumably it appends to the trace string itself -- confirm.
    $self->{traceString} .= "Concept1: ";
    $self->printSet ($pos, $mode, $offset1);
    $self->{traceString} .= " (IC=";
    $self->{traceString} .= sprintf ("%.6f", $ic1);
    $self->{traceString} .= ")\n";
    $self->{traceString} .= "Concept2: ";
    $self->printSet ($pos, $mode, $offset2);
    $self->{traceString} .= " (IC=";
    $self->{traceString} .= sprintf ("%.6f", $ic2);
    $self->{traceString} .= ")\n";
  }

  # If either of the two concepts has a zero information content...
  # return 0, for lack of data.
  unless ($ic1 && $ic2) {
    return 0;
  }

  my $distance = $ic1 + $ic2 - 2 * $self->IC($lcs, $pos);

  # A distance of 0 implies ic1 == ic2 == ic3 (most probably all three
  # are the same concept), i.e. maximum relatedness, i.e. 1/0.  Instead
  # of infinity we return the largest finite value we are prepared to
  # produce: the score of a hypothetical concept sitting just below the
  # root (ic2 == ic3 == 0) whose frequency count is 0.01 less than the
  # root's:
  #
  #   sim = 1 / ic1 = 1 / (-log((freq(root) - 0.01) / freq(root)))
  my $score;
  if ($distance == 0) {
    unless ($rootFreq > 0.01) {
      # The root frequency is non-zero here (zero was handled above) but
      # too small for the formula: the argument of log would not be
      # positive.
      return 0;
    }
    $score = 1 / -log (($rootFreq - 0.01) / $rootFreq);
  }
  else {
    $score = 1 / $distance;
  }

  $self->{doCache} and $self->storeToCache ($wps1, $wps2, $score);
  return $score;
}
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
# JM 1-16-04 |
250
|
|
|
|
|
|
|
# moved subroutine _getLeastCommonSubsumers to Infocontent.pm |
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
1; |
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
__END__ |