File Coverage

blib/lib/WordNet/Similarity/jcn.pm
Criterion Covered Total %
statement 10 12 83.3
branch n/a
condition n/a
subroutine 4 4 100.0
pod n/a
total 14 16 87.5


line stmt bran cond sub pod time code
1             # WordNet::Similarity::jcn.pm version 2.04
2             # (Last updated $Id: jcn.pm,v 1.23 2008/03/27 06:21:17 sidz1979 Exp $)
3             #
4             # Semantic Similarity Measure package implementing the measure
5             # described by Jiang and Conrath (1997).
6             #
7             # Copyright (c) 2005,
8             #
9             # Ted Pedersen, University of Minnesota Duluth
10             # tpederse at d.umn.edu
11             #
12             # Siddharth Patwardhan, University of Utah, Salt Lake City
13             # sidd at cs.utah.edu
14             #
15             # Jason Michelizzi, University of Minnesota Duluth
16             # mich0212 at d.umn.edu
17             #
18             # This program is free software; you can redistribute it and/or
19             # modify it under the terms of the GNU General Public License
20             # as published by the Free Software Foundation; either version 2
21             # of the License, or (at your option) any later version.
22             #
23             # This program is distributed in the hope that it will be useful,
24             # but WITHOUT ANY WARRANTY; without even the implied warranty of
25             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26             # GNU General Public License for more details.
27             #
28             # You should have received a copy of the GNU General Public License
29             # along with this program; if not, write to
30             #
31             # The Free Software Foundation, Inc.,
32             # 59 Temple Place - Suite 330,
33             # Boston, MA 02111-1307, USA.
34             #
35             # ------------------------------------------------------------------
36              
37             package WordNet::Similarity::jcn;
38              
39             =head1 NAME
40              
41             WordNet::Similarity::jcn - Perl module for computing semantic relatedness
42             of word senses according to the method described by Jiang and Conrath
43             (1997).
44              
45             =head1 SYNOPSIS
46              
47             use WordNet::Similarity::jcn;
48              
49             use WordNet::QueryData;
50              
51             my $wn = WordNet::QueryData->new();
52              
53             my $rel = WordNet::Similarity::jcn->new($wn);
54              
55             my $value = $rel->getRelatedness("car#n#1", "bus#n#2");
56              
57             ($error, $errorString) = $rel->getError();
58              
59             die "$errorString\n" if($error);
60              
61             print "car (sense 1) <-> bus (sense 2) = $value\n";
62              
63             =head1 DESCRIPTION
64              
65             This module computes the semantic relatedness of word senses according to
66             the method described by Jiang and Conrath (1997). This measure is based on
67             a combination of using edge counts in the WordNet 'is-a' hierarchy and
68             using the information content values of the WordNet concepts, as described
69             in the paper by Jiang and Conrath. Their measure, however, computes values
70             that indicate the semantic distance between words (as opposed to their
71             semantic relatedness). In this implementation of the measure we invert the
72             value so as to obtain a measure of semantic relatedness. Other issues that
73             arise due to this inversion (such as handling of zero values in the
74             denominator) have been taken care of as special cases.
75              
76             =over
77              
78             =cut
79              
80 4     4   8890 use strict;
  4         9  
  4         182  
81 4     4   20 use warnings;
  4         15  
  4         123  
82              
83 4     4   18 use Exporter;
  4         9  
  4         761  
84 4     4   3006 use WordNet::Similarity::ICFinder;
  0            
  0            
85              
86             our (@ISA, @EXPORT, @EXPORT_OK, %EXPORT_TAGS);
87              
88             @ISA = qw(WordNet::Similarity::ICFinder);
89              
90             %EXPORT_TAGS = ();
91              
92             @EXPORT_OK = ();
93              
94             @EXPORT = ();
95              
96             our $VERSION = '2.04';
97              
98             # the 'new' method is supplied by WordNet::Similarity
99              
100             =item $jcn->getRelatedness ($synset1, $synset2)
101              
102             Computes the relatedness of two word senses using an information content
103             scheme. See the discussion section below for detailed information on how
104             the jcn measure calculates relatedness.
105              
106             Parameters: two word senses in "word#pos#sense" format.
107              
108             Returns: Unless a problem occurs, the return value is the relatedness
109             score. If no path exists between the two word senses, then a large
110             negative number is returned. If an error occurs, then the error level
111             is set to non-zero and an error string is created (see the description
112             of getError()). Note: the error level will also be set to 1 and an
113             error string will be created if no path exists between the words.
114              
115             =cut
116              
117             sub getRelatedness
118             {
119             my $self = shift;
120             my $wps1 = shift;
121             my $wps2 = shift;
122             my $wn = $self->{wn};
123             my $class = ref $self || $self;
124              
125             # Check the existence of the WordNet::QueryData object.
126             unless ($wn) {
127             $self->{errorString} .= "\nError (${class}::getRelatedness()) - ";
128             $self->{errorString} .= "A WordNet::QueryData object is required.";
129             $self->{error} = 2;
130             return undef;
131             }
132              
133             # Initialize traces.
134             $self->{traceString} = "";
135              
136             # JM 1-21-04
137             # moved input validation code to parseWps() in a super-class
138             my $ret = $self->parseWps ($wps1, $wps2);
139             ref $ret or return $ret;
140             my ($word1, $pos1, undef, $offset1, $word2, $pos2, undef, $offset2) = @{$ret};
141              
142             my $pos = $pos1;
143              
144             # Now check if the similarity value for these two synsets is in
145             # fact in the cache... if so return the cached value.
146             my $relatedness =
147             $self->{doCache} ? $self->fetchFromCache ($wps1, $wps2) : undef;
148             defined $relatedness and return $relatedness;
149              
150             # Now get down to really finding the relatedness of these two.
151             my $mode = 'offset';
152             my @LCSs = $self->getLCSbyIC ($offset1, $offset2, $pos, 'offset');
153              
154             my $ref = shift @LCSs;
155             # check if $ref is a reference; if not, return the UNRELATED value
156             # $ref will not be a reference if no LCS was found
157             unless (ref $ref) {
158             return $self->UNRELATED;
159             }
160              
161             my ($lcs, $lcsic) = @{$ref};
162             my $lcsfreq = $self->getFrequency ($lcs, $pos, 'offset');
163              
164             # Check for the rare possibility of the root node having 0
165             # frequency count...
166             # If normal (i.e. freqCount(root) > 0)... Set the minimum distance to the
167             # greatest distance possible + 1... (my replacement for infinity)...
168             # If zero root frequency count... return 0 relatedness, with a warning...
169              
170             my $maxScore;
171             my $rootFreq = $self->getFrequency (0, $pos, 'offset');
172             if($rootFreq) {
173             # $minDist = (2*(-log(0.001/($self->{offsetFreq}->{$pos}->{0})))) + 1;
174             $maxScore = 2 * -log (0.001 / $rootFreq) + 1;
175             }
176             else {
177             $self->{errorString} .= "\nWarning (${class}::getRelatedness()) - ";
178             $self->{errorString} .= "Root node has a zero frequency count.";
179             $self->{error} = ($self->{error} < 1) ? 1 : $self->{error};
180             return 0;
181             }
182              
183             # Foreach lowest common subsumer...
184             # Find the minimum jcn distance between the two subsuming concepts...
185             # Making sure that neither of the 2 concepts have 0 infocontent
186             my $ic1 = $self->IC($offset1, $pos);
187             my $ic2 = $self->IC($offset2, $pos);
188             if ($self->{trace}) {
189             $self->{traceString} .= "Concept1: ";
190             $self->printSet ($pos, $mode, $offset1);
191             $self->{traceString} .= " (IC=";
192             $self->{traceString} .= sprintf ("%.6f", $ic1);
193             $self->{traceString} .= ")\n";
194             $self->{traceString} .= "Concept2: ";
195             $self->printSet ($pos, $mode, $offset2);
196             $self->{traceString} .= " (IC=";
197             $self->{traceString} .= sprintf ("%.6f", $ic2);
198             $self->{traceString} .= ")\n";
199             }
200              
201             my $distance;
202              
203             # If either of the two concepts have a zero information content...
204             # return 0, for lack of data...
205             if($ic1 && $ic2) {
206             my $ic3 = $self->IC($lcs, $pos);
207              
208             $distance = $ic1 + $ic2 - (2 * $ic3);
209             }
210             else {
211             return 0;
212             }
213              
214             # Now if distance turns out to be 0...
215             # implies ic1 == ic2 == ic3 (most probably all three represent
216             # the same concept)... i.e. maximum relatedness... i.e. infinity...
217             # We'll return the maximum possible value ("Our infinity").
218             # Here's how we got our infinity...
219             # distance = ic1 + ic2 - (2 x ic3)
220             # Largest possible value for (1/distance) is infinity, when distance = 0.
221             # That won't work for us... What's the next value on the list...
222             # the smallest value of distance greater than 0...
223             # Consider the formula again... distance = ic1 + ic2 - (2 x ic3)
224             # We want the value of distance when ic1 or ic2 have information content
225             # slightly more than that of the root (ic3)... (let ic2 == ic3 == 0)
226             # Assume frequency counts of 0.01 less than the frequency count of the
227             # root for computing ic1...
228             # sim = 1/ic1
229             # sim = 1/(-log((freq(root) - 0.01)/freq(root)))
230              
231             my $score;
232              
233             if ($distance == 0) {
234             if ($rootFreq > 0.01) {
235             $score = 1 / -log (($rootFreq - 0.01) / $rootFreq);
236             }
237             else {
238             # root frequency is too small (<= 0.01) to compute "our infinity"
239             return 0;
240             }
241             }
242             else { # distance is non-zero
243             $score = 1 / $distance
244             }
245             $self->{doCache} and $self->storeToCache ($wps1, $wps2, $score);
246             return $score;
247             }
248              
249             # JM 1-16-04
250             # moved subroutine _getLeastCommonSubsumers to Infocontent.pm
251              
252             1;
253              
254             __END__