File Coverage

blib/lib/WordNet/Similarity/vector_pairs.pm

Criterion	Covered	Total	%
statement	7	9	77.7
branch			n/a
condition			n/a
subroutine	3	3	100.0
pod			n/a
total	10	12	83.3

line	stmt	sub	time	code
1				# WordNet::Similarity::vector_pairs.pm version 2.04
2				# (Last updated $Id: vector_pairs.pm,v 1.11 2008/03/27 06:21:17 sidz1979 Exp $)
3				#
4				# Module to accept two WordNet synsets and to return a floating point
5				# number that indicates how similar those two synsets are, using a
6				# gloss vector overlap measure based on "context vectors" described by
7				# Schütze (1998).
8				#
9				# Copyright (c) 2005,
10				#
11				# Ted Pedersen, University of Minnesota Duluth
12				# tpederse at d.umn.edu
13				#
14				# Siddharth Patwardhan, University of Utah, Salt Lake City
15				# sidd at cs.utah.edu
16				#
17				# Satanjeev Banerjee, Carnegie Mellon University, Pittsburgh
18				# banerjee+ at cs.cmu.edu
19				#
20				# This program is free software; you can redistribute it and/or
21				# modify it under the terms of the GNU General Public License
22				# as published by the Free Software Foundation; either version 2
23				# of the License, or (at your option) any later version.
24				#
25				# This program is distributed in the hope that it will be useful,
26				# but WITHOUT ANY WARRANTY; without even the implied warranty of
27				# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28				# GNU General Public License for more details.
29				#
30				# You should have received a copy of the GNU General Public License
31				# along with this program; if not, write to
32				#
33				# The Free Software Foundation, Inc.,
34				# 59 Temple Place - Suite 330,
35				# Boston, MA 02111-1307, USA.
36				#
37				# ------------------------------------------------------------------
38
39				package WordNet::Similarity::vector_pairs;
40
41				=head1 NAME
42
43				WordNet::Similarity::vector_pairs - module for computing semantic relatedness
44				of word senses using second order co-occurrence vectors of glosses of the word
45				senses.
46
47				=head1 SYNOPSIS
48
49				use WordNet::Similarity::vector_pairs;
50
51				use WordNet::QueryData;
52
53				my $wn = WordNet::QueryData->new();
54
55				my $vector_pairs = WordNet::Similarity::vector_pairs->new($wn);
56
57				my $value = $vector_pairs->getRelatedness("car#n#1", "bus#n#2");
58
59				($error, $errorString) = $vector_pairs->getError();
60
61				die "$errorString\n" if($error);
62
63				print "car (sense 1) <-> bus (sense 2) = $value\n";
64
65				=head1 DESCRIPTION
66
67				SchE<uuml>tze (1998) creates what he calls context vectors (second order
68				co-occurrence vectors) of pieces of text for the purpose of Word Sense
69				Discrimination. This idea is adopted by Patwardhan and Pedersen to represent
70				the word senses by second-order co-occurrence vectors of their dictionary
71				(WordNet) definitions. The relatedness of two senses is then computed as
72				the cosine of their representative gloss vectors.
73
74				A concept is represented by its own gloss, as well as the glosses of the
75				neighboring senses as specified in the vector-relation.dat file. Each
76				gloss is converted into a second order vector by replacing the words in
77				the gloss with co-occurrence vectors for those words. The overall measure
78				of relatedness between two concepts is determined by taking the pairwise
79				cosines between these expanded glosses. If vector-relation.dat consists
80				of:
81
82				example-example
83				glos-glos
84				hypo-hypo
85
86				then three pairwise cosine measurements are made to determine the
87				relatedness of concepts A and B. The examples found in the glosses
88				of A and B are expanded and measured, then the glosses themselves are
89				expanded and measured, and then the hyponyms of A and B are expanded
90				and measured. Then, the values of these three pairwise measures are summed
91				to create the overall relatedness score.
92
93				=over
94
95				=cut
96
97	1	1	3670	use strict;
	1		3
	1		43
98	1	1	7	use WordNet::vectorFile;
	1		3
	1		45
99	1	1	1195	use WordNet::Similarity::GlossFinder;
	0
	0
100				use File::Spec;
101				use vars qw($VERSION @ISA);
102
103				@ISA = qw(WordNet::Similarity::GlossFinder);
104				$VERSION = '2.04';
105
106				WordNet::Similarity::addConfigOption("vectordb", 0, "p", undef);
107
108				=item $measure->initialize($file)
109
110				Overrides the initialize method in the parent class (GlossFinder.pm). This method
111				essentially initializes the measure for use.
112
113				Parameters: $file -- configuration file.
114
115				Returns: none.
116
117				=cut
118
119				# Initialization of the WordNet::Similarity::vector_pairs object... parses the config file and sets up
120				# global variables, or sets them to default values.
121				# INPUT PARAMS : $paramFile .. File containing the module specific params.
122				# RETURN VALUES : (none)
123				sub initialize
124				{
125				my $self = shift;
126				my $vectorDB;
127				my $documentCount;
128				my $wn = $self->{wn};
129				my $readDims;
130				my $readVectors;
131
132				# Look for the default vector relation file...
133				if(!defined $self->{relationDefault})
134				{
135				my $path;
136				my $header;
137				my @possiblePaths = ();
138
139				# Look for all possible default data files installed.
140				foreach $path (@INC)
141				{
142				# JM 1-16-04 -- modified to use File::Spec
143				my $file = File::Spec->catfile($path, 'WordNet', 'vector-pairs-relation.dat');
144				push @possiblePaths, $file if(-e $file);
145				}
146
147				# If there are multiple possibilities, get the one in the correct format.
148				foreach $path (@possiblePaths)
149				{
150				next if(!open(RELATIONS, $path));
151				$header = <RELATIONS>;
152				$header =~ s/\s+//g;
153				if($header =~ /RelationFile/)
154				{
155				$self->{relationDefault} = $path;
156				close(RELATIONS);
157				last;
158				}
159				close(RELATIONS);
160				}
161				}
162
163				# Call the initialize method of the super-class.
164				$self->SUPER::initialize(@_);
165
166				# Initialize the vector cache.
167				$self->{vCache} = ();
168				$self->{vCacheQ} = ();
169				$self->{vCacheSize} = 80;
170
171				# Initialize the word vector database interface...
172				if(!defined $self->{vectordb} || $self->{vectordb} eq "")
173				{
174				my $path;
175				my $header;
176				my @possiblePaths = ();
177				$vectorDB = "";
178
179				# Look for all possible default data files installed.
180				foreach $path (@INC)
181				{
182				# JM 1-16-04 -- modified to use File::Spec
183				my $file = File::Spec->catfile($path, 'WordNet', 'wordvectors.dat');
184				push @possiblePaths, $file if(-e $file);
185				}
186
187				# If there are multiple possibilities, get the one in the correct format.
188				foreach $path (@possiblePaths)
189				{
190				next if(!open(VECTORS, $path));
191				$header = <VECTORS>;
192				$header =~ s/\s+//g;
193				if($header =~ /DOCUMENTCOUNT/)
194				{
195				$vectorDB = $path;
196				close(VECTORS);
197				last;
198				}
199				close(VECTORS);
200				}
201				}
202				else
203				{
204				$vectorDB = $self->{vectordb};
205				}
206
207				# If database still not specified...
208				if(!defined $vectorDB || $vectorDB eq "")
209				{
210				$self->{errorString} .= "\nError (WordNet::Similarity::vector_pairs->initialize()) - ";
211				$self->{errorString} .= "Word Vector database file not specified. Use configuration file.";
212				$self->{error} = 2;
213				return;
214				}
215
216				# Get the documentCount, dimensions and vectors...
217				($documentCount, $readDims, $readVectors) = WordNet::vectorFile->readVectors($vectorDB);
218				if(!defined $documentCount || !defined $readDims || !defined $readVectors)
219				{
220				$self->{errorString} .= "\nError (WordNet::Similarity::vector_pairs->initialize()) - ";
221				$self->{errorString} .= "Error reading the vector database file.";
222				$self->{error} = 2;
223				return;
224				}
225
226				# Load the word vector dimensions...
227				my $key;
228				$self->{numberOfDimensions} = scalar(keys(%{$readDims}));
229				foreach $key (keys %{$readDims})
230				{
231				my $ans = $readDims->{$key};
232				my @prts = split(/\s+/, $ans);
233				$self->{wordIndex}->{$key} = $prts[0];
234				$self->{indexWord}->[$prts[0]] = $key;
235				}
236
237				# Set up the interface to the word vectors...
238				foreach $key (keys %{$readVectors})
239				{
240				my $vec = $readVectors->{$key};
241				if(defined $vec)
242				{
243				$self->{table}->{$key} = $vec;
244				}
245				}
246				}
247
248				=item $measure->traceOptions()
249
250				This method is internally called to determine the extra options
251				specified by this measure (apart from the default options specified
252				in the WordNet::Similarity base class).
253
254				Parameters: none.
255
256				Returns: none.
257
258				=cut
259
260				# show all config options specific to this module
261				sub traceOptions
262				{
263				my $self = shift;
264				$self->{traceString} .= "vectorDB File :: ".((defined $self->{vectordb})?"$self->{vectordb}":"")."\n";
265				$self->SUPER::traceOptions();
266				}
267
268				=item $vector_pairs->getRelatedness
269
270				Computes the relatedness of two word senses using the Vector Algorithm.
271
272				Parameters: two word senses in "word#pos#sense" format.
273
274				Returns: Unless a problem occurs, the return value is the relatedness
275				score, which is greater-than or equal-to 0. If an error occurs,
276				then the error level is set to non-zero and an error
277				string is created (see the description of getError()).
278
279				=cut
280
281				sub getRelatedness
282				{
283				my $self = shift;
284				my $wps1 = shift;
285				my $wps2 = shift;
286				my $wn = $self->{wn};
287				my $wntools = $self->{wntools};
288				my $class = ref $self || $self;
289
290				# Check the existence of the WordNet::QueryData object.
291				unless($wn)
292				{
293				$self->{errorString} .= "\nError (${class}::getRelatedness()) - ";
294				$self->{errorString} .= "A WordNet::QueryData object is required.";
295				$self->{error} = 2;
296				return undef;
297				}
298
299				# Check the existence of the WordNet::Tools object.
300				unless($wntools)
301				{
302				$self->{errorString} .= "\nError (${class}::getRelatedness()) - ";
303				$self->{errorString} .= "A WordNet::Tools object is required.";
304				$self->{error} = 2;
305				return undef;
306				}
307
308				# Using validation code from parseWps() in a super-class
309				my $ret = $self->parseWps($wps1, $wps2);
310				ref $ret or return undef;
311
312				# Initialize traces.
313				$self->{traceString} = "";
314
315				# Now check if the similarity value for these two synsets is in
316				# fact in the cache... if so return the cached value.
317				my $relatedness =
318				$self->{doCache} ? $self->fetchFromCache ($wps1, $wps2) : undef;
319				defined $relatedness and return $relatedness;
320
321				# Now get down to really finding the relatedness of these two.
322				# see if any traces reqd. if so, put in the synset arrays.
323				if($self->{trace})
324				{
325				# ah so we do need SOME traces! put in the synset names.
326				$self->{traceString} = "Synset 1: $wps1\n";
327				$self->{traceString} .= "Synset 2: $wps2\n";
328				}
329
330				# initialize the score
331				my $score = 0;
332				my $i = 0;
333
334				# Get the gloss strings from the get_wn_info module
335				my ($firstStringArray, $secondStringArray, $weightsArray, $functionsStringArray) = $self->getSuperGlosses($wps1, $wps2);
336				for($i = 0; $i < scalar(@{$weightsArray}); $i++)
337				{
338				my $functionsScore = 0;
339				my $funcStringPrinted = 0;
340				my $firstString = $firstStringArray->[$i];
341				my $secondString = $secondStringArray->[$i];
342				my $weight = $weightsArray->[$i];
343				my $functionsString = $functionsStringArray->[$i];
344
345				# so those are the two strings for this relation pair. Get the vectors
346				# Preprocess...
347				$firstString =~ s/\'//g;
348				$firstString =~ s/[^a-z0-9]+/ /g;
349				$firstString =~ s/^\s+//;
350				$firstString =~ s/\s+$//;
351				$firstString = $wntools->compoundify($firstString);
352				$secondString =~ s/\'//g;
353				$secondString =~ s/[^a-z0-9]+/ /g;
354				$secondString =~ s/^\s+//;
355				$secondString =~ s/\s+$//;
356				$secondString = $wntools->compoundify($secondString);
357
358				# Get vectors... score...
359				my $a;
360				my $maga;
361				my $sizea;
362				my $b;
363				my $magb;
364				my $sizeb;
365				my $trr1;
366				my $trr2;
367
368				# see if any traces reqd. if so, put in the synset arrays.
369				($a, $trr1, $maga) = $self->_getVector($firstString);
370				&_norm($a, $maga);
371
372				($b, $trr2, $magb) = $self->_getVector($secondString);
373				&_norm($b, $magb);
374
375				$functionsScore = &_inner($a, $b);
376				$score += $functionsScore;
377
378				# check if the two strings need to be reported in the trace.
379				if($self->{trace})
380				{
381				if(!$funcStringPrinted)
382				{
383				$self->{traceString} .= "$functionsString: $functionsScore\n";
384				$self->{traceString} .= "\nString: \"$firstString\"\n$trr1\n";
385				$self->{traceString} .= "\nString: \"$secondString\"\n$trr2\n";
386				$funcStringPrinted = 1;
387				}
388				}
389				}
390
391				# Average the score...
392				$score /= $i if($i > 0);
393
394				# that does all the scoring. Put in cache if doing cacheing. Then
395				# return the score.
396				$self->{doCache} and $self->storeToCache($wps1, $wps2, $score);
397
398				return $score;
399				}
400
401				# Method to compute a context vector from a given body of text...
402				sub _getVector
403				{
404				my $self = shift;
405				my $text = shift;
406				my $ret = {};
407				return $ret if(!defined $text);
408				my @words = split(/\s+/, $text);
409				my $word;
410				my %types;
411				my $fstFlag = 1;
412				my $localTraces = "";
413				my $kk;
414				my $mag;
415
416				# [trace]
417				if($self->{trace})
418				{
419				$localTraces .= "Word Vectors for: ";
420				}
421				# [/trace]
422
423				foreach $word (@words)
424				{
425				$types{$word} = 1 if($word !~ /[XGES]{3}\d{5}[XGES]{3}/);
426				}
427				foreach $word (keys %types)
428				{
429				if(defined $self->{table}->{$word} && !defined $self->{stopHash}->{$word})
430				{
431				my %pieces = split(/\s+/, $self->{table}->{$word});
432
433				# [trace]
434				if($self->{trace})
435				{
436				$localTraces .= ", " if(!$fstFlag);
437				$localTraces .= "$word";
438				$fstFlag = 0;
439				}
440				# [/trace]
441
442				foreach $kk (keys %pieces)
443				{
444				$ret->{$kk} = ((defined $ret->{$kk})?($ret->{$kk}):0) + $pieces{$kk};
445				}
446				}
447				}
448
449				$mag = 0;
450				foreach $kk (keys %{$ret})
451				{
452				$mag += ($ret->{$kk} * $ret->{$kk});
453				}
454
455				return ($ret, $localTraces, sqrt($mag));
456				}
457
458				# Normalizes the sparse vector.
459				sub _norm
460				{
461				my $vec = shift;
462				my $mag = shift;
463
464				if(defined $vec && defined $mag && $mag != 0)
465				{
466				my $key;
467				foreach $key (keys %{$vec})
468				{
469				$vec->{$key} /= $mag;
470				}
471				}
472				}
473
474				# Inner product of two sparse vectors.
475				sub _inner
476				{
477				my $vec1 = shift;
478				my $vec2 = shift;
479				my ($size1, $size2);
480				my $prod = 0;
481
482				return 0 if(!defined $vec1 || !defined $vec2);
483
484				$size1 = scalar(keys(%{$vec1}));
485				$size2 = scalar(keys(%{$vec2}));
486
487				if(defined $size1 && defined $size2 && $size1 < $size2)
488				{
489				my $key;
490				foreach $key (keys %{$vec1})
491				{
492				$prod += ($vec1->{$key} * $vec2->{$key}) if(defined $vec2->{$key});
493				}
494				}
495				else
496				{
497				my $key;
498				foreach $key (keys %{$vec2})
499				{
500				$prod += ($vec1->{$key} * $vec2->{$key}) if(defined $vec1->{$key});
501				}
502				}
503
504				return $prod;
505				}
506
507				1;
508
509				__END__