File Coverage

blib/lib/WordNet/Similarity/vector_pairs.pm
Criterion Covered Total %
statement 7 9 77.7
branch n/a
condition n/a
subroutine 3 3 100.0
pod n/a
total 10 12 83.3


line stmt bran cond sub pod time code
1             # WordNet::Similarity::vector_pairs.pm version 2.04
2             # (Last updated $Id: vector_pairs.pm,v 1.11 2008/03/27 06:21:17 sidz1979 Exp $)
3             #
4             # Module to accept two WordNet synsets and to return a floating point
5             # number that indicates how similar those two synsets are, using a
6             # gloss vector overlap measure based on "context vectors" described by
7             # Schütze (1998).
8             #
9             # Copyright (c) 2005,
10             #
11             # Ted Pedersen, University of Minnesota Duluth
12             # tpederse at d.umn.edu
13             #
14             # Siddharth Patwardhan, University of Utah, Salt Lake City
15             # sidd at cs.utah.edu
16             #
17             # Satanjeev Banerjee, Carnegie Mellon University, Pittsburgh
18             # banerjee+ at cs.cmu.edu
19             #
20             # This program is free software; you can redistribute it and/or
21             # modify it under the terms of the GNU General Public License
22             # as published by the Free Software Foundation; either version 2
23             # of the License, or (at your option) any later version.
24             #
25             # This program is distributed in the hope that it will be useful,
26             # but WITHOUT ANY WARRANTY; without even the implied warranty of
27             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28             # GNU General Public License for more details.
29             #
30             # You should have received a copy of the GNU General Public License
31             # along with this program; if not, write to
32             #
33             # The Free Software Foundation, Inc.,
34             # 59 Temple Place - Suite 330,
35             # Boston, MA 02111-1307, USA.
36             #
37             # ------------------------------------------------------------------
38              
39             package WordNet::Similarity::vector_pairs;
40              
41             =head1 NAME
42              
43             WordNet::Similarity::vector_pairs - module for computing semantic relatedness
44             of word senses using second order co-occurrence vectors of glosses of the word
45             senses.
46              
47             =head1 SYNOPSIS
48              
49             use WordNet::Similarity::vector_pairs;
50              
51             use WordNet::QueryData;
52              
53             my $wn = WordNet::QueryData->new();
54              
55             my $vector_pairs = WordNet::Similarity::vector_pairs->new($wn);
56              
57             my $value = $vector_pairs->getRelatedness("car#n#1", "bus#n#2");
58              
59             ($error, $errorString) = $vector_pairs->getError();
60              
61             die "$errorString\n" if($error);
62              
63             print "car (sense 1) <-> bus (sense 2) = $value\n";
64              
65             =head1 DESCRIPTION
66              
67             Schütze (1998) creates what he calls context vectors (second order
68             co-occurrence vectors) of pieces of text for the purpose of Word Sense
69             Discrimination. This idea is adopted by Patwardhan and Pedersen to represent
70             the word senses by second-order co-occurrence vectors of their dictionary
71             (WordNet) definitions. The relatedness of two senses is then computed as
72             the cosine of their representative gloss vectors.
73              
74             A concept is represented by its own gloss, as well as the glosses of the
75             neighboring senses as specified in the vector-pairs-relation.dat file. Each
76             gloss is converted into a second order vector by replacing the words in
77             the gloss with co-occurrence vectors for those words. The overall measure
78             of relatedness between two concepts is determined by taking the pairwise
79             cosines between these expanded glosses. If vector-pairs-relation.dat consists
80             of:
81              
82             example-example
83             glos-glos
84             hypo-hypo
85              
86             then three pairwise cosine measurements are made to determine the
87             relatedness of concepts A and B. The examples found in the glosses
88             of A and B are expanded and measured, then the glosses themselves are
89             expanded and measured, and then the hyponyms of A and B are expanded
90             and measured. Then, the values of these three pairwise measures are averaged
91             to create the overall relatedness score.
92              
93             =over
94              
95             =cut
96              
97 1     1   3670 use strict;
  1         3  
  1         43  
98 1     1   7 use WordNet::vectorFile;
  1         3  
  1         45  
99 1     1   1195 use WordNet::Similarity::GlossFinder;
  0            
  0            
100             use File::Spec;
101             use vars qw($VERSION @ISA);
102              
103             @ISA = qw(WordNet::Similarity::GlossFinder);
104             $VERSION = '2.04';
105              
106             WordNet::Similarity::addConfigOption("vectordb", 0, "p", undef);
107              
108             =item $measure->initialize($file)
109              
110             Overrides the initialize method in the parent class (GlossFinder.pm). This method
111             essentially initializes the measure for use.
112              
113             Parameters: $file -- configuration file.
114              
115             Returns: none.
116              
117             =cut
118              
# Locates an installed default data file under a 'WordNet' directory of @INC.
# Each candidate's first line is read, stripped of all whitespace and matched
# against the given pattern; the first candidate (in @INC order) that matches
# is returned.
# INPUT PARAMS  : $fileName      .. base name of the data file to look for.
#                 $headerPattern .. compiled regex the header line must match.
# RETURN VALUES : path of the matching file, or nothing if none qualifies.
sub _findInstalledDataFile
{
  my ($fileName, $headerPattern) = @_;

  foreach my $dir (@INC)
  {
    # JM 1-16-04 -- modified to use File::Spec
    my $file = File::Spec->catfile($dir, 'WordNet', $fileName);
    next if(!-e $file);

    # Unreadable candidates are skipped silently, as before.
    open(my $fh, '<', $file) or next;
    my $header = <$fh>;
    close($fh);

    # An empty file has no header line and therefore cannot match.
    next if(!defined $header);

    $header =~ s/\s+//g;
    return $file if($header =~ $headerPattern);
  }

  return;
}

# Initialization of the WordNet::Similarity::vector_pairs object... locates
# the default relation file, lets the super-class parse the configuration,
# then loads the word vector database into the object.
# INPUT PARAMS  : all arguments are forwarded unchanged to SUPER::initialize
#                 (per the POD: the module specific configuration file).
# RETURN VALUES : (none). On failure {error} is set to 2 and a message is
#                 appended to {errorString}.
sub initialize
{
  my $self = shift;

  # Look for the default vector relation file, unless one is already set.
  if(!defined $self->{relationDefault})
  {
    my $relationFile = _findInstalledDataFile('vector-pairs-relation.dat', qr/RelationFile/);
    $self->{relationDefault} = $relationFile if(defined $relationFile);
  }

  # Call the initialize method of the super-class.
  $self->SUPER::initialize(@_);

  # Initialize the vector cache.
  # NOTE(review): assigning () to a scalar slot leaves it undefined; the
  # cache containers are presumably created on first use elsewhere -- confirm.
  $self->{vCache} = ();
  $self->{vCacheQ} = ();
  $self->{vCacheSize} = 80;

  # Initialize the word vector database interface: an explicitly configured
  # database wins, otherwise fall back to an installed default.
  my $vectorDB = $self->{vectordb};
  if(!defined $vectorDB || $vectorDB eq "")
  {
    $vectorDB = _findInstalledDataFile('wordvectors.dat', qr/DOCUMENTCOUNT/);
  }

  # If database still not specified...
  if(!defined $vectorDB || $vectorDB eq "")
  {
    $self->{errorString} .= "\nError (WordNet::Similarity::vector_pairs->initialize()) - ";
    $self->{errorString} .= "Word Vector database file not specified. Use configuration file.";
    $self->{error} = 2;
    return;
  }

  # Get the documentCount, dimensions and vectors...
  my ($documentCount, $readDims, $readVectors) = WordNet::vectorFile->readVectors($vectorDB);
  if(!defined $documentCount || !defined $readDims || !defined $readVectors)
  {
    $self->{errorString} .= "\nError (WordNet::Similarity::vector_pairs->initialize()) - ";
    $self->{errorString} .= "Error reading the vector database file.";
    $self->{error} = 2;
    return;
  }

  # Load the word vector dimensions: the first whitespace separated field of
  # each entry is used as the index of that word, in both directions.
  $self->{numberOfDimensions} = scalar(keys(%{$readDims}));
  foreach my $key (keys %{$readDims})
  {
    my @prts = split(/\s+/, $readDims->{$key});
    $self->{wordIndex}->{$key} = $prts[0];
    $self->{indexWord}->[$prts[0]] = $key;
  }

  # Set up the interface to the word vectors (undefined entries are skipped).
  foreach my $key (keys %{$readVectors})
  {
    my $vec = $readVectors->{$key};
    $self->{table}->{$key} = $vec if(defined $vec);
  }

  return;
}
247              
248             =item $measure->traceOptions()
249              
250             This method is internally called to determine the extra options
251             specified by this measure (apart from the default options specified
252             in the WordNet::Similarity base class).
253              
254             Parameters: none.
255              
256             Returns: none.
257              
258             =cut
259              
# Appends the options specific to this measure (the vector database file) to
# the trace string, then lets the super-class append its own options.
# INPUT PARAMS  : (none)
# RETURN VALUES : whatever the super-class traceOptions() returns.
sub traceOptions
{
  my ($self) = @_;

  my $dbFile = "";
  if(defined $self->{vectordb})
  {
    $dbFile = "$self->{vectordb}";
  }

  $self->{traceString} .= "vectorDB File :: " . $dbFile . "\n";
  $self->SUPER::traceOptions();
}
267              
268             =item $vector_pairs->getRelatedness
269              
270             Computes the relatedness of two word senses using the Vector Algorithm.
271              
272             Parameters: two word senses in "word#pos#sense" format.
273              
274             Returns: Unless a problem occurs, the return value is the relatedness
275             score, which is greater-than or equal-to 0. If an error occurs,
276             then the error level is set to non-zero and an error
277             string is created (see the description of getError()).
278              
279             =cut
280              
# Computes the relatedness of two word senses: for every relation pair
# returned by getSuperGlosses(), both gloss strings are turned into
# normalized vectors and their inner product is taken; the final score is
# the mean of those values.
# INPUT PARAMS  : $wps1, $wps2 .. the two word senses ("word#pos#sense").
# RETURN VALUES : the relatedness score, or undef on error (in which case
#                 {error} / {errorString} are set here or by parseWps()).
sub getRelatedness
{
  my $self = shift;
  my $wps1 = shift;
  my $wps2 = shift;
  my $wn = $self->{wn};
  my $wntools = $self->{wntools};
  my $class = ref $self || $self;

  # Check the existence of the WordNet::QueryData object.
  unless($wn)
  {
    $self->{errorString} .= "\nError (${class}::getRelatedness()) - ";
    $self->{errorString} .= "A WordNet::QueryData object is required.";
    $self->{error} = 2;
    return undef;
  }

  # Check the existence of the WordNet::Tools object.
  unless($wntools)
  {
    $self->{errorString} .= "\nError (${class}::getRelatedness()) - ";
    $self->{errorString} .= "A WordNet::Tools object is required.";
    $self->{error} = 2;
    return undef;
  }

  # Using validation code from parseWps() in a super-class
  my $ret = $self->parseWps($wps1, $wps2);
  ref $ret or return undef;

  # Initialize traces.
  $self->{traceString} = "";

  # Now check if the similarity value for these two synsets is in
  # fact in the cache... if so return the cached value.
  my $relatedness =
    $self->{doCache} ? $self->fetchFromCache($wps1, $wps2) : undef;
  defined $relatedness and return $relatedness;

  # Now get down to really finding the relatedness of these two.
  # If traces are requested, start with the synset names.
  if($self->{trace})
  {
    $self->{traceString} = "Synset 1: $wps1\n";
    $self->{traceString} .= "Synset 2: $wps2\n";
  }

  # Get the gloss strings from the get_wn_info module. The weights array is
  # only used for its length: one entry per relation pair.
  my ($firstStringArray, $secondStringArray, $weightsArray, $functionsStringArray) = $self->getSuperGlosses($wps1, $wps2);
  my $pairCount = scalar(@{$weightsArray});

  my $score = 0;
  foreach my $i (0 .. $pairCount - 1)
  {
    my $functionsString = $functionsStringArray->[$i];

    # Preprocess the two strings of this relation pair: drop apostrophes,
    # squeeze everything outside [a-z0-9] into single blanks, trim, and
    # let WordNet::Tools join known compounds.
    my @glosses = ($firstStringArray->[$i], $secondStringArray->[$i]);
    foreach my $gloss (@glosses)
    {
      $gloss =~ s/\'//g;
      $gloss =~ s/[^a-z0-9]+/ /g;
      $gloss =~ s/^\s+//;
      $gloss =~ s/\s+$//;
      $gloss = $wntools->compoundify($gloss);
    }
    my ($firstString, $secondString) = @glosses;

    # Get vectors... score...
    # (named vectors instead of lexical $a/$b, which would mask the sort
    # variables for the rest of the scope)
    my ($firstVec, $firstTrace, $firstMag) = $self->_getVector($firstString);
    _norm($firstVec, $firstMag);

    my ($secondVec, $secondTrace, $secondMag) = $self->_getVector($secondString);
    _norm($secondVec, $secondMag);

    my $functionsScore = _inner($firstVec, $secondVec);
    $score += $functionsScore;

    # Report the two strings and their score in the trace, if requested.
    if($self->{trace})
    {
      $self->{traceString} .= "$functionsString: $functionsScore\n";
      $self->{traceString} .= "\nString: \"$firstString\"\n$firstTrace\n";
      $self->{traceString} .= "\nString: \"$secondString\"\n$secondTrace\n";
    }
  }

  # Average the score...
  $score /= $pairCount if($pairCount > 0);

  # that does all the scoring. Put in cache if doing cacheing. Then
  # return the score.
  $self->{doCache} and $self->storeToCache($wps1, $wps2, $score);

  return $score;
}
400              
# Method to compute a context vector from a given body of text: the vectors
# of all distinct words of the text that are present in {table} and absent
# from {stopHash} are added up.
# INPUT PARAMS  : $text .. whitespace separated words.
# RETURN VALUES : ($vector, $traces, $magnitude) where $vector is a hash ref
#                 (dimension => value), $traces is the trace text (empty
#                 unless {trace} is set) and $magnitude is the Euclidean
#                 length of $vector. For undefined text the vector is empty,
#                 the traces are "" and the magnitude is 0.
sub _getVector
{
  my $self = shift;
  my $text = shift;
  my $ret = {};

  # Same three-element shape as the regular return below, so that callers
  # destructuring the result never receive undefined traces or magnitude.
  return ($ret, "", 0) if(!defined $text);

  my $localTraces = "";

  # [trace]
  $localTraces .= "Word Vectors for: " if($self->{trace});
  # [/trace]

  # Collect the distinct words, skipping tokens that match the marker pattern.
  my %types;
  foreach my $word (split(/\s+/, $text))
  {
    $types{$word} = 1 if($word !~ /[XGES]{3}\d{5}[XGES]{3}/);
  }

  my $fstFlag = 1;
  foreach my $word (keys %types)
  {
    next if(!defined $self->{table}->{$word} || defined $self->{stopHash}->{$word});

    # The stored vector is a flat "dimension value dimension value ..."
    # string. split(' ') ignores leading whitespace, which would otherwise
    # produce an empty first field and shift every pair by one.
    my %pieces = split(' ', $self->{table}->{$word});

    # [trace]
    if($self->{trace})
    {
      $localTraces .= ", " if(!$fstFlag);
      $localTraces .= "$word";
      $fstFlag = 0;
    }
    # [/trace]

    foreach my $kk (keys %pieces)
    {
      $ret->{$kk} += $pieces{$kk};
    }
  }

  # Euclidean length of the summed vector.
  my $mag = 0;
  foreach my $kk (keys %{$ret})
  {
    $mag += ($ret->{$kk} * $ret->{$kk});
  }

  return ($ret, $localTraces, sqrt($mag));
}
457              
# Scales a sparse vector in place by dividing every component by the given
# magnitude. Nothing happens when the vector or the magnitude is undefined,
# or when the magnitude is zero.
# INPUT PARAMS  : $vector .. hash ref (dimension => value), modified in place.
#                 $length .. magnitude to divide by.
# RETURN VALUES : (none of interest)
sub _norm
{
  my ($vector, $length) = @_;

  if(defined $vector && defined $length && $length != 0)
  {
    $vector->{$_} /= $length foreach (keys %{$vector});
  }
}
473              
# Inner product of two sparse vectors. Only the keys of the smaller vector
# are walked (the second vector on a tie); a dimension contributes when it
# is defined in the other vector as well.
# INPUT PARAMS  : $left, $right .. hash refs (dimension => value).
# RETURN VALUES : the inner product; 0 if either vector is undefined.
sub _inner
{
  my ($left, $right) = @_;

  return 0 unless(defined $left && defined $right);

  # Pick which vector to walk and which one to probe.
  my ($walked, $probed) = (scalar(keys(%{$left})) < scalar(keys(%{$right})))
                        ? ($left, $right)
                        : ($right, $left);

  my $total = 0;
  foreach my $dim (keys %{$walked})
  {
    next unless(defined $probed->{$dim});
    $total += ($left->{$dim} * $right->{$dim});
  }

  return $total;
}
506              
507             1;
508              
509             __END__