File Coverage

blib/lib/WordNet/stem.pm

Criterion	Covered	Total	%
statement	9	43	20.9
branch	0	8	0.0
condition			n/a
subroutine	3	6	50.0
pod	3	3	100.0
total	15	60	25.0

line	stmt	bran	sub	pod	time	code
1						# WordNet::stem.pm version 2.04
2						# (Last updated $Id: stem.pm,v 1.1 2008/03/27 05:13:01 sidz1979 Exp $)
3						#
4						# Package used by WordNet::Similarity::lesk module that
5						# computes semantic relatedness of word senses in WordNet
6						# using gloss overlaps.
7						#
8						# Copyright (c) 2005,
9						#
10						# Ted Pedersen, University of Minnesota Duluth
11						# tpederse at d.umn.edu
12						#
13						# Satanjeev Banerjee, Carnegie Mellon University, Pittsburgh
14						# banerjee+ at cs.cmu.edu
15						#
16						# This program is free software; you can redistribute it and/or
17						# modify it under the terms of the GNU General Public License
18						# as published by the Free Software Foundation; either version 2
19						# of the License, or (at your option) any later version.
20						#
21						# This program is distributed in the hope that it will be useful,
22						# but WITHOUT ANY WARRANTY; without even the implied warranty of
23						# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24						# GNU General Public License for more details.
25						#
26						# You should have received a copy of the GNU General Public License
27						# along with this program; if not, write to
28						#
29						# The Free Software Foundation, Inc.,
30						# 59 Temple Place - Suite 330,
31						# Boston, MA 02111-1307, USA.
32						#
33						# ------------------------------------------------------------------
34
35						package WordNet::stem;
36
37						=head1 NAME
38
39						WordNet::stem - Module that finds the stem of a word or the stems of a
40						string of words, using WordNet.
41
42						=head1 SYNOPSIS
43
44						use WordNet::stem;
45
46						my $wn = WordNet::QueryData->new();
47
48						my $stemmer = WordNet::stem->new($wn);
49
50						my @stems = $stemmer->stemWord($word);
51
52						my $string = $stemmer->stemString($inString, $cache);
53
54						=head1 DESCRIPTION
55
56						This module uses the internal stemming algorithm of WordNet to
57						stem words and strings of words. This module is used by the
58						lesk measure of the WordNet::Similarity package.
59
60						=head2 Methods
61
62						=over
63
64						=cut
65
66	1		1		6	use strict;
	1				2
	1				32
67	1		1		5	use Exporter;
	1				2
	1				52
68	1		1		11	use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
	1				2
	1				596
69
70						@ISA = qw(Exporter);
71
72						%EXPORT_TAGS = ();
73
74						@EXPORT_OK = ();
75
76						@EXPORT = ();
77
78						$VERSION = '2.04';
79
80						=item new
81
82						Creates a new stemmer object and initializes it with a
83						WordNet::QueryData object.
84
85						Parameters: $wn
86
87						Returns: $stemmer
88
89						=cut
90
91						# Constructor. Arguments: the class name and $wn (per the POD above, a WordNet::QueryData object). Returns the blessed hash reference.
92						sub new
93						{
94	0		0	1		my $className = shift;
95	0					my $wn = shift;
96	0					my $self = {};
97
98	0					$self->{wn} = $wn;   # kept so stemWord can call validForms on it
99	0					$self->{wordStemHash} = ();   # NOTE(review): "= ()" assigns undef, not an empty hash; stemWord later dereferences this slot as a hash, relying on autovivification
100	0					$self->{stringStemHash} = ();   # NOTE(review): same as above; this slot is used as a hash by stemString
101	0					bless($self, $className);
102
103	0					return $self;
104						}
105
106						=item stemString
107
108						Takes a string of words as input and returns a string of stemmed words.
109
110						Parameters: $inString, $cache (optional; when defined, the result is cached)
111
112						Returns: $retString
113
114						=cut
115
116						# Function to take a string and stem every word in it. Arguments: $self,
117						# $inputString (the text to stem) and an optional $cache flag. Each \w+ token
118						# is passed to stemWord(); if a word has zero or several possible stems, the
119						# original surface form is kept, since there is no way to select from competing
120						# stems. Returns the resulting words, each followed by a single space (so a
121						# non-empty result ends in a space). The result is cached when $cache is defined,
122						# which helps for strings stemmed repeatedly; strings stemmed only once need not be cached, saving space.
123						sub stemString
124						{
125	0		0	1		my $self = shift;
126	0					my $inputString = shift;
127	0					my $cache = shift;
128
129						# Whether or not caching was requested for this call,
130						# look in the cache first and return the stored result if there is one
131	0	0				return $self->{'stringStemHash'}->{$inputString} if (defined $self->{'stringStemHash'}->{$inputString});
132
133						# Not in cache. Stem.
134
135						# for each word in the input get the stem and put in the output string
136	0					my $outputString = "";
137	0					while ($inputString =~ /(\w+)/g)
138						{
139	0					my $word = $1;
140	0					my @stems = $self->stemWord($word);
141
142						# if multiple or no stems ($#stems != 0, i.e. not exactly one), use surface form.
143	0	0				$outputString .= ($#stems != 0) ? "$word " : "$stems[0] ";
144						}
145
146						# store the result if $cache is defined -- NOTE(review): any defined value, including 0, enables caching
147	0	0				$self->{'stringStemHash'}->{$inputString} = $outputString if (defined($cache));
148
149						# return the stemmed string
150	0					return($outputString);
151						}
152
153						=item stemWord
154
155						Takes a word as input and returns its stems. A word may have more than
156						one stem. All are returned.
157
158						Parameters: $word
159
160						Returns: @stems
161
162						=back
163
164						=cut
165
166						# Stem the word passed to this function and return a list holding all
167						# the possible stems of this word. Arguments: $self, $word. All possible stems
168						# of the word may include the surface form too if it's a valid WordNet
169						# lemma. Results are cached per word in $self->{wordStemHash} as one space-joined string; the order follows hash-key order (unsorted).
170						sub stemWord
171						{
172	0		0	1		my $self = shift;
173	0					my $word = shift;
174	0					my $wn = $self->{wn};
175	0					my @stems = ();   # NOTE(review): never used below; only the %stems hash is filled
176
177						# if not in the cache, create and put in cache
178	0	0				if (!defined $self->{wordStemHash}->{$word})
179						{
180						# Not cached yet: query the valid forms for every part-of-speech letter in "nvar".
181	0					my %stems = ();
182	0					my $possiblePartsOfSpeech = "nvar";   # NOTE(review): never read; the loop below matches against its own "nvar" literal
183
184	0					my $pos;   # NOTE(review): declared but never used
185	0					while ("nvar" =~ /(.)/g)
186						{
187	0					foreach ($wn->validForms("$word\#$1"))
188						{
189						# put underscore for space (the cache below is space-delimited)
190	0					$_ =~ s/ /_/g;
191
192						# remove the trailing part-of-speech marker, if any
193	0					$_ =~ s/\#\w$//;
194
195						# put in stems hash (the hash allows us to not worry about
196						# multiple copies of the same stem!)
197	0					$stems{$_} = 1;
198						}
199						}
200
201						# put in the cache as a single space-joined string (an empty string when no stems were found)
202	0					$self->{wordStemHash}->{$word} = join(" ", (keys %stems));
203						}
204
205						# return the stems by splitting the cached string on spaces
206	0					return (split / /, $self->{wordStemHash}->{$word});
207						}
208
209						1;
210
211						__END__