line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Lingua::Stem::En; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
=head1 NAME |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
Lingua::Stem::En - Porter's stemming algorithm for 'generic' English |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=head1 SYNOPSIS |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
use Lingua::Stem::En; |
10
|
|
|
|
|
|
|
my $stems = Lingua::Stem::En::stem({ -words => $word_list_reference, |
11
|
|
|
|
|
|
|
-locale => 'en', |
12
|
|
|
|
|
|
|
-exceptions => $exceptions_hash, |
13
|
|
|
|
|
|
|
}); |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
=head1 DESCRIPTION |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
This routine applies the Porter Stemming Algorithm to its parameters, |
18
|
|
|
|
|
|
|
returning the stemmed words. |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
It is derived from the C program "stemmer.c" |
21
|
|
|
|
|
|
|
as found in freewais and elsewhere, which contains these notes: |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
Purpose: Implementation of the Porter stemming algorithm documented |
24
|
|
|
|
|
|
|
in: Porter, M.F., "An Algorithm For Suffix Stripping," |
25
|
|
|
|
|
|
|
Program 14 (3), July 1980, pp. 130-137. |
26
|
|
|
|
|
|
|
Provenance: Written by B. Frakes and C. Cox, 1986. |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
I have re-interpreted areas that use Frakes and Cox's "WordSize" |
29
|
|
|
|
|
|
|
function. My version may misbehave on short words starting with "y", |
30
|
|
|
|
|
|
|
but I can't think of any examples. |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
The step numbers correspond to Frakes and Cox, and are probably in |
33
|
|
|
|
|
|
|
Porter's article (which I've not seen). |
34
|
|
|
|
|
|
|
Porter's algorithm still has rough spots (e.g current/currency, -ings words), |
35
|
|
|
|
|
|
|
which I've not attempted to cure, although I have added |
36
|
|
|
|
|
|
|
support for the British -ise suffix. |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
=head1 CHANGES |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
1999.06.15 - Changed to '.pm' module, moved into Lingua::Stem namespace, |
42
|
|
|
|
|
|
|
optionalized the export of the 'stem' routine |
43
|
|
|
|
|
|
|
into the caller's namespace, added named parameters |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
1999.06.24 - Switch core implementation of the Porter stemmer to |
46
|
|
|
|
|
|
|
the one written by Jim Richardson |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
2000.08.25 - 2.11 Added stemming cache |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
2000.09.14 - 2.12 Fixed *major* :( implementation error of Porter's algorithm |
51
|
|
|
|
|
|
|
Error was entirely my fault - I completely forgot to include |
52
|
|
|
|
|
|
|
rule sets 2,3, and 4 starting with Lingua::Stem 0.30. |
53
|
|
|
|
|
|
|
-- Jerilyn Franz |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
2003.09.28 - 2.13 Corrected documentation error pointed out by Simon Cozens. |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
2005.11.20 - 2.14 Changed rule declarations to conform to Perl style convention |
58
|
|
|
|
|
|
|
for 'private' subroutines. Changed Exporter invokation to more |
59
|
|
|
|
|
|
|
portable 'require' vice 'use'. |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
2006.02.14 - 2.15 Added ability to pass word list by 'handle' for in-place stemming. |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
2009.07.27 - 2.16 Documentation Fix |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
2020.06.20 - 2.30 Version renumber for module consistency. |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
2020.09.26 - 2.31 Fix for Latin1/UTF8 issue in documentation |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
=cut |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
####################################################################### |
72
|
|
|
|
|
|
|
# Initialization |
73
|
|
|
|
|
|
|
####################################################################### |
74
|
|
|
|
|
|
|
|
75
|
2
|
|
|
2
|
|
13
|
use strict; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
62
|
|
76
|
2
|
|
|
2
|
|
10
|
use warnings; |
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
60
|
|
77
|
|
|
|
|
|
|
require Exporter; |
78
|
2
|
|
|
2
|
|
10
|
use Carp; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
129
|
|
79
|
2
|
|
|
2
|
|
21
|
use vars qw (@ISA @EXPORT_OK @EXPORT %EXPORT_TAGS $VERSION); |
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
219
|
|
80
|
|
|
|
|
|
|
BEGIN { |
81
|
2
|
|
|
2
|
|
8
|
$VERSION = "2.31"; |
82
|
2
|
|
|
|
|
68
|
@ISA = qw (Exporter); |
83
|
2
|
|
|
|
|
14
|
@EXPORT = (); |
84
|
2
|
|
|
|
|
5
|
@EXPORT_OK = qw (stem clear_stem_cache stem_caching); |
85
|
2
|
|
|
|
|
1214
|
%EXPORT_TAGS = (); |
86
|
|
|
|
|
|
|
} |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
my $Stem_Caching = 0; |
89
|
|
|
|
|
|
|
my $Stem_Cache = {}; |
90
|
|
|
|
|
|
|
my %Stem_Cache2 = (); |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
# |
93
|
|
|
|
|
|
|
#V Porter.pm V2.11 25 Aug 2000 stemming cache |
94
|
|
|
|
|
|
|
# Porter.pm V2.1 21 Jun 1999 with '&$sub if defined' not 'eval ""' |
95
|
|
|
|
|
|
|
# Porter.pm V2.0 25 Nov 1994 (for Perl 5.000) |
96
|
|
|
|
|
|
|
# porter.pl V1.0 10 Aug 1994 (for Perl 4.036) |
97
|
|
|
|
|
|
|
# Jim Richardson, University of Sydney |
98
|
|
|
|
|
|
|
# jimr@maths.usyd.edu.au or http://www.maths.usyd.edu.au:8000/jimr.html |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
# Find a canonical stem for a word, assumed to consist entirely of |
101
|
|
|
|
|
|
|
# lower-case letters. The approach is from |
102
|
|
|
|
|
|
|
# |
103
|
|
|
|
|
|
|
# M. F. Porter, An algorithm for suffix stripping, Program (Automated |
104
|
|
|
|
|
|
|
# Library and Information Systems) 14 (3) 130-7, July 1980. |
105
|
|
|
|
|
|
|
# |
106
|
|
|
|
|
|
|
# This algorithm is used by WAIS: for example, see freeWAIS-0.3 at |
107
|
|
|
|
|
|
|
# |
108
|
|
|
|
|
|
|
# http://kudzu.cnidr.org/cnidr_projects/cnidr_projects.html |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
# Some additional rules are used here, mainly to allow for British spellings |
111
|
|
|
|
|
|
|
# like -ise. They are marked ** in the code. |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
# Initialization required before using subroutine stem: |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
# We count syllables slightly differently from Porter: we say the syllable |
116
|
|
|
|
|
|
|
# count increases on each occurrence in the word of an adjacent pair |
117
|
|
|
|
|
|
|
# |
118
|
|
|
|
|
|
|
# [aeiouy][^aeiou] |
119
|
|
|
|
|
|
|
# |
120
|
|
|
|
|
|
|
# This avoids any need to define vowels and consonants, or confusion over |
121
|
|
|
|
|
|
|
# 'y'. It also works slightly better: our definition gives two syllables |
122
|
|
|
|
|
|
|
# in 'yttrium', while Porter's gives only one because the initial 'y' is |
123
|
|
|
|
|
|
|
# taken to be a consonant. But it is not quite obvious: for example, |
124
|
|
|
|
|
|
|
# consider 'mayfly' where, when working backwards (see below), the 'yf' |
125
|
|
|
|
|
|
|
# matches the above pattern, even though it is the 'ay' which in Porter's |
126
|
|
|
|
|
|
|
# terms increments the syllable count. |
127
|
|
|
|
|
|
|
# |
128
|
|
|
|
|
|
|
# We wish to match the above in context, working backwards from the end of |
129
|
|
|
|
|
|
|
# the word: the appropriate regular expression is |
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
my $syl = '[aeiou]*[^aeiou][^aeiouy]*[aeiouy]'; |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
# (This works because [^aeiouy] is a subset of [^aeiou].) If we want two |
134
|
|
|
|
|
|
|
# syllables ("m>1" in Porter's terminology) we can just match $syl$syl. |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
# For step 1b we need to be able to detect the presence of a vowel: here |
137
|
|
|
|
|
|
|
# we revert to Porter's definition that a vowel is [aeiou], or y preceded |
138
|
|
|
|
|
|
|
# by a consonant. (If the . below is a vowel, then the . is the desired |
139
|
|
|
|
|
|
|
# vowel; if the . is a consonant the y is the desired vowel.) |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
my $hasvow = '[^aeiouy]*([aeiou]|y.)'; |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
=head1 METHODS |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
=cut |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
####################################################################### |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
=over 4 |
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
=item stem({ -words => \@words, -locale => 'en', -exceptions => \%exceptions }); |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
Stems a list of passed words using the rules of US English. Returns |
154
|
|
|
|
|
|
|
an anonymous array reference to the stemmed words. |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
Example: |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
my @words = ( 'wordy', 'another' ); |
159
|
|
|
|
|
|
|
my $stemmed_words = Lingua::Stem::En::stem({ -words => \@words, |
160
|
|
|
|
|
|
|
-locale => 'en', |
161
|
|
|
|
|
|
|
-exceptions => \%exceptions, |
162
|
|
|
|
|
|
|
}); |
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
If the first element of @words is a list reference, then the stemming is performed 'in place' |
165
|
|
|
|
|
|
|
on that list (modifying the passed list directly instead of copying it to a new array). |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
This is only useful if you do not need to keep the original list. If you B need to keep |
168
|
|
|
|
|
|
|
the original list, use the normal semantic of having 'stem' return a new list instead - that |
169
|
|
|
|
|
|
|
is faster than making your own copy B using the 'in place' semantics since the primary |
170
|
|
|
|
|
|
|
difference between 'in place' and 'by value' stemming is the creation of a copy of the original |
171
|
|
|
|
|
|
|
list. If you B need the original list, then the 'in place' stemming is about 60% faster. |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
Example of 'in place' stemming: |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
my $words = [ 'wordy', 'another' ]; |
176
|
|
|
|
|
|
|
my $stemmed_words = Lingua::Stem::En::stem({ -words => [$words], |
177
|
|
|
|
|
|
|
-locale => 'en', |
178
|
|
|
|
|
|
|
-exceptions => \%exceptions, |
179
|
|
|
|
|
|
|
}); |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
The 'in place' mode returns a reference to the original list with the words stemmed. |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
=back |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
=cut |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
sub stem { |
188
|
18
|
50
|
|
18
|
1
|
40
|
return [] if ($#_ == -1); |
189
|
18
|
|
|
|
|
20
|
my $parm_ref; |
190
|
18
|
50
|
|
|
|
37
|
if (ref $_[0]) { |
191
|
18
|
|
|
|
|
25
|
$parm_ref = shift; |
192
|
|
|
|
|
|
|
} else { |
193
|
0
|
|
|
|
|
0
|
$parm_ref = { @_ }; |
194
|
|
|
|
|
|
|
} |
195
|
|
|
|
|
|
|
|
196
|
18
|
|
|
|
|
25
|
my $words = []; |
197
|
18
|
|
|
|
|
26
|
my $locale = 'en'; |
198
|
18
|
|
|
|
|
24
|
my $exceptions = {}; |
199
|
18
|
|
|
|
|
51
|
foreach (keys %$parm_ref) { |
200
|
54
|
|
|
|
|
80
|
my $key = lc ($_); |
201
|
54
|
|
|
|
|
77
|
my $value = $parm_ref->{$key}; |
202
|
54
|
100
|
|
|
|
107
|
if ($key eq '-words') { |
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
203
|
18
|
|
|
|
|
48
|
@$words = @$value; |
204
|
18
|
100
|
|
|
|
46
|
if (ref($words->[0]) eq 'ARRAY'){ |
205
|
3
|
|
|
|
|
6
|
$words = $words->[0]; |
206
|
|
|
|
|
|
|
} |
207
|
|
|
|
|
|
|
} elsif ($key eq '-exceptions') { |
208
|
18
|
|
|
|
|
48
|
$exceptions = $parm_ref->{$key}; |
209
|
|
|
|
|
|
|
} elsif ($key eq '-locale') { |
210
|
18
|
|
|
|
|
35
|
$locale = $parm_ref->{$key}; |
211
|
|
|
|
|
|
|
} else { |
212
|
0
|
|
|
|
|
0
|
croak (__PACKAGE__ . "::stem() - Unknown parameter '$key' with value '$parm_ref->{$key}'\n"); |
213
|
|
|
|
|
|
|
} |
214
|
|
|
|
|
|
|
} |
215
|
|
|
|
|
|
|
|
216
|
18
|
|
|
|
|
29
|
local( $_ ); |
217
|
|
|
|
|
|
|
|
218
|
18
|
|
|
|
|
26
|
foreach (@$words) { |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
# Flatten case |
221
|
180
|
|
|
|
|
300
|
$_ = lc $_; |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
# Check against cache of stemmed words |
224
|
180
|
50
|
|
|
|
272
|
if (exists $Stem_Cache2{$_}) { |
225
|
0
|
|
|
|
|
0
|
$_ = $Stem_Cache2{$_}; |
226
|
0
|
|
|
|
|
0
|
next; |
227
|
|
|
|
|
|
|
} |
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
# Check against exceptions list |
230
|
180
|
100
|
|
|
|
281
|
if (exists $exceptions->{$_}) { |
231
|
6
|
|
|
|
|
8
|
$_ = $exceptions->{$_}; |
232
|
6
|
|
|
|
|
14
|
next; |
233
|
|
|
|
|
|
|
} |
234
|
|
|
|
|
|
|
|
235
|
174
|
|
|
|
|
193
|
my $original_word = $_; |
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
# Step 0 - remove punctuation |
238
|
174
|
|
|
|
|
229
|
s/'s$//; s/^[^a-z]+//; s/[^a-z]+$//; |
|
174
|
|
|
|
|
288
|
|
|
174
|
|
|
|
|
247
|
|
239
|
174
|
50
|
|
|
|
433
|
next unless /^[a-z]+$/; |
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
# Reverse the word so we can easily apply pattern matching to the end: |
242
|
174
|
|
|
|
|
320
|
$_ = reverse $_; |
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
# Step 1a: plurals -- sses->ss, ies->i, ss->ss, s->0 |
245
|
|
|
|
|
|
|
|
246
|
174
|
100
|
50
|
|
|
334
|
m!^s! && ( s!^se(ss|i)!$1! || s!^s([^s])!$1! ); |
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
# Step 1b: participles -- SYLeed->SYLee, VOWed->VOW, VOWing->VOW; |
249
|
|
|
|
|
|
|
# but ated->ate etc |
250
|
|
|
|
|
|
|
|
251
|
174
|
50
|
50
|
|
|
671
|
s!^dee($syl)!ee$1!o || |
|
|
|
100
|
|
|
|
|
252
|
|
|
|
|
|
|
( |
253
|
|
|
|
|
|
|
s!^(de|gni)($hasvow)!$2!o && |
254
|
|
|
|
|
|
|
( |
255
|
|
|
|
|
|
|
# at->ate, bl->ble, iz->ize, is->ise |
256
|
|
|
|
|
|
|
s!^(ta|lb|[sz]i)!e$1! || # ** ise as well as ize |
257
|
|
|
|
|
|
|
# CC->C (C consonant other than l, s, z) |
258
|
|
|
|
|
|
|
s!^([^aeioulsz])\1!$1! || |
259
|
|
|
|
|
|
|
# (m=1) CVD->CVDe (C consonant, V vowel, D consonant not w, x, y) |
260
|
|
|
|
|
|
|
s!^([^aeiouwxy][aeiouy][^aeiou]+)$!e$1! |
261
|
|
|
|
|
|
|
) |
262
|
|
|
|
|
|
|
); |
263
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
# Step 1c: change y to i: happy->happi, sky->sky |
265
|
|
|
|
|
|
|
|
266
|
174
|
|
|
|
|
379
|
s!^y($hasvow)!i$1!o; |
267
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
# Step 2: double and triple suffices (part 1) |
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
# Switch on last three letters (fails harmlessly if subroutine undefined) -- |
271
|
|
|
|
|
|
|
# thanks to Ian Phillipps who wrote |
272
|
|
|
|
|
|
|
# CPAN authors/id/IANPX/Stem-0.1.tar.gz |
273
|
|
|
|
|
|
|
# for suggesting the replacement of |
274
|
|
|
|
|
|
|
# eval( '&S2' . unpack( 'a3', $_ ) ); |
275
|
|
|
|
|
|
|
# (where the eval ignores undefined subroutines) by the much faster |
276
|
|
|
|
|
|
|
# eval { &{ 'S2' . substr( $_, 0, 3 ) } }; |
277
|
|
|
|
|
|
|
# But the following is slightly faster still: |
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
{ |
280
|
2
|
|
|
2
|
|
16
|
no strict 'refs'; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
3048
|
|
|
174
|
|
|
|
|
201
|
|
281
|
|
|
|
|
|
|
|
282
|
174
|
|
|
|
|
178
|
my $sub; |
283
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
# Step 3: double and triple suffices, etc (part 2) |
285
|
|
|
|
|
|
|
|
286
|
174
|
50
|
|
|
|
176
|
&$sub if defined &{ $sub = '_S2' . substr( $_, 0, 3 ) }; |
|
174
|
|
|
|
|
692
|
|
287
|
|
|
|
|
|
|
|
288
|
|
|
|
|
|
|
# Step 3: double and triple suffices, etc (part 2) |
289
|
|
|
|
|
|
|
|
290
|
174
|
50
|
|
|
|
221
|
&$sub if defined &{ $sub = '_S3' . substr( $_, 0, 3 ) }; |
|
174
|
|
|
|
|
508
|
|
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
# Step 4: single suffices on polysyllables |
293
|
|
|
|
|
|
|
|
294
|
174
|
100
|
|
|
|
212
|
&$sub if defined &{ $sub = '_S4' . substr( $_, 0, 2 ) }; |
|
174
|
|
|
|
|
604
|
|
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
} |
297
|
|
|
|
|
|
|
# Step 5a: tidy up final e -- probate->probat, rate->rate; cease->ceas |
298
|
|
|
|
|
|
|
|
299
|
174
|
100
|
50
|
|
|
590
|
m!^e! && ( s!^e($syl$syl)!$1!o || |
|
|
|
50
|
|
|
|
|
300
|
|
|
|
|
|
|
|
301
|
|
|
|
|
|
|
# Porter's ( m=1 and not *o ) E where o = cvd with d a consonant |
302
|
|
|
|
|
|
|
# not w, x or y: |
303
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
! m!^e[^aeiouwxy][aeiouy][^aeiou]! && # not *o E |
305
|
|
|
|
|
|
|
s!^e($syl[aeiouy]*[^aeiou]*)$!$1!o # m=1 |
306
|
|
|
|
|
|
|
); |
307
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
# Step 5b: double l -- controll->control, roll->roll |
309
|
|
|
|
|
|
|
# ** Note correction: Porter has m>1 here ($syl$syl), but it seems m>0 |
310
|
|
|
|
|
|
|
# ($syl) is wanted to strip an l off controll. |
311
|
|
|
|
|
|
|
|
312
|
174
|
|
|
|
|
253
|
s!^ll($syl)!l$1!o; |
313
|
|
|
|
|
|
|
|
314
|
174
|
|
|
|
|
270
|
$_ = scalar( reverse $_ ); |
315
|
|
|
|
|
|
|
|
316
|
174
|
50
|
|
|
|
294
|
$Stem_Cache2{$original_word} = $_ if $Stem_Caching; |
317
|
|
|
|
|
|
|
} |
318
|
18
|
50
|
|
|
|
40
|
%Stem_Cache2 = () if ($Stem_Caching < 2); |
319
|
|
|
|
|
|
|
|
320
|
18
|
|
|
|
|
61
|
return $words; |
321
|
|
|
|
|
|
|
} |
322
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
############################################################## |
324
|
|
|
|
|
|
|
# Rule set 4 |
325
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
sub _S4la { |
327
|
|
|
|
|
|
|
# SYLSYLal -> SYLSYL |
328
|
0
|
|
|
0
|
|
0
|
s!^la($syl$syl)!$1!o; |
329
|
|
|
|
|
|
|
} |
330
|
|
|
|
|
|
|
|
331
|
|
|
|
|
|
|
sub _S4ec { |
332
|
|
|
|
|
|
|
# SYLSYL[ae]nce -> SYLSYL |
333
|
0
|
|
|
0
|
|
0
|
s!^ecn[ae]($syl$syl)!$1!o; |
334
|
|
|
|
|
|
|
} |
335
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
sub _S4re { |
337
|
|
|
|
|
|
|
# SYLSYLer -> SYLSYL |
338
|
18
|
|
|
18
|
|
89
|
s!^re($syl$syl)!$1!o; |
339
|
|
|
|
|
|
|
} |
340
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
sub _S4ci { |
342
|
|
|
|
|
|
|
# SYLSYLic -> SYLSYL |
343
|
0
|
|
|
0
|
|
|
s!^ci($syl$syl)!$1!o; |
344
|
|
|
|
|
|
|
} |
345
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
sub _S4el { |
347
|
|
|
|
|
|
|
# SYLSYL[ai]ble -> SYLSYL |
348
|
0
|
|
|
0
|
|
|
s!^elb[ai]($syl$syl)!$1!o; |
349
|
|
|
|
|
|
|
} |
350
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
sub _S4tn { |
352
|
|
|
|
|
|
|
# SYLSYLant -> SYLSYL, SYLSYLe?ment -> SYLSYL, SYLSYLent -> SYLSYL |
353
|
0
|
|
|
0
|
|
|
s!^tn(a|e(me?)?)($syl$syl)!$3!o; |
354
|
|
|
|
|
|
|
} |
355
|
|
|
|
|
|
|
sub _S4no { |
356
|
|
|
|
|
|
|
# SYLSYL[st]ion -> SYLSYL[st] |
357
|
0
|
|
|
0
|
|
|
s!^noi([st]$syl$syl)!$1!o; |
358
|
|
|
|
|
|
|
} |
359
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
sub _S4uo { |
361
|
|
|
|
|
|
|
# SYLSYLou -> SYLSYL e.g. homologou -> homolog |
362
|
0
|
|
|
0
|
|
|
s!^uo($syl$syl)!$1!o; |
363
|
|
|
|
|
|
|
} |
364
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
sub _S4ms { |
366
|
|
|
|
|
|
|
# SYLSYLism -> SYLSYL |
367
|
0
|
|
|
0
|
|
|
s!^msi($syl$syl)!$1!o; |
368
|
|
|
|
|
|
|
} |
369
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
sub _S4et { |
371
|
|
|
|
|
|
|
# SYLSYLate -> SYLSYL |
372
|
0
|
|
|
0
|
|
|
s!^eta($syl$syl)!$1!o; |
373
|
|
|
|
|
|
|
} |
374
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
sub _S4it { |
376
|
|
|
|
|
|
|
# SYLSYLiti -> SYLSYL |
377
|
0
|
|
|
0
|
|
|
s!^iti($syl$syl)!$1!o; |
378
|
|
|
|
|
|
|
} |
379
|
|
|
|
|
|
|
|
380
|
|
|
|
|
|
|
sub _S4su { |
381
|
|
|
|
|
|
|
# SYLSYLous -> SYLSYL |
382
|
0
|
|
|
0
|
|
|
s!^suo($syl$syl)!$1!o; |
383
|
|
|
|
|
|
|
} |
384
|
|
|
|
|
|
|
|
385
|
|
|
|
|
|
|
sub _S4ev { |
386
|
|
|
|
|
|
|
# SYLSYLive -> SYLSYL |
387
|
0
|
|
|
0
|
|
|
s!^evi($syl$syl)!$1!o; |
388
|
|
|
|
|
|
|
} |
389
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
sub _S4ez { |
391
|
|
|
|
|
|
|
# SYLSYLize -> SYLSYL |
392
|
0
|
|
|
0
|
|
|
s!^ezi($syl$syl)!$1!o; |
393
|
|
|
|
|
|
|
} |
394
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
sub _S4es { |
396
|
|
|
|
|
|
|
# SYLSYLise -> SYLSYL ** |
397
|
0
|
|
|
0
|
|
|
s!^esi($syl$syl)!$1!o; |
398
|
|
|
|
|
|
|
} |
399
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
############################################################## |
401
|
|
|
|
|
|
|
# Rule set 2 |
402
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
sub _S2lan { |
404
|
|
|
|
|
|
|
# SYLational -> SYLate, SYLtional -> SYLtion |
405
|
0
|
0
|
|
0
|
|
|
s!^lanoita($syl)!eta$1!o || s!^lanoit($syl)!noit$1!o; |
406
|
|
|
|
|
|
|
} |
407
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
sub _S2icn { |
409
|
|
|
|
|
|
|
# SYLanci -> SYLance, SYLency ->SYLence |
410
|
0
|
|
|
0
|
|
|
s!^icn([ae]$syl)!ecn$1!o; |
411
|
|
|
|
|
|
|
} |
412
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
sub _S2res { |
414
|
|
|
|
|
|
|
# SYLiser -> SYLise ** |
415
|
0
|
|
|
0
|
|
|
&_S2rez; |
416
|
|
|
|
|
|
|
} |
417
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
sub _S2rez { |
419
|
|
|
|
|
|
|
# SYLizer -> SYLize |
420
|
0
|
|
|
0
|
|
|
s!^re(.)i($syl)!e$1i$2!o; |
421
|
|
|
|
|
|
|
} |
422
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
sub _S2ilb { |
424
|
|
|
|
|
|
|
# SYLabli -> SYLable, SYLibli -> SYLible ** (e.g. incredibli) |
425
|
0
|
|
|
0
|
|
|
s!^ilb([ai]$syl)!elb$1!o; |
426
|
|
|
|
|
|
|
} |
427
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
sub _S2ill { |
429
|
|
|
|
|
|
|
# SYLalli -> SYLal |
430
|
0
|
|
|
0
|
|
|
s!^illa($syl)!la$1!o; |
431
|
|
|
|
|
|
|
} |
432
|
|
|
|
|
|
|
|
433
|
|
|
|
|
|
|
sub _S2ilt { |
434
|
|
|
|
|
|
|
# SYLentli -> SYLent |
435
|
0
|
|
|
0
|
|
|
s!^iltne($syl)!tne$1!o |
436
|
|
|
|
|
|
|
} |
437
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
sub _S2ile { |
439
|
|
|
|
|
|
|
# SYLeli -> SYLe |
440
|
0
|
|
|
0
|
|
|
s!^ile($syl)!e$1!o; |
441
|
|
|
|
|
|
|
} |
442
|
|
|
|
|
|
|
|
443
|
|
|
|
|
|
|
sub _S2ils { |
444
|
|
|
|
|
|
|
# SYLousli -> SYLous |
445
|
0
|
|
|
0
|
|
|
s!^ilsuo($syl)!suo$1!o; |
446
|
|
|
|
|
|
|
} |
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
sub _S2noi { |
449
|
|
|
|
|
|
|
# SYLization -> SYLize, SYLisation -> SYLise**, SYLation -> SYLate |
450
|
0
|
0
|
|
0
|
|
|
s!^noita([sz])i($syl)!e$1i$2!o || s!^noita($syl)!eta$1!o; |
451
|
|
|
|
|
|
|
} |
452
|
|
|
|
|
|
|
|
453
|
|
|
|
|
|
|
sub _S2rot { |
454
|
|
|
|
|
|
|
# SYLator -> SYLate |
455
|
0
|
|
|
0
|
|
|
s!^rota($syl)!eta$1!o; |
456
|
|
|
|
|
|
|
} |
457
|
|
|
|
|
|
|
|
458
|
|
|
|
|
|
|
sub _S2msi { |
459
|
|
|
|
|
|
|
# SYLalism -> SYLal |
460
|
0
|
|
|
0
|
|
|
s!^msila($syl)!la$1!o; |
461
|
|
|
|
|
|
|
} |
462
|
|
|
|
|
|
|
|
463
|
|
|
|
|
|
|
sub _S2sse { |
464
|
|
|
|
|
|
|
# SYLiveness -> SYLive, SYLfulness -> SYLful, SYLousness -> SYLous |
465
|
0
|
|
|
0
|
|
|
s!^ssen(evi|luf|suo)($syl)!$1$2!o; |
466
|
|
|
|
|
|
|
} |
467
|
|
|
|
|
|
|
|
468
|
|
|
|
|
|
|
sub _S2iti { |
469
|
|
|
|
|
|
|
# SYLaliti -> SYLal, SYLiviti -> SYLive, SYLbiliti ->SYLble |
470
|
0
|
0
|
|
0
|
|
|
s!^iti(la|lib|vi)($syl)! ( $1 eq 'la' ? 'la' : $1 eq 'lib' ? 'elb' : 'evi' ) |
|
0
|
0
|
|
|
|
|
|
471
|
|
|
|
|
|
|
. $2 !eo; |
472
|
|
|
|
|
|
|
} |
473
|
|
|
|
|
|
|
|
474
|
|
|
|
|
|
|
############################################################## |
475
|
|
|
|
|
|
|
# Rule set 3 |
476
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
sub _S3eta { |
478
|
|
|
|
|
|
|
# SYLicate -> SYLic |
479
|
0
|
|
|
0
|
|
|
s!^etaci($syl)!ci$1!o; |
480
|
|
|
|
|
|
|
} |
481
|
|
|
|
|
|
|
|
482
|
|
|
|
|
|
|
sub _S3evi { |
483
|
|
|
|
|
|
|
# SYLative -> SYL |
484
|
0
|
|
|
0
|
|
|
s!^evita($syl)!$1!o; |
485
|
|
|
|
|
|
|
} |
486
|
|
|
|
|
|
|
|
487
|
|
|
|
|
|
|
sub _S3ezi |
488
|
|
|
|
|
|
|
{ |
489
|
|
|
|
|
|
|
# SYLalize -> SYLal |
490
|
0
|
|
|
0
|
|
|
s!^ezila($syl)!la$1!o; |
491
|
|
|
|
|
|
|
} |
492
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
sub _S3esi { |
494
|
|
|
|
|
|
|
# SYLalise -> SYLal ** |
495
|
0
|
|
|
0
|
|
|
s!^esila($syl)!la$1!o; |
496
|
|
|
|
|
|
|
} |
497
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
sub _S3iti { |
499
|
|
|
|
|
|
|
# SYLiciti -> SYLic |
500
|
0
|
|
|
0
|
|
|
s!^itici($syl)!ci$1!o; |
501
|
|
|
|
|
|
|
} |
502
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
sub _S3lac { |
504
|
|
|
|
|
|
|
# SYLical -> SYLic |
505
|
0
|
|
|
0
|
|
|
s!^laci($syl)!ci$1!o; |
506
|
|
|
|
|
|
|
} |
507
|
|
|
|
|
|
|
sub _S3luf { |
508
|
|
|
|
|
|
|
# SYLful -> SYL |
509
|
0
|
|
|
0
|
|
|
s!^luf($syl)!$1!o; |
510
|
|
|
|
|
|
|
} |
511
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
sub _S3sse { |
513
|
|
|
|
|
|
|
# SYLness -> SYL |
514
|
0
|
|
|
0
|
|
|
s!^ssen($syl)!$1!o; |
515
|
|
|
|
|
|
|
} |
516
|
|
|
|
|
|
|
|
517
|
|
|
|
|
|
|
|
518
|
|
|
|
|
|
|
############################################################## |
519
|
|
|
|
|
|
|
|
520
|
|
|
|
|
|
|
=over 4 |
521
|
|
|
|
|
|
|
|
522
|
|
|
|
|
|
|
=item stem_caching({ -level => 0|1|2 }); |
523
|
|
|
|
|
|
|
|
524
|
|
|
|
|
|
|
Sets the level of stem caching. |
525
|
|
|
|
|
|
|
|
526
|
|
|
|
|
|
|
'0' means 'no caching'. This is the default level. |
527
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
'1' means 'cache per run'. This caches stemming results during a single |
529
|
|
|
|
|
|
|
call to 'stem'. |
530
|
|
|
|
|
|
|
|
531
|
|
|
|
|
|
|
'2' means 'cache indefinitely'. This caches stemming results until |
532
|
|
|
|
|
|
|
either the process exits or the 'clear_stem_cache' method is called. |
533
|
|
|
|
|
|
|
|
534
|
|
|
|
|
|
|
=back |
535
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
=cut |
537
|
|
|
|
|
|
|
|
538
|
|
|
|
|
|
|
sub stem_caching { |
539
|
0
|
|
|
0
|
1
|
|
my $parm_ref; |
540
|
0
|
0
|
|
|
|
|
if (ref $_[0]) { |
541
|
0
|
|
|
|
|
|
$parm_ref = shift; |
542
|
|
|
|
|
|
|
} else { |
543
|
0
|
|
|
|
|
|
$parm_ref = { @_ }; |
544
|
|
|
|
|
|
|
} |
545
|
0
|
|
|
|
|
|
my $caching_level = $parm_ref->{-level}; |
546
|
0
|
0
|
|
|
|
|
if (defined $caching_level) { |
547
|
0
|
0
|
|
|
|
|
if ($caching_level !~ m/^[012]$/) { |
548
|
0
|
|
|
|
|
|
croak(__PACKAGE__ . "::stem_caching() - Legal values are '0','1' or '2'. '$caching_level' is not a legal value"); |
549
|
|
|
|
|
|
|
} |
550
|
0
|
|
|
|
|
|
$Stem_Caching = $caching_level; |
551
|
0
|
0
|
|
|
|
|
if ($caching_level < 2) { |
552
|
0
|
|
|
|
|
|
%Stem_Cache2 = (); |
553
|
|
|
|
|
|
|
} |
554
|
|
|
|
|
|
|
} |
555
|
0
|
|
|
|
|
|
return $Stem_Caching; |
556
|
|
|
|
|
|
|
} |
557
|
|
|
|
|
|
|
|
558
|
|
|
|
|
|
|
############################################################## |
559
|
|
|
|
|
|
|
|
560
|
|
|
|
|
|
|
=over 4 |
561
|
|
|
|
|
|
|
|
562
|
|
|
|
|
|
|
=item clear_stem_cache; |
563
|
|
|
|
|
|
|
|
564
|
|
|
|
|
|
|
Clears the cache of stemmed words |
565
|
|
|
|
|
|
|
|
566
|
|
|
|
|
|
|
=back |
567
|
|
|
|
|
|
|
|
568
|
|
|
|
|
|
|
=cut |
569
|
|
|
|
|
|
|
|
570
|
|
|
|
|
|
|
sub clear_stem_cache { |
571
|
0
|
|
|
0
|
1
|
|
%Stem_Cache2 = (); |
572
|
|
|
|
|
|
|
} |
573
|
|
|
|
|
|
|
|
574
|
|
|
|
|
|
|
############################################################## |
575
|
|
|
|
|
|
|
|
576
|
|
|
|
|
|
|
=head1 NOTES |
577
|
|
|
|
|
|
|
|
578
|
|
|
|
|
|
|
This code is almost entirely derived from the Porter 2.1 module |
579
|
|
|
|
|
|
|
written by Jim Richardson. |
580
|
|
|
|
|
|
|
|
581
|
|
|
|
|
|
|
=head1 SEE ALSO |
582
|
|
|
|
|
|
|
|
583
|
|
|
|
|
|
|
Lingua::Stem |
584
|
|
|
|
|
|
|
|
585
|
|
|
|
|
|
|
=head1 AUTHOR |
586
|
|
|
|
|
|
|
|
587
|
|
|
|
|
|
|
Jim Richardson, University of Sydney |
588
|
|
|
|
|
|
|
jimr@maths.usyd.edu.au or http://www.maths.usyd.edu.au:8000/jimr.html |
589
|
|
|
|
|
|
|
|
590
|
|
|
|
|
|
|
Integration in Lingua::Stem by |
591
|
|
|
|
|
|
|
Jerilyn Franz, FreeRun Technologies, |
592
|
|
|
|
|
|
|
|
593
|
|
|
|
|
|
|
|
594
|
|
|
|
|
|
|
=head1 COPYRIGHT |
595
|
|
|
|
|
|
|
|
596
|
|
|
|
|
|
|
Jim Richardson, University of Sydney |
597
|
|
|
|
|
|
|
Jerilyn Franz, FreeRun Technologies |
598
|
|
|
|
|
|
|
|
599
|
|
|
|
|
|
|
This code is freely available under the same terms as Perl. |
600
|
|
|
|
|
|
|
|
601
|
|
|
|
|
|
|
=head1 BUGS |
602
|
|
|
|
|
|
|
|
603
|
|
|
|
|
|
|
=head1 TODO |
604
|
|
|
|
|
|
|
|
605
|
|
|
|
|
|
|
=cut |
606
|
|
|
|
|
|
|
|
607
|
|
|
|
|
|
|
1; |