File Coverage

blib/lib/Lingua/Stem/En.pm
Criterion Covered Total %
statement 67 121 55.3
branch 24 50 48.0
condition 6 10 60.0
subroutine 8 46 17.3
pod 3 3 100.0
total 108 230 46.9


line stmt bran cond sub pod time code
1             package Lingua::Stem::En;
2              
3             =head1 NAME
4              
5             Lingua::Stem::En - Porter's stemming algorithm for 'generic' English
6              
7             =head1 SYNOPSIS
8              
9             use Lingua::Stem::En;
10             my $stems = Lingua::Stem::En::stem({ -words => $word_list_reference,
11             -locale => 'en',
12             -exceptions => $exceptions_hash,
13             });
14              
15             =head1 DESCRIPTION
16              
17             This routine applies the Porter Stemming Algorithm to its parameters,
18             returning the stemmed words.
19              
20             It is derived from the C program "stemmer.c"
21             as found in freewais and elsewhere, which contains these notes:
22              
23             Purpose: Implementation of the Porter stemming algorithm documented
24             in: Porter, M.F., "An Algorithm For Suffix Stripping,"
25             Program 14 (3), July 1980, pp. 130-137.
26             Provenance: Written by B. Frakes and C. Cox, 1986.
27              
28             I have re-interpreted areas that use Frakes and Cox's "WordSize"
29             function. My version may misbehave on short words starting with "y",
30             but I can't think of any examples.
31              
32             The step numbers correspond to Frakes and Cox, and are probably in
33             Porter's article (which I've not seen).
34             Porter's algorithm still has rough spots (e.g current/currency, -ings words),
35             which I've not attempted to cure, although I have added
36             support for the British -ise suffix.
37              
38             =head1 CHANGES
39              
40            
41             1999.06.15 - Changed to '.pm' module, moved into Lingua::Stem namespace,
42             optionalized the export of the 'stem' routine
43             into the caller's namespace, added named parameters
44              
45             1999.06.24 - Switch core implementation of the Porter stemmer to
46             the one written by Jim Richardson
47              
48             2000.08.25 - 2.11 Added stemming cache
49              
50             2000.09.14 - 2.12 Fixed *major* :( implementation error of Porter's algorithm
51             Error was entirely my fault - I completely forgot to include
52             rule sets 2,3, and 4 starting with Lingua::Stem 0.30.
53             -- Jerilyn Franz
54              
55             2003.09.28 - 2.13 Corrected documentation error pointed out by Simon Cozens.
56              
57             2005.11.20 - 2.14 Changed rule declarations to conform to Perl style convention
58             for 'private' subroutines. Changed Exporter invokation to more
59             portable 'require' vice 'use'.
60              
61             2006.02.14 - 2.15 Added ability to pass word list by 'handle' for in-place stemming.
62              
63             2009.07.27 - 2.16 Documentation Fix
64              
65             2020.06.20 - 2.30 Version renumber for module consistency.
66              
67             =cut
68              
69             #######################################################################
70             # Initialization
71             #######################################################################
72              
73 2     2   13 use strict;
  2         4  
  2         67  
74 2     2   10 use warnings;
  2         4  
  2         65  
75             require Exporter;
76 2     2   9 use Carp;
  2         13  
  2         122  
77 2     2   21 use vars qw (@ISA @EXPORT_OK @EXPORT %EXPORT_TAGS $VERSION);
  2         4  
  2         230  
78             BEGIN {
79 2     2   8 $VERSION = "2.30";
80 2         42 @ISA = qw (Exporter);
81 2         14 @EXPORT = ();
82 2         5 @EXPORT_OK = qw (stem clear_stem_cache stem_caching);
83 2         1237 %EXPORT_TAGS = ();
84             }
85              
86             my $Stem_Caching = 0;
87             my $Stem_Cache = {};
88             my %Stem_Cache2 = ();
89              
90             #
91             #V Porter.pm V2.11 25 Aug 2000 stemming cache
92             # Porter.pm V2.1 21 Jun 1999 with '&$sub if defined' not 'eval ""'
93             # Porter.pm V2.0 25 Nov 1994 (for Perl 5.000)
94             # porter.pl V1.0 10 Aug 1994 (for Perl 4.036)
95             # Jim Richardson, University of Sydney
96             # jimr@maths.usyd.edu.au or http://www.maths.usyd.edu.au:8000/jimr.html
97              
98             # Find a canonical stem for a word, assumed to consist entirely of
99             # lower-case letters. The approach is from
100             #
101             # M. F. Porter, An algorithm for suffix stripping, Program (Automated
102             # Library and Information Systems) 14 (3) 130-7, July 1980.
103             #
104             # This algorithm is used by WAIS: for example, see freeWAIS-0.3 at
105             #
106             # http://kudzu.cnidr.org/cnidr_projects/cnidr_projects.html
107              
108             # Some additional rules are used here, mainly to allow for British spellings
109             # like -ise. They are marked ** in the code.
110              
111             # Initialization required before using subroutine stem:
112              
113             # We count syllables slightly differently from Porter: we say the syllable
114             # count increases on each occurrence in the word of an adjacent pair
115             #
116             # [aeiouy][^aeiou]
117             #
118             # This avoids any need to define vowels and consonants, or confusion over
119             # 'y'. It also works slightly better: our definition gives two syllables
120             # in 'yttrium', while Porter's gives only one because the initial 'y' is
121             # taken to be a consonant. But it is not quite obvious: for example,
122             # consider 'mayfly' where, when working backwards (see below), the 'yf'
123             # matches the above pattern, even though it is the 'ay' which in Porter's
124             # terms increments the syllable count.
125             #
126             # We wish to match the above in context, working backwards from the end of
127             # the word: the appropriate regular expression is
128              
129             my $syl = '[aeiou]*[^aeiou][^aeiouy]*[aeiouy]';
130              
131             # (This works because [^aeiouy] is a subset of [^aeiou].) If we want two
132             # syllables ("m>1" in Porter's terminology) we can just match $syl$syl.
133              
134             # For step 1b we need to be able to detect the presence of a vowel: here
135             # we revert to Porter's definition that a vowel is [aeiou], or y preceded
136             # by a consonant. (If the . below is a vowel, then the . is the desired
137             # vowel; if the . is a consonant the y is the desired vowel.)
138              
139             my $hasvow = '[^aeiouy]*([aeiou]|y.)';
140              
141             =head1 METHODS
142              
143             =cut
144              
145             #######################################################################
146              
147             =over 4
148              
149             =item stem({ -words => \@words, -locale => 'en', -exceptions => \%exceptions });
150              
151             Stems a list of passed words using the rules of US English. Returns
152             an anonymous array reference to the stemmed words.
153              
154             Example:
155              
156             my @words = ( 'wordy', 'another' );
157             my $stemmed_words = Lingua::Stem::En::stem({ -words => \@words,
158             -locale => 'en',
159             -exceptions => \%exceptions,
160             });
161              
162             If the first element of @words is a list reference, then the stemming is performed 'in place'
163             on that list (modifying the passed list directly instead of copying it to a new array).
164              
165             This is only useful if you do not need to keep the original list. If you B need to keep
166             the original list, use the normal semantic of having 'stem' return a new list instead - that
167             is faster than making your own copy B using the 'in place' semantics since the primary
168             difference between 'in place' and 'by value' stemming is the creation of a copy of the original
169             list. If you B need the original list, then the 'in place' stemming is about 60% faster.
170              
171             Example of 'in place' stemming:
172              
173             my $words = [ 'wordy', 'another' ];
174             my $stemmed_words = Lingua::Stem::En::stem({ -words => [$words],
175             -locale => 'en',
176             -exceptions => \%exceptions,
177             });
178              
179             The 'in place' mode returns a reference to the original list with the words stemmed.
180              
181             =back
182              
183             =cut
184              
185             sub stem {
186 18 50   18 1 43 return [] if ($#_ == -1);
187 18         21 my $parm_ref;
188 18 50       38 if (ref $_[0]) {
189 18         21 $parm_ref = shift;
190             } else {
191 0         0 $parm_ref = { @_ };
192             }
193            
194 18         30 my $words = [];
195 18         26 my $locale = 'en';
196 18         24 my $exceptions = {};
197 18         57 foreach (keys %$parm_ref) {
198 54         75 my $key = lc ($_);
199 54         79 my $value = $parm_ref->{$key};
200 54 100       107 if ($key eq '-words') {
    100          
    50          
201 18         48 @$words = @$value;
202 18 100       51 if (ref($words->[0]) eq 'ARRAY'){
203 3         9 $words = $words->[0];
204             }
205             } elsif ($key eq '-exceptions') {
206 18         35 $exceptions = $parm_ref->{$key};
207             } elsif ($key eq '-locale') {
208 18         45 $locale = $parm_ref->{$key};
209             } else {
210 0         0 croak (__PACKAGE__ . "::stem() - Unknown parameter '$key' with value '$parm_ref->{$key}'\n");
211             }
212             }
213            
214 18         29 local( $_ );
215              
216 18         29 foreach (@$words) {
217              
218             # Flatten case
219 180         281 $_ = lc $_;
220              
221             # Check against cache of stemmed words
222 180 50       269 if (exists $Stem_Cache2{$_}) {
223 0         0 $_ = $Stem_Cache2{$_};
224 0         0 next;
225             }
226              
227             # Check against exceptions list
228 180 100       270 if (exists $exceptions->{$_}) {
229 6         12 $_ = $exceptions->{$_};
230 6         11 next;
231             }
232              
233 174         195 my $original_word = $_;
234              
235             # Step 0 - remove punctuation
236 174         223 s/'s$//; s/^[^a-z]+//; s/[^a-z]+$//;
  174         274  
  174         228  
237 174 50       419 next unless /^[a-z]+$/;
238              
239             # Reverse the word so we can easily apply pattern matching to the end:
240 174         271 $_ = reverse $_;
241            
242             # Step 1a: plurals -- sses->ss, ies->i, ss->ss, s->0
243            
244 174 100 50     313 m!^s! && ( s!^se(ss|i)!$1! || s!^s([^s])!$1! );
245            
246             # Step 1b: participles -- SYLeed->SYLee, VOWed->VOW, VOWing->VOW;
247             # but ated->ate etc
248            
249 174 50 50     661 s!^dee($syl)!ee$1!o ||
      100        
250             (
251             s!^(de|gni)($hasvow)!$2!o &&
252             (
253             # at->ate, bl->ble, iz->ize, is->ise
254             s!^(ta|lb|[sz]i)!e$1! || # ** ise as well as ize
255             # CC->C (C consonant other than l, s, z)
256             s!^([^aeioulsz])\1!$1! ||
257             # (m=1) CVD->CVDe (C consonant, V vowel, D consonant not w, x, y)
258             s!^([^aeiouwxy][aeiouy][^aeiou]+)$!e$1!
259             )
260             );
261            
262             # Step 1c: change y to i: happy->happi, sky->sky
263            
264 174         372 s!^y($hasvow)!i$1!o;
265            
266             # Step 2: double and triple suffices (part 1)
267            
268             # Switch on last three letters (fails harmlessly if subroutine undefined) --
269             # thanks to Ian Phillipps who wrote
270             # CPAN authors/id/IANPX/Stem-0.1.tar.gz
271             # for suggesting the replacement of
272             # eval( '&S2' . unpack( 'a3', $_ ) );
273             # (where the eval ignores undefined subroutines) by the much faster
274             # eval { &{ 'S2' . substr( $_, 0, 3 ) } };
275             # But the following is slightly faster still:
276              
277             {
278 2     2   15 no strict 'refs';
  2         3  
  2         3130  
  174         204  
279            
280 174         247 my $sub;
281            
282             # Step 3: double and triple suffices, etc (part 2)
283              
284 174 50       175 &$sub if defined &{ $sub = '_S2' . substr( $_, 0, 3 ) };
  174         665  
285            
286             # Step 3: double and triple suffices, etc (part 2)
287            
288 174 50       222 &$sub if defined &{ $sub = '_S3' . substr( $_, 0, 3 ) };
  174         529  
289            
290             # Step 4: single suffices on polysyllables
291            
292 174 100       250 &$sub if defined &{ $sub = '_S4' . substr( $_, 0, 2 ) };
  174         549  
293            
294             }
295             # Step 5a: tidy up final e -- probate->probat, rate->rate; cease->ceas
296            
297 174 100 50     585 m!^e! && ( s!^e($syl$syl)!$1!o ||
      50        
298            
299             # Porter's ( m=1 and not *o ) E where o = cvd with d a consonant
300             # not w, x or y:
301            
302             ! m!^e[^aeiouwxy][aeiouy][^aeiou]! && # not *o E
303             s!^e($syl[aeiouy]*[^aeiou]*)$!$1!o # m=1
304             );
305            
306             # Step 5b: double l -- controll->control, roll->roll
307             # ** Note correction: Porter has m>1 here ($syl$syl), but it seems m>0
308             # ($syl) is wanted to strip an l off controll.
309            
310 174         252 s!^ll($syl)!l$1!o;
311            
312 174         261 $_ = scalar( reverse $_ );
313              
314 174 50       322 $Stem_Cache2{$original_word} = $_ if $Stem_Caching;
315             }
316 18 50       52 %Stem_Cache2 = () if ($Stem_Caching < 2);
317            
318 18         61 return $words;
319             }
320              
321             ##############################################################
322             # Rule set 4
323              
324             sub _S4la {
325             # SYLSYLal -> SYLSYL
326 0     0   0 s!^la($syl$syl)!$1!o;
327             }
328              
329             sub _S4ec {
330             # SYLSYL[ae]nce -> SYLSYL
331 0     0   0 s!^ecn[ae]($syl$syl)!$1!o;
332             }
333              
334             sub _S4re {
335             # SYLSYLer -> SYLSYL
336 18     18   85 s!^re($syl$syl)!$1!o;
337             }
338              
339             sub _S4ci {
340             # SYLSYLic -> SYLSYL
341 0     0     s!^ci($syl$syl)!$1!o;
342             }
343              
344             sub _S4el {
345             # SYLSYL[ai]ble -> SYLSYL
346 0     0     s!^elb[ai]($syl$syl)!$1!o;
347             }
348              
349             sub _S4tn {
350             # SYLSYLant -> SYLSYL, SYLSYLe?ment -> SYLSYL, SYLSYLent -> SYLSYL
351 0     0     s!^tn(a|e(me?)?)($syl$syl)!$3!o;
352             }
353             sub _S4no {
354             # SYLSYL[st]ion -> SYLSYL[st]
355 0     0     s!^noi([st]$syl$syl)!$1!o;
356             }
357              
358             sub _S4uo {
359             # SYLSYLou -> SYLSYL e.g. homologou -> homolog
360 0     0     s!^uo($syl$syl)!$1!o;
361             }
362              
363             sub _S4ms {
364             # SYLSYLism -> SYLSYL
365 0     0     s!^msi($syl$syl)!$1!o;
366             }
367              
368             sub _S4et {
369             # SYLSYLate -> SYLSYL
370 0     0     s!^eta($syl$syl)!$1!o;
371             }
372              
373             sub _S4it {
374             # SYLSYLiti -> SYLSYL
375 0     0     s!^iti($syl$syl)!$1!o;
376             }
377              
378             sub _S4su {
379             # SYLSYLous -> SYLSYL
380 0     0     s!^suo($syl$syl)!$1!o;
381             }
382              
383             sub _S4ev {
384             # SYLSYLive -> SYLSYL
385 0     0     s!^evi($syl$syl)!$1!o;
386             }
387              
388             sub _S4ez {
389             # SYLSYLize -> SYLSYL
390 0     0     s!^ezi($syl$syl)!$1!o;
391             }
392              
393             sub _S4es {
394             # SYLSYLise -> SYLSYL **
395 0     0     s!^esi($syl$syl)!$1!o;
396             }
397              
398             ##############################################################
399             # Rule set 2
400              
401             sub _S2lan {
402             # SYLational -> SYLate, SYLtional -> SYLtion
403 0 0   0     s!^lanoita($syl)!eta$1!o || s!^lanoit($syl)!noit$1!o;
404             }
405              
406             sub _S2icn {
407             # SYLanci -> SYLance, SYLency ->SYLence
408 0     0     s!^icn([ae]$syl)!ecn$1!o;
409             }
410              
411             sub _S2res {
412             # SYLiser -> SYLise **
413 0     0     &_S2rez;
414             }
415              
416             sub _S2rez {
417             # SYLizer -> SYLize
418 0     0     s!^re(.)i($syl)!e$1i$2!o;
419             }
420              
421             sub _S2ilb {
422             # SYLabli -> SYLable, SYLibli -> SYLible ** (e.g. incredibli)
423 0     0     s!^ilb([ai]$syl)!elb$1!o;
424             }
425              
426             sub _S2ill {
427             # SYLalli -> SYLal
428 0     0     s!^illa($syl)!la$1!o;
429             }
430              
431             sub _S2ilt {
432             # SYLentli -> SYLent
433 0     0     s!^iltne($syl)!tne$1!o
434             }
435              
436             sub _S2ile {
437             # SYLeli -> SYLe
438 0     0     s!^ile($syl)!e$1!o;
439             }
440              
441             sub _S2ils {
442             # SYLousli -> SYLous
443 0     0     s!^ilsuo($syl)!suo$1!o;
444             }
445              
446             sub _S2noi {
447             # SYLization -> SYLize, SYLisation -> SYLise**, SYLation -> SYLate
448 0 0   0     s!^noita([sz])i($syl)!e$1i$2!o || s!^noita($syl)!eta$1!o;
449             }
450              
451             sub _S2rot {
452             # SYLator -> SYLate
453 0     0     s!^rota($syl)!eta$1!o;
454             }
455              
456             sub _S2msi {
457             # SYLalism -> SYLal
458 0     0     s!^msila($syl)!la$1!o;
459             }
460              
461             sub _S2sse {
462             # SYLiveness -> SYLive, SYLfulness -> SYLful, SYLousness -> SYLous
463 0     0     s!^ssen(evi|luf|suo)($syl)!$1$2!o;
464             }
465              
466             sub _S2iti {
467             # SYLaliti -> SYLal, SYLiviti -> SYLive, SYLbiliti ->SYLble
468 0 0   0     s!^iti(la|lib|vi)($syl)! ( $1 eq 'la' ? 'la' : $1 eq 'lib' ? 'elb' : 'evi' )
  0 0          
469             . $2 !eo;
470             }
471              
472             ##############################################################
473             # Rule set 3
474              
475             sub _S3eta {
476             # SYLicate -> SYLic
477 0     0     s!^etaci($syl)!ci$1!o;
478             }
479              
480             sub _S3evi {
481             # SYLative -> SYL
482 0     0     s!^evita($syl)!$1!o;
483             }
484              
485             sub _S3ezi
486             {
487             # SYLalize -> SYLal
488 0     0     s!^ezila($syl)!la$1!o;
489             }
490              
491             sub _S3esi {
492             # SYLalise -> SYLal **
493 0     0     s!^esila($syl)!la$1!o;
494             }
495              
496             sub _S3iti {
497             # SYLiciti -> SYLic
498 0     0     s!^itici($syl)!ci$1!o;
499             }
500              
501             sub _S3lac {
502             # SYLical -> SYLic
503 0     0     s!^laci($syl)!ci$1!o;
504             }
505             sub _S3luf {
506             # SYLful -> SYL
507 0     0     s!^luf($syl)!$1!o;
508             }
509              
510             sub _S3sse {
511             # SYLness -> SYL
512 0     0     s!^ssen($syl)!$1!o;
513             }
514              
515              
516             ##############################################################
517              
518             =over 4
519              
520             =item stem_caching({ -level => 0|1|2 });
521              
522             Sets the level of stem caching.
523              
524             '0' means 'no caching'. This is the default level.
525              
526             '1' means 'cache per run'. This caches stemming results during a single
527             call to 'stem'.
528              
529             '2' means 'cache indefinitely'. This caches stemming results until
530             either the process exits or the 'clear_stem_cache' method is called.
531              
532             =back
533              
534             =cut
535              
536             sub stem_caching {
537 0     0 1   my $parm_ref;
538 0 0         if (ref $_[0]) {
539 0           $parm_ref = shift;
540             } else {
541 0           $parm_ref = { @_ };
542             }
543 0           my $caching_level = $parm_ref->{-level};
544 0 0         if (defined $caching_level) {
545 0 0         if ($caching_level !~ m/^[012]$/) {
546 0           croak(__PACKAGE__ . "::stem_caching() - Legal values are '0','1' or '2'. '$caching_level' is not a legal value");
547             }
548 0           $Stem_Caching = $caching_level;
549 0 0         if ($caching_level < 2) {
550 0           %Stem_Cache2 = ();
551             }
552             }
553 0           return $Stem_Caching;
554             }
555            
556             ##############################################################
557              
558             =over 4
559              
560             =item clear_stem_cache;
561              
562             Clears the cache of stemmed words
563              
564             =back
565              
566             =cut
567              
568             sub clear_stem_cache {
569 0     0 1   %Stem_Cache2 = ();
570             }
571              
572             ##############################################################
573              
574             =head1 NOTES
575              
576             This code is almost entirely derived from the Porter 2.1 module
577             written by Jim Richardson.
578              
579             =head1 SEE ALSO
580              
581             Lingua::Stem
582              
583             =head1 AUTHOR
584              
585             Jim Richardson, University of Sydney
586             jimr@maths.usyd.edu.au or http://www.maths.usyd.edu.au:8000/jimr.html
587              
588             Integration in Lingua::Stem by
589             Jerilyn Franz, FreeRun Technologies,
590            
591              
592             =head1 COPYRIGHT
593              
594             Jim Richardson, University of Sydney
595             Jerilyn Franz, FreeRun Technologies
596              
597             This code is freely available under the same terms as Perl.
598              
599             =head1 BUGS
600              
601             =head1 TODO
602              
603             =cut
604              
605             1;