line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Lingua::Stem::Es; |
2
|
|
|
|
|
|
|
|
3
|
2
|
|
|
2
|
|
86877
|
use Carp; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
230
|
|
4
|
|
|
|
|
|
|
|
5
|
2
|
|
|
2
|
|
10
|
use warnings; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
53
|
|
6
|
2
|
|
|
2
|
|
10
|
use strict; |
|
2
|
|
|
|
|
9
|
|
|
2
|
|
|
|
|
68
|
|
7
|
|
|
|
|
|
|
|
8
|
2
|
|
|
2
|
|
2486
|
use utf8; |
|
2
|
|
|
|
|
22
|
|
|
2
|
|
|
|
|
13
|
|
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
require Exporter; |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
our @ISA = qw(Exporter); |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
our %EXPORT_TAGS = (); |
15
|
|
|
|
|
|
|
our @EXPORT_OK = qw (stem stem_word clear_stem_cache stem_caching); |
16
|
|
|
|
|
|
|
our @EXPORT = (); |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
our $VERSION = '0.04'; |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
our $DEBUG = 0; |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
my $Stem_Caching = 0; |
23
|
|
|
|
|
|
|
my $Stem_Cache = {}; |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
my $vowels = 'aeiouáéíóúü'; |
26
|
|
|
|
|
|
|
my $consonants = 'bcdfghjklmnñpqrstvwxyz'; |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
my $revowel = qr/[$vowels]/; |
29
|
|
|
|
|
|
|
my $reconsonants = qr/[$consonants]/; |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
sub stem { |
32
|
2
|
50
|
|
2
|
1
|
133117
|
return [] if ( $#_ == -1 ); |
33
|
2
|
|
|
|
|
7
|
my $parm_ref; |
34
|
2
|
50
|
|
|
|
12
|
if ( ref $_[0] ) { |
35
|
0
|
|
|
|
|
0
|
$parm_ref = shift; |
36
|
|
|
|
|
|
|
} |
37
|
|
|
|
|
|
|
else { |
38
|
2
|
|
|
|
|
11
|
$parm_ref = {@_}; |
39
|
|
|
|
|
|
|
} |
40
|
|
|
|
|
|
|
|
41
|
2
|
|
|
|
|
6
|
my $words = []; |
42
|
2
|
|
|
|
|
6
|
my $locale = 'es'; |
43
|
2
|
|
|
|
|
7
|
my $exceptions = {}; |
44
|
2
|
|
|
|
|
13
|
foreach ( keys %$parm_ref ) { |
45
|
2
|
|
|
|
|
9
|
my $key = lc($_); |
46
|
2
|
50
|
|
|
|
13
|
if ( $key eq '-words' ) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
47
|
2
|
|
|
|
|
4
|
@$words = @{ $parm_ref->{$key} }; |
|
2
|
|
|
|
|
11640
|
|
48
|
|
|
|
|
|
|
} |
49
|
|
|
|
|
|
|
elsif ( $key eq '-exceptions' ) { |
50
|
0
|
|
|
|
|
0
|
$exceptions = $parm_ref->{$key}; |
51
|
|
|
|
|
|
|
} |
52
|
|
|
|
|
|
|
elsif ( $key eq '-locale' ) { |
53
|
0
|
|
|
|
|
0
|
$locale = $parm_ref->{$key}; |
54
|
|
|
|
|
|
|
} |
55
|
|
|
|
|
|
|
else { |
56
|
0
|
|
|
|
|
0
|
croak( __PACKAGE__ |
57
|
|
|
|
|
|
|
. "::stem() - Unknown parameter '$key' with value '$parm_ref->{$key}'\n" |
58
|
|
|
|
|
|
|
); |
59
|
|
|
|
|
|
|
} |
60
|
|
|
|
|
|
|
} |
61
|
|
|
|
|
|
|
|
62
|
2
|
|
|
|
|
10
|
local ($_); |
63
|
2
|
|
|
|
|
12
|
foreach (@$words) { |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
# Check against exceptions list |
66
|
56780
|
50
|
|
|
|
139614
|
if ( exists $exceptions->{$_} ) { |
67
|
0
|
|
|
|
|
0
|
$_ = $exceptions->{$_}; |
68
|
0
|
|
|
|
|
0
|
next; |
69
|
|
|
|
|
|
|
} |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
# Cache and stem |
72
|
56780
|
|
|
|
|
74006
|
my $original_word = $_; |
73
|
56780
|
|
|
|
|
106873
|
$_ = stem_word($_); |
74
|
56780
|
50
|
|
|
|
164802
|
$Stem_Cache->{$original_word} = $_ if $Stem_Caching; |
75
|
|
|
|
|
|
|
} |
76
|
2
|
50
|
|
|
|
15
|
$Stem_Cache = {} if ( $Stem_Caching < 2 ); |
77
|
|
|
|
|
|
|
|
78
|
2
|
|
|
|
|
20
|
return $words; |
79
|
|
|
|
|
|
|
} |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
sub stem_word { |
82
|
56780
|
|
|
56780
|
1
|
69179
|
my $word = shift; |
83
|
|
|
|
|
|
|
|
84
|
56780
|
50
|
|
|
|
141050
|
print "*****************\nOriginal: $word\n" if $DEBUG; |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
# Flatten case |
87
|
56780
|
|
|
|
|
92791
|
$word =~ s/Á/á/g; |
88
|
56780
|
|
|
|
|
68847
|
$word =~ s/É/é/g; |
89
|
56780
|
|
|
|
|
69207
|
$word =~ s/Í/í/g; |
90
|
56780
|
|
|
|
|
89819
|
$word =~ s/Ó/ó/g; |
91
|
56780
|
|
|
|
|
67722
|
$word =~ s/Ú/ú/g; |
92
|
56780
|
|
|
|
|
77124
|
$word =~ s/Ü/ü/g; |
93
|
56780
|
|
|
|
|
70381
|
$word =~ s/Ñ/ñ/g; |
94
|
56780
|
|
|
|
|
99380
|
$word = lc $word; |
95
|
56780
|
50
|
|
|
|
124284
|
print "Flatened word: $word\n" if $DEBUG; |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
# Check against cache of stemmed words |
98
|
56780
|
50
|
33
|
|
|
127313
|
if ( $Stem_Caching && exists $Stem_Cache->{$word} ) { |
99
|
0
|
|
|
|
|
0
|
return $Stem_Cache->{$word}; |
100
|
|
|
|
|
|
|
} |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
# Remove punctuation |
103
|
56780
|
|
|
|
|
249594
|
$word =~ s/[^$vowels$consonants]//g; |
104
|
56780
|
100
|
|
|
|
123274
|
return '' unless $word; |
105
|
56754
|
50
|
|
|
|
103684
|
print "Removed punctuation: $word\n" if $DEBUG; |
106
|
|
|
|
|
|
|
|
107
|
56754
|
|
|
|
|
98711
|
my $RV = define_RV($word); |
108
|
56754
|
|
|
|
|
80836
|
my $suffix; |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
############################################################ |
111
|
|
|
|
|
|
|
########### Step 0 ########### |
112
|
|
|
|
|
|
|
############################################################ |
113
|
|
|
|
|
|
|
# Attached pronoun |
114
|
|
|
|
|
|
|
# Search for the longest among the following suffixes: |
115
|
|
|
|
|
|
|
# me se sela selo selas selos la le lo las les los nos |
116
|
|
|
|
|
|
|
# and delete it, if it comes after one of |
117
|
|
|
|
|
|
|
# a) iéndo ándo ár ér ír |
118
|
|
|
|
|
|
|
# b) ando iendo ar er ir |
119
|
|
|
|
|
|
|
# c) yendo following u |
120
|
|
|
|
|
|
|
# in RV. In the case of c), yendo must lie in RV, but the preceding u can |
121
|
|
|
|
|
|
|
# be outside it. |
122
|
|
|
|
|
|
|
# In the case of a), deletion is followed by removing the acute accent. |
123
|
|
|
|
|
|
|
# Always do step 0 |
124
|
|
|
|
|
|
|
|
125
|
56754
|
100
|
|
|
|
114436
|
if ($RV) { |
126
|
55420
|
|
|
|
|
199787
|
my $pronoun = |
127
|
|
|
|
|
|
|
qr/(selas|selos|sela|selo|las|les|los|nos|me|se|la|le|lo)$/; |
128
|
|
|
|
|
|
|
|
129
|
55420
|
100
|
33
|
|
|
799690
|
if ( ($suffix) = $RV =~ /(?:ándo|iéndo|ár|ér|ír)($pronoun)$/ ) { |
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
# Case a) |
132
|
234
|
|
|
|
|
2963
|
$word =~ s/$suffix$//; |
133
|
234
|
|
|
|
|
840
|
$word =~ s/á/a/; |
134
|
234
|
|
|
|
|
541
|
$word =~ s/é/e/; |
135
|
234
|
|
|
|
|
454
|
$word =~ s/í/i/; |
136
|
234
|
|
|
|
|
439
|
$word =~ s/ó/o/; |
137
|
234
|
|
|
|
|
404
|
$word =~ s/ú/u/; |
138
|
234
|
|
|
|
|
391
|
$word =~ s/ü/u/; |
139
|
234
|
50
|
|
|
|
863
|
print "Step 0 case a: $word\n" if $DEBUG; |
140
|
|
|
|
|
|
|
} |
141
|
|
|
|
|
|
|
elsif ( ($suffix) = $RV =~ /(?:ando|iendo|ar|er|ir)($pronoun)$/ ) { |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
# Case b) |
144
|
1684
|
|
|
|
|
23267
|
$word =~ s/$suffix$//; |
145
|
1684
|
50
|
|
|
|
7031
|
print "Step 0 case b: $word\n" if $DEBUG; |
146
|
|
|
|
|
|
|
} |
147
|
|
|
|
|
|
|
elsif ( ($suffix) = |
148
|
|
|
|
|
|
|
$word =~ /uyendo($pronoun)$/ and $RV =~ /yendo$pronoun$/ ) |
149
|
|
|
|
|
|
|
{ |
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
# Case c) |
152
|
0
|
|
|
|
|
0
|
$word =~ s/$suffix$//; |
153
|
0
|
0
|
|
|
|
0
|
print "Step 0 case c: $word\n" if $DEBUG; |
154
|
|
|
|
|
|
|
} |
155
|
|
|
|
|
|
|
} |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
############################################################ |
158
|
|
|
|
|
|
|
########### Step 1 ########### |
159
|
|
|
|
|
|
|
############################################################ |
160
|
|
|
|
|
|
|
# Standard suffix removal |
161
|
|
|
|
|
|
|
# Search for the longest among the following suffixes, and perform the |
162
|
|
|
|
|
|
|
# action indicated. |
163
|
|
|
|
|
|
|
# Always do step 1 |
164
|
|
|
|
|
|
|
|
165
|
56754
|
|
|
|
|
118644
|
$RV = define_RV($word); |
166
|
56754
|
|
|
|
|
121383
|
my $R1 = define_R1($word); |
167
|
56754
|
|
|
|
|
114220
|
my $R2 = define_R2($word); |
168
|
|
|
|
|
|
|
|
169
|
56754
|
100
|
100
|
|
|
868733
|
if ( |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
170
|
|
|
|
|
|
|
($suffix) = $R2 =~ |
171
|
|
|
|
|
|
|
/(amientos|imientos|amiento|imiento|anzas|ismos|ables|ibles|istas| |
172
|
|
|
|
|
|
|
anza|icos|icas|ismo|able|ible|ista|osos|osas|ico|ica|oso|osa)$/x |
173
|
|
|
|
|
|
|
) |
174
|
|
|
|
|
|
|
{ |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
# anza anzas ico ica icos icas ismo ismos able ables ible ibles ista istas |
177
|
|
|
|
|
|
|
# oso osa osos osas amiento amientos imiento imientos |
178
|
|
|
|
|
|
|
# delete if in R2 |
179
|
2346
|
|
|
|
|
30433
|
$word =~ s/$suffix$//; |
180
|
2346
|
50
|
|
|
|
6755
|
print "Step 1 case 1: $word\n" if $DEBUG; |
181
|
|
|
|
|
|
|
} |
182
|
|
|
|
|
|
|
elsif ( ($suffix) = |
183
|
|
|
|
|
|
|
$R2 =~ /(aciones|adores|adoras|adora|antes?|ancias?|ación|ador)$/ ) |
184
|
|
|
|
|
|
|
{ |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
# adora ador ación adoras adores aciones |
187
|
|
|
|
|
|
|
# delete if in R2 |
188
|
|
|
|
|
|
|
# if preceded by ic, delete if in R2 |
189
|
1766
|
100
|
|
|
|
35967
|
if ( $R2 =~ /ic$suffix$/ ) { |
190
|
86
|
|
|
|
|
1063
|
$word =~ s/ic$suffix$//; |
191
|
|
|
|
|
|
|
} |
192
|
|
|
|
|
|
|
else { |
193
|
1680
|
|
|
|
|
19765
|
$word =~ s/$suffix$//; |
194
|
|
|
|
|
|
|
} |
195
|
1766
|
50
|
|
|
|
5402
|
print "Step 1 case 2: $word\n" if $DEBUG; |
196
|
|
|
|
|
|
|
} |
197
|
|
|
|
|
|
|
elsif ( ($suffix) = $R2 =~ /(logías?)$/ ) { |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
# logía logías |
200
|
|
|
|
|
|
|
# replace with log if in R2 |
201
|
32
|
|
|
|
|
277
|
$word =~ s/$suffix$/log/; |
202
|
32
|
50
|
|
|
|
167
|
print "Step 1 case 3: $word\n" if $DEBUG; |
203
|
|
|
|
|
|
|
} |
204
|
|
|
|
|
|
|
elsif ( ($suffix) = $R2 =~ /uci(ones|ón)$/ ) { |
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
# ución uciones |
207
|
|
|
|
|
|
|
# replace with u if in R2 |
208
|
42
|
|
|
|
|
491
|
$word =~ s/uci$suffix$/u/; |
209
|
42
|
50
|
|
|
|
131
|
print "Step 1 case 4: $word\n" if $DEBUG; |
210
|
|
|
|
|
|
|
} |
211
|
|
|
|
|
|
|
elsif ( ($suffix) = $R2 =~ /(encias?)$/ ) { |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
# encia encias |
214
|
|
|
|
|
|
|
# replace with ente if in R2 |
215
|
246
|
|
|
|
|
2301
|
$word =~ s/$suffix$/ente/; |
216
|
246
|
50
|
|
|
|
804
|
print "Step 1 case 5: $word\n" if $DEBUG; |
217
|
|
|
|
|
|
|
} |
218
|
|
|
|
|
|
|
elsif ( $R1 =~ /amente$/ ) { |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
# delete if in R1 |
221
|
|
|
|
|
|
|
# if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2) |
222
|
|
|
|
|
|
|
# otherwise, |
223
|
|
|
|
|
|
|
# if preceded by os, ic or ad, delete if in R2 |
224
|
412
|
100
|
|
|
|
2361
|
if ( ($suffix) = $R2 =~ /(os|ic|ad)amente$/ ) { |
|
|
100
|
|
|
|
|
|
225
|
120
|
|
|
|
|
1553
|
$word =~ s/($suffix)amente$//; |
226
|
|
|
|
|
|
|
} |
227
|
|
|
|
|
|
|
elsif ( ($suffix) = $R2 =~ /((?:at(?=iv))?(?:iv))amente$/ ) { |
228
|
38
|
|
|
|
|
453
|
$word =~ s/($suffix)amente$//; |
229
|
|
|
|
|
|
|
} |
230
|
|
|
|
|
|
|
else { |
231
|
254
|
|
|
|
|
1244
|
$word =~ s/amente$//; |
232
|
|
|
|
|
|
|
} |
233
|
412
|
50
|
|
|
|
1116
|
print "Step 1 case 6: $word\n" if $DEBUG; |
234
|
|
|
|
|
|
|
} |
235
|
|
|
|
|
|
|
elsif ( $R2 =~ /mente$/ ) { |
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
# mente |
238
|
|
|
|
|
|
|
# delete if in R2 |
239
|
|
|
|
|
|
|
# if preceded by able, ante or ible, delete if in R2 |
240
|
256
|
100
|
|
|
|
985
|
if ( ($suffix) = $R2 =~ /([ai]ble|ante)mente$/ ) { |
241
|
38
|
|
|
|
|
541
|
$word =~ s/($suffix)mente$//; |
242
|
|
|
|
|
|
|
} |
243
|
|
|
|
|
|
|
else { |
244
|
218
|
|
|
|
|
934
|
$word =~ s/mente$//; |
245
|
|
|
|
|
|
|
} |
246
|
256
|
50
|
|
|
|
629
|
print "Step 1 case 7: $word\n" if $DEBUG; |
247
|
|
|
|
|
|
|
} |
248
|
|
|
|
|
|
|
elsif ( $R2 =~ /idad(es)?$/ ) { |
249
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
# idad idades |
251
|
|
|
|
|
|
|
# delete if in R2 |
252
|
|
|
|
|
|
|
# if preceded by abil, ic or iv, delete if in R2 |
253
|
436
|
100
|
|
|
|
1900
|
if ( ($suffix) = $R2 =~ /(abil|ic|iv)idad(es)?$/ ) { |
254
|
56
|
|
|
|
|
322
|
$word =~ s/(abil|ic|iv)idad(es)?$//; |
255
|
|
|
|
|
|
|
} |
256
|
|
|
|
|
|
|
else { |
257
|
380
|
|
|
|
|
2121
|
$word =~ s/idad(es)?$//; |
258
|
|
|
|
|
|
|
} |
259
|
436
|
50
|
|
|
|
1223
|
print "Step 1 case 8: $word\n" if $DEBUG; |
260
|
|
|
|
|
|
|
} |
261
|
|
|
|
|
|
|
elsif ( ($suffix) = $R2 =~ /(iv[ao]s?)$/ ) { |
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
# iva ivo ivas ivos |
264
|
|
|
|
|
|
|
# delete if in R2 |
265
|
|
|
|
|
|
|
# if preceded by at, delete if in R2 |
266
|
500
|
100
|
|
|
|
11843
|
$R2 =~ /at$suffix$/ ? $word =~ s/at$suffix$// : $word =~ s/$suffix$//; |
267
|
500
|
50
|
|
|
|
2123
|
print "Step 1 case 9: $word\n" if $DEBUG; |
268
|
|
|
|
|
|
|
} |
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
############################################################ |
271
|
|
|
|
|
|
|
########### Step 2a ########### |
272
|
|
|
|
|
|
|
############################################################ |
273
|
|
|
|
|
|
|
# Verb suffixes beginning 'y' |
274
|
|
|
|
|
|
|
# Search for the longest among the following suffixes in RV, and |
275
|
|
|
|
|
|
|
# if found, delete if preceded by u. (Note that the preceding u |
276
|
|
|
|
|
|
|
# need not be in RV). |
277
|
|
|
|
|
|
|
# ya ye yan yen yeron yendo yo yó yas yes yais yamos |
278
|
|
|
|
|
|
|
# Do step 2a if no ending was removed by step 1 |
279
|
|
|
|
|
|
|
elsif ($word =~ /u(yeron|yendo|yamos|yais|ya[ns]?|ye[ns]?|yo|yó)$/ |
280
|
|
|
|
|
|
|
&& $RV =~ /(yeron|yendo|yamos|yais|ya[ns]?|ye[ns]?|yo|yó)$/ ) |
281
|
|
|
|
|
|
|
{ |
282
|
114
|
|
|
|
|
547
|
$word =~ s/u(yeron|yendo|yamos|yais|ya[ns]?|ye[ns]?|yo|yó)$/u/; |
283
|
114
|
50
|
|
|
|
292
|
print "Step 2a: $word\n" if $DEBUG; |
284
|
|
|
|
|
|
|
} |
285
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
############################################################ |
287
|
|
|
|
|
|
|
########### Step 2b ########### |
288
|
|
|
|
|
|
|
############################################################ |
289
|
|
|
|
|
|
|
# Other verb suffixes |
290
|
|
|
|
|
|
|
# Search for the longest among the following suffixes in RV, and |
291
|
|
|
|
|
|
|
# perform the action indicated. |
292
|
|
|
|
|
|
|
# Do step 2b if step 2a was done but failed to remove a suffix. |
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
elsif ( |
295
|
|
|
|
|
|
|
($suffix) = |
296
|
|
|
|
|
|
|
$RV =~ /(iésemos|iéramos|iríamos|eríamos|aríamos|ásemos| |
297
|
|
|
|
|
|
|
áramos|ábamos|isteis|asteis|ieseis|ierais|iremos|iríais|eremos|eríais|aremos| |
298
|
|
|
|
|
|
|
aríais|aseis|arais|abais|ieses|ieras|iendo|ieron|iesen|ieran|iréis|irías|irían| #ancias?| |
299
|
|
|
|
|
|
|
eréis|erías|erían|aréis|arías|arían|íamos|imos|amos|idos|ados|íais|ases|aras|idas| #antes?| |
300
|
|
|
|
|
|
|
adas|abas|ando|aron|asen|aran|aban|iste|aste|iese|iera|iría|irás|irán|ería|erás|erán| |
301
|
|
|
|
|
|
|
aría|arás|arán|áis|ías|ido|ado|ían|ase|ara|ida|ada|aba|iré|irá|eré|erá|aré| |
302
|
|
|
|
|
|
|
ará|ís|as|ir|er|ar|ió|an|id|ed|ad|ía)$/x |
303
|
|
|
|
|
|
|
) |
304
|
|
|
|
|
|
|
{ |
305
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
# delete |
307
|
20092
|
|
|
|
|
276671
|
$word =~ s/$suffix$//; |
308
|
20092
|
50
|
|
|
|
60969
|
print "Step 2b1: $word\n" if $DEBUG; |
309
|
|
|
|
|
|
|
} |
310
|
|
|
|
|
|
|
elsif ( ($suffix) = $RV =~ /(emos|éis|en|es)$/ ) { |
311
|
|
|
|
|
|
|
|
312
|
|
|
|
|
|
|
# en es éis emos |
313
|
|
|
|
|
|
|
# delete, and if preceded by gu delete the u (the gu need not be in RV) |
314
|
3790
|
100
|
|
|
|
51383
|
$word =~ /gu$suffix$/ |
315
|
|
|
|
|
|
|
? $word =~ s/gu$suffix$/g/ |
316
|
|
|
|
|
|
|
: $word =~ s/$suffix$//; |
317
|
3790
|
50
|
|
|
|
11877
|
print "Step 2b2: $word\n" if $DEBUG; |
318
|
|
|
|
|
|
|
} |
319
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
############################################################ |
321
|
|
|
|
|
|
|
########### Step 3 ########### |
322
|
|
|
|
|
|
|
############################################################ |
323
|
|
|
|
|
|
|
# Residual suffix |
324
|
|
|
|
|
|
|
# Search for the longest among the following suffixes in RV, and |
325
|
|
|
|
|
|
|
# perform the action indicated. |
326
|
|
|
|
|
|
|
# Always do step 3. |
327
|
|
|
|
|
|
|
|
328
|
56754
|
|
|
|
|
111442
|
$RV = define_RV($word); |
329
|
|
|
|
|
|
|
|
330
|
56754
|
100
|
|
|
|
288612
|
if ( ($suffix) = $RV =~ /(os|[aoáíó])$/ ) { |
|
|
100
|
|
|
|
|
|
331
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
# os a o á í ó |
333
|
|
|
|
|
|
|
# delete if in RV |
334
|
15954
|
|
|
|
|
189923
|
$word =~ s/$suffix$//; |
335
|
15954
|
50
|
|
|
|
59374
|
print "Step 3a: $word\n" if $DEBUG; |
336
|
|
|
|
|
|
|
} |
337
|
|
|
|
|
|
|
elsif ( $RV =~ /[eé]$/ ) { |
338
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
# e é |
340
|
|
|
|
|
|
|
# delete if in RV, and if preceded by gu with the u in RV, delete the u. |
341
|
3900
|
100
|
66
|
|
|
13511
|
if ( $word =~ /gu[eé]$/ && $RV =~ /u[eé]$/ ) { |
342
|
60
|
|
|
|
|
270
|
$word =~ s/gu[eé]$/g/; |
343
|
|
|
|
|
|
|
} |
344
|
|
|
|
|
|
|
else { |
345
|
3840
|
|
|
|
|
13165
|
$word =~ s/[eé]$//; |
346
|
|
|
|
|
|
|
} |
347
|
3900
|
50
|
|
|
|
15943
|
print "Step 3b: $word\n" if $DEBUG; |
348
|
|
|
|
|
|
|
} |
349
|
56754
|
50
|
|
|
|
112771
|
print "Before step 4: $word\n" if $DEBUG; |
350
|
|
|
|
|
|
|
############################################################ |
351
|
|
|
|
|
|
|
########### Step 4 ########### |
352
|
|
|
|
|
|
|
############################################################ |
353
|
|
|
|
|
|
|
# Remove the acute accents |
354
|
56754
|
|
|
|
|
89426
|
$word =~ s/á/a/g; |
355
|
56754
|
|
|
|
|
93048
|
$word =~ s/é/e/g; |
356
|
56754
|
|
|
|
|
75199
|
$word =~ s/í/i/g; |
357
|
56754
|
|
|
|
|
65396
|
$word =~ s/ó/o/g; |
358
|
56754
|
|
|
|
|
65502
|
$word =~ s/ú/u/g; |
359
|
56754
|
50
|
|
|
|
103784
|
print "Step 4: $word\n" if $DEBUG; |
360
|
|
|
|
|
|
|
|
361
|
56754
|
|
|
|
|
151749
|
return $word; |
362
|
|
|
|
|
|
|
} |
363
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
sub define_R1 { |
365
|
|
|
|
|
|
|
############################################ |
366
|
|
|
|
|
|
|
######## Find R1 ########### |
367
|
|
|
|
|
|
|
############################################ |
368
|
|
|
|
|
|
|
# R1 is the region after the first non-vowel following a vowel, |
369
|
|
|
|
|
|
|
# or is the null region at the end of the word if there is |
370
|
|
|
|
|
|
|
# no such non-vowel. |
371
|
56754
|
|
|
56754
|
0
|
76834
|
my $word = shift; |
372
|
56754
|
|
|
|
|
57065
|
my $R1; |
373
|
56754
|
|
|
|
|
382077
|
($R1) = $word =~ /^.*?$revowel$reconsonants(.*)$/; |
374
|
56754
|
|
100
|
|
|
148369
|
$R1 ||= ''; |
375
|
56754
|
50
|
|
|
|
107759
|
print "R1: $R1\n" if $DEBUG; |
376
|
56754
|
|
|
|
|
126293
|
return $R1; |
377
|
|
|
|
|
|
|
} |
378
|
|
|
|
|
|
|
|
379
|
|
|
|
|
|
|
sub define_R2 { |
380
|
|
|
|
|
|
|
############################################ |
381
|
|
|
|
|
|
|
######## Find R2 ########### |
382
|
|
|
|
|
|
|
############################################ |
383
|
|
|
|
|
|
|
# R2 is the region after the second non-vowel following a vowel, |
384
|
|
|
|
|
|
|
# or is the null region at the end of the word if there is |
385
|
|
|
|
|
|
|
# no such non-vowel. |
386
|
56754
|
|
|
56754
|
0
|
76139
|
my $word = shift; |
387
|
56754
|
|
|
|
|
54140
|
my $R2; |
388
|
56754
|
|
|
|
|
469891
|
($R2) = $word =~ /^.*?$revowel$reconsonants.*?$revowel$reconsonants(.*)$/; |
389
|
56754
|
|
100
|
|
|
161918
|
$R2 ||= ''; |
390
|
56754
|
50
|
|
|
|
114282
|
print "R2: $R2\n" if $DEBUG; |
391
|
56754
|
|
|
|
|
149626
|
return $R2; |
392
|
|
|
|
|
|
|
} |
393
|
|
|
|
|
|
|
|
394
|
|
|
|
|
|
|
sub define_RV { |
395
|
|
|
|
|
|
|
############################################ |
396
|
|
|
|
|
|
|
######## Find RV ########### |
397
|
|
|
|
|
|
|
############################################ |
398
|
|
|
|
|
|
|
# RV is defined as follows: |
399
|
|
|
|
|
|
|
# If the second letter is a consonant, RV is the region |
400
|
|
|
|
|
|
|
# after the next following vowel. |
401
|
|
|
|
|
|
|
# If the first two letters are vowels, RV is the region |
402
|
|
|
|
|
|
|
# after the next consonant |
403
|
|
|
|
|
|
|
# If the first letter is a consonant and the second a vowel, |
404
|
|
|
|
|
|
|
# RV is the region after the third letter |
405
|
|
|
|
|
|
|
# RV is the end of the word if these positions cannot be found. |
406
|
170262
|
|
|
170262
|
0
|
244709
|
my $word = shift; |
407
|
170262
|
|
|
|
|
193728
|
my $RV; |
408
|
170262
|
100
|
|
|
|
1804568
|
if ( $word =~ /^.$reconsonants.*?$revowel(.*)$/ ) { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
409
|
60844
|
|
|
|
|
114218
|
$RV = $1; |
410
|
60844
|
50
|
|
|
|
137546
|
print "$word -- RV: Case 1 '$RV'\n" if $DEBUG; |
411
|
|
|
|
|
|
|
} |
412
|
|
|
|
|
|
|
elsif ( $word =~ /^$revowel{2,}$reconsonants(.*)$/ ) { |
413
|
1542
|
|
|
|
|
2805
|
$RV = $1; |
414
|
1542
|
50
|
|
|
|
3181
|
print "$word -- RV: Case 2 '$RV'\n" if $DEBUG; |
415
|
|
|
|
|
|
|
} |
416
|
|
|
|
|
|
|
elsif ( $word =~ /^$reconsonants$revowel.(.*)$/ ) { |
417
|
106422
|
|
|
|
|
205956
|
$RV = $1; |
418
|
106422
|
50
|
|
|
|
218792
|
print "$word -- RV: Case 3 '$RV'\n" if $DEBUG; |
419
|
|
|
|
|
|
|
} |
420
|
|
|
|
|
|
|
else { |
421
|
1454
|
|
|
|
|
1847
|
$RV = ''; |
422
|
1454
|
50
|
|
|
|
2785
|
print "$word -- RV: Case 4 '$RV'\n" if $DEBUG; |
423
|
|
|
|
|
|
|
} |
424
|
170262
|
|
|
|
|
539420
|
return $RV; |
425
|
|
|
|
|
|
|
} |
426
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
sub stem_caching { |
428
|
0
|
|
|
0
|
1
|
|
my $parm_ref; |
429
|
0
|
0
|
|
|
|
|
if ( ref $_[0] ) { |
430
|
0
|
|
|
|
|
|
$parm_ref = shift; |
431
|
|
|
|
|
|
|
} |
432
|
|
|
|
|
|
|
else { |
433
|
0
|
|
|
|
|
|
$parm_ref = {@_}; |
434
|
|
|
|
|
|
|
} |
435
|
0
|
|
|
|
|
|
my $caching_level = $parm_ref->{-level}; |
436
|
0
|
0
|
|
|
|
|
if ( defined $caching_level ) { |
437
|
0
|
0
|
|
|
|
|
if ( $caching_level !~ m/^[012]$/ ) { |
438
|
0
|
|
|
|
|
|
croak( __PACKAGE__ |
439
|
|
|
|
|
|
|
. q{::stem_caching() - Legal values are '0','1' or '2'.} |
440
|
|
|
|
|
|
|
. qq{ '$caching_level' is not a legal value) } ); |
441
|
|
|
|
|
|
|
} |
442
|
0
|
|
|
|
|
|
$Stem_Caching = $caching_level; |
443
|
|
|
|
|
|
|
} |
444
|
0
|
|
|
|
|
|
return $Stem_Caching; |
445
|
|
|
|
|
|
|
} |
446
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
sub clear_stem_cache { |
448
|
0
|
|
|
0
|
1
|
|
$Stem_Cache = {}; |
449
|
|
|
|
|
|
|
} |
450
|
|
|
|
|
|
|
|
451
|
|
|
|
|
|
|
1; |
452
|
|
|
|
|
|
|
__END__ |