| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Lingua::Stem::Fr; |
|
2
|
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
843
|
use strict; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
31
|
|
|
4
|
1
|
|
|
1
|
|
10
|
use warnings; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
2373
|
|
|
5
|
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
require Exporter; |
|
7
|
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
our @ISA = qw(Exporter); |
|
9
|
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
# Items to export into callers namespace by default. Note: do not export |
|
11
|
|
|
|
|
|
|
# names by default without a very good reason. Use EXPORT_OK instead. |
|
12
|
|
|
|
|
|
|
# Do not simply export all your public functions/methods/constants. |
|
13
|
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
# This allows declaration use Lingua::Stem::Fr ':all'; |
|
15
|
|
|
|
|
|
|
# If you do not need this, moving things directly into @EXPORT or @EXPORT_OK |
|
16
|
|
|
|
|
|
|
# will save memory. |
|
17
|
|
|
|
|
|
|
our %EXPORT_TAGS = (); |
|
18
|
|
|
|
|
|
|
our @EXPORT_OK = qw (stem stem_word clear_stem_cache stem_caching); |
|
19
|
|
|
|
|
|
|
our @EXPORT = (); |
|
20
|
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
our $VERSION = '0.02'; |
|
22
|
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
my $Stem_Caching = 0; |
|
25
|
|
|
|
|
|
|
my $Stem_Cache = {}; |
|
26
|
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
sub stem { |
|
29
|
3
|
50
|
|
3
|
0
|
237
|
return [] if ($#_ == -1); |
|
30
|
3
|
|
|
|
|
5
|
my $parm_ref; |
|
31
|
3
|
50
|
|
|
|
22
|
if (ref $_[0]) { |
|
32
|
3
|
|
|
|
|
7
|
$parm_ref = shift; |
|
33
|
|
|
|
|
|
|
} else { |
|
34
|
0
|
|
|
|
|
0
|
$parm_ref = { @_ }; |
|
35
|
|
|
|
|
|
|
} |
|
36
|
|
|
|
|
|
|
|
|
37
|
3
|
|
|
|
|
6
|
my $words = []; |
|
38
|
3
|
|
|
|
|
4
|
my $locale = 'fr'; |
|
39
|
3
|
|
|
|
|
7
|
my $exceptions = {}; |
|
40
|
3
|
|
|
|
|
26
|
foreach (keys %$parm_ref) { |
|
41
|
3
|
|
|
|
|
8
|
my $key = lc ($_); |
|
42
|
3
|
50
|
|
|
|
9
|
if ($key eq '-words') { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
43
|
3
|
|
|
|
|
4
|
@$words = @{$parm_ref->{$key}}; |
|
|
3
|
|
|
|
|
16
|
|
|
44
|
|
|
|
|
|
|
} elsif ($key eq '-exceptions') { |
|
45
|
0
|
|
|
|
|
0
|
$exceptions = $parm_ref->{$key}; |
|
46
|
|
|
|
|
|
|
} elsif ($key eq '-locale') { |
|
47
|
0
|
|
|
|
|
0
|
$locale = $parm_ref->{$key}; |
|
48
|
|
|
|
|
|
|
} else { |
|
49
|
0
|
|
|
|
|
0
|
croak (__PACKAGE__ . "::stem() - Unknown parameter '$key' with value '$parm_ref->{$key}'\n"); |
|
50
|
|
|
|
|
|
|
} |
|
51
|
|
|
|
|
|
|
} |
|
52
|
|
|
|
|
|
|
|
|
53
|
3
|
|
|
|
|
6
|
local( $_ ); |
|
54
|
3
|
|
|
|
|
6
|
foreach (@$words) { |
|
55
|
|
|
|
|
|
|
# Flatten case |
|
56
|
12
|
|
|
|
|
22
|
$_ = lc $_; |
|
57
|
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
# Check against exceptions list |
|
59
|
12
|
50
|
|
|
|
29
|
if (exists $exceptions->{$_}) { |
|
60
|
0
|
|
|
|
|
0
|
$_ = $exceptions->{$_}; |
|
61
|
0
|
|
|
|
|
0
|
next; |
|
62
|
|
|
|
|
|
|
} |
|
63
|
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
# Check against cache of stemmed words |
|
65
|
12
|
|
|
|
|
18
|
my $original_word = $_; |
|
66
|
12
|
50
|
33
|
|
|
29
|
if ($Stem_Caching && exists $Stem_Cache->{$original_word}) { |
|
67
|
0
|
|
|
|
|
0
|
$_ = $Stem_Cache->{$original_word}; |
|
68
|
0
|
|
|
|
|
0
|
next; |
|
69
|
|
|
|
|
|
|
} |
|
70
|
|
|
|
|
|
|
|
|
71
|
12
|
|
|
|
|
26
|
$_ = stem_word($_); |
|
72
|
|
|
|
|
|
|
|
|
73
|
12
|
50
|
|
|
|
44
|
$Stem_Cache->{$original_word} = $_ if $Stem_Caching; |
|
74
|
|
|
|
|
|
|
} |
|
75
|
3
|
50
|
|
|
|
15
|
$Stem_Cache = {} if ($Stem_Caching < 2); |
|
76
|
|
|
|
|
|
|
|
|
77
|
3
|
|
|
|
|
23
|
return $words; |
|
78
|
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
} |
|
80
|
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
sub stem_word { |
|
82
|
|
|
|
|
|
|
|
|
83
|
24
|
|
|
24
|
1
|
653
|
our($word) = @_; |
|
84
|
|
|
|
|
|
|
|
|
85
|
24
|
|
|
|
|
49
|
$word = lc $word; |
|
86
|
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
# Check against cache of stemmed words |
|
88
|
24
|
50
|
33
|
|
|
61
|
if ($Stem_Caching && exists $Stem_Cache->{$word}) { |
|
89
|
0
|
|
|
|
|
0
|
return $Stem_Cache->{$word}; |
|
90
|
|
|
|
|
|
|
} |
|
91
|
|
|
|
|
|
|
|
|
92
|
24
|
|
|
|
|
31
|
our($RV, $R1, $R2); |
|
93
|
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
### u, i between vowels into upper case. |
|
96
|
24
|
|
|
|
|
85
|
$word =~ s/([aeiouyâàëéêèïîôûù])([ui])([aeiouyâàëéêèïîôûù])/$1.uc($2).$3/eg; |
|
|
8
|
|
|
|
|
53
|
|
|
97
|
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
### y preceded or followed by a vowel into upper case. |
|
99
|
24
|
|
|
|
|
44
|
$word =~ s/([aeiouyâàëéêèïîôûù])(y)/$1.uc($2)/eg; |
|
|
0
|
|
|
|
|
0
|
|
|
100
|
24
|
|
|
|
|
35
|
$word =~ s/(y)([aeiouyâàëéêèïîôûù])/uc($1).$2/eg; |
|
|
0
|
|
|
|
|
0
|
|
|
101
|
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
### u after q into upper case. |
|
103
|
24
|
|
|
|
|
32
|
$word =~ s/(q)(u)/$1.uc($2)/eg; |
|
|
0
|
|
|
|
|
0
|
|
|
104
|
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
#### RV is defined as follows |
|
106
|
24
|
|
|
|
|
33
|
$RV = $word; |
|
107
|
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
#### If the first two letters are vowels |
|
109
|
24
|
100
|
|
|
|
157
|
if($word =~ /^[aeiouyâàëéêèïîôûù][aeiouyâàëéêèïîôûù]/) { |
|
|
|
50
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
#### RV is the region after the third letter |
|
112
|
1
|
50
|
|
|
|
9
|
unless ( $RV =~ s/^...// ) { |
|
113
|
0
|
|
|
|
|
0
|
$RV = ""; |
|
114
|
|
|
|
|
|
|
} |
|
115
|
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
} elsif ( $word =~ /^.+?[aeiouyâàëéêèïîôûù].+/ ) { |
|
117
|
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
#### RV is after the first vowel not beginning or end the word |
|
119
|
23
|
|
|
|
|
82
|
$RV =~ s/^.+?[aeiouyâàëéêèïîôûù]//; |
|
120
|
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
} else { |
|
122
|
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
#### RV is the end of the word |
|
124
|
0
|
|
|
|
|
0
|
$RV = ""; |
|
125
|
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
} |
|
127
|
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
#print "Word=$word\nRV=$RV\n"; |
|
129
|
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
#### Defining R1 and R2 |
|
131
|
24
|
|
|
|
|
36
|
$R1 = $word; |
|
132
|
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
#### R1 is the region after the first non-vowel following a |
|
134
|
|
|
|
|
|
|
#### vowel, or is the null region at the end of the word if |
|
135
|
|
|
|
|
|
|
#### there is no such non-vowel. |
|
136
|
|
|
|
|
|
|
|
|
137
|
24
|
50
|
|
|
|
106
|
unless($R1 =~ s/^.*?[aeiouyâàëéêèïîôûù][^aeiouyâàëéêèïîôûù]//) { |
|
138
|
0
|
|
|
|
|
0
|
$R1 = ""; |
|
139
|
|
|
|
|
|
|
} |
|
140
|
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
#print "R1=$R1\n"; |
|
142
|
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
#### R2 is the region after the first non-vowel following a |
|
144
|
|
|
|
|
|
|
#### vowel in R1, or is the null region at the end of the |
|
145
|
|
|
|
|
|
|
#### word if there is no such non-vowel. |
|
146
|
|
|
|
|
|
|
|
|
147
|
24
|
|
|
|
|
34
|
$R2 = $R1; |
|
148
|
|
|
|
|
|
|
|
|
149
|
24
|
50
|
|
|
|
46
|
if($R2) { |
|
150
|
24
|
100
|
|
|
|
94
|
unless($R2 =~ s/^.*?[aeiouyâàëéêèïîôûù][^aeiouyâàëéêèïîôûù]//) { |
|
151
|
4
|
|
|
|
|
7
|
$R2 = ""; |
|
152
|
|
|
|
|
|
|
} |
|
153
|
|
|
|
|
|
|
} |
|
154
|
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
#print "R2=$R2\n"; |
|
156
|
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
#### Step 1: Standard suffix removal |
|
158
|
|
|
|
|
|
|
|
|
159
|
24
|
|
|
|
|
36
|
my $step1 = 0; |
|
160
|
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
#### Search for the longest among the following suffixes, |
|
162
|
|
|
|
|
|
|
#### and perform the action indicated |
|
163
|
|
|
|
|
|
|
|
|
164
|
24
|
|
|
|
|
87
|
my @suffix = qw( |
|
165
|
|
|
|
|
|
|
ance iqUe isme |
|
166
|
|
|
|
|
|
|
able iste eux |
|
167
|
|
|
|
|
|
|
ances iqUes ismes |
|
168
|
|
|
|
|
|
|
ables istes |
|
169
|
|
|
|
|
|
|
); |
|
170
|
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
#### delete if in R2 |
|
172
|
24
|
|
|
|
|
60
|
$step1 += stem_killer( $R2, "", "", @suffix ); |
|
173
|
|
|
|
|
|
|
|
|
174
|
24
|
|
|
|
|
77
|
@suffix = qw( |
|
175
|
|
|
|
|
|
|
trice ateur ation |
|
176
|
|
|
|
|
|
|
atrices ateurs ations |
|
177
|
|
|
|
|
|
|
); |
|
178
|
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
#### delete if in R2 |
|
180
|
|
|
|
|
|
|
#### if preceded by ic, delete if in R2 |
|
181
|
|
|
|
|
|
|
#print "Word=$word RV=$RV R1=$R1 R2=$R2\n"; |
|
182
|
24
|
|
33
|
|
|
60
|
$step1 += stem_killer( $R2, "ic", "", @suffix ) |
|
183
|
|
|
|
|
|
|
|| stem_killer( $R1, "ic", "iqU", @suffix ) |
|
184
|
|
|
|
|
|
|
|| stem_killer( $R2, "", "", @suffix ); |
|
185
|
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
|
|
187
|
24
|
|
|
|
|
90
|
@suffix = qw( |
|
188
|
|
|
|
|
|
|
logie logies |
|
189
|
|
|
|
|
|
|
); |
|
190
|
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
#### replace with log if in R2 |
|
192
|
24
|
|
|
|
|
59
|
$step1 += stem_killer( $R2, "", "log", @suffix ); |
|
193
|
|
|
|
|
|
|
|
|
194
|
24
|
|
|
|
|
61
|
@suffix = qw( |
|
195
|
|
|
|
|
|
|
usion ution usions utions |
|
196
|
|
|
|
|
|
|
); |
|
197
|
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
#### replace with u if in R2 |
|
199
|
24
|
|
|
|
|
49
|
$step1 += stem_killer( $R2, "", "u", @suffix ); |
|
200
|
|
|
|
|
|
|
|
|
201
|
24
|
|
|
|
|
50
|
@suffix = qw( |
|
202
|
|
|
|
|
|
|
ence ences |
|
203
|
|
|
|
|
|
|
); |
|
204
|
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
#### replace with ent if in R2 |
|
206
|
24
|
|
|
|
|
53
|
$step1 += stem_killer( $R2, "", "ent", @suffix ); |
|
207
|
|
|
|
|
|
|
|
|
208
|
24
|
|
|
|
|
50
|
@suffix = qw( |
|
209
|
|
|
|
|
|
|
issement issements |
|
210
|
|
|
|
|
|
|
); |
|
211
|
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
#### delete if in R1 and preceded by a non-vowel |
|
213
|
24
|
50
|
|
|
|
55
|
if ( nvprec( $R1, @suffix ) ) { |
|
214
|
0
|
|
|
|
|
0
|
$step1 += stem_killer( $R1, "", "", @suffix); |
|
215
|
|
|
|
|
|
|
} |
|
216
|
|
|
|
|
|
|
|
|
217
|
24
|
|
|
|
|
55
|
@suffix = qw( |
|
218
|
|
|
|
|
|
|
ement ements |
|
219
|
|
|
|
|
|
|
); |
|
220
|
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
#### delete if in RV |
|
222
|
|
|
|
|
|
|
#### if preceded by iv, delete if in R2 |
|
223
|
|
|
|
|
|
|
#### (and if further preceded by at, delete if in R2), otherwise, |
|
224
|
|
|
|
|
|
|
#### if preceded by eus, delete if in R2, else replace by eux if in R1, otherwise, |
|
225
|
|
|
|
|
|
|
#### if preceded by abl or iqU, delete if in R2, otherwise, |
|
226
|
|
|
|
|
|
|
#### if preceded by ièr or Ièr, replace by i if in RV |
|
227
|
24
|
|
66
|
|
|
54
|
$step1 += stem_killer( $RV, "ativ", "", @suffix ) |
|
228
|
|
|
|
|
|
|
|| stem_killer( $R2, "iv", "", @suffix ) |
|
229
|
|
|
|
|
|
|
|| stem_killer( $R2, "(abl|iqU)", "", @suffix ) |
|
230
|
|
|
|
|
|
|
|| stem_killer( $R2, "(ièr|Ièr)", "i", @suffix ) |
|
231
|
|
|
|
|
|
|
|| stem_killer( $R2, "eus", "", @suffix ) |
|
232
|
|
|
|
|
|
|
|| stem_killer( $R1, "eus", "eux", @suffix ) |
|
233
|
|
|
|
|
|
|
|| stem_killer( $RV, "", "", @suffix ); |
|
234
|
|
|
|
|
|
|
|
|
235
|
24
|
|
|
|
|
54
|
@suffix = qw( |
|
236
|
|
|
|
|
|
|
ité ités |
|
237
|
|
|
|
|
|
|
); |
|
238
|
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
#### delete if in R2 |
|
240
|
|
|
|
|
|
|
#### if preceded by abil, delete if in R2, else replace by abl, otherwise, |
|
241
|
|
|
|
|
|
|
#### if preceded by ic, delete if in R2, else replace by iqU, otherwise, |
|
242
|
|
|
|
|
|
|
#### if preceded by iv, delete if in R2 |
|
243
|
24
|
|
33
|
|
|
45
|
$step1 += stem_killer( $R2, "(abil|ic|iv)", "", @suffix ) |
|
244
|
|
|
|
|
|
|
|| stem_killer( $word, "abil", "abl", @suffix ) |
|
245
|
|
|
|
|
|
|
|| stem_killer( $word, "ic", "iqU", @suffix ) |
|
246
|
|
|
|
|
|
|
|| stem_killer( $R2, "", "", @suffix ); |
|
247
|
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
|
|
249
|
24
|
|
|
|
|
56
|
@suffix = qw( |
|
250
|
|
|
|
|
|
|
if ive ifs ives |
|
251
|
|
|
|
|
|
|
); |
|
252
|
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
#### delete if in R2 |
|
254
|
|
|
|
|
|
|
#### if preceded by at, delete if in R2 |
|
255
|
|
|
|
|
|
|
#### (and if further preceded by ic, delete if in R2, else replace by iqU) |
|
256
|
24
|
|
33
|
|
|
59
|
$step1 += stem_killer( $R2, "icat", "", @suffix) |
|
257
|
|
|
|
|
|
|
|| stem_killer( $R2, "at", "", @suffix) |
|
258
|
|
|
|
|
|
|
|| stem_killer( $word, "icat", "iqU", @suffix) |
|
259
|
|
|
|
|
|
|
|| stem_killer( $R2, "", "", @suffix); |
|
260
|
|
|
|
|
|
|
|
|
261
|
24
|
|
|
|
|
62
|
@suffix = qw( |
|
262
|
|
|
|
|
|
|
eaux |
|
263
|
|
|
|
|
|
|
); |
|
264
|
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
#### replace with eau |
|
266
|
24
|
|
|
|
|
61
|
$step1 += stem_killer( $word, "", "eau", @suffix); |
|
267
|
|
|
|
|
|
|
|
|
268
|
24
|
|
|
|
|
45
|
@suffix = qw( |
|
269
|
|
|
|
|
|
|
aux |
|
270
|
|
|
|
|
|
|
); |
|
271
|
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
#### replace with eau |
|
273
|
24
|
|
|
|
|
47
|
$step1 += stem_killer( $R1, "", "al", @suffix); |
|
274
|
|
|
|
|
|
|
|
|
275
|
24
|
|
|
|
|
55
|
@suffix = qw( |
|
276
|
|
|
|
|
|
|
euse euses |
|
277
|
|
|
|
|
|
|
); |
|
278
|
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
#### delete if in R2, else replace by eux if in R1 |
|
280
|
24
|
|
33
|
|
|
49
|
$step1 += stem_killer( $R2, "", "", @suffix) |
|
281
|
|
|
|
|
|
|
|| stem_killer( $R1, "", "eux", @suffix); |
|
282
|
|
|
|
|
|
|
|
|
283
|
24
|
|
|
|
|
50
|
@suffix = qw( |
|
284
|
|
|
|
|
|
|
emment |
|
285
|
|
|
|
|
|
|
); |
|
286
|
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
#### replace with ent |
|
288
|
24
|
|
|
|
|
49
|
my $sufstep2 += stem_killer( $RV, "", "ent", @suffix); |
|
289
|
|
|
|
|
|
|
|
|
290
|
24
|
|
|
|
|
46
|
@suffix = qw( |
|
291
|
|
|
|
|
|
|
amment |
|
292
|
|
|
|
|
|
|
); |
|
293
|
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
#### replace with ant |
|
295
|
24
|
|
|
|
|
47
|
$sufstep2 += stem_killer( $RV, "", "ant", @suffix); |
|
296
|
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
|
|
298
|
24
|
|
|
|
|
48
|
@suffix = qw( |
|
299
|
|
|
|
|
|
|
ment ments |
|
300
|
|
|
|
|
|
|
); |
|
301
|
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
#### delete if preceded by a vowel in RV |
|
303
|
24
|
50
|
|
|
|
56
|
if ( vprec ( $RV, @suffix) ) { |
|
304
|
0
|
|
|
|
|
0
|
$sufstep2 += stem_killer( $RV, "", "", @suffix); |
|
305
|
|
|
|
|
|
|
} |
|
306
|
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
#### Step 2: Verb suffixes |
|
310
|
|
|
|
|
|
|
|
|
311
|
|
|
|
|
|
|
#### Do step 2a if no ending was removed by step 1. |
|
312
|
24
|
|
|
|
|
36
|
my $step2a = 0; |
|
313
|
24
|
100
|
66
|
|
|
66
|
if( ($step1 == 0) || ($sufstep2 > 0) ) { |
|
314
|
|
|
|
|
|
|
|
|
315
|
|
|
|
|
|
|
#### Search for the longest among the following suffixes in RV, |
|
316
|
|
|
|
|
|
|
#### and if found, delete. |
|
317
|
22
|
|
|
|
|
144
|
@suffix = qw( |
|
318
|
|
|
|
|
|
|
îmes ît îtes i ie ies ir ira |
|
319
|
|
|
|
|
|
|
irai iraIent irais irait iras irent |
|
320
|
|
|
|
|
|
|
irez iriez irions irons iront is issaIent |
|
321
|
|
|
|
|
|
|
issais issait issant issante issantes |
|
322
|
|
|
|
|
|
|
issants isse issent isses issez issiez |
|
323
|
|
|
|
|
|
|
issions issons it |
|
324
|
|
|
|
|
|
|
); |
|
325
|
22
|
50
|
|
|
|
50
|
if ( nvprec( $RV, @suffix) ) { |
|
326
|
|
|
|
|
|
|
#print "word:$word RV:$RV R1:$R1 R2:$R2\n"; |
|
327
|
0
|
|
|
|
|
0
|
$step2a += stem_killer( $RV, "", "", @suffix ); |
|
328
|
|
|
|
|
|
|
} |
|
329
|
|
|
|
|
|
|
} |
|
330
|
|
|
|
|
|
|
|
|
331
|
24
|
|
|
|
|
29
|
my $step2b = 0; |
|
332
|
24
|
50
|
|
|
|
61
|
if ( $step2a == 0 ) { |
|
333
|
|
|
|
|
|
|
|
|
334
|
24
|
|
|
|
|
81
|
@suffix = qw( |
|
335
|
|
|
|
|
|
|
ions |
|
336
|
|
|
|
|
|
|
); |
|
337
|
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
#### delete if in R2 |
|
339
|
24
|
|
|
|
|
59
|
$step2b += stem_killer( $R2, "", "", @suffix); |
|
340
|
|
|
|
|
|
|
|
|
341
|
24
|
|
|
|
|
100
|
@suffix = qw( |
|
342
|
|
|
|
|
|
|
é ée ées és èrent er era erai |
|
343
|
|
|
|
|
|
|
eraIent erais erait eras erez eriez |
|
344
|
|
|
|
|
|
|
erions erons eront ez iez |
|
345
|
|
|
|
|
|
|
); |
|
346
|
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
#### delete |
|
348
|
24
|
|
|
|
|
63
|
$step2b += stem_killer( $RV, "", "", @suffix); |
|
349
|
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
#print "Avant word:$word RV:$RV R1:$R1 R2:$R2\n"; |
|
351
|
24
|
|
|
|
|
122
|
@suffix = qw( |
|
352
|
|
|
|
|
|
|
âmes ât âtes a ai aIent ais ait |
|
353
|
|
|
|
|
|
|
ant ante antes ants as asse assent |
|
354
|
|
|
|
|
|
|
asses assiez assions |
|
355
|
|
|
|
|
|
|
); |
|
356
|
|
|
|
|
|
|
|
|
357
|
|
|
|
|
|
|
#### delete |
|
358
|
|
|
|
|
|
|
#### if preceded by e, delete |
|
359
|
24
|
|
66
|
|
|
64
|
$step2b += stem_killer( $RV, "e", "", @suffix) |
|
360
|
|
|
|
|
|
|
|| stem_killer( $RV, "", "", @suffix); |
|
361
|
|
|
|
|
|
|
#print "Apres word:$word RV:$RV R1:$R1 R2:$R2\n"; |
|
362
|
|
|
|
|
|
|
|
|
363
|
|
|
|
|
|
|
} |
|
364
|
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
|
|
366
|
24
|
|
|
|
|
47
|
my $step4 = 1; |
|
367
|
24
|
100
|
66
|
|
|
151
|
if ( $step1 > 0 || $step2a > 0 || $step2b > 0 ) { |
|
|
|
|
100
|
|
|
|
|
|
368
|
|
|
|
|
|
|
#### Step 3 |
|
369
|
|
|
|
|
|
|
#### Replace final Y with i or final ç with c |
|
370
|
13
|
50
|
|
|
|
72
|
if ( $word =~ /Y$|ç$/ ) { |
|
371
|
0
|
|
|
|
|
0
|
$word =~ s/Y$/i/; |
|
372
|
0
|
|
|
|
|
0
|
$word =~ s/ç$/c/; |
|
373
|
0
|
|
|
|
|
0
|
$step4 = 0; |
|
374
|
|
|
|
|
|
|
} |
|
375
|
|
|
|
|
|
|
} |
|
376
|
|
|
|
|
|
|
|
|
377
|
24
|
100
|
66
|
|
|
335
|
if ( $step4 == 1 && $step1 == 0 && $step2a == 0 && $step2b == 0 ) { |
|
|
|
|
66
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
378
|
|
|
|
|
|
|
#### Step 4 |
|
379
|
|
|
|
|
|
|
#### If the word ends s, not preceded by a, i, o, u, è or s, delete it. |
|
380
|
|
|
|
|
|
|
#print "word:$word RV:$RV\n"; |
|
381
|
11
|
100
|
|
|
|
52
|
if ( $word =~ /[^aiouès]s$/ ) { |
|
382
|
4
|
|
|
|
|
11
|
stem_killer( $word , "", "", "s" ); |
|
383
|
|
|
|
|
|
|
} |
|
384
|
|
|
|
|
|
|
|
|
385
|
11
|
|
|
|
|
35
|
@suffix = qw( |
|
386
|
|
|
|
|
|
|
ent |
|
387
|
|
|
|
|
|
|
); |
|
388
|
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
#### delete if in R2 |
|
390
|
11
|
|
|
|
|
31
|
stem_killer( $R2, "", "", @suffix); |
|
391
|
|
|
|
|
|
|
|
|
392
|
11
|
|
|
|
|
20
|
@suffix = qw( |
|
393
|
|
|
|
|
|
|
ion |
|
394
|
|
|
|
|
|
|
); |
|
395
|
|
|
|
|
|
|
|
|
396
|
|
|
|
|
|
|
#### delete if in R2 and preceded by s or t |
|
397
|
11
|
50
|
33
|
|
|
145
|
if ( $R2 =~ /ion$/ && $RV =~ /tion|sion/ ) { |
|
398
|
0
|
|
|
|
|
0
|
stem_killer( $R2, "", "", @suffix); |
|
399
|
|
|
|
|
|
|
} |
|
400
|
|
|
|
|
|
|
|
|
401
|
|
|
|
|
|
|
#(So note that ion is removed only when it is in R2 - as well as being in RV - and preceded by s or t which must be in RV.) |
|
402
|
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
|
|
404
|
11
|
|
|
|
|
29
|
@suffix = qw( |
|
405
|
|
|
|
|
|
|
ier ière Ier Ière |
|
406
|
|
|
|
|
|
|
); |
|
407
|
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
#### replace with i |
|
409
|
11
|
|
|
|
|
26
|
stem_killer( $RV, "", "i", @suffix); |
|
410
|
|
|
|
|
|
|
|
|
411
|
11
|
|
|
|
|
23
|
@suffix = qw( |
|
412
|
|
|
|
|
|
|
e |
|
413
|
|
|
|
|
|
|
); |
|
414
|
|
|
|
|
|
|
|
|
415
|
|
|
|
|
|
|
#### e delete |
|
416
|
|
|
|
|
|
|
#print "word:$word RV:$RV R1:$R1 R2:$R2\n"; |
|
417
|
11
|
|
|
|
|
21
|
stem_killer( $RV, "", "", @suffix); |
|
418
|
|
|
|
|
|
|
|
|
419
|
11
|
|
|
|
|
54
|
@suffix = qw( |
|
420
|
|
|
|
|
|
|
ë |
|
421
|
|
|
|
|
|
|
); |
|
422
|
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
#### if preceded by gu, delete |
|
424
|
11
|
50
|
|
|
|
34
|
if ( $RV =~ /guë$/ ) { |
|
425
|
0
|
|
|
|
|
0
|
stem_killer( $RV, "", "", @suffix); |
|
426
|
|
|
|
|
|
|
} |
|
427
|
|
|
|
|
|
|
} |
|
428
|
|
|
|
|
|
|
|
|
429
|
|
|
|
|
|
|
#### Always do Step 5 and Step 6 |
|
430
|
|
|
|
|
|
|
#### step 5 : Undouble |
|
431
|
|
|
|
|
|
|
#### If the word ends enn, onn, ett, ell or eill, delete the last letter |
|
432
|
24
|
|
|
|
|
43
|
$word =~ s/enn$/en/; |
|
433
|
24
|
|
|
|
|
39
|
$word =~ s/onn$/on/; |
|
434
|
24
|
|
|
|
|
34
|
$word =~ s/ett$/et/; |
|
435
|
24
|
|
|
|
|
28
|
$word =~ s/ell$/el/; |
|
436
|
24
|
|
|
|
|
32
|
$word =~ s/eill$/eil/; |
|
437
|
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
#### step 6 :Un-accent |
|
439
|
|
|
|
|
|
|
#### If the words ends é or è followed by at least one non-vowel, |
|
440
|
|
|
|
|
|
|
#### remove the accent from the e |
|
441
|
24
|
|
|
|
|
42
|
$word =~ s/[éè]([^aeiouyâàëéêèïîôûù]+?)$/e$1/; |
|
442
|
|
|
|
|
|
|
|
|
443
|
|
|
|
|
|
|
#### And finally: |
|
444
|
|
|
|
|
|
|
#### Turn any remaining I, U and Y letters into lower case. |
|
445
|
24
|
|
|
|
|
56
|
$word =~ s/([IUY])/lc($1)/eg; |
|
|
8
|
|
|
|
|
37
|
|
|
446
|
|
|
|
|
|
|
|
|
447
|
24
|
|
|
|
|
99
|
return $word; |
|
448
|
|
|
|
|
|
|
|
|
449
|
|
|
|
|
|
|
} |
|
450
|
|
|
|
|
|
|
|
|
451
|
|
|
|
|
|
|
sub nvprec { |
|
452
|
|
|
|
|
|
|
|
|
453
|
46
|
|
|
46
|
0
|
190
|
my($where, @list) = @_; |
|
454
|
1
|
|
|
1
|
|
8
|
use vars qw($RV $R1 $R2 $word); |
|
|
1
|
|
|
|
|
13
|
|
|
|
1
|
|
|
|
|
193
|
|
|
455
|
46
|
|
|
|
|
95
|
foreach my $p ( sort { length($b) <=> length($a) } @list) { |
|
|
3060
|
|
|
|
|
8260
|
|
|
456
|
818
|
50
|
|
|
|
16477
|
if ($where =~ /[^aeiouyâàëéêèïîôûù]$p$/) { |
|
457
|
0
|
|
|
|
|
0
|
return 1; |
|
458
|
|
|
|
|
|
|
} |
|
459
|
|
|
|
|
|
|
} |
|
460
|
46
|
|
|
|
|
194
|
return; |
|
461
|
|
|
|
|
|
|
} |
|
462
|
|
|
|
|
|
|
|
|
463
|
|
|
|
|
|
|
sub vprec { |
|
464
|
|
|
|
|
|
|
|
|
465
|
24
|
|
|
24
|
0
|
48
|
my($where, @list) = @_; |
|
466
|
1
|
|
|
1
|
|
4
|
use vars qw($RV $R1 $R2 $word); |
|
|
1
|
|
|
|
|
8
|
|
|
|
1
|
|
|
|
|
144
|
|
|
467
|
24
|
|
|
|
|
46
|
foreach my $p ( sort { length($b) <=> length($a) } @list) { |
|
|
24
|
|
|
|
|
80
|
|
|
468
|
48
|
50
|
|
|
|
972
|
if ($where =~ /[aeiouyâàëéêèïîôûù]$p$/) { |
|
469
|
0
|
|
|
|
|
0
|
return 1; |
|
470
|
|
|
|
|
|
|
} |
|
471
|
|
|
|
|
|
|
} |
|
472
|
24
|
|
|
|
|
70
|
return; |
|
473
|
|
|
|
|
|
|
} |
|
474
|
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
sub stem_killer { |
|
476
|
805
|
|
|
805
|
0
|
1985
|
my($where, $pre, $with, @list) = @_; |
|
477
|
1
|
|
|
1
|
|
5
|
use vars qw($RV $R1 $R2 $word); |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
360
|
|
|
478
|
805
|
|
|
|
|
961
|
my $done = 0; |
|
479
|
805
|
|
|
|
|
1522
|
foreach my $P (sort { length($b) <=> length($a) } @list) { |
|
|
6175
|
|
|
|
|
7017
|
|
|
480
|
3356
|
100
|
|
|
|
33367
|
if($where =~ /$pre$P$/) { |
|
481
|
26
|
|
|
|
|
247
|
$R2 =~ s/$pre$P$/$with/; |
|
482
|
26
|
|
|
|
|
290
|
$R1 =~ s/$pre$P$/$with/; |
|
483
|
26
|
|
|
|
|
273
|
$RV =~ s/$pre$P$/$with/; |
|
484
|
26
|
|
|
|
|
287
|
$word =~ s/$pre$P$/$with/; |
|
485
|
26
|
|
|
|
|
42
|
$done = 1; |
|
486
|
26
|
|
|
|
|
61
|
last; |
|
487
|
|
|
|
|
|
|
} |
|
488
|
|
|
|
|
|
|
} |
|
489
|
805
|
|
|
|
|
3369
|
return $done; |
|
490
|
|
|
|
|
|
|
} |
|
491
|
|
|
|
|
|
|
|
|
492
|
|
|
|
|
|
|
sub stem_caching { |
|
493
|
0
|
|
|
0
|
1
|
|
my $parm_ref; |
|
494
|
0
|
0
|
|
|
|
|
if (ref $_[0]) { |
|
495
|
0
|
|
|
|
|
|
$parm_ref = shift; |
|
496
|
|
|
|
|
|
|
} else { |
|
497
|
0
|
|
|
|
|
|
$parm_ref = { @_ }; |
|
498
|
|
|
|
|
|
|
} |
|
499
|
0
|
|
|
|
|
|
my $caching_level = $parm_ref->{-level}; |
|
500
|
0
|
0
|
|
|
|
|
if (defined $caching_level) { |
|
501
|
0
|
0
|
|
|
|
|
if ($caching_level !~ m/^[012]$/) { |
|
502
|
0
|
|
|
|
|
|
croak(__PACKAGE__ . "::stem_caching() - Legal values are '0','1' or '2'. '$caching_level' is not a legal value"); |
|
503
|
|
|
|
|
|
|
} |
|
504
|
0
|
|
|
|
|
|
$Stem_Caching = $caching_level; |
|
505
|
|
|
|
|
|
|
} |
|
506
|
0
|
|
|
|
|
|
return $Stem_Caching; |
|
507
|
|
|
|
|
|
|
} |
|
508
|
|
|
|
|
|
|
|
|
509
|
|
|
|
|
|
|
sub clear_stem_cache { |
|
510
|
0
|
|
|
0
|
1
|
|
$Stem_Cache = {}; |
|
511
|
|
|
|
|
|
|
} |
|
512
|
|
|
|
|
|
|
|
|
513
|
|
|
|
|
|
|
1; |
|
514
|
|
|
|
|
|
|
__END__ |