line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Lingua::Stem::Fr;

use strict;
use warnings;

# stem() and stem_caching() report bad arguments with croak(), so Carp has
# to be loaded and croak imported into this package.
use Carp qw(croak);

require Exporter;

our @ISA = qw(Exporter);

# Items to export into callers namespace by default. Note: do not export
# names by default without a very good reason. Use EXPORT_OK instead.
# Do not simply export all your public functions/methods/constants.

# No export tags are defined (so there is no ':all' tag); callers import
# the functions they need by name from @EXPORT_OK.
our %EXPORT_TAGS = ();
our @EXPORT_OK = qw (stem stem_word clear_stem_cache stem_caching);
our @EXPORT = ();

our $VERSION = '0.02';

# Caching level, set through stem_caching():
#   0 - the cache is never consulted or filled;
#   1 - stems are cached, but stem() empties the cache before returning;
#   2 - stems are cached and kept across stem() calls.
my $Stem_Caching = 0;

# Maps a lower-cased input word to the stem computed for it.
my $Stem_Cache = {};
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
# stem( -words => \@words, [ -locale => $locale ], [ -exceptions => \%map ] )
# stem( \%params )
#
# Stems a list of words and returns a reference to a NEW array holding the
# results, in the same order; the caller's list is copied, not modified.
#
# Parameters (names are matched case-insensitively):
#   -words       array reference of the words to stem;
#   -exceptions  hash reference mapping a lower-cased word to the stem that
#                must be returned for it verbatim;
#   -locale      accepted for interface compatibility; the value is stored
#                but not otherwise used by this sub.
#
# Called with no arguments at all it returns an empty array reference.
# Croaks on any parameter name it does not recognise.
sub stem {
    return [] if @_ == 0;

    # Accept either a single hash reference or a flat key/value list.
    my $parm_ref = ref $_[0] ? shift : { @_ };

    my $words      = [];
    my $locale     = 'fr';
    my $exceptions = {};

    foreach my $name (keys %$parm_ref) {
        # Compare on the lower-cased name, but always fetch the value with
        # the name exactly as the caller spelled it: the hash is keyed by
        # the original spelling, so '-Words' must not be looked up as
        # '-words'.
        my $key = lc $name;
        if ($key eq '-words') {
            @$words = @{ $parm_ref->{$name} };
        }
        elsif ($key eq '-exceptions') {
            $exceptions = $parm_ref->{$name};
        }
        elsif ($key eq '-locale') {
            $locale = $parm_ref->{$name};
        }
        else {
            # Loaded here so this sub does not depend on a file-level import.
            require Carp;
            Carp::croak(__PACKAGE__ . "::stem() - Unknown parameter '$key' with value '$parm_ref->{$name}'\n");
        }
    }

    # $entry aliases the element of @$words, so assigning to it rewrites
    # the result list in place.
    foreach my $entry (@$words) {

        # Flatten case
        $entry = lc $entry;

        # Check against exceptions list
        if (exists $exceptions->{$entry}) {
            $entry = $exceptions->{$entry};
            next;
        }

        # Check against cache of stemmed words
        my $original_word = $entry;
        if ($Stem_Caching && exists $Stem_Cache->{$original_word}) {
            $entry = $Stem_Cache->{$original_word};
            next;
        }

        $entry = stem_word($entry);

        $Stem_Cache->{$original_word} = $entry if $Stem_Caching;
    }

    # Below level 2 the cache only lives for the duration of one call.
    $Stem_Cache = {} if ($Stem_Caching < 2);

    return $words;
}
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
# stem_word( $word )
#
# Returns the stem of a single French word. The word is lower-cased first.
# When caching is enabled and the lower-cased word is already in the cache,
# the cached stem is returned; this sub itself never writes to the cache.
#
# The working word and its RV, R1 and R2 regions live in the package
# variables $word, $RV, $R1 and $R2, because stem_killer() edits all four
# in place whenever it removes or replaces a suffix.
#
# Two deviations from the previous revision, both bringing the code in line
# with its own comments and suffix tables:
#   * the singular suffix is 'atrice' (it was misspelled 'trice' while the
#     plural was already 'atrices');
#   * in step 4, 'ion' is only removed when the s or t immediately precedes
#     it at the end of RV (the test used to match 'tion'/'sion' anywhere).
sub stem_word {

    our ($word, $RV, $R1, $R2);
    ($word) = @_;

    $word = lc $word;

    # Check against cache of stemmed words
    if ($Stem_Caching && exists $Stem_Cache->{$word}) {
        return $Stem_Cache->{$word};
    }

    # Upper-case I, U and Y mark letters that must be treated as
    # consonants; the vowel classes below contain lower-case letters only.

    ### u, i between vowels into upper case.
    $word =~ s/([aeiouyâàëéêèïîôûù])([ui])([aeiouyâàëéêèïîôûù])/$1.uc($2).$3/eg;

    ### y preceded or followed by a vowel into upper case.
    $word =~ s/([aeiouyâàëéêèïîôûù])(y)/$1.uc($2)/eg;
    $word =~ s/(y)([aeiouyâàëéêèïîôûù])/uc($1).$2/eg;

    ### u after q into upper case.
    $word =~ s/(q)(u)/$1.uc($2)/eg;

    #### RV is defined as follows
    $RV = $word;

    #### If the first two letters are vowels
    if ($word =~ /^[aeiouyâàëéêèïîôûù][aeiouyâàëéêèïîôûù]/) {

        #### RV is the region after the third letter
        unless ($RV =~ s/^...//) {
            $RV = "";
        }

    }
    elsif ($word =~ /^.+?[aeiouyâàëéêèïîôûù].+/) {

        #### RV is after the first vowel not beginning or end the word
        $RV =~ s/^.+?[aeiouyâàëéêèïîôûù]//;

    }
    else {

        #### RV is the end of the word
        $RV = "";

    }

    #### Defining R1 and R2
    $R1 = $word;

    #### R1 is the region after the first non-vowel following a
    #### vowel, or is the null region at the end of the word if
    #### there is no such non-vowel.
    unless ($R1 =~ s/^.*?[aeiouyâàëéêèïîôûù][^aeiouyâàëéêèïîôûù]//) {
        $R1 = "";
    }

    #### R2 is the region after the first non-vowel following a
    #### vowel in R1, or is the null region at the end of the
    #### word if there is no such non-vowel.
    $R2 = $R1;
    if ($R2) {
        unless ($R2 =~ s/^.*?[aeiouyâàëéêèïîôûù][^aeiouyâàëéêèïîôûù]//) {
            $R2 = "";
        }
    }

    #### Step 1: Standard suffix removal
    # $step1 counts the suffix rules of this step that fired.
    my $step1 = 0;

    #### Search for the longest among the following suffixes,
    #### and perform the action indicated
    my @suffix = qw(
        ance iqUe isme
        able iste eux
        ances iqUes ismes
        ables istes
    );

    #### delete if in R2
    $step1 += stem_killer( $R2, "", "", @suffix );

    @suffix = qw(
        atrice ateur ation
        atrices ateurs ations
    );

    #### delete if in R2
    #### if preceded by ic, delete if in R2, else replace by iqU
    $step1 += stem_killer( $R2, "ic", "", @suffix )
           || stem_killer( $R1, "ic", "iqU", @suffix )
           || stem_killer( $R2, "", "", @suffix );

    @suffix = qw(
        logie logies
    );

    #### replace with log if in R2
    $step1 += stem_killer( $R2, "", "log", @suffix );

    @suffix = qw(
        usion ution usions utions
    );

    #### replace with u if in R2
    $step1 += stem_killer( $R2, "", "u", @suffix );

    @suffix = qw(
        ence ences
    );

    #### replace with ent if in R2
    $step1 += stem_killer( $R2, "", "ent", @suffix );

    @suffix = qw(
        issement issements
    );

    #### delete if in R1 and preceded by a non-vowel
    if ( nvprec( $R1, @suffix ) ) {
        $step1 += stem_killer( $R1, "", "", @suffix );
    }

    @suffix = qw(
        ement ements
    );

    #### delete if in RV
    #### if preceded by iv, delete if in R2
    #### (and if further preceded by at, delete if in R2), otherwise,
    #### if preceded by eus, delete if in R2, else replace by eux if in R1, otherwise,
    #### if preceded by abl or iqU, delete if in R2, otherwise,
    #### if preceded by ièr or Ièr, replace by i if in RV
    $step1 += stem_killer( $RV, "ativ", "", @suffix )
           || stem_killer( $R2, "iv", "", @suffix )
           || stem_killer( $R2, "(abl|iqU)", "", @suffix )
           || stem_killer( $R2, "(ièr|Ièr)", "i", @suffix )
           || stem_killer( $R2, "eus", "", @suffix )
           || stem_killer( $R1, "eus", "eux", @suffix )
           || stem_killer( $RV, "", "", @suffix );

    @suffix = qw(
        ité ités
    );

    #### delete if in R2
    #### if preceded by abil, delete if in R2, else replace by abl, otherwise,
    #### if preceded by ic, delete if in R2, else replace by iqU, otherwise,
    #### if preceded by iv, delete if in R2
    $step1 += stem_killer( $R2, "(abil|ic|iv)", "", @suffix )
           || stem_killer( $word, "abil", "abl", @suffix )
           || stem_killer( $word, "ic", "iqU", @suffix )
           || stem_killer( $R2, "", "", @suffix );

    @suffix = qw(
        if ive ifs ives
    );

    #### delete if in R2
    #### if preceded by at, delete if in R2
    #### (and if further preceded by ic, delete if in R2, else replace by iqU)
    $step1 += stem_killer( $R2, "icat", "", @suffix )
           || stem_killer( $R2, "at", "", @suffix )
           || stem_killer( $word, "icat", "iqU", @suffix )
           || stem_killer( $R2, "", "", @suffix );

    @suffix = qw(
        eaux
    );

    #### replace with eau
    $step1 += stem_killer( $word, "", "eau", @suffix );

    @suffix = qw(
        aux
    );

    #### replace with al if in R1
    $step1 += stem_killer( $R1, "", "al", @suffix );

    @suffix = qw(
        euse euses
    );

    #### delete if in R2, else replace by eux if in R1
    $step1 += stem_killer( $R2, "", "", @suffix )
           || stem_killer( $R1, "", "eux", @suffix );

    # $sufstep2 counts the adverbial (-mment / -ment) rules that fired;
    # a hit here forces step 2a to run even when step 1 changed the word.
    @suffix = qw(
        emment
    );

    #### replace with ent
    my $sufstep2 = stem_killer( $RV, "", "ent", @suffix );

    @suffix = qw(
        amment
    );

    #### replace with ant
    $sufstep2 += stem_killer( $RV, "", "ant", @suffix );

    @suffix = qw(
        ment ments
    );

    #### delete if preceded by a vowel in RV
    if ( vprec( $RV, @suffix ) ) {
        $sufstep2 += stem_killer( $RV, "", "", @suffix );
    }

    #### Step 2: Verb suffixes

    #### Do step 2a if no ending was removed by step 1.
    my $step2a = 0;
    if ( ($step1 == 0) || ($sufstep2 > 0) ) {

        #### Search for the longest among the following suffixes in RV,
        #### and if found, delete.
        @suffix = qw(
            îmes ît îtes i ie ies ir ira
            irai iraIent irais irait iras irent
            irez iriez irions irons iront is issaIent
            issais issait issant issante issantes
            issants isse issent isses issez issiez
            issions issons it
        );
        if ( nvprec( $RV, @suffix ) ) {
            $step2a += stem_killer( $RV, "", "", @suffix );
        }
    }

    my $step2b = 0;
    if ( $step2a == 0 ) {

        @suffix = qw(
            ions
        );

        #### delete if in R2
        $step2b += stem_killer( $R2, "", "", @suffix );

        @suffix = qw(
            é ée ées és èrent er era erai
            eraIent erais erait eras erez eriez
            erions erons eront ez iez
        );

        #### delete
        $step2b += stem_killer( $RV, "", "", @suffix );

        @suffix = qw(
            âmes ât âtes a ai aIent ais ait
            ant ante antes ants as asse assent
            asses assiez assions
        );

        #### delete
        #### if preceded by e, delete
        $step2b += stem_killer( $RV, "e", "", @suffix )
                || stem_killer( $RV, "", "", @suffix );
    }

    my $step4 = 1;
    if ( $step1 > 0 || $step2a > 0 || $step2b > 0 ) {

        #### Step 3
        #### Replace final Y with i or final ç with c
        if ( $word =~ /Y$|ç$/ ) {
            $word =~ s/Y$/i/;
            $word =~ s/ç$/c/;
            $step4 = 0;
        }
    }

    if ( $step4 == 1 && $step1 == 0 && $step2a == 0 && $step2b == 0 ) {

        #### Step 4
        #### If the word ends s, not preceded by a, i, o, u, è or s, delete it.
        if ( $word =~ /[^aiouès]s$/ ) {
            stem_killer( $word, "", "", "s" );
        }

        @suffix = qw(
            ent
        );

        #### delete if in R2
        stem_killer( $R2, "", "", @suffix );

        @suffix = qw(
            ion
        );

        #### delete if in R2 and preceded by s or t
        # (So note that ion is removed only when it is in R2 - as well as
        # being in RV - and preceded by s or t which must be in RV.)
        # The RV test is anchored so that the s/t really is the letter
        # right before the final "ion".
        if ( $R2 =~ /ion$/ && $RV =~ /[st]ion$/ ) {
            stem_killer( $R2, "", "", @suffix );
        }

        @suffix = qw(
            ier ière Ier Ière
        );

        #### replace with i
        stem_killer( $RV, "", "i", @suffix );

        @suffix = qw(
            e
        );

        #### e delete
        stem_killer( $RV, "", "", @suffix );

        @suffix = qw(
            ë
        );

        #### if preceded by gu, delete
        if ( $RV =~ /guë$/ ) {
            stem_killer( $RV, "", "", @suffix );
        }
    }

    #### Always do Step 5 and Step 6
    #### step 5 : Undouble
    #### If the word ends enn, onn, ett, ell or eill, delete the last letter
    $word =~ s/enn$/en/;
    $word =~ s/onn$/on/;
    $word =~ s/ett$/et/;
    $word =~ s/ell$/el/;
    $word =~ s/eill$/eil/;

    #### step 6 :Un-accent
    #### If the words ends é or è followed by at least one non-vowel,
    #### remove the accent from the e
    $word =~ s/[éè]([^aeiouyâàëéêèïîôûù]+?)$/e$1/;

    #### And finally:
    #### Turn any remaining I, U and Y letters into lower case.
    $word =~ s/([IUY])/lc($1)/eg;

    return $word;
}
450
|
|
|
|
|
|
|
|
451
|
|
|
|
|
|
|
# nvprec( $where, @suffixes )
#
# "Non-vowel precedes": returns 1 when $where ends with any of @suffixes
# and the character immediately before that suffix is not one of the
# lower-case vowels in the class below. Returns nothing (undef in scalar
# context, the empty list in list context) otherwise, including when the
# suffix starts at the very beginning of $where.
#
# Each suffix is interpolated into the pattern as-is, so callers are
# expected to pass plain literal suffixes.
sub nvprec {
    my ($where, @list) = @_;

    # Only "does any suffix match?" is asked, so the order in which the
    # suffixes are tried is irrelevant and no sorting is needed.
    foreach my $p (@list) {
        return 1 if $where =~ /[^aeiouyâàëéêèïîôûù]$p$/;
    }
    return;
}
462
|
|
|
|
|
|
|
|
463
|
|
|
|
|
|
|
# vprec( $where, @suffixes )
#
# "Vowel precedes": returns 1 when $where ends with any of @suffixes and
# the character immediately before that suffix is one of the lower-case
# vowels in the class below. Returns nothing (undef in scalar context, the
# empty list in list context) otherwise.
#
# Each suffix is interpolated into the pattern as-is, so callers are
# expected to pass plain literal suffixes.
sub vprec {
    my ($where, @list) = @_;

    # Only "does any suffix match?" is asked, so the order in which the
    # suffixes are tried is irrelevant and no sorting is needed.
    foreach my $p (@list) {
        return 1 if $where =~ /[aeiouyâàëéêèïîôûù]$p$/;
    }
    return;
}
474
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
# stem_killer( $where, $pre, $with, @suffixes )
#
# Tries the suffixes longest first. For the first one such that $where
# ends with "$pre$suffix", replaces a trailing "$pre$suffix" by $with in
# each of the package variables $R2, $R1, $RV and $word (each one is only
# changed if it ends with that text itself), then stops.
#
#   $where     the string the match is tested against (a copy of one of
#              the regions, or of the word);
#   $pre       regular-expression fragment that must directly precede the
#              suffix; may be empty or contain alternations;
#   $with      replacement text, "" to delete;
#   @suffixes  candidate suffixes, interpolated into the pattern as-is.
#
# Returns 1 when a suffix matched, 0 otherwise.
sub stem_killer {
    my ($where, $pre, $with, @list) = @_;
    our ($RV, $R1, $R2, $word);

    foreach my $P (sort { length($b) <=> length($a) } @list) {

        # Compile once; the same pattern is used for the test and for all
        # four substitutions.
        my $ending = qr/$pre$P$/;
        next unless $where =~ $ending;

        foreach my $region ($R2, $R1, $RV, $word) {
            $region =~ s/$ending/$with/;    # $region aliases the global
        }
        return 1;
    }
    return 0;
}
491
|
|
|
|
|
|
|
|
492
|
|
|
|
|
|
|
# stem_caching( -level => $level )
# stem_caching( { -level => $level } )
# stem_caching()
#
# Sets and/or reports the stem caching level. When a defined -level is
# supplied it must be 0, 1 or 2 and becomes the new level; any other value
# makes the sub croak. Without -level the current level is left untouched.
#
# Returns the caching level in effect after the call.
sub stem_caching {

    # Accept either a single hash reference or a flat key/value list.
    my $parm_ref = ref $_[0] ? shift : { @_ };

    my $caching_level = $parm_ref->{-level};
    if (defined $caching_level) {
        if ($caching_level !~ m/^[012]$/) {
            # Loaded here so this sub does not depend on a file-level import.
            require Carp;
            Carp::croak(__PACKAGE__ . "::stem_caching() - Legal values are '0','1' or '2'. '$caching_level' is not a legal value");
        }
        $Stem_Caching = $caching_level;
    }
    return $Stem_Caching;
}
508
|
|
|
|
|
|
|
|
509
|
|
|
|
|
|
|
# clear_stem_cache()
#
# Discards every cached stem by pointing the module's cache at a brand new
# empty hash. Returns the reference to that new, empty cache hash.
sub clear_stem_cache {
    my $fresh_cache = {};
    $Stem_Cache = $fresh_cache;
    return $Stem_Cache;
}
512
|
|
|
|
|
|
|
|
513
|
|
|
|
|
|
|
1; |
514
|
|
|
|
|
|
|
__END__ |