File Coverage

blib/lib/Lingua/Stem/Fr.pm

Criterion	Covered	Total	%
statement	146	182	80.2
branch	35	66	53.0
condition	25	45	55.5
subroutine	10	12	83.3
pod	3	7	42.8
total	219	312	70.1

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::Stem::Fr;
2
3	1			1		843	use strict;
	1					2
	1					31
4	1			1		10	use warnings;
	1					1
	1					2373
5
6							require Exporter;
7
8							our @ISA = qw(Exporter);
9
10							# Items to export into callers namespace by default. Note: do not export
11							# names by default without a very good reason. Use EXPORT_OK instead.
12							# Do not simply export all your public functions/methods/constants.
13
14							# This allows declaration use Lingua::Stem::Fr ':all';
15							# If you do not need this, moving things directly into @EXPORT or @EXPORT_OK
16							# will save memory.
17							our %EXPORT_TAGS = ();
18							our @EXPORT_OK = qw (stem stem_word clear_stem_cache stem_caching);
19							our @EXPORT = ();
20
21							our $VERSION = '0.02';
22
23
24							my $Stem_Caching = 0;
25							my $Stem_Cache = {};
26
27
28							sub stem {
29	3	50		3	0	237	return [] if ($#_ == -1);
30	3					5	my $parm_ref;
31	3	50				22	if (ref $_[0]) {
32	3					7	$parm_ref = shift;
33							} else {
34	0					0	$parm_ref = { @_ };
35							}
36
37	3					6	my $words = [];
38	3					4	my $locale = 'fr';
39	3					7	my $exceptions = {};
40	3					26	foreach (keys %$parm_ref) {
41	3					8	my $key = lc ($_);
42	3	50				9	if ($key eq '-words') {
		0
		0
43	3					4	@$words = @{$parm_ref->{$key}};
	3					16
44							} elsif ($key eq '-exceptions') {
45	0					0	$exceptions = $parm_ref->{$key};
46							} elsif ($key eq '-locale') {
47	0					0	$locale = $parm_ref->{$key};
48							} else {
49	0					0	croak (__PACKAGE__ . "::stem() - Unknown parameter '$key' with value '$parm_ref->{$key}'\n");
50							}
51							}
52
53	3					6	local( $_ );
54	3					6	foreach (@$words) {
55							# Flatten case
56	12					22	$_ = lc $_;
57
58							# Check against exceptions list
59	12	50				29	if (exists $exceptions->{$_}) {
60	0					0	$_ = $exceptions->{$_};
61	0					0	next;
62							}
63
64							# Check against cache of stemmed words
65	12					18	my $original_word = $_;
66	12	50	33			29	if ($Stem_Caching && exists $Stem_Cache->{$original_word}) {
67	0					0	$_ = $Stem_Cache->{$original_word};
68	0					0	next;
69							}
70
71	12					26	$_ = stem_word($_);
72
73	12	50				44	$Stem_Cache->{$original_word} = $_ if $Stem_Caching;
74							}
75	3	50				15	$Stem_Cache = {} if ($Stem_Caching < 2);
76
77	3					23	return $words;
78
79							}
80
81							sub stem_word {
82
83	24			24	1	653	our($word) = @_;
84
85	24					49	$word = lc $word;
86
87							# Check against cache of stemmed words
88	24	50	33			61	if ($Stem_Caching && exists $Stem_Cache->{$word}) {
89	0					0	return $Stem_Cache->{$word};
90							}
91
92	24					31	our($RV, $R1, $R2);
93
94
95							### u, i between vowels into upper case.
96	24					85	$word =~ s/([aeiouy��])([ui])([aeiouy��])/$1.uc($2).$3/eg;
	8					53
97
98							### y preceded or followed by a vowel into upper case.
99	24					44	$word =~ s/([aeiouy��])(y)/$1.uc($2)/eg;
	0					0
100	24					35	$word =~ s/(y)([aeiouy��])/uc($1).$2/eg;
	0					0
101
102							### u after q into upper case.
103	24					32	$word =~ s/(q)(u)/$1.uc($2)/eg;
	0					0
104
105							#### RV is defined as follows
106	24					33	$RV = $word;
107
108							#### If the first two letters are vowels
109	24	100				157	if($word =~ /^[aeiouy��][aeiouy��]/) {
		50
110
111							#### RV is the region after the third letter
112	1	50				9	unless ( $RV =~ s/^...// ) {
113	0					0	$RV = "";
114							}
115
116							} elsif ( $word =~ /^.+?[aeiouy��].+/ ) {
117
118							#### RV is after the first vowel not beginning or end the word
119	23					82	$RV =~ s/^.+?[aeiouy��]//;
120
121							} else {
122
123							#### RV is the end of the word
124	0					0	$RV = "";
125
126							}
127
128							#print "Word=$word\nRV=$RV\n";
129
130							#### Defining R1 and R2
131	24					36	$R1 = $word;
132
133							#### R1 is the region after the first non-vowel following a
134							#### vowel, or is the null region at the end of the word if
135							#### there is no such non-vowel.
136
137	24	50				106	unless($R1 =~ s/^.*?[aeiouy��][^aeiouy��]//) {
138	0					0	$R1 = "";
139							}
140
141							#print "R1=$R1\n";
142
143							#### R2 is the region after the first non-vowel following a
144							#### vowel in R1, or is the null region at the end of the
145							#### word if there is no such non-vowel.
146
147	24					34	$R2 = $R1;
148
149	24	50				46	if($R2) {
150	24	100				94	unless($R2 =~ s/^.*?[aeiouy��][^aeiouy��]//) {
151	4					7	$R2 = "";
152							}
153							}
154
155							#print "R2=$R2\n";
156
157							#### Step 1: Standard suffix removal
158
159	24					36	my $step1 = 0;
160
161							#### Search for the longest among the following suffixes,
162							#### and perform the action indicated
163
164	24					87	my @suffix = qw(
165							ance iqUe isme
166							able iste eux
167							ances iqUes ismes
168							ables istes
169							);
170
171							#### delete if in R2
172	24					60	$step1 += stem_killer( $R2, "", "", @suffix );
173
174	24					77	@suffix = qw(
175							trice ateur ation
176							atrices ateurs ations
177							);
178
179							#### delete if in R2
180							#### if preceded by ic, delete if in R2
181							#print "Word=$word RV=$RV R1=$R1 R2=$R2\n";
182	24		33			60	$step1 += stem_killer( $R2, "ic", "", @suffix )
183							\|\| stem_killer( $R1, "ic", "iqU", @suffix )
184							\|\| stem_killer( $R2, "", "", @suffix );
185
186
187	24					90	@suffix = qw(
188							logie logies
189							);
190
191							#### replace with log if in R2
192	24					59	$step1 += stem_killer( $R2, "", "log", @suffix );
193
194	24					61	@suffix = qw(
195							usion ution usions utions
196							);
197
198							#### replace with u if in R2
199	24					49	$step1 += stem_killer( $R2, "", "u", @suffix );
200
201	24					50	@suffix = qw(
202							ence ences
203							);
204
205							#### replace with ent if in R2
206	24					53	$step1 += stem_killer( $R2, "", "ent", @suffix );
207
208	24					50	@suffix = qw(
209							issement issements
210							);
211
212							#### delete if in R1 and preceded by a non-vowel
213	24	50				55	if ( nvprec( $R1, @suffix ) ) {
214	0					0	$step1 += stem_killer( $R1, "", "", @suffix);
215							}
216
217	24					55	@suffix = qw(
218							ement ements
219							);
220
221							#### delete if in RV
222							#### if preceded by iv, delete if in R2
223							#### (and if further preceded by at, delete if in R2), otherwise,
224							#### if preceded by eus, delete if in R2, else replace by eux if in R1, otherwise,
225							#### if preceded by abl or iqU, delete if in R2, otherwise,
226							#### if preceded by i�r or I�r, replace by i if in RV
227	24		66			54	$step1 += stem_killer( $RV, "ativ", "", @suffix )
228							\|\| stem_killer( $R2, "iv", "", @suffix )
229							\|\| stem_killer( $R2, "(abl\|iqU)", "", @suffix )
230							\|\| stem_killer( $R2, "(i�r\|I�r)", "i", @suffix )
231							\|\| stem_killer( $R2, "eus", "", @suffix )
232							\|\| stem_killer( $R1, "eus", "eux", @suffix )
233							\|\| stem_killer( $RV, "", "", @suffix );
234
235	24					54	@suffix = qw(
236							it� it�s
237							);
238
239							#### delete if in R2
240							#### if preceded by abil, delete if in R2, else replace by abl, otherwise,
241							#### if preceded by ic, delete if in R2, else replace by iqU, otherwise,
242							#### if preceded by iv, delete if in R2
243	24		33			45	$step1 += stem_killer( $R2, "(abil\|ic\|iv)", "", @suffix )
244							\|\| stem_killer( $word, "abil", "abl", @suffix )
245							\|\| stem_killer( $word, "ic", "iqU", @suffix )
246							\|\| stem_killer( $R2, "", "", @suffix );
247
248
249	24					56	@suffix = qw(
250							if ive ifs ives
251							);
252
253							#### delete if in R2
254							#### if preceded by at, delete if in R2
255							#### (and if further preceded by ic, delete if in R2, else replace by iqU)
256	24		33			59	$step1 += stem_killer( $R2, "icat", "", @suffix)
257							\|\| stem_killer( $R2, "at", "", @suffix)
258							\|\| stem_killer( $word, "icat", "iqU", @suffix)
259							\|\| stem_killer( $R2, "", "", @suffix);
260
261	24					62	@suffix = qw(
262							eaux
263							);
264
265							#### replace with eau
266	24					61	$step1 += stem_killer( $word, "", "eau", @suffix);
267
268	24					45	@suffix = qw(
269							aux
270							);
271
272							#### replace with eau
273	24					47	$step1 += stem_killer( $R1, "", "al", @suffix);
274
275	24					55	@suffix = qw(
276							euse euses
277							);
278
279							#### delete if in R2, else replace by eux if in R1
280	24		33			49	$step1 += stem_killer( $R2, "", "", @suffix)
281							\|\| stem_killer( $R1, "", "eux", @suffix);
282
283	24					50	@suffix = qw(
284							emment
285							);
286
287							#### replace with ent
288	24					49	my $sufstep2 += stem_killer( $RV, "", "ent", @suffix);
289
290	24					46	@suffix = qw(
291							amment
292							);
293
294							#### replace with ant
295	24					47	$sufstep2 += stem_killer( $RV, "", "ant", @suffix);
296
297
298	24					48	@suffix = qw(
299							ment ments
300							);
301
302							#### delete if preceded by a vowel in RV
303	24	50				56	if ( vprec ( $RV, @suffix) ) {
304	0					0	$sufstep2 += stem_killer( $RV, "", "", @suffix);
305							}
306
307
308
309							#### Step 2: Verb suffixes
310
311							#### Do step 2a if no ending was removed by step 1.
312	24					36	my $step2a = 0;
313	24	100	66			66	if( ($step1 == 0) \|\| ($sufstep2 > 0) ) {
314
315							#### Search for the longest among the following suffixes in RV,
316							#### and if found, delete.
317	22					144	@suffix = qw(
318							�mes �t �tes i ie ies ir ira
319							irai iraIent irais irait iras irent
320							irez iriez irions irons iront is issaIent
321							issais issait issant issante issantes
322							issants isse issent isses issez issiez
323							issions issons it
324							);
325	22	50				50	if ( nvprec( $RV, @suffix) ) {
326							#print "word:$word RV:$RV R1:$R1 R2:$R2\n";
327	0					0	$step2a += stem_killer( $RV, "", "", @suffix );
328							}
329							}
330
331	24					29	my $step2b = 0;
332	24	50				61	if ( $step2a == 0 ) {
333
334	24					81	@suffix = qw(
335							ions
336							);
337
338							#### delete if in R2
339	24					59	$step2b += stem_killer( $R2, "", "", @suffix);
340
341	24					100	@suffix = qw(
342							� �e �es �s �rent er era erai
343							eraIent erais erait eras erez eriez
344							erions erons eront ez iez
345							);
346
347							#### delete
348	24					63	$step2b += stem_killer( $RV, "", "", @suffix);
349
350							#print "Avant word:$word RV:$RV R1:$R1 R2:$R2\n";
351	24					122	@suffix = qw(
352							�mes �t �tes a ai aIent ais ait
353							ant ante antes ants as asse assent
354							asses assiez assions
355							);
356
357							#### delete
358							#### if preceded by e, delete
359	24		66			64	$step2b += stem_killer( $RV, "e", "", @suffix)
360							\|\| stem_killer( $RV, "", "", @suffix);
361							#print "Apres word:$word RV:$RV R1:$R1 R2:$R2\n";
362
363							}
364
365
366	24					47	my $step4 = 1;
367	24	100	66			151	if ( $step1 > 0 \|\| $step2a > 0 \|\| $step2b > 0 ) {
			100
368							#### Step 3
369							#### Replace final Y with i or final � with c
370	13	50				72	if ( $word =~ /Y$\|�$/ ) {
371	0					0	$word =~ s/Y$/i/;
372	0					0	$word =~ s/�$/c/;
373	0					0	$step4 = 0;
374							}
375							}
376
377	24	100	66			335	if ( $step4 == 1 && $step1 == 0 && $step2a == 0 && $step2b == 0 ) {
			66
			100
378							#### Step 4
379							#### If the word ends s, not preceded by a, i, o, u, � or s, delete it.
380							#print "word:$word RV:$RV\n";
381	11	100				52	if ( $word =~ /[^aiou�s]s$/ ) {
382	4					11	stem_killer( $word , "", "", "s" );
383							}
384
385	11					35	@suffix = qw(
386							ent
387							);
388
389							#### delete if in R2
390	11					31	stem_killer( $R2, "", "", @suffix);
391
392	11					20	@suffix = qw(
393							ion
394							);
395
396							#### delete if in R2 and preceded by s or t
397	11	50	33			145	if ( $R2 =~ /ion$/ && $RV =~ /tion\|sion/ ) {
398	0					0	stem_killer( $R2, "", "", @suffix);
399							}
400
401							#(So note that ion is removed only when it is in R2 - as well as being in RV - and preceded by s or t which must be in RV.)
402
403
404	11					29	@suffix = qw(
405							ier i�re Ier I�re
406							);
407
408							#### replace with i
409	11					26	stem_killer( $RV, "", "i", @suffix);
410
411	11					23	@suffix = qw(
412							e
413							);
414
415							#### e delete
416							#print "word:$word RV:$RV R1:$R1 R2:$R2\n";
417	11					21	stem_killer( $RV, "", "", @suffix);
418
419	11					54	@suffix = qw(
420							�
421							);
422
423							#### if preceded by gu, delete
424	11	50				34	if ( $RV =~ /gu�$/ ) {
425	0					0	stem_killer( $RV, "", "", @suffix);
426							}
427							}
428
429							#### Always do Step 5 and Step 6
430							#### step 5 : Undouble
431							#### If the word ends enn, onn, ett, ell or eill, delete the last letter
432	24					43	$word =~ s/enn$/en/;
433	24					39	$word =~ s/onn$/on/;
434	24					34	$word =~ s/ett$/et/;
435	24					28	$word =~ s/ell$/el/;
436	24					32	$word =~ s/eill$/eil/;
437
438							#### step 6 :Un-accent
439							#### If the words ends � or � followed by at least one non-vowel,
440							#### remove the accent from the e
441	24					42	$word =~ s/[��]([^aeiouy��]+?)$/e$1/;
442
443							#### And finally:
444							#### Turn any remaining I, U and Y letters into lower case.
445	24					56	$word =~ s/([IUY])/lc($1)/eg;
	8					37
446
447	24					99	return $word;
448
449							}
450
451							sub nvprec {
452
453	46			46	0	190	my($where, @list) = @_;
454	1			1		8	use vars qw($RV $R1 $R2 $word);
	1					13
	1					193
455	46					95	foreach my $p ( sort { length($b) <=> length($a) } @list) {
	3060					8260
456	818	50				16477	if ($where =~ /[^aeiouy��]$p$/) {
457	0					0	return 1;
458							}
459							}
460	46					194	return;
461							}
462
463							sub vprec {
464
465	24			24	0	48	my($where, @list) = @_;
466	1			1		4	use vars qw($RV $R1 $R2 $word);
	1					8
	1					144
467	24					46	foreach my $p ( sort { length($b) <=> length($a) } @list) {
	24					80
468	48	50				972	if ($where =~ /[aeiouy��]$p$/) {
469	0					0	return 1;
470							}
471							}
472	24					70	return;
473							}
474
475							sub stem_killer {
476	805			805	0	1985	my($where, $pre, $with, @list) = @_;
477	1			1		5	use vars qw($RV $R1 $R2 $word);
	1					2
	1					360
478	805					961	my $done = 0;
479	805					1522	foreach my $P (sort { length($b) <=> length($a) } @list) {
	6175					7017
480	3356	100				33367	if($where =~ /$pre$P$/) {
481	26					247	$R2 =~ s/$pre$P$/$with/;
482	26					290	$R1 =~ s/$pre$P$/$with/;
483	26					273	$RV =~ s/$pre$P$/$with/;
484	26					287	$word =~ s/$pre$P$/$with/;
485	26					42	$done = 1;
486	26					61	last;
487							}
488							}
489	805					3369	return $done;
490							}
491
492							sub stem_caching {
493	0			0	1		my $parm_ref;
494	0	0					if (ref $_[0]) {
495	0						$parm_ref = shift;
496							} else {
497	0						$parm_ref = { @_ };
498							}
499	0						my $caching_level = $parm_ref->{-level};
500	0	0					if (defined $caching_level) {
501	0	0					if ($caching_level !~ m/^[012]$/) {
502	0						croak(__PACKAGE__ . "::stem_caching() - Legal values are '0','1' or '2'. '$caching_level' is not a legal value");
503							}
504	0						$Stem_Caching = $caching_level;
505							}
506	0						return $Stem_Caching;
507							}
508
509							sub clear_stem_cache {
510	0			0	1		$Stem_Cache = {};
511							}
512
513							1;
514							__END__