File Coverage

blib/lib/Lingua/Stem/Cistem.pm

Criterion	Covered	Total	%
statement	124	124	100.0
branch	32	32	100.0
condition	50	59	84.7
subroutine	10	10	100.0
pod	4	4	100.0
total	220	229	96.0

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::Stem::Cistem;
2
3	2			2		150107	use strict;
	2					14
	2					53
4	2			2		10	use warnings;
	2					4
	2					54
5
6	2			2		11	use utf8;
	2					3
	2					9
7
8	2			2		102	use 5.010001;
	2					8
9
10							require Exporter;
11
12							BEGIN {
13	2			2		7	$Lingua::Stem::Cistem::VERSION = '0.08';
14	2					31	@Lingua::Stem::Cistem::ISA = qw(Exporter);
15	2					6	@Lingua::Stem::Cistem::EXPORT = qw();
16	2					5	@Lingua::Stem::Cistem::EXPORT_OK = qw(stem segment stem_robust segment_robust);
17	2					222	%Lingua::Stem::Cistem::EXPORT_TAGS = (
18							'all' => [qw(stem segment stem_robust segment_robust)],
19							'orig' => [qw(stem segment)],
20							'robust' => [qw(stem_robust segment_robust)],
21							);
22							}
23
24							sub stem {
25	43		50	43	1	35417	my $word = shift // '';
26	43					71	my $case_insensitive = shift;
27
28	43					100	my $upper = (ucfirst $word eq $word);
29
30	43					83	$word = lc($word);
31	2			2		20	$word =~ tr/äöü/aou/;
	2					3
	2					31
	43					138
32	43					175	$word =~ s/ß/ss/g;
33
34	43					84	$word =~ s/^ge(.{4,})/$1/;
35
36	43					79	$word =~ s/sch/\$/g;
37	43					101	$word =~ s/ei/\%/g;
38	43					70	$word =~ s/ie/\&/g;
39	43					158	$word =~ s/(.)\1/$1*/g;
40
41	43					131	while(length($word)>3) {
42	28	100	66			293	if( length($word)>5 && ($word =~ s/e[mr]$// \|\| $word =~ s/nd$//) ) {}
		100	66
		100	100
			100
43							elsif( (!($upper) \|\| $case_insensitive) && $word =~ s/t$//) {}
44							elsif( $word =~ s/[esn]$//) {}
45	17					33	else { last; }
46							}
47
48	43					134	$word =~ s/(.)\*/$1$1/g;
49	43					72	$word =~ s/\$/sch/g;
50	43					67	$word =~ s/\%/ei/g;
51	43					82	$word =~ s/\&/ie/g;
52
53	43					267	return $word;
54							}
55
56							sub segment {
57	37		50	37	1	19106	my $word = shift // '';
58	37					61	my $case_insensitive = shift;
59
60	37					85	my $upper = (ucfirst $word eq $word);
61
62	37					73	$word = lc($word);
63
64	37					54	my $original = $word;
65
66	37					87	$word =~ s/sch/\$/g;
67	37					65	$word =~ s/ei/\%/g;
68	37					59	$word =~ s/ie/\&/g;
69	37					275	$word =~ s/(.)\1/$1*/g;
70
71	37					56	my $suffix_length = 0;
72
73	37					96	while(length($word)>3){
74	21	100	66			174	if( length($word)>5 && ($word =~ s/(e[mr])$// \|\| $word =~ s/(nd)$//) ) {
		100	66
		100	100
			66
75	3					10	$suffix_length += 2;
76							}
77							elsif( (!($upper) \|\| $case_insensitive) && $word =~ s/t$//) {
78	5					17	$suffix_length++;
79							}
80							elsif( $word =~ s/([esn])$//) {
81	3					9	$suffix_length++;
82							}
83	10					19	else{ last; }
84							}
85
86	37					71	$word =~ s/(.)\*/$1$1/g;
87
88	37					72	$word =~ s/\$/sch/g;
89	37					52	$word =~ s/\%/ei/g;
90	37					61	$word =~ s/\&/ie/g;
91
92	37					68	my $suffix = '';
93
94	37	100				68	if( $suffix_length ) {
95	11					22	$suffix = substr($original, - $suffix_length);
96							}
97
98	37					214	return ($word, $suffix);
99							}
100
101							sub stem_robust {
102	51		50	51	1	27524	my $word = shift // '';
103	51					85	my $case_insensitive = shift;
104	51					81	my $keep_ge_prefix = shift;
105
106	51					117	my $ucfirst = (ucfirst $word eq $word);
107
108	51					121	$word = lc($word);
109	51					21689	$word =~ tr/äöü/aou/;
110	51					151	$word =~ s/([aou])\N{U+0308}/$1/g; # remove U+0308 COMBINING DIAERESIS
111	51					94	$word =~ s/ß/ss/g;
112
113	51	100				119	$word =~ s/^ge(.{4,})/$1/ unless $keep_ge_prefix;
114
115	51					97	$word =~ s/sch/\N{U+0006}/g; # \N{U+0006} ACK
116	51					78	$word =~ s/ei/\N{U+0007}/g; # \N{U+0007} BEL
117	51					418	$word =~ s/ie/\N{U+0008}/g; # \N{U+0008} BS
118
119	51					179	$word =~ s/(.)\1/$1*/g;
120
121	51					316	my @graphemes = $word =~ m/\X/g;
122	51					99	my $length = scalar @graphemes;
123							#my $length = scalar (($word =~ m/\X/g)); # does not work
124
125	51					122	while($length > 3) {
126	32	100	100			293	if ( $length>5 && ($word =~ s/e[mr]$// \|\| $word =~ s/nd$//) ) {$length -= 2;}
	3	100	100			9
		100	100
			100
127	5					16	elsif ( (!($ucfirst) \|\| $case_insensitive) && $word =~ s/t$//) {$length--;}
128	5					15	elsif ( $word =~ s/[esn]$//) {$length--;}
129	19					37	else { last; }
130							}
131
132	51					102	$word =~ s/(.)\*/$1$1/g;
133
134	51					84	$word =~ s/\N{U+0006}/sch/g; # \N{U+0006} ACK
135	51					76	$word =~ s/\N{U+0007}/ei/g; # \N{U+0007} BEL
136	51					78	$word =~ s/\N{U+0008}/ie/g; # \N{U+0008} BS
137
138	51					264	return $word;
139							}
140
141
142							sub segment_robust {
143	51		50	51	1	27627	my $word = shift // '';
144	51					81	my $case_insensitive = shift;
145	51					62	my $keep_ge_prefix = shift;
146
147	51					109	my $ucfirst = (ucfirst $word eq $word);
148
149	51					98	$word = lc($word);
150	51					160	$word =~ tr/äöü/aou/;
151	51					127	$word =~ s/([aou])\N{U+0308}/$1/g; # remove U+0308 COMBINING DIAERESIS
152	51					81	$word =~ s/ß/ss/g;
153
154	51					73	my $prefix = '';
155	51	100	100			208	if (!$keep_ge_prefix && $word =~ s/^ge(.{4,})/$1/) {
156	2					6	$prefix = 'ge';
157							}
158
159	51					69	my $original = $word;
160
161	51					85	$word =~ s/sch/\N{U+0006}/g; # \N{U+0006} ACK
162	51					69	$word =~ s/ei/\N{U+0007}/g; # \N{U+0007} BEL
163	51					70	$word =~ s/ie/\N{U+0008}/g; # \N{U+0008} BS
164
165	51					872	$word =~ s/(.)\1/$1*/g;
166
167	51					322	my @graphemes = $word =~ m/\X/g;
168	51					89	my $length = scalar @graphemes;
169
170	51					73	my $suffix_length = 0;
171
172	51					110	while($length > 3){
173	32	100	100			265	if( $length > 5 && ($word =~ s/(e[mr])$// \|\| $word =~ s/(nd)$//) ) {
		100	100
		100	100
			100
174	3					5	$suffix_length += 2;
175	3					6	$length -= 2;
176							}
177							elsif( (!($ucfirst) \|\| $case_insensitive) && $word =~ s/t$//) {
178	5					9	$suffix_length++;
179	5					9	$length--;
180							}
181							elsif( $word =~ s/([esn])$//) {
182	5					8	$suffix_length++;
183	5					11	$length--;
184							}
185	19					34	else{ last; }
186							}
187
188	51					92	$word =~ s/(.)\*/$1$1/g;
189
190	51					70	$word =~ s/\N{U+0006}/sch/g; # \N{U+0006} ACK
191	51					64	$word =~ s/\N{U+0007}/ei/g; # \N{U+0007} BEL
192	51					67	$word =~ s/\N{U+0008}/ie/g; # \N{U+0008} BS
193
194	51					78	my $suffix = '';
195
196	51	100				81	if( $suffix_length ) {
197	13					143	$suffix = substr($original, - $suffix_length);
198							}
199
200	51					326	return ($prefix, $word, $suffix);
201							}
202
203							1;
204							__END__