line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Lingua::Stem::Any; |
2
|
|
|
|
|
|
|
|
3
|
4
|
|
|
4
|
|
213596
|
use v5.8.1; |
|
4
|
|
|
|
|
45
|
|
|
4
|
|
|
|
|
181
|
|
4
|
4
|
|
|
4
|
|
21
|
use utf8; |
|
4
|
|
|
|
|
7
|
|
|
4
|
|
|
|
|
29
|
|
5
|
4
|
|
|
4
|
|
77
|
use Carp; |
|
4
|
|
|
|
|
11
|
|
|
4
|
|
|
|
|
555
|
|
6
|
4
|
|
|
4
|
|
24
|
use List::Util qw( any first ); |
|
4
|
|
|
|
|
6
|
|
|
4
|
|
|
|
|
626
|
|
7
|
4
|
|
|
4
|
|
3786
|
use Unicode::CaseFold qw( fc ); |
|
4
|
|
|
|
|
5405
|
|
|
4
|
|
|
|
|
364
|
|
8
|
4
|
|
|
4
|
|
4127
|
use Unicode::Normalize qw( NFC ); |
|
4
|
|
|
|
|
10540
|
|
|
4
|
|
|
|
|
361
|
|
9
|
|
|
|
|
|
|
|
10
|
4
|
|
|
4
|
|
3894
|
use Moo; |
|
4
|
|
|
|
|
69508
|
|
|
4
|
|
|
|
|
25
|
|
11
|
4
|
|
|
4
|
|
10889
|
use namespace::clean; |
|
4
|
|
|
|
|
58271
|
|
|
4
|
|
|
|
|
33
|
|
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
our $VERSION = '0.05'; |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
my %language_alias = ( |
16
|
|
|
|
|
|
|
nb => 'no', |
17
|
|
|
|
|
|
|
nn => 'no', |
18
|
|
|
|
|
|
|
); |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
has language => ( |
21
|
|
|
|
|
|
|
is => 'rw', |
22
|
|
|
|
|
|
|
isa => sub { |
23
|
|
|
|
|
|
|
croak "Language is not defined" unless defined $_[0]; |
24
|
|
|
|
|
|
|
croak "Invalid language '$_[0]'" unless _is_language($_[0]); |
25
|
|
|
|
|
|
|
}, |
26
|
|
|
|
|
|
|
coerce => sub { $_[0] && ($language_alias{lc $_[0]} || lc $_[0]) }, |
27
|
|
|
|
|
|
|
trigger => 1, |
28
|
|
|
|
|
|
|
default => 'en', |
29
|
|
|
|
|
|
|
); |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
has source => ( |
32
|
|
|
|
|
|
|
is => 'rw', |
33
|
|
|
|
|
|
|
isa => sub { |
34
|
|
|
|
|
|
|
croak "Source is not defined" unless defined $_[0]; |
35
|
|
|
|
|
|
|
croak "Invalid source '$_[0]'" unless _is_source($_[0]); |
36
|
|
|
|
|
|
|
}, |
37
|
|
|
|
|
|
|
trigger => 1, |
38
|
|
|
|
|
|
|
); |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
has cache => ( |
41
|
|
|
|
|
|
|
is => 'rw', |
42
|
|
|
|
|
|
|
coerce => sub { !!$_[0] }, |
43
|
|
|
|
|
|
|
default => 0, |
44
|
|
|
|
|
|
|
trigger => 1, |
45
|
|
|
|
|
|
|
); |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
has exceptions => ( |
48
|
|
|
|
|
|
|
is => 'rw', |
49
|
|
|
|
|
|
|
isa => sub { |
50
|
|
|
|
|
|
|
croak 'Exceptions must be a hashref' |
51
|
|
|
|
|
|
|
if ref $_[0] ne 'HASH'; |
52
|
|
|
|
|
|
|
croak 'Exceptions must only include hashref values' |
53
|
|
|
|
|
|
|
if any { ref $_ ne 'HASH' } values %{$_[0]}; |
54
|
|
|
|
|
|
|
}, |
55
|
|
|
|
|
|
|
default => sub { {} }, |
56
|
|
|
|
|
|
|
); |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
has normalize => ( |
59
|
|
|
|
|
|
|
is => 'rw', |
60
|
|
|
|
|
|
|
coerce => sub { !!$_[0] }, |
61
|
|
|
|
|
|
|
default => 1, |
62
|
|
|
|
|
|
|
); |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
has casefold => ( |
65
|
|
|
|
|
|
|
is => 'rw', |
66
|
|
|
|
|
|
|
coerce => sub { !!$_[0] }, |
67
|
|
|
|
|
|
|
default => 1, |
68
|
|
|
|
|
|
|
); |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
has _stemmer => ( |
71
|
|
|
|
|
|
|
is => 'ro', |
72
|
|
|
|
|
|
|
builder => '_build_stemmer', |
73
|
|
|
|
|
|
|
clearer => 1, |
74
|
|
|
|
|
|
|
lazy => 1, |
75
|
|
|
|
|
|
|
); |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
has _stemmers => ( |
78
|
|
|
|
|
|
|
is => 'ro', |
79
|
|
|
|
|
|
|
default => sub { {} }, |
80
|
|
|
|
|
|
|
); |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
has _cache_data => ( |
83
|
|
|
|
|
|
|
is => 'rw', |
84
|
|
|
|
|
|
|
default => sub { {} }, |
85
|
|
|
|
|
|
|
); |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
my %sources = ( |
88
|
|
|
|
|
|
|
'Lingua::Stem::Snowball' => { |
89
|
|
|
|
|
|
|
languages => {map { $_ => 1 } qw( |
90
|
|
|
|
|
|
|
da de en es fi fr hu it la nl no pt ro ru sv tr |
91
|
|
|
|
|
|
|
)}, |
92
|
|
|
|
|
|
|
builder => sub { |
93
|
|
|
|
|
|
|
my $language = shift; |
94
|
|
|
|
|
|
|
require Lingua::Stem::Snowball; |
95
|
|
|
|
|
|
|
my $stemmer = Lingua::Stem::Snowball->new( |
96
|
|
|
|
|
|
|
lang => $language, |
97
|
|
|
|
|
|
|
encoding => 'UTF-8', |
98
|
|
|
|
|
|
|
); |
99
|
|
|
|
|
|
|
return { |
100
|
|
|
|
|
|
|
stem => sub { $stemmer->stem(shift) }, |
101
|
|
|
|
|
|
|
language => sub { $stemmer->lang(shift) }, |
102
|
|
|
|
|
|
|
}; |
103
|
|
|
|
|
|
|
}, |
104
|
|
|
|
|
|
|
}, |
105
|
|
|
|
|
|
|
'Lingua::Stem::UniNE' => { |
106
|
|
|
|
|
|
|
languages => {map { $_ => 1 } qw( |
107
|
|
|
|
|
|
|
bg cs de fa |
108
|
|
|
|
|
|
|
)}, |
109
|
|
|
|
|
|
|
builder => sub { |
110
|
|
|
|
|
|
|
my $language = shift; |
111
|
|
|
|
|
|
|
require Lingua::Stem::UniNE; |
112
|
|
|
|
|
|
|
my $stemmer = Lingua::Stem::UniNE->new(language => $language); |
113
|
|
|
|
|
|
|
return { |
114
|
|
|
|
|
|
|
stem => sub { $stemmer->stem(shift) }, |
115
|
|
|
|
|
|
|
language => sub { $stemmer->language(shift) }, |
116
|
|
|
|
|
|
|
}; |
117
|
|
|
|
|
|
|
}, |
118
|
|
|
|
|
|
|
}, |
119
|
|
|
|
|
|
|
'Lingua::Stem' => { |
120
|
|
|
|
|
|
|
languages => {map { $_ => 1 } qw( |
121
|
|
|
|
|
|
|
da de en fr gl it no pt ru sv |
122
|
|
|
|
|
|
|
)}, |
123
|
|
|
|
|
|
|
builder => sub { |
124
|
|
|
|
|
|
|
my $language = shift; |
125
|
|
|
|
|
|
|
require Lingua::Stem; |
126
|
|
|
|
|
|
|
my $stemmer = Lingua::Stem->new(-locale => $language); |
127
|
|
|
|
|
|
|
return { |
128
|
|
|
|
|
|
|
stem => sub { @{ $stemmer->stem(shift) }[0] }, |
129
|
|
|
|
|
|
|
language => sub { $stemmer->set_locale(shift) }, |
130
|
|
|
|
|
|
|
}; |
131
|
|
|
|
|
|
|
}, |
132
|
|
|
|
|
|
|
}, |
133
|
|
|
|
|
|
|
'Lingua::Stem::Patch' => { |
134
|
|
|
|
|
|
|
languages => {map { $_ => 1 } qw( |
135
|
|
|
|
|
|
|
eo io pl |
136
|
|
|
|
|
|
|
)}, |
137
|
|
|
|
|
|
|
builder => sub { |
138
|
|
|
|
|
|
|
my $language = shift; |
139
|
|
|
|
|
|
|
require Lingua::Stem::Patch; |
140
|
|
|
|
|
|
|
my $stemmer = Lingua::Stem::Patch->new(language => $language); |
141
|
|
|
|
|
|
|
return { |
142
|
|
|
|
|
|
|
stem => sub { $stemmer->stem(shift) }, |
143
|
|
|
|
|
|
|
language => sub { $stemmer->language(shift) }, |
144
|
|
|
|
|
|
|
}; |
145
|
|
|
|
|
|
|
}, |
146
|
|
|
|
|
|
|
}, |
147
|
|
|
|
|
|
|
); |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
my %languages = map { %{$_->{languages}} } values %sources; |
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
my @source_order = qw( |
152
|
|
|
|
|
|
|
Lingua::Stem::Snowball |
153
|
|
|
|
|
|
|
Lingua::Stem::UniNE |
154
|
|
|
|
|
|
|
Lingua::Stem |
155
|
|
|
|
|
|
|
Lingua::Stem::Patch |
156
|
|
|
|
|
|
|
); |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
# functions |
159
|
|
|
|
|
|
|
|
160
|
91
|
|
|
91
|
|
2517
|
sub _is_language { exists $languages{ $_[0] } } |
161
|
37
|
|
|
37
|
|
892
|
sub _is_source { exists $sources{ $_[0] } } |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
# methods |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
sub BUILD { |
166
|
9
|
|
|
9
|
0
|
130
|
my ($self) = @_; |
167
|
|
|
|
|
|
|
|
168
|
9
|
|
|
|
|
25
|
$self->_trigger_language; |
169
|
|
|
|
|
|
|
} |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
# the stemmer is cleared whenever a language or source is updated |
172
|
|
|
|
|
|
|
sub _trigger_language { |
173
|
96
|
|
|
96
|
|
1023
|
my ($self) = @_; |
174
|
|
|
|
|
|
|
|
175
|
96
|
|
|
|
|
1881
|
$self->_clear_stemmer; |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
# keep current source if it supports this language |
178
|
96
|
100
|
100
|
|
|
3362
|
return if $self->source |
179
|
|
|
|
|
|
|
&& $sources{$self->source}{languages}{$self->language}; |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
# use the first supported source for this language |
182
|
|
|
|
|
|
|
$self->source( |
183
|
53
|
|
|
53
|
|
1808
|
first { $sources{$_}{languages}{$self->language} } @source_order |
184
|
31
|
|
|
|
|
3225
|
); |
185
|
|
|
|
|
|
|
} |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
sub _trigger_source { |
188
|
36
|
|
|
36
|
|
426
|
my ($self) = @_; |
189
|
|
|
|
|
|
|
|
190
|
36
|
|
|
|
|
703
|
$self->_clear_stemmer; |
191
|
|
|
|
|
|
|
} |
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
# the stemmer is built lazily on first use |
194
|
|
|
|
|
|
|
sub _build_stemmer { |
195
|
65
|
|
|
65
|
|
4177
|
my ($self) = @_; |
196
|
|
|
|
|
|
|
|
197
|
65
|
100
|
|
|
|
1394
|
croak sprintf "Invalid source '%s' for language '%s'" => ( |
198
|
|
|
|
|
|
|
$self->source, $self->language |
199
|
|
|
|
|
|
|
) unless $sources{$self->source}{languages}{$self->language}; |
200
|
|
|
|
|
|
|
|
201
|
64
|
|
66
|
|
|
3470
|
my $stemmer |
202
|
|
|
|
|
|
|
= $self->_stemmers->{$self->source} |
203
|
|
|
|
|
|
|
||= $sources{$self->source}{builder}( $self->language ); |
204
|
|
|
|
|
|
|
|
205
|
64
|
|
|
|
|
1767
|
$stemmer->{language}( $self->language ); |
206
|
|
|
|
|
|
|
|
207
|
64
|
|
|
|
|
3762
|
return $stemmer; |
208
|
|
|
|
|
|
|
} |
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
sub _get_stem { |
211
|
189
|
|
|
189
|
|
353
|
my ($self, $word) = @_; |
212
|
189
|
|
|
|
|
5027
|
my $exceptions = $self->exceptions->{$self->language}; |
213
|
|
|
|
|
|
|
|
214
|
189
|
50
|
|
|
|
8387
|
return $word unless $word; |
215
|
|
|
|
|
|
|
|
216
|
189
|
50
|
|
1
|
|
4045
|
$word = fc $word if $self->casefold; |
|
1
|
|
|
|
|
520
|
|
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
14
|
|
217
|
189
|
50
|
|
|
|
34523
|
$word = NFC $word if $self->normalize; |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
# get from exceptions |
220
|
189
|
100
|
66
|
|
|
4559
|
return $exceptions->{$word} |
221
|
|
|
|
|
|
|
if $exceptions |
222
|
|
|
|
|
|
|
&& exists $exceptions->{$word}; |
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
# stem without caching |
225
|
185
|
100
|
|
|
|
4180
|
return $self->_stemmer->{stem}($word) |
226
|
|
|
|
|
|
|
unless $self->cache; |
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
# get from cache |
229
|
6
|
100
|
|
|
|
175
|
return $self->_cache_data->{$self->source}{$self->language}{$word} |
230
|
|
|
|
|
|
|
if exists $self->_cache_data->{$self->source}{$self->language}{$word}; |
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
# stem and add to cache |
233
|
4
|
|
|
|
|
218
|
return $self->_cache_data->{$self->source}{$self->language}{$word} |
234
|
|
|
|
|
|
|
= $self->_stemmer->{stem}($word); |
235
|
|
|
|
|
|
|
} |
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
sub stem { |
238
|
85
|
|
|
85
|
1
|
34452
|
my $self = shift; |
239
|
|
|
|
|
|
|
|
240
|
85
|
100
|
|
|
|
227
|
return map { $self->_get_stem($_) } @_ |
|
15
|
|
|
|
|
5471
|
|
241
|
|
|
|
|
|
|
if wantarray; |
242
|
|
|
|
|
|
|
|
243
|
75
|
100
|
|
|
|
412
|
return $self->_get_stem(pop) |
244
|
|
|
|
|
|
|
if @_; |
245
|
|
|
|
|
|
|
|
246
|
2
|
|
|
|
|
9
|
return; |
247
|
|
|
|
|
|
|
} |
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
sub stem_in_place { |
250
|
50
|
|
|
50
|
1
|
81682
|
my ($self, $words) = @_; |
251
|
|
|
|
|
|
|
|
252
|
50
|
50
|
|
|
|
175
|
croak 'Argument to stem_in_place() must be an arrayref' |
253
|
|
|
|
|
|
|
if ref $words ne 'ARRAY'; |
254
|
|
|
|
|
|
|
|
255
|
50
|
|
|
|
|
100
|
for my $word (@$words) { |
256
|
101
|
|
|
|
|
8016
|
$word = $self->_get_stem($word); |
257
|
|
|
|
|
|
|
} |
258
|
|
|
|
|
|
|
|
259
|
50
|
|
|
|
|
13506
|
return; |
260
|
|
|
|
|
|
|
} |
261
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
sub languages { |
263
|
3
|
|
|
3
|
1
|
1683
|
my ($self, $source) = @_; |
264
|
3
|
|
|
|
|
3
|
my @languages; |
265
|
|
|
|
|
|
|
|
266
|
3
|
100
|
66
|
|
|
18
|
if ($source && $sources{$source}) { |
|
|
50
|
|
|
|
|
|
267
|
1
|
|
|
|
|
2
|
@languages = sort keys %{$sources{$source}{languages}}; |
|
1
|
|
|
|
|
12
|
|
268
|
|
|
|
|
|
|
} |
269
|
|
|
|
|
|
|
elsif (!$source) { |
270
|
2
|
|
|
|
|
27
|
@languages = sort keys %languages; |
271
|
|
|
|
|
|
|
} |
272
|
|
|
|
|
|
|
|
273
|
3
|
|
|
|
|
25
|
return @languages; |
274
|
|
|
|
|
|
|
} |
275
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
sub sources { |
277
|
3
|
|
|
3
|
1
|
6
|
my ($self, $language) = @_; |
278
|
|
|
|
|
|
|
|
279
|
3
|
100
|
|
|
|
14
|
return @source_order unless $language; |
280
|
|
|
|
|
|
|
|
281
|
4
|
50
|
|
|
|
25
|
return grep { |
282
|
1
|
|
|
|
|
2
|
$sources{$_} && $sources{$_}{languages}{$language} |
283
|
|
|
|
|
|
|
} @source_order; |
284
|
|
|
|
|
|
|
} |
285
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
sub clear_cache { |
287
|
2
|
|
|
2
|
1
|
858
|
my ($self) = @_; |
288
|
|
|
|
|
|
|
|
289
|
2
|
|
|
|
|
40
|
$self->_cache_data( {} ); |
290
|
|
|
|
|
|
|
} |
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
sub _trigger_cache { |
293
|
4
|
|
|
4
|
|
65
|
my ($self) = @_; |
294
|
|
|
|
|
|
|
|
295
|
4
|
100
|
|
|
|
97
|
if ( !$self->cache ) { |
296
|
1
|
|
|
|
|
11
|
$self->clear_cache; |
297
|
|
|
|
|
|
|
} |
298
|
|
|
|
|
|
|
} |
299
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
1; |
301
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
__END__ |