| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
# SPDX-FileCopyrightText: 2014 Koichi SATOH |
|
2
|
|
|
|
|
|
|
# SPDX-FileCopyrightText: 2026 Wesley Schwengle |
|
3
|
|
|
|
|
|
|
# |
|
4
|
|
|
|
|
|
|
# SPDX-License-Identifier: MIT |
|
5
|
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
package Lingua::TermWeight::WordSegmenter::SplitBySpace; |
|
7
|
|
|
|
|
|
|
our $VERSION = '0.01'; |
|
8
|
|
|
|
|
|
|
# ABSTRACT: Simple word segmenter suitable for most European languages |
|
9
|
|
|
|
|
|
|
|
|
10
|
2
|
|
|
2
|
|
350318
|
use v5.20; |
|
|
2
|
|
|
|
|
7
|
|
|
11
|
2
|
|
|
2
|
|
7
|
use warnings; |
|
|
2
|
|
|
|
|
2
|
|
|
|
2
|
|
|
|
|
122
|
|
|
12
|
2
|
|
|
2
|
|
1106
|
use Object::Pad; |
|
|
2
|
|
|
|
|
15489
|
|
|
|
2
|
|
|
|
|
7
|
|
|
13
|
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
class Lingua::TermWeight::WordSegmenter::SplitBySpace {

    # When true, every token is folded to lower case with lc().
    field $lower_case :accessor :param = 0;

    # When true, leading and trailing runs of non-word characters (\W) are
    # stripped from every token. Characters inside a token are left alone.
    field $remove_punctuations :accessor :param = 0;

    # Array reference of tokens to discard. Tokens are compared by exact
    # string equality after the optional case folding and punctuation
    # stripping above have been applied.
    field $stop_words :accessor :param = [];

    # Split a document into whitespace-separated tokens.
    #
    # $document is either a string or a reference to a string; a reference
    # is dereferenced before splitting.
    #
    # Returns a code reference acting as an iterator: each call removes and
    # returns the next token, and it returns undef once the tokens are
    # exhausted.
    method segment ($document) {
        my $text  = ref $document ? $$document : $document;
        my @words = split /\s+/, $text;

        @words = map { lc } @words if $self->lower_case;

        if ($self->remove_punctuations) {
            s/^\W+|\W+$//g for @words;
        }

        # split /\s+/ produces an empty leading field when the text starts
        # with whitespace, and stripping punctuation turns tokens such as
        # "--" into ''. Neither is a word, so drop zero-length tokens rather
        # than handing them to the caller.
        @words = grep { length } @words;

        # Fetch the list once; the accessor is used (not the field) so the
        # behaviour matches what callers observe through the public API.
        my $stop_list = $self->stop_words;
        if (@{$stop_list}) {
            my %is_stop_word = map { ($_ => 1) } @{$stop_list};
            @words = grep { not exists $is_stop_word{$_} } @words;
        }

        return sub { shift @words };
    }
}
|
37
|
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
1; |
|
39
|
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
__END__ |