line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Lingua::TFIDF::WordSegmenter::SplitBySpace; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
# ABSTRACT: Simple word segmenter suitable for most European languages |
4
|
|
|
|
|
|
|
|
5
|
1
|
|
|
1
|
|
1047
|
use strict; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
40
|
|
6
|
1
|
|
|
1
|
|
5
|
use warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
31
|
|
7
|
1
|
|
|
1
|
|
966
|
use Smart::Args; |
|
1
|
|
|
|
|
30242
|
|
|
1
|
|
|
|
|
432
|
|
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
# Constructor.
#
# Named arguments (validated by Smart::Args):
#   lower_case          - Bool, defaults to 0.
#   remove_punctuations - Bool, defaults to 0.
#   stop_words          - ArrayRef[Str], defaults to an empty array reference.
#
# Returns a hash-based object blessed into $class that holds the three
# options. The stop_words array reference is stored as given, not copied.
sub new {
    args
        my $class               => 'ClassName',
        my $lower_case          => +{ isa => 'Bool', default => 0 },
        my $remove_punctuations => +{ isa => 'Bool', default => 0 },
        my $stop_words          => +{ isa => 'ArrayRef[Str]', default => [] };

    my %attributes = (
        lower_case          => $lower_case,
        remove_punctuations => $remove_punctuations,
        stop_words          => $stop_words,
    );

    return bless \%attributes, $class;
}
22
|
|
|
|
|
|
|
|
23
|
5
|
|
|
5
|
1
|
62
|
# Read-only accessor: returns the value stored under the "lower_case" key
# of the invocant.
sub lower_case {
    my ($self) = @_;
    return $self->{lower_case};
}
24
|
|
|
|
|
|
|
|
25
|
5
|
|
|
5
|
1
|
17
|
# Read-only accessor: returns the value stored under the
# "remove_punctuations" key of the invocant.
sub remove_punctuations {
    my ($self) = @_;
    return $self->{remove_punctuations};
}
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
# Splits a document into words and returns an iterator over them.
#
# Positional arguments (validated by Smart::Args' args_pos):
#   $document - a string, or a reference to a string.
#
# Processing order:
#   1. split the text on runs of whitespace;
#   2. lower-case every word when lower_case is enabled;
#   3. strip leading/trailing non-word characters (\W) from every word when
#      remove_punctuations is enabled;
#   4. discard empty tokens;
#   5. discard words found in the stop_words list.
#
# Returns a code reference that yields the next word on each call and undef
# once every word has been consumed.
sub segment {
    args_pos
        my $self,
        my $document => 'Ref | Str';

    my $text  = ref $document ? $$document : $document;
    my @words = split /\s+/, $text;

    @words = map { lc } @words if $self->lower_case;

    if ($self->remove_punctuations) {
        s/^\W+|\W+$//g for @words;
    }

    # Two sources of empty tokens are removed here so the iterator never
    # hands out '' as a word: split /\s+/ emits an empty leading field when
    # the text starts with whitespace, and a token consisting solely of
    # non-word characters (e.g. "...") is reduced to '' by the stripping above.
    @words = grep { length } @words;

    my @stop_list = @{ $self->stop_words };
    if (@stop_list) {
        my %is_stop_word = map { ($_ => 1) } @stop_list;
        @words = grep { !exists $is_stop_word{$_} } @words;
    }

    return sub { shift @words };
}
47
|
|
|
|
|
|
|
|
48
|
7
|
|
|
7
|
1
|
26
|
# Read-only accessor: returns the value stored under the "stop_words" key
# of the invocant (the very same reference, not a copy).
sub stop_words {
    my ($self) = @_;
    return $self->{stop_words};
}
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
1; |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
__END__ |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
=pod |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
=encoding UTF-8 |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
=head1 NAME |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
Lingua::TFIDF::WordSegmenter::SplitBySpace - Simple word segmenter suitable for most European languages |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
=head1 VERSION |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
version 0.01 |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
=head1 SYNOPSIS |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
use Lingua::TFIDF::WordSegmenter::SplitBySpace; |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
my $segmenter = Lingua::TFIDF::WordSegmenter::SplitBySpace->new( |
71
|
|
|
|
|
|
|
lower_case => 1, |
72
|
|
|
|
|
|
|
remove_punctuations => 1, |
73
|
|
|
|
|
|
|
stop_words => [qw/i you he she it they a the am are is was were/], |
74
|
|
|
|
|
|
|
); |
75
|
|
|
|
|
|
|
my $iter = $segmenter->segment('Humpty Dumpty sat on wall, ...'); |
76
|
|
|
|
|
|
|
while (defined(my $word = $iter->())) { ... } |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
=head1 DESCRIPTION |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
This class is a simple word segmenter. Like L<Text::TFIDF>, this class segments a sentence into words by splitting it on whitespace. |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
=head1 METHODS |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
=head2 new([ lower_case => 0 ] [, remove_punctuations => 0 ] [, stop_words => [] ]) |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
Constructor. Takes some optional parameters: |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
=over 2 |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
=item lower_case |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
Off by default. Converts all words to lower case. |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
=item remove_punctuations |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
Off by default. Removes punctuation characters (e.g., commas, periods, quotes, question marks and exclamation marks) from the head and tail of each segmented word. Note that punctuation inside a word (e.g., "King's") remains unchanged. |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
=item stop_words |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
Specifies words you want to exclude from the segmented words. This is useful for removing function words. |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
Note that stop word filtering is performed B<after> the C<lower_case> and C<remove_punctuations> options are processed. So, for example, if you enable the C<lower_case> option and want to exclude "I" from the result, you should supply the stop word list as C<['i']>. |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
=back |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
=head2 segment($document | \$document) |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
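Executes word segmentation on the given C<$document> and returns a word iterator: a code reference that yields the next word on each call and C<undef> once all words have been consumed. |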
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
=head1 AUTHOR |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
Koichi SATOH <sekia@cpan.org> |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
This software is Copyright (c) 2014 by Koichi SATOH. |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
This is free software, licensed under: |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
The MIT (X11) License |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
=cut |