File Coverage

blib/lib/Lingua/TFIDF/WordSegmenter/SplitBySpace.pm

Criterion	Covered	Total	%
statement	28	28	100.0
branch	7	8	87.5
condition			n/a
subroutine	9	9	100.0
pod	5	5	100.0
total	49	50	98.0

line	stmt	bran	sub	pod	time	code
1						package Lingua::TFIDF::WordSegmenter::SplitBySpace;
2
3						# ABSTRACT: Simple word segmenter suitable for most European languages
4
5	1		1		1047	use strict;
	1				1
	1				40
6	1		1		5	use warnings;
	1				2
	1				31
7	1		1		966	use Smart::Args;
	1				30242
	1				432
8
9						sub new {
10	5		5	1	12339	args
11						my $class => 'ClassName',
12						my $lower_case => +{ isa => 'Bool', default => 0 },
13						my $remove_punctuations => +{ isa => 'Bool', default => 0 },
14						my $stop_words => +{ isa => 'ArrayRef[Str]', default => [] };
15
16	5				1250	bless +{
17						lower_case => $lower_case,
18						remove_punctuations => $remove_punctuations,
19						stop_words => $stop_words,
20						} => $class;
21						}
22
23	5		5	1	62	sub lower_case { $_[0]->{lower_case} }
24
25	5		5	1	17	sub remove_punctuations { $_[0]->{remove_punctuations} }
26
27						sub segment {
28	5		5	1	2599	args_pos
29						my $self,
30						my $document => 'Ref \| Str';
31
32	5	50			453	my @words = split /\s+/, ref $document ? $$document : $document;
33
34	5	100			19	@words = map lc, @words if $self->lower_case;
35
36	5	100			23	if ($self->remove_punctuations) {
37	1				127	s/^\W+\|\W+$//g for @words;
38						}
39
40	5	100			7	if (@{ $self->stop_words } != 0) {
	5				15
41	2				4	my %stop_words = map { ($_ => 1) } @{ $self->stop_words };
	4				11
	2				5
42	2				5	@words = grep { not exists $stop_words{$_} } @words;
	52				98
43						}
44
45	5		128		31	return sub { shift @words };
	128				647
46						}
47
48	7		7	1	26	sub stop_words { $_[0]->{stop_words} }
49
50						1;
51
52						__END__
53
54						=pod
55
56						=encoding UTF-8
57
58						=head1 NAME
59
60						Lingua::TFIDF::WordSegmenter::SplitBySpace - Simple word segmenter suitable for most European languages
61
62						=head1 VERSION
63
64						version 0.01
65
66						=head1 SYNOPSIS
67
68						use Lingua::TFIDF::WordSegmenter::SplitBySpace;
69
70						my $segmenter = Lingua::TFIDF::WordSegmenter::SplitBySpace->new(
71						lower_case => 1,
72						remove_punctuations => 1,
73						stop_words => [qw/i you he she it they a the am are is was were/],
74						);
75						my $iter = $segmenter->segment('Humpty Dumpty sat on wall, ...');
76						while (defined(my $word = $iter->())) { ... }
77
78						=head1 DESCRIPTION
79
80						This class is a simple word segmenter. Like L<Text::TFIDF>, this class segments a sentence into words by splitting on spaces.
81
82						=head1 METHODS
83
84						=head2 new([ lower_case => 0 ] [, remove_punctuations => 0 ] [, stop_words => [] ])
85
86						Constructor. Takes some optional parameters:
87
88						=over 2
89
90						=item lower_case
91
92						Off by default. Converts all words to lower case.
93
94						=item remove_punctuations
95
96						Off by default. Removes punctuation characters (e.g., commas, periods, quotes, question marks and exclamation marks) from the head and tail of segmented words. Note that punctuation inside a word (e.g., "King's") remains unchanged.
97
98						=item stop_words
99
100						Specifies words you want to exclude from the segmented words. This is useful for removing function words.
101
102						Note that stop word filtering is performed B<after> the C<lower_case> and C<remove_punctuations> options are applied. So, for example, if you enable the C<lower_case> option and want to exclude "I" from the result, you should supply the stop word list as C<['i']>.
103
104						=back
105
106						=head2 segment($document | \$document)
107
108						Executes word segmentation on the given C<$document> and returns a word iterator.
109
110						=head1 AUTHOR
111
112						Koichi SATOH <sekia@cpan.org>
113
114						=head1 COPYRIGHT AND LICENSE
115
116						This software is Copyright (c) 2014 by Koichi SATOH.
117
118						This is free software, licensed under:
119
120						The MIT (X11) License
121
122						=cut