File Coverage

blib/lib/Lingua/TFIDF/WordSegmenter/SplitBySpace.pm
Criterion Covered Total %
statement 28 28 100.0
branch 7 8 87.5
condition n/a
subroutine 9 9 100.0
pod 5 5 100.0
total 49 50 98.0


line stmt bran cond sub pod time code
1             package Lingua::TFIDF::WordSegmenter::SplitBySpace;
2              
3             # ABSTRACT: Simple word segmenter suitable for most European languages
4              
5 1     1   1047 use strict;
  1         1  
  1         40  
6 1     1   5 use warnings;
  1         2  
  1         31  
7 1     1   966 use Smart::Args;
  1         30242  
  1         432  
8              
9             sub new {
10 5     5 1 12339 args
11             my $class => 'ClassName',
12             my $lower_case => +{ isa => 'Bool', default => 0 },
13             my $remove_punctuations => +{ isa => 'Bool', default => 0 },
14             my $stop_words => +{ isa => 'ArrayRef[Str]', default => [] };
15              
16 5         1250 bless +{
17             lower_case => $lower_case,
18             remove_punctuations => $remove_punctuations,
19             stop_words => $stop_words,
20             } => $class;
21             }
22              
23 5     5 1 62 sub lower_case { $_[0]->{lower_case} }
24              
25 5     5 1 17 sub remove_punctuations { $_[0]->{remove_punctuations} }
26              
27             sub segment {
28 5     5 1 2599 args_pos
29             my $self,
30             my $document => 'Ref | Str';
31              
32 5 50       453 my @words = split /\s+/, ref $document ? $$document : $document;
33              
34 5 100       19 @words = map lc, @words if $self->lower_case;
35              
36 5 100       23 if ($self->remove_punctuations) {
37 1         127 s/^\W+|\W+$//g for @words;
38             }
39              
40 5 100       7 if (@{ $self->stop_words } != 0) {
  5         15  
41 2         4 my %stop_words = map { ($_ => 1) } @{ $self->stop_words };
  4         11  
  2         5  
42 2         5 @words = grep { not exists $stop_words{$_} } @words;
  52         98  
43             }
44              
45 5     128   31 return sub { shift @words };
  128         647  
46             }
47              
48 7     7 1 26 sub stop_words { $_[0]->{stop_words} }
49              
50             1;
51              
52             __END__
53              
54             =pod
55              
56             =encoding UTF-8
57              
58             =head1 NAME
59              
60             Lingua::TFIDF::WordSegmenter::SplitBySpace - Simple word segmenter suitable for most European languages
61              
62             =head1 VERSION
63              
64             version 0.01
65              
66             =head1 SYNOPSIS
67              
68             use Lingua::TFIDF::WordSegmenter::SplitBySpace;
69            
70             my $segmenter = Lingua::TFIDF::WordSegmenter::SplitBySpace->new(
71             lower_case => 1,
72             remove_punctuations => 1,
73             stop_words => [qw/i you he she it they a the am are is was were/],
74             );
75             my $iter = $segmenter->segment('Humpty Dumpty sat on wall, ...');
76             while (defined(my $word = $iter->())) { ... }
77              
78             =head1 DESCRIPTION
79              
80             This class is a simple word segmenter. Like L<Text::TFIDF>, this class segments a sentence into words by splitting it on whitespace.
81              
82             =head1 METHODS
83              
84             =head2 new([ lower_case => 0 ] [, remove_punctuations => 0 ] [, stop_words => [] ])
85              
86             Constructor. Takes some optional parameters:
87              
88             =over 2
89              
90             =item lower_case
91              
92             Off by default. Converts all words to lower case.
93              
94             =item remove_punctuations
95              
96             Off by default. Removes punctuation characters (e.g., commas, periods, quotes, question marks and exclamation marks) from the head and tail of segmented words. Note that punctuation inside a word (e.g., "King's") remains unchanged.
97              
98             =item stop_words
99              
100             Specifies words you want to exclude from the segmented words. This is useful for removing function words.
101              
102             Note that stop word filtering is performed B<after> the C<lower_case> and C<remove_punctuations> options are processed. So, for example, if you enable the C<lower_case> option and want to exclude "I" from the result, you should supply the stop word list as C<['i']>.
103              
104             =back
105              
106             =head2 segment($document | \$document)
107              
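108             Executes word segmentation on the given C<$document> and returns a word iterator. Each call to the iterator returns the next word, and it returns C<undef> once all words have been consumed.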
109              
110             =head1 AUTHOR
111              
112             Koichi SATOH <sekia@cpan.org>
113              
114             =head1 COPYRIGHT AND LICENSE
115              
116             This software is Copyright (c) 2014 by Koichi SATOH.
117              
118             This is free software, licensed under:
119              
120             The MIT (X11) License
121              
122             =cut