line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
1
|
|
|
1
|
|
23730
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
50
|
|
2
|
1
|
|
|
1
|
|
6
|
use warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
65
|
|
3
|
|
|
|
|
|
|
|
4
|
|
|
|
|
|
|
package Text::WordCounter; |
5
|
|
|
|
|
|
|
{ |
6
|
|
|
|
|
|
|
$Text::WordCounter::VERSION = '0.001'; |
7
|
|
|
|
|
|
|
} |
8
|
|
|
|
|
|
|
|
9
|
1
|
|
|
1
|
|
1017
|
use namespace::autoclean; |
|
1
|
|
|
|
|
30995
|
|
|
1
|
|
|
|
|
8
|
|
10
|
1
|
|
|
1
|
|
647
|
use Moose; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
use Lingua::ZH::MMSEG; |
13
|
|
|
|
|
|
|
use Unicode::UCD qw(charinfo); |
14
|
|
|
|
|
|
|
use URI::Find; |
15
|
|
|
|
|
|
|
use Lingua::Stem; |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
has stemming => (is => 'rw', isa => 'Int', default => 0); |
18
|
|
|
|
|
|
|
has stopwords => (is => 'ro', isa => 'HashRef', default => sub { {} }); |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
sub is_stop_word { |
21
|
|
|
|
|
|
|
my( $self, $word, $script ) = @_; |
22
|
|
|
|
|
|
|
return 0 if( $script eq 'Han' ); |
23
|
|
|
|
|
|
|
return 1 if exists $self->stopwords->{lc $word}; |
24
|
|
|
|
|
|
|
return length($word) <= 3; |
25
|
|
|
|
|
|
|
} |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
sub normalize { |
28
|
|
|
|
|
|
|
my ($self, $word) = @_; |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
if ($self->stemming) { |
31
|
|
|
|
|
|
|
my $stemmed = Lingua::Stem::stem($word)->[0]; |
32
|
|
|
|
|
|
|
if ($stemmed ne '') { |
33
|
|
|
|
|
|
|
return $stemmed; |
34
|
|
|
|
|
|
|
} |
35
|
|
|
|
|
|
|
} |
36
|
|
|
|
|
|
|
return lc $word |
37
|
|
|
|
|
|
|
} |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
my %char_cache = (); |
40
|
|
|
|
|
|
|
sub split_scripts { |
41
|
|
|
|
|
|
|
my ( $self, $text ) = @_; |
42
|
|
|
|
|
|
|
my @parts; |
43
|
|
|
|
|
|
|
while ( $text =~ /(\X)/g ) { |
44
|
|
|
|
|
|
|
my $part = $1; |
45
|
|
|
|
|
|
|
my $pos = pos( $text ); |
46
|
|
|
|
|
|
|
my $ord = ord $part; |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
unless ($char_cache{$ord}) { |
49
|
|
|
|
|
|
|
if (scalar(keys(%char_cache)) > 5000) { |
50
|
|
|
|
|
|
|
# XXX: Some LRU cache would be more appropriate, but this cleaning |
51
|
|
|
|
|
|
|
# will probably happen very rarely or never, so there's (hopefully) no |
52
|
|
|
|
|
|
|
# need to bother about it too much |
53
|
|
|
|
|
|
|
undef %char_cache; |
54
|
|
|
|
|
|
|
} |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
$char_cache{$ord} = charinfo($ord); |
57
|
|
|
|
|
|
|
} |
58
|
|
|
|
|
|
|
my $charinfo = $char_cache{$ord}; |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
if( ! defined $charinfo ){ |
61
|
|
|
|
|
|
|
warn "$1 does not look like good UTF8 - no charinfo"; |
62
|
|
|
|
|
|
|
next; |
63
|
|
|
|
|
|
|
} |
64
|
|
|
|
|
|
|
my $script = $charinfo->{script}; |
65
|
|
|
|
|
|
|
if( ! defined $script ){ |
66
|
|
|
|
|
|
|
warn "$1 does not look like good UTF8 - no script"; |
67
|
|
|
|
|
|
|
next; |
68
|
|
|
|
|
|
|
} |
69
|
|
|
|
|
|
|
next if $script eq 'Common'; |
70
|
|
|
|
|
|
|
$text=~ /((\p{$script}|[-0-9:])*)/g; |
71
|
|
|
|
|
|
|
$part .= $1; |
72
|
|
|
|
|
|
|
push @parts, { text => $part, script => $script }; |
73
|
|
|
|
|
|
|
} |
74
|
|
|
|
|
|
|
# warn join ' | ', map { $_->{text} } @parts; |
75
|
|
|
|
|
|
|
return @parts; |
76
|
|
|
|
|
|
|
} |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
sub word_count { |
79
|
|
|
|
|
|
|
my ( $self, $text, $features ) = @_; |
80
|
|
|
|
|
|
|
$features ||= {}; |
81
|
|
|
|
|
|
|
for my $part ( $self->split_scripts( $text ) ){ |
82
|
|
|
|
|
|
|
my @words = ( $part->{text} ); |
83
|
|
|
|
|
|
|
if( $part->{script} eq 'Han' ){ |
84
|
|
|
|
|
|
|
@words = mmseg( $part->{text} ); |
85
|
|
|
|
|
|
|
} |
86
|
|
|
|
|
|
|
for my $word ( @words ){ |
87
|
|
|
|
|
|
|
next if $self->is_stop_word( $word, $part->{script} ); |
88
|
|
|
|
|
|
|
$features->{ $self->normalize( $word ) }++; |
89
|
|
|
|
|
|
|
} |
90
|
|
|
|
|
|
|
} |
91
|
|
|
|
|
|
|
return $features; |
92
|
|
|
|
|
|
|
} |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
__PACKAGE__->meta()->make_immutable(); |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
1; |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
# ABSTRACT: counting words in multilingual texts |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
__END__ |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
=pod |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
=head1 NAME |
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
Text::WordCounter - counting words in multilingual texts |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
=head1 VERSION |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
version 0.001 |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
=head1 SYNOPSIS |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
my $counter = Text::WordCounter->new(); |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
my $word_count = $counter->word_count( $text ) |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
=head1 DESCRIPTION |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
It is quite heuristic, for example '-' and digits inside word characters |
122
|
|
|
|
|
|
|
are treated as a word character, see the tests to find out how all the special |
123
|
|
|
|
|
|
|
cases are resolved, |
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
The features parameter should be a hashref and is an accumulator for found |
126
|
|
|
|
|
|
|
features. |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
=head1 ATTRIBUTES |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
=head2 stemming |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
If set stemming via Lingua::Stem is performed on the words. |
133
|
|
|
|
|
|
|
We never managed to make it sanely in multilingual texts. |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
=head2 stopwords |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
A hashref with words to discard. |
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
=head1 INSTANCE METHODS |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
=head2 C<is_stop_word> |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
=head2 C<normalize> |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
Lowercases words and stemms them if the C<stemming> attribute is true. |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
=head2 C<split_scripts> |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
=head2 C<word_count> |
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
Returns a hashref with word counts. |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
=head1 LIMITATIONS |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
From languages that don't use spaces only Chinese is currently supported |
156
|
|
|
|
|
|
|
(using Lingua::ZH::MMSEG). |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
=head1 SEE ALSO |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
__END__ |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
=head1 AUTHORS |
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
=over 4 |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
=item * |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
Zbigniew Lukasiak <zlukasiak@opera.com> |
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
=item * |
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
Tadeusz SoÅnierz, tsosnierz@opera.com |
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
=back |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
This software is Copyright (c) 2012 by Opera Software ASA. |
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
This is free software, licensed under: |
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
The Artistic License 2.0 (GPL Compatible) |
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
=cut |