line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Lingua::TFIDF; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
# ABSTRACT: Language-independent TF-IDF calculator. |
4
|
|
|
|
|
|
|
|
5
|
1
|
|
|
1
|
|
846
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
39
|
|
6
|
1
|
|
|
1
|
|
5
|
use warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
34
|
|
7
|
1
|
|
|
1
|
|
626
|
use Lingua::TFIDF::Types; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
31
|
|
8
|
1
|
|
|
1
|
|
1837
|
use List::MoreUtils qw/uniq/; |
|
1
|
|
|
|
|
1376
|
|
|
1
|
|
|
|
|
92
|
|
9
|
1
|
|
|
1
|
|
7
|
use List::Util qw/sum/; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
58
|
|
10
|
1
|
|
|
1
|
|
5
|
use Smart::Args; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
687
|
|
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
our $VERSION = 0.01; |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
sub new { |
15
|
1
|
|
|
1
|
1
|
40
|
args |
16
|
|
|
|
|
|
|
my $class => 'ClassName', |
17
|
|
|
|
|
|
|
my $word_counter => +{ isa => 'Lingua::TFIDF::WordCounter', optional => 1 }, |
18
|
|
|
|
|
|
|
my $word_segmenter => 'Lingua::TFIDF::WordSegmenter'; |
19
|
|
|
|
|
|
|
|
20
|
1
|
50
|
|
|
|
117
|
unless (defined $word_counter) { |
21
|
1
|
|
|
|
|
656
|
require Lingua::TFIDF::WordCounter::Simple; |
22
|
1
|
|
|
|
|
7
|
$word_counter = Lingua::TFIDF::WordCounter::Simple->new; |
23
|
|
|
|
|
|
|
} |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
bless +{ |
26
|
1
|
|
|
|
|
5
|
word_counter => $word_counter, |
27
|
|
|
|
|
|
|
word_segmenter => $word_segmenter, |
28
|
|
|
|
|
|
|
} => $class; |
29
|
|
|
|
|
|
|
} |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
sub idf { |
32
|
3
|
|
|
3
|
1
|
2309
|
args |
33
|
|
|
|
|
|
|
my $self, |
34
|
|
|
|
|
|
|
my $documents => 'ArrayRef[Lingua::TFIDF::TermFrequency] | ArrayRef[Str]'; |
35
|
|
|
|
|
|
|
|
36
|
3
|
50
|
|
|
|
637
|
return +{} if @$documents == 0; |
37
|
|
|
|
|
|
|
|
38
|
2
|
|
|
|
|
7
|
my @tfs = ref $documents->[0] |
39
|
3
|
100
|
|
|
|
15
|
? @$documents : map { $self->tf(document => \$_) } @$documents; |
40
|
3
|
|
|
|
|
4
|
my %idf; |
41
|
3
|
|
|
|
|
6
|
for my $word (uniq map { keys %$_ } @tfs) { |
|
6
|
|
|
|
|
138
|
|
42
|
141
|
|
|
|
|
131
|
my $num_documents_including_word = grep { exists $_->{$word} } @tfs; |
|
282
|
|
|
|
|
443
|
|
43
|
141
|
|
|
|
|
333
|
$idf{$word} = log(@tfs / $num_documents_including_word); |
44
|
|
|
|
|
|
|
} |
45
|
3
|
|
|
|
|
38
|
return \%idf; |
46
|
|
|
|
|
|
|
} |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
sub tf { |
49
|
8
|
|
|
8
|
1
|
1818
|
args |
50
|
|
|
|
|
|
|
my $self, |
51
|
|
|
|
|
|
|
my $document => 'Ref | Str', |
52
|
|
|
|
|
|
|
my $normalize => +{ isa => 'Bool', default => 0 }; |
53
|
|
|
|
|
|
|
|
54
|
8
|
|
|
|
|
1141
|
$self->word_counter->clear; |
55
|
|
|
|
|
|
|
|
56
|
8
|
|
|
|
|
35
|
my $iter = $self->word_segmenter->segment($document); |
57
|
8
|
|
|
|
|
21
|
my $counter = $self->word_counter; |
58
|
8
|
|
|
|
|
23
|
while (defined (my $word = $iter->())) { $counter->add_count($word) } |
|
419
|
|
|
|
|
1241
|
|
59
|
|
|
|
|
|
|
|
60
|
8
|
|
|
|
|
25
|
my $tf = $counter->frequencies; |
61
|
8
|
100
|
|
|
|
64
|
return $tf unless $normalize; |
62
|
|
|
|
|
|
|
|
63
|
3
|
|
|
|
|
50
|
my $total_words = sum values %$tf; |
64
|
3
|
|
|
|
|
29
|
+{ map { ($_ => $tf->{$_} / $total_words) } keys %$tf }; |
|
96
|
|
|
|
|
258
|
|
65
|
|
|
|
|
|
|
} |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
sub tf_idf { |
68
|
2
|
|
|
2
|
1
|
4064
|
args |
69
|
|
|
|
|
|
|
my $self, |
70
|
|
|
|
|
|
|
my $documents => 'ArrayRef[Str]', |
71
|
|
|
|
|
|
|
my $normalize => +{ isa => 'Bool', default => 0 }; |
72
|
|
|
|
|
|
|
|
73
|
2
|
50
|
|
|
|
186
|
return +{} if @$documents == 0; |
74
|
|
|
|
|
|
|
|
75
|
4
|
|
|
|
|
13
|
my @tfs = |
76
|
2
|
|
|
|
|
3
|
map { $self->tf(document => \$_, normalize => $normalize) } @$documents; |
77
|
2
|
|
|
|
|
10
|
my $idf = $self->idf(documents => \@tfs); |
78
|
2
|
|
|
|
|
3
|
my @tf_idf; |
79
|
2
|
|
|
|
|
4
|
for my $tf (@tfs) { |
80
|
4
|
|
|
|
|
19
|
push @tf_idf, +{ map { ($_ => $tf->{$_} * $idf->{$_}) } keys %$tf }; |
|
130
|
|
|
|
|
269
|
|
81
|
|
|
|
|
|
|
} |
82
|
2
|
|
|
|
|
23
|
return \@tf_idf; |
83
|
|
|
|
|
|
|
} |
84
|
|
|
|
|
|
|
|
85
|
16
|
|
|
16
|
0
|
79
|
sub word_counter { $_[0]->{word_counter} } |
86
|
|
|
|
|
|
|
|
87
|
8
|
|
|
8
|
0
|
37
|
sub word_segmenter { $_[0]->{word_segmenter} } |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
1; |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
__END__ |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
=pod |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
=encoding UTF-8 |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
=head1 NAME |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
Lingua::TFIDF - Language-independent TF-IDF calculator. |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
=head1 VERSION |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
version 0.01 |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
=head1 SYNOPSIS |
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
use Lingua::TFIDF; |
108
|
|
|
|
|
|
|
use Lingua::TFIDF::WordSegmenter::SplitBySpace; |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
my $tf_idf_calc = Lingua::TFIDF->new( |
111
|
|
|
|
|
|
|
# Choose a word segmenter that suits the language of your documents. |
112
|
|
|
|
|
|
|
word_segmenter => Lingua::TFIDF::WordSegmenter::SplitBySpace->new, |
113
|
|
|
|
|
|
|
); |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
my $document1 = 'Humpty Dumpty sat on a wall...'; |
116
|
|
|
|
|
|
|
my $document2 = 'Remember, remember, the fifth of November...'; |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
my $tf = $tf_idf_calc->tf(document => $document1); |
119
|
|
|
|
|
|
|
# TF of word "Dumpty" in $document1. |
120
|
|
|
|
|
|
|
say $tf->{'Dumpty'}; # 2, if you are referring to the same text as I am. |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
my $idf = $tf_idf_calc->idf(documents => [$document1, $document2]); |
123
|
|
|
|
|
|
|
say $idf->{'Dumpty'}; # log(2/1) ≈ 0.693147 |
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
my $tf_idfs = $tf_idf_calc->tf_idf(documents => [$document1, $document2]); |
126
|
|
|
|
|
|
|
# TF-IDF of word "Dumpty" in $document1. |
127
|
|
|
|
|
|
|
say $tf_idfs->[0]{'Dumpty'}; # 2 * log(2/1) ≈ 1.386294 |
128
|
|
|
|
|
|
|
# Ditto. But in $document2. |
129
|
|
|
|
|
|
|
say $tf_idfs->[1]{'Dumpty'}; # 0 |
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
=head1 DESCRIPTION |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
Quoting L<Wikipedia|http://en.wikipedia.org/wiki/Tf%E2%80%93idf>: |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
tf–idf, short for term frequency–inverse document frequency, is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus. It is often used as a weighting factor in information retrieval and text mining. |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
This module provides features for calculating TF, IDF, and TF-IDF. |
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
=head2 MOTIVATION |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
There are several TF-IDF calculator modules on CPAN already, for example L<Text::TFIDF> and L<Lingua::JA::TFIDF>. So why reinvent the wheel? The reason is language dependency: C<Text::TFIDF> assumes that the words in a sentence are separated by spaces, which is not true of most East Asian languages, and C<Lingua::JA::TFIDF> works only on Japanese text. |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
C<Lingua::TFIDF> solves this problem by separating the word segmentation process from word frequency counting. You can process documents written in any language by providing an appropriate word segmenter (see L</CUSTOM WORD SEGMENTER> below). |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
=head1 METHODS |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
=head2 new(word_segmenter => $segmenter) |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
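Constructor. Takes 1 mandatory parameter, C<word_segmenter>, and 1 optional parameter, C<word_counter>. When C<word_counter> is omitted, a L<Lingua::TFIDF::WordCounter::Simple> instance is created and used. |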
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
=head3 CUSTOM WORD SEGMENTER |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
Although this distribution bundles some language-independent word segmenters, like L<Lingua::TFIDF::WordSegmenter::SplitBySpace>, sometimes language-specific word segmenters are more appropriate. You can pass a custom word segmenter object to the calculator. |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
The word segmenter is a plain Perl object that implements a C<segment> method. The method takes 1 positional argument, C<$document>, which is a string or a B<reference> to a string. It is expected to return a word iterator as a CodeRef. |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
Roughly speaking, the given custom word segmenter will be used like this: |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
my $document = 'foo bar baz'; |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
# Can be called with a reference, like |->segment(\$document)|. |
162
|
|
|
|
|
|
|
# Detecting data type is callee's responsibility. |
163
|
|
|
|
|
|
|
my $iter = $word_segmenter->segment($document); |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
while (defined(my $word = $iter->())) { |
166
|
|
|
|
|
|
|
... |
167
|
|
|
|
|
|
|
} |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
=head2 idf(documents => \@documents) |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
Calculates IDFs. The result is returned as a HashRef whose keys and values are words and their corresponding IDFs, respectively. |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
=head2 tf(document => $document | \$document [, normalize => 0]) |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
Calculates TFs. The result is returned as a HashRef whose keys and values are words and their corresponding TFs, respectively. |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
If the optional parameter C<normalize> is set to true, the TFs are divided by the total number of words in the C<$document>. This is useful when comparing TFs across documents. |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
=head2 tf_idf(documents => \@documents [, normalize => 0]) |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
Calculates TF-IDFs. The result is returned as an ArrayRef of HashRefs. Each HashRef contains the TF-IDF values for the corresponding document. |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
=head1 SEE ALSO |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
=over 2 |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
=item L<Lingua::TFIDF::WordSegmenter::LetterNgram> |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
=item L<Lingua::TFIDF::WordSegmenter::SplitBySpace> |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
=item L<Lingua::TFIDF::WordSegmenter::JA::MeCab> |
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
=back |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
=head1 AUTHOR |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
Koichi SATOH <sekia@cpan.org> |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
This software is Copyright (c) 2014 by Koichi SATOH. |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
This is free software, licensed under: |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
The MIT (X11) License |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
=cut |