File Coverage

blib/lib/Lingua/TFIDF.pm

Criterion	Covered	Total	%
statement	57	57	100.0
branch	7	10	70.0
condition			n/a
subroutine	12	12	100.0
pod	4	6	66.6
total	80	85	94.1

line	stmt	bran	sub	pod	time	code
1						package Lingua::TFIDF;
2
3						# ABSTRACT: Language-independent TF-IDF calculator.
4
5	1		1		846	use strict;
	1				2
	1				39
6	1		1		5	use warnings;
	1				2
	1				34
7	1		1		626	use Lingua::TFIDF::Types;
	1				3
	1				31
8	1		1		1837	use List::MoreUtils qw/uniq/;
	1				1376
	1				92
9	1		1		7	use List::Util qw/sum/;
	1				2
	1				58
10	1		1		5	use Smart::Args;
	1				2
	1				687
11
12						our $VERSION = 0.01;
13
14						sub new {
15	1		1	1	40	args
16						my $class => 'ClassName',
17						my $word_counter => +{ isa => 'Lingua::TFIDF::WordCounter', optional => 1 },
18						my $word_segmenter => 'Lingua::TFIDF::WordSegmenter';
19
20	1	50			117	unless (defined $word_counter) {
21	1				656	require Lingua::TFIDF::WordCounter::Simple;
22	1				7	$word_counter = Lingua::TFIDF::WordCounter::Simple->new;
23						}
24
25						bless +{
26	1				5	word_counter => $word_counter,
27						word_segmenter => $word_segmenter,
28						} => $class;
29						}
30
31						sub idf {
32	3		3	1	2309	args
33						my $self,
34						my $documents => 'ArrayRef[Lingua::TFIDF::TermFrequency] \| ArrayRef[Str]';
35
36	3	50			637	return +{} if @$documents == 0;
37
38	2				7	my @tfs = ref $documents->[0]
39	3	100			15	? @$documents : map { $self->tf(document => \$_) } @$documents;
40	3				4	my %idf;
41	3				6	for my $word (uniq map { keys %$_ } @tfs) {
	6				138
42	141				131	my $num_documents_including_word = grep { exists $_->{$word} } @tfs;
	282				443
43	141				333	$idf{$word} = log(@tfs / $num_documents_including_word);
44						}
45	3				38	return \%idf;
46						}
47
48						sub tf {
49	8		8	1	1818	args
50						my $self,
51						my $document => 'Ref \| Str',
52						my $normalize => +{ isa => 'Bool', default => 0 };
53
54	8				1141	$self->word_counter->clear;
55
56	8				35	my $iter = $self->word_segmenter->segment($document);
57	8				21	my $counter = $self->word_counter;
58	8				23	while (defined (my $word = $iter->())) { $counter->add_count($word) }
	419				1241
59
60	8				25	my $tf = $counter->frequencies;
61	8	100			64	return $tf unless $normalize;
62
63	3				50	my $total_words = sum values %$tf;
64	3				29	+{ map { ($_ => $tf->{$_} / $total_words) } keys %$tf };
	96				258
65						}
66
67						sub tf_idf {
68	2		2	1	4064	args
69						my $self,
70						my $documents => 'ArrayRef[Str]',
71						my $normalize => +{ isa => 'Bool', default => 0 };
72
73	2	50			186	return +{} if @$documents == 0;
74
75	4				13	my @tfs =
76	2				3	map { $self->tf(document => \$_, normalize => $normalize) } @$documents;
77	2				10	my $idf = $self->idf(documents => \@tfs);
78	2				3	my @tf_idf;
79	2				4	for my $tf (@tfs) {
80	4				19	push @tf_idf, +{ map { ($_ => $tf->{$_} * $idf->{$_}) } keys %$tf };
	130				269
81						}
82	2				23	return \@tf_idf;
83						}
84
85	16		16	0	79	sub word_counter { $_[0]->{word_counter} }
86
87	8		8	0	37	sub word_segmenter { $_[0]->{word_segmenter} }
88
89						1;
90
91						__END__
92
93						=pod
94
95						=encoding UTF-8
96
97						=head1 NAME
98
99						Lingua::TFIDF - Language-independent TF-IDF calculator.
100
101						=head1 VERSION
102
103						version 0.01
104
105						=head1 SYNOPSIS
106
107						use Lingua::TFIDF;
108						use Lingua::TFIDF::WordSegmenter::SplitBySpace;
109
110						my $tf_idf_calc = Lingua::TFIDF->new(
111						# Use a word segmenter appropriate for the language of your documents.
112						word_segmenter => Lingua::TFIDF::WordSegmenter::SplitBySpace->new,
113						);
114
115						my $document1 = 'Humpty Dumpty sat on a wall...';
116						my $document2 = 'Remember, remember, the fifth of November...';
117
118						my $tf = $tf_idf_calc->tf(document => $document1);
119						# TF of word "Dumpty" in $document1.
120						say $tf->{'Dumpty'}; # 2, if you are referring to the same text as I am.
121
122						my $idf = $tf_idf_calc->idf(documents => [$document1, $document2]);
123						say $idf->{'Dumpty'}; # log(2/1) ≒ 0.693147
124
125						my $tf_idfs = $tf_idf_calc->tf_idf(documents => [$document1, $document2]);
126						# TF-IDF of word "Dumpty" in $document1.
127						say $tf_idfs->[0]{'Dumpty'}; # 2 log(2/1) ≒ 1.386294
128						# Ditto. But in $document2.
129						say $tf_idfs->[1]{'Dumpty'}; # 0
130
131						=head1 DESCRIPTION
132
133						Quoting L<Wikipedia\|http://en.wikipedia.org/wiki/Tf%E2%80%93idf>:
134
135						tf–idf, short for term frequency–inverse document frequency, is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus. It is often used as a weighting factor in information retrieval and text mining.
136
137						This module provides features for calculating TF, IDF and TF-IDF.
138
139						=head2 MOTIVATION
140
141						There are several TF-IDF calculator modules on CPAN already, for example L<Text::TFIDF> and L<Lingua::JA::TFIDF>. So why reinvent the wheel? The reason is language dependency: C<Text::TFIDF> assumes that the words in a sentence are separated by spaces. This assumption does not hold for most East Asian languages. And C<Lingua::JA::TFIDF> works only on Japanese text.
142
143						C<Lingua::TFIDF> solves this problem by separating the word segmentation process from word frequency counting. You can process documents written in any language by providing an appropriate word segmenter (see L</CUSTOM WORD SEGMENTER> below).
144
145						=head1 METHODS
146
147						=head2 new(word_segmenter => $segmenter)
148
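149						Constructor. Takes 1 mandatory parameter C<word_segmenter> and 1 optional parameter C<word_counter>. If C<word_counter> is omitted, a L<Lingua::TFIDF::WordCounter::Simple> instance is used.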
150
151						=head3 CUSTOM WORD SEGMENTER
152
153						Although this distribution bundles some language-independent word segmenters, like L<Lingua::TFIDF::WordSegmenter::SplitBySpace>, sometimes language-specific word segmenters are more appropriate. You can pass a custom word segmenter object to the calculator.
154
155						The word segmenter is a plain Perl object that implements a C<segment> method. The method takes 1 positional argument C<$document>, which is a string or a B<reference> to a string. It is expected to return a word iterator as a CodeRef.
156
157						Roughly speaking, the given custom word segmenter will be used like this:
158
159						my $document = 'foo bar baz';
160
161						# Can be called with a reference, like \|->segment(\$document)\|.
162						# Detecting data type is callee's responsibility.
163						my $iter = $word_segmenter->segment($document);
164
165						while (defined(my $word = $iter->())) {
166						...
167						}
168
169						=head2 idf(documents => \@documents)
170
171						Calculates IDFs. The result is returned as a HashRef whose keys and values are words and their corresponding IDFs, respectively.
172
173						=head2 tf(document => $document \| \$document [, normalize => 0])
174
175						Calculates TFs. The result is returned as a HashRef whose keys and values are words and their corresponding TFs, respectively.
176
177						If the optional parameter C<normalize> is set to true, the TFs are divided by the number of words in the C<$document>. This is useful when comparing TFs with those of other documents.
178
179						=head2 tf_idf(documents => \@documents [, normalize => 0])
180
181						Calculates TF-IDFs. The result is returned as an ArrayRef of HashRefs. Each HashRef contains the TF-IDF values for the corresponding document.
182
183						=head1 SEE ALSO
184
185						=over 2
186
187						=item L<Lingua::TFIDF::WordSegmenter::LetterNgram>
188
189						=item L<Lingua::TFIDF::WordSegmenter::SplitBySpace>
190
191						=item L<Lingua::TFIDF::WordSegmenter::JA::MeCab>
192
193						=back
194
195						=head1 AUTHOR
196
197						Koichi SATOH <sekia@cpan.org>
198
199						=head1 COPYRIGHT AND LICENSE
200
201						This software is Copyright (c) 2014 by Koichi SATOH.
202
203						This is free software, licensed under:
204
205						The MIT (X11) License
206
207						=cut