package Data::Classifier::NaiveBayes;
use Moose;
use MooseX::Types::LoadableClass qw(LoadableClass);
use List::Util qw(reduce sum);
use 5.008008;

# Count of trained documents, keyed by category.
has categories => (
    is => 'rw',
    default => sub { {} });

# Need to implement
has thresholds => (
    is => 'rw',
    default => sub { {} });

has tokenizer => (
    is => 'rw',
    lazy_build => 1);

has tokenizer_class => (
    is => 'ro',
    isa => LoadableClass,
    default => 'Data::Classifier::NaiveBayes::Tokenizer',
    coerce => 1);

# Word counts, keyed by word and then by category.
has words => (
    is => 'rw',
    default => sub { {} });

sub _build_tokenizer { $_[0]->tokenizer_class->new }

sub _cat_count {
    my ($self, $category) = @_;
    $self->categories->{$category};
}

# Score every known category against the text and return
# ([category, probability], ...) pairs sorted by ascending probability.
sub _cat_scores {
    my ($self, $text) = @_;

    my $probs = {};

    for my $cat (keys %{$self->categories}) {
        $probs->{$cat} = $self->_text_prop($cat, $text);
    }

    return sort { $a->[1] <=> $b->[1] } map { [$_, $probs->{$_} ] } keys %{$probs};
}

# Naive independence assumption: P(text|category) is the product of the
# weighted per-word probabilities.
sub _doc_prob {
    my ($self, $text, $cat) = @_;

    return reduce { $a * $b } @{$self->tokenizer->words($text, sub{
        my $word = shift;
        return $self->_word_weighted_average($word, $cat);
    })};
}

sub _inc_cat {
    my ($self, $cat) = @_;
    $self->categories->{$cat} ||= 0;
    $self->categories->{$cat} += 1;
}

sub _inc_word {
    my ($self, $word, $cat) = @_;
    $self->words->{$word} ||= {};
    $self->words->{$word}->{$cat} ||= 0;
    $self->words->{$word}->{$cat} += 1;
}

# Bayes' rule without the evidence term: P(category|text) is proportional
# to P(category) * P(text|category), which is enough for ranking categories.
sub _text_prop {
    my ($self, $cat, $text) = @_;
    my $cat_prob = ($self->_cat_count($cat) / $self->_total_count);
    my $doc_prob = $self->_doc_prob($text, $cat);
    return $cat_prob * $doc_prob;
}

sub _total_count {
    my ($self) = @_;
    return sum values %{$self->categories};
}

sub _word_count {
    my ($self, $word, $category) = @_;
    return 0.0 unless $self->words->{$word} && $self->words->{$word}->{$category};
    return sprintf("%.2f", $self->words->{$word}->{$category});
}

sub _word_prob {
    my ($self, $word, $cat ) = @_;
    return 0.0 if $self->_cat_count($cat) == 0;
    return sprintf("%.2f", $self->_word_count($word, $cat) / $self->_cat_count($cat));
}

sub _word_weighted_average {
    my ($self, $word, $cat ) = @_;

    my $weight = 1.0;
    my $assumed_prob = 0.5;

    # calculate current probability
    my $basic_prob = $self->_word_prob($word, $cat);

    # count the number of times this word has appeared in all
    # categories
    my $totals = sum map { $self->_word_count($word, $_) } keys %{$self->categories};

    # the final weighted average
    return ($weight * $assumed_prob + $totals * $basic_prob) / ($weight + $totals);
}

# Pick the category with the highest probability for the text. If no
# category matches, or another category's probability scaled by the
# winner's threshold beats the winner, return the supplied default instead.
sub classify {
    my ($self, $text, $default) = @_;

    my $max_prob = 0.0;
    my $best = undef;

    my @scores = $self->_cat_scores($text);

    for my $score (@scores) {
        my ( $cat, $prob ) = @{$score};
        if ( $prob > $max_prob ) {
            $max_prob = $prob;
            $best = $cat;
        }
    }

    return $default unless $best;
    my $threshold = $self->thresholds->{$best} || 1.0;

    for my $score (@scores) {
        my ( $cat, $prob ) = @{$score};

        next if $cat eq $best;
        return $default if $prob * $threshold > $max_prob;
    }

    return $best;
}

sub train {
    my ( $self, $cat, $string ) = @_;
    $self->tokenizer->words($string, sub{
        $self->_inc_word(shift, $cat);
    });
    $self->_inc_cat($cat);
}

1;

=head1 NAME

Data::Classifier::NaiveBayes

=head1 SYNOPSIS

    my $classifier = Data::Classifier::NaiveBayes->new;

    $classifier->train('token', "Some text to train with");
    print $classifier->classify("Some text to find a match");

=head1 DESCRIPTION

This is a Naive Bayes classifier. The code for this project is largely and
shamelessly based on alexandru's stuff-classifier, originally written in
Ruby.

https://github.com/alexandru/stuff-classifier

The code was ported to Perl and L<Moose>.

For more information, please see the following:

http://bionicspirit.com/blog/2012/02/09/howto-build-naive-bayes-classifier.html

=head1 ATTRIBUTES

=head2 tokenizer

An accessor for the L<Data::Classifier::NaiveBayes::Tokenizer> instance used
to split text into words.
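
The tokenizer is built lazily from C<tokenizer_class>, but a pre-built
instance should also be accepted at construction time. A minimal sketch,
assuming the default tokenizer class is installed:

    my $classifier = Data::Classifier::NaiveBayes->new(
        tokenizer => Data::Classifier::NaiveBayes::Tokenizer->new,
    );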

=head2 tokenizer_class

The class name of the tokenizer to load and instantiate. Defaults to
L<Data::Classifier::NaiveBayes::Tokenizer>.
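
For example, a drop-in replacement class could be named at construction time.
A sketch only; C<My::Whitespace::Tokenizer> is a hypothetical module and would
need to provide the same C<words> interface as the default tokenizer:

    my $classifier = Data::Classifier::NaiveBayes->new(
        tokenizer_class => 'My::Whitespace::Tokenizer',
    );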

=head2 words($hash_ref)

A hash reference of word counts, keyed by word and then by category.
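
After training, the structure looks roughly like this (a sketch with made-up
words and counts):

    {
        'cheap'   => { spam => 2 },
        'meeting' => { ham  => 1 },
    }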

=head2 categories($hash_ref)

A hash reference of per-category counts of trained documents.
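
Its shape mirrors the training calls made so far, for example (made-up counts):

    { spam => 3, ham => 1 }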

=head1 METHODS

=head2 classify($phrase, $default)

Returns the most probable category for the given phrase, or the optional
C<$default> value when no category can be confidently chosen.
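
For example, assuming the classifier has already been trained on a C<spam>
category (the phrase is illustrative):

    my $category = $classifier->classify("Cheap prizes here", 'unknown');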

=head2 train($category, $phrase)

Tokenizes the phrase, counts its words, and associates them with the given
category so that later phrases can be classified against it.
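
For example, training two categories (the category names and phrases are
illustrative):

    $classifier->train('spam', "Cheap watches and amazing prizes");
    $classifier->train('ham',  "Meeting notes for Tuesday's standup");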

=head1 SEE ALSO

L<Moose>

=head1 AUTHOR

Logan Bell, C<< <logie@cpan.org> >>

=head1 COPYRIGHT & LICENSE

Copyright 2012, Logan Bell

This program is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=cut