File Coverage

blib/lib/AI/NaiveBayes.pm
Criterion     Covered   Total       %
statement          52      52   100.0
branch              3       4    75.0
condition           2       3    66.6
subroutine         11      11   100.0
pod                 3       3   100.0
total              71      73    97.2


line stmt bran cond sub pod time code
1             package AI::NaiveBayes;
2             $AI::NaiveBayes::VERSION = '0.04';
3 3     3   27750 use strict;
  3         4  
  3         78  
4 3     3   9 use warnings;
  3         4  
  3         75  
5 3     3   60 use 5.010;
  3         7  
6 3     3   1103 use AI::NaiveBayes::Classification;
  3         9  
  3         195  
7 3     3   1127 use AI::NaiveBayes::Learner;
  3         7  
  3         108  
8 3     3   19 use Moose;
  3         5  
  3         18  
9 3     3   16475 use MooseX::Storage;
  3         92834  
  3         19  
10              
11 3     3   711 use List::Util qw(max);
  3         4  
  3         1357  
12              
13             with Storage(format => 'Storable', io => 'File');
14              
15             has model => (is => 'ro', isa => 'HashRef[HashRef]', required => 1);
16              
17             sub train {
18 1     1 1 537 my $self = shift;
19 1         38 my $learner = AI::NaiveBayes::Learner->new();
20 1         3 for my $example ( @_ ){
21 2         7 $learner->add_example( %$example );
22             }
23 1         4 return $learner->classifier;
24             }
25              
26              
27             sub classify {
28 4     4 1 1927 my ($self, $newattrs) = @_;
29 4 50       8 $newattrs or die "Missing parameter for classify()";
30              
31 4         104 my $m = $self->model;
32              
33             # Note that we're using the log(prob) here. That's why we add instead of multiply.
34              
35 4         4 my %scores = %{$m->{prior_probs}};
  4         12  
36 4         4 my %features;
37 4         12 while (my ($feature, $value) = each %$newattrs) {
38 30 100       62 next unless exists $m->{attributes}{$feature}; # Ignore totally unseen features
39 9         6 while (my ($label, $attributes) = each %{$m->{probs}}) {
  27         52  
40 18   66     33 my $score = ($attributes->{$feature} || $m->{smoother}{$label})*$value; # P($feature|$label)**$value
41 18         14 $scores{$label} += $score;
42 18         26 $features{$feature}{$label} = $score;
43             }
44             }
45              
46 4         6 rescale(\%scores);
47              
48 4         139 return AI::NaiveBayes::Classification->new( label_sums => \%scores, features => \%features );
49             }
50              
51             sub rescale {
52 4     4 1 4 my ($scores) = @_;
53              
54             # Scale everything back to a reasonable area in logspace (near zero), un-loggify, and normalize
55 4         3 my $total = 0;
56 4         10 my $max = max(values %$scores);
57 4         7 foreach (values %$scores) {
58 8         18 $_ = exp($_ - $max);
59 8         17 $total += $_**2;
60             }
61 4         5 $total = sqrt($total);
62 4         5 foreach (values %$scores) {
63 8         9 $_ /= $total;
64             }
65             }
66              
67              
68             __PACKAGE__->meta->make_immutable;
69              
70             1;
71              
72             =pod
73              
74             =encoding UTF-8
75              
76             =head1 NAME
77              
78             AI::NaiveBayes - A Bayesian classifier
79              
80             =head1 VERSION
81              
82             version 0.04
83              
84             =head1 SYNOPSIS
85              
86             # AI::NaiveBayes objects are created by AI::NaiveBayes::Learner,
87             # but for a quick start you can use the 'train' class method,
88             # a shortcut that uses the default AI::NaiveBayes::Learner settings
89              
90             my $classifier = AI::NaiveBayes->train(
91             {
92             attributes => {
93             sheep => 1, very => 1, valuable => 1, farming => 1
94             },
95             labels => ['farming']
96             },
97             {
98             attributes => {
99             vampires => 1, cannot => 1, see => 1, their => 1,
100             images => 1, mirrors => 1
101             },
102             labels => ['vampire']
103             },
104             );
105              
106             # Classify a feature vector
107             my $result = $classifier->classify({ sheep => 1, farming => 1 });
108            
109             # $result is now an AI::NaiveBayes::Classification object
110            
111             my $best_category = $result->best_category;
112              
113             =head1 DESCRIPTION
114              
115             This module implements the classic "Naive Bayes" machine learning
116             algorithm. It is a low-level class that accepts only pre-computed
117             feature-vectors as input; see L<AI::Classifier::Text> for a text
118             classifier that uses this class.
119              
120             An C<AI::NaiveBayes> classifier object is created from training
121             data by L<AI::NaiveBayes::Learner>. For a quick start you can use
122             the more limited C<train> class method, which trains the
123             classifier with default settings.
124              
125             The classifier object is immutable.
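
The following is a minimal sketch of the equivalent explicit route through
L<AI::NaiveBayes::Learner>; it mirrors what C<train> does internally, and the
C<store>/C<load> calls assume the standard L<MooseX::Storage> file I/O
interface composed by this class (adjust names and paths to your setup):

    use AI::NaiveBayes;
    use AI::NaiveBayes::Learner;

    my $learner = AI::NaiveBayes::Learner->new();
    $learner->add_example(
        attributes => { sheep => 1, very => 1, valuable => 1, farming => 1 },
        labels     => ['farming'],
    );
    $learner->add_example(
        attributes => { vampires => 1, cannot => 1, see => 1, their => 1,
                        images => 1, mirrors => 1 },
        labels     => ['vampire'],
    );
    my $classifier = $learner->classifier;

    # optional persistence via MooseX::Storage (Storable format, file I/O)
    $classifier->store('classifier.storable');
    my $restored = AI::NaiveBayes->load('classifier.storable');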
126              
127             Naive Bayes is a well-studied probabilistic algorithm, often used in
128             automatic text categorization. Compared to other algorithms (kNN,
129             SVM, Decision Trees), it is fairly fast and reasonably competitive in
130             the quality of its results.
131              
132             A paper by Fabrizio Sebastiani provides a really good introduction to
133             text categorization:
134             L<http://faure.iei.pi.cnr.it/~fabrizio/Publications/ACMCS02.pdf>
135              
136             =head1 METHODS
137              
138             =over 4
139              
140             =item new( model => $model )
141              
142             Internal. See L<AI::NaiveBayes::Learner> to learn how to create an C<AI::NaiveBayes>
143             classifier from training data.
144              
145             =item train( LIST of HASHREFS )
146              
147             Shortcut for creating a trained classifier using the default
148             L<AI::NaiveBayes::Learner> settings. Each argument is passed to the
149             C<add_example> method of the underlying L<AI::NaiveBayes::Learner>
150             object in turn; see the example after this method list.
151              
152             =item classify( HASHREF )
153              
154             Classifies a feature-vector of the form:
155              
156             { feature1 => weight1, feature2 => weight2, ... }
157              
158             The result is an C<AI::NaiveBayes::Classification> object.
159              
160             =item rescale
161              
162             Internal
163              
164             =back
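
As a small illustration of C<train> and C<classify> above (a hedged sketch:
the multi-label example assumes, as the array reference suggests, that an
example may carry more than one label):

    # each training example is a hashref with 'attributes' (feature => weight,
    # typically a count) and 'labels' (an array reference)
    my $classifier = AI::NaiveBayes->train(
        {
            attributes => { sheep => 2, farming => 1 },
            labels     => [ 'farming', 'animals' ],
        },
        {
            attributes => { vampires => 1, mirrors => 1 },
            labels     => [ 'vampire' ],
        },
    );

    my $result = $classifier->classify({ sheep => 1, mirrors => 1 });
    my $best   = $result->best_category;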
165              
166             =head1 ATTRIBUTES
167              
168             =over 4
169              
170             =item model
171              
172             Internal
173              
174             =back
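
The C<model> attribute is internal, but for orientation the sketch below shows
the shape that C<classify()> above expects. The keys are inferred from the code
in this file; the values shown are descriptive placeholders, not a stable
interface:

    {
        prior_probs => { $label => $log_prior },                # log P(label)
        probs       => { $label => { $feature => $log_prob } }, # log P(feature|label)
        smoother    => { $label => $log_fallback },             # used for unseen pairs
        attributes  => { $feature => $seen },                   # features seen in training
    }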
175              
176             =head1 THEORY
177              
178             Bayes' Theorem is a way of inverting a conditional probability. It
179             states:
180              
181                      P(y|x) P(x)
182             P(x|y) = -----------
183                          P(y)
184              
185             The notation C<P(x|y)> means "the probability of C<x> given C<y>." See also
186             L<"http://mathforum.org/dr.math/problems/battisfore.03.22.99.html">
187             for a simple but complete example of Bayes' Theorem.
188              
189             In this case, we want to know the probability of a given category given a
190             certain string of words in a document, so we have:
191              
192                              P(words | cat) P(cat)
193             P(cat | words) = ---------------------
194                                     P(words)
195              
196             We have applied Bayes' Theorem because C<P(cat | words)> is a difficult
197             quantity to compute directly, but C<P(words | cat)> and C<P(cat)> are accessible
198             (see below).
199              
200             The greater the expression above, the greater the probability that the given
201             document belongs to the given category. So we want to find the maximum
202             value. We write this as
203              
204                                           P(words | cat) P(cat)
205             Best category =    ArgMax     ---------------------
206                              cat in cats         P(words)
207              
208             Since C<P(words)> doesn't change over the range of categories, we can get rid
209             of it. That's good, because we didn't want to have to compute these values
210             anyway. So our new formula is:
211              
212             Best category =    ArgMax     P(words | cat) P(cat)
213                              cat in cats
214              
215             Finally, we note that if C<w1, w2, ... wn> are the words in the document,
216             then this expression is equivalent to:
217              
218             Best category =    ArgMax     P(w1|cat)*P(w2|cat)*...*P(wn|cat)*P(cat)
219                              cat in cats
220              
221             That's the formula I use in my document categorization code. The last
222             step is the only non-rigorous one in the derivation, and this is the
223             "naive" part of the Naive Bayes technique. It assumes that the
224             probability of each word appearing in a document is unaffected by the
225             presence or absence of each other word in the document. We assume
226             this even though we know this isn't true: for example, the word
227             "iodized" is far more likely to appear in a document that contains the
228             word "salt" than it is to appear in a document that contains the word
229             "subroutine". Luckily, as it turns out, making this assumption even
230             when it isn't true may have little effect on our results, as the
231             following paper by Pedro Domingos argues:
232             L<"http://www.cs.washington.edu/homes/pedrod/mlj97.ps.gz">
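
One implementation note: multiplying many small probabilities would quickly
underflow, so the C<classify()> code above works with logarithms (the model
stores C<log P(cat)> and C<log P(word|cat)>), turning the product into a sum;
the resulting log scores are then shifted, exponentiated and normalized by
C<rescale()>. With per-word weights C<wi>, the decision rule becomes:

    Best category =    ArgMax     log P(cat) + w1*log P(w1|cat) + ... + wn*log P(wn|cat)
                     cat in cats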
233              
234             =head1 SEE ALSO
235              
236             Algorithm::NaiveBayes(3), AI::Classifier::Text(3)
237              
238             =head1 BASED ON
239              
240             Much of the code and description is from L<Algorithm::NaiveBayes>.
241              
242             =head1 AUTHORS
243              
244             =over 4
245              
246             =item *
247              
248             Zbigniew Lukasiak <zlukasiak@opera.com>
249              
250             =item *
251              
252             Tadeusz Sośnierz <tsosnierz@opera.com>
253              
254             =item *
255              
256             Ken Williams <ken@mathforum.org>
257              
258             =back
259              
260             =head1 COPYRIGHT AND LICENSE
261              
262             This software is copyright (c) 2012 by Opera Software ASA.
263              
264             This is free software; you can redistribute it and/or modify it under
265             the same terms as the Perl 5 programming language system itself.
266              
267             =cut
268              
269             __END__
270              
271              
272             # ABSTRACT: A Bayesian classifier
273