File Coverage

blib/lib/AI/NaiveBayes.pm
Criterion     Covered   Total       %
statement          10      12    83.3
branch            n/a
condition         n/a
subroutine          4       4   100.0
pod               n/a
total              14      16    87.5


line stmt bran cond sub pod time code
1             package AI::NaiveBayes;
2             $AI::NaiveBayes::VERSION = '0.02';
3 2     2   45131 use strict;
  2         4  
  2         69  
4 2     2   9 use warnings;
  2         3  
  2         51  
5 2     2   37 use 5.010;
  2         6  
  2         68  
6 2     2   1039 use AI::NaiveBayes::Classification;
  0            
  0            
7             use AI::NaiveBayes::Learner;
8             use Moose;
9             use MooseX::Storage;
10              
11             use List::Util qw(max);
12              
13             with Storage(format => 'Storable', io => 'File');
14              
15             has model => (is => 'ro', isa => 'HashRef[HashRef]', required => 1);
16              
17             sub train {
18                 my $self = shift;
19                 my $learner = AI::NaiveBayes::Learner->new();
20                 for my $example ( @_ ){
21                     $learner->add_example( %$example );
22                 }
23                 return $learner->classifier;
24             }
25              
26              
27             sub classify {
28                 my ($self, $newattrs) = @_;
29                 $newattrs or die "Missing parameter for classify()";
30              
31                 my $m = $self->model;
32              
33                 # Note that we're using the log(prob) here. That's why we add instead of multiply.
34              
35                 my %scores = %{$m->{prior_probs}};
36                 my %features;
37                 while (my ($feature, $value) = each %$newattrs) {
38                     next unless exists $m->{attributes}{$feature}; # Ignore totally unseen features
39                     while (my ($label, $attributes) = each %{$m->{probs}}) {
40                         my $score = ($attributes->{$feature} || $m->{smoother}{$label})*$value; # P($feature|$label)**$value
41                         $scores{$label} += $score;
42                         $features{$feature}{$label} = $score;
43                     }
44                 }
45              
46                 rescale(\%scores);
47              
48                 return AI::NaiveBayes::Classification->new( label_sums => \%scores, features => \%features );
49             }
50              
51             sub rescale {
52                 my ($scores) = @_;
53              
54                 # Scale everything back to a reasonable area in logspace (near zero), un-loggify, and normalize
55                 my $total = 0;
56                 my $max = max(values %$scores);
57                 foreach (values %$scores) {
58                     $_ = exp($_ - $max);
59                     $total += $_**2;
60                 }
61                 $total = sqrt($total);
62                 foreach (values %$scores) {
63                     $_ /= $total;
64                 }
65             }
66              
67              
68             __PACKAGE__->meta->make_immutable;
69              
70             1;
71              
72             =pod
73              
74             =encoding UTF-8
75              
76             =head1 NAME
77              
78             AI::NaiveBayes - A Bayesian classifier
79              
80             =head1 VERSION
81              
82             version 0.02
83              
84             =head1 SYNOPSIS
85              
86                 # AI::NaiveBayes objects are created by AI::NaiveBayes::Learner
87                 # but for quick start you can use the 'train' class method
88                 # that is a shortcut using default AI::NaiveBayes::Learner settings
89              
90                 my $classifier = AI::NaiveBayes->train(
91                     {
92                         attributes => _hash(qw(sheep very valuable farming)),
93                         labels => ['farming']
94                     },
95                     {
96                         attributes => _hash(qw(vampires cannot see their images mirrors)),
97                         labels => ['vampire']
98                     },
99                 );
100              
101                 # Classify a feature vector
102                 my $result = $classifier->classify({sheep => 1, valuable => 1, farming => 1});
103            
104                 # $result is now an AI::NaiveBayes::Classification object
105            
106                 my $best_category = $result->best_category;
107              
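The C<_hash> helper used in the synopsis is not part of this distribution; it
only stands for whatever code builds the attribute hash that C<train> expects.
A minimal sketch of such a helper, assuming every word should simply get a
weight of 1:

    # Hypothetical helper: turn a list of words into { word => 1, ... }
    sub _hash { return { map { $_ => 1 } @_ } }
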
108             =head1 DESCRIPTION
109              
110             This module implements the classic "Naive Bayes" machine learning
111             algorithm. This is a low-level class that accepts only pre-computed
112             feature vectors as input; see L<AI::Classifier::Text> for a text
113             classifier that uses this class.
114              
115             An C<AI::NaiveBayes> classifier object is created out of training
116             data by L<AI::NaiveBayes::Learner>. For a quick start you can use
117             the limited C<train> class method, which trains the classifier
118             with default L<AI::NaiveBayes::Learner> settings.
119              
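For example, the non-shortcut path goes through the learner directly. A minimal
sketch, assuming the C<add_example> and C<classifier> calls used by the C<train>
source shown above:

    use AI::NaiveBayes::Learner;

    my $learner = AI::NaiveBayes::Learner->new();

    # Each example is an attribute hash plus one or more labels
    $learner->add_example(
        attributes => { sheep => 1, valuable => 1, farming => 1 },
        labels     => ['farming'],
    );
    $learner->add_example(
        attributes => { vampires => 1, mirrors => 1 },
        labels     => ['vampire'],
    );

    # Build the AI::NaiveBayes classifier from the accumulated examples
    my $classifier = $learner->classifier;
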
120             The classifier object is immutable.
121              
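Because the class composes L<MooseX::Storage> (the C<with Storage(...)> line in
the source above), a trained classifier can also be written to disk and read
back. A minimal sketch, assuming the standard C<store> and C<load> methods
provided by that storage role; the file name is only an example:

    # Save the trained classifier to a file (Storable format)
    $classifier->store('naive_bayes.model');

    # ... and later restore it, e.g. in another process
    my $restored = AI::NaiveBayes->load('naive_bayes.model');
    my $result   = $restored->classify({ sheep => 1, farming => 1 });
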
122             Naive Bayes is a well-studied probabilistic algorithm often used in
123             automatic text categorization. Compared to other algorithms (kNN,
124             SVM, Decision Trees), it's pretty fast and reasonably competitive in
125             the quality of its results.
126              
127             A paper by Fabrizio Sebastiani provides a really good introduction to
128             text categorization:
129             L<http://faure.iei.pi.cnr.it/~fabrizio/Publications/ACMCS02.pdf>
130              
131             =head1 METHODS
132              
133             =over 4
134              
135             =item new( model => $model )
136              
137             Internal. See L<AI::NaiveBayes::Learner> to learn how to create an
138             C<AI::NaiveBayes> classifier from training data.
139              
140             =item train( LIST of HASHREFS )
141              
142             Shortcut for creating a trained classifier with the default
143             L<AI::NaiveBayes::Learner> settings.
144             Arguments are passed to the C<add_example> method of the L<AI::NaiveBayes::Learner>
145             object one by one.
146              
147             =item classify( HASHREF )
148              
149             Classifies a feature vector of the form:
150              
151                 { feature1 => weight1, feature2 => weight2, ... }
152              
153             The result is an C<AI::NaiveBayes::Classification> object; see the example after this list.
154              
155             =item rescale
156              
157             Internal
158              
159             =back
160              
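For illustration, here is how C<classify> might be called on the classifier
trained in the SYNOPSIS; the feature weights are arbitrary and C<best_category>
is the accessor shown there:

    my $result = $classifier->classify({
        sheep   => 2,   # feature => weight pairs
        farming => 1,
    });

    my $best = $result->best_category;   # label with the highest score
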
161             =head1 ATTRIBUTES
162              
163             =over 4
164              
165             =item model
166              
167             Internal
168              
169             =back
170              
171             =head1 THEORY
172              
173             Bayes' Theorem is a way of inverting a conditional probability. It
174             states:
175              
176                       P(y|x) P(x)
177             P(x|y) = -------------
178                           P(y)
179              
180             The notation C<P(x|y)> means "the probability of C<x> given C<y>." See also
181             L<"http://mathforum.org/dr.math/problems/battisfore.03.22.99.html">
182             for a simple but complete example of Bayes' Theorem.
183              
184             In this case, we want to know the probability of a given category given a
185             certain string of words in a document, so we have:
186              
187                              P(words | cat) P(cat)
188             P(cat | words) = --------------------
189                                    P(words)
190              
191             We have applied Bayes' Theorem because C<P(cat | words)> is a difficult
192             quantity to compute directly, but C<P(words | cat)> and C<P(cat)> are accessible
193             (see below).
194              
195             The greater the expression above, the greater the probability that the given
196             document belongs to the given category. So we want to find the maximum
197             value. We write this as
198              
199                                     P(words | cat) P(cat)
200             Best category = ArgMax -----------------------
201                              cat in cats      P(words)
202              
203             Since C<P(words)> doesn't change over the range of categories, we can get rid
204             of it. That's good, because we didn't want to have to compute these values
205             anyway. So our new formula is:
206              
207             Best category = ArgMax P(words | cat) P(cat)
208                             cat in cats
209              
210             Finally, we note that if C<w1, w2, ... wn> are the words in the document,
211             then this expression is equivalent to:
212              
213             Best category = ArgMax P(w1|cat)*P(w2|cat)*...*P(wn|cat)*P(cat)
214                             cat in cats
215              
216             That's the formula I use in my document categorization code. The last
217             step is the only non-rigorous one in the derivation, and this is the
218             "naive" part of the Naive Bayes technique. It assumes that the
219             probability of each word appearing in a document is unaffected by the
220             presence or absence of each other word in the document. We assume
221             this even though we know this isn't true: for example, the word
222             "iodized" is far more likely to appear in a document that contains the
223             word "salt" than it is to appear in a document that contains the word
224             "subroutine". Luckily, as it turns out, making this assumption even
225             when it isn't true may have little effect on our results, as the
226             following paper by Pedro Domingos argues:
227             L<"http://www.cs.washington.edu/homes/pedrod/mlj97.ps.gz">
228              
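In practice, multiplying many small probabilities underflows floating point,
which is why the C<classify> code above sums logarithms instead of multiplying
raw probabilities. A minimal standalone sketch of the same ArgMax done in log
space, using made-up probabilities rather than this module's internals:

    use List::Util qw(reduce);

    # log P(cat) and log P(word|cat) for two made-up categories
    my %log_prior = ( farming => log(0.5), vampire => log(0.5) );
    my %log_cond  = (
        farming => { sheep => log(0.20), mirrors => log(0.01) },
        vampire => { sheep => log(0.02), mirrors => log(0.15) },
    );

    my @words = qw(sheep sheep mirrors);

    my %score;
    for my $cat (keys %log_prior) {
        # log P(cat) + sum of log P(w|cat)  ==  log( P(cat) * prod P(w|cat) )
        $score{$cat} = $log_prior{$cat};
        $score{$cat} += $log_cond{$cat}{$_} for @words;
    }

    # Pick the category with the highest log score (the ArgMax)
    my $best = reduce { $score{$a} > $score{$b} ? $a : $b } keys %score;
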
229             =head1 SEE ALSO
230              
231             Algorithm::NaiveBayes(3), AI::Classifier::Text(3)
232              
233             =head1 BASED ON
234              
235             Much of the code and description is from L<Algorithm::NaiveBayes>.
236              
237             =head1 AUTHORS
238              
239             =over 4
240              
241             =item *
242              
243             Zbigniew Lukasiak <zlukasiak@opera.com>
244              
245             =item *
246              
247             Tadeusz Sośnierz <tsosnierz@opera.com>
248              
249             =item *
250              
251             Ken Williams <ken@mathforum.org>
252              
253             =back
254              
255             =head1 COPYRIGHT AND LICENSE
256              
257             This software is copyright (c) 2012 by Opera Software ASA.
258              
259             This is free software; you can redistribute it and/or modify it under
260             the same terms as the Perl 5 programming language system itself.
261              
262             =cut
263              
264             __END__
265              
266              
267             # ABSTRACT: A Bayesian classifier
268