File Coverage

blib/lib/AI/NaiveBayes.pm

Criterion	Covered	Total	%
statement	52	52	100.0
branch	3	4	75.0
condition	2	3	66.6
subroutine	11	11	100.0
pod	3	3	100.0
total	71	73	97.2

line	stmt	bran	cond	sub	pod	time	code
1							package AI::NaiveBayes;
2							$AI::NaiveBayes::VERSION = '0.04';
3	3			3		27750	use strict;
	3					4
	3					78
4	3			3		9	use warnings;
	3					4
	3					75
5	3			3		60	use 5.010;
	3					7
6	3			3		1103	use AI::NaiveBayes::Classification;
	3					9
	3					195
7	3			3		1127	use AI::NaiveBayes::Learner;
	3					7
	3					108
8	3			3		19	use Moose;
	3					5
	3					18
9	3			3		16475	use MooseX::Storage;
	3					92834
	3					19
10
11	3			3		711	use List::Util qw(max);
	3					4
	3					1357
12
13							with Storage(format => 'Storable', io => 'File');
14
15							has model => (is => 'ro', isa => 'HashRef[HashRef]', required => 1);
16
17							sub train {
18	1			1	1	537	my $self = shift;
19	1					38	my $learner = AI::NaiveBayes::Learner->new();
20	1					3	for my $example ( @_ ){
21	2					7	$learner->add_example( %$example );
22							}
23	1					4	return $learner->classifier;
24							}
25
26
27							sub classify {
28	4			4	1	1927	my ($self, $newattrs) = @_;
29	4	50				8	$newattrs or die "Missing parameter for classify()";
30
31	4					104	my $m = $self->model;
32
33							# Note that we're using the log(prob) here. That's why we add instead of multiply.
34
35	4					4	my %scores = %{$m->{prior_probs}};
	4					12
36	4					4	my %features;
37	4					12	while (my ($feature, $value) = each %$newattrs) {
38	30	100				62	next unless exists $m->{attributes}{$feature}; # Ignore totally unseen features
39	9					6	while (my ($label, $attributes) = each %{$m->{probs}}) {
	27					52
40	18		66			33	my $score = ($attributes->{$feature} \|\| $m->{smoother}{$label})$value; # P($feature\|$label)*$value
41	18					14	$scores{$label} += $score;
42	18					26	$features{$feature}{$label} = $score;
43							}
44							}
45
46	4					6	rescale(\%scores);
47
48	4					139	return AI::NaiveBayes::Classification->new( label_sums => \%scores, features => \%features );
49							}
50
51							sub rescale {
52	4			4	1	4	my ($scores) = @_;
53
54							# Scale everything back to a reasonable area in logspace (near zero), un-loggify, and normalize
55	4					3	my $total = 0;
56	4					10	my $max = max(values %$scores);
57	4					7	foreach (values %$scores) {
58	8					18	$_ = exp($_ - $max);
59	8					17	$total += $_**2;
60							}
61	4					5	$total = sqrt($total);
62	4					5	foreach (values %$scores) {
63	8					9	$_ /= $total;
64							}
65							}
66
67
68							__PACKAGE__->meta->make_immutable;
69
70							1;
71
72							=pod
73
74							=encoding UTF-8
75
76							=head1 NAME
77
78							AI::NaiveBayes - A Bayesian classifier
79
80							=head1 VERSION
81
82							version 0.04
83
84							=head1 SYNOPSIS
85
86							# AI::NaiveBayes objects are created by AI::NaiveBayes::Learner
87							# but for quick start you can use the 'train' class method
88							# that is a shortcut using default AI::NaiveBayes::Learner settings
89
90							my $classifier = AI::NaiveBayes->train(
91							{
92							attributes => {
93							sheep => 1, very => 1, valuable => 1, farming => 1
94							},
95							labels => ['farming']
96							},
97							{
98							attributes => {
99							vampires => 1, cannot => 1, see => 1, their => 1,
100							images => 1, mirrors => 1
101							},
102							labels => ['vampire']
103							},
104							);
105
106							# Classify a feature vector
107							my $result = $classifier->classify({bar => 3, blurp => 2});
108
109							# $result is now a AI::NaiveBayes::Classification object
110
111							my $best_category = $result->best_category;
112
113							=head1 DESCRIPTION
114
115							This module implements the classic "Naive Bayes" machine learning
116							algorithm. This is a low level class that accepts only pre-computed feature-vectors
117							as input, see L<AI::Classifier::Text> for a text classifier that uses
118							this class.
119
120							Creation of C<AI::NaiveBayes> classifier object out of training
121							data is done by L<AI::NaiveBayes::Learner>. For quick start
122							you can use the limited C<train> class method that trains the
123							classifier in a default way.
124
125							The classifier object is immutable.
126
127							It is a well-studied probabilistic algorithm often used in
128							automatic text categorization. Compared to other algorithms (kNN,
129							SVM, Decision Trees), it's pretty fast and reasonably competitive in
130							the quality of its results.
131
132							A paper by Fabrizio Sebastiani provides a really good introduction to
133							text categorization:
134							L<http://faure.iei.pi.cnr.it/~fabrizio/Publications/ACMCS02.pdf>
135
136							=head1 METHODS
137
138							=over 4
139
140							=item new( model => $model )
141
142							Internal. See L<AI::NaiveBayes::Learner> to learn how to create a C<AI::NaiveBayes>
143							classifier from training data.
144
145							=item train( LIST of HASHREFS )
146
147							Shortcut for creating a trained classifier using L<AI::NaiveBayes::Learner> default
148							settings.
149							Arguments are passed to the C<add_example> method of the L<AI::NaiveBayes::Learner>
150							object one by one.
151
152							=item classify( HASHREF )
153
154							Classifies a feature-vector of the form:
155
156							{ feature1 => weight1, feature2 => weight2, ... }
157
158							The result is a C<AI::NaiveBayes::Classification> object.
159
160							=item rescale
161
162							Internal
163
164							=back
165
166							=head1 ATTRIBUTES
167
168							=over 4
169
170							=item model
171
172							Internal
173
174							=back
175
176							=head1 THEORY
177
178							Bayes' Theorem is a way of inverting a conditional probability. It
179							states:
180
181							P(y\|x) P(x)
182							P(x\|y) = -------------
183							P(y)
184
185							The notation C<P(x\|y)> means "the probability of C<x> given C<y>." See also
186							L<"http://mathforum.org/dr.math/problems/battisfore.03.22.99.html">
187							for a simple but complete example of Bayes' Theorem.
188
189							In this case, we want to know the probability of a given category given a
190							certain string of words in a document, so we have:
191
192							P(words \| cat) P(cat)
193							P(cat \| words) = --------------------
194							P(words)
195
196							We have applied Bayes' Theorem because C<P(cat \| words)> is a difficult
197							quantity to compute directly, but C<P(words \| cat)> and C<P(cat)> are accessible
198							(see below).
199
200							The greater the expression above, the greater the probability that the given
201							document belongs to the given category. So we want to find the maximum
202							value. We write this as
203
204							P(words \| cat) P(cat)
205							Best category = ArgMax -----------------------
206							cat in cats P(words)
207
208							Since C<P(words)> doesn't change over the range of categories, we can get rid
209							of it. That's good, because we didn't want to have to compute these values
210							anyway. So our new formula is:
211
212							Best category = ArgMax P(words \| cat) P(cat)
213							cat in cats
214
215							Finally, we note that if C<w1, w2, ... wn> are the words in the document,
216							then this expression is equivalent to:
217
218							Best category = ArgMax P(w1\|cat)P(w2\|cat)...P(wn\|cat)P(cat)
219							cat in cats
220
221							That's the formula I use in my document categorization code. The last
222							step is the only non-rigorous one in the derivation, and this is the
223							"naive" part of the Naive Bayes technique. It assumes that the
224							probability of each word appearing in a document is unaffected by the
225							presence or absence of each other word in the document. We assume
226							this even though we know this isn't true: for example, the word
227							"iodized" is far more likely to appear in a document that contains the
228							word "salt" than it is to appear in a document that contains the word
229							"subroutine". Luckily, as it turns out, making this assumption even
230							when it isn't true may have little effect on our results, as the
231							following paper by Pedro Domingos argues:
232							L<"http://www.cs.washington.edu/homes/pedrod/mlj97.ps.gz">
233
234							=head1 SEE ALSO
235
236							Algorithm::NaiveBayes (3), AI::Classifier::Text(3)
237
238							=head1 BASED ON
239
240							Much of the code and description is from L<Algorithm::NaiveBayes>.
241
242							=head1 AUTHORS
243
244							=over 4
245
246							=item *
247
248							Zbigniew Lukasiak <zlukasiak@opera.com>
249
250							=item *
251
252							Tadeusz SoÅ›nierz <tsosnierz@opera.com>
253
254							=item *
255
256							Ken Williams <ken@mathforum.org>
257
258							=back
259
260							=head1 COPYRIGHT AND LICENSE
261
262							This software is copyright (c) 2012 by Opera Software ASA.
263
264							This is free software; you can redistribute it and/or modify it under
265							the same terms as the Perl 5 programming language system itself.
266
267							=cut
268
269							__END__
270
271
272							# ABSTRACT: A Bayesian classifier
273