line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package AI::NaiveBayes::Learner; |
2
|
|
|
|
|
|
|
$AI::NaiveBayes::Learner::VERSION = '0.02'; |
3
|
1
|
|
|
1
|
|
19573
|
use strict; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
57
|
|
4
|
1
|
|
|
1
|
|
6
|
use warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
28
|
|
5
|
1
|
|
|
1
|
|
22
|
use 5.010; |
|
1
|
|
|
|
|
7
|
|
|
1
|
|
|
|
|
40
|
|
6
|
|
|
|
|
|
|
|
7
|
1
|
|
|
1
|
|
13
|
use List::Util qw( min sum ); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
140
|
|
8
|
1
|
|
|
1
|
|
1534
|
use Moose; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
use AI::NaiveBayes; |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
# Per-attribute total weights accumulated across all training examples.
has attributes => (is => 'ro', isa => 'HashRef', default => sub { {} }, clearer => '_clear_attrs');

# Per-label statistics: { $label => { count => N, attributes => { attr => weight } } }.
has labels => (is => 'ro', isa => 'HashRef', default => sub { {} }, clearer => '_clear_labels');

# Number of training examples seen so far (incremented by add_example).
has examples => (is => 'ro', isa => 'Int', default => 0, clearer => '_clear_examples');

# Optional limit on how many features classifier() keeps per label.
# The 'limit_features' predicate is true only when this was supplied.
# Values >= 1 are an absolute count; values < 1 are a fraction (see POD).
has features_kept => (is => 'ro', predicate => 'limit_features');

# Class that classifier() instantiates with the trained model.
has classifier_class => ( is => 'ro', isa => 'Str', default => 'AI::NaiveBayes' );
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
# Record one training example: fold its attribute weights into the global
# tally and into the per-label tallies for every label it carries.
# Expects key/value parameters: attributes => HASHREF, labels => ARRAYREF.
# Dies if either required parameter is missing.
sub add_example {
    my ( $self, %args ) = @_;

    foreach my $required ( 'attributes', 'labels' ) {
        exists $args{$required}
            or die "Missing required '$required' parameter";
    }

    # NOTE(review): pokes the object hash directly, bypassing the Moose
    # accessor for the read-only 'examples' attribute — kept as-is to
    # preserve behavior.
    $self->{examples}++;

    my $attrs = $args{attributes};

    # Fold this example's feature weights into the global attribute tally.
    add_hash( $self->attributes(), $attrs );

    my $label_stats = $self->labels;
    for my $label ( @{ $args{labels} } ) {
        $label_stats->{$label}{count}++;
        $label_stats->{$label}{attributes} //= {};
        add_hash( $label_stats->{$label}{attributes}, $attrs );
    }
}
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
# Build and return a classifier (classifier_class instance, AI::NaiveBayes
# by default) from the examples accumulated so far. Computes log prior
# probabilities per label and add-one-smoothed log conditional
# probabilities per (label, attribute) pair; optionally prunes weak
# features when features_kept was supplied.
sub classifier {
    my $self = shift;

    my $examples   = $self->examples;
    my $labels     = $self->labels;
    my $vocab_size = keys %{ $self->attributes };

    my $model;
    $model->{attributes} = $self->attributes;

    # Calculate the log-probabilities for each category
    foreach my $label (keys %$labels) {
        # log P(label) — prior from relative label frequency.
        $model->{prior_probs}{$label} = log($labels->{$label}{count} / $examples);

        # Count the number of tokens in this cat. sum() returns undef for
        # an empty list, so default to 0 to avoid an uninitialized-value
        # warning for a label with no attributes.
        my $label_tokens = sum( values %{ $labels->{$label}{attributes} } ) // 0;

        # P(attr|label) = $count/$label_tokens (simple)
        # P(attr|label) = ($count + 1)/($label_tokens + $vocab_size) (with smoothing)
        # log P(attr|label) = log($count + 1) - log($label_tokens + $vocab_size)
        my $denominator = log($label_tokens + $vocab_size);

        # Smoothing term so P(word|cat)==0 can be avoided: the log
        # probability assigned to attributes never seen under this label
        # (count == 0). Same quantity as the denominator, negated.
        $model->{smoother}{$label} = -$denominator;

        while (my ($attribute, $count) = each %{ $labels->{$label}{attributes} }) {
            $model->{probs}{$label}{$attribute} = log($count + 1) - $denominator;
        }

        if ($self->limit_features) {
            # Keep only the strongest features. Log-probs are negative, so
            # a smaller absolute value means a higher conditional
            # probability; sorting ascending by abs() puts the best first.
            my %old = %{ $model->{probs}{$label} };
            my @features = sort { abs($old{$a}) <=> abs($old{$b}) } keys(%old);

            my $limit = min($self->features_kept, 0 + @features);
            if ($limit < 1) {
                # Fractional features_kept means "keep this fraction of
                # the features" (e.g. 0.2 keeps the top 20%).
                $limit = int($limit * keys(%old));
            }
            my @top  = @features[0 .. $limit - 1];
            my %kept = map { $_ => $old{$_} } @top;
            $model->{probs}{$label} = \%kept;
        }
    }

    my $classifier_class = $self->classifier_class;
    return $classifier_class->new( model => $model );
}
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
# Accumulate the values of %$second into %$first, keyed by name.
# Keys missing from %$first are treated as starting at zero.
sub add_hash {
    my ( $target, $source ) = @_;

    # NOTE(review): this guards only the local copy — if the caller passed
    # undef, the freshly created hash is discarded on return. Kept for
    # behavioral parity; all in-module callers pass a real hashref.
    $target //= {};

    while ( my ( $key, $value ) = each %$source ) {
        $target->{$key} //= 0;
        $target->{$key} += $value;
    }
}
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
__PACKAGE__->meta->make_immutable; |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
1; |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
=pod |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
=encoding UTF-8 |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
=head1 NAME |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
AI::NaiveBayes::Learner - Build AI::NaiveBayes classifier from a set of training examples. |
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
=head1 VERSION |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
version 0.02 |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
=head1 SYNOPSIS |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
my $learner = AI::NaiveBayes::Learner->new(features_kept => 0.5); |
114
|
|
|
|
|
|
|
$learner->add_example( |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
attributes => { sheep => 1, very => 1, valuable => 1, farming => 1 }, |
117
|
|
|
|
|
|
|
labels => ['farming'] |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
); |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
my $classifier = $learner->classifier; |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
=head1 DESCRIPTION |
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
This is a trainer of AI::NaiveBayes classifiers. It saves information passed |
126
|
|
|
|
|
|
|
by the C<add_example> method from |
127
|
|
|
|
|
|
|
training data into internal structures and then constructs a classifier when |
128
|
|
|
|
|
|
|
the C<classifier> method is called. |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
=head1 ATTRIBUTES |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
=over 4 |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
=item C<features_kept> |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
Indicates how many features should remain after calculating probabilities. By |
137
|
|
|
|
|
|
|
default all of them will be kept. For C<features_kept> > 1, C<features_kept> of |
138
|
|
|
|
|
|
|
features will be preserved. For values lower than 1, a specified fraction of |
139
|
|
|
|
|
|
|
features will be kept (e.g. top 20% of features for C<features_kept> = 0.2). |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
The rest of the attributes is for class' internal usage, and thus not |
142
|
|
|
|
|
|
|
documented. |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
=item C<classifier_class> |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
The class of the classifier to be created. By default it is |
147
|
|
|
|
|
|
|
C<AI::NaiveBayes> |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
=back |
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
=head1 METHODS |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
=over 4 |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
=item C<< add_example( attributes => HASHREF, labels => ARRAYREF ) >> |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
Saves the information from a training example into internal data structures. |
158
|
|
|
|
|
|
|
The C<attributes> parameter should be a hash reference of the form |
159
|
|
|
|
|
|
|
{ feature1 => weight1, feature2 => weight2, ... } |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
=item C<classifier()> |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
Creates an AI::NaiveBayes classifier based on the data accumulated before. |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
=back |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
=head1 UTILITY SUBS |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
=over 4 |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
=item C<add_hash> |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
=back |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
=head1 BASED ON |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
Much of the code and description is from L<Algorithm::NaiveBayes>. |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
=head1 AUTHORS |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
=over 4 |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
=item * |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
Zbigniew Lukasiak <zlukasiak@opera.com> |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
=item * |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
Tadeusz Sośnierz <tsosnierz@opera.com> |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
=item * |
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
Ken Williams <ken@mathforum.org> |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
=back |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
This software is copyright (c) 2012 by Opera Software ASA. |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
This is free software; you can redistribute it and/or modify it under |
202
|
|
|
|
|
|
|
the same terms as the Perl 5 programming language system itself. |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
=cut |
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
__END__ |
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
# ABSTRACT: Build AI::NaiveBayes classifier from a set of training examples. |
209
|
|
|
|
|
|
|
|