package AI::NaiveBayes::Learner;
$AI::NaiveBayes::Learner::VERSION = '0.04';

use strict;
use warnings;
use 5.010;

use List::Util qw( min sum );
use Moose;

use AI::NaiveBayes;

has attributes => (is => 'ro', isa => 'HashRef', default => sub { {} }, clearer => '_clear_attrs');
has labels     => (is => 'ro', isa => 'HashRef', default => sub { {} }, clearer => '_clear_labels');
has examples   => (is => 'ro', isa => 'Int',     default => 0,          clearer => '_clear_examples');

has features_kept    => (is => 'ro', predicate => 'limit_features');
has classifier_class => (is => 'ro', isa => 'Str', default => 'AI::NaiveBayes');

sub add_example {
    my ($self, %params) = @_;
    for ('attributes', 'labels') {
        die "Missing required '$_' parameter" unless exists $params{$_};
    }

    # Bumps the count by poking the underlying hash directly, since the
    # Moose 'examples' attribute is declared read-only
    $self->{examples}++;

    my $attributes = $params{attributes};
    my $labels     = $params{labels};

    add_hash($self->attributes(), $attributes);

    my $our_labels = $self->labels;
    foreach my $label ( @$labels ) {
        $our_labels->{$label}{count}++;
        $our_labels->{$label}{attributes} //= {};
        add_hash($our_labels->{$label}{attributes}, $attributes);
    }
}

sub classifier {
    my $self = shift;

    my $examples   = $self->examples;
    my $labels     = $self->labels;
    my $vocab_size = keys %{ $self->attributes };

    my $model;
    $model->{attributes} = $self->attributes;

    # Calculate the log-probabilities for each category
    foreach my $label (keys %$labels) {
        $model->{prior_probs}{$label} = log($labels->{$label}{count} / $examples);
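        # e.g. (hypothetical numbers) 3 of 12 examples carrying this label
        # would give a prior of log(3/12) ≈ -1.39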

        # Count the number of tokens in this category
        my $label_tokens = sum( values %{ $labels->{$label}{attributes} } );

        # Compute a smoothing term so that P(word|cat) == 0 can be avoided
        $model->{smoother}{$label} = -log($label_tokens + $vocab_size);

        # P(attr|label)     = $count/$label_tokens                         (simple)
        # P(attr|label)     = ($count + 1)/($label_tokens + $vocab_size)   (with add-one smoothing)
        # log P(attr|label) = log($count + 1) - log($label_tokens + $vocab_size)
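        # e.g. (hypothetical numbers) $count = 3, $label_tokens = 10 and
        # $vocab_size = 5 would give log(4) - log(15) ≈ -1.32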
        my $denominator = log($label_tokens + $vocab_size);

        while (my ($attribute, $count) = each %{ $labels->{$label}{attributes} }) {
            $model->{probs}{$label}{$attribute} = log($count + 1) - $denominator;
        }

        if ($self->limit_features) {
            # Keep the features whose log-probabilities are closest to
            # zero, i.e. the most probable features for this label
            my %old      = %{ $model->{probs}{$label} };
            my @features = sort { abs($old{$a}) <=> abs($old{$b}) } keys %old;
            my $limit    = min($self->features_kept, 0+@features);
            if ($limit < 1) {
                # A fractional features_kept means a fraction of all features
                $limit = int($limit * keys %old);
            }
            my @top  = @features[0 .. $limit-1];
            my %kept = map { $_ => $old{$_} } @top;
            $model->{probs}{$label} = \%kept;
        }
    }

    my $classifier_class = $self->classifier_class;
    return $classifier_class->new( model => $model );
}

sub add_hash {
    my ($first, $second) = @_;
    $first //= {};
    # Sum the values of $second into $first, key by key
    foreach my $k (keys %$second) {
        $first->{$k} //= 0;
        $first->{$k} += $second->{$k};
    }
}

__PACKAGE__->meta->make_immutable;

1;

=pod

=encoding UTF-8

=head1 NAME

AI::NaiveBayes::Learner - Build AI::NaiveBayes classifier from a set of training examples.

=head1 VERSION

version 0.04

=head1 SYNOPSIS

    my $learner = AI::NaiveBayes::Learner->new(features_kept => 0.5);
    $learner->add_example(
        attributes => { sheep => 1, very => 1, valuable => 1, farming => 1 },
        labels     => ['farming'],
    );

    my $classifier = $learner->classifier;

=head1 DESCRIPTION

This is a trainer of AI::NaiveBayes classifiers. It accumulates the
training data passed to the C<add_example> method in internal
structures, and constructs a classifier when the C<classifier> method
is called.
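
A minimal end-to-end sketch (the feature names are illustrative, and the
C<classify> call assumes the default L<AI::NaiveBayes> classifier class):

    my $learner = AI::NaiveBayes::Learner->new;
    $learner->add_example(
        attributes => { sheep => 1, wool => 1 },
        labels     => ['farming'],
    );
    my $classifier = $learner->classifier;
    my $result     = $classifier->classify( { wool => 1 } );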

=head1 ATTRIBUTES

=over 4

=item C<features_kept>

Indicates how many features should remain after calculating
probabilities. By default all of them will be kept. For values greater
than 1, that many features will be preserved. For values lower than 1,
the given fraction of features will be kept (e.g. the top 20% of
features for C<features_kept> = 0.2).
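
Both styles of limit are set through the constructor, for example:

    # keep the 100 most probable features per label
    my $learner = AI::NaiveBayes::Learner->new( features_kept => 100 );

    # or keep the top 20% of features per label
    my $fractional = AI::NaiveBayes::Learner->new( features_kept => 0.2 );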

The remaining attributes are for the class's internal use and are
therefore not documented.

=item C<classifier_class>

The class of the classifier to be created. By default it is
C<AI::NaiveBayes>.
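
C<classifier> constructs the object with
C<< $classifier_class->new( model => $model ) >>, so any class whose
constructor accepts a C<model> parameter can be substituted; the class
name below is hypothetical:

    my $learner = AI::NaiveBayes::Learner->new(
        classifier_class => 'My::NaiveBayes',
    );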

=back

=head1 METHODS

=over 4

=item C<< add_example( attributes => HASHREF, labels => ARRAYREF ) >>

Saves the information from a training example into internal data structures.
C<attributes> should be a hash reference of the form

    { feature1 => weight1, feature2 => weight2, ... }

C<labels> should be an array reference of strings denoting one or more
classes to which the example belongs.
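
A single example may carry several labels at once; for instance (feature
names are illustrative):

    $learner->add_example(
        attributes => { wool => 2, export => 1 },
        labels     => [ 'farming', 'economy' ],
    );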

=item C<classifier()>

Creates an AI::NaiveBayes classifier based on the examples accumulated
so far.
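
The learner keeps its accumulated counts after this call, so further
examples can be added and C<classifier> called again to build a fresh
model. A sketch:

    my $first = $learner->classifier;
    $learner->add_example( attributes => { hay => 1 }, labels => ['farming'] );
    my $second = $learner->classifier;   # rebuilt from all examples so far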

=back

=head1 UTILITY SUBS

=over 4

=item C<add_hash>
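
Adds the values of the second hash reference into the first, key by key;
it is used internally to accumulate feature counts. A minimal
illustration:

    my %totals = ( sheep => 1 );
    add_hash( \%totals, { sheep => 2, wool => 1 } );
    # %totals is now ( sheep => 3, wool => 1 )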
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
=back |
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
=head1 BASED ON |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
Much of the code and description is from L<Algorithm::NaiveBayes>. |
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
=head1 AUTHORS |
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
=over 4 |
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
=item * |
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
Zbigniew Lukasiak <zlukasiak@opera.com> |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
=item * |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
Tadeusz SoÅnierz <tsosnierz@opera.com> |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
=item * |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
Ken Williams <ken@mathforum.org> |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
=back |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
This software is copyright (c) 2012 by Opera Software ASA. |
199
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
This is free software; you can redistribute it and/or modify it under |
201
|
|
|
|
|
|
|
the same terms as the Perl 5 programming language system itself. |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
=cut |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
__END__ |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
# ABSTRACT: Build AI::NaiveBayes classifier from a set of training examples. |
208
|
|
|
|
|
|
|
|