File Coverage

blib/lib/AI/NaiveBayes/Learner.pm
Criterion Covered Total %
statement 64 64 100.0
branch 5 6 83.3
condition 5 6 83.3
subroutine 9 9 100.0
pod 3 3 100.0
total 86 88 97.7


line stmt bran cond sub pod time code
package AI::NaiveBayes::Learner;
$AI::NaiveBayes::Learner::VERSION = '0.04';

use strict;
use warnings;
use 5.010;

use List::Util qw( min sum );
use Moose;
use AI::NaiveBayes;

# Accumulated training state: global feature-weight tallies, per-label
# statistics, and the number of examples seen so far.
has attributes => (is => 'ro', isa => 'HashRef', default => sub { {} }, clearer => '_clear_attrs');
has labels     => (is => 'ro', isa => 'HashRef', default => sub { {} }, clearer => '_clear_labels');
has examples   => (is => 'ro', isa => 'Int',     default => 0,          clearer => '_clear_examples');

# When set, classifier() prunes each label's features down to the strongest
# ones; the predicate tells us whether pruning was requested at all.
has features_kept => (is => 'ro', predicate => 'limit_features');

# Class instantiated by classifier(); must accept a 'model' constructor arg.
has classifier_class => (is => 'ro', isa => 'Str', default => 'AI::NaiveBayes');
18              
# Fold one training example (feature weights + its labels) into the
# learner's accumulated counts. Dies if a required parameter is absent.
sub add_example {
    my ($self, %params) = @_;

    # Both keys are mandatory; fail loudly on a malformed call.
    for ('attributes', 'labels') {
        die "Missing required '$_' parameter" unless exists $params{$_};
    }

    # 'examples' is a read-only Moose attribute, so bump the hash slot directly.
    $self->{examples}++;

    my $attributes = $params{attributes};
    my $labels     = $params{labels};

    # Merge this example's feature weights into the global tally.
    add_hash( $self->attributes, $attributes );

    my $label_stats = $self->labels;
    for my $label (@$labels) {
        $label_stats->{$label}{count}++;
        $label_stats->{$label}{attributes} //= {};
        add_hash( $label_stats->{$label}{attributes}, $attributes );
    }
    return;
}
39              
# Build a classifier object from the accumulated counts. Probabilities are
# stored in log space with add-one (Laplace) smoothing:
#   log P(attr|label) = log(count + 1) - log(label_tokens + vocab_size)
sub classifier {
    my $self = shift;

    my $example_count = $self->examples;
    my $labels        = $self->labels;
    my $vocab_size    = scalar keys %{ $self->attributes };

    my $model = { attributes => $self->attributes };

    # Turn raw counts into log-space probabilities, one label at a time.
    for my $label (keys %$labels) {
        # Prior: relative frequency of the label among all examples.
        $model->{prior_probs}{$label} =
            log( $labels->{$label}{count} / $example_count );

        # Total token mass observed under this label.
        my $label_tokens = sum( values %{ $labels->{$label}{attributes} } );

        # Shared denominator; the smoother handles unseen attributes
        # (count == 0), avoiding P(word|label) == 0.
        my $denominator = log( $label_tokens + $vocab_size );
        $model->{smoother}{$label} = -$denominator;

        while ( my ($attribute, $count) = each %{ $labels->{$label}{attributes} } ) {
            $model->{probs}{$label}{$attribute} = log( $count + 1 ) - $denominator;
        }

        # Optionally keep only the strongest features for this label.
        if ( $self->limit_features ) {
            my %all_probs = %{ $model->{probs}{$label} };
            # Log-probs are negative; smaller magnitude means more probable,
            # so ascending |log p| ranks features strongest-first.
            my @ranked = sort { abs($all_probs{$a}) <=> abs($all_probs{$b}) }
                         keys %all_probs;
            my $limit = min( $self->features_kept, scalar @ranked );
            if ( $limit < 1 ) {
                # Fractional features_kept means "keep this share of features".
                $limit = int( $limit * keys %all_probs );
            }
            my @kept = @ranked[ 0 .. $limit - 1 ];
            $model->{probs}{$label} = { map { $_ => $all_probs{$_} } @kept };
        }
    }

    return $self->classifier_class->new( model => $model );
}
85              
# Add every value in %$source into the matching key of %$target, treating
# missing target keys as zero. Mutates %$target in place.
sub add_hash {
    my ($target, $source) = @_;
    # NOTE(review): if the caller passed undef, this fresh hash is silently
    # discarded (the lexical copy doesn't propagate back) — confirm intended.
    $target //= {};
    while ( my ($key, $value) = each %$source ) {
        $target->{$key} = ( $target->{$key} // 0 ) + $value;
    }
    return;
}
94              
95             __PACKAGE__->meta->make_immutable;
96              
97             1;
98              
99             =pod
100              
101             =encoding UTF-8
102              
103             =head1 NAME
104              
105             AI::NaiveBayes::Learner - Build AI::NaiveBayes classifier from a set of training examples.
106              
107             =head1 VERSION
108              
109             version 0.04
110              
111             =head1 SYNOPSIS
112              
113             my $learner = AI::NaiveBayes::Learner->new(features_kept => 0.5);
114             $learner->add_example(
115             attributes => { sheep => 1, very => 1, valuable => 1, farming => 1 },
116             labels => ['farming']
117             );
118              
119             my $classifier = $learner->classifier;
120              
121             =head1 DESCRIPTION
122              
123             This is a trainer of AI::NaiveBayes classifiers. It saves information passed
124             by the C<add_example> method from
125             training data into internal structures and then constructs a classifier when
126             the C<classifier> method is called.
127              
128             =head1 ATTRIBUTES
129              
130             =over 4
131              
132             =item C<features_kept>
133              
134             Indicates how many features should remain after calculating probabilities. By
135             default all of them will be kept. For C<features_kept> >= 1, that number of
136             features will be preserved. For values lower than 1, a specified fraction of
137             features will be kept (e.g. top 20% of features for C<features_kept> = 0.2).
138              
139             The rest of the attributes are for the class's internal use, and thus not
140             documented.
141              
142             =item C<classifier_class>
143              
144             The class of the classifier to be created. By default it is
145             C<AI::NaiveBayes>
146              
147             =back
148              
149             =head1 METHODS
150              
151             =over 4
152              
153             =item C<add_example( attributes => HASHREF, labels => LIST )>
154              
155             Saves the information from a training example into internal data structures.
156             C<attributes> should be of the form of
157             { feature1 => weight1, feature2 => weight2, ... }
158             C<labels> should be a list of strings denoting one or more classes to which the example belongs.
159              
160             =item C<classifier()>
161              
162             Creates an AI::NaiveBayes classifier based on the data accumulated before.
163              
164             =back
165              
166             =head1 UTILITY SUBS
167              
168             =over 4
169              
170             =item C<add_hash>
171              
172             =back
173              
174             =head1 BASED ON
175              
176             Much of the code and description is from L<Algorithm::NaiveBayes>.
177              
178             =head1 AUTHORS
179              
180             =over 4
181              
182             =item *
183              
184             Zbigniew Lukasiak <zlukasiak@opera.com>
185              
186             =item *
187              
188             Tadeusz Sośnierz <tsosnierz@opera.com>
189              
190             =item *
191              
192             Ken Williams <ken@mathforum.org>
193              
194             =back
195              
196             =head1 COPYRIGHT AND LICENSE
197              
198             This software is copyright (c) 2012 by Opera Software ASA.
199              
200             This is free software; you can redistribute it and/or modify it under
201             the same terms as the Perl 5 programming language system itself.
202              
203             =cut
204              
205             __END__
206              
207             # ABSTRACT: Build AI::NaiveBayes classifier from a set of training examples.
208