File Coverage

blib/lib/AI/NaiveBayes/Learner.pm
Criterion Covered Total %
statement 64 64 100.0
branch 5 6 83.3
condition 5 6 83.3
subroutine 9 9 100.0
pod 3 3 100.0
total 86 88 97.7


line stmt bran cond sub pod time code
1             package AI::NaiveBayes::Learner;
2             $AI::NaiveBayes::Learner::VERSION = '0.03';
3 3     3   23005 use strict;
  3         7  
  3         90  
4 3     3   15 use warnings;
  3         5  
  3         90  
5 3     3   80 use 5.010;
  3         13  
6              
7 3     3   16 use List::Util qw( min sum );
  3         6  
  3         275  
8 3     3   916 use Moose;
  3         472694  
  3         22  
9 3     3   20255 use AI::NaiveBayes;
  3         7  
  3         2445  
10              
11             has attributes => (is => 'ro', isa => 'HashRef', default => sub { {} }, clearer => '_clear_attrs');
12             has labels => (is => 'ro', isa => 'HashRef', default => sub { {} }, clearer => '_clear_labels');
13             has examples => (is => 'ro', isa => 'Int', default => 0, clearer => '_clear_examples');
14              
15             has features_kept => (is => 'ro', predicate => 'limit_features');
16              
17             has classifier_class => ( is => 'ro', isa => 'Str', default => 'AI::NaiveBayes' );
18              
19             sub add_example {
20 20     20 1 2333 my ($self, %params) = @_;
21 20         41 for ('attributes', 'labels') {
22 40 50       113 die "Missing required '$_' parameter" unless exists $params{$_};
23             }
24              
25 20         30 $self->{examples}++;
26              
27 20         31 my $attributes = $params{attributes};
28 20         27 my $labels = $params{labels};
29              
30 20         716 add_hash($self->attributes(), $attributes);
31              
32 20         744 my $our_labels = $self->labels;
33 20         39 foreach my $label ( @$labels ) {
34 20         45 $our_labels->{$label}{count}++;
35 20   100     77 $our_labels->{$label}{attributes} //= {};
36 20         50 add_hash($our_labels->{$label}{attributes}, $attributes);
37             }
38             }
39              
40             sub classifier {
41 5     5 1 25 my $self = shift;
42              
43 5         188 my $examples = $self->examples;
44 5         170 my $labels = $self->labels;
45 5         10 my $vocab_size = keys %{ $self->attributes };
  5         170  
46 5         8 my $model;
47 5         171 $model->{attributes} = $self->attributes;
48              
49              
50             # Calculate the log-probabilities for each category
51 5         50 foreach my $label (keys %$labels) {
52 8         48 $model->{prior_probs}{$label} = log($labels->{$label}{count} / $examples);
53              
54             # Count the number of tokens in this cat
55 8         11 my $label_tokens = sum( values %{ $labels->{$label}{attributes} } );
  8         58  
56              
57             # Compute a smoothing term so P(word|cat)==0 can be avoided
58 8         27 $model->{smoother}{$label} = -log($label_tokens + $vocab_size);
59              
60             # P(attr|label) = $count/$label_tokens (simple)
61             # P(attr|label) = ($count + 1)/($label_tokens + $vocab_size) (with smoothing)
62             # log P(attr|label) = log($count + 1) - log($label_tokens + $vocab_size)
63              
64 8         44 my $denominator = log($label_tokens + $vocab_size);
65              
66 8         13 while (my ($attribute, $count) = each %{ $labels->{$label}{attributes} }) {
  67         211  
67 59         154 $model->{probs}{$label}{$attribute} = log($count + 1) - $denominator;
68             }
69              
70 8 100       343 if ($self->limit_features) {
71 2         2 my %old = %{$model->{probs}{$label}};
  2         14  
72 2         43 my @features = sort { abs($old{$a}) <=> abs($old{$b}) } keys(%old);
  31         53  
73 2         76 my $limit = min($self->features_kept, 0+@features);
74 2 100       8 if ($limit < 1) {
75 1         3 $limit = int($limit * keys(%old));
76             }
77 2         10 my @top = @features[0..$limit-1];
78 2         4 my %kept = map { $_ => $old{$_} } @top;
  9         23  
79 2         15 $model->{probs}{$label} = \%kept;
80             }
81             }
82 5         190 my $classifier_class = $self->classifier_class;
83 5         170 return $classifier_class->new( model => $model );
84             }
85              
86             sub add_hash {
87 40     40 1 59 my ($first, $second) = @_;
88 40   50     83 $first //= {};
89 40         107 foreach my $k (keys %$second) {
90 182   100     581 $first->{$k} //= 0;
91 182         343 $first->{$k} += $second->{$k};
92             }
93             }
94              
95             __PACKAGE__->meta->make_immutable;
96              
97             1;
98              
99             =pod
100              
101             =encoding UTF-8
102              
103             =head1 NAME
104              
105             AI::NaiveBayes::Learner - Build AI::NaiveBayes classifier from a set of training examples.
106              
107             =head1 VERSION
108              
109             version 0.03
110              
111             =head1 SYNOPSIS
112              
113             my $learner = AI::NaiveBayes::Learner->new(features_kept => 0.5);
114             $learner->add_example(
115             {
116             attributes => { sheep => 1, very => 1, valuable => 1, farming => 1 },
117             labels => ['farming']
118             },
119             );
120              
121             my $classifier = $learner->classifier;
122              
123             =head1 DESCRIPTION
124              
125             This is a trainer of AI::NaiveBayes classifiers. It saves information passed
126             by the C<add_example> method from
127             training data into internal structures and then constructs a classifier when
128             the C<classifier> method is called.
129              
130             =head1 ATTRIBUTES
131              
132             =over 4
133              
134             =item C<features_kept>
135              
136             Indicates how many features should remain after calculating probabilities. By
137             default all of them will be kept. For C<features_kept> > 1, C<features_kept> of
138             features will be preserved. For values lower than 1, a specified fraction of
139             features will be kept (e.g. top 20% of features for C<features_kept> = 0.2).
140              
141             The rest of the attributes is for class' internal usage, and thus not
142             documented.
143              
144             =item C<classifier_class>
145              
146             The class of the classifier to be created. By default it is
147             C<AI::NaiveBayes>
148              
149             =back
150              
151             =head1 METHODS
152              
153             =over 4
154              
155             =item C<add_example( HASHREF )>
156              
157             Saves the information from a training example into internal data structures.
158             The parameter should be of the form of
159             { feature1 => weight1, feature2 => weight2, ... }
160              
161             =item C<classifier()>
162              
163             Creates an AI::NaiveBayes classifier based on the data accumulated before.
164              
165             =back
166              
167             =head1 UTILITY SUBS
168              
169             =over 4
170              
171             =item C<add_hash>
172              
173             =back
174              
175             =head1 BASED ON
176              
177             Much of the code and description is from L<Algorithm::NaiveBayes>.
178              
179             =head1 AUTHORS
180              
181             =over 4
182              
183             =item *
184              
185             Zbigniew Lukasiak <zlukasiak@opera.com>
186              
187             =item *
188              
189             Tadeusz Sośnierz <tsosnierz@opera.com>
190              
191             =item *
192              
193             Ken Williams <ken@mathforum.org>
194              
195             =back
196              
197             =head1 COPYRIGHT AND LICENSE
198              
199             This software is copyright (c) 2012 by Opera Software ASA.
200              
201             This is free software; you can redistribute it and/or modify it under
202             the same terms as the Perl 5 programming language system itself.
203              
204             =cut
205              
206             __END__
207              
208             # ABSTRACT: Build AI::NaiveBayes classifier from a set of training examples.
209