File Coverage

blib/lib/AI/NaiveBayes/Learner.pm
Criterion Covered Total %
statement 13 15 86.6
branch n/a
condition n/a
subroutine 5 5 100.0
pod n/a
total 18 20 90.0


line stmt bran cond sub pod time code
1             package AI::NaiveBayes::Learner;
2             $AI::NaiveBayes::Learner::VERSION = '0.02';
3 1     1   19573 use strict;
  1         3  
  1         57  
4 1     1   6 use warnings;
  1         2  
  1         28  
5 1     1   22 use 5.010;
  1         7  
  1         40  
6              
7 1     1   13 use List::Util qw( min sum );
  1         2  
  1         140  
8 1     1   1534 use Moose;
  0            
  0            
9             use AI::NaiveBayes;
10              
11             has attributes => (is => 'ro', isa => 'HashRef', default => sub { {} }, clearer => '_clear_attrs');
12             has labels => (is => 'ro', isa => 'HashRef', default => sub { {} }, clearer => '_clear_labels');
13             has examples => (is => 'ro', isa => 'Int', default => 0, clearer => '_clear_examples');
14              
15             has features_kept => (is => 'ro', predicate => 'limit_features');
16              
17             has classifier_class => ( is => 'ro', isa => 'Str', default => 'AI::NaiveBayes' );
18              
19             sub add_example {
20             my ($self, %params) = @_;
21             for ('attributes', 'labels') {
22             die "Missing required '$_' parameter" unless exists $params{$_};
23             }
24              
25             $self->{examples}++;
26              
27             my $attributes = $params{attributes};
28             my $labels = $params{labels};
29              
30             add_hash($self->attributes(), $attributes);
31              
32             my $our_labels = $self->labels;
33             foreach my $label ( @$labels ) {
34             $our_labels->{$label}{count}++;
35             $our_labels->{$label}{attributes} //= {};
36             add_hash($our_labels->{$label}{attributes}, $attributes);
37             }
38             }
39              
40             sub classifier {
41             my $self = shift;
42              
43             my $examples = $self->examples;
44             my $labels = $self->labels;
45             my $vocab_size = keys %{ $self->attributes };
46             my $model;
47             $model->{attributes} = $self->attributes;
48              
49              
50             # Calculate the log-probabilities for each category
51             foreach my $label (keys %$labels) {
52             $model->{prior_probs}{$label} = log($labels->{$label}{count} / $examples);
53              
54             # Count the number of tokens in this cat
55             my $label_tokens = sum( values %{ $labels->{$label}{attributes} } );
56              
57             # Compute a smoothing term so P(word|cat)==0 can be avoided
58             $model->{smoother}{$label} = -log($label_tokens + $vocab_size);
59              
60             # P(attr|label) = $count/$label_tokens (simple)
61             # P(attr|label) = ($count + 1)/($label_tokens + $vocab_size) (with smoothing)
62             # log P(attr|label) = log($count + 1) - log($label_tokens + $vocab_size)
63              
64             my $denominator = log($label_tokens + $vocab_size);
65              
66             while (my ($attribute, $count) = each %{ $labels->{$label}{attributes} }) {
67             $model->{probs}{$label}{$attribute} = log($count + 1) - $denominator;
68             }
69              
70             if ($self->limit_features) {
71             my %old = %{$model->{probs}{$label}};
72             my @features = sort { abs($old{$a}) <=> abs($old{$b}) } keys(%old);
73             my $limit = min($self->features_kept, 0+@features);
74             if ($limit < 1) {
75             $limit = int($limit * keys(%old));
76             }
77             my @top = @features[0..$limit-1];
78             my %kept = map { $_ => $old{$_} } @top;
79             $model->{probs}{$label} = \%kept;
80             }
81             }
82             my $classifier_class = $self->classifier_class;
83             return $classifier_class->new( model => $model );
84             }
85              
86             sub add_hash {
87             my ($first, $second) = @_;
88             $first //= {};
89             foreach my $k (keys %$second) {
90             $first->{$k} //= 0;
91             $first->{$k} += $second->{$k};
92             }
93             }
94              
95             __PACKAGE__->meta->make_immutable;
96              
97             1;
98              
99             =pod
100              
101             =encoding UTF-8
102              
103             =head1 NAME
104              
105             AI::NaiveBayes::Learner - Build AI::NaiveBayes classifier from a set of training examples.
106              
107             =head1 VERSION
108              
109             version 0.02
110              
111             =head1 SYNOPSIS
112              
113             my $learner = AI::NaiveBayes::Learner->new(features_kept => 0.5);
114             $learner->add_example(
115             {
116             attributes => { sheep => 1, very => 1, valuable => 1, farming => 1 },
117             labels => ['farming']
118             },
119             );
120              
121             my $classifier = $learner->classifier;
122              
123             =head1 DESCRIPTION
124              
125             This is a trainer of AI::NaiveBayes classifiers. It saves information passed
126             by the C<add_example> method from
127             training data into internal structures and then constructs a classifier when
128             the C<classifier> method is called.
129              
130             =head1 ATTRIBUTES
131              
132             =over 4
133              
134             =item C<features_kept>
135              
136             Indicates how many features should remain after calculating probabilities. By
137             default all of them will be kept. For C<features_kept> > 1, C<features_kept> of
138             features will be preserved. For values lower than 1, a specified fraction of
139             features will be kept (e.g. top 20% of features for C<features_kept> = 0.2).
140              
141             The rest of the attributes is for class' internal usage, and thus not
142             documented.
143              
144             =item C<classifier_class>
145              
146             The class of the classifier to be created. By default it is
147             C<AI::NaiveBayes>
148              
149             =back
150              
151             =head1 METHODS
152              
153             =over 4
154              
155             =item C<add_example( HASHREF )>
156              
157             Saves the information from a training example into internal data structures.
158             The parameter should be of the form of
159             { feature1 => weight1, feature2 => weight2, ... }
160              
161             =item C<classifier()>
162              
163             Creates an AI::NaiveBayes classifier based on the data accumulated before.
164              
165             =back
166              
167             =head1 UTILITY SUBS
168              
169             =over 4
170              
171             =item C<add_hash>
172              
173             =back
174              
175             =head1 BASED ON
176              
177             Much of the code and description is from L<Algorithm::NaiveBayes>.
178              
179             =head1 AUTHORS
180              
181             =over 4
182              
183             =item *
184              
185             Zbigniew Lukasiak <zlukasiak@opera.com>
186              
187             =item *
188              
189             Tadeusz SoÅ›nierz <tsosnierz@opera.com>
190              
191             =item *
192              
193             Ken Williams <ken@mathforum.org>
194              
195             =back
196              
197             =head1 COPYRIGHT AND LICENSE
198              
199             This software is copyright (c) 2012 by Opera Software ASA.
200              
201             This is free software; you can redistribute it and/or modify it under
202             the same terms as the Perl 5 programming language system itself.
203              
204             =cut
205              
206             __END__
207              
208             # ABSTRACT: Build AI::NaiveBayes classifier from a set of training examples.
209