line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package AI::NaiveBayes::Learner; |
2
|
|
|
|
|
|
|
$AI::NaiveBayes::Learner::VERSION = '0.02'; |
3
|
1
|
|
|
1
|
|
19573
|
use strict; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
57
|
|
4
|
1
|
|
|
1
|
|
6
|
use warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
28
|
|
5
|
1
|
|
|
1
|
|
22
|
use 5.010; |
|
1
|
|
|
|
|
7
|
|
|
1
|
|
|
|
|
40
|
|
6
|
|
|
|
|
|
|
|
7
|
1
|
|
|
1
|
|
13
|
use List::Util qw( min sum ); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
140
|
|
8
|
1
|
|
|
1
|
|
1534
|
use Moose; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
use AI::NaiveBayes; |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
# Per-attribute total weights accumulated across all training examples.
has attributes => (is => 'ro', isa => 'HashRef', default => sub { {} }, clearer => '_clear_attrs');

# Per-label statistics: { $label => { count => N, attributes => { attr => weight } } }.
has labels => (is => 'ro', isa => 'HashRef', default => sub { {} }, clearer => '_clear_labels');

# Number of training examples seen so far (incremented by add_example).
has examples => (is => 'ro', isa => 'Int', default => 0, clearer => '_clear_examples');

# Optional limit on how many features classifier() keeps per label.
# The 'limit_features' predicate is true only when this was supplied.
# Values >= 1 are an absolute count; values < 1 are a fraction (see POD).
has features_kept => (is => 'ro', predicate => 'limit_features');

# Class that classifier() instantiates with the trained model.
has classifier_class => ( is => 'ro', isa => 'Str', default => 'AI::NaiveBayes' );
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
# Record one training example: fold its attribute weights into the global
# tally and into the per-label tallies for every label it carries.
# Expects key/value parameters: attributes => HASHREF, labels => ARRAYREF.
# Dies if either required parameter is missing.
sub add_example {
    my ( $self, %args ) = @_;

    foreach my $required ( 'attributes', 'labels' ) {
        exists $args{$required}
            or die "Missing required '$required' parameter";
    }

    # NOTE(review): pokes the object hash directly, bypassing the Moose
    # accessor for the read-only 'examples' attribute — kept as-is to
    # preserve behavior.
    $self->{examples}++;

    my $attrs = $args{attributes};

    # Fold this example's feature weights into the global attribute tally.
    add_hash( $self->attributes(), $attrs );

    my $label_stats = $self->labels;
    for my $label ( @{ $args{labels} } ) {
        $label_stats->{$label}{count}++;
        $label_stats->{$label}{attributes} //= {};
        add_hash( $label_stats->{$label}{attributes}, $attrs );
    }
}
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
# Build and return a classifier (classifier_class instance, AI::NaiveBayes
# by default) from the examples accumulated so far. Computes log prior
# probabilities per label and add-one-smoothed log conditional
# probabilities per (label, attribute) pair; optionally prunes weak
# features when features_kept was supplied.
sub classifier {
    my $self = shift;

    my $examples   = $self->examples;
    my $labels     = $self->labels;
    my $vocab_size = keys %{ $self->attributes };

    my $model;
    $model->{attributes} = $self->attributes;

    # Calculate the log-probabilities for each category
    foreach my $label (keys %$labels) {
        # log P(label) — prior from relative label frequency.
        $model->{prior_probs}{$label} = log($labels->{$label}{count} / $examples);

        # Count the number of tokens in this cat. sum() returns undef for
        # an empty list, so default to 0 to avoid an uninitialized-value
        # warning for a label with no attributes.
        my $label_tokens = sum( values %{ $labels->{$label}{attributes} } ) // 0;

        # P(attr|label) = $count/$label_tokens (simple)
        # P(attr|label) = ($count + 1)/($label_tokens + $vocab_size) (with smoothing)
        # log P(attr|label) = log($count + 1) - log($label_tokens + $vocab_size)
        my $denominator = log($label_tokens + $vocab_size);

        # Smoothing term so P(word|cat)==0 can be avoided: the log
        # probability assigned to attributes never seen under this label
        # (count == 0). Same quantity as the denominator, negated.
        $model->{smoother}{$label} = -$denominator;

        while (my ($attribute, $count) = each %{ $labels->{$label}{attributes} }) {
            $model->{probs}{$label}{$attribute} = log($count + 1) - $denominator;
        }

        if ($self->limit_features) {
            # Keep only the strongest features. Log-probs are negative, so
            # a smaller absolute value means a higher conditional
            # probability; sorting ascending by abs() puts the best first.
            my %old = %{ $model->{probs}{$label} };
            my @features = sort { abs($old{$a}) <=> abs($old{$b}) } keys(%old);

            my $limit = min($self->features_kept, 0 + @features);
            if ($limit < 1) {
                # Fractional features_kept means "keep this fraction of
                # the features" (e.g. 0.2 keeps the top 20%).
                $limit = int($limit * keys(%old));
            }
            my @top  = @features[0 .. $limit - 1];
            my %kept = map { $_ => $old{$_} } @top;
            $model->{probs}{$label} = \%kept;
        }
    }

    my $classifier_class = $self->classifier_class;
    return $classifier_class->new( model => $model );
}
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
# Accumulate the values of %$second into %$first, keyed by name.
# Keys missing from %$first are treated as starting at zero.
sub add_hash {
    my ( $target, $source ) = @_;

    # NOTE(review): this guards only the local copy — if the caller passed
    # undef, the freshly created hash is discarded on return. Kept for
    # behavioral parity; all in-module callers pass a real hashref.
    $target //= {};

    while ( my ( $key, $value ) = each %$source ) {
        $target->{$key} //= 0;
        $target->{$key} += $value;
    }
}
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
__PACKAGE__->meta->make_immutable; |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
1; |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
=pod |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
=encoding UTF-8 |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
=head1 NAME |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
AI::NaiveBayes::Learner - Build AI::NaiveBayes classifier from a set of training examples. |
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
=head1 VERSION |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
version 0.02 |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
=head1 SYNOPSIS |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
my $learner = AI::NaiveBayes::Learner->new(features_kept => 0.5); |
114
|
|
|
|
|
|
|
$learner->add_example( |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
attributes => { sheep => 1, very => 1, valuable => 1, farming => 1 }, |
117
|
|
|
|
|
|
|
labels => ['farming'] |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
); |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
my $classifier = $learner->classifier; |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
=head1 DESCRIPTION |
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
This is a trainer of AI::NaiveBayes classifiers. It saves information passed |
126
|
|
|
|
|
|
|
by the C<add_example> method from |
127
|
|
|
|
|
|
|
training data into internal structures and then constructs a classifier when |
128
|
|
|
|
|
|
|
the C<classifier> method is called. |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
=head1 ATTRIBUTES |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
=over 4 |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
=item C<features_kept> |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
Indicates how many features should remain after calculating probabilities. By |
137
|
|
|
|
|
|
|
default all of them will be kept. For C<features_kept> > 1, C<features_kept> of |
138
|
|
|
|
|
|
|
features will be preserved. For values lower than 1, a specified fraction of |
139
|
|
|
|
|
|
|
features will be kept (e.g. top 20% of features for C<features_kept> = 0.2). |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
The rest of the attributes is for class' internal usage, and thus not |
142
|
|
|
|
|
|
|
documented. |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
=item C<classifier_class> |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
The class of the classifier to be created. By default it is |
147
|
|
|
|
|
|
|
C<AI::NaiveBayes> |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
=back |
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
=head1 METHODS |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
=over 4 |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
=item C<< add_example( attributes => HASHREF, labels => ARRAYREF ) >> |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
Saves the information from a training example into internal data structures. |
158
|
|
|
|
|
|
|
The C<attributes> parameter should be a hash reference of the form |
159
|
|
|
|
|
|
|
{ feature1 => weight1, feature2 => weight2, ... } |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
=item C<classifier()> |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
Creates an AI::NaiveBayes classifier based on the data accumulated before. |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
=back |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
=head1 UTILITY SUBS |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
=over 4 |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
=item C<add_hash> |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
=back |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
=head1 BASED ON |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
Much of the code and description is from L<Algorithm::NaiveBayes>. |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
=head1 AUTHORS |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
=over 4 |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
=item * |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
Zbigniew Lukasiak <zlukasiak@opera.com> |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
=item * |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
Tadeusz Sośnierz <tsosnierz@opera.com> |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
=item * |
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
Ken Williams <ken@mathforum.org> |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
=back |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
This software is copyright (c) 2012 by Opera Software ASA. |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
This is free software; you can redistribute it and/or modify it under |
202
|
|
|
|
|
|
|
the same terms as the Perl 5 programming language system itself. |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
=cut |
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
__END__ |
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
# ABSTRACT: Build AI::NaiveBayes classifier from a set of training examples. |
209
|
|
|
|
|
|
|
|