package AI::NaiveBayes::Learner;
$AI::NaiveBayes::Learner::VERSION = '0.04';

use strict;
use warnings;
use 5.010;

use List::Util qw( min sum );
use Moose;

use AI::NaiveBayes;

has attributes => (is => 'ro', isa => 'HashRef', default => sub { {} }, clearer => '_clear_attrs');
has labels     => (is => 'ro', isa => 'HashRef', default => sub { {} }, clearer => '_clear_labels');
has examples   => (is => 'ro', isa => 'Int',     default => 0,          clearer => '_clear_examples');

has features_kept    => (is => 'ro', predicate => 'limit_features');
has classifier_class => (is => 'ro', isa => 'Str', default => 'AI::NaiveBayes');

sub add_example {
    my ($self, %params) = @_;
    for ('attributes', 'labels') {
        die "Missing required '$_' parameter" unless exists $params{$_};
    }

    # Bumps the count by poking the underlying hash directly, since the
    # Moose 'examples' attribute is declared read-only
    $self->{examples}++;

    my $attributes = $params{attributes};
    my $labels     = $params{labels};

    add_hash($self->attributes(), $attributes);

    my $our_labels = $self->labels;
    foreach my $label ( @$labels ) {
        $our_labels->{$label}{count}++;
        $our_labels->{$label}{attributes} //= {};
        add_hash($our_labels->{$label}{attributes}, $attributes);
    }
}

sub classifier {
    my $self = shift;

    my $examples   = $self->examples;
    my $labels     = $self->labels;
    my $vocab_size = keys %{ $self->attributes };

    my $model;
    $model->{attributes} = $self->attributes;

    # Calculate the log-probabilities for each category
    foreach my $label (keys %$labels) {
        $model->{prior_probs}{$label} = log($labels->{$label}{count} / $examples);
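        # e.g. (hypothetical numbers) 3 of 12 examples carrying this label
        # would give a prior of log(3/12) ≈ -1.39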

        # Count the number of tokens in this category
        my $label_tokens = sum( values %{ $labels->{$label}{attributes} } );

        # Compute a smoothing term so that P(word|cat) == 0 can be avoided
        $model->{smoother}{$label} = -log($label_tokens + $vocab_size);

        # P(attr|label)     = $count/$label_tokens                         (simple)
        # P(attr|label)     = ($count + 1)/($label_tokens + $vocab_size)   (with add-one smoothing)
        # log P(attr|label) = log($count + 1) - log($label_tokens + $vocab_size)
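        # e.g. (hypothetical numbers) $count = 3, $label_tokens = 10 and
        # $vocab_size = 5 would give log(4) - log(15) ≈ -1.32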
        my $denominator = log($label_tokens + $vocab_size);

        while (my ($attribute, $count) = each %{ $labels->{$label}{attributes} }) {
            $model->{probs}{$label}{$attribute} = log($count + 1) - $denominator;
        }

        if ($self->limit_features) {
            # Keep the features whose log-probabilities are closest to
            # zero, i.e. the most probable features for this label
            my %old      = %{ $model->{probs}{$label} };
            my @features = sort { abs($old{$a}) <=> abs($old{$b}) } keys %old;
            my $limit    = min($self->features_kept, 0+@features);
            if ($limit < 1) {
                # A fractional features_kept means a fraction of all features
                $limit = int($limit * keys %old);
            }
            my @top  = @features[0 .. $limit-1];
            my %kept = map { $_ => $old{$_} } @top;
            $model->{probs}{$label} = \%kept;
        }
    }

    my $classifier_class = $self->classifier_class;
    return $classifier_class->new( model => $model );
}

sub add_hash {
    my ($first, $second) = @_;
    $first //= {};
    # Sum the values of $second into $first, key by key
    foreach my $k (keys %$second) {
        $first->{$k} //= 0;
        $first->{$k} += $second->{$k};
    }
}

__PACKAGE__->meta->make_immutable;

1;

=pod

=encoding UTF-8

=head1 NAME

AI::NaiveBayes::Learner - Build AI::NaiveBayes classifier from a set of training examples.

=head1 VERSION

version 0.04

=head1 SYNOPSIS

    my $learner = AI::NaiveBayes::Learner->new(features_kept => 0.5);
    $learner->add_example(
        attributes => { sheep => 1, very => 1, valuable => 1, farming => 1 },
        labels     => ['farming'],
    );

    my $classifier = $learner->classifier;

=head1 DESCRIPTION

This is a trainer of AI::NaiveBayes classifiers. It accumulates the
training data passed to the C<add_example> method in internal
structures, and constructs a classifier when the C<classifier> method
is called.
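
A minimal end-to-end sketch (the feature names are illustrative, and the
C<classify> call assumes the default L<AI::NaiveBayes> classifier class):

    my $learner = AI::NaiveBayes::Learner->new;
    $learner->add_example(
        attributes => { sheep => 1, wool => 1 },
        labels     => ['farming'],
    );
    my $classifier = $learner->classifier;
    my $result     = $classifier->classify( { wool => 1 } );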

=head1 ATTRIBUTES

=over 4

=item C<features_kept>

Indicates how many features should remain after calculating
probabilities. By default all of them will be kept. For values greater
than 1, that many features will be preserved. For values lower than 1,
the given fraction of features will be kept (e.g. the top 20% of
features for C<features_kept> = 0.2).
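
Both styles of limit are set through the constructor, for example:

    # keep the 100 most probable features per label
    my $learner = AI::NaiveBayes::Learner->new( features_kept => 100 );

    # or keep the top 20% of features per label
    my $fractional = AI::NaiveBayes::Learner->new( features_kept => 0.2 );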

The remaining attributes are for the class's internal use and are
therefore not documented.

=item C<classifier_class>

The class of the classifier to be created. By default it is
C<AI::NaiveBayes>.
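
C<classifier> constructs the object with
C<< $classifier_class->new( model => $model ) >>, so any class whose
constructor accepts a C<model> parameter can be substituted; the class
name below is hypothetical:

    my $learner = AI::NaiveBayes::Learner->new(
        classifier_class => 'My::NaiveBayes',
    );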

=back

=head1 METHODS

=over 4

=item C<< add_example( attributes => HASHREF, labels => ARRAYREF ) >>

Saves the information from a training example into internal data structures.
C<attributes> should be a hash reference of the form

    { feature1 => weight1, feature2 => weight2, ... }

C<labels> should be an array reference of strings denoting one or more
classes to which the example belongs.
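
A single example may carry several labels at once; for instance (feature
names are illustrative):

    $learner->add_example(
        attributes => { wool => 2, export => 1 },
        labels     => [ 'farming', 'economy' ],
    );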

=item C<classifier()>

Creates an AI::NaiveBayes classifier based on the examples accumulated
so far.
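
The learner keeps its accumulated counts after this call, so further
examples can be added and C<classifier> called again to build a fresh
model. A sketch:

    my $first = $learner->classifier;
    $learner->add_example( attributes => { hay => 1 }, labels => ['farming'] );
    my $second = $learner->classifier;   # rebuilt from all examples so far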

=back

=head1 UTILITY SUBS

=over 4

=item C<add_hash>
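
Adds the values of the second hash reference into the first, key by key;
it is used internally to accumulate feature counts. A minimal
illustration:

    my %totals = ( sheep => 1 );
    add_hash( \%totals, { sheep => 2, wool => 1 } );
    # %totals is now ( sheep => 3, wool => 1 )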
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
=back |
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
=head1 BASED ON |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
Much of the code and description is from L<Algorithm::NaiveBayes>. |
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
=head1 AUTHORS |
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
=over 4 |
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
=item * |
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
Zbigniew Lukasiak <zlukasiak@opera.com> |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
=item * |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
Tadeusz SoÅnierz <tsosnierz@opera.com> |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
=item * |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
Ken Williams <ken@mathforum.org> |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
=back |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
This software is copyright (c) 2012 by Opera Software ASA. |
199
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
This is free software; you can redistribute it and/or modify it under |
201
|
|
|
|
|
|
|
the same terms as the Perl 5 programming language system itself. |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
=cut |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
__END__ |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
# ABSTRACT: Build AI::NaiveBayes classifier from a set of training examples. |
208
|
|
|
|
|
|
|
|