line  stmt  bran  cond  sub  pod  time  code
   1                                    package Algorithm::NaiveBayes::Model::Frequency;
   2
   3     2                2         10  use strict;
         2                           4
         2                          77
   4     2                2        967  use Algorithm::NaiveBayes::Util qw(sum_hash add_hash max rescale);
         2                           4
         2                         131
   5     2                2         10  use base qw(Algorithm::NaiveBayes);
         2                           3
         2                        1137
   6
   7                                    sub new {
   8     2                2    1    18    my $self = shift()->SUPER::new(@_);
   9     2                          16    $self->training_data->{attributes} = {};
  10     2                           7    $self->training_data->{labels} = {};
  11     2                          16    return $self;
  12                                    }
  13
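The constructor above chains to Algorithm::NaiveBayes::new and then seeds two empty slots in the training data. As a rough sketch of the resulting state (the commented dump is inferred from the two assignments on lines 9-10, not captured output; the factory behavior is an assumption about the parent class):

    use Algorithm::NaiveBayes;

    # The parent class appears to act as a factory: by default it hands back
    # an Algorithm::NaiveBayes::Model::Frequency object, the class listed here.
    my $nb = Algorithm::NaiveBayes->new;

    # Immediately after new(), the training data presumably holds just the
    # two empty hashes initialized above:
    #   { attributes => {}, labels => {} }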
  14                                    sub do_add_instance {
  15     5                5    0     8    my ($self, $attributes, $labels, $training_data) = @_;
  16     5                          16    add_hash($training_data->{attributes}, $attributes);
  17
  18     5                          10    my $mylabels = $training_data->{labels};
  19     5                          21    foreach my $label ( @$labels ) {
  20     5                          11      $mylabels->{$label}{count}++;
  21     5         100             221      add_hash($mylabels->{$label}{attributes} ||= {}, $attributes);
  22                                      }
  23                                    }
  24
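do_add_instance folds each instance into two running tallies: a corpus-wide attribute count, and a per-label attribute count plus a per-label document count. Assuming add_hash() performs a key-wise in-place sum into its first argument (consistent with how it is called above), the training data evolves like this hypothetical trace (invented counts, not real data):

    # Two single-label instances, e.g. one spam and one ham document
    # ($td is the training_data hash shown in the listing):
    $self->do_add_instance({ cash => 2, nigeria => 1 }, ['spam'], $td);
    $self->do_add_instance({ cash => 1, meeting => 3 }, ['ham'],  $td);

    # $td->{attributes}   is now { cash => 3, nigeria => 1, meeting => 3 }
    # $td->{labels}{spam} is now { count => 1, attributes => { cash => 2, nigeria => 1 } }
    # $td->{labels}{ham}  is now { count => 1, attributes => { cash => 1, meeting => 3 } }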
  25                                    sub do_train {
  26     2                2    0     2    my ($self, $training_data) = @_;
  27     2                           4    my $m = {};
  28
  29     2                          12    my $instances = $self->instances;
  30     2                           3    my $labels = $training_data->{labels};
  31     2                           4    $m->{attributes} = $training_data->{attributes};
  32     2                           3    my $vocab_size = keys %{ $m->{attributes} };
         2                           5
  33
  34                                      # Calculate the log-probabilities for each category
  35     2                           7    foreach my $label ($self->labels) {
  36     3                          23      $m->{prior_probs}{$label} = log($labels->{$label}{count} / $instances);
  37
  38                                        # Count the number of tokens in this cat
  39     3                          10      my $label_tokens = sum_hash($labels->{$label}{attributes});
  40
  41                                        # Compute a smoothing term so P(word|cat)==0 can be avoided
  42     3                          18      $m->{smoother}{$label} = -log($label_tokens + $vocab_size);
  43
  44                                        # P(attr|label) = $count/$label_tokens (simple)
  45                                        # P(attr|label) = ($count + 1)/($label_tokens + $vocab_size) (with smoothing)
  46                                        # log P(attr|label) = log($count + 1) - log($label_tokens + $vocab_size)
  47
  48     3                           6      my $denominator = log($label_tokens + $vocab_size);
  49
  50     3                           4      while (my ($attribute, $count) = each %{ $labels->{$label}{attributes} }) {
        25                          60
  51    22                          47        $m->{probs}{$label}{$attribute} = log($count + 1) - $denominator;
  52                                        }
  53                                      }
  54     2                           8    return $m;
  55                                    }
  56
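The arithmetic in do_train is additive (Laplace) smoothing carried out in log space, exactly as the comments on lines 44-46 spell out. A self-contained worked example with made-up counts ($label_tokens, $vocab_size and $count below are hypothetical values, not taken from any real run):

    my $label_tokens = 97;   # tokens seen under this label
    my $vocab_size   = 3;    # distinct attributes in the whole corpus
    my $count        = 4;    # occurrences of one attribute under this label

    my $denominator = log($label_tokens + $vocab_size);   # log(100)
    my $log_p       = log($count + 1) - $denominator;     # log(5/100)
    my $smoother    = -log($label_tokens + $vocab_size);  # log(1/100)

    printf "P(attr|label) = %.4f\n", exp($log_p);         # prints 0.0500

Note that the smoother is exactly the log-probability an unseen attribute would receive, log(0 + 1) - $denominator, which is why do_predict can use it as a fallback.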
  57                                    sub do_predict {
  58     2                2    0     3    my ($self, $m, $newattrs) = @_;
  59
  60                                      # Note that we're using the log(prob) here. That's why we add instead of multiply.
  61
  62     2                           2    my %scores = %{$m->{prior_probs}};
         2                           6
  63     2                         486    while (my ($feature, $value) = each %$newattrs) {
  64    19   100                    49      next unless exists $m->{attributes}{$feature};  # Ignore totally unseen features
  65     7                           7      while (my ($label, $attributes) = each %{$m->{probs}}) {
        21                          51
  66    14          66              45        $scores{$label} += ($attributes->{$feature} || $m->{smoother}{$label})*$value;   # P($feature|$label)**$value
  67                                        }
  68                                      }
  69
  70     2                           4    rescale(\%scores);
  71
  72     2                           5    return \%scores;
  73                                    }
  74
  75                                    1;
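For context, a minimal end-to-end run through the public interface inherited from the parent Algorithm::NaiveBayes class (the attribute hashes and labels below are invented for illustration):

    use Algorithm::NaiveBayes;

    my $nb = Algorithm::NaiveBayes->new;
    $nb->add_instance(attributes => { cash => 2, nigeria => 1 }, label => 'spam');
    $nb->add_instance(attributes => { cash => 1, meeting => 3 }, label => 'ham');
    $nb->train;   # ends up in do_train() above

    my $scores = $nb->predict(attributes => { cash => 1, urgent => 1 });
    # 'urgent' never appeared in training, so do_predict() skips it (line 64);
    # $scores maps each label to a rescaled score, higher meaning more likely.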