File Coverage

blib/lib/Algorithm/NaiveBayes/Model/Frequency.pm
Criterion   Covered   Total       %
statement        45      45   100.0
branch            2       2   100.0
condition         4       5    80.0
subroutine        7       7   100.0
pod               1       4    25.0
total            59      63    93.6
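
Both criteria below 100% can be traced in the annotated listing that follows. The condition shortfall (4 of 5) comes from the || on source line 66, reported at 66%, while the ||= on source line 21 is fully covered. Pod coverage is 1 of 4 because only new() (pod column 1 on source line 8) passes the POD check; the three do_* methods do not.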


line stmt bran cond sub pod time code
1             package Algorithm::NaiveBayes::Model::Frequency;
2              
3 2     2   10 use strict;
  2         4  
  2         77  
4 2     2   967 use Algorithm::NaiveBayes::Util qw(sum_hash add_hash max rescale);
  2         4  
  2         131  
5 2     2   10 use base qw(Algorithm::NaiveBayes);
  2         3  
  2         1137  
6              
7             sub new {
8 2     2 1 18 my $self = shift()->SUPER::new(@_);
9 2         16 $self->training_data->{attributes} = {};
10 2         7 $self->training_data->{labels} = {};
11 2         16 return $self;
12             }
13              
14             sub do_add_instance {
15 5     5 0 8 my ($self, $attributes, $labels, $training_data) = @_;
16 5         16 add_hash($training_data->{attributes}, $attributes);
17            
18 5         10 my $mylabels = $training_data->{labels};
19 5         21 foreach my $label ( @$labels ) {
20 5         11 $mylabels->{$label}{count}++;
21 5   100     221 add_hash($mylabels->{$label}{attributes} ||= {}, $attributes);
22             }
23             }
24              
25             sub do_train {
26 2     2 0 2 my ($self, $training_data) = @_;
27 2         4 my $m = {};
28            
29 2         12 my $instances = $self->instances;
30 2         3 my $labels = $training_data->{labels};
31 2         4 $m->{attributes} = $training_data->{attributes};
32 2         3 my $vocab_size = keys %{ $m->{attributes} };
  2         5  
33            
34             # Calculate the log-probabilities for each category
35 2         7 foreach my $label ($self->labels) {
36 3         23 $m->{prior_probs}{$label} = log($labels->{$label}{count} / $instances);
37            
38             # Count the number of tokens in this cat
39 3         10 my $label_tokens = sum_hash($labels->{$label}{attributes});
40            
41             # Compute a smoothing term so P(word|cat)==0 can be avoided
42 3         18 $m->{smoother}{$label} = -log($label_tokens + $vocab_size);
43            
44             # P(attr|label) = $count/$label_tokens (simple)
45             # P(attr|label) = ($count + 1)/($label_tokens + $vocab_size) (with smoothing)
46             # log P(attr|label) = log($count + 1) - log($label_tokens + $vocab_size)
47            
48 3         6 my $denominator = log($label_tokens + $vocab_size);
49            
50 3         4 while (my ($attribute, $count) = each %{ $labels->{$label}{attributes} }) {
  25         60  
51 22         47 $m->{probs}{$label}{$attribute} = log($count + 1) - $denominator;
52             }
53             }
54 2         8 return $m;
55             }
56              
57             sub do_predict {
58 2     2 0 3 my ($self, $m, $newattrs) = @_;
59            
60             # Note that we're using the log(prob) here. That's why we add instead of multiply.
61            
62 2         2 my %scores = %{$m->{prior_probs}};
  2         6  
63 2         486 while (my ($feature, $value) = each %$newattrs) {
64 19 100       49 next unless exists $m->{attributes}{$feature}; # Ignore totally unseen features
65 7         7 while (my ($label, $attributes) = each %{$m->{probs}}) {
  21         51  
66 14   66     45 $scores{$label} += ($attributes->{$feature} || $m->{smoother}{$label})*$value; # P($feature|$label)**$value
67             }
68             }
69            
70 2         4 rescale(\%scores);
71              
72 2         5 return \%scores;
73             }
74              
75             1;
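
For context, the four subroutines above are not called directly; they are reached through the public Algorithm::NaiveBayes interface (add_instance(), train(), predict()). The following is a minimal sketch of such a driver, assuming the default Frequency model and using made-up labels and attribute counts rather than the distribution's actual test data:

  use strict;
  use warnings;
  use Algorithm::NaiveBayes;

  # 'Frequency' is the default model_type, so this exercises the module above.
  my $nb = Algorithm::NaiveBayes->new;

  # Each call lands in do_add_instance(), which tallies attribute counts
  # globally and per label via add_hash().
  $nb->add_instance(attributes => { foo => 2, bar => 1 }, label => 'sports');
  $nb->add_instance(attributes => { foo => 1, baz => 3 }, label => 'finance');

  # train() dispatches to do_train(), turning the raw counts into smoothed
  # log-probabilities: log P(attr|label) = log(count + 1) - log(label_tokens + vocab_size)
  $nb->train;

  # predict() dispatches to do_predict(), which adds log-probabilities
  # (hence the += in the covered code) and rescale()s them back into
  # ordinary scores between 0 and 1.
  my $scores = $nb->predict(attributes => { foo => 1, baz => 1 });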