| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package AI::NaiveBayes::Learner; | 
| 2 |  |  |  |  |  |  | $AI::NaiveBayes::Learner::VERSION = '0.03'; | 
| 3 | 3 |  |  | 3 |  | 23005 | use strict; | 
|  | 3 |  |  |  |  | 7 |  | 
|  | 3 |  |  |  |  | 90 |  | 
| 4 | 3 |  |  | 3 |  | 15 | use warnings; | 
|  | 3 |  |  |  |  | 5 |  | 
|  | 3 |  |  |  |  | 90 |  | 
| 5 | 3 |  |  | 3 |  | 80 | use 5.010; | 
|  | 3 |  |  |  |  | 13 |  | 
| 6 |  |  |  |  |  |  |  | 
| 7 | 3 |  |  | 3 |  | 16 | use List::Util qw( min sum ); | 
|  | 3 |  |  |  |  | 6 |  | 
|  | 3 |  |  |  |  | 275 |  | 
| 8 | 3 |  |  | 3 |  | 916 | use Moose; | 
|  | 3 |  |  |  |  | 472694 |  | 
|  | 3 |  |  |  |  | 22 |  | 
| 9 | 3 |  |  | 3 |  | 20255 | use AI::NaiveBayes; | 
|  | 3 |  |  |  |  | 7 |  | 
|  | 3 |  |  |  |  | 2445 |  | 
| 10 |  |  |  |  |  |  |  | 
# --- Internal training state ---------------------------------------------
# Global feature-weight tallies, per-label statistics, and the number of
# training examples seen so far.  Each has a private clearer for reuse.
has attributes => ( is => 'ro', isa => 'HashRef', default => sub { {} }, clearer => '_clear_attrs' );
has labels     => ( is => 'ro', isa => 'HashRef', default => sub { {} }, clearer => '_clear_labels' );
has examples   => ( is => 'ro', isa => 'Int',     default => 0,          clearer => '_clear_examples' );

# Optional cap on retained features: a count (>= 1) or a fraction (< 1).
# The predicate doubles as the "should we prune?" flag in classifier().
has features_kept => ( is => 'ro', predicate => 'limit_features' );

# Class instantiated by classifier(); must accept a 'model' constructor arg.
has classifier_class => ( is => 'ro', isa => 'Str', default => 'AI::NaiveBayes' );
| 18 |  |  |  |  |  |  |  | 
# Record one training example.
# Named parameters (both required):
#   attributes - hashref of { feature => weight, ... }
#   labels     - arrayref of label names this example belongs to
# Dies if either parameter is missing.
sub add_example {
    my ( $self, %params ) = @_;

    for my $required (qw(attributes labels)) {
        die "Missing required '$required' parameter"
            unless exists $params{$required};
    }

    # 'examples' is a read-only Moose attribute, so the underlying hash
    # slot is bumped directly rather than through the accessor.
    $self->{examples}++;

    # Fold this example's feature weights into the global tallies.
    add_hash( $self->attributes(), $params{attributes} );

    # Update per-label example counts and per-label feature tallies.
    my $label_stats = $self->labels;
    for my $label ( @{ $params{labels} } ) {
        $label_stats->{$label}{count}++;
        $label_stats->{$label}{attributes} //= {};
        add_hash( $label_stats->{$label}{attributes}, $params{attributes} );
    }
}
| 39 |  |  |  |  |  |  |  | 
# Build and return a classifier (classifier_class instance) from the
# accumulated training data.  The model holds, per label:
#   prior_probs - log P(label)
#   smoother    - log-probability assigned to unseen features
#   probs       - log P(feature|label) with add-one (Laplace) smoothing:
#                 log(count + 1) - log(label_tokens + vocab_size)
sub classifier {
    my $self = shift;

    my $example_count = $self->examples;
    my $label_stats   = $self->labels;
    my $vocab_size    = scalar keys %{ $self->attributes };

    my $model = { attributes => $self->attributes };

    for my $label ( keys %$label_stats ) {
        my $stats = $label_stats->{$label};

        # Prior: fraction of training examples carrying this label.
        $model->{prior_probs}{$label} = log( $stats->{count} / $example_count );

        # Total token mass observed for this label.
        my $label_tokens = sum( values %{ $stats->{attributes} } );

        # Shared smoothing denominator; the smoother is what an unseen
        # feature contributes: log(0 + 1) - log(label_tokens + vocab_size).
        my $log_denominator = log( $label_tokens + $vocab_size );
        $model->{smoother}{$label} = -$log_denominator;

        for my $attribute ( keys %{ $stats->{attributes} } ) {
            $model->{probs}{$label}{$attribute} =
                log( $stats->{attributes}{$attribute} + 1 ) - $log_denominator;
        }

        # Optionally keep only the strongest features (largest
        # probability = smallest |log prob|, so ascending order).
        if ( $self->limit_features ) {
            my %all    = %{ $model->{probs}{$label} };
            my @ranked = sort { abs( $all{$a} ) <=> abs( $all{$b} ) } keys %all;

            my $keep = min( $self->features_kept, scalar @ranked );
            # Fractional features_kept means "keep this share of features".
            $keep = int( $keep * @ranked ) if $keep < 1;

            my %kept;
            @kept{ @ranked[ 0 .. $keep - 1 ] } = @all{ @ranked[ 0 .. $keep - 1 ] };
            $model->{probs}{$label} = \%kept;
        }
    }

    return $self->classifier_class->new( model => $model );
}
| 85 |  |  |  |  |  |  |  | 
# Merge the numeric values of %$second into %$first in place, summing
# values for keys present in both.  Returns nothing useful.
#
# Fix: the old '$first //= {};' only initialized the lexical copy, so an
# undef first argument silently accumulated into a temporary hash that
# the caller never saw.  Make that no-op explicit with an early return.
sub add_hash {
    my ( $first, $second ) = @_;

    # No destination to merge into - nothing observable to do.
    return unless defined $first;

    # Tolerate a missing source hash instead of dereferencing undef.
    $second //= {};

    for my $key ( keys %$second ) {
        $first->{$key} = ( $first->{$key} // 0 ) + $second->{$key};
    }

    return;
}
| 94 |  |  |  |  |  |  |  | 
| 95 |  |  |  |  |  |  | __PACKAGE__->meta->make_immutable; | 
| 96 |  |  |  |  |  |  |  | 
| 97 |  |  |  |  |  |  | 1; | 
| 98 |  |  |  |  |  |  |  | 
| 99 |  |  |  |  |  |  | =pod | 
| 100 |  |  |  |  |  |  |  | 
| 101 |  |  |  |  |  |  | =encoding UTF-8 | 
| 102 |  |  |  |  |  |  |  | 
| 103 |  |  |  |  |  |  | =head1 NAME | 
| 104 |  |  |  |  |  |  |  | 
| 105 |  |  |  |  |  |  | AI::NaiveBayes::Learner - Build AI::NaiveBayes classifier from a set of training examples. | 
| 106 |  |  |  |  |  |  |  | 
| 107 |  |  |  |  |  |  | =head1 VERSION | 
| 108 |  |  |  |  |  |  |  | 
| 109 |  |  |  |  |  |  | version 0.03 | 
| 110 |  |  |  |  |  |  |  | 
| 111 |  |  |  |  |  |  | =head1 SYNOPSIS | 
| 112 |  |  |  |  |  |  |  | 
| 113 |  |  |  |  |  |  | my $learner = AI::NaiveBayes::Learner->new(features_kept => 0.5); | 
| 114 |  |  |  |  |  |  | $learner->add_example( | 
| 115 |  |  |  |  |  |  | attributes => { sheep => 1, very => 1, valuable => 1, farming => 1 }, | 
| 116 |  |  |  |  |  |  | labels => ['farming'], | 
| 117 |  |  |  |  |  |  | ); | 
| 120 |  |  |  |  |  |  |  | 
| 121 |  |  |  |  |  |  | my $classifier = $learner->classifier; | 
| 122 |  |  |  |  |  |  |  | 
| 123 |  |  |  |  |  |  | =head1 DESCRIPTION | 
| 124 |  |  |  |  |  |  |  | 
| 125 |  |  |  |  |  |  | This is a trainer of AI::NaiveBayes classifiers.  It saves information passed | 
| 126 |  |  |  |  |  |  | by the C<add_example> method from | 
| 127 |  |  |  |  |  |  | training data into internal structures and then constructs a classifier when | 
| 128 |  |  |  |  |  |  | the C<classifier> method is called. | 
| 129 |  |  |  |  |  |  |  | 
| 130 |  |  |  |  |  |  | =head1 ATTRIBUTES | 
| 131 |  |  |  |  |  |  |  | 
| 132 |  |  |  |  |  |  | =over 4 | 
| 133 |  |  |  |  |  |  |  | 
| 134 |  |  |  |  |  |  | =item C<features_kept> | 
| 135 |  |  |  |  |  |  |  | 
| 136 |  |  |  |  |  |  | Indicates how many features should remain after calculating probabilities. By | 
| 137 |  |  |  |  |  |  | default all of them will be kept. For C<features_kept> > 1, C<features_kept> of | 
| 138 |  |  |  |  |  |  | features will be preserved. For values lower than 1, a specified fraction of | 
| 139 |  |  |  |  |  |  | features will be kept (e.g. top 20% of features for C<features_kept> = 0.2). | 
| 140 |  |  |  |  |  |  |  | 
| 141 |  |  |  |  |  |  | The rest of the attributes are for the class's internal use, and thus not | 
| 142 |  |  |  |  |  |  | documented. | 
| 143 |  |  |  |  |  |  |  | 
| 144 |  |  |  |  |  |  | =item C<classifier_class> | 
| 145 |  |  |  |  |  |  |  | 
| 146 |  |  |  |  |  |  | The class of the classifier to be created.  By default it is | 
| 147 |  |  |  |  |  |  | C<AI::NaiveBayes> | 
| 148 |  |  |  |  |  |  |  | 
| 149 |  |  |  |  |  |  | =back | 
| 150 |  |  |  |  |  |  |  | 
| 151 |  |  |  |  |  |  | =head1 METHODS | 
| 152 |  |  |  |  |  |  |  | 
| 153 |  |  |  |  |  |  | =over 4 | 
| 154 |  |  |  |  |  |  |  | 
| 155 |  |  |  |  |  |  | =item C<add_example( attributes =E<gt> HASHREF, labels =E<gt> ARRAYREF )> | 
| 156 |  |  |  |  |  |  |  | 
| 157 |  |  |  |  |  |  | Saves the information from a training example into internal data structures. | 
| 158 |  |  |  |  |  |  | Takes named parameters: C<attributes>, a hash reference of the form | 
| 159 |  |  |  |  |  |  | { feature1 => weight1, feature2 => weight2, ... }, and C<labels>, an array | 
| 160 |  |  |  |  |  |  | reference of label names. | 
| 160 |  |  |  |  |  |  |  | 
| 161 |  |  |  |  |  |  | =item C<classifier()> | 
| 162 |  |  |  |  |  |  |  | 
| 163 |  |  |  |  |  |  | Creates an AI::NaiveBayes classifier based on the data accumulated before. | 
| 164 |  |  |  |  |  |  |  | 
| 165 |  |  |  |  |  |  | =back | 
| 166 |  |  |  |  |  |  |  | 
| 167 |  |  |  |  |  |  | =head1 UTILITY SUBS | 
| 168 |  |  |  |  |  |  |  | 
| 169 |  |  |  |  |  |  | =over 4 | 
| 170 |  |  |  |  |  |  |  | 
| 171 |  |  |  |  |  |  | =item C<add_hash> | 
| 172 |  |  |  |  |  |  |  | 
| 173 |  |  |  |  |  |  | =back | 
| 174 |  |  |  |  |  |  |  | 
| 175 |  |  |  |  |  |  | =head1 BASED ON | 
| 176 |  |  |  |  |  |  |  | 
| 177 |  |  |  |  |  |  | Much of the code and description is from L<Algorithm::NaiveBayes>. | 
| 178 |  |  |  |  |  |  |  | 
| 179 |  |  |  |  |  |  | =head1 AUTHORS | 
| 180 |  |  |  |  |  |  |  | 
| 181 |  |  |  |  |  |  | =over 4 | 
| 182 |  |  |  |  |  |  |  | 
| 183 |  |  |  |  |  |  | =item * | 
| 184 |  |  |  |  |  |  |  | 
| 185 |  |  |  |  |  |  | Zbigniew Lukasiak <zlukasiak@opera.com> | 
| 186 |  |  |  |  |  |  |  | 
| 187 |  |  |  |  |  |  | =item * | 
| 188 |  |  |  |  |  |  |  | 
| 189 |  |  |  |  |  |  | Tadeusz Sośnierz <tsosnierz@opera.com> | 
| 190 |  |  |  |  |  |  |  | 
| 191 |  |  |  |  |  |  | =item * | 
| 192 |  |  |  |  |  |  |  | 
| 193 |  |  |  |  |  |  | Ken Williams <ken@mathforum.org> | 
| 194 |  |  |  |  |  |  |  | 
| 195 |  |  |  |  |  |  | =back | 
| 196 |  |  |  |  |  |  |  | 
| 197 |  |  |  |  |  |  | =head1 COPYRIGHT AND LICENSE | 
| 198 |  |  |  |  |  |  |  | 
| 199 |  |  |  |  |  |  | This software is copyright (c) 2012 by Opera Software ASA. | 
| 200 |  |  |  |  |  |  |  | 
| 201 |  |  |  |  |  |  | This is free software; you can redistribute it and/or modify it under | 
| 202 |  |  |  |  |  |  | the same terms as the Perl 5 programming language system itself. | 
| 203 |  |  |  |  |  |  |  | 
| 204 |  |  |  |  |  |  | =cut | 
| 205 |  |  |  |  |  |  |  | 
| 206 |  |  |  |  |  |  | __END__ | 
| 207 |  |  |  |  |  |  |  | 
| 208 |  |  |  |  |  |  | # ABSTRACT: Build AI::NaiveBayes classifier from a set of training examples. | 
| 209 |  |  |  |  |  |  |  |