File Coverage

blib/lib/AI/Categorizer/Learner/KNN.pm
Criterion Covered Total %
statement 59 66 89.3
branch 15 20 75.0
condition 3 3 100.0
subroutine 10 11 90.9
pod 2 4 50.0
total 89 104 85.5


line stmt bran cond sub pod time code
1             package AI::Categorizer::Learner::KNN;
2              
3 1     1   760 use strict;
  1         2  
  1         26  
4 1     1   5 use AI::Categorizer::Learner;
  1         2  
  1         21  
5 1     1   4 use base qw(AI::Categorizer::Learner);
  1         1  
  1         78  
6 1     1   6 use Params::Validate qw(:types);
  1         3  
  1         953  
7              
8             __PACKAGE__->valid_params
9             (
10             threshold => {type => SCALAR, default => 0.4},
11             k_value => {type => SCALAR, default => 20},
12             knn_weighting => {type => SCALAR, default => 'score'},
13             max_instances => {type => SCALAR, default => 0},
14             );
15              
16             sub create_model {
17 2     2 0 5 my $self = shift;
18 2         14 foreach my $doc ($self->knowledge_set->documents) {
19 8         21 $doc->features->normalize;
20             }
21 2         6 $self->knowledge_set->features; # Initialize
22             }
23              
24             sub threshold {
25 0     0 1 0 my $self = shift;
26 0 0       0 $self->{threshold} = shift if @_;
27 0         0 return $self->{threshold};
28             }
29              
30             sub categorize_collection {
31 2     2 1 427 my $self = shift;
32            
33 2         10 my $f_class = $self->knowledge_set->contained_class('features');
34 2 50       34 if ($f_class->can('all_features')) {
35 0         0 $f_class->all_features([$self->knowledge_set->features->names]);
36             }
37 2         17 $self->SUPER::categorize_collection(@_);
38             }
39              
40             sub get_scores {
41 16     16 0 22 my ($self, $newdoc) = @_;
42 16         44 my $currentDocName = $newdoc->name;
43             #print "classifying $currentDocName\n";
44              
45 16         49 my $features = $newdoc->features->intersection($self->knowledge_set->features)->normalize;
46 16         79 my $q = AI::Categorizer::Learner::KNN::Queue->new(size => $self->{k_value});
47              
48 16         25 my @docset;
49 16 50       30 if ($self->{max_instances}) {
50             # Use (approximately) max_instances documents, chosen randomly from corpus
51 0         0 my $probability = $self->{max_instances} / $self->knowledge_set->documents;
52 0         0 @docset = grep {rand() < $probability} $self->knowledge_set->documents;
  0         0  
53             } else {
54             # Use the whole corpus
55 16         45 @docset = $self->knowledge_set->documents;
56             }
57            
58 16         31 foreach my $doc (@docset) {
59 64         166 my $score = $doc->features->dot( $features );
60 64 50       173 warn "Score for ", $doc->name, " (", ($doc->categories)[0]->name, "): $score" if $self->verbose > 1;
61 64         146 $q->add($doc, $score);
62             }
63            
64 16         50 my %scores = map {+$_->name, 0} $self->categories;
  32         86  
65 16         28 foreach my $e (@{$q->entries}) {
  16         32  
66 32         92 foreach my $cat ($e->{thing}->categories) {
67 32 100       82 $scores{$cat->name} += ($self->{knn_weighting} eq 'score' ? $e->{score} : 1); #increment cat score
68             }
69             }
70            
71 16         79 $_ /= $self->{k_value} foreach values %scores;
72            
73 16         129 return (\%scores, $self->{threshold});
74             }
75              
76             ###################################################################
77             package AI::Categorizer::Learner::KNN::Queue;
78              
79             sub new {
80 17     17   458 my ($pkg, %args) = @_;
81 17         100 return bless {
82             size => $args{size},
83             entries => [],
84             }, $pkg;
85             }
86              
87             sub add {
88 70     70   122 my ($self, $thing, $score) = @_;
89              
90             # scores may be (0.2, 0.4, 0.4, 0.8) - ascending
91              
92 70 100 100     67 return unless (@{$self->{entries}} < $self->{size} # Queue not filled
  70         324  
93             or $score > $self->{entries}[0]{score}); # Found a better entry
94            
95 55         63 my $i;
96 55 100       60 if (!@{$self->{entries}}) {
  55 100       166  
97 17         19 $i = 0;
98             } elsif ($score > $self->{entries}[-1]{score}) {
99 18         22 $i = @{$self->{entries}};
  18         34  
100             } else {
101 20         26 for ($i = 0; $i < @{$self->{entries}}; $i++) {
  32         73  
102 29 100       78 last if $score < $self->{entries}[$i]{score};
103             }
104             }
105 55         93 splice @{$self->{entries}}, $i, 0, { thing => $thing, score => $score};
  55         187  
106 55 100       66 shift @{$self->{entries}} if @{$self->{entries}} > $self->{size};
  20         71  
  55         178  
107             }
108              
109             sub entries {
110 17     17   45 return shift->{entries};
111             }
112              
113             1;
114              
115             __END__