line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package AI::Categorizer::Learner::KNN; |
2
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
760
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
26
|
|
4
|
1
|
|
|
1
|
|
5
|
use AI::Categorizer::Learner; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
21
|
|
5
|
1
|
|
|
1
|
|
4
|
use base qw(AI::Categorizer::Learner); |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
78
|
|
6
|
1
|
|
|
1
|
|
6
|
use Params::Validate qw(:types); |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
953
|
|
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
__PACKAGE__->valid_params |
9
|
|
|
|
|
|
|
( |
10
|
|
|
|
|
|
|
threshold => {type => SCALAR, default => 0.4}, |
11
|
|
|
|
|
|
|
k_value => {type => SCALAR, default => 20}, |
12
|
|
|
|
|
|
|
knn_weighting => {type => SCALAR, default => 'score'}, |
13
|
|
|
|
|
|
|
max_instances => {type => SCALAR, default => 0}, |
14
|
|
|
|
|
|
|
); |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
sub create_model { |
17
|
2
|
|
|
2
|
0
|
5
|
my $self = shift; |
18
|
2
|
|
|
|
|
14
|
foreach my $doc ($self->knowledge_set->documents) { |
19
|
8
|
|
|
|
|
21
|
$doc->features->normalize; |
20
|
|
|
|
|
|
|
} |
21
|
2
|
|
|
|
|
6
|
$self->knowledge_set->features; # Initialize |
22
|
|
|
|
|
|
|
} |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
sub threshold { |
25
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
26
|
0
|
0
|
|
|
|
0
|
$self->{threshold} = shift if @_; |
27
|
0
|
|
|
|
|
0
|
return $self->{threshold}; |
28
|
|
|
|
|
|
|
} |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
sub categorize_collection { |
31
|
2
|
|
|
2
|
1
|
427
|
my $self = shift; |
32
|
|
|
|
|
|
|
|
33
|
2
|
|
|
|
|
10
|
my $f_class = $self->knowledge_set->contained_class('features'); |
34
|
2
|
50
|
|
|
|
34
|
if ($f_class->can('all_features')) { |
35
|
0
|
|
|
|
|
0
|
$f_class->all_features([$self->knowledge_set->features->names]); |
36
|
|
|
|
|
|
|
} |
37
|
2
|
|
|
|
|
17
|
$self->SUPER::categorize_collection(@_); |
38
|
|
|
|
|
|
|
} |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
sub get_scores { |
41
|
16
|
|
|
16
|
0
|
22
|
my ($self, $newdoc) = @_; |
42
|
16
|
|
|
|
|
44
|
my $currentDocName = $newdoc->name; |
43
|
|
|
|
|
|
|
#print "classifying $currentDocName\n"; |
44
|
|
|
|
|
|
|
|
45
|
16
|
|
|
|
|
49
|
my $features = $newdoc->features->intersection($self->knowledge_set->features)->normalize; |
46
|
16
|
|
|
|
|
79
|
my $q = AI::Categorizer::Learner::KNN::Queue->new(size => $self->{k_value}); |
47
|
|
|
|
|
|
|
|
48
|
16
|
|
|
|
|
25
|
my @docset; |
49
|
16
|
50
|
|
|
|
30
|
if ($self->{max_instances}) { |
50
|
|
|
|
|
|
|
# Use (approximately) max_instances documents, chosen randomly from corpus |
51
|
0
|
|
|
|
|
0
|
my $probability = $self->{max_instances} / $self->knowledge_set->documents; |
52
|
0
|
|
|
|
|
0
|
@docset = grep {rand() < $probability} $self->knowledge_set->documents; |
|
0
|
|
|
|
|
0
|
|
53
|
|
|
|
|
|
|
} else { |
54
|
|
|
|
|
|
|
# Use the whole corpus |
55
|
16
|
|
|
|
|
45
|
@docset = $self->knowledge_set->documents; |
56
|
|
|
|
|
|
|
} |
57
|
|
|
|
|
|
|
|
58
|
16
|
|
|
|
|
31
|
foreach my $doc (@docset) { |
59
|
64
|
|
|
|
|
166
|
my $score = $doc->features->dot( $features ); |
60
|
64
|
50
|
|
|
|
173
|
warn "Score for ", $doc->name, " (", ($doc->categories)[0]->name, "): $score" if $self->verbose > 1; |
61
|
64
|
|
|
|
|
146
|
$q->add($doc, $score); |
62
|
|
|
|
|
|
|
} |
63
|
|
|
|
|
|
|
|
64
|
16
|
|
|
|
|
50
|
my %scores = map {+$_->name, 0} $self->categories; |
|
32
|
|
|
|
|
86
|
|
65
|
16
|
|
|
|
|
28
|
foreach my $e (@{$q->entries}) { |
|
16
|
|
|
|
|
32
|
|
66
|
32
|
|
|
|
|
92
|
foreach my $cat ($e->{thing}->categories) { |
67
|
32
|
100
|
|
|
|
82
|
$scores{$cat->name} += ($self->{knn_weighting} eq 'score' ? $e->{score} : 1); #increment cat score |
68
|
|
|
|
|
|
|
} |
69
|
|
|
|
|
|
|
} |
70
|
|
|
|
|
|
|
|
71
|
16
|
|
|
|
|
79
|
$_ /= $self->{k_value} foreach values %scores; |
72
|
|
|
|
|
|
|
|
73
|
16
|
|
|
|
|
129
|
return (\%scores, $self->{threshold}); |
74
|
|
|
|
|
|
|
} |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
################################################################### |
77
|
|
|
|
|
|
|
package AI::Categorizer::Learner::KNN::Queue; |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
sub new { |
80
|
17
|
|
|
17
|
|
458
|
my ($pkg, %args) = @_; |
81
|
17
|
|
|
|
|
100
|
return bless { |
82
|
|
|
|
|
|
|
size => $args{size}, |
83
|
|
|
|
|
|
|
entries => [], |
84
|
|
|
|
|
|
|
}, $pkg; |
85
|
|
|
|
|
|
|
} |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
sub add { |
88
|
70
|
|
|
70
|
|
122
|
my ($self, $thing, $score) = @_; |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
# scores may be (0.2, 0.4, 0.4, 0.8) - ascending |
91
|
|
|
|
|
|
|
|
92
|
70
|
100
|
100
|
|
|
67
|
return unless (@{$self->{entries}} < $self->{size} # Queue not filled |
|
70
|
|
|
|
|
324
|
|
93
|
|
|
|
|
|
|
or $score > $self->{entries}[0]{score}); # Found a better entry |
94
|
|
|
|
|
|
|
|
95
|
55
|
|
|
|
|
63
|
my $i; |
96
|
55
|
100
|
|
|
|
60
|
if (!@{$self->{entries}}) { |
|
55
|
100
|
|
|
|
166
|
|
97
|
17
|
|
|
|
|
19
|
$i = 0; |
98
|
|
|
|
|
|
|
} elsif ($score > $self->{entries}[-1]{score}) { |
99
|
18
|
|
|
|
|
22
|
$i = @{$self->{entries}}; |
|
18
|
|
|
|
|
34
|
|
100
|
|
|
|
|
|
|
} else { |
101
|
20
|
|
|
|
|
26
|
for ($i = 0; $i < @{$self->{entries}}; $i++) { |
|
32
|
|
|
|
|
73
|
|
102
|
29
|
100
|
|
|
|
78
|
last if $score < $self->{entries}[$i]{score}; |
103
|
|
|
|
|
|
|
} |
104
|
|
|
|
|
|
|
} |
105
|
55
|
|
|
|
|
93
|
splice @{$self->{entries}}, $i, 0, { thing => $thing, score => $score}; |
|
55
|
|
|
|
|
187
|
|
106
|
55
|
100
|
|
|
|
66
|
shift @{$self->{entries}} if @{$self->{entries}} > $self->{size}; |
|
20
|
|
|
|
|
71
|
|
|
55
|
|
|
|
|
178
|
|
107
|
|
|
|
|
|
|
} |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
sub entries { |
110
|
17
|
|
|
17
|
|
45
|
return shift->{entries}; |
111
|
|
|
|
|
|
|
} |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
1; |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
__END__ |