File Coverage

blib/lib/AI/Categorizer/Learner/KNN.pm

Criterion	Covered	Total	%
statement	59	66	89.3
branch	15	20	75.0
condition	3	3	100.0
subroutine	10	11	90.9
pod	2	4	50.0
total	89	104	85.5

line	stmt	bran	cond	sub	pod	time	code
1							package AI::Categorizer::Learner::KNN;
2
3	1			1		760	use strict;
	1					2
	1					26
4	1			1		5	use AI::Categorizer::Learner;
	1					2
	1					21
5	1			1		4	use base qw(AI::Categorizer::Learner);
	1					1
	1					78
6	1			1		6	use Params::Validate qw(:types);
	1					3
	1					953
7
8							__PACKAGE__->valid_params
9							(
10							threshold => {type => SCALAR, default => 0.4},
11							k_value => {type => SCALAR, default => 20},
12							knn_weighting => {type => SCALAR, default => 'score'},
13							max_instances => {type => SCALAR, default => 0},
14							);
15
16							sub create_model {
17	2			2	0	5	my $self = shift;
18	2					14	foreach my $doc ($self->knowledge_set->documents) {
19	8					21	$doc->features->normalize;
20							}
21	2					6	$self->knowledge_set->features; # Initialize
22							}
23
24							sub threshold {
25	0			0	1	0	my $self = shift;
26	0	0				0	$self->{threshold} = shift if @_;
27	0					0	return $self->{threshold};
28							}
29
30							sub categorize_collection {
31	2			2	1	427	my $self = shift;
32
33	2					10	my $f_class = $self->knowledge_set->contained_class('features');
34	2	50				34	if ($f_class->can('all_features')) {
35	0					0	$f_class->all_features([$self->knowledge_set->features->names]);
36							}
37	2					17	$self->SUPER::categorize_collection(@_);
38							}
39
40							sub get_scores {
41	16			16	0	22	my ($self, $newdoc) = @_;
42	16					44	my $currentDocName = $newdoc->name;
43							#print "classifying $currentDocName\n";
44
45	16					49	my $features = $newdoc->features->intersection($self->knowledge_set->features)->normalize;
46	16					79	my $q = AI::Categorizer::Learner::KNN::Queue->new(size => $self->{k_value});
47
48	16					25	my @docset;
49	16	50				30	if ($self->{max_instances}) {
50							# Use (approximately) max_instances documents, chosen randomly from corpus
51	0					0	my $probability = $self->{max_instances} / $self->knowledge_set->documents;
52	0					0	@docset = grep {rand() < $probability} $self->knowledge_set->documents;
	0					0
53							} else {
54							# Use the whole corpus
55	16					45	@docset = $self->knowledge_set->documents;
56							}
57
58	16					31	foreach my $doc (@docset) {
59	64					166	my $score = $doc->features->dot( $features );
60	64	50				173	warn "Score for ", $doc->name, " (", ($doc->categories)[0]->name, "): $score" if $self->verbose > 1;
61	64					146	$q->add($doc, $score);
62							}
63
64	16					50	my %scores = map {+$_->name, 0} $self->categories;
	32					86
65	16					28	foreach my $e (@{$q->entries}) {
	16					32
66	32					92	foreach my $cat ($e->{thing}->categories) {
67	32	100				82	$scores{$cat->name} += ($self->{knn_weighting} eq 'score' ? $e->{score} : 1); #increment cat score
68							}
69							}
70
71	16					79	$_ /= $self->{k_value} foreach values %scores;
72
73	16					129	return (\%scores, $self->{threshold});
74							}
75
76							###################################################################
77							package AI::Categorizer::Learner::KNN::Queue;
78
79							sub new {
80	17			17		458	my ($pkg, %args) = @_;
81	17					100	return bless {
82							size => $args{size},
83							entries => [],
84							}, $pkg;
85							}
86
87							sub add {
88	70			70		122	my ($self, $thing, $score) = @_;
89
90							# scores may be (0.2, 0.4, 0.4, 0.8) - ascending
91
92	70	100	100			67	return unless (@{$self->{entries}} < $self->{size} # Queue not filled
	70					324
93							or $score > $self->{entries}[0]{score}); # Found a better entry
94
95	55					63	my $i;
96	55	100				60	if (!@{$self->{entries}}) {
	55	100				166
97	17					19	$i = 0;
98							} elsif ($score > $self->{entries}[-1]{score}) {
99	18					22	$i = @{$self->{entries}};
	18					34
100							} else {
101	20					26	for ($i = 0; $i < @{$self->{entries}}; $i++) {
	32					73
102	29	100				78	last if $score < $self->{entries}[$i]{score};
103							}
104							}
105	55					93	splice @{$self->{entries}}, $i, 0, { thing => $thing, score => $score};
	55					187
106	55	100				66	shift @{$self->{entries}} if @{$self->{entries}} > $self->{size};
	20					71
	55					178
107							}
108
109							sub entries {
110	17			17		45	return shift->{entries};
111							}
112
113							1;
114
115							__END__