line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package AI::Categorizer::Learner; |
2
|
|
|
|
|
|
|
|
3
|
11
|
|
|
11
|
|
52
|
use strict; |
|
11
|
|
|
|
|
22
|
|
|
11
|
|
|
|
|
344
|
|
4
|
11
|
|
|
11
|
|
49
|
use Class::Container; |
|
11
|
|
|
|
|
19
|
|
|
11
|
|
|
|
|
173
|
|
5
|
11
|
|
|
11
|
|
4950
|
use AI::Categorizer::Storable; |
|
11
|
|
|
|
|
20
|
|
|
11
|
|
|
|
|
298
|
|
6
|
11
|
|
|
11
|
|
52
|
use base qw(Class::Container AI::Categorizer::Storable); |
|
11
|
|
|
|
|
27
|
|
|
11
|
|
|
|
|
1133
|
|
7
|
|
|
|
|
|
|
|
8
|
11
|
|
|
11
|
|
54
|
use Params::Validate qw(:types); |
|
11
|
|
|
|
|
24
|
|
|
11
|
|
|
|
|
1557
|
|
9
|
11
|
|
|
11
|
|
5609
|
use AI::Categorizer::ObjectSet; |
|
11
|
|
|
|
|
24
|
|
|
11
|
|
|
|
|
8030
|
|
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
__PACKAGE__->valid_params |
12
|
|
|
|
|
|
|
( |
13
|
|
|
|
|
|
|
knowledge_set => { isa => 'AI::Categorizer::KnowledgeSet', optional => 1 }, |
14
|
|
|
|
|
|
|
verbose => {type => SCALAR, default => 0}, |
15
|
|
|
|
|
|
|
); |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
__PACKAGE__->contained_objects |
18
|
|
|
|
|
|
|
( |
19
|
|
|
|
|
|
|
hypothesis => { |
20
|
|
|
|
|
|
|
class => 'AI::Categorizer::Hypothesis', |
21
|
|
|
|
|
|
|
delayed => 1, |
22
|
|
|
|
|
|
|
}, |
23
|
|
|
|
|
|
|
experiment => { |
24
|
|
|
|
|
|
|
class => 'AI::Categorizer::Experiment', |
25
|
|
|
|
|
|
|
delayed => 1, |
26
|
|
|
|
|
|
|
}, |
27
|
|
|
|
|
|
|
); |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
# Subclasses must override these virtual methods: |
30
|
|
|
|
|
|
|
sub get_scores; |
31
|
|
|
|
|
|
|
sub create_model; |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
# Optional virtual method for on-line learning: |
34
|
|
|
|
|
|
|
sub add_knowledge; |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
sub verbose { |
37
|
143
|
|
|
143
|
1
|
176
|
my $self = shift; |
38
|
143
|
50
|
|
|
|
303
|
if (@_) { |
39
|
0
|
|
|
|
|
0
|
$self->{verbose} = shift; |
40
|
|
|
|
|
|
|
} |
41
|
143
|
|
|
|
|
772
|
return $self->{verbose}; |
42
|
|
|
|
|
|
|
} |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
sub knowledge_set { |
45
|
70
|
|
|
70
|
1
|
80
|
my $self = shift; |
46
|
70
|
50
|
|
|
|
219
|
if (@_) { |
47
|
0
|
|
|
|
|
0
|
$self->{knowledge_set} = shift; |
48
|
|
|
|
|
|
|
} |
49
|
70
|
|
|
|
|
269
|
return $self->{knowledge_set}; |
50
|
|
|
|
|
|
|
} |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
sub categories { |
53
|
21
|
|
|
21
|
0
|
29
|
my $self = shift; |
54
|
21
|
|
|
|
|
42
|
return $self->knowledge_set->categories; |
55
|
|
|
|
|
|
|
} |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
sub train { |
58
|
7
|
|
|
7
|
1
|
2316
|
my ($self, %args) = @_; |
59
|
7
|
100
|
|
|
|
32
|
$self->{knowledge_set} = $args{knowledge_set} if $args{knowledge_set}; |
60
|
7
|
50
|
|
|
|
31
|
die "No knowledge_set provided" unless $self->{knowledge_set}; |
61
|
|
|
|
|
|
|
|
62
|
7
|
|
|
|
|
42
|
$self->{knowledge_set}->finish; |
63
|
7
|
|
|
|
|
125
|
$self->create_model; # Creates $self->{model} |
64
|
7
|
|
|
|
|
314
|
$self->delayed_object_params('hypothesis', |
65
|
|
|
|
|
|
|
all_categories => [map $_->name, $self->categories], |
66
|
|
|
|
|
|
|
); |
67
|
|
|
|
|
|
|
} |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
sub prog_bar { |
70
|
0
|
|
|
0
|
0
|
0
|
my ($self, $count) = @_; |
71
|
|
|
|
|
|
|
|
72
|
0
|
0
|
|
0
|
|
0
|
return sub { print STDERR '.' } unless eval "use Time::Progress; 1"; |
|
0
|
|
|
|
|
0
|
|
73
|
|
|
|
|
|
|
|
74
|
0
|
|
|
|
|
0
|
my $pb = 'Time::Progress'->new; |
75
|
0
|
|
|
|
|
0
|
$pb->attr(max => $count); |
76
|
0
|
|
|
|
|
0
|
my $i = 0; |
77
|
|
|
|
|
|
|
return sub { |
78
|
0
|
|
|
0
|
|
0
|
$i++; |
79
|
0
|
0
|
|
|
|
0
|
return if $i % 25; |
80
|
0
|
|
|
|
|
0
|
my $string = ''; |
81
|
0
|
0
|
|
|
|
0
|
if (@_) { |
82
|
0
|
|
|
|
|
0
|
my $e = shift; |
83
|
0
|
|
|
|
|
0
|
$string = sprintf " (maF1=%.03f, miF1=%.03f)", $e->macro_F1, $e->micro_F1; |
84
|
|
|
|
|
|
|
} |
85
|
0
|
|
|
|
|
0
|
print STDERR $pb->report("%50b %p ($i/$count)$string\r", $i); |
86
|
0
|
|
|
|
|
0
|
return $i; |
87
|
0
|
|
|
|
|
0
|
}; |
88
|
|
|
|
|
|
|
} |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
sub categorize_collection { |
91
|
5
|
|
|
5
|
1
|
533
|
my ($self, %args) = @_; |
92
|
5
|
50
|
|
|
|
26
|
my $c = $args{collection} or die "No collection provided"; |
93
|
|
|
|
|
|
|
|
94
|
5
|
|
|
|
|
25
|
my @all_cats = map $_->name, $self->categories; |
95
|
5
|
|
|
|
|
30
|
my $experiment = $self->create_delayed_object('experiment', categories => \@all_cats); |
96
|
5
|
50
|
|
20
|
|
21
|
my $pb = $self->verbose ? $self->prog_bar($c->count_documents) : sub {}; |
|
20
|
|
|
|
|
28
|
|
97
|
5
|
|
|
|
|
57
|
while (my $d = $c->next) { |
98
|
20
|
|
|
|
|
61
|
my $h = $self->categorize($d); |
99
|
20
|
|
|
|
|
2487
|
$experiment->add_hypothesis($h, [map $_->name, $d->categories]); |
100
|
20
|
|
|
|
|
929
|
$pb->($experiment); |
101
|
20
|
50
|
|
|
|
46
|
if ($self->verbose > 1) { |
102
|
0
|
|
|
|
|
0
|
printf STDERR ("%s: assigned=(%s) correct=(%s)\n", |
103
|
|
|
|
|
|
|
$d->name, |
104
|
|
|
|
|
|
|
join(', ', $h->categories), |
105
|
|
|
|
|
|
|
join(', ', map $_->name, $d->categories)); |
106
|
|
|
|
|
|
|
} |
107
|
|
|
|
|
|
|
} |
108
|
5
|
50
|
|
|
|
28
|
print STDERR "\n" if $self->verbose; |
109
|
|
|
|
|
|
|
|
110
|
5
|
|
|
|
|
38
|
return $experiment; |
111
|
|
|
|
|
|
|
} |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
sub categorize { |
114
|
41
|
|
|
41
|
1
|
254
|
my ($self, $doc) = @_; |
115
|
|
|
|
|
|
|
|
116
|
41
|
|
|
|
|
152
|
my ($scores, $threshold) = $self->get_scores($doc); |
117
|
|
|
|
|
|
|
|
118
|
41
|
50
|
|
|
|
988
|
if ($self->verbose > 2) { |
119
|
0
|
0
|
|
|
|
0
|
warn "scores: @{[ %$scores ]}" if $self->verbose > 3; |
|
0
|
|
|
|
|
0
|
|
120
|
|
|
|
|
|
|
|
121
|
0
|
|
|
|
|
0
|
foreach my $key (sort {$scores->{$b} <=> $scores->{$a}} keys %$scores) { |
|
0
|
|
|
|
|
0
|
|
122
|
0
|
|
|
|
|
0
|
print "$key: $scores->{$key}\n"; |
123
|
|
|
|
|
|
|
} |
124
|
|
|
|
|
|
|
} |
125
|
|
|
|
|
|
|
|
126
|
41
|
|
|
|
|
148
|
return $self->create_delayed_object('hypothesis', |
127
|
|
|
|
|
|
|
scores => $scores, |
128
|
|
|
|
|
|
|
threshold => $threshold, |
129
|
|
|
|
|
|
|
document_name => $doc->name, |
130
|
|
|
|
|
|
|
); |
131
|
|
|
|
|
|
|
} |
132
|
|
|
|
|
|
|
1; |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
__END__ |