line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package AI::Classifier::Text::FileLearner; |
2
|
|
|
|
|
|
|
{ |
3
|
|
|
|
|
|
|
$AI::Classifier::Text::FileLearner::VERSION = '0.03'; |
4
|
|
|
|
|
|
|
} |
5
|
1
|
|
|
1
|
|
25422
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
36
|
|
6
|
1
|
|
|
1
|
|
5
|
use warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
44
|
|
7
|
1
|
|
|
1
|
|
21
|
use 5.010; |
|
1
|
|
|
|
|
7
|
|
|
1
|
|
|
|
|
37
|
|
8
|
|
|
|
|
|
|
|
9
|
1
|
|
|
1
|
|
1510
|
use Moose; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
use File::Find::Rule; |
11
|
|
|
|
|
|
|
use File::Spec; |
12
|
|
|
|
|
|
|
use List::Util 'max'; |
13
|
|
|
|
|
|
|
use Carp 'croak'; |
14
|
|
|
|
|
|
|
use AI::NaiveBayes::Learner; |
15
|
|
|
|
|
|
|
use AI::Classifier::Text; |
16
|
|
|
|
|
|
|
use AI::Classifier::Text::Analyzer; |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
# Optional term-weighting scheme consulted by weight_terms(): 'n' and 'b'
# are recognised, an unset value means "no weighting", and any other value
# makes weight_terms() croak.
has term_weighting => (
    is  => 'ro',
    isa => 'Str',
);

# Object whose analyze() method turns file content into a feature hash.
has analyzer => (
    is      => 'ro',
    default => sub { AI::Classifier::Text::Analyzer->new() },
);

# Learner that receives one example per training file and later hands out
# the trained classifier.
has learner => (
    is      => 'ro',
    default => sub { AI::NaiveBayes::Learner->new() },
);

# Root directory of the training set (required). get_category() uses the
# first path component below it as the category name.
has training_dir => (
    is       => 'ro',
    isa      => 'Str',
    required => 1,
);

# File iterator over training_dir; built on first use by _build_iterator().
has iterator => (
    is         => 'ro',
    lazy_build => 1,
);
23
|
|
|
|
|
|
|
# Builder for the lazy "iterator" attribute.
#
# Creates a File::Find::Rule restricted to plain files whose names do not
# match "*.data", starts it on training_dir and returns the rule object so
# that matches can be requested from it afterwards.
sub _build_iterator {
    my ($self) = @_;

    my $finder = File::Find::Rule->new->file->not_name('*.data');
    $finder->start( $self->training_dir );

    return $finder;
}
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
# Derive the category of a training file from its location.
#
# $file is made relative to training_dir; the first path component of that
# relative path is returned as the category name.
sub get_category {
    my ( $self, $file ) = @_;

    my $relative = File::Spec->abs2rel( $file, $self->training_dir );
    my ($category) = File::Spec->splitdir($relative);

    return $category;
}
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
# Return the next training example, or nothing once the iterator is
# exhausted.
#
# The next file is requested from the iterator, its category is obtained
# from get_category(), and its content is read as UTF-8 and passed to the
# analyzer. If a companion "<file>.data" file exists, it is evaluated as
# Perl code; when it yields a hash reference with a defined
# {initial_features} entry, that entry seeds the analysis, otherwise an
# empty hash reference is used.
#
# Returns a hash reference with the keys "file", "features" and
# "categories" (an array reference holding the single category).
#
# Croaks when the training file cannot be opened or when evaluating the
# companion data file raises an error.
sub next {
    my $self = shift;

    my $file = $self->iterator->match;
    return if !defined $file;

    my $category = $self->get_category($file);

    open( my $fh, '<:encoding(UTF-8)', $file )
        or croak("Unable to read the specified training file: $file ($!)\n");
    my $content = do { local $/; <$fh> };
    $content = '' if !defined $content;
    close $fh;

    my $initial_features = {};
    my $data_file        = "$file.data";
    if ( -f $data_file ) {

        # NOTE(review): the companion file is executed as Perl code, so the
        # training directory must only contain trusted files.
        #
        # "do FILE" searches @INC for relative paths that do not start with
        # "./" or "../", and "." is no longer part of @INC as of Perl 5.26,
        # so the file is addressed by its absolute path.
        my $absolute = File::Spec->rel2abs($data_file);

        my ( $data, $error );
        {
            local $@;
            $data  = do $absolute;
            $error = $@;
        }
        croak("Unable to evaluate the training data file: $data_file: $error")
            if !defined $data && $error;

        # Keep the empty default unless the file really supplies features.
        if ( ref $data eq 'HASH' && defined $data->{initial_features} ) {
            $initial_features = $data->{initial_features};
        }
    }

    my $features = $self->analyzer->analyze( $content, $initial_features );

    return {
        file       => $file,
        features   => $features,
        categories => [$category],
    };
}
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
# Feed every training example to the learner.
#
# Each example returned by next() has its feature vector normalized, is
# passed through weight_terms(), and is then added to the learner with its
# features as attributes and its categories as labels.
sub teach_it {
    my ($self) = @_;

    my $learner = $self->learner;
    while ( my $example = $self->next ) {
        my $features = $example->{features};
        normalize($features);
        $self->weight_terms($example);

        $learner->add_example(
            attributes => $example->{features},
            labels     => $example->{categories},
        );
    }
}
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
# Train on the whole training directory and return the resulting text
# classifier, which wraps the learner's classifier together with this
# object's analyzer.
sub classifier {
    my ($self) = @_;

    $self->teach_it;

    my %args = (
        classifier => $self->learner->classifier,
        analyzer   => $self->analyzer,
    );

    return AI::Classifier::Text->new(%args);
}
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
# Apply the configured term weighting to a document's features, in place.
#
# $doc is a hash reference whose {features} entry maps terms to numeric
# values. The scheme is taken from the term_weighting attribute:
#
#   undef  - features are left untouched
#   'n'    - each value v becomes 0.5 + 0.5 * v / max, where max is the
#            largest value in the document; a document with no features or
#            whose largest value is 0 is left untouched (no division by zero)
#   'b'    - each value becomes 1 if it is true, 0 otherwise
#
# Croaks with "Unknown weighting type: ..." for any other scheme.
# The return value carries no meaning.
sub weight_terms {
    my ( $self, $doc ) = @_;

    my $scheme = $self->term_weighting;
    return if !defined $scheme;

    my $f = $doc->{features};

    if ( $scheme eq 'n' ) {
        my $max_tf = max values %$f;

        # Nothing to scale against: empty feature set or all-zero values.
        return if !$max_tf;

        $_ = 0.5 + 0.5 * $_ / $max_tf for values %$f;
    }
    elsif ( $scheme eq 'b' ) {
        $_ = $_ ? 1 : 0 for values %$f;
    }
    else {
        croak 'Unknown weighting type: ' . $scheme;
    }

    return;
}
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
# Collection-wide (inverse document frequency style) weighting.
#
# This does not quite fit the current model, because it needs the entire
# collection of documents in memory at once, but it may be useful to
# someone, someday, so it is left here.
#
# Arguments: a list of documents (hash references with an {attributes}
# hash mapping terms to numeric values), optionally followed by a plain
# numeric $subtrahend (default 0). Documents are always references, so a
# trailing non-reference argument is taken to be the subtrahend.
#
# Every attribute value is multiplied, in place, by
#   log( number_of_documents / documents_containing_term - $subtrahend )
#
# NOTE(review): with a subtrahend >= 1 a term that occurs in every document
# makes the argument of log() non-positive, which is a fatal error in Perl.
sub collection_weighting {
    my @documents = @_;

    # A list assignment to "(@documents, $subtrahend)" would let the array
    # swallow every argument, so the optional trailing scalar is split off
    # explicitly.
    my $subtrahend = 0;
    if ( @documents && !ref $documents[-1] ) {
        $subtrahend = pop(@documents) // 0;
    }

    my $num_docs = @documents;

    # Number of documents each term occurs in.
    my %frequency;
    for my $doc (@documents) {
        $frequency{$_}++ for keys %{ $doc->{attributes} };
    }

    for my $doc (@documents) {
        my $f = $doc->{attributes};
        for my $term ( keys %$f ) {

            # Every term seen here was counted above, so the divisor is
            # at least 1.
            $f->{$term} *= log( $num_docs / $frequency{$term} - $subtrahend );
        }
    }

    return;
}
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
# Return the Euclidean (L2) length of a feature vector: the square root of
# the sum of the squared values of the given hash reference.
sub euclidean_length {
    my ($vector) = @_;

    my $sum_of_squares = 0;
    $sum_of_squares += $_**2 for values %$vector;

    return sqrt $sum_of_squares;
}
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
# Multiply every value of the given hash reference by $scalar, in place,
# and return the same hash reference.
sub scale {
    my ( $vector, $factor ) = @_;

    for my $value ( values %$vector ) {
        $value *= $factor;
    }

    return $vector;
}
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
# Scale a feature vector to unit Euclidean length, in place, and return it.
# A vector of length 0 is returned unchanged.
sub normalize {
    my ($attrs) = @_;

    my $length = euclidean_length($attrs);
    return $attrs if !$length;

    return scale( $attrs, 1 / $length );
}
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
1; |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
=pod |
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
=head1 NAME |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
AI::Classifier::Text::FileLearner - Training data reader for AI::NaiveBayes |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
=head1 VERSION |
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
version 0.03 |
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
=head1 SYNOPSIS |
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
use AI::Classifier::Text::FileLearner; |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
my $learner = AI::Classifier::Text::FileLearner->new( training_dir => 't/data/training_set_ordered/' ); |
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
my $classifier = $learner->classifier; |
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
=head1 DESCRIPTION |
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
This is a trainer of text classifiers. It traverses a training directory, |
183
|
|
|
|
|
|
|
interprets the subdirectories in it as category names, reads all files in them and adds them |
184
|
|
|
|
|
|
|
as examples for the classifier being trained. |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
=head1 METHODS |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
=over 4 |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
=item next |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
Internal method for traversing the training data directory. |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
=item classifier |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
Returns a trained classifier. |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
=back |
199
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
=head1 AUTHOR |
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
Zbigniew Lukasiak <zlukasiak@opera.com>, Tadeusz Sośnierz <tsosnierz@opera.com> |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
This software is copyright (c) 2012 by Opera Software ASA. |
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
This is free software; you can redistribute it and/or modify it under |
209
|
|
|
|
|
|
|
the same terms as the Perl 5 programming language system itself. |
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
=cut |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
__END__ |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
# ABSTRACT: Training data reader for AI::NaiveBayes |
216
|
|
|
|
|
|
|
|