line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package AI::Categorizer::Document; |
2
|
|
|
|
|
|
|
|
3
|
11
|
|
|
11
|
|
48
|
use strict; |
|
11
|
|
|
|
|
33
|
|
|
11
|
|
|
|
|
301
|
|
4
|
11
|
|
|
11
|
|
48
|
use Class::Container; |
|
11
|
|
|
|
|
18
|
|
|
11
|
|
|
|
|
243
|
|
5
|
11
|
|
|
11
|
|
47
|
use base qw(Class::Container); |
|
11
|
|
|
|
|
15
|
|
|
11
|
|
|
|
|
677
|
|
6
|
|
|
|
|
|
|
|
7
|
11
|
|
|
11
|
|
48
|
use Params::Validate qw(:types); |
|
11
|
|
|
|
|
18
|
|
|
11
|
|
|
|
|
1493
|
|
8
|
11
|
|
|
11
|
|
56
|
use AI::Categorizer::ObjectSet; |
|
11
|
|
|
|
|
19
|
|
|
11
|
|
|
|
|
185
|
|
9
|
11
|
|
|
11
|
|
12959
|
use AI::Categorizer::FeatureVector; |
|
11
|
|
|
|
|
24
|
|
|
11
|
|
|
|
|
20555
|
|
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
# Constructor parameter specification, registered through the
# valid_params() class method inherited from Class::Container.  The
# type constants (SCALAR, ARRAYREF, ...) come from Params::Validate.
__PACKAGE__->valid_params(
    # Document name; new() supplies a generated default.
    name => { type => SCALAR },

    # Category objects this document belongs to.  Every element must
    # be (or inherit from) AI::Categorizer::Category.
    categories => {
        type      => ARRAYREF,
        default   => [],
        callbacks => {
            'all are Category objects' => sub {
                my ($list) = @_;
                return !grep { !UNIVERSAL::isa($_, 'AI::Categorizer::Category') } @$list;
            },
        },
        public => 0,
    },

    # Stopwords, as a list or as a lookup hash; new() normalizes
    # the list form into a hash.
    stopwords => { type => ARRAYREF | HASHREF, default => {} },

    # Raw content: a plain string, or a hash of section name => text.
    content => { type => HASHREF | SCALAR, default => undef },

    # Alternative content sources handled by new(): a string passed
    # to parse(), or a handle passed to parse_handle().
    parse        => { type => SCALAR, optional => 1 },
    parse_handle => { type => HANDLE, optional => 1 },

    # A ready-made feature vector.
    features => { isa => 'AI::Categorizer::FeatureVector', optional => 1 },

    # Per-section weight multipliers, keyed like 'content'.
    content_weights => { type => HASHREF, default => {} },

    # Position bias used by _weigh_tokens(); 0 disables it.
    front_bias => { type => SCALAR, default => 0 },

    # When set, _filter_tokens() keeps only these features.
    use_features => { type => HASHREF | UNDEF, default => undef },

    # Stemming mode; stem_words() understands 'porter' and 'none'.
    stemming => { type => SCALAR | UNDEF, optional => 1 },

    # One of 'stem', 'no_stem' or 'pre_stemmed'.
    stopword_behavior => { type => SCALAR, default => "stem" },
);
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
# Register the 'features' contained object with Class::Container.
# It is declared as delayed; create_feature_vector() instantiates it
# on demand via create_delayed_object('features').
__PACKAGE__->contained_objects(
    features => {
        class   => 'AI::Categorizer::FeatureVector',
        delayed => 1,
    },
);
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
### Constructors |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
# Source of default document names.  It is string-incremented once per
# call to new() ('a', 'b', ..., 'z', 'aa', ...), whether or not the
# caller supplies a name of their own.
my $NAME = 'a';

# new(%params)
#
# Builds a document.  Parameters are validated by the parent
# constructor against the valid_params() specification.  After
# construction:
#   * 'categories' is wrapped in an AI::Categorizer::ObjectSet;
#   * 'stopwords' is normalized by _fix_stopwords();
#   * content is taken from the first of 'parse', 'parse_handle' or
#     'content' that is present;
#   * finish() is called if content is available by then.
# Returns the new object.
sub new {
    my $pkg = shift;

    # The generated name goes first so that a 'name' passed in @_
    # can take precedence (presumably later keys win in the parent
    # constructor -- confirm against Class::Container).
    my $self = $pkg->SUPER::new(name => $NAME++, @_);

    # Get efficient internal data structures.  An explicit arrow call
    # is used: indirect object syntax is ambiguous inside a package
    # that defines its own new().
    $self->{categories} =
        AI::Categorizer::ObjectSet->new( @{ $self->{categories} } );

    $self->_fix_stopwords;

    # A few different ways for the caller to initialize the content
    if (exists $self->{parse}) {
        $self->parse(content => delete $self->{parse});
    }
    elsif (exists $self->{parse_handle}) {
        $self->parse_handle(handle => delete $self->{parse_handle});
    }
    elsif (defined $self->{content}) {
        # Allow a simple string as the content
        $self->{content} = { body => $self->{content} }
            unless ref $self->{content};
    }

    $self->finish if $self->{content};
    return $self;
}
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
# _fix_stopwords()
#
# Normalizes $self->{stopwords}: an array is turned into a lookup
# hash, and -- when stopword_behavior is 'stem' and a stemming mode
# other than 'none' is configured -- the stopwords are run through
# stem_words().  A '___stemmed' marker key is stored in the hash so
# the work is not repeated for a hash that was already processed.
sub _fix_stopwords {
    my ($self) = @_;

    # Convert an array of words into a lookup hash
    if (UNIVERSAL::isa($self->{stopwords}, 'ARRAY')) {
        my %lookup;
        $lookup{$_} = 1 for @{ $self->{stopwords} };
        $self->{stopwords} = \%lookup;
    }

    my $stop = $self->{stopwords};

    # Bail out in every case where stemming the stopwords is pointless
    return unless keys %$stop;                        # nothing to stem
    return if $self->{stopword_behavior} ne 'stem';
    my $mode = $self->{stemming};
    return unless defined $mode and $mode ne 'none';
    return if $stop->{___stemmed};                    # already done

    # Re-key the same hash (not a copy) with the stemmed words
    my @words = keys %$stop;
    %$stop = ();
    $self->stem_words(\@words);
    @{$stop}{@words} = (1) x @words;

    # The flag lives on the stopword structure itself so that other
    # documents using the same hash will notice it.
    $stop->{___stemmed} = 1;
}
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
# finish()
#
# Builds the feature vector via create_feature_vector(), then discards
# the raw inputs ('content', 'content_weights', 'stopwords',
# 'use_features') from the object.
sub finish {
    my ($self) = @_;

    $self->create_feature_vector;

    # Now we're done with all the content stuff
    return delete @$self{qw(content content_weights stopwords use_features)};
}
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
# Parse a document format - a virtual method |
137
|
|
|
|
|
|
|
sub parse; |
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
# parse_handle(handle => $fh)
#
# Reads everything remaining on $fh and passes it, as one string, to
# parse(content => ...).  Returns whatever parse() returns.  Dies if
# no 'handle' argument is supplied.
sub parse_handle {
    my $self = shift;
    my %args = @_;

    my $fh = $args{handle};
    die "No 'handle' argument given to parse_handle()" unless $fh;

    my $text = join '', <$fh>;
    return $self->parse(content => $text);
}
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
### Accessors |
146
|
|
|
|
|
|
|
|
147
|
157
|
|
|
157
|
1
|
857
|
# name() -- read-only accessor for the document's name.
sub name {
    my ($self) = @_;
    return $self->{name};
}
148
|
3
|
|
|
3
|
1
|
39
|
# stopword_behavior() -- read-only accessor for the 'stopword_behavior' setting.
sub stopword_behavior {
    my ($self) = @_;
    return $self->{stopword_behavior};
}
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
# features([$new])
#
# Combined getter/setter: stores the first argument as the feature
# vector when one is supplied, and returns the current value either way.
sub features {
    my ($self, @new) = @_;
    $self->{features} = $new[0] if @new;
    return $self->{features};
}
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
# categories()
#
# In list context returns the members of the category set; otherwise
# returns the set's size.
sub categories {
    my ($self) = @_;
    my $set = $self->{categories};

    return $set->members if wantarray;
    return $set->size;
}
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
### Workers |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
# create_feature_vector()
#
# Builds $self->{features} from $self->{content}.  Each content
# section is tokenized, stemmed and filtered, then weighted by its
# entry in $self->{content_weights} (default 1) and added to the
# feature vector.
#
# Filtering happens before stemming when stopword_behavior is
# 'no_stem', and after stemming for 'stem' and 'pre_stemmed'.
#
# Dies if stopword_behavior is not exactly one of the three values.
# Returns nothing.
sub create_feature_vector {
    my ($self) = @_;
    my $content  = $self->{content};
    my $weights  = $self->{content_weights};
    my $behavior = $self->{stopword_behavior};

    # The alternation must be grouped and anchored as a whole;
    # /^stem|no_stem|pre_stemmed$/ would also accept e.g. 'stemmed'.
    die "'stopword_behavior' must be one of 'stem', 'no_stem', or 'pre_stemmed'"
        unless $behavior =~ /\A(?:stem|no_stem|pre_stemmed)\z/;

    my $filter_first = $behavior eq 'no_stem';

    $self->{features} = $self->create_delayed_object('features');
    while (my ($name, $data) = each %$content) {
        my $t = $self->tokenize($data);
        $t = $self->_filter_tokens($t) if $filter_first;
        $self->stem_words($t);
        $t = $self->_filter_tokens($t) unless $filter_first;

        my $weight = exists $weights->{$name} ? $weights->{$name} : 1;
        my $h = $self->vectorize(tokens => $t, weight => $weight);
        $self->{features}->add($h);
    }
    return;
}
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
# is_in_category($category)
#
# Membership test against the document's category set.  A reference
# argument is looked up with includes(); anything else is treated as
# a category name and looked up with includes_name().
sub is_in_category {
    my ($self, $category) = @_;
    my $set = $self->{categories};

    return $set->includes($category) if ref $category;
    return $set->includes_name($category);
}
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
# tokenize($text)
#
# Splits $text into lower-cased tokens.  A candidate token is a run of
# word characters and hyphens; candidates with no a-z letter are
# dropped, and leading non-letters are trimmed from the rest.
# Returns an array reference.
sub tokenize {
    my $self = shift;

    # $_[0] is matched in place so the text is not copied.
    my @tokens;
    for my $chunk ($_[0] =~ /([-\w]+)/g) {
        my $word = lc $chunk;
        next if $word !~ /[a-z]/;
        $word =~ s/^[^a-z]+//;    # Trim leading non-alpha characters (helps with ordinals)
        push @tokens, $word;
    }
    return \@tokens;
}
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
# stem_words(\@tokens)
#
# Stems @tokens in place according to $self->{stemming}.  Does nothing
# when stemming is false or 'none'.  The only other accepted value is
# 'porter', which needs Lingua::Stem; any other value, or a missing
# Lingua::Stem, is fatal.
sub stem_words {
    my $self   = shift;
    my $tokens = shift;

    my $mode = $self->{stemming};
    return if !$mode or $mode eq 'none';

    die "Unknown stemming option '$self->{stemming}' - options are 'porter' or 'none'"
        if $mode ne 'porter';

    # Load the stemmer lazily so it is only required when actually used
    unless (eval { require Lingua::Stem; 1 }) {
        die "Porter stemming requires the Lingua::Stem module, available from CPAN.\n";
    }

    @$tokens = @{ Lingua::Stem::stem(@$tokens) };
}
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
# _filter_tokens(\@tokens)
#
# Returns an array reference of the tokens that survive filtering:
#   * if 'use_features' is set, only tokens present in it are kept.
#     It may be an object providing as_hash() or a plain hash
#     reference of feature => value (the parameter is declared as
#     HASHREF, so both forms are accepted here);
#   * otherwise, if there are stopwords, tokens found in the
#     stopword hash are removed;
#   * otherwise the input reference itself is returned unchanged.
sub _filter_tokens {
    my ($self, $tokens_in) = @_;

    if (my $wanted = $self->{use_features}) {
        # UNIVERSAL::can() is false for unblessed references, so a
        # plain hash is used directly instead of dying on ->as_hash.
        my $f = UNIVERSAL::can($wanted, 'as_hash') ? $wanted->as_hash : $wanted;
        return [ grep { exists $f->{$_} } @$tokens_in ];
    }

    my $s = $self->{stopwords};
    if ($s and keys %$s) {
        return [ grep { !exists $s->{$_} } @$tokens_in ];
    }

    return $tokens_in;
}
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
# _weigh_tokens(\@tokens, $weight)
#
# Returns a hash reference mapping each distinct token to its
# accumulated weight.
#
# With front_bias == 0 every occurrence contributes $weight.
# Otherwise occurrence i of n contributes $mult * $r**(i/n), where
# $r = (bias-1)**2 / (bias+1).  $r < 1 for a positive bias, so earlier
# tokens weigh more; $r > 1 for a negative bias, so later ones do.
# $mult = $weight * ln($r)/($r-1) is the reciprocal of the integral of
# $r**x over [0,1], which keeps the average contribution near $weight.
#
# Dies unless -1 < front_bias < 1.
sub _weigh_tokens {
    my ($self, $tokens, $weight) = @_;

    my %counts;
    my $bias = 0 + $self->{front_bias};

    # Unbiased case: a plain weighted count
    unless ($bias) {
        $counts{$_} += $weight for @$tokens;
        return \%counts;
    }

    die "'front_bias' value must be between -1 and 1"
        unless -1 < $bias and $bias < 1;

    my $n    = @$tokens;
    my $r    = ($bias - 1)**2 / ($bias + 1);
    my $mult = $weight * log($r) / ($r - 1);

    for my $i (0 .. $#$tokens) {
        $counts{ $tokens->[$i] } += $mult * $r**($i / $n);
    }

    return \%counts;
}
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
# vectorize(tokens => \@tokens, weight => $w)
#
# Turns a token list into a hash reference of feature => weight by
# delegating to _weigh_tokens().
#
# If $self->{stem_stopwords} is true, the stopwords are first stemmed
# and the tokens filtered against them.  'stem_stopwords' is not among
# the parameters declared via valid_params(), so this branch only runs
# when the key is set directly on the object.  It uses stem_words(),
# which stems in place; the stem_tokens() method called here before is
# not defined in this file.
sub vectorize {
    my ($self, %args) = @_;

    if ($self->{stem_stopwords}) {
        my @stems = keys %{ $self->{stopwords} || {} };
        $self->stem_words(\@stems);
        $self->{stopwords} = { map { ($_ => 1) } @stems };
        $args{tokens} = $self->_filter_tokens($args{tokens});
    }

    return $self->_weigh_tokens($args{tokens}, $args{weight});
}
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
# read(path => $file, %constructor_args)
#
# Class method.  Constructs a document from the remaining arguments,
# feeds the contents of $file to parse_handle(), calls finish() and
# returns the new object.  Dies if 'path' is missing or the file
# cannot be opened.
sub read {
    my ($class, %args) = @_;
    my $path = delete $args{path} or die "Must specify 'path' argument to read()";

    my $self = $class->new(%args);

    # Three-argument open: the path is never interpreted as a mode or
    # pipe specification, and surrounding whitespace is preserved.
    open my $fh, '<', $path or die "$path: $!";
    $self->parse_handle(handle => $fh);
    close $fh;

    $self->finish;
    return $self;
}
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
# dump_features(path => $file)
#
# Writes the document's features to $file, one "name<TAB>value" line
# per feature, sorted by feature name so that the output is
# reproducible.  Dies if 'path' is missing, or if the file cannot be
# created or fully written.  Returns nothing.
sub dump_features {
    my ($self, %args) = @_;
    my $path = $args{path} or die "No 'path' argument given to dump_features()";

    # Three-argument open: the path is never interpreted as a mode or
    # pipe specification, and surrounding whitespace is preserved.
    open my $fh, '>', $path or die "Can't create $path: $!";

    my $f = $self->features->as_hash;
    for my $k (sort keys %$f) {
        print {$fh} "$k\t$f->{$k}\n";
    }

    # Buffered write errors only surface when the handle is closed.
    close $fh or die "Can't write $path: $!";
    return;
}
290
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
1; |
292
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
__END__ |