line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package AI::Categorizer::FeatureSelector; |
2
|
|
|
|
|
|
|
|
3
|
6
|
|
|
6
|
|
28
|
use strict; |
|
6
|
|
|
|
|
11
|
|
|
6
|
|
|
|
|
155
|
|
4
|
6
|
|
|
6
|
|
29
|
use Class::Container; |
|
6
|
|
|
|
|
8
|
|
|
6
|
|
|
|
|
125
|
|
5
|
6
|
|
|
6
|
|
29
|
use base qw(Class::Container); |
|
6
|
|
|
|
|
10
|
|
|
6
|
|
|
|
|
407
|
|
6
|
|
|
|
|
|
|
|
7
|
6
|
|
|
6
|
|
30
|
use Params::Validate qw(:types); |
|
6
|
|
|
|
|
10
|
|
|
6
|
|
|
|
|
1057
|
|
8
|
6
|
|
|
6
|
|
33
|
use AI::Categorizer::FeatureVector; |
|
6
|
|
|
|
|
10
|
|
|
6
|
|
|
|
|
113
|
|
9
|
6
|
|
|
6
|
|
26
|
use AI::Categorizer::Util; |
|
6
|
|
|
|
|
10
|
|
|
6
|
|
|
|
|
236
|
|
10
|
6
|
|
|
6
|
|
27
|
use Carp qw(croak); |
|
6
|
|
|
|
|
8
|
|
|
6
|
|
|
|
|
2665
|
|
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
# Constructor parameters declared through Class::Container's valid_params().
#   features_kept - scalar, defaults to 0.2; read by reduce_features().
#   verbose       - scalar, defaults to 0; exposed via the verbose() accessor.
__PACKAGE__->valid_params(
    features_kept => { type => SCALAR, default => 0.2 },
    verbose       => { type => SCALAR, default => 0 },
);
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
# Combined getter/setter for the 'verbose' setting.
#
# Called with an argument, stores that argument (even if it is undef) in
# $self->{verbose}. Always returns the value currently stored.
sub verbose {
    my ($self, @new_value) = @_;

    if (@new_value) {
        $self->{verbose} = $new_value[0];
    }

    return $self->{verbose};
}
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
# Trims a feature vector whose weights are "feature scores" down to its
# highest-scoring features.
#
# Arguments:
#   $f    - feature vector object; this method calls length(), as_hash()
#           and intersection() on it.
#   %args - optional 'features_kept' override. When it is undefined, the
#           object's own 'features_kept' setting is used instead.
#
# Meaning of 'features_kept':
#   false (0, '')  - keep everything; $f itself is returned unchanged.
#   less than 1    - treated as a fraction of the present number of features.
#   1 or greater   - treated as the number of features to keep.
#
# Returns whatever $f->intersection() returns for the selected feature
# names. Progress messages are printed when $self->verbose is true.
sub reduce_features {
    my ($self, $f, %args) = @_;

    my $kept = defined $args{features_kept}
        ? $args{features_kept}
        : $self->{features_kept};
    return $f unless $kept;

    my $num_kept = ($kept < 1 ? $f->length * $kept : $kept);

    print "Trimming features - # features = " . $f->length . "\n" if $self->verbose;

    # This is algorithmic overkill, but the sort seems fast enough. Will revisit later.
    my $features = $f->as_hash;
    my @ranked   = sort { $features->{$b} <=> $features->{$a} } keys %$features;

    # Index of the last feature to keep. int() applies the same truncation
    # the range operator would apply to a fractional bound. The clamp stops
    # a request for more features than exist from padding the slice with
    # undef entries, which would otherwise be handed to intersection() as
    # feature names.
    my $last = int($num_kept - 1);
    $last = $#ranked if $last > $#ranked;
    my @new_features = @ranked[0 .. $last];

    my $result = $f->intersection( \@new_features );
    print "Finished trimming features - # features = " . $result->length . "\n" if $self->verbose;
    return $result;
}
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
# Abstract methods |
# These are forward declarations only: neither sub is given a body in this
# file. select_features() calls rank_features() on the object, so a concrete
# subclass presumably has to provide it. scan_features() is not called
# anywhere in the code visible here -- NOTE(review): confirm its expected
# contract against the subclasses that implement it.
59
|
|
|
|
|
|
|
sub rank_features; |
60
|
|
|
|
|
|
|
sub scan_features; |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
# Ranks the features of a knowledge set and trims the ranking.
#
# Named arguments:
#   knowledge_set - required; passed through to $self->rank_features().
#   features_kept - optional; passed through to $self->reduce_features(),
#                   which falls back to the object's own setting when this
#                   is undefined.
#
# Returns the value returned by reduce_features().
# Croaks (error reported at the caller's location) when no knowledge_set
# is supplied.
sub select_features {
    my ($self, %args) = @_;

    croak "No knowledge_set parameter provided to select_features()"
        unless $args{knowledge_set};

    my $ranked = $self->rank_features( knowledge_set => $args{knowledge_set} );
    return $self->reduce_features( $ranked, features_kept => $args{features_kept} );
}
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
1; |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
__END__ |