File Coverage

blib/lib/Algorithm/ContextVector.pm

Criterion	Covered	Total	%
statement	12	83	14.4
branch	0	14	0.0
condition	0	4	0.0
subroutine	4	14	28.5
pod	6	6	100.0
total	22	121	18.1

line	stmt	bran	cond	sub	pod	time	code
1							package Algorithm::ContextVector;
2	1			1		698	use strict;
	1					2
	1					32
3	1			1		5	use warnings;
	1					3
	1					46
4
5							our $VERSION = 0.01;
6
7							=head1 NAME
8
9							Algorithm::ContextVector - Simple implementation based on Data::CosineSimilarity
10
11							=head1 SYNOPSIS
12
13							my $cv = Algorithm::ContextVector->new( top => 300 );
14
15							$cs->add_instance( label => 'label1', attributes => { feature1 => 3, feature2 => 1, feature3 => 10 } );
16							$cs->add_instance( label => [ 'label2', 'label3' ], attributes => { ... } );
17							$cs->add_instance( label => ..., attributes => ... );
18							...
19
20							$cv->train;
21
22							my $results = $cv->predict( attributes => { ... } );
23
24							=head1 DESCRIPTION
25
26							Simple implementation based on Data::CosineSimilarity
27
28							=head2 $class->new( top => ... )
29
30							During the training, keeps the $top most heavy weighted features.
31							Keeps the complete feature set if omitted.
32
33							=cut
34
35	1			1		888	use Data::CosineSimilarity;
	1					20476
	1					37
36	1			1		4734	use Storable;
	1					4821
	1					1161
37
38							sub new {
39	0			0	1		my $class = shift;
40	0						my %opts = @_;
41	0						return bless {
42							top => $opts{top},
43							labels => {},
44							}, $class;
45							}
46
47							=head2 $class->new_from_file( $filename )
48
49							Returns the instance of Algorithm::ContextVector stored in $filename.
50
51							=cut
52
53							sub new_from_file {
54	0			0	1		my $class = shift;
55	0						my ($file) = @_;
56	0						return retrieve($file);
57							}
58
59							=head2 $self->save_to_file( $filename )
60
61							Save the $self to $filename using Storable.
62
63							=cut
64
65							sub save_to_file {
66	0			0	1		my $self = shift;
67	0						my ($file) = @_;
68	0						store($self, $file);
69							}
70
71							sub _add_hashrefs {
72	0			0			my $self = shift;
73	0						my @list = @_;
74	0						my %r;
75	0						for my $h (@list) {
76	0						for my $key (keys %$h) {
77	0		0				$r{$key} \|\|= 0;
78	0						$r{$key} = $r{$key} + $h->{$key};
79							}
80							}
81	0						return \%r;
82							}
83
84							=head2 $self->add_instance( label => [ ... ], attributes => { ... } )
85
86							=cut
87
88							sub add_instance {
89	0			0	1		my $self = shift;
90	0						my %args = @_;
91
92	0	0					my $attr = $args{attributes} or die 'attributes required';
93	0	0					return unless keys %$attr;
94
95	0	0					my $labels = $args{label} or die 'label required';
96	0	0					$labels = [ $labels ] unless ref $labels;
97
98	0						for $_ (@$labels) {
99	0		0				$self->{labels}{$_}{features} \|\|= {};
100	0						$self->{labels}{$_}{features} = $self->_add_hashrefs(
101							$self->{labels}{$_}{features}, $attr
102							);
103							}
104							}
105
106							sub _norm_features {
107	0			0			my $self = shift;
108	0						my ($features) = @_;
109	0						my $norm = 0;
110	0						$norm += $_**2 for values %$features;
111	0						$norm = sqrt($norm);
112
113	0						$_ = $_ / $norm for values %$features;
114
115	0						return $features;
116							}
117
118							sub _cut_features {
119	0			0			my $self = shift;
120	0						my ($features) = @_;
121	0						my $top = $self->{top};
122	0	0					return $features unless defined $top;
123
124	0						my @sorted =
125	0						sort { $b->[1] <=> $a->[1] }
126	0						map { [ $_, $features->{$_} ] }
127							keys %$features;
128
129	0						my @keep = splice @sorted, 0, $top;
130
131	0						my $r = { map { $_->[0] => $_->[1] } @keep };
	0
132
133	0						return $r;
134							}
135
136							# IDEA dead code for now
137							sub _cut_features_avg {
138	0			0			my $features = shift;
139	0						my $sum = 0;
140	0						$sum += $_ for values %$features;
141	0						my $count = scalar keys %$features;
142	0						my $cut = $sum / $count; # hum cut at the avg
143	0						for (keys %$features) {
144	0	0					delete $features->{$_} if $features->{$_} < $cut;
145							}
146	0						return $features;
147							}
148
149							=head2 $self->train
150
151							Keeps the best features (top N) and norms the vectors.
152
153							=cut
154
155							sub train {
156	0			0	1		my $self = shift;
157	0						for $_ (keys %{ $self->{labels} }) {
	0
158	0						$self->{labels}{$_}{features} = $self->_cut_features( $self->{labels}{$_}{features} );
159	0						$self->{labels}{$_}{features} = $self->_norm_features( $self->{labels}{$_}{features} );
160							}
161							}
162
163							=head2 $self->predict( attributes => { ... } )
164
165							Returns a hashref with the labels as the keys and the cosines as the values.
166
167							=cut
168
169							sub predict {
170	0			0	1		my $self = shift;
171	0						my %args = @_;
172
173	0	0					my $attr = $args{attributes} or die 'attributes required';
174
175	0						my $cs = Data::CosineSimilarity->new( normed => 1 );
176
177	0						for my $label (keys %{ $self->{labels} }) {
	0
178	0						$cs->add( $label => $self->{labels}{$label}{features} );
179							}
180
181	0						$cs->add( __my_test => $self->_norm_features( $self->_cut_features( $attr ) ) );
182
183	0						my @all = $cs->all_for_label('__my_test');
184	0						my %r;
185	0						for (@all) {
186	0						my (undef, $label) = $_->labels;
187	0						$r{$label} = $_->cosine;
188							}
189	0						return \%r;
190							}
191
192							=head1 AUTHOR
193
194							Antoine Imbert, C<< >>
195
196							=head1 LICENSE AND COPYRIGHT
197
198							This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
199
200							=cut
201
202							1;