File Coverage

blib/lib/Data/CosineSimilarity.pm

Criterion	Covered	Total	%
statement	74	75	98.6
branch	12	20	60.0
condition			n/a
subroutine	18	18	100.0
pod	6	6	100.0
total	110	119	92.4

line	stmt	bran	sub	pod	time	code
1						package Data::CosineSimilarity;
2	2		2		56424	use strict;
	2				4
	2				79
3	2		2		12	use warnings;
	2				5
	2				1709
4
5						our $VERSION = 0.02;
6
7						=head1 NAME
8
9						Data::CosineSimilarity - Compute the Cosine Similarity
10
11						=head1 SYNOPSIS
12
13						$cs = Data::CosineSimilarity->new;
14						$cs->add( label1 => { feature1 => 3, feature2 => 1, feature3 => 10 } );
15						$cs->add( label2 => ... );
16						$cs->add( label3 => ... );
17
18						# computes the cosine similarity
19						my $r = $cs->similarity( 'label1', 'label2' );
20
21						# the result object
22						my $cosine = $r->cosine;
23						my $radian = $r->radian;
24						my $degree = $r->degree;
25						my ($label1, $label2) = $r->labels;
26
27						# computes all the cosine similarity between 'label1' and the others.
28						my @all = $cs->all_for_label('label1');
29
30						# computes all, and returns the best
31						my ($best_label, $r) = $cs->best_for_label('label2');
32
33						# computes all, and returns the worst
34						my ($worst_label, $r) = $cs->worst_for_label('label2');
35
36						=head1 DESCRIPTION
37
38						Compute the cosine similarities between a set of vectors.
39
40						=head2 $class->new( %opts )
41
42						If all the feature vectors are normed then the computation of the cosine
43						becomes just the dot product of the vectors. In this case, specify the
44						option normed => 1, the performance will be greatly improved.
45
46						=cut
47
48						sub new {
49	4		4	1	2897	my $class = shift;
50	4				8	my %opts = @_;
51	4	50			32	return bless {
52						normed => $opts{normed} ? 1 : 0,
53						labels => {},
54						}, $class;
55						}
56
57						=head2 $self->add( label => $features )
58
59						=cut
60
61						sub add {
62	9		9	1	2196	my $self = shift;
63	9				13	my ($label, $features) = @_;
64	9	50			22	die 'label required' unless $label;
65	9	50			21	die 'features required' unless $features;
66	9	50			29	die 'features must be a hashref'
67						unless ref $features eq 'HASH';
68	9	50			28	die 'features must contain terms'
69						unless keys %$features;
70
71	9	50			31	my $norm = $self->{normed} ? 1 : _euclidean_norm($features);
72
73	9	50			24	die 'euclidean norm is null' if $norm == 0;
74
75	9				49	$self->{labels}{$label} = {
76						features => $features,
77						norm => $norm,
78						};
79						}
80
81						sub _euclidean_norm {
82	9		9		14	my ($features) = @_;
83	9				12	my $sum = 0;
84	9				41	$sum += $_**2 for values %$features;
85	9				24	return sqrt $sum;
86						}
87
88						sub _scalar_product {
89	7		7		11	my ($features1, $features2) = @_;
90	7				8	my $product = 0;
91	7				19	for (keys %$features1) {
92	14				20	my $c1 = $features1->{$_};
93	14	100			38	my $c2 = $features2->{$_} or next;
94	9				21	$product += $c1 * $c2;
95						}
96	7				18	return $product;
97						}
98
99						=head2 $self->similarity( $label1, $label2 )
100
101						=cut
102
103						sub similarity {
104	7		7	1	18	my $self = shift;
105	7				11	my ($label1, $label2) = @_;
106
107	7				25	my $product = _scalar_product(
108						$self->{labels}{$label1}{features},
109						$self->{labels}{$label2}{features}
110						);
111
112	7				9	my $cosine;
113	7	50			16	if ($self->{normed}) {
114	0				0	$cosine = $product;
115						}
116						else {
117	7				21	$cosine = $product / ( $self->{labels}{$label1}{norm} * $self->{labels}{$label2}{norm} );
118						}
119
120	7				33	return Data::CosineSimilarity::Result->_new(
121						labels => [ $label1, $label2 ],
122						cosine => $cosine,
123						);
124						}
125
126						=head2 $self->all_for_label( $label )
127
128						=cut
129
130						sub all_for_label {
131	2		2	1	4	my $self = shift;
132	2				3	my ($label) = @_;
133	2				3	my @result;
134	2				4	for (keys %{ $self->{labels} }) {
	2				9
135	6	100			19	next if $_ eq $label;
136	4				10	push @result, $self->similarity($label, $_);
137						}
138	2				10	return sort { $b->cosine <=> $a->cosine } @result;
	2				6
139						}
140
141						=head2 $self->best_for_label( $label )
142
143						=cut
144
145						sub best_for_label {
146	1		1	1	6	my $self = shift;
147	1				3	my ($label) = @_;
148	1				5	my @sorted = $self->all_for_label($label);
149	1				4	my $r = shift @sorted;
150	1				3	my (undef, $best) = $r->labels;
151	1				6	return ($best, $r);
152						}
153
154						=head2 $self->worst_for_label( $label )
155
156						=cut
157
158						sub worst_for_label {
159	1		1	1	425	my $self = shift;
160	1				2	my ($label) = @_;
161	1				4	my @sorted = $self->all_for_label($label);
162	1				3	my $r = pop @sorted;
163	1				4	my (undef, $worst) = $r->labels;
164	1				6	return ($worst, $r);
165						}
166
167						package Data::CosineSimilarity::Result;
168	2		2		32	use strict;
	2				4
	2				62
169	2		2		10	use warnings;
	2				5
	2				62
170
171	2		2		984	use Math::Trig;
	2				22825
	2				8356
172
173						sub _new {
174	7		7		10	my $class = shift;
175	7				24	my %args = @_;
176	7				33	return bless \%args, $class;
177						}
178
179	7		7		3778	sub labels { @{ $_[0]->{labels} } }
	7				47
180
181	19		19		85	sub cosine { $_[0]->{cosine} }
182
183	10		10		26	sub radian { acos( $_[0]->cosine ) }
184
185	5		5		2531	sub degree { rad2deg( $_[0]->radian ) }
186
187						=head1 AUTHOR
188
189						Antoine Imbert, C<< >>
190
191						=head1 LICENSE AND COPYRIGHT
192
193						This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
194
195						=cut
196
197						1;