File Coverage

blib/lib/Algorithm/KernelKMeans/PP.pm

Criterion	Covered	Total	%
statement	16	18	88.8
branch			n/a
condition			n/a
subroutine	6	6	100.0
pod			n/a
total	22	24	91.6

line	stmt	sub	time	code
1				package Algorithm::KernelKMeans::PP;
2
3	1	1	1103	use 5.010;
	1		4
	1		61
4	1	1	968	use namespace::autoclean;
	1		24796
	1		9
5
6	1	1	71	use Carp;
	1		4
	1		70
7	1	1	5	use List::Util qw/min reduce sum shuffle/;
	1		3
	1		78
8	1	1	8	use List::MoreUtils qw/natatime pairwise/;
	1		2
	1		47
9	1	1	475	use Moose;
	0
	0
10				use POSIX qw/floor tanh/;
11
12				use Algorithm::KernelKMeans::Util qw/$KERNEL_POLYNOMINAL
13				$KERNEL_GAUSSIAN
14				$KERNEL_SIGMOID
15				$INITIALIZE_SIMPLE
16				$INITIALIZE_SHUFFLE
17				$INITIALIZE_KKZ
18				inner_product
19				euclidean_distance/;
20
21				with qw/Algorithm::KernelKMeans::Impl/;
22
23				sub init_clusters_simple {
24				my ($k, $vectors) = @_;
25				my $nvectors = @$vectors;
26				_init_clusters($k, [ 0 .. $nvectors - 1 ]);
27				}
28
29				sub init_clusters_shuffle {
30				my ($k, $vectors) = @_;
31				my $nvectors = @$vectors;
32				_init_clusters($k, [ shuffle 0 .. $nvectors - 1 ]);
33				}
34
35				sub _init_clusters {
36				my ($k, $indices) = @_;
37				my $cluster_size = floor($#$indices / $k);
38				my $iter = natatime $cluster_size, @$indices;
39				my @clusters;
40				while (my @cluster = $iter->()) { push @clusters, \@cluster }
41				if (@{ $clusters[-1] } < $cluster_size) {
42				my $last_cluster = pop @clusters;
43				push @{ $clusters[-1] }, @$last_cluster;
44				}
45				return \@clusters;
46				}
47
48				sub init_clusters_kkz {
49				my ($k, $vectors) = @_;
50				my $nvectors = @$vectors;
51				my @rep_vectors; # cluster representation vectors
52
53				my $first_vector = reduce {
54				$a->[1] > $b->[1] ? $a : $b
55				} map {
56				my $vector = $vectors->[$_];
57				[ $_ => inner_product($vector, $vector) ];
58				} 0 .. $nvectors - 1;
59				push @rep_vectors, $first_vector->[0];
60
61				until (@rep_vectors == $k) {
62				my @ds = map {
63				my $vector = $vectors->[$_];
64				my $min = min map {
65				euclidean_distance($vector, $vectors->[$_])
66				} @rep_vectors;
67				[ $_ => $min ];
68				} 0 .. $nvectors - 1;
69				my $rep_vector = reduce { $a->[1] > $b->[1] ? $a : $b } @ds;
70				push @rep_vectors, $rep_vector->[0];
71				}
72
73				my @clusters;
74				for my $i (0 .. $nvectors - 1) {
75				my $vector = $vectors->[$i];
76				my @ds = map {
77				my $rep_vector_idx = $rep_vectors[$_];
78				[ $_ => euclidean_distance($vector, $vectors->[$rep_vector_idx]) ];
79				} 0 .. $#rep_vectors;
80				my $nearest = reduce { $a->[1] <= $b->[1] ? $a : $b } @ds;
81				push @{ $clusters[$nearest->[0]] }, $i;
82				}
83				return \@clusters;
84				}
85
86				sub generate_polynominal_kernel {
87				my ($l, $p) = @_;
88				sub {
89				my ($x1, $x2) = @_;
90				($l + inner_product($x1, $x2)) ** $p
91				}
92				}
93
94				sub generate_gaussian_kernel {
95				my $sigma = shift;
96				my $numer = 2 * ($sigma ** 2);
97				sub {
98				my ($x1, $x2) = @_;
99				my %tmp; @tmp{keys %$x1, keys %$x2} = ();
100				my $norm = sqrt sum map {
101				my ($e1, $e2) = (($x1->{$_} // 0), ($x2->{$_} // 0));
102				($e1 - $e2) ** 2;
103				} keys %tmp;
104				exp(-$norm / $numer);
105				}
106				}
107
108				sub generate_sigmoid_kernel {
109				my ($s, $theta) = @_;
110				sub {
111				my ($x1, $x2) = @_;
112				tanh($s * inner_product($x1, $x2) + $theta);
113				}
114				}
115
116				sub _build_kernel_matrix {
117				my $self = shift;
118
119				my $kernel;
120				if (ref $self->kernel eq 'CODE') {
121				$kernel = $self->kernel;
122				} else {
123				my ($kernel_desc, @params) = @{ $self->kernel };
124				given ($kernel_desc) {
125				when ($KERNEL_POLYNOMINAL) {
126				croak 'Too few parameters' if @params < 2;
127				$kernel = generate_polynominal_kernel(@params);
128				}
129				when ($KERNEL_GAUSSIAN) {
130				croak 'Too few parameters' if @params < 1;
131				$kernel = generate_gaussian_kernel(@params);
132				}
133				when ($KERNEL_SIGMOID) {
134				croak 'Too few parameters' if @params < 2;
135				$kernel = generate_sigmoid_kernel(@params);
136				}
137				default { croak 'Unknown kernel function' }
138				}
139				}
140
141				my @matrix = map {
142				my $i = $_;
143				[ map {
144				my $j = $_;
145				$kernel->($self->vector($i), $self->vector($j));
146				} 0 .. $i ];
147				} 0 .. $self->num_vectors - 1;
148				return \@matrix;
149				}
150
151				sub init_clusters {
152				my ($self, $init, $k) = @_;
153				$init->($k, $self->vectors, $self->weights, $self->kernel_matrix);
154				}
155
156				sub total_weight_of {
157				my ($self, $cluster) = @_;
158				sum @{ $self->weights_of($cluster) };
159				}
160
161				sub step {
162				my ($self, $clusters, $norms) = @_;
163				my @new_clusters = map { [] } 0 .. $#$clusters;
164				for my $i (0 .. $self->num_vectors - 1) {
165				my ($nearest) = sort { $a->[1] <=> $b->[1] } map {
166				[ $_ => $norms->[$i][$_] ]
167				} 0 .. $#$clusters;
168				push @{ $new_clusters[$nearest->[0]] }, $i;
169				}
170				return [ grep { @$_ != 0 } @new_clusters ];
171				}
172
173				sub compute_score {
174				my ($self, $clusters, $norms) = @_;
175				my $score = 0;
176				for my $cluster_idx (0 .. $#$clusters) {
177				my $cluster = $clusters->[$cluster_idx];
178				$score += sum map {
179				$self->weight($_) * $norms->[$_][$cluster_idx]
180				} @$cluster;
181				}
182				return $score;
183				}
184
185				sub compute_norms {
186				my ($self, $clusters) = @_;
187				my @total_weights = map { $self->total_weight_of($_) } @$clusters;
188
189				my @term3_denoms = map {
190				$self->_norm_term3_denom_of($_)
191				} @$clusters;
192				my @term3s = pairwise { $a / ($b ** 2) } @term3_denoms, @total_weights;
193
194				my @norms = map {
195				my $i = $_;
196				my $term1 = $self->kernel_matrix->[$i][$i];
197				[ map {
198				my $cluster_idx = $_;
199				my $cluster = $clusters->[$cluster_idx];
200				my $total_weight = $total_weights[$cluster_idx];
201
202				my $weights = $self->weights_of($cluster);
203				my $term2 = -2 * sum(pairwise {
204				my ($s, $t) = $i >= $a ? ($i, $a) : ($a, $i);
205				$self->kernel_matrix->[$s][$t] * $b
206				} @$cluster, @$weights) / $total_weight;
207				my $term3 = $term3s[$cluster_idx];
208
209				$term1 + $term2 + $term3;
210				} 0 .. $#$clusters ]
211				} 0 .. $self->num_vectors - 1;
212				return \@norms;
213				}
214
215				sub _norm_term3_denom_of {
216				my ($self, $cluster) = @_;
217				sum map {
218				my $i = $_;
219				map {
220				my ($s, $t) = $i >= $_ ? ($i, $_) : ($_, $i);
221				$self->weight($s) * $self->weight($t) * $self->kernel_matrix->[$s][$t];
222				} @$cluster;
223				} @$cluster;
224				}
225
226				sub cluster_indices {
227				my ($self, %opts) = @_;
228				my $k = delete $opts{k} // croak 'Missing required parameter "k"';
229				my $k_min = delete $opts{k_min} // $k;
230				croak '"k_min" must be less than or equal to "k"' if $k_min > $k;
231				my $converged = delete $opts{converged} // sub {
232				my ($score, $new_score) = @_;
233				$score == $new_score;
234				};
235
236				my $init = delete $opts{initializer} // $INITIALIZE_KKZ;
237				unless (ref $init) {
238				given ($init) {
239				when ($INITIALIZE_SIMPLE) { $init = \&init_clusters_simple }
240				when ($INITIALIZE_SHUFFLE) { $init = \&init_clusters_shuffle }
241				when ($INITIALIZE_KKZ) { $init = \&init_clusters_kkz }
242				default { croak 'Unknown initializer' }
243				}
244				}
245
246				if (keys %opts) {
247				my $missings = join ', ', map { qq/"$_"/ } sort keys %opts;
248				croak "Unknown argument(s): $missings";
249				}
250
251				# cluster index -> [vector index]
252				my $clusters = $self->init_clusters($init, $k);
253				# vector index -> cluster index -> norm
254				my $norms = $self->compute_norms($clusters);
255				my $score;
256				my $new_score = $self->compute_score($clusters, $norms);
257				do {
258				$clusters = $self->step($clusters, $norms);
259				croak "Number of clusters became less than k_min=$k_min"
260				if @$clusters < $k_min;
261				$norms = $self->compute_norms($clusters);
262				$score = $new_score;
263				$new_score = $self->compute_score($clusters, $norms);
264				} until $converged->($score, $new_score);
265
266				return $clusters;
267				}
268
269				__PACKAGE__->meta->make_immutable;
270
271				__END__
272
273				=head1 NAME
274
275				Algorithm::KernelKMeans::PP
276
277				=head1 SYNOPSIS
278
279				use Algorithm::KernelKMeans::PP;
280
281				=head1 DESCRIPTION
282
283				This class is a pure-Perl implementation of weighted kernel k-means algorithm.
284
285				L<Algorithm::KernelKMeans> inherits this class by default.
286
287				=head1 AUTHOR
288
289				Koichi SATOH E<lt>sekia@cpan.orgE<gt>
290
291				=head1 SEE ALSO
292
293				L<Algorithm::KernelKMeans>
294
295				L<Algorithm::KernelKMeans::XS> - Yet another implementation. Fast!
296
297				=head1 LICENSE
298
299				The MIT License
300
301				Copyright (C) 2010 by Koichi SATOH
302
303				Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
304
305				The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
306
307				THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
308
309				=cut