File Coverage

blib/lib/Algorithm/KernelKMeans/PP.pm
Criterion Covered Total %
statement 16 18 88.8
branch n/a
condition n/a
subroutine 6 6 100.0
pod n/a
total 22 24 91.6


line stmt bran cond sub pod time code
1             package Algorithm::KernelKMeans::PP;
2              
3 1     1   1103 use 5.010;
  1         4  
  1         61  
4 1     1   968 use namespace::autoclean;
  1         24796  
  1         9  
5              
6 1     1   71 use Carp;
  1         4  
  1         70  
7 1     1   5 use List::Util qw/min reduce sum shuffle/;
  1         3  
  1         78  
8 1     1   8 use List::MoreUtils qw/natatime pairwise/;
  1         2  
  1         47  
9 1     1   475 use Moose;
  0            
  0            
10             use POSIX qw/floor tanh/;
11              
12             use Algorithm::KernelKMeans::Util qw/$KERNEL_POLYNOMINAL
13             $KERNEL_GAUSSIAN
14             $KERNEL_SIGMOID
15             $INITIALIZE_SIMPLE
16             $INITIALIZE_SHUFFLE
17             $INITIALIZE_KKZ
18             inner_product
19             euclidean_distance/;
20              
21             with qw/Algorithm::KernelKMeans::Impl/;
22              
23             sub init_clusters_simple {
24             my ($k, $vectors) = @_;
25             my $nvectors = @$vectors;
26             _init_clusters($k, [ 0 .. $nvectors - 1 ]);
27             }
28              
29             sub init_clusters_shuffle {
30             my ($k, $vectors) = @_;
31             my $nvectors = @$vectors;
32             _init_clusters($k, [ shuffle 0 .. $nvectors - 1 ]);
33             }
34              
35             sub _init_clusters {
36             my ($k, $indices) = @_;
37             my $cluster_size = floor($#$indices / $k);
38             my $iter = natatime $cluster_size, @$indices;
39             my @clusters;
40             while (my @cluster = $iter->()) { push @clusters, \@cluster }
41             if (@{ $clusters[-1] } < $cluster_size) {
42             my $last_cluster = pop @clusters;
43             push @{ $clusters[-1] }, @$last_cluster;
44             }
45             return \@clusters;
46             }
47              
48             sub init_clusters_kkz {
49             my ($k, $vectors) = @_;
50             my $nvectors = @$vectors;
51             my @rep_vectors; # cluster representation vectors
52              
53             my $first_vector = reduce {
54             $a->[1] > $b->[1] ? $a : $b
55             } map {
56             my $vector = $vectors->[$_];
57             [ $_ => inner_product($vector, $vector) ];
58             } 0 .. $nvectors - 1;
59             push @rep_vectors, $first_vector->[0];
60              
61             until (@rep_vectors == $k) {
62             my @ds = map {
63             my $vector = $vectors->[$_];
64             my $min = min map {
65             euclidean_distance($vector, $vectors->[$_])
66             } @rep_vectors;
67             [ $_ => $min ];
68             } 0 .. $nvectors - 1;
69             my $rep_vector = reduce { $a->[1] > $b->[1] ? $a : $b } @ds;
70             push @rep_vectors, $rep_vector->[0];
71             }
72              
73             my @clusters;
74             for my $i (0 .. $nvectors - 1) {
75             my $vector = $vectors->[$i];
76             my @ds = map {
77             my $rep_vector_idx = $rep_vectors[$_];
78             [ $_ => euclidean_distance($vector, $vectors->[$rep_vector_idx]) ];
79             } 0 .. $#rep_vectors;
80             my $nearest = reduce { $a->[1] <= $b->[1] ? $a : $b } @ds;
81             push @{ $clusters[$nearest->[0]] }, $i;
82             }
83             return \@clusters;
84             }
85              
86             sub generate_polynominal_kernel {
87             my ($l, $p) = @_;
88             sub {
89             my ($x1, $x2) = @_;
90             ($l + inner_product($x1, $x2)) ** $p
91             }
92             }
93              
94             sub generate_gaussian_kernel {
95             my $sigma = shift;
96             my $numer = 2 * ($sigma ** 2);
97             sub {
98             my ($x1, $x2) = @_;
99             my %tmp; @tmp{keys %$x1, keys %$x2} = ();
100             my $norm = sqrt sum map {
101             my ($e1, $e2) = (($x1->{$_} // 0), ($x2->{$_} // 0));
102             ($e1 - $e2) ** 2;
103             } keys %tmp;
104             exp(-$norm / $numer);
105             }
106             }
107              
108             sub generate_sigmoid_kernel {
109             my ($s, $theta) = @_;
110             sub {
111             my ($x1, $x2) = @_;
112             tanh($s * inner_product($x1, $x2) + $theta);
113             }
114             }
115              
116             sub _build_kernel_matrix {
117             my $self = shift;
118              
119             my $kernel;
120             if (ref $self->kernel eq 'CODE') {
121             $kernel = $self->kernel;
122             } else {
123             my ($kernel_desc, @params) = @{ $self->kernel };
124             given ($kernel_desc) {
125             when ($KERNEL_POLYNOMINAL) {
126             croak 'Too few parameters' if @params < 2;
127             $kernel = generate_polynominal_kernel(@params);
128             }
129             when ($KERNEL_GAUSSIAN) {
130             croak 'Too few parameters' if @params < 1;
131             $kernel = generate_gaussian_kernel(@params);
132             }
133             when ($KERNEL_SIGMOID) {
134             croak 'Too few parameters' if @params < 2;
135             $kernel = generate_sigmoid_kernel(@params);
136             }
137             default { croak 'Unknown kernel function' }
138             }
139             }
140              
141             my @matrix = map {
142             my $i = $_;
143             [ map {
144             my $j = $_;
145             $kernel->($self->vector($i), $self->vector($j));
146             } 0 .. $i ];
147             } 0 .. $self->num_vectors - 1;
148             return \@matrix;
149             }
150              
151             sub init_clusters {
152             my ($self, $init, $k) = @_;
153             $init->($k, $self->vectors, $self->weights, $self->kernel_matrix);
154             }
155              
156             sub total_weight_of {
157             my ($self, $cluster) = @_;
158             sum @{ $self->weights_of($cluster) };
159             }
160              
161             sub step {
162             my ($self, $clusters, $norms) = @_;
163             my @new_clusters = map { [] } 0 .. $#$clusters;
164             for my $i (0 .. $self->num_vectors - 1) {
165             my ($nearest) = sort { $a->[1] <=> $b->[1] } map {
166             [ $_ => $norms->[$i][$_] ]
167             } 0 .. $#$clusters;
168             push @{ $new_clusters[$nearest->[0]] }, $i;
169             }
170             return [ grep { @$_ != 0 } @new_clusters ];
171             }
172              
173             sub compute_score {
174             my ($self, $clusters, $norms) = @_;
175             my $score = 0;
176             for my $cluster_idx (0 .. $#$clusters) {
177             my $cluster = $clusters->[$cluster_idx];
178             $score += sum map {
179             $self->weight($_) * $norms->[$_][$cluster_idx]
180             } @$cluster;
181             }
182             return $score;
183             }
184              
185             sub compute_norms {
186             my ($self, $clusters) = @_;
187             my @total_weights = map { $self->total_weight_of($_) } @$clusters;
188              
189             my @term3_denoms = map {
190             $self->_norm_term3_denom_of($_)
191             } @$clusters;
192             my @term3s = pairwise { $a / ($b ** 2) } @term3_denoms, @total_weights;
193              
194             my @norms = map {
195             my $i = $_;
196             my $term1 = $self->kernel_matrix->[$i][$i];
197             [ map {
198             my $cluster_idx = $_;
199             my $cluster = $clusters->[$cluster_idx];
200             my $total_weight = $total_weights[$cluster_idx];
201              
202             my $weights = $self->weights_of($cluster);
203             my $term2 = -2 * sum(pairwise {
204             my ($s, $t) = $i >= $a ? ($i, $a) : ($a, $i);
205             $self->kernel_matrix->[$s][$t] * $b
206             } @$cluster, @$weights) / $total_weight;
207             my $term3 = $term3s[$cluster_idx];
208              
209             $term1 + $term2 + $term3;
210             } 0 .. $#$clusters ]
211             } 0 .. $self->num_vectors - 1;
212             return \@norms;
213             }
214              
215             sub _norm_term3_denom_of {
216             my ($self, $cluster) = @_;
217             sum map {
218             my $i = $_;
219             map {
220             my ($s, $t) = $i >= $_ ? ($i, $_) : ($_, $i);
221             $self->weight($s) * $self->weight($t) * $self->kernel_matrix->[$s][$t];
222             } @$cluster;
223             } @$cluster;
224             }
225              
226             sub cluster_indices {
227             my ($self, %opts) = @_;
228             my $k = delete $opts{k} // croak 'Missing required parameter "k"';
229             my $k_min = delete $opts{k_min} // $k;
230             croak '"k_min" must be less than or equal to "k"' if $k_min > $k;
231             my $converged = delete $opts{converged} // sub {
232             my ($score, $new_score) = @_;
233             $score == $new_score;
234             };
235              
236             my $init = delete $opts{initializer} // $INITIALIZE_KKZ;
237             unless (ref $init) {
238             given ($init) {
239             when ($INITIALIZE_SIMPLE) { $init = \&init_clusters_simple }
240             when ($INITIALIZE_SHUFFLE) { $init = \&init_clusters_shuffle }
241             when ($INITIALIZE_KKZ) { $init = \&init_clusters_kkz }
242             default { croak 'Unknown initializer' }
243             }
244             }
245              
246             if (keys %opts) {
247             my $missings = join ', ', map { qq/"$_"/ } sort keys %opts;
248             croak "Unknown argument(s): $missings";
249             }
250              
251             # cluster index -> [vector index]
252             my $clusters = $self->init_clusters($init, $k);
253             # vector index -> cluster index -> norm
254             my $norms = $self->compute_norms($clusters);
255             my $score;
256             my $new_score = $self->compute_score($clusters, $norms);
257             do {
258             $clusters = $self->step($clusters, $norms);
259             croak "Number of clusters became less than k_min=$k_min"
260             if @$clusters < $k_min;
261             $norms = $self->compute_norms($clusters);
262             $score = $new_score;
263             $new_score = $self->compute_score($clusters, $norms);
264             } until $converged->($score, $new_score);
265              
266             return $clusters;
267             }
268              
269             __PACKAGE__->meta->make_immutable;
270              
271             __END__
272              
273             =head1 NAME
274              
275             Algorithm::KernelKMeans::PP
276              
277             =head1 SYNOPSIS
278              
279             use Algorithm::KernelKMeans::PP;
280              
281             =head1 DESCRIPTION
282              
283             This class is a pure-Perl implementation of weighted kernel k-means algorithm.
284              
285             L<Algorithm::KernelKMeans> inherits this class by default.
286              
287             =head1 AUTHOR
288              
289             Koichi SATOH E<lt>sekia@cpan.orgE<gt>
290              
291             =head1 SEE ALSO
292              
293             L<Algorithm::KernelKMeans>
294              
295             L<Algorithm::KernelKMeans::XS> - Yet another implementation. Fast!
296              
297             =head1 LICENSE
298              
299             The MIT License
300              
301             Copyright (C) 2010 by Koichi SATOH
302              
303             Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
304              
305             The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
306              
307             THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
308              
309             =cut