File Coverage

blib/lib/PDL/Stats/Kmeans.pm
Criterion Covered Total %
statement 9 9 100.0
branch n/a
condition n/a
subroutine 3 3 100.0
pod n/a
total 12 12 100.0


line stmt bran cond sub pod time code
1             #
2             # GENERATED WITH PDL::PP from lib/PDL/Stats/Kmeans.pd! Don't modify!
3             #
4             package PDL::Stats::Kmeans;
5              
6             our @EXPORT_OK = qw(random_cluster iv_cluster _random_cluster which_cluster assign centroid _d_p2l );
7             our %EXPORT_TAGS = (Func=>\@EXPORT_OK);
8              
9 3     3   963 use PDL::Core;
  3         7  
  3         27  
10 3     3   1440 use PDL::Exporter;
  3         7  
  3         22  
11 3     3   195 use DynaLoader;
  3         8  
  3         1588  
12              
13              
14            
15             our @ISA = ( 'PDL::Exporter','DynaLoader' );
16             push @PDL::Core::PP, __PACKAGE__;
17             bootstrap PDL::Stats::Kmeans ;
18              
19              
20              
21              
22              
23              
24              
25              
26             #line 8 "lib/PDL/Stats/Kmeans.pd"
27              
28             use strict;
29             use warnings;
30             use Carp;
31             use PDL::LiteF;
32             use PDL::Stats::Basic;
33              
34             =head1 NAME
35              
36             PDL::Stats::Kmeans -- classic k-means cluster analysis
37              
38             =head1 DESCRIPTION
39              
40             Assumes that we have data pdl dim [observation, variable] and the goal is to put observations into clusters based on their values on the variables. The terms "observation" and "variable" are quite arbitrary but serve as a reminder for "that which is being clustered" and "that which is used to cluster".
41              
42             The terms FUNCTIONS and METHODS are arbitrarily used to refer to methods that are broadcastable and methods that are non-broadcastable, respectively.
43              
44             =head1 SYNOPSIS
45              
46             Implement a basic k-means procedure,
47              
48             use PDL::LiteF;
49             use PDL::Stats;
50              
51             my ($data, $idv, $ido) = rtable( $file );
52             # or generate random data:
53             $data = grandom(200, 2); # two vars as below
54              
55             my ($cluster, $centroid, $ss_centroid, $cluster_last);
56              
57             # start out with 8 random clusters
58             $cluster = random_cluster( $data->dim(0), 8 );
59             # iterate to minimize total ss
60             # stop when no more changes in cluster membership
61             do {
62             $cluster_last = $cluster;
63             ($centroid, $ss_centroid) = $data->centroid( $cluster );
64             $cluster = $data->assign( $centroid );
65             } while sum(abs($cluster - $cluster_last)) > 0;
66              
67             or, use the B<kmeans> function provided here,
68              
69             my %k = $data->kmeans( \%opt );
70             print "$_\t$k{$_}\n" for sort keys %k;
71              
72             plot the clusters if there are only 2 vars in $data,
73              
74             use PDL::Graphics::Simple;
75              
76             my ($win, $c);
77             $win = pgswin();
78             $win->plot(map +(with=>'points', $data->dice_axis(0,which($k{cluster}->(,$_)))->dog), 0 .. $k{cluster}->dim(1)-1);
79              
80             =cut
81             #line 82 "lib/PDL/Stats/Kmeans.pm"
82              
83              
84             =head1 FUNCTIONS
85              
86             =cut
87              
88              
89              
90              
91              
92             #line 75 "lib/PDL/Stats/Kmeans.pd"
93              
94             #line 76 "lib/PDL/Stats/Kmeans.pd"
95              
96             =head2 random_cluster
97              
98             =for ref
99              
100             Creates masks for random mutually exclusive clusters. Accepts two
101             parameters, num_obs and num_cluster. Extra parameter turns into extra
102             dim in mask. May loop a long time if num_cluster approaches num_obs
103             because empty cluster is not allowed.
104              
105             =for usage
106              
107             my $cluster = random_cluster( $num_obs, $num_cluster );
108              
109             =cut
110              
111             # can't be called on pdl
112             sub random_cluster {
113             my ($obs, $clu, @extra) = @_;
114             # extra param in @_ made into extra dim
115             my $cluster = zeroes short(), @_;
116             do {
117             (random($obs, @extra) * $obs)->_random_cluster($cluster);
118             } while (PDL::any $cluster->sumover == 0 );
119             $cluster;
120             }
121             #line 122 "lib/PDL/Stats/Kmeans.pm"
122              
123             *_random_cluster = \&PDL::_random_cluster;
124              
125              
126              
127              
128              
129              
130             =head2 which_cluster
131              
132             =for sig
133              
134             Signature: (short a(o,c); indx [o]b(o))
135             Types: (ushort long)
136              
137             =for usage
138              
139             $b = which_cluster($a);
140             which_cluster($a, $b); # all arguments given
141             $b = $a->which_cluster; # method call
142             $a->which_cluster($b);
143              
144             Given cluster mask dim [obs x clu], returns the cluster index to which an obs belong.
145              
146             Does not support overlapping clusters. If an obs has TRUE value for multiple clusters, the returned index is the first cluster the obs belongs to. If an obs has no TRUE value for any cluster, the return val is set to -1 or BAD if the input mask has badflag set.
147              
148             Usage:
149              
150             # create a cluster mask dim [obs x clu]
151             pdl> p $c_mask = iv_cluster [qw(a a b b c c)]
152             [
153             [1 1 0 0 0 0]
154             [0 0 1 1 0 0]
155             [0 0 0 0 1 1]
156             ]
157             # get cluster membership list dim [obs]
158             pdl> p $ic = $c_mask->which_cluster
159             [0 0 1 1 2 2]
160              
161             =pod
162              
163             Broadcasts over its inputs.
164              
165             =for bad
166              
167             C<which_cluster> processes bad values.
168             It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
169              
170             =cut
171              
172              
173              
174              
175             *which_cluster = \&PDL::which_cluster;
176              
177              
178              
179              
180              
181              
182             =head2 assign
183              
184             =for sig
185              
186             Signature: (data(o,v); centroid(c,v); short [o]cluster(o,c))
187             Types: (float double)
188              
189             =for ref
190              
191             Takes data pdl dim [obs x var] and centroid pdl dim [cluster x var] and returns mask dim [obs x cluster] to cluster membership. An obs is assigned to the first cluster with the smallest distance (ie sum squared error) to cluster centroid. With bad value, obs is assigned by smallest mean squared error across variables.
192              
193             =for usage
194              
195             pdl> p $centroid = xvals 2, 3
196             [
197             [0 1]
198             [0 1]
199             [0 1]
200             ]
201              
202             pdl> p $b = qsort( random 4, 3 )
203             [
204             [0.022774068 0.032513883 0.13890034 0.30942479]
205             [ 0.16943853 0.50262636 0.56251531 0.7152271]
206             [ 0.23964483 0.59932745 0.60967495 0.78452117]
207             ]
208             # notice that 1st 3 obs in $b are on average closer to 0
209             # and last obs closer to 1
210             pdl> p $b->assign( $centroid )
211             [
212             [1 1 1 0] # cluster 0 membership
213             [0 0 0 1] # cluster 1 membership
214             ]
215              
216             =pod
217              
218             Broadcasts over its inputs.
219              
220             =for bad
221              
222             C<assign> processes bad values.
223             It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
224              
225             =cut
226              
227              
228              
229              
230             *assign = \&PDL::assign;
231              
232              
233              
234              
235              
236              
237             =head2 centroid
238              
239             =for sig
240              
241             Signature: (data(o,v); cluster(o,c); [o]m(c,v); [o]ss(c,v))
242             Types: (float double)
243              
244             =for ref
245              
246             Takes data dim [obs x var] and mask dim [obs x cluster], returns mean and ss (ms when data contains bad values) dim [cluster x var], using data where mask == 1. Multiple cluster membership for an obs is okay. If a cluster is empty all means and ss are set to zero for that cluster.
247              
248             =for usage
249              
250             # data is 10 obs x 3 var
251             pdl> p $d = sequence 10, 3
252             [
253             [ 0 1 2 3 4 5 6 7 8 9]
254             [10 11 12 13 14 15 16 17 18 19]
255             [20 21 22 23 24 25 26 27 28 29]
256             ]
257             # create two clusters by value on 1st var
258             pdl> p $a = $d( ,(0)) <= 5
259             [1 1 1 1 1 1 0 0 0 0]
260              
261             pdl> p $b = $d( ,(0)) > 5
262             [0 0 0 0 0 0 1 1 1 1]
263              
264             pdl> p $c = cat $a, $b
265             [
266             [1 1 1 1 1 1 0 0 0 0]
267             [0 0 0 0 0 0 1 1 1 1]
268             ]
269              
270             pdl> p $d->centroid($c)
271             # mean for 2 cluster x 3 var
272             [
273             [ 2.5 7.5]
274             [12.5 17.5]
275             [22.5 27.5]
276             ]
277             # ss for 2 cluster x 3 var
278             [
279             [17.5 5]
280             [17.5 5]
281             [17.5 5]
282             ]
283            
284              
285             =pod
286              
287             Broadcasts over its inputs.
288              
289             =for bad
290              
291             C<centroid> processes bad values.
292             It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
293              
294             =cut
295              
296              
297              
298              
299             *centroid = \&PDL::centroid;
300              
301              
302              
303              
304              
305             #line 307 "lib/PDL/Stats/Kmeans.pd"
306              
307             #line 308 "lib/PDL/Stats/Kmeans.pd"
308              
309             sub _scree_ind {
310             # use as scree cutoff the point with max distance to the line formed
311             # by the 1st and last points in $self
312             # it's a heuristic--whether we can get "good" results depends on
313             # the number of components in $self.
314              
315             my ($self) = @_;
316              
317             $self = $self->squeeze;
318             $self->ndims > 1 and
319             croak "1D pdl only please";
320              
321             my $a = zeroes 2, $self->nelem;
322             $a->slice('(0)') .= sequence $self->nelem;
323             $a->slice('(1)') .= $self;
324              
325             my $d = _d_point2line( $a, $a->slice(':,(0)'), $a->slice(':,(-1)') );
326              
327             return $d->maximum_ind;
328             }
329              
330             sub _d_point2line {
331             my ($self, $p1, $p2) = @_;
332              
333             for ($self, $p1, $p2) {
334             $_->dim(0) != 2 and
335             carp "point pdl dim(0) != 2";
336             }
337              
338             return _d_p2l( $self->mv(0,-1)->dog, $p1->mv(0,-1)->dog, $p2->mv(0,-1)->dog );
339             }
340             #line 341 "lib/PDL/Stats/Kmeans.pm"
341              
342             *_d_p2l = \&PDL::_d_p2l;
343              
344              
345              
346              
347              
348             #line 358 "lib/PDL/Stats/Kmeans.pd"
349              
350             #line 359 "lib/PDL/Stats/Kmeans.pd"
351             =head2 kmeans
352              
353             =for ref
354              
355             Implements classic k-means cluster analysis.
356              
357             =for example
358              
359             $data = grandom(200, 2); # two rows = two dimensions
360             %k = $data->kmeans; # use default of 3 clusters
361             print "$_\t$k{$_}\n" for sort keys %k;
362             $w->plot(
363             (map +(with=>'points', style=>$_+1, ke=>"Cluster ".($_+1),
364             $data->dice_axis(0,which($k{cluster}->slice(",$_")))->dog),
365             0 .. $k{cluster}->dim(1)-1),
366             (map +(with=>'circles', style=>$_+1, ke=>"Centroid ".($_+1), $k{centroid}->slice($_)->dog, 0.1),
367             0 .. $k{centroid}->dim(0)-1),
368             {le=>'tr'},
369             );
370              
371             Given a number of observations with values on a set of variables,
372             kmeans puts the observations into clusters that maximize within-cluster
373             similarity with respect to the variables. Tries several different random
374             seeding and clustering in parallel. Stops when cluster assignment of the
375             observations no longer changes. Returns the best result in terms of R2
376             from the random-seeding trials.
377              
378             Instead of random seeding, kmeans also accepts manual seeding. This is
379             done by providing a centroid to the function, in which case clustering
380             will proceed from the centroid and there is no multiple tries.
381              
382             There are two distinct advantages from seeding with a centroid compared to
383             seeding with predefined cluster membership of a subset of the observations
384             ie "seeds":
385              
386             =over
387              
388             =item *
389              
390             a centroid could come from a previous study with a different set of observations;
391              
392             =item *
393              
394             a centroid could even be "fictional", or in more proper parlance,
395             an idealized prototype with respect to the actual data. For example,
396             if there are 10 person's ratings of 1 to 5 on 4 movies, ie a ratings
397             pdl of dim [10 obs x 4 var], providing a centroid like
398              
399             [
400             [5 0 0 0]
401             [0 5 0 0]
402             [0 0 5 0]
403             [0 0 0 5]
404             ]
405              
406             will produce 4 clusters of people with each cluster favoring a different
407             one of the 4 movies. Clusters from an idealized centroid may not give the
408             best result in terms of R2, but they sure are a lot more interpretable.
409              
410             =back
411              
412             If clustering has to be done from predefined clusters of seeds, simply
413             calculate the centroid using the B<centroid> function and feed it
414             to kmeans,
415              
416             my ($centroid, $ss) = $rating($iseeds, )->centroid( $seeds_cluster );
417             my %k = $rating->kmeans( { CNTRD=>$centroid } );
418              
419             kmeans supports bad value*.
420              
421             =for options
422              
423             Default options (case insensitive):
424              
425             V => 1, # prints simple status
426             FULL => 0, # returns results for all seeding trials
427              
428             CNTRD => PDL->null, # optional. pdl [clu x var]. disables next 3 opts
429              
430             NTRY => 5, # num of random seeding trials
431             NSEED => 1000, # num of initial seeds, use NSEED up to max obs
432             NCLUS => 3, # num of clusters
433              
434             =for usage
435              
436             Usage:
437              
438             # suppose we have 4 person's ratings on 5 movies
439              
440             pdl> p $rating = ceil( random(4, 5) * 5 )
441             [
442             [3 2 2 3]
443             [2 4 5 4]
444             [5 3 2 3]
445             [3 3 1 5]
446             [4 3 3 2]
447             ]
448              
449             # we want to put the 4 persons into 2 groups
450              
451             pdl> %k = $rating->kmeans( {NCLUS=>2} )
452              
453             # by default prints back options used
454             # as well as info for all tries and iterations
455              
456             CNTRD => Null
457             FULL => 0
458             NCLUS => 3
459             NSEED => 4
460             NTRY => 5
461             V => 1
462             ss total: 20.5
463             iter 0 R2 [0.024390244 0.024390244 0.26829268 0.4796748 0.4796748]
464             iter 1 R2 [0.46341463 0.46341463 0.4796748 0.4796748 0.4796748]
465              
466             pdl> p "$_\t$k{$_}\n" for sort keys %k
467              
468             R2 0.479674796747968
469             centroid # mean ratings for 2 group x 5 movies
470             [
471             [ 3 2.3333333]
472             [ 2 4.3333333]
473             [ 5 2.6666667]
474             [ 3 3]
475             [ 4 2.6666667]
476             ]
477              
478             cluster # 4 persons' membership in two groups
479             [
480             [1 0 0 0]
481             [0 1 1 1]
482             ]
483              
484             n [1 3] # cluster size
485             ss
486             [
487             [ 0 0.66666667]
488             [ 0 0.66666667]
489             [ 0 0.66666667]
490             [ 0 8]
491             [ 0 0.66666667]
492             ]
493              
494             Now, for the valiant, kmeans is broadcastable. Say you gathered 10
495             persons' ratings on 5 movies from 2 countries, so the data is dim
496             [10,5,2], and you want to put the 10 persons from each country into
497             3 clusters, just specify NCLUS => [3,1], and there you have it. The
498             key is for NCLUS to include $data->ndims - 1 numbers. The 1 in [3,1]
499             turns into a dummy dim, so the 3-cluster operation is repeated on both
500             countries. Similarly, when seeding, CNTRD needs to have ndims that at
501             least match the data ndims. Extra dims in CNTRD will lead to broadcasting
502             (convenient if you want to try out different centroid locations,
503             for example, but you will have to hand pick the best result). See
504             F<t/stats_kmeans.t> for examples w 3D and 4D data.
505              
506             *With bad value, R2 is based on average of variances instead of sum squared error.
507              
508             =cut
509              
510             *kmeans = \&PDL::kmeans;
511             sub PDL::kmeans {
512             my ($self, $opt) = @_;
513             my %opt = (
514             V => 1, # prints simple status
515             FULL => 0, # returns results for all seeding trials
516              
517             CNTRD => PDL->null, # optional. pdl [clu x var]. disables next 3 opts
518              
519             NTRY => 5, # num of random seeding trials
520             NSEED => 1000, # num of initial seeds, use NSEED up to max obs
521             NCLUS => 3, # num of clusters
522             );
523             if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
524             if (defined($opt{CNTRD}) and $opt{CNTRD}->nelem) {
525             $opt{NTRY} = 1;
526             $opt{NSEED} = $self->dim(0);
527             $opt{NCLUS} = $opt{CNTRD}->dim(0);
528             }
529             else {
530             $opt{NSEED} = pdl($self->dim(0), $opt{NSEED})->min->sclr;
531             }
532             $opt{V} and print "$_\t=> $opt{$_}\n" for sort keys %opt;
533              
534             my $ss_ms = $self->badflag? 'ms' : 'ss';
535             my $ss_total
536             = $self->badflag? $self->var->average : $self->ss->sumover;
537             $opt{V} and print "overall $ss_ms:\t$ss_total\n";
538              
539             my ($centroid, $ss_cv, $R2, $clus_this, $clus_last);
540              
541             # NTRY made into extra dim in $cluster for broadcasting
542             my @nclus = (ref $opt{NCLUS} eq 'ARRAY')? @{$opt{NCLUS}} : ($opt{NCLUS});
543             $clus_this
544             = (defined($opt{CNTRD}) and $opt{CNTRD}->nelem) ?
545             $self->assign( $opt{CNTRD}->dummy(-1) ) # put dummy(-1) to match NTRY
546             : random_cluster($opt{NSEED}, @nclus, $opt{NTRY} )
547             ;
548              
549             ($centroid, $ss_cv) = $self->slice([0,$opt{NSEED} - 1])->centroid( $clus_this );
550             # now obs in $clus_this matches $self
551             $clus_this = $self->assign( $centroid );
552             ($centroid, $ss_cv) = $self->centroid( $clus_this );
553              
554             my $iter = 0;
555             do {
556             $R2 = $self->badflag? 1 - $ss_cv->average->average / $ss_total
557             : 1 - $ss_cv->sumover->sumover / $ss_total
558             ;
559             $opt{V} and print join(' ',('iter', $iter++, 'R2', $R2)) . "\n";
560              
561             $clus_last = $clus_this;
562              
563             $clus_this = $self->assign( $centroid );
564             ($centroid, $ss_cv) = $self->centroid( $clus_this );
565             }
566             while ( any long(abs($clus_this - $clus_last))->sumover->sumover > 0 );
567              
568             $opt{FULL} and
569             return (
570             centroid => PDL::squeeze( $centroid ),
571             cluster => PDL::squeeze( $clus_this ),
572             n => PDL::squeeze( $clus_this )->sumover,
573             R2 => PDL::squeeze( $R2 ),
574             $ss_ms => PDL::squeeze( $ss_cv ),
575             );
576              
577             # xchg/mv(-1,0) leaves it as was if single dim--unlike transpose
578             my $i_best = $R2->mv(-1,0)->maximum_ind;
579              
580             $R2->getndims == 1 and
581             return (
582             centroid => $centroid->dice_axis(-1,$i_best)->sever->squeeze,
583             cluster => $clus_this->dice_axis(-1,$i_best)->sever->squeeze,
584             n => $clus_this->dice_axis(-1,$i_best)->sever->squeeze->sumover,
585             R2 => $R2->dice_axis(-1,$i_best)->sever->squeeze,
586             $ss_ms => $ss_cv->dice_axis(-1,$i_best)->sever->squeeze,
587             );
588              
589             # now for broadcasting beyond 2D data
590              
591             # can't believe i'm using a perl loop :P
592              
593             $i_best = $i_best->flat->sever;
594             my @i_best = map { $opt{NTRY} * $_ + $i_best->slice("($_)") }
595             0 .. $i_best->nelem - 1;
596              
597             my @shapes;
598             for ($centroid, $clus_this, $R2) {
599             my @dims = $_->dims;
600             pop @dims;
601             push @shapes, \@dims;
602             }
603              
604             $clus_this = $clus_this->mv(-1,2)->clump(2..$clus_this->ndims-1)->dice_axis(2,\@i_best)->sever->reshape( @{ $shapes[1] } )->sever,
605              
606             return (
607             centroid =>
608             $centroid->mv(-1,2)->clump(2..$centroid->ndims-1)->dice_axis(2,\@i_best)->sever->reshape( @{ $shapes[0] } )->sever,
609              
610             cluster => $clus_this,
611             n => $clus_this->sumover,
612              
613             R2 =>
614             $R2->mv(-1,0)->clump(0..$R2->ndims-1)->dice_axis(0,\@i_best)->sever->reshape( @{ $shapes[2] } )->sever,
615              
616             $ss_ms =>
617             $ss_cv->mv(-1,2)->clump(2..$ss_cv->ndims-1)->dice_axis(2,\@i_best)->sever->reshape( @{ $shapes[0] } )->sever,
618             );
619             }
620              
621             =head1 METHODS
622              
623             =head2 iv_cluster
624              
625             =for ref
626              
627             Turns an independent variable into a cluster pdl. Returns cluster pdl and level-to-pdl_index mapping in list context and cluster pdl only in scalar context.
628              
629             This is the method used for mean and var in anova. The difference between iv_cluster and dummy_code is that iv_cluster returns pdl dim [obs x level] whereas dummy_code returns pdl dim [obs x (level - 1)].
630              
631             =for usage
632              
633             Usage:
634              
635             pdl> @bake = qw( y y y n n n )
636              
637             # accepts @ ref or 1d pdl
638              
639             pdl> p $bake = iv_cluster( \@bake )
640             [
641             [1 1 1 0 0 0]
642             [0 0 0 1 1 1]
643             ]
644              
645             pdl> p $rating = sequence 6
646             [0 1 2 3 4 5]
647              
648             pdl> p $rating->centroid( $bake )
649             # mean for each iv level
650             [
651             [1 4]
652             ]
653             # ss
654             [
655             [2 2]
656             ]
657              
658             =cut
659              
660             *iv_cluster = \&PDL::iv_cluster;
661             sub PDL::iv_cluster {
662             my ($var_ref) = @_;
663             my ($var, $map_ref) = PDL::Stats::Basic::code_ivs( $var_ref );
664             my $var_a = yvals( short, $var->nelem, $var->max->sclr + 1 ) == $var;
665             $var_a = $var_a->setbadif( $var->isbad ) if $var->badflag;
666             wantarray ? ($var_a, $map_ref) : $var_a;
667             }
668              
669             =head2 pca_cluster
670              
671             Assign variables to components ie clusters based on pca loadings or
672             scores. One way to seed kmeans (see Ding & He, 2004, and Su & Dy, 2004
673             for other ways of using pca with kmeans). Variables are assigned to
674             their most associated component. Note that some components may not have
675             any variable that is most associated with them, so the returned number
676             of clusters may be smaller than NCOMP.
677              
678             Default options (case insensitive):
679              
680             V => 1,
681             ABS => 1, # high pos and neg loadings on a comp in same cluster
682             NCOMP => undef, # max number of components to consider. determined by
683             # scree plot black magic if not specified
684             PLOT => 0, # pca scree plot with cutoff at NCOMP
685             WIN => undef, # pass pgswin object for more plotting control
686              
687             Usage:
688              
689             # say we need to cluster a group of documents
690             # $data is pdl dim [word x doc]
691             ($data, $idd, $idw) = get_data 'doc_word_info.txt';
692              
693             pdl> %p = $data->pca;
694             # $cluster is pdl mask dim [doc x ncomp]
695             pdl> $cluster = $p{loading}->pca_cluster;
696              
697             # pca clusters var while kmeans clusters obs. hence transpose
698             pdl> ($m, $ss) = $data->transpose->centroid( $cluster );
699             pdl> %k = $data->transpose->kmeans( { cntrd=>$m } );
700              
701             # take a look at cluster 0 doc ids
702             pdl> p join("\n", @$idd[ list which $k{cluster}->( ,0) ]);
703              
704             =cut
705              
706             *pca_cluster = \&PDL::pca_cluster;
707             sub PDL::pca_cluster {
708             my ($self, $opt) = @_;
709              
710             my %opt = (
711             V => 1,
712             ABS => 1, # high pos and neg loadings on a comp in same cluster
713             NCOMP => undef, # max number of components to consider. determined by
714             # scree plot black magic if not specified
715             PLOT => 0, # pca scree plot with cutoff at NCOMP
716             WIN => undef, # pass pgswin object for more plotting control
717             );
718             if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
719              
720             my $var = sumover($self ** 2) / $self->dim(0);
721             if (!$opt{NCOMP}) {
722             # here's the black magic part
723             my $comps = ($self->dim(1) > 300)? int($self->dim(1) * .1)
724             : pdl($self->dim(1), 30)->min
725             ;
726             $var = $var->slice([0,$comps-1])->sever;
727             $opt{NCOMP} = _scree_ind( $var );
728             }
729             $opt{PLOT} and do {
730             require PDL::Stats::GLM;
731             $var->plot_screes({NCOMP=>$var->dim(0), CUT=>$opt{NCOMP}, WIN=>$opt{WIN}});
732             };
733              
734             my $c = $self->slice(':',[0,$opt{NCOMP}-1])->transpose->abs->maximum_ind;
735              
736             if ($opt{ABS}) {
737             $c = $c->iv_cluster;
738             }
739             else {
740             my @c = map { ($self->slice($_,$c->slice($_)) >= 0)? $c->slice($_)*2 : $c->slice($_)*2 + 1 }
741             ( 0 .. $c->dim(0)-1 );
742             $c = iv_cluster( \@c );
743             }
744             $opt{V} and print "cluster membership mask as " . $c->info . "\n";
745              
746             return $c;
747             }
748              
749             =head1 REFERENCES
750              
751             Ding, C., & He, X. (2004). K-means clustering via principal component analysis. Proceedings of the 21st International Conference on Machine Learning, 69, 29.
752              
753             Su, T., & Dy, J. (2004). A deterministic method for initializing K-means clustering. 16th IEEE International Conference on Tools with Artificial Intelligence, 784-786.
754              
755             Romesburg, H.C. (1984). Cluster Analysis for Researchers. NC: Lulu Press.
756              
757             Wikipedia (retrieved June, 2009). K-means clustering. http://en.wikipedia.org/wiki/K-means_algorithm
758              
759             =head1 AUTHOR
760              
761             Copyright (C) 2009 Maggie J. Xiong
762              
763             All rights reserved. There is no warranty. You are allowed to redistribute this software / documentation as described in the file COPYING in the PDL distribution.
764              
765             =cut
766             #line 767 "lib/PDL/Stats/Kmeans.pm"
767              
768             # Exit with OK status
769              
770             1;