File Coverage

blib/lib/PDL/Stats/Kmeans.pm
Criterion Covered Total %
statement 98 114 85.9
branch 28 44 63.6
condition 7 15 46.6
subroutine 12 13 92.3
pod 1 4 25.0
total 146 190 76.8


line stmt bran cond sub pod time code
1              
2             #
3             # GENERATED WITH PDL::PP! Don't modify!
4             #
5             package PDL::Stats::Kmeans;
6              
7             @EXPORT_OK = qw( random_cluster iv_cluster PDL::PP _random_cluster PDL::PP which_cluster PDL::PP assign PDL::PP centroid PDL::PP _d_p2l );
8             %EXPORT_TAGS = (Func=>[@EXPORT_OK]);
9              
10 4     4   2182 use PDL::Core;
  4         10  
  4         36  
11 4     4   1071 use PDL::Exporter;
  4         9  
  4         29  
12 4     4   123 use DynaLoader;
  4         8  
  4         248  
13              
14              
15              
16            
17             @ISA = ( 'PDL::Exporter','DynaLoader' );
18             push @PDL::Core::PP, __PACKAGE__;
19             bootstrap PDL::Stats::Kmeans ;
20              
21              
22              
23              
24              
25 4     4   26 use Carp;
  4         8  
  4         265  
26 4     4   24 use PDL::LiteF;
  4         8  
  4         43  
27 4     4   6952 use PDL::NiceSlice;
  4         8  
  4         32  
28 4     4   83724 use PDL::Stats::Basic;
  4         8  
  4         32  
29              
30             $PDL::onlinedoc->scan(__FILE__) if $PDL::onlinedoc;
31              
32             =head1 NAME
33              
34             PDL::Stats::Kmeans -- classic k-means cluster analysis
35              
36             =head1 DESCRIPTION
37              
38             Assumes that we have data pdl dim [observation, variable] and the goal is to put observations into clusters based on their values on the variables. The terms "observation" and "variable" are quite arbitrary but serve as a reminder for "that which is being clustered" and "that which is used to cluster".
39              
40             The terms FUNCTIONS and METHODS are arbitrarily used to refer to methods that are threadable and methods that are non-threadable, respectively.
41              
42             =head1 SYNOPSIS
43              
44             Implement a basic k-means procedure,
45            
46             use PDL::LiteF;
47             use PDL::NiceSlice;
48             use PDL::Stats;
49              
50             my ($data, $idv, $ido) = rtable( $file );
51              
52             my ($cluster, $centroid, $ss_centroid, $cluster_last);
53              
54             # start out with 8 random clusters
55             $cluster = random_cluster( $data->dim(0), 8 );
56             # iterate to minimize total ss
57             # stop when no more changes in cluster membership
58             do {
59             $cluster_last = $cluster;
60             ($centroid, $ss_centroid) = $data->centroid( $cluster );
61             $cluster = $data->assign( $centroid );
62             }
63             while ( sum(abs($cluster - $cluster_last)) > 0 );
64              
65             or, use the B function provided here,
66              
67             my %k = $data->kmeans( \%opt );
68             print "$_\t$k{$_}\n" for (sort keys %k);
69              
70             plot the clusters if there are only 2 vars in $data,
71              
72             use PDL::Graphics::PGPLOT::Window;
73              
74             my ($win, $c);
75             $win = pgwin 'xs';
76             $win->env($data( ,0)->minmax, $data( ,1)->minmax);
77              
78             $win->points( $data->dice_axis(0,which($k{cluster}->(,$_)))->dog,
79             {COLOR=>++$c} )
80             for (0 .. $k{cluster}->dim(1)-1);
81              
82             =cut
83              
84              
85              
86              
87              
88              
89              
90             =head1 FUNCTIONS
91              
92              
93              
94             =cut
95              
96              
97              
98              
99              
100             # my tmp var for PDL 2.007 slice upate
101             my $_tmp;
102              
103             =head2 random_cluster
104              
105             =for sig
106              
107             Signature: (short [o]cluster(o,c); int obs=>o; int clu=>c)
108              
109             =for ref
110              
111             Creates masks for random mutually exclusive clusters. Accepts two parameters, num_obs and num_cluster. Extra parameter turns into extra dim in mask. May loop a long time if num_cluster approaches num_obs because empty cluster is not allowed.
112              
113             =for usage
114              
115             my $cluster = random_cluster( $num_obs, $num_cluster );
116              
117             =cut
118              
119             # can't be called on pdl
120             sub random_cluster {
121 4     4 1 9 my ($obs, $clu) = @_;
122             # extra param in @_ made into extra dim
123 4         15 my $cluster = zeroes @_;
124 4         310 do {
125 4         14 $cluster->inplace->_random_cluster();
126             } while (PDL::any $cluster->sumover == 0 );
127 4         576 return $cluster;
128             }
129              
130              
131              
132              
133              
134             *_random_cluster = \&PDL::_random_cluster;
135              
136              
137              
138              
139              
140             =head2 which_cluster
141              
142             =for sig
143              
144             Signature: (short a(o,c); indx [o]b(o))
145              
146             Given cluster mask dim [obs x clu], returns the cluster index to which an obs belong.
147              
148             Does not support overlapping clusters. If an obs has TRUE value for multiple clusters, the returned index is the first cluster the obs belongs to. If an obs has no TRUE value for any cluster, the return val is set to -1 or BAD if the input mask has badflag set.
149              
150             Usage:
151              
152             # create a cluster mask dim [obs x clu]
153             perldl> p $c_mask = iv_cluster [qw(a a b b c c)]
154             [
155             [1 1 0 0 0 0]
156             [0 0 1 1 0 0]
157             [0 0 0 0 1 1]
158             ]
159             # get cluster membership list dim [obs]
160             perldl> p $ic = $c_mask->which_cluster
161             [0 0 1 1 2 2]
162              
163              
164              
165             =for bad
166              
167             which_cluster processes bad values.
168             It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
169              
170              
171             =cut
172              
173              
174              
175              
176              
177              
178             *which_cluster = \&PDL::which_cluster;
179              
180              
181              
182              
183              
184             =head2 assign
185              
186             =for sig
187              
188             Signature: (data(o,v); centroid(c,v); short [o]cluster(o,c))
189              
190              
191              
192             =for ref
193              
194             Takes data pdl dim [obs x var] and centroid pdl dim [cluster x var] and returns mask dim [obs x cluster] to cluster membership. An obs is assigned to the first cluster with the smallest distance (ie sum squared error) to cluster centroid. With bad value, obs is assigned by smallest mean squared error across variables.
195              
196             =for usage
197              
198             perldl> $centroid = ones 2, 3
199             perldl> $centroid(0,) .= 0
200             perldl> p $centroid
201             [
202             [0 1]
203             [0 1]
204             [0 1]
205             ]
206              
207             perldl> $b = qsort( random 4, 3 )
208             perldl> p $b
209             [
210             [0.022774068 0.032513883 0.13890034 0.30942479]
211             [ 0.16943853 0.50262636 0.56251531 0.7152271]
212             [ 0.23964483 0.59932745 0.60967495 0.78452117]
213             ]
214             # notice that 1st 3 obs in $b are on average closer to 0
215             # and last obs closer to 1
216             perldl> p $b->assign( $centroid )
217             [
218             [1 1 1 0] # cluster 0 membership
219             [0 0 0 1] # cluster 1 membership
220             ]
221              
222              
223            
224              
225             =for bad
226              
227             assign processes bad values.
228             It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
229              
230              
231             =cut
232              
233              
234              
235              
236              
237              
238             *assign = \&PDL::assign;
239              
240              
241              
242              
243              
244             =head2 centroid
245              
246             =for sig
247              
248             Signature: (data(o,v); cluster(o,c); float+ [o]m(c,v); float+ [o]ss(c,v))
249              
250              
251             =for ref
252              
253             Takes data dim [obs x var] and mask dim [obs x cluster], returns mean and ss (ms when data contains bad values) dim [cluster x var], using data where mask == 1. Multiple cluster membership for an obs is okay. If a cluster is empty all means and ss are set to zero for that cluster.
254              
255             =for usage
256              
257             # data is 10 obs x 3 var
258             perldl> p $d = sequence 10, 3
259             [
260             [ 0 1 2 3 4 5 6 7 8 9]
261             [10 11 12 13 14 15 16 17 18 19]
262             [20 21 22 23 24 25 26 27 28 29]
263             ]
264             # create two clusters by value on 1st var
265             perldl> p $a = $d( ,(0)) <= 5
266             [1 1 1 1 1 1 0 0 0 0]
267            
268             perldl> p $b = $d( ,(0)) > 5
269             [0 0 0 0 0 0 1 1 1 1]
270              
271             perldl> p $c = cat $a, $b
272             [
273             [1 1 1 1 1 1 0 0 0 0]
274             [0 0 0 0 0 0 1 1 1 1]
275             ]
276              
277             perldl> p $d->centroid($c)
278             # mean for 2 cluster x 3 var
279             [
280             [ 2.5 7.5]
281             [12.5 17.5]
282             [22.5 27.5]
283             ]
284             # ss for 2 cluster x 3 var
285             [
286             [17.5 5]
287             [17.5 5]
288             [17.5 5]
289             ]
290              
291              
292            
293              
294             =for bad
295              
296             centroid processes bad values.
297             It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
298              
299              
300             =cut
301              
302              
303              
304              
305              
306              
307             *centroid = \&PDL::centroid;
308              
309              
310              
311              
312             sub _scree_ind {
313             # use as scree cutoff the point with max distance to the line formed
314             # by the 1st and last points in $self
315             # it's a heuristic--whether we can get "good" results depends on
316             # the number of components in $self.
317              
318 0     0   0 my ($self) = @_;
319              
320 0         0 $self = $self->squeeze;
321 0 0       0 $self->ndims > 1 and
322             croak "1D pdl only please";
323              
324 0         0 my $a = zeroes 2, $self->nelem;
325 0         0 ($_tmp = $a((0), )) .= sequence $self->nelem;
326 0         0 ($_tmp = $a((1), )) .= $self;
327              
328 0         0 my $d = _d_point2line( $a, $a( ,(0)), $a( ,(-1)) );
329              
330 0         0 return $d->maximum_ind;
331             }
332              
333             sub _d_point2line {
334 1     1   893 my ($self, $p1, $p2) = @_;
335              
336 1         5 for ($self, $p1, $p2) {
337 3 50       12 $_->dim(0) != 2 and
338             carp "point pdl dim(0) != 2";
339             }
340              
341 1         11 return _d_p2l( $self->mv(0,-1)->dog, $p1->mv(0,-1)->dog, $p2->mv(0,-1)->dog );
342             }
343              
344              
345              
346              
347              
348             *_d_p2l = \&PDL::_d_p2l;
349              
350              
351              
352              
353             =head2 kmeans
354              
355             =for ref
356              
357             Implements classic k-means cluster analysis. Given a number of observations with values on a set of variables, kmeans puts the observations into clusters that maximizes within-cluster similarity with respect to the variables. Tries several different random seeding and clustering in parallel. Stops when cluster assignment of the observations no longer changes. Returns the best result in terms of R2 from the random-seeding trials.
358              
359             Instead of random seeding, kmeans also accepts manual seeding. This is done by providing a centroid to the function, in which case clustering will proceed from the centroid and there is no multiple tries.
360              
361             There are two distinct advantages from seeding with a centroid compared to seeding with predefined cluster membership of a subset of the observations ie "seeds",
362              
363             (1) a centroid could come from a previous study with a different set of observations;
364              
365             (2) a centroid could even be "fictional", or in more proper parlance, an idealized prototype with respect to the actual data. For example, if there are 10 person's ratings of 1 to 5 on 4 movies, ie a ratings pdl of dim [10 obs x 4 var], providing a centroid like
366              
367             [
368             [5 0 0 0]
369             [0 5 0 0]
370             [0 0 5 0]
371             [0 0 0 5]
372             ]
373              
374             will produce 4 clusters of people with each cluster favoring a different one of the 4 movies. Clusters from an idealized centroid may not give the best result in terms of R2, but they sure are a lot more interpretable.
375              
376             If clustering has to be done from predefined clusters of seeds, simply calculate the centroid using the B function and feed it to kmeans,
377              
378             my ($centroid, $ss) = $rating($iseeds, )->centroid( $seeds_cluster );
379              
380             my %k = $rating->kmeans( { CNTRD=>$centroid } );
381              
382             kmeans supports bad value*.
383              
384             =for options
385              
386             Default options (case insensitive):
387              
388             V => 1, # prints simple status
389             FULL => 0, # returns results for all seeding trials
390              
391             CNTRD => PDL->null, # optional. pdl [clu x var]. disables next 3 opts
392              
393             NTRY => 5, # num of random seeding trials
394             NSEED => 1000, # num of initial seeds, use NSEED up to max obs
395             NCLUS => 3, # num of clusters
396              
397             =for usage
398              
399             Usage:
400              
401             # suppose we have 4 person's ratings on 5 movies
402              
403             perldl> p $rating = ceil( random(4, 5) * 5 )
404             [
405             [3 2 2 3]
406             [2 4 5 4]
407             [5 3 2 3]
408             [3 3 1 5]
409             [4 3 3 2]
410             ]
411              
412             # we want to put the 4 persons into 2 groups
413              
414             perldl> %k = $rating->kmeans( {NCLUS=>2} )
415              
416             # by default prints back options used
417             # as well as info for all tries and iterations
418              
419             CNTRD => Null
420             FULL => 0
421             NCLUS => 2
422             NSEED => 4
423             NTRY => 5
424             V => 1
425             ss total: 20.5
426             iter 0 R2 [0.024390244 0.024390244 0.26829268 0.4796748 0.4796748]
427             iter 1 R2 [0.46341463 0.46341463 0.4796748 0.4796748 0.4796748]
428              
429             perldl> p "$_\t$k{$_}\n" for (sort keys %k)
430              
431             R2 0.479674796747968
432             centroid # mean ratings for 2 group x 5 movies
433             [
434             [ 3 2.3333333]
435             [ 2 4.3333333]
436             [ 5 2.6666667]
437             [ 3 3]
438             [ 4 2.6666667]
439             ]
440            
441             cluster # 4 persons' membership in two groups
442             [
443             [1 0 0 0]
444             [0 1 1 1]
445             ]
446            
447             n [1 3] # cluster size
448             ss
449             [
450             [ 0 0.66666667]
451             [ 0 0.66666667]
452             [ 0 0.66666667]
453             [ 0 8]
454             [ 0 0.66666667]
455             ]
456              
457             Now, for the valiant, kmeans is threadable. Say you gathered 10 persons' ratings on 5 movies from 2 countries, so the data is dim [10,5,2], and you want to put the 10 persons from each country into 3 clusters, just specify NCLUS => [3,1], and there you have it. The key is for NCLUS to include $data->ndims - 1 numbers. The 1 in [3,1] turns into a dummy dim, so the 3-cluster operation is repeated on both countries. Similarly, when seeding, CNTRD needs to have ndims that at least match the data ndims. Extra dims in CNTRD will lead to threading (convenient if you want to try out different centroid locations, for example, but you will have to hand pick the best result). See stats_kmeans.t for examples w 3D and 4D data.
458              
459             *With bad value, R2 is based on average of variances instead of sum squared error.
460              
461             =cut
462              
463             *kmeans = \&PDL::kmeans;
464             sub PDL::kmeans {
465 5     5 0 17607 my ($self, $opt) = @_;
466 5         20 my %opt = (
467             V => 1, # prints simple status
468             FULL => 0, # returns results for all seeding trials
469            
470             CNTRD => PDL->null, # optional. pdl [clu x var]. disables next 3 opts
471            
472             NTRY => 5, # num of random seeding trials
473             NSEED => 1000, # num of initial seeds, use NSEED up to max obs
474             NCLUS => 3, # num of clusters
475             );
476 5   33     105 $opt and $opt{uc $_} = $opt->{$_} for (keys %$opt);
477 5 100 66     43 if (defined($opt{CNTRD}) and $opt{CNTRD}->nelem) {
478 1         3 $opt{NTRY} = 1;
479 1         4 $opt{NSEED} = $self->dim(0);
480 1         4 $opt{NCLUS} = $opt{CNTRD}->dim(0);
481             }
482             else {
483 4         18 $opt{NSEED} = pdl($self->dim(0), $opt{NSEED})->min;
484             }
485 5   33     350 $opt{V} and print STDERR "$_\t=> $opt{$_}\n" for (sort keys %opt);
486            
487 5 100       24 my $ss_ms = $self->badflag? 'ms' : 'ss';
488 5 100       222 my $ss_total
489             = $self->badflag? $self->var->average : $self->ss->sumover;
490 5 50       33 $opt{V} and print STDERR "overall $ss_ms:\t$ss_total\n";
491              
492 5         9 my ($centroid, $ss_cv, $R2, $clus_this, $clus_last);
493              
494             # NTRY made into extra dim in $cluster for threading
495 5 100       18 my @nclus = (ref $opt{NCLUS} eq 'ARRAY')? @{$opt{NCLUS}} : ($opt{NCLUS});
  2         7  
496             $clus_this
497             = (defined($opt{CNTRD}) and $opt{CNTRD}->nelem) ?
498             $self->assign( $opt{CNTRD}->dummy(-1) ) # put dummy(-1) to match NTRY
499             : random_cluster($opt{NSEED}, @nclus, $opt{NTRY} )
500 5 100 66     42 ;
501              
502 5         77 ($centroid, $ss_cv) = $self(0:$opt{NSEED} - 1, )->centroid( $clus_this );
503             # now obs in $clus_this matches $self
504 5         581 $clus_this = $self->assign( $centroid );
505 5         260 ($centroid, $ss_cv) = $self->centroid( $clus_this );
506              
507 5         22 my $iter = 0;
508 5         11 do {
509 9 100       943 $R2 = $self->badflag? 1 - $ss_cv->average->average / $ss_total
510             : 1 - $ss_cv->sumover->sumover / $ss_total
511             ;
512 9 50       72 $opt{V} and print STDERR join(' ',('iter', $iter++, 'R2', $R2)) . "\n";
513              
514 9         17 $clus_last = $clus_this;
515            
516 9         468 $clus_this = $self->assign( $centroid );
517 9         639 ($centroid, $ss_cv) = $self->centroid( $clus_this );
518             }
519             while ( any long(abs($clus_this - $clus_last))->sumover->sumover > 0 );
520              
521             $opt{FULL} and
522             return (
523 5 50       749 centroid => PDL::squeeze( $centroid ),
524             cluster => PDL::squeeze( $clus_this ),
525             n => PDL::squeeze( $clus_this )->sumover,
526             R2 => PDL::squeeze( $R2 ),
527             $ss_ms => PDL::squeeze( $ss_cv ),
528             );
529              
530             # xchg/mv(-1,0) leaves it as was if single dim--unlike transpose
531 5         66 my $i_best = $R2->mv(-1,0)->maximum_ind;
532              
533 5 100       37 $R2->getndims == 1 and
534             return (
535             centroid => $centroid->dice_axis(-1,$i_best)->sever->squeeze,
536             cluster => $clus_this->dice_axis(-1,$i_best)->sever->squeeze,
537             n => $clus_this->dice_axis(-1,$i_best)->sever->squeeze->sumover,
538             R2 => $R2->dice_axis(-1,$i_best)->sever->squeeze,
539             $ss_ms => $ss_cv->dice_axis(-1,$i_best)->sever->squeeze,
540             );
541              
542             # now for threading beyond 2D data
543              
544             # can't believe i'm using a perl loop :P
545              
546 3         9 $i_best = $i_best->flat->sever;
547 3         45 my @i_best = map { $opt{NTRY} * $_ + $i_best(($_)) }
  10         191  
548             0 .. $i_best->nelem - 1;
549              
550 3         65 my @shapes;
551 3         9 for ($centroid, $clus_this, $R2) {
552 9         19 my @dims = $_->dims;
553 9         201 pop @dims;
554 9         21 push @shapes, \@dims;
555             }
556              
557 3         409 $clus_this = $clus_this->mv(-1,2)->clump(2..$clus_this->ndims-1)->dice_axis(2,\@i_best)->sever->reshape( @{ $shapes[1] } )->sever,
558              
559             return (
560             centroid =>
561 3         463 $centroid->mv(-1,2)->clump(2..$centroid->ndims-1)->dice_axis(2,\@i_best)->sever->reshape( @{ $shapes[0] } )->sever,
562              
563             cluster => $clus_this,
564             n => $clus_this->sumover,
565              
566             R2 =>
567 3         452 $R2->mv(-1,0)->clump(0..$R2->ndims-1)->dice_axis(0,\@i_best)->sever->reshape( @{ $shapes[2] } )->sever,
568              
569             $ss_ms =>
570 3         28 $ss_cv->mv(-1,2)->clump(2..$ss_cv->ndims-1)->dice_axis(2,\@i_best)->sever->reshape( @{ $shapes[0] } )->sever,
  3         459  
571             );
572             }
573              
574             =head1 METHODS
575              
576             =head2 iv_cluster
577              
578             =for ref
579              
580             Turns an independent variable into a cluster pdl. Returns cluster pdl and level-to-pdl_index mapping in list context and cluster pdl only in scalar context.
581              
582             This is the method used for mean and var in anova. The difference between iv_cluster and dummy_code is that iv_cluster returns pdl dim [obs x level] whereas dummy_code returns pdl dim [obs x (level - 1)].
583              
584             =for usage
585              
586             Usage:
587              
588             perldl> @bake = qw( y y y n n n )
589            
590             # accepts @ ref or 1d pdl
591              
592             perldl> p $bake = iv_cluster( \@bake )
593             [
594             [1 1 1 0 0 0]
595             [0 0 0 1 1 1]
596             ]
597            
598             perldl> p $rating = sequence 6
599             [0 1 2 3 4 5]
600              
601             perldl> p $rating->centroid( $bake )
602             # mean for each iv level
603             [
604             [1 4]
605             ]
606             # ss
607             [
608             [2 2]
609             ]
610              
611             =cut
612              
613             *iv_cluster = \&PDL::iv_cluster;
614             sub PDL::iv_cluster {
615 13     13 0 1428 my ($var_ref) = @_;
616              
617             # pdl->uniq puts elem in order. so instead list it to maintain old order
618 13 100       69 if (ref $var_ref eq 'PDL') {
619 3         12 $var_ref = $var_ref->squeeze;
620 3 50       171 $var_ref->getndims > 1 and
621             croak "multidim pdl passed for single var!";
622 3         13 $var_ref = [ list $var_ref ];
623             }
624              
625 13         132 my ($var, $map_ref) = PDL::Stats::Basic::_array_to_pdl( $var_ref );
626 13         79 my $var_a = zeroes( short, $var->nelem, $var->max + 1 );
627              
628 13         1766 for my $l (0 .. $var->max) {
629 29         1825 my $v = $var_a( ,$l);
630 29         797 ($_tmp = $v->index( which $var == $l )) .= 1;
631             }
632              
633 13 100       900 if ($var->badflag) {
634 1         20 my $ibad = which $var->isbad;
635 1         41 ($_tmp = $var_a($ibad, )) .= -1;
636 1         76 $var_a->inplace->setvaltobad(-1);
637             }
638              
639 13 50       135 return wantarray? ($var_a, $map_ref) : $var_a;
640             }
641              
642             =head2 pca_cluster
643              
644             Assign variables to components ie clusters based on pca loadings or scores. One way to seed kmeans (see Ding & He, 2004, and Su & Dy, 2004 for other ways of using pca with kmeans). Variables are assigned to their most associated component. Note that some components may not have any variable that is most associated with them, so the returned number of clusters may be smaller than NCOMP.
645              
646             Default options (case insensitive):
647              
648             V => 1,
649             ABS => 1, # high pos and neg loadings on a comp in same cluster
650             NCOMP => undef, # max number of components to consider. determined by
651             # scree plot black magic if not specified
652             PLOT => 1, # pca scree plot with cutoff at NCOMP
653              
654             Usage:
655              
656             # say we need to cluster a group of documents
657             # $data is pdl dim [word x doc]
658             ($data, $idd, $idw) = get_data 'doc_word_info.txt';
659              
660             perldl> %p = $data->pca;
661             # $cluster is pdl mask dim [doc x ncomp]
662             perldl> $cluster = $p{loading}->pca_cluster;
663              
664             # pca clusters var while kmeans clusters obs. hence transpose
665             perldl> ($m, $ss) = $data->transpose->centroid( $cluster );
666             perldl> %k = $data->transpose->kmeans( { cntrd=>$m } );
667              
668             # take a look at cluster 0 doc ids
669             perldl> p join("\n", @$idd[ list which $k{cluster}->( ,0) ]);
670              
671             =cut
672              
673             *pca_cluster = \&PDL::pca_cluster;
674             sub PDL::pca_cluster {
675 1     1 0 1713 my ($self, $opt) = @_;
676              
677 1         6 my %opt = (
678             V => 1,
679             ABS => 1, # high pos and neg loadings on a comp in same cluster
680             NCOMP => undef, # max number of components to consider. determined by
681             # scree plot black magic if not specified
682             PLOT => 1, # pca scree plot with cutoff at NCOMP
683             );
684 1   33     11 $opt and $opt{uc $_} = $opt->{$_} for (keys %$opt);
685              
686 1         42 my $var = sumover($self ** 2) / $self->dim(0);
687 1 50       16 if (!$opt{NCOMP}) {
688             # here's the black magic part
689 0 0       0 my $comps = ($self->dim(1) > 300)? int($self->dim(1) * .1)
690             : pdl($self->dim(1), 30)->min
691             ;
692 0         0 $var = $var(0:$comps-1)->sever;
693 0         0 $opt{NCOMP} = _scree_ind( $var );
694             }
695 1 50       4 $opt{PLOT} and do {
696 0         0 require PDL::Stats::GLM;
697 0         0 $var->plot_scree( {NCOMP=>$var->dim(0), CUT=>$opt{NCOMP}} );
698             };
699              
700 1         22 my $c = $self->( ,0:$opt{NCOMP}-1)->transpose->abs->maximum_ind;
701              
702 1 50       58 if ($opt{ABS}) {
703 1         13 $c = $c->iv_cluster;
704             }
705             else {
706 0 0       0 my @c = map { ($self->($_,$c($_)) >= 0)? $c($_)*2 : $c($_)*2 + 1 }
  0         0  
707             ( 0 .. $c->dim(0)-1 );
708 0         0 $c = iv_cluster( \@c );
709             }
710 1 50       5 $opt{V} and print STDERR "cluster membership mask as " . $c->info . "\n";
711              
712 1         5 return $c;
713             }
714              
715             =head1 REFERENCES
716              
717             Ding, C., & He, X. (2004). K-means clustering via principal component analysis. Proceedings of the 21st International Conference on Machine Learning, 69, 29.
718              
719             Su, T., & Dy, J. (2004). A deterministic method for initializing K-means clustering. 16th IEEE International Conference on Tools with Artificial Intelligence, 784-786.
720              
721             Romesburg, H.C. (1984). Cluster Analysis for Researchers. NC: Lulu Press.
722              
723             Wikipedia (retrieved June, 2009). K-means clustering. http://en.wikipedia.org/wiki/K-means_algorithm
724              
725             =head1 AUTHOR
726              
727             Copyright (C) 2009 Maggie J. Xiong
728              
729             All rights reserved. There is no warranty. You are allowed to redistribute this software / documentation as described in the file COPYING in the PDL distribution.
730              
731             =cut
732              
733              
734              
735             ;
736              
737              
738              
739             # Exit with OK status
740              
741             1;
742              
743