File Coverage

lib/PDL/Stats/Kmeans.pd
Criterion Covered Total %
statement 84 100 84.0
branch 27 44 61.3
condition 5 9 55.5
subroutine 10 11 90.9
pod 1 4 25.0
total 127 168 75.6


line stmt bran cond sub pod time code
1             use strict;
2             use warnings;
3              
4             my $F = [qw(F D)];
5              
6             pp_add_exported('', 'random_cluster', 'iv_cluster');
7              
8             pp_addpm({At=>'Top'}, <<'EOD');
9 3     3   27  
  3         7  
  3         193  
10 3     3   19 use strict;
  3         6  
  3         206  
11 3     3   18 use warnings;
  3         5  
  3         242  
12 3     3   18 use Carp;
  3         22  
  3         29  
13 3     3   8181 use PDL::LiteF;
  3         7  
  3         22  
14             use PDL::Stats::Basic;
15              
16             =head1 NAME
17              
18             PDL::Stats::Kmeans -- classic k-means cluster analysis
19              
20             =head1 DESCRIPTION
21              
22             Assumes that we have data pdl dim [observation, variable] and the goal is to put observations into clusters based on their values on the variables. The terms "observation" and "variable" are quite arbitrary but serve as a reminder for "that which is being clustered" and "that which is used to cluster".
23              
24             The terms FUNCTIONS and METHODS are arbitrarily used to refer to methods that are broadcastable and methods that are non-broadcastable, respectively.
25              
26             =head1 SYNOPSIS
27              
28             Implement a basic k-means procedure,
29              
30             use PDL::LiteF;
31             use PDL::Stats;
32              
33             my ($data, $idv, $ido) = rtable( $file );
34             # or generate random data:
35             $data = grandom(200, 2); # two vars as below
36              
37             my ($cluster, $centroid, $ss_centroid, $cluster_last);
38              
39             # start out with 8 random clusters
40             $cluster = random_cluster( $data->dim(0), 8 );
41             # iterate to minimize total ss
42             # stop when no more changes in cluster membership
43             do {
44             $cluster_last = $cluster;
45             ($centroid, $ss_centroid) = $data->centroid( $cluster );
46             $cluster = $data->assign( $centroid );
47             } while sum(abs($cluster - $cluster_last)) > 0;
48              
49             or, use the B<kmeans> function provided here,
50              
51             my %k = $data->kmeans( \%opt );
52             print "$_\t$k{$_}\n" for sort keys %k;
53              
54             plot the clusters if there are only 2 vars in $data,
55              
56             use PDL::Graphics::Simple;
57              
58             my ($win, $c);
59             $win = pgswin();
60             $win->plot(map +(with=>'points', $data->dice_axis(0,which($k{cluster}->(,$_)))->dog), 0 .. $k{cluster}->dim(1)-1);
61              
62             =cut
63              
64             EOD
65              
66             pp_addhdr('
67             #include <math.h>
68             #include <stdlib.h>
69             #include <time.h>
70              
71             '
72             );
73              
74              
75             pp_addpm pp_line_numbers(__LINE__, <<'EOD' );
76              
77             =head2 random_cluster
78              
79             =for ref
80              
81             Creates masks for random mutually exclusive clusters. Accepts two
82             parameters, num_obs and num_cluster. Extra parameter turns into extra
83             dim in mask. May loop a long time if num_cluster approaches num_obs
84             because empty cluster is not allowed.
85              
86             =for usage
87              
88             my $cluster = random_cluster( $num_obs, $num_cluster );
89              
90             =cut
91              
92             # can't be called on pdl
93             sub random_cluster {
94 4     4 1 13 my ($obs, $clu, @extra) = @_;
95             # extra param in @_ made into extra dim
96 4         15 my $cluster = zeroes short(), @_;
97 4         102 do {
98 6         502 (random($obs, @extra) * $obs)->_random_cluster($cluster);
99             } while (PDL::any $cluster->sumover == 0 );
100 4         1075 $cluster;
101             }
102              
103             EOD
104              
105             pp_def('_random_cluster',
106             Pars => 'rand_data(o); [o]b(o,c)',
107             GenericTypes => ['S'],
108             RedoDimsCode => 'if ($SIZE(c) > $SIZE(o)) $CROAK("more cluster than obs!");',
109             Code => '
110             PDL_Indx nc = $SIZE(c);
111             broadcastloop %{
112             loop (o) %{
113             PDL_Indx cl = $rand_data() % nc;
114             loop (c) %{
115             $b() = (c == cl) ? 1 : 0;
116             %}
117             %}
118             %}
119             ',
120             Doc => undef,
121             );
122              
123             pp_def('which_cluster',
124             Pars => 'short a(o,c); indx [o]b(o)',
125             GenericTypes => [qw(U L)],
126             HandleBad => 1,
127             Code => '
128             loop(o) %{
129             PDL_Indx cl=-1;
130             loop(c) %{
131             PDL_IF_BAD(if ($ISBAD($a())) continue;,)
132             if (!$a()) continue;
133             cl = c;
134             break;
135             %}
136             PDL_IF_BAD(if (cl==-1) { $SETBAD(b()); continue; },)
137             $b() = cl;
138             %}
139             ',
140             Doc => 'Given cluster mask dim [obs x clu], returns the cluster index to which an obs belong.
141              
142             Does not support overlapping clusters. If an obs has TRUE value for multiple clusters, the returned index is the first cluster the obs belongs to. If an obs has no TRUE value for any cluster, the return val is set to -1 or BAD if the input mask has badflag set.
143              
144             Usage:
145              
146             # create a cluster mask dim [obs x clu]
147             pdl> p $c_mask = iv_cluster [qw(a a b b c c)]
148             [
149             [1 1 0 0 0 0]
150             [0 0 1 1 0 0]
151             [0 0 0 0 1 1]
152             ]
153             # get cluster membership list dim [obs]
154             pdl> p $ic = $c_mask->which_cluster
155             [0 0 1 1 2 2]
156             ',
157             );
158              
159             pp_def('assign',
160             Pars => 'data(o,v); centroid(c,v); short [o]cluster(o,c)',
161             GenericTypes => $F,
162             HandleBad => 1,
163             Code => '
164             PDL_Indx cl = 0;
165             loop (o) %{
166             $GENERIC(centroid) ssmin = -1;
167             loop (c) %{
168             $GENERIC(centroid) ssc = 0;
169             PDL_Indx nvc = 0;
170             loop (v) %{
171             PDL_IF_BAD(if ($ISBAD($data()) || $ISBAD($centroid())) continue;,)
172             $GENERIC() diff = $data() - $centroid();
173             ssc += diff*diff;
174             nvc ++;
175             %}
176             if (nvc) {
177             ssc /= nvc;
178             }
179             else {
180             /* taking advantage of the fact that 1st valid ssmin takes precedence */
181             /* so ssc has no effect if there is already ssmin. or it is -1 */
182             ssc = ssmin;
183             }
184             /* notice that if multiple ssc == ssmin the 1st is taken as cluster */
185             if (ssmin < 0 || ssmin > ssc) {
186             cl = c;
187             ssmin = ssc;
188             }
189             %}
190             loop (c) %{
191             PDL_IF_BAD(if (ssmin < 0) { $SETBAD($cluster()); continue; },)
192             $cluster() = (c == cl)? 1 : 0;
193             %}
194             %}
195             ',
196             Doc => '
197             =for ref
198              
199             Takes data pdl dim [obs x var] and centroid pdl dim [cluster x var] and returns mask dim [obs x cluster] to cluster membership. An obs is assigned to the first cluster with the smallest distance (ie sum squared error) to cluster centroid. With bad value, obs is assigned by smallest mean squared error across variables.
200              
201             =for usage
202              
203             pdl> p $centroid = xvals 2, 3
204             [
205             [0 1]
206             [0 1]
207             [0 1]
208             ]
209              
210             pdl> p $b = qsort( random 4, 3 )
211             [
212             [0.022774068 0.032513883 0.13890034 0.30942479]
213             [ 0.16943853 0.50262636 0.56251531 0.7152271]
214             [ 0.23964483 0.59932745 0.60967495 0.78452117]
215             ]
216             # notice that 1st 3 obs in $b are on average closer to 0
217             # and last obs closer to 1
218             pdl> p $b->assign( $centroid )
219             [
220             [1 1 1 0] # cluster 0 membership
221             [0 0 0 1] # cluster 1 membership
222             ]
223             ',
224             );
225              
226             pp_def('centroid',
227             Pars => 'data(o,v); cluster(o,c); [o]m(c,v); [o]ss(c,v)',
228             GenericTypes => $F,
229             HandleBad => 1,
230             Code => '
231             $GENERIC(m) s[ $SIZE(c) ][ $SIZE(v) ], s2[ $SIZE(c) ][ $SIZE(v) ];
232             PDL_Indx n[ $SIZE(c) ]PDL_IF_BAD([ $SIZE(v) ],);
233             loop (c) %{
234             loop (v) %{
235             s[c][v] = 0.0;
236             s2[c][v] = 0.0;
237             PDL_IF_BAD(n[c][v] = 0;,)
238             %}
239             PDL_IF_BAD(,n[c] = 0;)
240             loop (o) %{
241             PDL_IF_BAD(if ($ISBAD($cluster())) continue;,)
242             if (!$cluster()) continue;
243             PDL_IF_BAD(,n[c]++;)
244             loop (v) %{
245             PDL_IF_BAD(if ($ISBAD($data())) continue;,)
246             s[c][v] += $data();
247             s2[c][v] += $data()*$data();
248             PDL_IF_BAD(n[c][v]++;,)
249             %}
250             %}
251             loop (v) %{
252             if (n[c]PDL_IF_BAD([v],)) {
253             $m() = s[c][v] / n[c]PDL_IF_BAD([v],);
254             $ss() = s2[c][v] PDL_IF_BAD(/ n[c][v],) - pow(s[c][v] / n[c]PDL_IF_BAD([v],), 2) PDL_IF_BAD(,* n[c]);
255             }
256             else {
257             $m() = 0;
258             $ss() = 0;
259             /* $CROAK("please make sure there is no empty cluster!"); */
260             }
261             %}
262             %}
263             ',
264             Doc => '
265             =for ref
266              
267             Takes data dim [obs x var] and mask dim [obs x cluster], returns mean and ss (ms when data contains bad values) dim [cluster x var], using data where mask == 1. Multiple cluster membership for an obs is okay. If a cluster is empty all means and ss are set to zero for that cluster.
268              
269             =for usage
270              
271             # data is 10 obs x 3 var
272             pdl> p $d = sequence 10, 3
273             [
274             [ 0 1 2 3 4 5 6 7 8 9]
275             [10 11 12 13 14 15 16 17 18 19]
276             [20 21 22 23 24 25 26 27 28 29]
277             ]
278             # create two clusters by value on 1st var
279             pdl> p $a = $d( ,(0)) <= 5
280             [1 1 1 1 1 1 0 0 0 0]
281              
282             pdl> p $b = $d( ,(0)) > 5
283             [0 0 0 0 0 0 1 1 1 1]
284              
285             pdl> p $c = cat $a, $b
286             [
287             [1 1 1 1 1 1 0 0 0 0]
288             [0 0 0 0 0 0 1 1 1 1]
289             ]
290              
291             pdl> p $d->centroid($c)
292             # mean for 2 cluster x 3 var
293             [
294             [ 2.5 7.5]
295             [12.5 17.5]
296             [22.5 27.5]
297             ]
298             # ss for 2 cluster x 3 var
299             [
300             [17.5 5]
301             [17.5 5]
302             [17.5 5]
303             ]
304             ',
305             );
306              
307             pp_addpm pp_line_numbers(__LINE__, <<'EOD');
308              
309             sub _scree_ind {
310             # use as scree cutoff the point with max distance to the line formed
311             # by the 1st and last points in $self
312             # it's a heuristic--whether we can get "good" results depends on
313             # the number of components in $self.
314              
315 0     0   0 my ($self) = @_;
316              
317 0         0 $self = $self->squeeze;
318 0 0       0 $self->ndims > 1 and
319             croak "1D pdl only please";
320              
321 0         0 my $a = zeroes 2, $self->nelem;
322 0         0 $a->slice('(0)') .= sequence $self->nelem;
323 0         0 $a->slice('(1)') .= $self;
324              
325 0         0 my $d = _d_point2line( $a, $a->slice(':,(0)'), $a->slice(':,(-1)') );
326              
327 0         0 return $d->maximum_ind;
328             }
329              
330             sub _d_point2line {
331 1     1   1070 my ($self, $p1, $p2) = @_;
332              
333 1         4 for ($self, $p1, $p2) {
334 3 50       14 $_->dim(0) != 2 and
335             carp "point pdl dim(0) != 2";
336             }
337              
338 1         97 return _d_p2l( $self->mv(0,-1)->dog, $p1->mv(0,-1)->dog, $p2->mv(0,-1)->dog );
339             }
340              
341             EOD
342              
343             pp_def('_d_p2l',
344             Pars => 'xc(); yc(); xa(); ya(); xb(); yb(); [o]d()',
345             GenericTypes => $F,
346             HandleBad => 1,
347             Code => '
348             PDL_IF_BAD(if ($ISBAD(xc()) || $ISBAD(yc()) || $ISBAD(xa()) || $ISBAD(ya()) || $ISBAD(xb()) || $ISBAD(yb()) ) { $SETBAD(d()); continue; },)
349             $GENERIC(d) xba, yba;
350             xba = $xb() - $xa();
351             yba = $yb() - $ya();
352             $d() = fabs( xba * ($ya() - $yc()) - ($xa() - $xc()) * yba )
353             / hypot( xba, yba );
354             ',
355             Doc => undef,
356             );
357              
358             pp_addpm pp_line_numbers(__LINE__, <<'EOD');
359             =head2 kmeans
360              
361             =for ref
362              
363             Implements classic k-means cluster analysis.
364              
365             =for example
366              
367             $data = grandom(200, 2); # two rows = two dimensions
368             %k = $data->kmeans; # use default of 3 clusters
369             print "$_\t$k{$_}\n" for sort keys %k;
370             $w->plot(
371             (map +(with=>'points', style=>$_+1, ke=>"Cluster ".($_+1),
372             $data->dice_axis(0,which($k{cluster}->slice(",$_")))->dog),
373             0 .. $k{cluster}->dim(1)-1),
374             (map +(with=>'circles', style=>$_+1, ke=>"Centroid ".($_+1), $k{centroid}->slice($_)->dog, 0.1),
375             0 .. $k{centroid}->dim(0)-1),
376             {le=>'tr'},
377             );
378              
379             Given a number of observations with values on a set of variables,
380             kmeans puts the observations into clusters that maximizes within-cluster
381             similarity with respect to the variables. Tries several different random
382             seeding and clustering in parallel. Stops when cluster assignment of the
383             observations no longer changes. Returns the best result in terms of R2
384             from the random-seeding trials.
385              
386             Instead of random seeding, kmeans also accepts manual seeding. This is
387             done by providing a centroid to the function, in which case clustering
388             will proceed from the centroid and there is no multiple tries.
389              
390             There are two distinct advantages from seeding with a centroid compared to
391             seeding with predefined cluster membership of a subset of the observations
392             ie "seeds":
393              
394             =over
395              
396             =item *
397              
398             a centroid could come from a previous study with a different set of observations;
399              
400             =item *
401              
402             a centroid could even be "fictional", or in more proper parlance,
403             an idealized prototype with respect to the actual data. For example,
404             if there are 10 person's ratings of 1 to 5 on 4 movies, ie a ratings
405             pdl of dim [10 obs x 4 var], providing a centroid like
406              
407             [
408             [5 0 0 0]
409             [0 5 0 0]
410             [0 0 5 0]
411             [0 0 0 5]
412             ]
413              
414             will produce 4 clusters of people with each cluster favoring a different
415             one of the 4 movies. Clusters from an idealized centroid may not give the
416             best result in terms of R2, but they sure are a lot more interpretable.
417              
418             =back
419              
420             If clustering has to be done from predefined clusters of seeds, simply
421             calculate the centroid using the B<centroid> function and feed it
422             to kmeans,
423              
424             my ($centroid, $ss) = $rating($iseeds, )->centroid( $seeds_cluster );
425             my %k = $rating->kmeans( { CNTRD=>$centroid } );
426              
427             kmeans supports bad value*.
428              
429             =for options
430              
431             Default options (case insensitive):
432              
433             V => 1, # prints simple status
434             FULL => 0, # returns results for all seeding trials
435              
436             CNTRD => PDL->null, # optional. pdl [clu x var]. disables next 3 opts
437              
438             NTRY => 5, # num of random seeding trials
439             NSEED => 1000, # num of initial seeds, use NSEED up to max obs
440             NCLUS => 3, # num of clusters
441              
442             =for usage
443              
444             Usage:
445              
446             # suppose we have 4 person's ratings on 5 movies
447              
448             pdl> p $rating = ceil( random(4, 5) * 5 )
449             [
450             [3 2 2 3]
451             [2 4 5 4]
452             [5 3 2 3]
453             [3 3 1 5]
454             [4 3 3 2]
455             ]
456              
457             # we want to put the 4 persons into 2 groups
458              
459             pdl> %k = $rating->kmeans( {NCLUS=>2} )
460              
461             # by default prints back options used
462             # as well as info for all tries and iterations
463              
464             CNTRD => Null
465             FULL => 0
466             NCLUS => 3
467             NSEED => 4
468             NTRY => 5
469             V => 1
470             ss total: 20.5
471             iter 0 R2 [0.024390244 0.024390244 0.26829268 0.4796748 0.4796748]
472             iter 1 R2 [0.46341463 0.46341463 0.4796748 0.4796748 0.4796748]
473              
474             pdl> p "$_\t$k{$_}\n" for sort keys %k
475              
476             R2 0.479674796747968
477             centroid # mean ratings for 2 group x 5 movies
478             [
479             [ 3 2.3333333]
480             [ 2 4.3333333]
481             [ 5 2.6666667]
482             [ 3 3]
483             [ 4 2.6666667]
484             ]
485              
486             cluster # 4 persons' membership in two groups
487             [
488             [1 0 0 0]
489             [0 1 1 1]
490             ]
491              
492             n [1 3] # cluster size
493             ss
494             [
495             [ 0 0.66666667]
496             [ 0 0.66666667]
497             [ 0 0.66666667]
498             [ 0 8]
499             [ 0 0.66666667]
500             ]
501              
502             Now, for the valiant, kmeans is broadcastable. Say you gathered 10
503             persons' ratings on 5 movies from 2 countries, so the data is dim
504             [10,5,2], and you want to put the 10 persons from each country into
505             3 clusters, just specify NCLUS => [3,1], and there you have it. The
506             key is for NCLUS to include $data->ndims - 1 numbers. The 1 in [3,1]
507             turns into a dummy dim, so the 3-cluster operation is repeated on both
508             countries. Similarly, when seeding, CNTRD needs to have ndims that at
509             least match the data ndims. Extra dims in CNTRD will lead to broadcasting
510             (convenient if you want to try out different centroid locations,
511             for example, but you will have to hand pick the best result). See
512             F<t/stats_kmeans.t> for examples w 3D and 4D data.
513              
514             *With bad value, R2 is based on average of variances instead of sum squared error.
515              
516             =cut
517              
518             *kmeans = \&PDL::kmeans;
519             sub PDL::kmeans {
520 5     5 0 32988 my ($self, $opt) = @_;
521 5         53 my %opt = (
522             V => 1, # prints simple status
523             FULL => 0, # returns results for all seeding trials
524              
525             CNTRD => PDL->null, # optional. pdl [clu x var]. disables next 3 opts
526              
527             NTRY => 5, # num of random seeding trials
528             NSEED => 1000, # num of initial seeds, use NSEED up to max obs
529             NCLUS => 3, # num of clusters
530             );
531 5 50       119 if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
  5         45  
532 5 100 66     51 if (defined($opt{CNTRD}) and $opt{CNTRD}->nelem) {
533 1         3 $opt{NTRY} = 1;
534 1         4 $opt{NSEED} = $self->dim(0);
535 1         3 $opt{NCLUS} = $opt{CNTRD}->dim(0);
536             }
537             else {
538 4         19 $opt{NSEED} = pdl($self->dim(0), $opt{NSEED})->min->sclr;
539             }
540 5   33     432 $opt{V} and print "$_\t=> $opt{$_}\n" for sort keys %opt;
541              
542 5 100       26 my $ss_ms = $self->badflag? 'ms' : 'ss';
543 5 100       247 my $ss_total
544             = $self->badflag? $self->var->average : $self->ss->sumover;
545 5 50       32 $opt{V} and print "overall $ss_ms:\t$ss_total\n";
546              
547 5         10 my ($centroid, $ss_cv, $R2, $clus_this, $clus_last);
548              
549             # NTRY made into extra dim in $cluster for broadcasting
550 5 100       23 my @nclus = (ref $opt{NCLUS} eq 'ARRAY')? @{$opt{NCLUS}} : ($opt{NCLUS});
  2         7  
551             $clus_this
552             = (defined($opt{CNTRD}) and $opt{CNTRD}->nelem) ?
553             $self->assign( $opt{CNTRD}->dummy(-1) ) # put dummy(-1) to match NTRY
554             : random_cluster($opt{NSEED}, @nclus, $opt{NTRY} )
555 5 100 66     62 ;
556              
557 5         105 ($centroid, $ss_cv) = $self->slice([0,$opt{NSEED} - 1])->centroid( $clus_this );
558             # now obs in $clus_this matches $self
559 5         835 $clus_this = $self->assign( $centroid );
560 5         336 ($centroid, $ss_cv) = $self->centroid( $clus_this );
561              
562 5         22 my $iter = 0;
563 5         12 do {
564 9 100       1805 $R2 = $self->badflag? 1 - $ss_cv->average->average / $ss_total
565             : 1 - $ss_cv->sumover->sumover / $ss_total
566             ;
567 9 50       627 $opt{V} and print join(' ',('iter', $iter++, 'R2', $R2)) . "\n";
568              
569 9         22 $clus_last = $clus_this;
570              
571 9         379 $clus_this = $self->assign( $centroid );
572 9         705 ($centroid, $ss_cv) = $self->centroid( $clus_this );
573             }
574             while ( any long(abs($clus_this - $clus_last))->sumover->sumover > 0 );
575              
576             $opt{FULL} and
577             return (
578 5 50       1761 centroid => PDL::squeeze( $centroid ),
579             cluster => PDL::squeeze( $clus_this ),
580             n => PDL::squeeze( $clus_this )->sumover,
581             R2 => PDL::squeeze( $R2 ),
582             $ss_ms => PDL::squeeze( $ss_cv ),
583             );
584              
585             # xchg/mv(-1,0) leaves it as was if single dim--unlike transpose
586 5         126 my $i_best = $R2->mv(-1,0)->maximum_ind;
587              
588 5 100       45 $R2->getndims == 1 and
589             return (
590             centroid => $centroid->dice_axis(-1,$i_best)->sever->squeeze,
591             cluster => $clus_this->dice_axis(-1,$i_best)->sever->squeeze,
592             n => $clus_this->dice_axis(-1,$i_best)->sever->squeeze->sumover,
593             R2 => $R2->dice_axis(-1,$i_best)->sever->squeeze,
594             $ss_ms => $ss_cv->dice_axis(-1,$i_best)->sever->squeeze,
595             );
596              
597             # now for broadcasting beyond 2D data
598              
599             # can't believe i'm using a perl loop :P
600              
601 3         14 $i_best = $i_best->flat->sever;
602 3         53 my @i_best = map { $opt{NTRY} * $_ + $i_best->slice("($_)") }
  10         489  
603             0 .. $i_best->nelem - 1;
604              
605 3         151 my @shapes;
606 3         9 for ($centroid, $clus_this, $R2) {
607 9         28 my @dims = $_->dims;
608 9         13 pop @dims;
609 9         24 push @shapes, \@dims;
610             }
611              
612 3         988 $clus_this = $clus_this->mv(-1,2)->clump(2..$clus_this->ndims-1)->dice_axis(2,\@i_best)->sever->reshape( @{ $shapes[1] } )->sever,
613              
614             return (
615             centroid =>
616 3         1056 $centroid->mv(-1,2)->clump(2..$centroid->ndims-1)->dice_axis(2,\@i_best)->sever->reshape( @{ $shapes[0] } )->sever,
617              
618             cluster => $clus_this,
619             n => $clus_this->sumover,
620              
621             R2 =>
622 3         1050 $R2->mv(-1,0)->clump(0..$R2->ndims-1)->dice_axis(0,\@i_best)->sever->reshape( @{ $shapes[2] } )->sever,
623              
624             $ss_ms =>
625 3         40 $ss_cv->mv(-1,2)->clump(2..$ss_cv->ndims-1)->dice_axis(2,\@i_best)->sever->reshape( @{ $shapes[0] } )->sever,
  3         1025  
626             );
627             }
628              
629             =head1 METHODS
630              
631             =head2 iv_cluster
632              
633             =for ref
634              
635             Turns an independent variable into a cluster pdl. Returns cluster pdl and level-to-pdl_index mapping in list context and cluster pdl only in scalar context.
636              
637             This is the method used for mean and var in anova. The difference between iv_cluster and dummy_code is that iv_cluster returns pdl dim [obs x level] whereas dummy_code returns pdl dim [obs x (level - 1)].
638              
639             =for usage
640              
641             Usage:
642              
643             pdl> @bake = qw( y y y n n n )
644              
645             # accepts @ ref or 1d pdl
646              
647             pdl> p $bake = iv_cluster( \@bake )
648             [
649             [1 1 1 0 0 0]
650             [0 0 0 1 1 1]
651             ]
652              
653             pdl> p $rating = sequence 6
654             [0 1 2 3 4 5]
655              
656             pdl> p $rating->centroid( $bake )
657             # mean for each iv level
658             [
659             [1 4]
660             ]
661             # ss
662             [
663             [2 2]
664             ]
665              
666             =cut
667              
668             *iv_cluster = \&PDL::iv_cluster;
669             sub PDL::iv_cluster {
670 23     23 0 305527 my ($var_ref) = @_;
671 23         112 my ($var, $map_ref) = PDL::Stats::Basic::code_ivs( $var_ref );
672 23         127 my $var_a = yvals( short, $var->nelem, $var->max->sclr + 1 ) == $var;
673 23 100       3180 $var_a = $var_a->setbadif( $var->isbad ) if $var->badflag;
674 23 50       184 wantarray ? ($var_a, $map_ref) : $var_a;
675             }
676              
677             =head2 pca_cluster
678              
679             Assign variables to components ie clusters based on pca loadings or
680             scores. One way to seed kmeans (see Ding & He, 2004, and Su & Dy, 2004
681             for other ways of using pca with kmeans). Variables are assigned to
682             their most associated component. Note that some components may not have
683             any variable that is most associated with them, so the returned number
684             of clusters may be smaller than NCOMP.
685              
686             Default options (case insensitive):
687              
688             V => 1,
689             ABS => 1, # high pos and neg loadings on a comp in same cluster
690             NCOMP => undef, # max number of components to consider. determined by
691             # scree plot black magic if not specified
692             PLOT => 0, # pca scree plot with cutoff at NCOMP
693             WIN => undef, # pass pgswin object for more plotting control
694              
695             Usage:
696              
697             # say we need to cluster a group of documents
698             # $data is pdl dim [word x doc]
699             ($data, $idd, $idw) = get_data 'doc_word_info.txt';
700              
701             pdl> %p = $data->pca;
702             # $cluster is pdl mask dim [doc x ncomp]
703             pdl> $cluster = $p{loading}->pca_cluster;
704              
705             # pca clusters var while kmeans clusters obs. hence transpose
706             pdl> ($m, $ss) = $data->transpose->centroid( $cluster );
707             pdl> %k = $data->transpose->kmeans( { cntrd=>$m } );
708              
709             # take a look at cluster 0 doc ids
710             pdl> p join("\n", @$idd[ list which $k{cluster}->( ,0) ]);
711              
712             =cut
713              
714             *pca_cluster = \&PDL::pca_cluster;
715             sub PDL::pca_cluster {
716 1     1 0 3610 my ($self, $opt) = @_;
717              
718 1         8 my %opt = (
719             V => 1,
720             ABS => 1, # high pos and neg loadings on a comp in same cluster
721             NCOMP => undef, # max number of components to consider. determined by
722             # scree plot black magic if not specified
723             PLOT => 0, # pca scree plot with cutoff at NCOMP
724             WIN => undef, # pass pgswin object for more plotting control
725             );
726 1 50       5 if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
  1         8  
727              
728 1         8 my $var = sumover($self ** 2) / $self->dim(0);
729 1 50       80 if (!$opt{NCOMP}) {
730             # here's the black magic part
731 0 0       0 my $comps = ($self->dim(1) > 300)? int($self->dim(1) * .1)
732             : pdl($self->dim(1), 30)->min
733             ;
734 0         0 $var = $var->slice([0,$comps-1])->sever;
735 0         0 $opt{NCOMP} = _scree_ind( $var );
736             }
737 1 50       4 $opt{PLOT} and do {
738 0         0 require PDL::Stats::GLM;
739 0         0 $var->plot_screes({NCOMP=>$var->dim(0), CUT=>$opt{NCOMP}, WIN=>$opt{WIN}});
740             };
741              
742 1         8 my $c = $self->slice(':',[0,$opt{NCOMP}-1])->transpose->abs->maximum_ind;
743              
744 1 50       121 if ($opt{ABS}) {
745 1         5 $c = $c->iv_cluster;
746             }
747             else {
748 0 0       0 my @c = map { ($self->slice($_,$c->slice($_)) >= 0)? $c->slice($_)*2 : $c->slice($_)*2 + 1 }
  0         0  
749             ( 0 .. $c->dim(0)-1 );
750 0         0 $c = iv_cluster( \@c );
751             }
752 1 50       5 $opt{V} and print "cluster membership mask as " . $c->info . "\n";
753              
754 1         7 return $c;
755             }
756              
757             =head1 REFERENCES
758              
759             Ding, C., & He, X. (2004). K-means clustering via principal component analysis. Proceedings of the 21st International Conference on Machine Learning, 69, 29.
760              
761             Su, T., & Dy, J. (2004). A deterministic method for initializing K-means clustering. 16th IEEE International Conference on Tools with Artificial Intelligence, 784-786.
762              
763             Romesburg, H.C. (1984). Cluster Analysis for Researchers. NC: Lulu Press.
764              
765             Wikipedia (retrieved June, 2009). K-means clustering. http://en.wikipedia.org/wiki/K-means_algorithm
766              
767             =head1 AUTHOR
768              
769             Copyright (C) 2009 Maggie J. Xiong
770              
771             All rights reserved. There is no warranty. You are allowed to redistribute this software / documentation as described in the file COPYING in the PDL distribution.
772              
773             =cut
774              
775             EOD
776              
777             pp_done();