File Coverage

lib/PDL/Stats/Kmeans.pd
Criterion Covered Total %
statement 84 100 84.0
branch 27 44 61.3
condition 5 9 55.5
subroutine 10 11 90.9
pod 1 4 25.0
total 127 168 75.6


line stmt bran cond sub pod time code
1             use strict;
2             use warnings;
3              
4             my $F = [qw(F D)];
5              
6             pp_add_exported('', 'random_cluster', 'iv_cluster');
7              
8             pp_addpm({At=>'Top'}, <<'EOD');
9 3     3   27  
  3         7  
  3         193  
10 3     3   19 use strict;
  3         6  
  3         206  
11 3     3   18 use warnings;
  3         5  
  3         242  
12 3     3   18 use Carp;
  3         22  
  3         29  
13 3     3   8181 use PDL::LiteF;
  3         7  
  3         22  
14             use PDL::Stats::Basic;
15              
16             =head1 NAME
17              
18             PDL::Stats::Kmeans -- classic k-means cluster analysis
19              
20             =head1 DESCRIPTION
21              
22             Assumes that we have data pdl dim [observation, variable] and the goal is to put observations into clusters based on their values on the variables. The terms "observation" and "variable" are quite arbitrary but serve as a reminder for "that which is being clustered" and "that which is used to cluster".
23              
24             The terms FUNCTIONS and METHODS are arbitrarily used to refer to methods that are broadcastable and methods that are non-broadcastable, respectively.
25              
26             =head1 SYNOPSIS
27              
28             Implement a basic k-means procedure,
29              
30             use PDL::LiteF;
31             use PDL::Stats;
32              
33             my ($data, $idv, $ido) = rtable( $file );
34             # or generate random data:
35             $data = grandom(200, 2); # two vars as below
36              
37             my ($cluster, $centroid, $ss_centroid, $cluster_last);
38              
39             # start out with 8 random clusters
40             $cluster = random_cluster( $data->dim(0), 8 );
41             # iterate to minimize total ss
42             # stop when no more changes in cluster membership
43             do {
44             $cluster_last = $cluster;
45             ($centroid, $ss_centroid) = $data->centroid( $cluster );
46             $cluster = $data->assign( $centroid );
47             } while sum(abs($cluster - $cluster_last)) > 0;
48              
49             or, use the B<kmeans> function provided here,
50              
51             my %k = $data->kmeans( \%opt );
52             print "$_\t$k{$_}\n" for sort keys %k;
53              
54             plot the clusters if there are only 2 vars in $data,
55              
56             use PDL::Graphics::Simple;
57              
58             my ($win, $c);
59             $win = pgswin();
60             $win->plot(map +(with=>'points', $data->dice_axis(0,which($k{cluster}->(,$_)))->dog), 0 .. $k{cluster}->dim(1)-1);
61              
62             =cut
63              
64             EOD
65              
66             pp_addhdr('
67             #include <math.h>
68             #include <stdlib.h>
69             #include <time.h>
70              
71             '
72             );
73              
74              
75             pp_addpm pp_line_numbers(__LINE__, <<'EOD' );
76              
77             =head2 random_cluster
78              
79             =for ref
80              
81             Creates masks for random mutually exclusive clusters. Accepts two
82             parameters, num_obs and num_cluster. Extra parameter turns into extra
83             dim in mask. May loop a long time if num_cluster approaches num_obs
84             because empty cluster is not allowed.
85              
86             =for usage
87              
88             my $cluster = random_cluster( $num_obs, $num_cluster );
89              
90             =cut
91              
92             # can't be called on pdl
93             sub random_cluster {
94 4     4 1 13 my ($obs, $clu, @extra) = @_;
95             # extra param in @_ made into extra dim
96 4         15 my $cluster = zeroes short(), @_;
97 4         102 do {
98 6         502 (random($obs, @extra) * $obs)->_random_cluster($cluster);
99             } while (PDL::any $cluster->sumover == 0 );
100 4         1075 $cluster;
101             }
102              
103             EOD
104              
105             pp_def('_random_cluster',
106             Pars => 'rand_data(o); [o]b(o,c)',
107             GenericTypes => ['S'],
108             RedoDimsCode => 'if ($SIZE(c) > $SIZE(o)) $CROAK("more cluster than obs!");',
109             Code => '
110             PDL_Indx nc = $SIZE(c);
111             broadcastloop %{
112             loop (o) %{
113             PDL_Indx cl = $rand_data() % nc;
114             loop (c) %{
115             $b() = (c == cl) ? 1 : 0;
116             %}
117             %}
118             %}
119             ',
120             Doc => undef,
121             );
122              
123             pp_def('which_cluster',
124             Pars => 'short a(o,c); indx [o]b(o)',
125             GenericTypes => [qw(U L)],
126             HandleBad => 1,
127             Code => '
128             loop(o) %{
129             PDL_Indx cl=-1;
130             loop(c) %{
131             PDL_IF_BAD(if ($ISBAD($a())) continue;,)
132             if (!$a()) continue;
133             cl = c;
134             break;
135             %}
136             PDL_IF_BAD(if (cl==-1) { $SETBAD(b()); continue; },)
137             $b() = cl;
138             %}
139             ',
140             Doc => 'Given cluster mask dim [obs x clu], returns the cluster index to which an obs belong.
141              
142             Does not support overlapping clusters. If an obs has TRUE value for multiple clusters, the returned index is the first cluster the obs belongs to. If an obs has no TRUE value for any cluster, the return val is set to -1 or BAD if the input mask has badflag set.
143              
144             Usage:
145              
146             # create a cluster mask dim [obs x clu]
147             pdl> p $c_mask = iv_cluster [qw(a a b b c c)]
148             [
149             [1 1 0 0 0 0]
150             [0 0 1 1 0 0]
151             [0 0 0 0 1 1]
152             ]
153             # get cluster membership list dim [obs]
154             pdl> p $ic = $c_mask->which_cluster
155             [0 0 1 1 2 2]
156             ',
157             );
158              
159             pp_def('assign',
160             Pars => 'data(o,v); centroid(c,v); short [o]cluster(o,c)',
161             GenericTypes => $F,
162             HandleBad => 1,
163             Code => '
164             PDL_Indx cl = 0;
165             loop (o) %{
166             $GENERIC(centroid) ssmin = -1;
167             loop (c) %{
168             $GENERIC(centroid) ssc = 0;
169             PDL_Indx nvc = 0;
170             loop (v) %{
171             PDL_IF_BAD(if ($ISBAD($data()) || $ISBAD($centroid())) continue;,)
172             $GENERIC() diff = $data() - $centroid();
173             ssc += diff*diff;
174             nvc ++;
175             %}
176             if (nvc) {
177             ssc /= nvc;
178             }
179             else {
180             /* taking advantage of the fact that 1st valid ssmin takes precedence */
181             /* so ssc has no effect if there is already ssmin. or it is -1 */
182             ssc = ssmin;
183             }
184             /* notice that if multiple ssc == ssmin the 1st is taken as cluster */
185             if (ssmin < 0 || ssmin > ssc) {
186             cl = c;
187             ssmin = ssc;
188             }
189             %}
190             loop (c) %{
191             PDL_IF_BAD(if (ssmin < 0) { $SETBAD($cluster()); continue; },)
192             $cluster() = (c == cl)? 1 : 0;
193             %}
194             %}
195             ',
196             Doc => '
197             =for ref
198              
199             Takes data pdl dim [obs x var] and centroid pdl dim [cluster x var] and returns mask dim [obs x cluster] to cluster membership. An obs is assigned to the first cluster with the smallest distance (ie sum squared error) to cluster centroid. With bad value, obs is assigned by smallest mean squared error across variables.
200              
201             =for usage
202              
203             pdl> p $centroid = xvals 2, 3
204             [
205             [0 1]
206             [0 1]
207             [0 1]
208             ]
209              
210             pdl> p $b = qsort( random 4, 3 )
211             [
212             [0.022774068 0.032513883 0.13890034 0.30942479]
213             [ 0.16943853 0.50262636 0.56251531 0.7152271]
214             [ 0.23964483 0.59932745 0.60967495 0.78452117]
215             ]
216             # notice that 1st 3 obs in $b are on average closer to 0
217             # and last obs closer to 1
218             pdl> p $b->assign( $centroid )
219             [
220             [1 1 1 0] # cluster 0 membership
221             [0 0 0 1] # cluster 1 membership
222             ]
223             ',
224             );
225              
226             pp_def('centroid',
227             Pars => 'data(o,v); cluster(o,c); [o]m(c,v); [o]ss(c,v)',
228             GenericTypes => $F,
229             HandleBad => 1,
230             Code => '
231             $GENERIC(m) s[ $SIZE(c) ][ $SIZE(v) ], s2[ $SIZE(c) ][ $SIZE(v) ];
232             PDL_Indx n[ $SIZE(c) ]PDL_IF_BAD([ $SIZE(v) ],);
233             loop (c) %{
234             loop (v) %{
235             s[c][v] = 0.0;
236             s2[c][v] = 0.0;
237             PDL_IF_BAD(n[c][v] = 0;,)
238             %}
239             PDL_IF_BAD(,n[c] = 0;)
240             loop (o) %{
241             PDL_IF_BAD(if ($ISBAD($cluster())) continue;,)
242             if (!$cluster()) continue;
243             PDL_IF_BAD(,n[c]++;)
244             loop (v) %{
245             PDL_IF_BAD(if ($ISBAD($data())) continue;,)
246             s[c][v] += $data();
247             s2[c][v] += $data()*$data();
248             PDL_IF_BAD(n[c][v]++;,)
249             %}
250             %}
251             loop (v) %{
252             if (n[c]PDL_IF_BAD([v],)) {
253             $m() = s[c][v] / n[c]PDL_IF_BAD([v],);
254             $ss() = s2[c][v] PDL_IF_BAD(/ n[c][v],) - pow(s[c][v] / n[c]PDL_IF_BAD([v],), 2) PDL_IF_BAD(,* n[c]);
255             }
256             else {
257             $m() = 0;
258             $ss() = 0;
259             /* $CROAK("please make sure there is no empty cluster!"); */
260             }
261             %}
262             %}
263             ',
264             Doc => '
265             =for ref
266              
267             Takes data dim [obs x var] and mask dim [obs x cluster], returns mean and ss (ms when data contains bad values) dim [cluster x var], using data where mask == 1. Multiple cluster membership for an obs is okay. If a cluster is empty all means and ss are set to zero for that cluster.
268              
269             =for usage
270              
271             # data is 10 obs x 3 var
272             pdl> p $d = sequence 10, 3
273             [
274             [ 0 1 2 3 4 5 6 7 8 9]
275             [10 11 12 13 14 15 16 17 18 19]
276             [20 21 22 23 24 25 26 27 28 29]
277             ]
278             # create two clusters by value on 1st var
279             pdl> p $a = $d( ,(0)) <= 5
280             [1 1 1 1 1 1 0 0 0 0]
281              
282             pdl> p $b = $d( ,(0)) > 5
283             [0 0 0 0 0 0 1 1 1 1]
284              
285             pdl> p $c = cat $a, $b
286             [
287             [1 1 1 1 1 1 0 0 0 0]
288             [0 0 0 0 0 0 1 1 1 1]
289             ]
290              
291             pdl> p $d->centroid($c)
292             # mean for 2 cluster x 3 var
293             [
294             [ 2.5 7.5]
295             [12.5 17.5]
296             [22.5 27.5]
297             ]
298             # ss for 2 cluster x 3 var
299             [
300             [17.5 5]
301             [17.5 5]
302             [17.5 5]
303             ]
304             ',
305             );
306              
307             pp_addpm pp_line_numbers(__LINE__, <<'EOD');
308              
309             sub _scree_ind {
310             # use as scree cutoff the point with max distance to the line formed
311             # by the 1st and last points in $self
312             # it's a heuristic--whether we can get "good" results depends on
313             # the number of components in $self.
314              
315 0     0   0 my ($self) = @_;
316              
317 0         0 $self = $self->squeeze;
318 0 0       0 $self->ndims > 1 and
319             croak "1D pdl only please";
320              
321 0         0 my $a = zeroes 2, $self->nelem;
322 0         0 $a->slice('(0)') .= sequence $self->nelem;
323 0         0 $a->slice('(1)') .= $self;
324              
325 0         0 my $d = _d_point2line( $a, $a->slice(':,(0)'), $a->slice(':,(-1)') );
326              
327 0         0 return $d->maximum_ind;
328             }
329              
330             sub _d_point2line {
331 1     1   1070 my ($self, $p1, $p2) = @_;
332              
333 1         4 for ($self, $p1, $p2) {
334 3 50       14 $_->dim(0) != 2 and
335             carp "point pdl dim(0) != 2";
336             }
337              
338 1         97 return _d_p2l( $self->mv(0,-1)->dog, $p1->mv(0,-1)->dog, $p2->mv(0,-1)->dog );
339             }
340              
341             EOD
342              
343             pp_def('_d_p2l',
344             Pars => 'xc(); yc(); xa(); ya(); xb(); yb(); [o]d()',
345             GenericTypes => $F,
346             HandleBad => 1,
347             Code => '
348             PDL_IF_BAD(if ($ISBAD(xc()) || $ISBAD(yc()) || $ISBAD(xa()) || $ISBAD(ya()) || $ISBAD(xb()) || $ISBAD(yb()) ) { $SETBAD(d()); continue; },)
349             $GENERIC(d) xba, yba;
350             xba = $xb() - $xa();
351             yba = $yb() - $ya();
352             $d() = fabs( xba * ($ya() - $yc()) - ($xa() - $xc()) * yba )
353             / hypot( xba, yba );
354             ',
355             Doc => undef,
356             );
357              
358             pp_addpm pp_line_numbers(__LINE__, <<'EOD');
359             =head2 kmeans
360              
361             =for ref
362              
363             Implements classic k-means cluster analysis.
364              
365             =for example
366              
367             $data = grandom(200, 2); # two rows = two dimensions
368             %k = $data->kmeans; # use default of 3 clusters
369             print "$_\t$k{$_}\n" for sort keys %k;
370             $w->plot(
371             (map +(with=>'points', style=>$_+1, ke=>"Cluster ".($_+1),
372             $data->dice_axis(0,which($k{cluster}->slice(",$_")))->dog),
373             0 .. $k{cluster}->dim(1)-1),
374             (map +(with=>'circles', style=>$_+1, ke=>"Centroid ".($_+1), $k{centroid}->slice($_)->dog, 0.1),
375             0 .. $k{centroid}->dim(0)-1),
376             {le=>'tr'},
377             );
378              
379             Given a number of observations with values on a set of variables,
380             kmeans puts the observations into clusters that maximizes within-cluster
381             similarity with respect to the variables. Tries several different random
382             seeding and clustering in parallel. Stops when cluster assignment of the
383             observations no longer changes. Returns the best result in terms of R2
384             from the random-seeding trials.
385              
386             Instead of random seeding, kmeans also accepts manual seeding. This is
387             done by providing a centroid to the function, in which case clustering
388             will proceed from the centroid and there is no multiple tries.
389              
390             There are two distinct advantages from seeding with a centroid compared to
391             seeding with predefined cluster membership of a subset of the observations
392             ie "seeds":
393              
394             =over
395              
396             =item *
397              
398             a centroid could come from a previous study with a different set of observations;
399              
400             =item *
401              
402             a centroid could even be "fictional", or in more proper parlance,
403             an idealized prototype with respect to the actual data. For example,
404             if there are 10 person's ratings of 1 to 5 on 4 movies, ie a ratings
405             pdl of dim [10 obs x 4 var], providing a centroid like
406              
407             [
408             [5 0 0 0]
409             [0 5 0 0]
410             [0 0 5 0]
411             [0 0 0 5]
412             ]
413              
414             will produce 4 clusters of people with each cluster favoring a different
415             one of the 4 movies. Clusters from an idealized centroid may not give the
416             best result in terms of R2, but they sure are a lot more interpretable.
417              
418             =back
419              
420             If clustering has to be done from predefined clusters of seeds, simply
421             calculate the centroid using the B<centroid> function and feed it
422             to kmeans,
423              
424             my ($centroid, $ss) = $rating($iseeds, )->centroid( $seeds_cluster );
425             my %k = $rating->kmeans( { CNTRD=>$centroid } );
426              
427             kmeans supports bad value*.
428              
429             =for options
430              
431             Default options (case insensitive):
432              
433             V => 1, # prints simple status
434             FULL => 0, # returns results for all seeding trials
435              
436             CNTRD => PDL->null, # optional. pdl [clu x var]. disables next 3 opts
437              
438             NTRY => 5, # num of random seeding trials
439             NSEED => 1000, # num of initial seeds, use NSEED up to max obs
440             NCLUS => 3, # num of clusters
441              
442             =for usage
443              
444             Usage:
445              
446             # suppose we have 4 person's ratings on 5 movies
447              
448             pdl> p $rating = ceil( random(4, 5) * 5 )
449             [
450             [3 2 2 3]
451             [2 4 5 4]
452             [5 3 2 3]
453             [3 3 1 5]
454             [4 3 3 2]
455             ]
456              
457             # we want to put the 4 persons into 2 groups
458              
459             pdl> %k = $rating->kmeans( {NCLUS=>2} )
460              
461             # by default prints back options used
462             # as well as info for all tries and iterations
463              
464             CNTRD => Null
465             FULL => 0
466             NCLUS => 3
467             NSEED => 4
468             NTRY => 5
469             V => 1
470             ss total: 20.5
471             iter 0 R2 [0.024390244 0.024390244 0.26829268 0.4796748 0.4796748]
472             iter 1 R2 [0.46341463 0.46341463 0.4796748 0.4796748 0.4796748]
473              
474             pdl> p "$_\t$k{$_}\n" for sort keys %k
475              
476             R2 0.479674796747968
477             centroid # mean ratings for 2 group x 5 movies
478             [
479             [ 3 2.3333333]
480             [ 2 4.3333333]
481             [ 5 2.6666667]
482             [ 3 3]
483             [ 4 2.6666667]
484             ]
485              
486             cluster # 4 persons' membership in two groups
487             [
488             [1 0 0 0]
489             [0 1 1 1]
490             ]
491              
492             n [1 3] # cluster size
493             ss
494             [
495             [ 0 0.66666667]
496             [ 0 0.66666667]
497             [ 0 0.66666667]
498             [ 0 8]
499             [ 0 0.66666667]
500             ]
501              
502             Now, for the valiant, kmeans is broadcastable. Say you gathered 10
503             persons' ratings on 5 movies from 2 countries, so the data is dim
504             [10,5,2], and you want to put the 10 persons from each country into
505             3 clusters, just specify NCLUS => [3,1], and there you have it. The
506             key is for NCLUS to include $data->ndims - 1 numbers. The 1 in [3,1]
507             turns into a dummy dim, so the 3-cluster operation is repeated on both
508             countries. Similarly, when seeding, CNTRD needs to have ndims that at
509             least match the data ndims. Extra dims in CNTRD will lead to broadcasting
510             (convenient if you want to try out different centroid locations,
511             for example, but you will have to hand pick the best result). See
512             F<t/stats_kmeans.t> for examples w 3D and 4D data.
513              
514             *With bad value, R2 is based on average of variances instead of sum squared error.
515              
516             =cut
517              
518             *kmeans = \&PDL::kmeans;
519             sub PDL::kmeans {
520 5     5 0 32988 my ($self, $opt) = @_;
521 5         53 my %opt = (
522             V => 1, # prints simple status
523             FULL => 0, # returns results for all seeding trials
524              
525             CNTRD => PDL->null, # optional. pdl [clu x var]. disables next 3 opts
526              
527             NTRY => 5, # num of random seeding trials
528             NSEED => 1000, # num of initial seeds, use NSEED up to max obs
529             NCLUS => 3, # num of clusters
530             );
531 5 50       119 if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
  5         45  
532 5 100 66     51 if (defined($opt{CNTRD}) and $opt{CNTRD}->nelem) {
533 1         3 $opt{NTRY} = 1;
534 1         4 $opt{NSEED} = $self->dim(0);
535 1         3 $opt{NCLUS} = $opt{CNTRD}->dim(0);
536             }
537             else {
538 4         19 $opt{NSEED} = pdl($self->dim(0), $opt{NSEED})->min->sclr;
539             }
540 5   33     432 $opt{V} and print "$_\t=> $opt{$_}\n" for sort keys %opt;
541              
542 5 100       26 my $ss_ms = $self->badflag? 'ms' : 'ss';
543 5 100       247 my $ss_total
544             = $self->badflag? $self->var->average : $self->ss->sumover;
545 5 50       32 $opt{V} and print "overall $ss_ms:\t$ss_total\n";
546              
547 5         10 my ($centroid, $ss_cv, $R2, $clus_this, $clus_last);
548              
549             # NTRY made into extra dim in $cluster for broadcasting
550 5 100       23 my @nclus = (ref $opt{NCLUS} eq 'ARRAY')? @{$opt{NCLUS}} : ($opt{NCLUS});
  2         7  
551             $clus_this
552             = (defined($opt{CNTRD}) and $opt{CNTRD}->nelem) ?
553             $self->assign( $opt{CNTRD}->dummy(-1) ) # put dummy(-1) to match NTRY
554             : random_cluster($opt{NSEED}, @nclus, $opt{NTRY} )
555 5 100 66     62 ;
556              
557 5         105 ($centroid, $ss_cv) = $self->slice([0,$opt{NSEED} - 1])->centroid( $clus_this );
558             # now obs in $clus_this matches $self
559 5         835 $clus_this = $self->assign( $centroid );
560 5         336 ($centroid, $ss_cv) = $self->centroid( $clus_this );
561              
562 5         22 my $iter = 0;
563 5         12 do {
564 9 100       1805 $R2 = $self->badflag? 1 - $ss_cv->average->average / $ss_total
565             : 1 - $ss_cv->sumover->sumover / $ss_total
566             ;
567 9 50       627 $opt{V} and print join(' ',('iter', $iter++, 'R2', $R2)) . "\n";
568              
569 9         22 $clus_last = $clus_this;
570              
571 9         379 $clus_this = $self->assign( $centroid );
572 9         705 ($centroid, $ss_cv) = $self->centroid( $clus_this );
573             }
574             while ( any long(abs($clus_this - $clus_last))->sumover->sumover > 0 );
575              
576             $opt{FULL} and
577             return (
578 5 50       1761 centroid => PDL::squeeze( $centroid ),
579             cluster => PDL::squeeze( $clus_this ),
580             n => PDL::squeeze( $clus_this )->sumover,
581             R2 => PDL::squeeze( $R2 ),
582             $ss_ms => PDL::squeeze( $ss_cv ),
583             );
584              
585             # xchg/mv(-1,0) leaves it as was if single dim--unlike transpose
586 5         126 my $i_best = $R2->mv(-1,0)->maximum_ind;
587              
588 5 100       45 $R2->getndims == 1 and
589             return (
590             centroid => $centroid->dice_axis(-1,$i_best)->sever->squeeze,
591             cluster => $clus_this->dice_axis(-1,$i_best)->sever->squeeze,
592             n => $clus_this->dice_axis(-1,$i_best)->sever->squeeze->sumover,
593             R2 => $R2->dice_axis(-1,$i_best)->sever->squeeze,
594             $ss_ms => $ss_cv->dice_axis(-1,$i_best)->sever->squeeze,
595             );
596              
597             # now for broadcasting beyond 2D data
598              
599             # can't believe i'm using a perl loop :P
600              
601 3         14 $i_best = $i_best->flat->sever;
602 3         53 my @i_best = map { $opt{NTRY} * $_ + $i_best->slice("($_)") }
  10         489  
603             0 .. $i_best->nelem - 1;
604              
605 3         151 my @shapes;
606 3         9 for ($centroid, $clus_this, $R2) {
607 9         28 my @dims = $_->dims;
608 9         13 pop @dims;
609 9         24 push @shapes, \@dims;
610             }
611              
612 3         988 $clus_this = $clus_this->mv(-1,2)->clump(2..$clus_this->ndims-1)->dice_axis(2,\@i_best)->sever->reshape( @{ $shapes[1] } )->sever,
613              
614             return (
615             centroid =>
616 3         1056 $centroid->mv(-1,2)->clump(2..$centroid->ndims-1)->dice_axis(2,\@i_best)->sever->reshape( @{ $shapes[0] } )->sever,
617              
618             cluster => $clus_this,
619             n => $clus_this->sumover,
620              
621             R2 =>
622 3         1050 $R2->mv(-1,0)->clump(0..$R2->ndims-1)->dice_axis(0,\@i_best)->sever->reshape( @{ $shapes[2] } )->sever,
623              
624             $ss_ms =>
625 3         40 $ss_cv->mv(-1,2)->clump(2..$ss_cv->ndims-1)->dice_axis(2,\@i_best)->sever->reshape( @{ $shapes[0] } )->sever,
  3         1025  
626             );
627             }
628              
629             =head1 METHODS
630              
631             =head2 iv_cluster
632              
633             =for ref
634              
635             Turns an independent variable into a cluster pdl. Returns cluster pdl and level-to-pdl_index mapping in list context and cluster pdl only in scalar context.
636              
637             This is the method used for mean and var in anova. The difference between iv_cluster and dummy_code is that iv_cluster returns pdl dim [obs x level] whereas dummy_code returns pdl dim [obs x (level - 1)].
638              
639             =for usage
640              
641             Usage:
642              
643             pdl> @bake = qw( y y y n n n )
644              
645             # accepts @ ref or 1d pdl
646              
647             pdl> p $bake = iv_cluster( \@bake )
648             [
649             [1 1 1 0 0 0]
650             [0 0 0 1 1 1]
651             ]
652              
653             pdl> p $rating = sequence 6
654             [0 1 2 3 4 5]
655              
656             pdl> p $rating->centroid( $bake )
657             # mean for each iv level
658             [
659             [1 4]
660             ]
661             # ss
662             [
663             [2 2]
664             ]
665              
666             =cut
667              
668             *iv_cluster = \&PDL::iv_cluster;
669             sub PDL::iv_cluster {
670 23     23 0 305527 my ($var_ref) = @_;
671 23         112 my ($var, $map_ref) = PDL::Stats::Basic::code_ivs( $var_ref );
672 23         127 my $var_a = yvals( short, $var->nelem, $var->max->sclr + 1 ) == $var;
673 23 100       3180 $var_a = $var_a->setbadif( $var->isbad ) if $var->badflag;
674 23 50       184 wantarray ? ($var_a, $map_ref) : $var_a;
675             }
676              
677             =head2 pca_cluster
678              
679             Assign variables to components ie clusters based on pca loadings or
680             scores. One way to seed kmeans (see Ding & He, 2004, and Su & Dy, 2004
681             for other ways of using pca with kmeans). Variables are assigned to
682             their most associated component. Note that some components may not have
683             any variable that is most associated with them, so the returned number
684             of clusters may be smaller than NCOMP.
685              
686             Default options (case insensitive):
687              
688             V => 1,
689             ABS => 1, # high pos and neg loadings on a comp in same cluster
690             NCOMP => undef, # max number of components to consider. determined by
691             # scree plot black magic if not specified
692             PLOT => 0, # pca scree plot with cutoff at NCOMP
693             WIN => undef, # pass pgswin object for more plotting control
694              
695             Usage:
696              
697             # say we need to cluster a group of documents
698             # $data is pdl dim [word x doc]
699             ($data, $idd, $idw) = get_data 'doc_word_info.txt';
700              
701             pdl> %p = $data->pca;
702             # $cluster is pdl mask dim [doc x ncomp]
703             pdl> $cluster = $p{loading}->pca_cluster;
704              
705             # pca clusters var while kmeans clusters obs. hence transpose
706             pdl> ($m, $ss) = $data->transpose->centroid( $cluster );
707             pdl> %k = $data->transpose->kmeans( { cntrd=>$m } );
708              
709             # take a look at cluster 0 doc ids
710             pdl> p join("\n", @$idd[ list which $k{cluster}->( ,0) ]);
711              
712             =cut
713              
714             *pca_cluster = \&PDL::pca_cluster;
715             sub PDL::pca_cluster {
716 1     1 0 3610 my ($self, $opt) = @_;
717              
718 1         8 my %opt = (
719             V => 1,
720             ABS => 1, # high pos and neg loadings on a comp in same cluster
721             NCOMP => undef, # max number of components to consider. determined by
722             # scree plot black magic if not specified
723             PLOT => 0, # pca scree plot with cutoff at NCOMP
724             WIN => undef, # pass pgswin object for more plotting control
725             );
726 1 50       5 if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
  1         8  
727              
728 1         8 my $var = sumover($self ** 2) / $self->dim(0);
729 1 50       80 if (!$opt{NCOMP}) {
730             # here's the black magic part
731 0 0       0 my $comps = ($self->dim(1) > 300)? int($self->dim(1) * .1)
732             : pdl($self->dim(1), 30)->min
733             ;
734 0         0 $var = $var->slice([0,$comps-1])->sever;
735 0         0 $opt{NCOMP} = _scree_ind( $var );
736             }
737 1 50       4 $opt{PLOT} and do {
738 0         0 require PDL::Stats::GLM;
739 0         0 $var->plot_screes({NCOMP=>$var->dim(0), CUT=>$opt{NCOMP}, WIN=>$opt{WIN}});
740             };
741              
742 1         8 my $c = $self->slice(':',[0,$opt{NCOMP}-1])->transpose->abs->maximum_ind;
743              
744 1 50       121 if ($opt{ABS}) {
745 1         5 $c = $c->iv_cluster;
746             }
747             else {
748 0 0       0 my @c = map { ($self->slice($_,$c->slice($_)) >= 0)? $c->slice($_)*2 : $c->slice($_)*2 + 1 }
  0         0  
749             ( 0 .. $c->dim(0)-1 );
750 0         0 $c = iv_cluster( \@c );
751             }
752 1 50       5 $opt{V} and print "cluster membership mask as " . $c->info . "\n";
753              
754 1         7 return $c;
755             }
756              
757             =head1 REFERENCES
758              
759             Ding, C., & He, X. (2004). K-means clustering via principal component analysis. Proceedings of the 21st International Conference on Machine Learning, 69, 29.
760              
761             Su, T., & Dy, J. (2004). A deterministic method for initializing K-means clustering. 16th IEEE International Conference on Tools with Artificial Intelligence, 784-786.
762              
763             Romesburg, H.C. (1984). Cluster Analysis for Researchers. NC: Lulu Press.
764              
765             Wikipedia (retrieved June, 2009). K-means clustering. http://en.wikipedia.org/wiki/K-means_algorithm
766              
767             =head1 AUTHOR
768              
769             Copyright (C) 2009 Maggie J. Xiong
770              
771             All rights reserved. There is no warranty. You are allowed to redistribute this software / documentation as described in the file COPYING in the PDL distribution.
772              
773             =cut
774              
775             EOD
776              
777             pp_done();