File Coverage

blib/lib/PDL/Stats/Kmeans.pm
Criterion Covered Total %
statement 9 9 100.0
branch n/a
condition n/a
subroutine 3 3 100.0
pod n/a
total 12 12 100.0


line stmt bran cond sub pod time code
1             #
2             # GENERATED WITH PDL::PP from lib/PDL/Stats/Kmeans.pd! Don't modify!
3             #
4             package PDL::Stats::Kmeans;
5              
6             our @EXPORT_OK = qw(random_cluster iv_cluster _random_cluster which_cluster assign centroid _d_p2l );
7             our %EXPORT_TAGS = (Func=>\@EXPORT_OK);
8              
9 3     3   963 use PDL::Core;
  3         7  
  3         27  
10 3     3   1440 use PDL::Exporter;
  3         7  
  3         22  
11 3     3   195 use DynaLoader;
  3         8  
  3         1588  
12              
13              
14            
15             our @ISA = ( 'PDL::Exporter','DynaLoader' );
16             push @PDL::Core::PP, __PACKAGE__;
17             bootstrap PDL::Stats::Kmeans ;
18              
19              
20              
21              
22              
23              
24              
25              
26             #line 8 "lib/PDL/Stats/Kmeans.pd"
27              
28             use strict;
29             use warnings;
30             use Carp;
31             use PDL::LiteF;
32             use PDL::Stats::Basic;
33              
34             =head1 NAME
35              
36             PDL::Stats::Kmeans -- classic k-means cluster analysis
37              
38             =head1 DESCRIPTION
39              
40             Assumes that we have data pdl dim [observation, variable] and the goal is to put observations into clusters based on their values on the variables. The terms "observation" and "variable" are quite arbitrary but serve as a reminder for "that which is being clustered" and "that which is used to cluster".
41              
42             The terms FUNCTIONS and METHODS are arbitrarily used to refer to methods that are broadcastable and methods that are non-broadcastable, respectively.
43              
44             =head1 SYNOPSIS
45              
46             Implement a basic k-means procedure,
47              
48             use PDL::LiteF;
49             use PDL::Stats;
50              
51             my ($data, $idv, $ido) = rtable( $file );
52             # or generate random data:
53             $data = grandom(200, 2); # two vars as below
54              
55             my ($cluster, $centroid, $ss_centroid, $cluster_last);
56              
57             # start out with 8 random clusters
58             $cluster = random_cluster( $data->dim(0), 8 );
59             # iterate to minimize total ss
60             # stop when no more changes in cluster membership
61             do {
62             $cluster_last = $cluster;
63             ($centroid, $ss_centroid) = $data->centroid( $cluster );
64             $cluster = $data->assign( $centroid );
65             } while sum(abs($cluster - $cluster_last)) > 0;
66              
67             or, use the B<kmeans> function provided here,
68              
69             my %k = $data->kmeans( \%opt );
70             print "$_\t$k{$_}\n" for sort keys %k;
71              
72             plot the clusters if there are only 2 vars in $data,
73              
74             use PDL::Graphics::Simple;
75              
76             my ($win, $c);
77             $win = pgswin();
78             $win->plot(map +(with=>'points', $data->dice_axis(0,which($k{cluster}->(,$_)))->dog), 0 .. $k{cluster}->dim(1)-1);
79              
80             =cut
81             #line 82 "lib/PDL/Stats/Kmeans.pm"
82              
83              
84             =head1 FUNCTIONS
85              
86             =cut
87              
88              
89              
90              
91              
92             #line 75 "lib/PDL/Stats/Kmeans.pd"
93              
94             #line 76 "lib/PDL/Stats/Kmeans.pd"
95              
96             =head2 random_cluster
97              
98             =for ref
99              
100             Creates masks for random mutually exclusive clusters. Accepts two
101             parameters, num_obs and num_cluster. Extra parameter turns into extra
102             dim in mask. May loop a long time if num_cluster approaches num_obs
103             because empty cluster is not allowed.
104              
105             =for usage
106              
107             my $cluster = random_cluster( $num_obs, $num_cluster );
108              
109             =cut
110              
111             # can't be called on pdl
112             sub random_cluster {
113             my ($obs, $clu, @extra) = @_;
114             # extra param in @_ made into extra dim
115             my $cluster = zeroes short(), @_;
116             do {
117             (random($obs, @extra) * $obs)->_random_cluster($cluster);
118             } while (PDL::any $cluster->sumover == 0 );
119             $cluster;
120             }
121             #line 122 "lib/PDL/Stats/Kmeans.pm"
122              
123             *_random_cluster = \&PDL::_random_cluster;
124              
125              
126              
127              
128              
129              
130             =head2 which_cluster
131              
132             =for sig
133              
134             Signature: (short a(o,c); indx [o]b(o))
135             Types: (ushort long)
136              
137             =for usage
138              
139             $b = which_cluster($a);
140             which_cluster($a, $b); # all arguments given
141             $b = $a->which_cluster; # method call
142             $a->which_cluster($b);
143              
144             Given cluster mask dim [obs x clu], returns the cluster index to which an obs belong.
145              
146             Does not support overlapping clusters. If an obs has TRUE value for multiple clusters, the returned index is the first cluster the obs belongs to. If an obs has no TRUE value for any cluster, the return val is set to -1 or BAD if the input mask has badflag set.
147              
148             Usage:
149              
150             # create a cluster mask dim [obs x clu]
151             pdl> p $c_mask = iv_cluster [qw(a a b b c c)]
152             [
153             [1 1 0 0 0 0]
154             [0 0 1 1 0 0]
155             [0 0 0 0 1 1]
156             ]
157             # get cluster membership list dim [obs]
158             pdl> p $ic = $c_mask->which_cluster
159             [0 0 1 1 2 2]
160              
161             =pod
162              
163             Broadcasts over its inputs.
164              
165             =for bad
166              
167             C<which_cluster> processes bad values.
168             It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
169              
170             =cut
171              
172              
173              
174              
175             *which_cluster = \&PDL::which_cluster;
176              
177              
178              
179              
180              
181              
182             =head2 assign
183              
184             =for sig
185              
186             Signature: (data(o,v); centroid(c,v); short [o]cluster(o,c))
187             Types: (float double)
188              
189             =for ref
190              
191             Takes data pdl dim [obs x var] and centroid pdl dim [cluster x var] and returns mask dim [obs x cluster] to cluster membership. An obs is assigned to the first cluster with the smallest distance (ie sum squared error) to cluster centroid. With bad value, obs is assigned by smallest mean squared error across variables.
192              
193             =for usage
194              
195             pdl> p $centroid = xvals 2, 3
196             [
197             [0 1]
198             [0 1]
199             [0 1]
200             ]
201              
202             pdl> p $b = qsort( random 4, 3 )
203             [
204             [0.022774068 0.032513883 0.13890034 0.30942479]
205             [ 0.16943853 0.50262636 0.56251531 0.7152271]
206             [ 0.23964483 0.59932745 0.60967495 0.78452117]
207             ]
208             # notice that 1st 3 obs in $b are on average closer to 0
209             # and last obs closer to 1
210             pdl> p $b->assign( $centroid )
211             [
212             [1 1 1 0] # cluster 0 membership
213             [0 0 0 1] # cluster 1 membership
214             ]
215              
216             =pod
217              
218             Broadcasts over its inputs.
219              
220             =for bad
221              
222             C<assign> processes bad values.
223             It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
224              
225             =cut
226              
227              
228              
229              
230             *assign = \&PDL::assign;
231              
232              
233              
234              
235              
236              
237             =head2 centroid
238              
239             =for sig
240              
241             Signature: (data(o,v); cluster(o,c); [o]m(c,v); [o]ss(c,v))
242             Types: (float double)
243              
244             =for ref
245              
246             Takes data dim [obs x var] and mask dim [obs x cluster], returns mean and ss (ms when data contains bad values) dim [cluster x var], using data where mask == 1. Multiple cluster membership for an obs is okay. If a cluster is empty all means and ss are set to zero for that cluster.
247              
248             =for usage
249              
250             # data is 10 obs x 3 var
251             pdl> p $d = sequence 10, 3
252             [
253             [ 0 1 2 3 4 5 6 7 8 9]
254             [10 11 12 13 14 15 16 17 18 19]
255             [20 21 22 23 24 25 26 27 28 29]
256             ]
257             # create two clusters by value on 1st var
258             pdl> p $a = $d( ,(0)) <= 5
259             [1 1 1 1 1 1 0 0 0 0]
260              
261             pdl> p $b = $d( ,(0)) > 5
262             [0 0 0 0 0 0 1 1 1 1]
263              
264             pdl> p $c = cat $a, $b
265             [
266             [1 1 1 1 1 1 0 0 0 0]
267             [0 0 0 0 0 0 1 1 1 1]
268             ]
269              
270             pdl> p $d->centroid($c)
271             # mean for 2 cluster x 3 var
272             [
273             [ 2.5 7.5]
274             [12.5 17.5]
275             [22.5 27.5]
276             ]
277             # ss for 2 cluster x 3 var
278             [
279             [17.5 5]
280             [17.5 5]
281             [17.5 5]
282             ]
283            
284              
285             =pod
286              
287             Broadcasts over its inputs.
288              
289             =for bad
290              
291             C<centroid> processes bad values.
292             It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
293              
294             =cut
295              
296              
297              
298              
299             *centroid = \&PDL::centroid;
300              
301              
302              
303              
304              
305             #line 307 "lib/PDL/Stats/Kmeans.pd"
306              
307             #line 308 "lib/PDL/Stats/Kmeans.pd"
308              
309             sub _scree_ind {
310             # use as scree cutoff the point with max distance to the line formed
311             # by the 1st and last points in $self
312             # it's a heuristic--whether we can get "good" results depends on
313             # the number of components in $self.
314              
315             my ($self) = @_;
316              
317             $self = $self->squeeze;
318             $self->ndims > 1 and
319             croak "1D pdl only please";
320              
321             my $a = zeroes 2, $self->nelem;
322             $a->slice('(0)') .= sequence $self->nelem;
323             $a->slice('(1)') .= $self;
324              
325             my $d = _d_point2line( $a, $a->slice(':,(0)'), $a->slice(':,(-1)') );
326              
327             return $d->maximum_ind;
328             }
329              
330             sub _d_point2line {
331             my ($self, $p1, $p2) = @_;
332              
333             for ($self, $p1, $p2) {
334             $_->dim(0) != 2 and
335             carp "point pdl dim(0) != 2";
336             }
337              
338             return _d_p2l( $self->mv(0,-1)->dog, $p1->mv(0,-1)->dog, $p2->mv(0,-1)->dog );
339             }
340             #line 341 "lib/PDL/Stats/Kmeans.pm"
341              
342             *_d_p2l = \&PDL::_d_p2l;
343              
344              
345              
346              
347              
348             #line 358 "lib/PDL/Stats/Kmeans.pd"
349              
350             #line 359 "lib/PDL/Stats/Kmeans.pd"
351             =head2 kmeans
352              
353             =for ref
354              
355             Implements classic k-means cluster analysis.
356              
357             =for example
358              
359             $data = grandom(200, 2); # two rows = two dimensions
360             %k = $data->kmeans; # use default of 3 clusters
361             print "$_\t$k{$_}\n" for sort keys %k;
362             $w->plot(
363             (map +(with=>'points', style=>$_+1, ke=>"Cluster ".($_+1),
364             $data->dice_axis(0,which($k{cluster}->slice(",$_")))->dog),
365             0 .. $k{cluster}->dim(1)-1),
366             (map +(with=>'circles', style=>$_+1, ke=>"Centroid ".($_+1), $k{centroid}->slice($_)->dog, 0.1),
367             0 .. $k{centroid}->dim(0)-1),
368             {le=>'tr'},
369             );
370              
371             Given a number of observations with values on a set of variables,
372             kmeans puts the observations into clusters that maximize within-cluster
373             similarity with respect to the variables. Tries several different random
374             seeding and clustering in parallel. Stops when cluster assignment of the
375             observations no longer changes. Returns the best result in terms of R2
376             from the random-seeding trials.
377              
378             Instead of random seeding, kmeans also accepts manual seeding. This is
379             done by providing a centroid to the function, in which case clustering
380             will proceed from the centroid and there is no multiple tries.
381              
382             There are two distinct advantages from seeding with a centroid compared to
383             seeding with predefined cluster membership of a subset of the observations
384             ie "seeds":
385              
386             =over
387              
388             =item *
389              
390             a centroid could come from a previous study with a different set of observations;
391              
392             =item *
393              
394             a centroid could even be "fictional", or in more proper parlance,
395             an idealized prototype with respect to the actual data. For example,
396             if there are 10 person's ratings of 1 to 5 on 4 movies, ie a ratings
397             pdl of dim [10 obs x 4 var], providing a centroid like
398              
399             [
400             [5 0 0 0]
401             [0 5 0 0]
402             [0 0 5 0]
403             [0 0 0 5]
404             ]
405              
406             will produce 4 clusters of people with each cluster favoring a different
407             one of the 4 movies. Clusters from an idealized centroid may not give the
408             best result in terms of R2, but they sure are a lot more interpretable.
409              
410             =back
411              
412             If clustering has to be done from predefined clusters of seeds, simply
413             calculate the centroid using the B<centroid> function and feed it
414             to kmeans,
415              
416             my ($centroid, $ss) = $rating($iseeds, )->centroid( $seeds_cluster );
417             my %k = $rating->kmeans( { CNTRD=>$centroid } );
418              
419             kmeans supports bad value*.
420              
421             =for options
422              
423             Default options (case insensitive):
424              
425             V => 1, # prints simple status
426             FULL => 0, # returns results for all seeding trials
427              
428             CNTRD => PDL->null, # optional. pdl [clu x var]. disables next 3 opts
429              
430             NTRY => 5, # num of random seeding trials
431             NSEED => 1000, # num of initial seeds, use NSEED up to max obs
432             NCLUS => 3, # num of clusters
433              
434             =for usage
435              
436             Usage:
437              
438             # suppose we have 4 person's ratings on 5 movies
439              
440             pdl> p $rating = ceil( random(4, 5) * 5 )
441             [
442             [3 2 2 3]
443             [2 4 5 4]
444             [5 3 2 3]
445             [3 3 1 5]
446             [4 3 3 2]
447             ]
448              
449             # we want to put the 4 persons into 2 groups
450              
451             pdl> %k = $rating->kmeans( {NCLUS=>2} )
452              
453             # by default prints back options used
454             # as well as info for all tries and iterations
455              
456             CNTRD => Null
457             FULL => 0
458             NCLUS => 3
459             NSEED => 4
460             NTRY => 5
461             V => 1
462             ss total: 20.5
463             iter 0 R2 [0.024390244 0.024390244 0.26829268 0.4796748 0.4796748]
464             iter 1 R2 [0.46341463 0.46341463 0.4796748 0.4796748 0.4796748]
465              
466             pdl> p "$_\t$k{$_}\n" for sort keys %k
467              
468             R2 0.479674796747968
469             centroid # mean ratings for 2 group x 5 movies
470             [
471             [ 3 2.3333333]
472             [ 2 4.3333333]
473             [ 5 2.6666667]
474             [ 3 3]
475             [ 4 2.6666667]
476             ]
477              
478             cluster # 4 persons' membership in two groups
479             [
480             [1 0 0 0]
481             [0 1 1 1]
482             ]
483              
484             n [1 3] # cluster size
485             ss
486             [
487             [ 0 0.66666667]
488             [ 0 0.66666667]
489             [ 0 0.66666667]
490             [ 0 8]
491             [ 0 0.66666667]
492             ]
493              
494             Now, for the valiant, kmeans is broadcastable. Say you gathered 10
495             persons' ratings on 5 movies from 2 countries, so the data is dim
496             [10,5,2], and you want to put the 10 persons from each country into
497             3 clusters, just specify NCLUS => [3,1], and there you have it. The
498             key is for NCLUS to include $data->ndims - 1 numbers. The 1 in [3,1]
499             turns into a dummy dim, so the 3-cluster operation is repeated on both
500             countries. Similarly, when seeding, CNTRD needs to have ndims that at
501             least match the data ndims. Extra dims in CNTRD will lead to broadcasting
502             (convenient if you want to try out different centroid locations,
503             for example, but you will have to hand pick the best result). See
504             F<t/stats_kmeans.t> for examples w 3D and 4D data.
505              
506             *With bad value, R2 is based on average of variances instead of sum squared error.
507              
508             =cut
509              
510             *kmeans = \&PDL::kmeans;
511             sub PDL::kmeans {
512             my ($self, $opt) = @_;
513             my %opt = (
514             V => 1, # prints simple status
515             FULL => 0, # returns results for all seeding trials
516              
517             CNTRD => PDL->null, # optional. pdl [clu x var]. disables next 3 opts
518              
519             NTRY => 5, # num of random seeding trials
520             NSEED => 1000, # num of initial seeds, use NSEED up to max obs
521             NCLUS => 3, # num of clusters
522             );
523             if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
524             if (defined($opt{CNTRD}) and $opt{CNTRD}->nelem) {
525             $opt{NTRY} = 1;
526             $opt{NSEED} = $self->dim(0);
527             $opt{NCLUS} = $opt{CNTRD}->dim(0);
528             }
529             else {
530             $opt{NSEED} = pdl($self->dim(0), $opt{NSEED})->min->sclr;
531             }
532             $opt{V} and print "$_\t=> $opt{$_}\n" for sort keys %opt;
533              
534             my $ss_ms = $self->badflag? 'ms' : 'ss';
535             my $ss_total
536             = $self->badflag? $self->var->average : $self->ss->sumover;
537             $opt{V} and print "overall $ss_ms:\t$ss_total\n";
538              
539             my ($centroid, $ss_cv, $R2, $clus_this, $clus_last);
540              
541             # NTRY made into extra dim in $cluster for broadcasting
542             my @nclus = (ref $opt{NCLUS} eq 'ARRAY')? @{$opt{NCLUS}} : ($opt{NCLUS});
543             $clus_this
544             = (defined($opt{CNTRD}) and $opt{CNTRD}->nelem) ?
545             $self->assign( $opt{CNTRD}->dummy(-1) ) # put dummy(-1) to match NTRY
546             : random_cluster($opt{NSEED}, @nclus, $opt{NTRY} )
547             ;
548              
549             ($centroid, $ss_cv) = $self->slice([0,$opt{NSEED} - 1])->centroid( $clus_this );
550             # now obs in $clus_this matches $self
551             $clus_this = $self->assign( $centroid );
552             ($centroid, $ss_cv) = $self->centroid( $clus_this );
553              
554             my $iter = 0;
555             do {
556             $R2 = $self->badflag? 1 - $ss_cv->average->average / $ss_total
557             : 1 - $ss_cv->sumover->sumover / $ss_total
558             ;
559             $opt{V} and print join(' ',('iter', $iter++, 'R2', $R2)) . "\n";
560              
561             $clus_last = $clus_this;
562              
563             $clus_this = $self->assign( $centroid );
564             ($centroid, $ss_cv) = $self->centroid( $clus_this );
565             }
566             while ( any long(abs($clus_this - $clus_last))->sumover->sumover > 0 );
567              
568             $opt{FULL} and
569             return (
570             centroid => PDL::squeeze( $centroid ),
571             cluster => PDL::squeeze( $clus_this ),
572             n => PDL::squeeze( $clus_this )->sumover,
573             R2 => PDL::squeeze( $R2 ),
574             $ss_ms => PDL::squeeze( $ss_cv ),
575             );
576              
577             # xchg/mv(-1,0) leaves it as was if single dim--unlike transpose
578             my $i_best = $R2->mv(-1,0)->maximum_ind;
579              
580             $R2->getndims == 1 and
581             return (
582             centroid => $centroid->dice_axis(-1,$i_best)->sever->squeeze,
583             cluster => $clus_this->dice_axis(-1,$i_best)->sever->squeeze,
584             n => $clus_this->dice_axis(-1,$i_best)->sever->squeeze->sumover,
585             R2 => $R2->dice_axis(-1,$i_best)->sever->squeeze,
586             $ss_ms => $ss_cv->dice_axis(-1,$i_best)->sever->squeeze,
587             );
588              
589             # now for broadcasting beyond 2D data
590              
591             # can't believe i'm using a perl loop :P
592              
593             $i_best = $i_best->flat->sever;
594             my @i_best = map { $opt{NTRY} * $_ + $i_best->slice("($_)") }
595             0 .. $i_best->nelem - 1;
596              
597             my @shapes;
598             for ($centroid, $clus_this, $R2) {
599             my @dims = $_->dims;
600             pop @dims;
601             push @shapes, \@dims;
602             }
603              
604             $clus_this = $clus_this->mv(-1,2)->clump(2..$clus_this->ndims-1)->dice_axis(2,\@i_best)->sever->reshape( @{ $shapes[1] } )->sever,
605              
606             return (
607             centroid =>
608             $centroid->mv(-1,2)->clump(2..$centroid->ndims-1)->dice_axis(2,\@i_best)->sever->reshape( @{ $shapes[0] } )->sever,
609              
610             cluster => $clus_this,
611             n => $clus_this->sumover,
612              
613             R2 =>
614             $R2->mv(-1,0)->clump(0..$R2->ndims-1)->dice_axis(0,\@i_best)->sever->reshape( @{ $shapes[2] } )->sever,
615              
616             $ss_ms =>
617             $ss_cv->mv(-1,2)->clump(2..$ss_cv->ndims-1)->dice_axis(2,\@i_best)->sever->reshape( @{ $shapes[0] } )->sever,
618             );
619             }
620              
621             =head1 METHODS
622              
623             =head2 iv_cluster
624              
625             =for ref
626              
627             Turns an independent variable into a cluster pdl. Returns cluster pdl and level-to-pdl_index mapping in list context and cluster pdl only in scalar context.
628              
629             This is the method used for mean and var in anova. The difference between iv_cluster and dummy_code is that iv_cluster returns pdl dim [obs x level] whereas dummy_code returns pdl dim [obs x (level - 1)].
630              
631             =for usage
632              
633             Usage:
634              
635             pdl> @bake = qw( y y y n n n )
636              
637             # accepts @ ref or 1d pdl
638              
639             pdl> p $bake = iv_cluster( \@bake )
640             [
641             [1 1 1 0 0 0]
642             [0 0 0 1 1 1]
643             ]
644              
645             pdl> p $rating = sequence 6
646             [0 1 2 3 4 5]
647              
648             pdl> p $rating->centroid( $bake )
649             # mean for each iv level
650             [
651             [1 4]
652             ]
653             # ss
654             [
655             [2 2]
656             ]
657              
658             =cut
659              
660             *iv_cluster = \&PDL::iv_cluster;
661             sub PDL::iv_cluster {
662             my ($var_ref) = @_;
663             my ($var, $map_ref) = PDL::Stats::Basic::code_ivs( $var_ref );
664             my $var_a = yvals( short, $var->nelem, $var->max->sclr + 1 ) == $var;
665             $var_a = $var_a->setbadif( $var->isbad ) if $var->badflag;
666             wantarray ? ($var_a, $map_ref) : $var_a;
667             }
668              
669             =head2 pca_cluster
670              
671             Assign variables to components ie clusters based on pca loadings or
672             scores. One way to seed kmeans (see Ding & He, 2004, and Su & Dy, 2004
673             for other ways of using pca with kmeans). Variables are assigned to
674             their most associated component. Note that some components may not have
675             any variable that is most associated with them, so the returned number
676             of clusters may be smaller than NCOMP.
677              
678             Default options (case insensitive):
679              
680             V => 1,
681             ABS => 1, # high pos and neg loadings on a comp in same cluster
682             NCOMP => undef, # max number of components to consider. determined by
683             # scree plot black magic if not specified
684             PLOT => 0, # pca scree plot with cutoff at NCOMP
685             WIN => undef, # pass pgswin object for more plotting control
686              
687             Usage:
688              
689             # say we need to cluster a group of documents
690             # $data is pdl dim [word x doc]
691             ($data, $idd, $idw) = get_data 'doc_word_info.txt';
692              
693             pdl> %p = $data->pca;
694             # $cluster is pdl mask dim [doc x ncomp]
695             pdl> $cluster = $p{loading}->pca_cluster;
696              
697             # pca clusters var while kmeans clusters obs. hence transpose
698             pdl> ($m, $ss) = $data->transpose->centroid( $cluster );
699             pdl> %k = $data->transpose->kmeans( { cntrd=>$m } );
700              
701             # take a look at cluster 0 doc ids
702             pdl> p join("\n", @$idd[ list which $k{cluster}->( ,0) ]);
703              
704             =cut
705              
706             *pca_cluster = \&PDL::pca_cluster;
707             sub PDL::pca_cluster {
708             my ($self, $opt) = @_;
709              
710             my %opt = (
711             V => 1,
712             ABS => 1, # high pos and neg loadings on a comp in same cluster
713             NCOMP => undef, # max number of components to consider. determined by
714             # scree plot black magic if not specified
715             PLOT => 0, # pca scree plot with cutoff at NCOMP
716             WIN => undef, # pass pgswin object for more plotting control
717             );
718             if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
719              
720             my $var = sumover($self ** 2) / $self->dim(0);
721             if (!$opt{NCOMP}) {
722             # here's the black magic part
723             my $comps = ($self->dim(1) > 300)? int($self->dim(1) * .1)
724             : pdl($self->dim(1), 30)->min
725             ;
726             $var = $var->slice([0,$comps-1])->sever;
727             $opt{NCOMP} = _scree_ind( $var );
728             }
729             $opt{PLOT} and do {
730             require PDL::Stats::GLM;
731             $var->plot_screes({NCOMP=>$var->dim(0), CUT=>$opt{NCOMP}, WIN=>$opt{WIN}});
732             };
733              
734             my $c = $self->slice(':',[0,$opt{NCOMP}-1])->transpose->abs->maximum_ind;
735              
736             if ($opt{ABS}) {
737             $c = $c->iv_cluster;
738             }
739             else {
740             my @c = map { ($self->slice($_,$c->slice($_)) >= 0)? $c->slice($_)*2 : $c->slice($_)*2 + 1 }
741             ( 0 .. $c->dim(0)-1 );
742             $c = iv_cluster( \@c );
743             }
744             $opt{V} and print "cluster membership mask as " . $c->info . "\n";
745              
746             return $c;
747             }
748              
749             =head1 REFERENCES
750              
751             Ding, C., & He, X. (2004). K-means clustering via principal component analysis. Proceedings of the 21st International Conference on Machine Learning, 69, 29.
752              
753             Su, T., & Dy, J. (2004). A deterministic method for initializing K-means clustering. 16th IEEE International Conference on Tools with Artificial Intelligence, 784-786.
754              
755             Romesburg, H.C. (1984). Cluster Analysis for Researchers. NC: Lulu Press.
756              
757             Wikipedia (retrieved June, 2009). K-means clustering. http://en.wikipedia.org/wiki/K-means_algorithm
758              
759             =head1 AUTHOR
760              
761             Copyright (C) 2009 Maggie J. Xiong
762              
763             All rights reserved. There is no warranty. You are allowed to redistribute this software / documentation as described in the file COPYING in the PDL distribution.
764              
765             =cut
766             #line 767 "lib/PDL/Stats/Kmeans.pm"
767              
768             # Exit with OK status
769              
770             1;