File Coverage

blib/lib/PDL/Cluster.pm

Criterion	Covered	Total	%
statement	9	9	100.0
branch			n/a
condition			n/a
subroutine	3	3	100.0
pod			n/a
total	12	12	100.0

line	stmt	sub	time	code
1				#
2				# GENERATED WITH PDL::PP from Cluster.pd! Don't modify!
3				#
4				package PDL::Cluster;
5
6				our @EXPORT_OK = qw(cmean cmedian calculate_weights clusterdistance distancematrix getclustercentroids getclustermean getclustermedian getclustermedoids kcluster kmedoids treecluster treeclusterd cuttree somcluster pca rowdistances clusterdistances clustersizes clusterelements clusterelementmask clusterdistancematrix clusterenc clusterdec clusteroffsets clusterdistancematrixenc clusterdistancesenc getclusterwsum attachtonearest attachtonearestd checkprototypes checkpartitions randomprototypes randompartition );
7				our %EXPORT_TAGS = (Func=>\@EXPORT_OK);
8
9	9	9	2947935	use PDL::Core;
	9		25
	9		70
10	9	9	3786	use PDL::Exporter;
	9		21
	9		101
11	9	9	433	use DynaLoader;
	9		19
	9		15340
12
13
14				our $VERSION = '1.54.004';
15				our @ISA = ( 'PDL::Exporter','DynaLoader' );
16				push @PDL::Core::PP, __PACKAGE__;
17				bootstrap PDL::Cluster $VERSION;
18
19
20
21
22
23
24
25
26				#line 14 "Cluster.pd"
27
28				#---------------------------------------------------------------------------
29				# File: PDL::Cluster.pm
30				# Author: Bryan Jurish
31				# Description: PDL wrappers for the C Clustering library.
32				#
33				# Copyright (c) 2005-2021 Bryan Jurish. All rights reserved.
34				# This program is free software. You may modify and/or
35				# distribute it under the same terms as Perl itself.
36				#
37				#---------------------------------------------------------------------------
38				# Based on the C clustering library for cDNA microarray data,
39				# Copyright (C) 2002-2005 Michiel Jan Laurens de Hoon.
40				#
41				# The C clustering library was written at the Laboratory of DNA Information
42				# Analysis, Human Genome Center, Institute of Medical Science, University of
43				# Tokyo, 4-6-1 Shirokanedai, Minato-ku, Tokyo 108-8639, Japan.
44				# Contact: michiel.dehoon 'AT' riken.jp
45				#
46				# See the files "cluster.c" and "cluster.h" in the PDL::Cluster distribution
47				# for details.
48				#---------------------------------------------------------------------------
49
50				=pod
51
52				=head1 NAME
53
54				PDL::Cluster - PDL interface to the C Clustering Library
55
56				=head1 SYNOPSIS
57
58				use PDL::Cluster;
59
60				##-----------------------------------------------------
61				## Data Format
62				$d = 42; ##-- number of features
63				$n = 1024; ##-- number of data elements
64
65				$data = random($d,$n); ##-- data matrix
66				$elt = $data->slice(",($i)"); ##-- element data vector
67				$ftr = $data->slice("($j),"); ##-- feature vector over all elements
68
69				$wts = ones($d)/$d; ##-- feature weights
70				$msk = ones($d,$n); ##-- missing-datum mask (1=ok)
71
72				##-----------------------------------------------------
73				## Library Utilities
74
75				$mean = $ftr->cmean();
76				$median = $ftr->cmedian();
77
78				calculate_weights($data,$msk,$wts, $cutoff,$expnt,
79				$weights);
80
81				##-----------------------------------------------------
82				## Distance Functions
83
84				clusterdistance($data,$msk,$wts, $n1,$n2,$idx1,$idx2,
85				$dist,
86				$distFlag, $methodFlag2);
87
88				distancematrix($data,$msk,$wts, $distmat, $distFlag);
89
90				##-----------------------------------------------------
91				## Partitioning Algorithms
92
93				getclustermean($data,$msk,$clusterids,
94				$ctrdata, $ctrmask);
95
96				getclustermedian($data,$msk,$clusterids,
97				$ctrdata, $ctrmask);
98
99				getclustermedoids($distmat,$clusterids,$centroids,
100				$errorsums);
101
102				kcluster($k, $data,$msk,$wts, $npass,
103				$clusterids, $error, $nfound,
104				$distFlag, $methodFlag);
105
106				kmedoids($k, $distmat,$npass,
107				$clusterids, $error, $nfound);
108
109				##-----------------------------------------------------
110				## Hierarchical Algorithms
111
112				treecluster($data,$msk,$wts,
113				$tree, $lnkdist,
114				$distFlag, $methodFlag);
115
116				treeclusterd($data,$msk,$wts, $distmat,
117				$tree, $lnkdist,
118				$distFlag, $methodFlag);
119
120				cuttree($tree, $nclusters,
121				$clusterids);
122
123				##-----------------------------------------------------
124				## Self-Organizing Maps
125
126				somcluster($data,$msk,$wts, $nx,$ny,$tau,$niter,
127				$clusterids,
128				$distFlag);
129
130				##-----------------------------------------------------
131				## Principal Component Analysis
132
133				pca($U, $S, $V);
134
135				##-----------------------------------------------------
136				## Extensions
137
138				rowdistances($data,$msk,$wts, $rowids1,$rowids2, $distvec, $distFlag);
139				clusterdistances($data,$msk,$wts, $rowids, $index2,
140				$dist,
141				$distFlag, $methodFlag);
142
143				clustersizes($clusterids, $clustersizes);
144				clusterelements($clusterids, $clustersizes, $eltids);
145				clusterelementmask($clusterids, $eltmask);
146
147				clusterdistancematrix($data,$msk,$wts,
148				$rowids, $clustersizes, $eltids,
149				$dist,
150				$distFlag, $methodFlag);
151
152				clusterenc($clusterids, $clens,$cvals,$crowids, $k);
153				clusterdec($clens,$cvals,$crowids, $clusterids, $k);
154				clusteroffsets($clusterids, $coffsets,$cvals,$crowids, $k);
155				clusterdistancematrixenc($data,$msk,$wts,
156				$clens1,$crowids1, $clens2,$crowids2,
157				$dist,
158				$distFlag, $methodFlag);
159
160				=cut
161				#line 162 "Cluster.pm"
162
163
164				=head1 FUNCTIONS
165
166				=cut
167
168
169
170
171
172
173				=head2 cmean
174
175				=for sig
176
177				Signature: (double a(n); double [o]b())
178				Types: (double)
179
180				=for usage
181
182				$b = cmean($a);
183				cmean($a, $b); # all arguments given
184				$b = $a->cmean; # method call
185				$a->cmean($b);
186
187				=for ref
188
189				Computes arithmetic mean of the vector $a(). See also PDL::Primitive::avg().
190
191				=pod
192
193				Broadcasts over its inputs.
194
195				=for bad
196
197				C<cmean> does not process bad values.
198				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
199
200				=cut
201
202
203
204
205				*cmean = \&PDL::cmean;
206
207
208
209
210
211
212				=head2 cmedian
213
214				=for sig
215
216				Signature: (double a(n); double [o]b())
217				Types: (double)
218
219				=for usage
220
221				$b = cmedian($a);
222				cmedian($a, $b); # all arguments given
223				$b = $a->cmedian; # method call
224				$a->cmedian($b);
225
226				=for ref
227
228				Computes median of the vector $a(). See also PDL::Primitive::median().
229
230				=pod
231
232				Broadcasts over its inputs.
233
234				=for bad
235
236				C<cmedian> does not process bad values.
237				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
238
239				=cut
240
241
242
243
244				*cmedian = \&PDL::cmedian;
245
246
247
248
249
250
251				=head2 calculate_weights
252
253				=for sig
254
255				Signature: (
256				double data(d,n);
257				int mask(d,n);
258				double weight(d);
259				double cutoff();
260				double exponent();
261				double [o]oweights(d);
262				; char *distFlag;
263				)
264				Types: (sbyte byte short ushort long ulong indx ulonglong longlong
265				float double ldouble)
266
267				=for usage
268
269				$oweights = calculate_weights($data, $mask, $weight, $cutoff, $exponent, $distFlag);
270				calculate_weights($data, $mask, $weight, $cutoff, $exponent, $oweights, $distFlag); # all arguments given
271				$oweights = $data->calculate_weights($mask, $weight, $cutoff, $exponent, $distFlag); # method call
272				$data->calculate_weights($mask, $weight, $cutoff, $exponent, $oweights, $distFlag);
273
274				This function calculates weights for the features using the weighting scheme
275				proposed by Michael Eisen:
276
277				w[i] = 1.0 / sum_{j where dist(i,j)<cutoff} (1 - dist(i,j)/cutoff)^exponent
278
279				where the cutoff and the exponent are specified by the user.
280
281				=pod
282
283				Broadcasts over its inputs.
284
285				=for bad
286
287				C<calculate_weights> does not process bad values.
288				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
289
290				=cut
291
292
293
294
295				*calculate_weights = \&PDL::calculate_weights;
296
297
298
299
300
301
302				=head2 clusterdistance
303
304				=for sig
305
306				Signature: (
307				double data(d,n);
308				int mask(d,n);
309				double weight(d);
310				int n1();
311				int n2();
312				int index1(n1);
313				int index2(n2);
314				double [o]dist();
315				;
316				char *distFlag;
317				char *methodFlag;
318				)
319				Types: (sbyte byte short ushort long ulong indx ulonglong longlong
320				float double ldouble)
321
322				=for usage
323
324				$dist = clusterdistance($data, $mask, $weight, $n1, $n2, $index1, $index2, $distFlag, $methodFlag);
325				clusterdistance($data, $mask, $weight, $n1, $n2, $index1, $index2, $dist, $distFlag, $methodFlag); # all arguments given
326				$dist = $data->clusterdistance($mask, $weight, $n1, $n2, $index1, $index2, $distFlag, $methodFlag); # method call
327				$data->clusterdistance($mask, $weight, $n1, $n2, $index1, $index2, $dist, $distFlag, $methodFlag);
328
329				Computes distance between two clusters $index1() and $index2().
330				Each of the $index() vectors represents a single cluster whose values
331				are the row-indices in the $data() matrix of the elements assigned
332				to the respective cluster. $n1() and $n2() are the number of elements
333				in $index1() and $index2(), respectively. Each $index$i() must have
334				at least $n$i() elements allocated.
335
336				B<Note:> the $methodFlag argument is interpreted differently than
337				by the treecluster() method, namely:
338
339				=over 4
340
341				=item a
342
343				Distance between the arithmetic means of the two clusters,
344				as for treecluster() "f".
345
346				=item m
347
348				Distance between the medians of the two clusters,
349				as for treecluster() "c".
350
351				=item s
352
353				Minimum pairwise distance between members of the two clusters,
354				as for treecluster() "s".
355
356				=item x
357
358				Maximum pairwise distance between members of the two clusters
359				as for treecluster() "m".
360
361				=item v
362
363				Average of the pairwise distances between members of the two clusters,
364				as for treecluster() "a".
365
366				=back
367
368				=pod
369
370				Broadcasts over its inputs.
371
372				=for bad
373
374				C<clusterdistance> does not process bad values.
375				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
376
377				=cut
378
379
380
381
382				*clusterdistance = \&PDL::clusterdistance;
383
384
385
386
387
388
389				=head2 distancematrix
390
391				=for sig
392
393				Signature: (
394				double data(d,n);
395				int mask(d,n);
396				double weight(d);
397				double [o]dists(n,n);
398				; char *distFlag;
399				)
400				Types: (sbyte byte short ushort long ulong indx ulonglong longlong
401				float double ldouble)
402
403				=for usage
404
405				$dists = distancematrix($data, $mask, $weight, $distFlag);
406				distancematrix($data, $mask, $weight, $dists, $distFlag); # all arguments given
407				$dists = $data->distancematrix($mask, $weight, $distFlag); # method call
408				$data->distancematrix($mask, $weight, $dists, $distFlag);
409
410				=for ref
411
412				Compute triangular distance matrix over all data points.
413
414				=pod
415
416				Broadcasts over its inputs.
417
418				=for bad
419
420				C<distancematrix> does not process bad values.
421				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
422
423				=cut
424
425
426
427
428				*distancematrix = \&PDL::distancematrix;
429
430
431
432
433
434
435				=head2 getclustercentroids
436
437				=for sig
438
439				Signature: (
440				double data(d,n);
441				int mask(d,n);
442				int clusterids(n);
443				double [o]cdata(d,k);
444				int [o]cmask(d,k);
445				; char *ctrMethodFlag;
446				)
447				Types: (sbyte byte short ushort long ulong indx ulonglong longlong
448				float double ldouble)
449
450				=for usage
451
452				($cdata, $cmask) = getclustercentroids($data, $mask, $clusterids, $ctrMethodFlag);
453				getclustercentroids($data, $mask, $clusterids, $cdata, $cmask, $ctrMethodFlag); # all arguments given
454				($cdata, $cmask) = $data->getclustercentroids($mask, $clusterids, $ctrMethodFlag); # method call
455				$data->getclustercentroids($mask, $clusterids, $cdata, $cmask, $ctrMethodFlag);
456
457				=for ref
458
459				Find cluster centroids by arithmetic mean (C<ctrMethodFlag="a">) or median over each dimension (C<ctrMethodFlag="m">).
460
461				=pod
462
463				Broadcasts over its inputs.
464
465				=for bad
466
467				C<getclustercentroids> does not process bad values.
468				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
469
470				=cut
471
472
473
474
475				*getclustercentroids = \&PDL::getclustercentroids;
476
477
478
479
480
481				#line 589 "Cluster.pd"
482
483				=pod
484
485				=head2 getclustermean
486
487				=for sig
488
489				Signature: (
490				double data(d,n);
491				int mask(d,n);
492				int clusterids(n);
493				double [o]cdata(d,k);
494				int [o]cmask(d,k);
495				)
496
497				Really just a wrapper for getclustercentroids(...,"a").
498
499				=cut
500
501				sub getclustermean { ##-- wrapper: getclustercentroids() with the arithmetic-mean method
502				my ($data,$mask,$cids,$cdata,$cmask) = @_; ##-- in: $data,$mask,$cids; out: $cdata,$cmask
503				return getclustercentroids($data,$mask,$cids,$cdata,$cmask,'a'); ##-- forward the unpacked $data matrix; 'a' selects the mean
504				}
505
506				#line 620 "Cluster.pd"
507
508				=pod
509
510				=head2 getclustermedian
511
512				=for sig
513
514				Signature: (
515				double data(d,n);
516				int mask(d,n);
517				int clusterids(n);
518				double [o]cdata(d,k);
519				int [o]cmask(d,k);
520				)
521
522				Really just a wrapper for getclustercentroids(...,"m").
523
524				=cut
525
526				sub getclustermedian { ##-- wrapper: getclustercentroids() with the median method
527				my ($data,$mask,$cids,$cdata,$cmask) = @_; ##-- in: $data,$mask,$cids; out: $cdata,$cmask
528				return getclustercentroids($data,$mask,$cids,$cdata,$cmask,'m'); ##-- forward the unpacked $data matrix; 'm' selects the median
529				}
530				#line 531 "Cluster.pm"
531
532
533				=head2 getclustermedoids
534
535				=for sig
536
537				Signature: (
538				double distance(n,n);
539				int clusterids(n);
540				int [o]centroids(k);
541				double [o]errors(k);
542				)
543				Types: (sbyte byte short ushort long ulong indx ulonglong longlong
544				float double ldouble)
545
546				=for usage
547
548				($centroids, $errors) = getclustermedoids($distance, $clusterids);
549				getclustermedoids($distance, $clusterids, $centroids, $errors); # all arguments given
550				($centroids, $errors) = $distance->getclustermedoids($clusterids); # method call
551				$distance->getclustermedoids($clusterids, $centroids, $errors);
552
553				The getclustermedoids routine calculates the cluster centroids, given to which
554				cluster each element belongs. The centroid is defined as the element with the
555				smallest sum of distances to the other elements.
556
557				=pod
558
559				Broadcasts over its inputs.
560
561				=for bad
562
563				C<getclustermedoids> does not process bad values.
564				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
565
566				=cut
567
568
569
570
571				*getclustermedoids = \&PDL::getclustermedoids;
572
573
574
575
576
577
578				=head2 kcluster
579
580				=for sig
581
582				Signature: (
583				int nclusters();
584				double data(d,n);
585				int mask(d,n);
586				double weight(d);
587				int npass();
588				int [o]clusterids(n);
589				double [o]error();
590				int [o]nfound();
591				;
592				char *distFlag;
593				char *ctrMethodFlag;
594				)
595				Types: (sbyte byte short ushort long ulong indx ulonglong longlong
596				float double ldouble)
597
598				=for usage
599
600				($clusterids, $error, $nfound) = kcluster($nclusters, $data, $mask, $weight, $npass, $distFlag, $ctrMethodFlag);
601				kcluster($nclusters, $data, $mask, $weight, $npass, $clusterids, $error, $nfound, $distFlag, $ctrMethodFlag); # all arguments given
602				($clusterids, $error, $nfound) = $nclusters->kcluster($data, $mask, $weight, $npass, $distFlag, $ctrMethodFlag); # method call
603				$nclusters->kcluster($data, $mask, $weight, $npass, $clusterids, $error, $nfound, $distFlag, $ctrMethodFlag);
604
605				K-Means clustering algorithm. The "ctrMethodFlag" determines how
606				clusters centroids are to be computed; see getclustercentroids() for details.
607
608				Because the C library code reads from the C<clusterids> if and only if
609				C<npass> is 0, before writing to it, it would be inconvenient to
610				set it to C<[io]>. However for efficiency reasons, as of 2.096, PDL
611				will not convert it (force a read-back on the conversion) for you
612				if you pass in the wrongly-typed data. This means that you should
613				be careful to pass in C<long> data of the right size if you set C<npass>
614				to 0.
615
616				See also: kmedoids().
617
618				=pod
619
620				Broadcasts over its inputs.
621
622				=for bad
623
624				C<kcluster> does not process bad values.
625				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
626
627				=cut
628
629
630
631
632				*kcluster = \&PDL::kcluster;
633
634
635
636
637
638
639				=head2 kmedoids
640
641				=for sig
642
643				Signature: (
644				int nclusters();
645				double distance(n,n);
646				int npass();
647				int [o]clusterids(n);
648				double [o]error();
649				int [o]nfound();
650				)
651				Types: (sbyte byte short ushort long ulong indx ulonglong longlong
652				float double ldouble)
653
654				=for usage
655
656				($clusterids, $error, $nfound) = kmedoids($nclusters, $distance, $npass);
657				kmedoids($nclusters, $distance, $npass, $clusterids, $error, $nfound); # all arguments given
658				($clusterids, $error, $nfound) = $nclusters->kmedoids($distance, $npass); # method call
659				$nclusters->kmedoids($distance, $npass, $clusterids, $error, $nfound);
660
661				K-Medoids clustering algorithm (uses distance matrix).
662
663				See also: kcluster().
664
665				=pod
666
667				Broadcasts over its inputs.
668
669				=for bad
670
671				C<kmedoids> does not process bad values.
672				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
673
674				=cut
675
676
677
678
679				*kmedoids = \&PDL::kmedoids;
680
681
682
683
684
685
686				=head2 treecluster
687
688				=for sig
689
690				Signature: (
691				double data(d,n);
692				int mask(d,n);
693				double weight(d);
694				int [o]tree(2,n);
695				double [o]lnkdist(n);
696				;
697				char *distFlag;
698				char *methodFlag;
699				)
700				Types: (sbyte byte short ushort long ulong indx ulonglong longlong
701				float double ldouble)
702
703				=for usage
704
705				($tree, $lnkdist) = treecluster($data, $mask, $weight, $distFlag, $methodFlag);
706				treecluster($data, $mask, $weight, $tree, $lnkdist, $distFlag, $methodFlag); # all arguments given
707				($tree, $lnkdist) = $data->treecluster($mask, $weight, $distFlag, $methodFlag); # method call
708				$data->treecluster($mask, $weight, $tree, $lnkdist, $distFlag, $methodFlag);
709
710				Hierarchical agglomerative clustering.
711
712				$tree(2,n) represents the clustering solution.
713				Each row in the matrix describes one linking event,
714				with the two columns containing the name of the nodes that were joined.
715				The original genes are numbered 0..(n-1), nodes are numbered
716				-1..-(n-1).
717				$tree(2,n) thus actually uses only (2,n-1) cells.
718
719				$lnkdist(n) represents the distance between the two subnodes that were joined.
720				As for $tree(), $lnkdist() uses only (n-1) cells.
721
722				=pod
723
724				Broadcasts over its inputs.
725
726				=for bad
727
728				C<treecluster> does not process bad values.
729				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
730
731				=cut
732
733
734
735
736				*treecluster = \&PDL::treecluster;
737
738
739
740
741
742
743				=head2 treeclusterd
744
745				=for sig
746
747				Signature: (
748				double data(d,n);
749				int mask(d,n);
750				double weight(d);
751				double distances(n,n);
752				int [o]tree(2,n);
753				double [o]lnkdist(n);
754				;
755				char *distFlag;
756				char *methodFlag;
757				)
758				Types: (sbyte byte short ushort long ulong indx ulonglong longlong
759				float double ldouble)
760
761				=for usage
762
763				($tree, $lnkdist) = treeclusterd($data, $mask, $weight, $distances, $distFlag, $methodFlag);
764				treeclusterd($data, $mask, $weight, $distances, $tree, $lnkdist, $distFlag, $methodFlag); # all arguments given
765				($tree, $lnkdist) = $data->treeclusterd($mask, $weight, $distances, $distFlag, $methodFlag); # method call
766				$data->treeclusterd($mask, $weight, $distances, $tree, $lnkdist, $distFlag, $methodFlag);
767
768				Hierarchical agglomerative clustering using given distance matrix.
769
770				See distancematrix() and treecluster(), above.
771
772				=pod
773
774				Broadcasts over its inputs.
775
776				=for bad
777
778				C<treeclusterd> does not process bad values.
779				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
780
781				=cut
782
783
784
785
786				*treeclusterd = \&PDL::treeclusterd;
787
788
789
790
791
792
793				=head2 cuttree
794
795				=for sig
796
797				Signature: (
798				int tree(2,n);
799				int nclusters();
800				int [o]clusterids(n);
801				)
802				Types: (sbyte byte short ushort long ulong indx ulonglong longlong
803				float double ldouble)
804
805				=for usage
806
807				$clusterids = cuttree($tree, $nclusters);
808				cuttree($tree, $nclusters, $clusterids); # all arguments given
809				$clusterids = $tree->cuttree($nclusters); # method call
810				$tree->cuttree($nclusters, $clusterids);
811
812				Cluster selection for hierarchical clustering trees.
813
814				=pod
815
816				Broadcasts over its inputs.
817
818				=for bad
819
820				C<cuttree> does not process bad values.
821				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
822
823				=cut
824
825
826
827
828				*cuttree = \&PDL::cuttree;
829
830
831
832
833
834
835				=head2 somcluster
836
837				=for sig
838
839				Signature: (
840				double data(d,n);
841				int mask(d,n);
842				double weight(d);
843				int nxnodes();
844				int nynodes();
845				double inittau();
846				int niter();
847				int [o]clusterids(2,n);
848				; char *distFlag;
849				)
850				Types: (sbyte byte short ushort long ulong indx ulonglong longlong
851				float double ldouble)
852
853				=for usage
854
855				$clusterids = somcluster($data, $mask, $weight, $nxnodes, $nynodes, $inittau, $niter, $distFlag);
856				somcluster($data, $mask, $weight, $nxnodes, $nynodes, $inittau, $niter, $clusterids, $distFlag); # all arguments given
857				$clusterids = $data->somcluster($mask, $weight, $nxnodes, $nynodes, $inittau, $niter, $distFlag); # method call
858				$data->somcluster($mask, $weight, $nxnodes, $nynodes, $inittau, $niter, $clusterids, $distFlag);
859
860				=for ref
861
862				Self-Organizing Map clustering, does not return centroid data.
863
864				=pod
865
866				Broadcasts over its inputs.
867
868				=for bad
869
870				C<somcluster> does not process bad values.
871				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
872
873				=cut
874
875
876
877
878				*somcluster = \&PDL::somcluster;
879
880
881
882
883
884
885				=head2 pca
886
887				=for sig
888
889				Signature: (
890				double [o]U(d,n);
891				double [o]S(d);
892				double [o]V(d,d);
893				)
894				Types: (sbyte byte short ushort long ulong indx ulonglong longlong
895				float double ldouble)
896
897				=for usage
898
899				($U, $S, $V) = pca();
900				pca($U, $S, $V); # all arguments given
901				$U->pca($S, $V);
902
903				Principal Component Analysis (SVD), operates in-place on $U() and requires ($SIZE(n) >= $SIZE(d)).
904
905				=pod
906
907				Broadcasts over its inputs.
908
909				=for bad
910
911				C<pca> does not process bad values.
912				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
913
914				=cut
915
916
917
918
919				*pca = \&PDL::pca;
920
921
922
923
924
925
926				=head2 rowdistances
927
928				=for sig
929
930				Signature: (
931				double data(d,n);
932				int mask(d,n);
933				double weight(d);
934				int rowids1(ncmps);
935				int rowids2(ncmps);
936				double [o]dist(ncmps);
937				; char *distFlag;
938				)
939				Types: (sbyte byte short ushort long ulong indx ulonglong longlong
940				float double ldouble)
941
942				=for usage
943
944				$dist = rowdistances($data, $mask, $weight, $rowids1, $rowids2, $distFlag);
945				rowdistances($data, $mask, $weight, $rowids1, $rowids2, $dist, $distFlag); # all arguments given
946				$dist = $data->rowdistances($mask, $weight, $rowids1, $rowids2, $distFlag); # method call
947				$data->rowdistances($mask, $weight, $rowids1, $rowids2, $dist, $distFlag);
948
949				Computes pairwise distances between rows of $data().
950				$rowids1() contains the row-indices of the left (first) comparison operand,
951				and $rowids2() the row-indices of the right (second) comparison operand. Since each
952				of these are assumed to be indices into the first dimension $data(), it should be the case that:
953
954				0 <= $rowids1(i),$rowids2(i) < $SIZE(n) for 0 <= i < $SIZE(ncmps)
955
956				See also clusterdistance(), clusterdistances(), clusterdistancematrixenc(), distancematrix().
957
958				=pod
959
960				Broadcasts over its inputs.
961
962				=for bad
963
964				C<rowdistances> does not process bad values.
965				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
966
967				=cut
968
969
970
971
972				*rowdistances = \&PDL::rowdistances;
973
974
975
976
977
978
979				=head2 clusterdistances
980
981				=for sig
982
983				Signature: (
984				double data(d,n);
985				int mask(d,n);
986				double weight(d);
987				int rowids(nr);
988				int index2(n2);
989				double [o]dist(nr);
990				;
991				char *distFlag;
992				char *methodFlag;
993				)
994				Types: (sbyte byte short ushort long ulong indx ulonglong longlong
995				float double ldouble)
996
997				=for usage
998
999				$dist = clusterdistances($data, $mask, $weight, $rowids, $index2, $distFlag, $methodFlag);
1000				clusterdistances($data, $mask, $weight, $rowids, $index2, $dist, $distFlag, $methodFlag); # all arguments given
1001				$dist = $data->clusterdistances($mask, $weight, $rowids, $index2, $distFlag, $methodFlag); # method call
1002				$data->clusterdistances($mask, $weight, $rowids, $index2, $dist, $distFlag, $methodFlag);
1003
1004				Computes pairwise distance(s) from each of $rowids() as a singleton cluster
1005				with the cluster represented by $index2(), which should be an index
1006				vector as for clusterdistance(). See also clusterdistancematrixenc().
1007
1008				=pod
1009
1010				Broadcasts over its inputs.
1011
1012				=for bad
1013
1014				C<clusterdistances> does not process bad values.
1015				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
1016
1017				=cut
1018
1019
1020
1021
1022				*clusterdistances = \&PDL::clusterdistances;
1023
1024
1025
1026
1027
1028
1029				=head2 clustersizes
1030
1031				=for sig
1032
1033				Signature: (int clusterids(n); int [o]clustersizes(k))
1034				Types: (sbyte byte short ushort long ulong indx ulonglong longlong
1035				float double ldouble)
1036
1037				=for usage
1038
1039				$clustersizes = clustersizes($clusterids);
1040				clustersizes($clusterids, $clustersizes); # all arguments given
1041				$clustersizes = $clusterids->clustersizes; # method call
1042				$clusterids->clustersizes($clustersizes);
1043
1044				Computes the size (number of elements) of each cluster in $clusterids().
1045				Useful for allocating less than maximal space for $clusterelements().
1046
1047				=pod
1048
1049				Broadcasts over its inputs.
1050
1051				=for bad
1052
1053				The output piddle should never be marked BAD.
1054
1055				=cut
1056
1057
1058
1059
1060				*clustersizes = \&PDL::clustersizes;
1061
1062
1063
1064
1065
1066
1067				=head2 clusterelements
1068
1069				=for sig
1070
1071				Signature: (int clusterids(n); int [o]clustersizes(k); int [o]eltids(mcsize,k))
1072				Types: (sbyte byte short ushort long ulong indx ulonglong longlong
1073				float double ldouble)
1074
1075				=for usage
1076
1077				($clustersizes, $eltids) = clusterelements($clusterids);
1078				clusterelements($clusterids, $clustersizes, $eltids); # all arguments given
1079				($clustersizes, $eltids) = $clusterids->clusterelements; # method call
1080				$clusterids->clusterelements($clustersizes, $eltids);
1081
1082				Converts the vector $clusterids() to a matrix $eltids() of element (row) indices
1083				indexed by cluster-id. $mcsize() is the maximum number of elements per cluster,
1084				at most $n. The output PDLs $clustersizes() and $eltids() can be passed to
1085				clusterdistancematrix().
1086
1087				=pod
1088
1089				Broadcasts over its inputs.
1090
1091				=for bad
1092
1093				C<clusterelements> does not process bad values.
1094				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
1095
1096				=cut
1097
1098
1099
1100
1101				*clusterelements = \&PDL::clusterelements;
1102
1103
1104
1105
1106
1107
1108				=head2 clusterelementmask
1109
1110				=for sig
1111
1112				Signature: (int clusterids(n); byte [o]eltmask(k,n))
1113				Types: (sbyte byte short ushort long ulong indx ulonglong longlong
1114				float double ldouble)
1115
1116				=for usage
1117
1118				$eltmask = clusterelementmask($clusterids);
1119				clusterelementmask($clusterids, $eltmask); # all arguments given
1120				$eltmask = $clusterids->clusterelementmask; # method call
1121				$clusterids->clusterelementmask($eltmask);
1122
1123				Get boolean membership mask $eltmask() based on cluster assignment in $clusterids().
1124				No value in $clusterids() may be greater than or equal to $k.
1125				On completion, $eltmask(k,n) is a true value iff $clusterids(n)=$k.
1126
1127				=pod
1128
1129				Broadcasts over its inputs.
1130
1131				=for bad
1132
1133				C<clusterelementmask> does not process bad values.
1134				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
1135
1136				=cut
1137
1138
1139
1140
1141				*clusterelementmask = \&PDL::clusterelementmask;
1142
1143
1144
1145
1146
1147
1148				=head2 clusterdistancematrix
1149
1150				=for sig
1151
1152				Signature: (
1153				double data(d,n);
1154				int mask(d,n);
1155				double weight(d);
1156				int rowids(nr);
1157				int clustersizes(k);
1158				int eltids(mcsize,k);
1159				double [o]dist(k,nr);
1160				;
1161				char *distFlag;
1162				char *methodFlag;
1163				)
1164				Types: (sbyte byte short ushort long ulong indx ulonglong longlong
1165				float double ldouble)
1166
1167				=for usage
1168
1169				$dist = clusterdistancematrix($data, $mask, $weight, $rowids, $clustersizes, $eltids, $distFlag, $methodFlag);
1170				clusterdistancematrix($data, $mask, $weight, $rowids, $clustersizes, $eltids, $dist, $distFlag, $methodFlag); # all arguments given
1171				$dist = $data->clusterdistancematrix($mask, $weight, $rowids, $clustersizes, $eltids, $distFlag, $methodFlag); # method call
1172				$data->clusterdistancematrix($mask, $weight, $rowids, $clustersizes, $eltids, $dist, $distFlag, $methodFlag);
1173
1174				B<DEPRECATED> in favor of clusterdistancematrixenc().
1175				In the future, this method is expected to become a wrapper for clusterdistancematrixenc().
1176
1177				Computes distance between each row index in $rowids()
1178				considered as a singleton cluster
1179				and each of the $k clusters whose elements are given by a single row of $eltids().
1180				$clustersizes() and $eltids() are as output by the clusterelements() method.
1181
1182				See also clusterdistance(), clusterdistances(), clustersizes(), clusterelements(), clusterdistancematrixenc().
1183
1184				=pod
1185
1186				Broadcasts over its inputs.
1187
1188				=for bad
1189
1190				C<clusterdistancematrix> does not process bad values.
1191				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
1192
1193				=cut
1194
1195
1196
1197
1198				*clusterdistancematrix = \&PDL::clusterdistancematrix; # alias this package's clusterdistancematrix() to the PDL:: sub of the same name
1199
1200
1201
1202
1203
1204				#line 1196 "Cluster.pd"
1205
1206				=pod
1207
1208				=head2 clusterenc
1209
1210				=for sig
1211
1212				Signature: (
1213				int clusterids(n);
1214				int [o]clusterlens(k1);
1215				int [o]clustervals(k1);
1216				int [o]clusterrows(n);
1217				;
1218				int k1;
1219				)
1220
1221				Encodes datum-to-cluster vector $clusterids() for efficiently mapping
1222				clusters-to-data. Returned PDL $clusterlens() holds the lengths of each
1223				cluster containing at least one element. $clustervals() holds the IDs
1224				of such clusters as they appear as values in $clusterids(). $clusterrows()
1225				is such that:
1226
1227				all( rld($clusterlens, $clustervals) == $clusterids->index($clusterrows) )
1228
1229				... if all available cluster-ids are in use.
1230
1231				If specified, $k1 is a perl scalar
1232				holding the number of clusters (maximum cluster index + 1); an
1233				appropriate value will be guessed from $clusterids() otherwise.
1234
1235				Really just a wrapper for some lower-level PDL and PDL::Cluster calls.
1236
1237				=cut
1238
1239				sub clusterenc { ##-- ($clens,$cvals,$crows) = clusterenc($cids, $clens,$cvals,$crows, $kmax); all but $cids are optional
1240				my ($cids, $clens,$cvals,$crows, $kmax) = @_;
1241				$kmax = $cids->max+1 if (!defined($kmax)); ##-- default: largest id in $cids, plus one
1242				
1243				##-- cluster sizes: allocate $kmax longs unless the caller passed $clens, then hand it to clustersizes() (presumably as its output)
1244				$clens = zeroes(long, $kmax) if (!defined($clens));
1245				clustersizes($cids,$clens);
1246				
1247				##-- cluster-id values: always the flat sequence 0..$kmax-1, copied into the caller's ndarray with '.=' if one was passed
1248				if (!defined($cvals)) { $cvals = PDL->sequence(long,$kmax); }
1249				else { $cvals .= PDL->sequence(long,$kmax); }
1250				
1251				##-- cluster-row values: handle BAD and negative values (disabled alternative, kept for reference)
1252				#if (!defined($crows)) { $crows = $cids->qsorti->where($cids->isgood & $cids>=0); }
1253				#else { $crows .= $cids->qsorti->where($cids->isgood & $cids>=0); }
1254				
1255				##-- cluster-row values: treat BAD and negative values like anything else, i.e. $crows is just qsorti() of $cids
1256				if (!defined($crows)) { $crows = $cids->qsorti; }
1257				else { $crows .= $cids->qsorti; }
1258				
1259				return ($clens,$cvals,$crows);
1260				}
1261
1262				#line 1262 "Cluster.pd"
1263
1264				=pod
1265
1266				=head2 clusterdec
1267
1268				=for sig
1269
1270				Signature: (
1271				int clusterlens(k1);
1272				int clustervals(k1);
1273				int clusterrows(n);
1274				int [o]clusterids(n);
1275				)
1276
1277				Decodes cluster-to-datum vectors ($clusterlens,$clustervals,$clusterrows)
1278				into a single datum-to-cluster vector $clusterids().
1279				($clusterlens,$clustervals,$clusterrows) are as returned by the clusterenc() method.
1280
1281				Un-addressed row-index values in $clusterrows() will be assigned the pseudo-cluster (-1)
1282				in $clusterids().
1283
1284				Really just a wrapper for some lower-level PDL calls.
1285
1286				=cut
1287
1288				sub clusterdec { ##-- $cids = clusterdec($clens,$cvals,$crows, $cids); the last argument is an optional output ndarray
1289				my ($clens,$cvals,$crows, $cids2) = @_;
1290				
1291				##-- get $cids: unless supplied, the output takes the type of $cvals and the dimensions of $crows
1292				$cids2 = zeroes($cvals->type, $crows->dims) if (!defined($cids2));
1293				$cids2 .= -1; ##-- pre-fill with the pseudo-cluster -1; positions not written below keep it
1294				
1295				##-- trim $crows: keep only its last $clens->sum entries (the active line); the first line is a disabled alternative
1296				#my $crows_good = $crows->slice("0:".($clens->sum-1)); ##-- assume bad indices are at END of $crows (BAD,inf,...)
1297				my $crows_good = $crows->slice(-$clens->sum.":-1"); ##-- assume bad indices are at BEGINNING of $crows (-1, ...)
1298				
1299				##-- decode: rld() is given the index() view as its output argument, so the decoded ids land in $cids2 at positions $crows_good
1300				$clens->rld($cvals, $cids2->index($crows_good));
1301				
1302				return $cids2;
1303				}
1304
1305				#line 1312 "Cluster.pd"
1306
1307				=pod
1308
1309				=head2 clusteroffsets
1310
1311				=for sig
1312
1313				Signature: (
1314				int clusterids(n);
1315				int [o]clusteroffsets(k1+1);
1316				int [o]clustervals(k1);
1317				int [o]clusterrows(n);
1318				;
1319				int k1;
1320				)
1321
1322				Encodes datum-to-cluster vector $clusterids() for efficiently mapping
1323				clusters-to-data. Like clusterenc(), but returns cumulative offsets
1324				instead of lengths.
1325
1326				Really just a wrapper for clusterenc(), cumusumover(), and append().
1327
1328				=cut
1329
1330				sub clusteroffsets { ##-- ($coffsets,$cvals,$crows) = clusteroffsets($cids, $coffsets,$cvals,$crows, $kmax)
1331				my ($cids, $coffsets,$cvals,$crows, $kmax) = @_;
1332				my ($clens); ##-- lengths are only an intermediate here, so clusterenc() is passed undef for them
1333				($clens,$cvals,$crows) = clusterenc($cids,undef,$cvals,$crows,$kmax);
1334				$coffsets = $clens->append(0)->rotate(1)->cumusumover; ##-- (0, len[0], len[0]+len[1], ...): one more element than $clens
1335				##-- NOTE(review): a caller-supplied $coffsets is replaced by '=' above, not filled with '.=' as clusterenc() does for its outputs
1336				return ($coffsets,$cvals,$crows);
1337				}
1338				#line 1339 "Cluster.pm"
1339
1340
1341				=head2 clusterdistancematrixenc
1342
1343				=for sig
1344
1345				Signature: (
1346				double data(d,n);
1347				int mask(d,n);
1348				double weight(d);
1349				int clens1(k1);
1350				int crowids1(nc1);
1351				int clens2(k2);
1352				int crowids2(nc2);
1353				double [o]dist(k1,k2);
1354				;
1355				char *distFlag;
1356				char *methodFlag;
1357				)
1358				Types: (sbyte byte short ushort long ulong indx ulonglong longlong
1359				float double ldouble)
1360
1361				=for usage
1362
1363				$dist = clusterdistancematrixenc($data, $mask, $weight, $clens1, $crowids1, $clens2, $crowids2, $distFlag, $methodFlag);
1364				clusterdistancematrixenc($data, $mask, $weight, $clens1, $crowids1, $clens2, $crowids2, $dist, $distFlag, $methodFlag); # all arguments given
1365				$dist = $data->clusterdistancematrixenc($mask, $weight, $clens1, $crowids1, $clens2, $crowids2, $distFlag, $methodFlag); # method call
1366				$data->clusterdistancematrixenc($mask, $weight, $clens1, $crowids1, $clens2, $crowids2, $dist, $distFlag, $methodFlag);
1367
1368				Computes cluster-distance between each pair of clusters in (sequence($k1) x sequence($k2)), where 'x'
1369				is the Cartesian product. Cluster contents are passed as pairs ($clens(),$crowids()) as returned
1370				by the clusterenc() function (assuming that the $cvals() vector returned by clusterenc() is a flat sequence).
1371
1372				The deprecated method clusterdistancematrix() can be simulated by this function in the following
1373				manner: if a clusterdistancematrix() call was:
1374
1375				clustersizes ($cids, $csizes=zeroes(long,$k));
1376				clusterelements($cids, $celts =zeroes(long,$csizes->max)-1);
1377				clusterdistancematrix($data,$msk,$wt, $rowids, $csizes,$celts,
1378				$cdmat=zeroes(double,$k,$rowids->dim(0)),
1379				$distFlag, $methodFlag
1380				);
1381
1382				Then the corresponding use of clusterdistancematrixenc() would be:
1383
1384				($clens,$cvals,$crows) = clusterenc($cids);
1385				clusterdistancematrixenc($data,$msk,$wt,
1386				$clens, $crows, ##-- "real" clusters in output dim 0
1387				$rowids->ones, $rowids, ##-- $rowids as singleton clusters in output dim 1
1388				$cdmat=zeroes(double,$clens->dim(0),$rowids->dim(0)),
1389				$distFlag, $methodFlag);
1390
1391				If your $cvals() are not a flat sequence, you will probably need to do some index-twiddling
1392				to get things into the proper shape:
1393
1394				if ( !all($cvals==$cvals->sequence) || $cvals->dim(0) != $k )
1395				{
1396				my $cdmat0 = $cdmat;
1397				my $nr = $rowids->dim(0);
1398				$cdmat = pdl(double,"inf")->slice("*$k,*$nr")->make_physical(); ##-- "missing" distances are infinite
1399				$cdmat->dice_axis(0,$cvals) .= $cdmat0;
1400				}
1401
1402				$distFlag and $methodFlag are interpreted as for clusterdistance().
1403
1404				See also clusterenc(), clusterdistancematrix().
1405
1406				=pod
1407
1408				Broadcasts over its inputs.
1409
1410				=for bad
1411
1412				C<clusterdistancematrixenc> does not process bad values.
1413				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
1414
1415				=cut
1416
1417
1418
1419
1420				*clusterdistancematrixenc = \&PDL::clusterdistancematrixenc; # alias this package's clusterdistancematrixenc() to the PDL:: sub of the same name
1421
1422
1423
1424
1425
1426
1427				=head2 clusterdistancesenc
1428
1429				=for sig
1430
1431				Signature: (
1432				double data(d,n);
1433				int mask(d,n);
1434				double weight(d);
1435				int coffsets1(k1);
1436				int crowids1(nc1);
1437				int cwhich1(ncmps);
1438				int coffsets2(k2);
1439				int crowids2(nc2);
1440				int cwhich2(ncmps);
1441				double [o]dists(ncmps);
1442				;
1443				char *distFlag;
1444				char *methodFlag;
1445				)
1446				Types: (sbyte byte short ushort long ulong indx ulonglong longlong
1447				float double ldouble)
1448
1449				=for usage
1450
1451				$dists = clusterdistancesenc($data, $mask, $weight, $coffsets1, $crowids1, $cwhich1, $coffsets2, $crowids2, $cwhich2, $distFlag, $methodFlag);
1452				clusterdistancesenc($data, $mask, $weight, $coffsets1, $crowids1, $cwhich1, $coffsets2, $crowids2, $cwhich2, $dists, $distFlag, $methodFlag); # all arguments given
1453				$dists = $data->clusterdistancesenc($mask, $weight, $coffsets1, $crowids1, $cwhich1, $coffsets2, $crowids2, $cwhich2, $distFlag, $methodFlag); # method call
1454				$data->clusterdistancesenc($mask, $weight, $coffsets1, $crowids1, $cwhich1, $coffsets2, $crowids2, $cwhich2, $dists, $distFlag, $methodFlag);
1455
1456				Computes cluster-distance between selected pairs of co-indexed clusters in ($cwhich1,$cwhich2).
1457				Cluster contents are passed as pairs ($coffsetsX(),$crowidsX()) as returned
1458				by the clusteroffsets() function.
1459
1460				$distFlag and $methodFlag are interpreted as for clusterdistance().
1461
1462				See also clusterenc(), clusterdistancematrixenc().
1463
1464				=pod
1465
1466				Broadcasts over its inputs.
1467
1468				=for bad
1469
1470				C<clusterdistancesenc> does not process bad values.
1471				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
1472
1473				=cut
1474
1475
1476
1477
1478				*clusterdistancesenc = \&PDL::clusterdistancesenc; # alias this package's clusterdistancesenc() to the PDL:: sub of the same name
1479
1480
1481
1482
1483
1484
1485				=head2 getclusterwsum
1486
1487				=for sig
1488
1489				Signature: (
1490				double data(d,n);
1491				int mask(d,n);
1492				double clusterwts(k,n);
1493				double [o]cdata(d,k);
1494				int [o]cmask(d,k);
1495				)
1496				Types: (sbyte byte short ushort long ulong indx ulonglong longlong
1497				float double ldouble)
1498
1499				=for usage
1500
1501				($cdata, $cmask) = getclusterwsum($data, $mask, $clusterwts);
1502				getclusterwsum($data, $mask, $clusterwts, $cdata, $cmask); # all arguments given
1503				($cdata, $cmask) = $data->getclusterwsum($mask, $clusterwts); # method call
1504				$data->getclusterwsum($mask, $clusterwts, $cdata, $cmask);
1505
1506				Find cluster centroids by weighted sum. This can be considered an
1507				expensive generalization of the getclustermean() and getclustermedian()
1508				functions. Here, the input PDLs $data() and $mask(), as well as the
1509				output PDL $cdata() are as for getclustermean(). The matrix $clusterwts()
1510				determines the relative weight of each data row in determining the
1511				centroid of each cluster, potentially useful for "fuzzy" clustering.
1512				The equation used to compute cluster means is:
1513
1514				$cdata(d,k) = sum_{n} $clusterwts(k,n) * $data(d,n) * $mask(d,n)
1515
1516				For centroids in the same range as data elements, $clusterwts()
1517				should sum to 1 over each column (k):
1518
1519				all($clusterwts->xchg(0,1)->sumover == 1)
1520
1521				getclustermean() can be simulated by instantiating $clusterwts() with
1522				a uniform distribution over cluster elements:
1523
1524				$clusterwts = zeroes($k,$n);
1525				$clusterwts->indexND(cat($clusterids, xvals($clusterids))->xchg(0,1)) .= 1;
1526				$clusterwts /= $clusterwts->xchg(0,1)->sumover;
1527				getclusterwsum($data,$mask, $clusterwts, $cdata=zeroes($d,$k));
1528
1529				Similarly, getclustermedian() can be simulated by setting $clusterwts() to
1530				1 for cluster medians and otherwise to 0. More sophisticated centroid
1531				discovery methods can be computed by this function by setting
1532				$clusterwts(k,n) to some estimate of the conditional probability
1533				of the datum at row $n given the cluster with index $k:
1534				p(Elt==n|Cluster==k). One
1535				way to achieve such an estimate is to use (normalized inverses of) the
1536				singleton-row-to-cluster distances as output by clusterdistancematrix().
1537
1538				=pod
1539
1540				Broadcasts over its inputs.
1541
1542				=for bad
1543
1544				C<getclusterwsum> does not process bad values.
1545				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
1546
1547				=cut
1548
1549
1550
1551
1552				*getclusterwsum = \&PDL::getclusterwsum; # alias this package's getclusterwsum() to the PDL:: sub of the same name
1553
1554
1555
1556
1557
1558
1559				=head2 attachtonearest
1560
1561				=for sig
1562
1563				Signature: (
1564				double data(d,n);
1565				int mask(d,n);
1566				double weight(d);
1567				int rowids(nr);
1568				double cdata(d,k);
1569				int cmask(d,k);
1570				int [o]clusterids(nr);
1571				double [o]cdist(nr);
1572				;
1573				char *distFlag;
1574				char *methodFlag;
1575				)
1576				Types: (sbyte byte short ushort long ulong indx ulonglong longlong
1577				float double ldouble)
1578
1579				=for usage
1580
1581				($clusterids, $cdist) = attachtonearest($data, $mask, $weight, $rowids, $cdata, $cmask, $distFlag, $methodFlag);
1582				attachtonearest($data, $mask, $weight, $rowids, $cdata, $cmask, $clusterids, $cdist, $distFlag, $methodFlag); # all arguments given
1583				($clusterids, $cdist) = $data->attachtonearest($mask, $weight, $rowids, $cdata, $cmask, $distFlag, $methodFlag); # method call
1584				$data->attachtonearest($mask, $weight, $rowids, $cdata, $cmask, $clusterids, $cdist, $distFlag, $methodFlag);
1585
1586				Assigns each specified data row to the nearest cluster centroid.
1587				Data elements are given by $data() and $mask(), feature weights are
1588				given by $weight(), as usual. Cluster centroids are defined by
1589				$cdata() and $cmask(), and the indices of rows to be attached
1590				are given in the vector $rowids(). The output vector $clusterids()
1591				contains for each specified row index the identifier of the nearest
1592				cluster centroid. The vector $cdist() contains the distance to
1593				the best clusters.
1594
1595				See also: clusterdistancematrix(), attachtonearestd().
1596
1597				=pod
1598
1599				Broadcasts over its inputs.
1600
1601				=for bad
1602
1603				C<attachtonearest> does not process bad values.
1604				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
1605
1606				=cut
1607
1608
1609
1610
1611				*attachtonearest = \&PDL::attachtonearest; # alias this package's attachtonearest() to the PDL:: sub of the same name
1612
1613
1614
1615
1616
1617				#line 1659 "Cluster.pd"
1618
1619				=pod
1620
1621				=head2 attachtonearestd
1622
1623				=for sig
1624
1625				Signature: (
1626				double cdistmat(k,n);
1627				int rowids(nr);
1628				int [o]clusterids(nr);
1629				double [o]dists(nr);
1630				)
1631
1632				Assigns each specified data row to the nearest cluster centroid,
1633				as for attachtonearest(), given the datum-to-cluster distance
1634				matrix $cdistmat(). Currently just a wrapper for a few PDL calls.
1635				In scalar context returns $clusterids(), in list context returns
1636				the list ($clusterids(),$dists()).
1637
1638				=cut
1639
1640				sub attachtonearestd { ##-- ($cids,$dists) = attachtonearestd($cdm,$rowids, $cids,$dists)
1641				my ($cdm,$rowids,$cids,$dists)=@_; ##-- $cids and $dists are optional output ndarrays, one element per entry of $rowids
1642				$cids = zeroes(long, $rowids->dim(0)) if (!defined($cids));
1643				$dists = zeroes(double, $rowids->dim(0)) if (!defined($dists));
1644				
1645				##-- dice matrix: keep only the entries of $cdm along dimension 1 that are named in $rowids
1646				my $cdmr = $cdm->dice_axis(1,$rowids);
1647				
1648				##-- get best: minimum_ind() is given $cids as its output argument; $dists then receives the matrix values at those indices
1649				$cdmr->minimum_ind($cids);
1650				$dists .= $cdmr->index($cids);
1651				
1652				return wantarray ? ($cids,$dists) : $cids; ##-- list context: both outputs; scalar context: cluster ids only
1653				}
1654				#line 1655 "Cluster.pm"
1655
1656
1657				=head2 checkprototypes
1658
1659				=for sig
1660
1661				Signature: (
1662				protos(k);
1663				[o]cprotos(k);
1664				byte [t]otmp(n);
1665				; int nsize => n)
1666				Types: (byte short ushort long)
1667
1668				=for usage
1669
1670				$cprotos = checkprototypes($protos, $nsize);
1671				checkprototypes($protos, $cprotos, $nsize); # all arguments given
1672				$cprotos = $protos->checkprototypes($nsize); # method call
1673				$protos->checkprototypes($cprotos, $nsize);
1674				$protos->inplace->checkprototypes($nsize); # can be used inplace
1675				checkprototypes($protos->inplace, $nsize);
1676
1677				(Deterministic)
1678
1679				Ensure that the assignment $protos() from $k objects to
1680				integer "prototype" indices in the range [0,$n-1] contains no repetitions of any
1681				of the $n possible prototype values. One use for this function is
1682				the restriction of (randomly generated) potential clustering solutions
1683				for $k clusters in which each cluster is represented by a
1684				"prototypical" element from a data sample of size $n.
1685
1686				Requires: $n >= $k.
1687
1688				=pod
1689
1690				Broadcasts over its inputs.
1691
1692				=for bad
1693
1694				C<checkprototypes> does not process bad values.
1695				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
1696
1697				=cut
1698
1699
1700
1701
1702				*checkprototypes = \&PDL::checkprototypes; # alias this package's checkprototypes() to the PDL:: sub of the same name
1703
1704
1705
1706
1707
1708
1709				=head2 checkpartitions
1710
1711				=for sig
1712
1713				Signature: (
1714				part(n);
1715				[o]cpart(n);
1716				[t]ptmp(k);
1717				; int ksize => k)
1718				Types: (byte short ushort long)
1719
1720				=for usage
1721
1722				$cpart = checkpartitions($part, $ksize);
1723				checkpartitions($part, $cpart, $ksize); # all arguments given
1724				$cpart = $part->checkpartitions($ksize); # method call
1725				$part->checkpartitions($cpart, $ksize);
1726				$part->inplace->checkpartitions($ksize); # can be used inplace
1727				checkpartitions($part->inplace, $ksize);
1728
1729				(Deterministic)
1730
1731				Ensure that the partitioning $part() of $n objects into $k bins
1732				(identified by integer values in the range [0,$k-1])
1733				contains at least one instance of each of the
1734				$k possible values. One use for this function is
1735				the restriction of (randomly generated) potential clustering solutions
1736				for $n elements into $k clusters to those which assign at least one
1737				element to each cluster.
1738
1739				Requires: $n >= $k.
1740
1741				=pod
1742
1743				Broadcasts over its inputs.
1744
1745				=for bad
1746
1747				C<checkpartitions> does not process bad values.
1748				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
1749
1750				=cut
1751
1752
1753
1754
1755				*checkpartitions = \&PDL::checkpartitions; # alias this package's checkpartitions() to the PDL:: sub of the same name
1756
1757
1758
1759
1760
1761				#line 1813 "Cluster.pd"
1762
1763				=pod
1764
1765				=head2 randomprototypes
1766
1767				=for sig
1768
1769				Signature: (int k; int n; [o]prototypes(k))
1770
1771				Generate a random set of $k prototype indices drawn from $n objects,
1772				ensuring that no object is used more than once. Calls checkprototypes().
1773
1774				See also: checkprototypes(), randomassign(), checkpartitions(), randompartition().
1775
1776				=cut
1777
1778				sub randomprototypes { ##-- $protos = randomprototypes($k,$n, $protos); the last argument is an optional output ndarray
1779				my ($k,$n,$protos) = @_;
1780				$protos = zeroes(long, $k) if (!defined($protos)); ##-- default output: $k longs
1781				$protos .= PDL->random($k)*$n; ##-- $k random values scaled by $n, assigned into $protos
1782				checkprototypes($protos->inplace, $n); ##-- in-place pass; per the checkprototypes() POD this is what removes repeated values
1783				return $protos;
1784				}
1785
1786				#line 1845 "Cluster.pd"
1787
1788				=pod
1789
1790				=head2 randompartition
1791
1792				=for sig
1793
1794				Signature: (int k; int n; [o]partition(n))
1795
1796				Generate a partitioning of $n objects into $k clusters,
1797				ensuring that every cluster contains at least one object.
1798				Calls checkpartitions().
1799				This method is identical in functionality to randomassign(),
1800				but may be faster if $k is significantly smaller than $n.
1801
1802				See also: randomassign(), checkpartitions(), checkprototypes(), randomprototypes().
1803
1804				=cut
1805
1806				sub randompartition { ##-- $part = randompartition($k,$n, $part); the last argument is an optional output ndarray
1807				my ($k,$n,$part) = @_;
1808				$part = zeroes(long, $n) if (!defined($part)); ##-- default output: $n longs
1809				$part .= PDL->random($n)*$k; ##-- $n random values scaled by $k, assigned into $part
1810				checkpartitions($part->inplace, $k); ##-- in-place pass; per the checkpartitions() POD this is what guarantees every bin is used
1811				return $part;
1812				}
1813
1814				#line 1884 "Cluster.pd"
1815				##---------------------------------------------------------------------
1816				=pod
1817
1818				=head1 COMMON ARGUMENTS
1819
1820				Many of the functions described above require one or
1821				more of the following parameters:
1822
1823				=over 4
1824
1825				=item d
1826
1827				The number of features defined for each data element.
1828
1829				=item n
1830
1831				The number of data elements to be clustered.
1832
1833				=item k
1834
1835				=item nclusters
1836
1837				The number of desired clusters.
1838
1839				=item data(d,n)
1840
1841				A matrix representing the data to be clustered, double-valued.
1842
1843				=item mask(d,n)
1844
1845				A matrix indicating which data values are missing. If
1846				mask(i,j) == 0, then data(i,j) is treated as missing.
1847
1848				=item weights(d)
1849
1850				The (feature-) weights that are used to calculate the distance.
1851
1852				B<Note:> Not all distance metrics make use of weights;
1853				you must provide some nonetheless.
1854
1855				=item clusterids(n)
1856
1857				A clustering solution. $clusterids() maps data elements
1858				(row indices in $data()) to values in the range [0,$k-1].
1859
1860				=back
1861
1862				=cut
1863
1864				##---------------------------------------------------------------------
1865				=pod
1866
1867				=head2 Distance Metrics
1868
1869				Distances between data elements (and cluster centroids, where applicable)
1870				are computed using one of a number of built-in metrics. Which metric
1871				is to be used for a given computation is indicated by a character
1872				flag denoted above with $distFlag(). In the following, w[i] represents
1873				a weighting factor in the $weights() matrix, and $W represents the total
1874				of all weights.
1875
1876				Currently implemented distance
1877				metrics and the corresponding flags are:
1878
1879				=over 4
1880
1881				=item e
1882
1883				Pseudo-Euclidean distance:
1884
1885				dist_e(x,y) = 1/W * sum_{i=1..d} w[i] * (x[i] - y[i])^2
1886
1887				Note that this is not the "true" Euclidean distance, which is defined as:
1888
1889				dist_E(x,y) = sqrt( sum_{i=1..d} (x[i] - y[i])^2 )
1890
1891				=item b
1892
1893				City-block ("Manhattan") distance:
1894
1895				dist_b(x,y) = 1/W * sum_{i=1..d} w[i] * |x[i] - y[i]|
1896
1897				=item c
1898
1899				Pearson correlation distance:
1900
1901				dist_c(x,y) = 1-r(x,y)
1902
1903				where r is the Pearson correlation coefficient:
1904
1905				r(x,y) = 1/d * sum_{i=1..d} (x[i]-mean(x))/stddev(x) * (y[i]-mean(y))/stddev(y)
1906
1907				=item a
1908
1909				Absolute value of the correlation,
1910
1911				dist_a(x,y) = 1-|r(x,y)|
1912
1913				where r(x,y) is the Pearson correlation coefficient.
1914
1915				=item u
1916
1917				Uncentered correlation (cosine of the angle):
1918
1919				dist_u(x,y) = 1-r_u(x,y)
1920
1921				where:
1922
1923				r_u(x,y) = 1/d * sum_{i=1..d} (x[i]/sigma0(x)) * (y[i]/sigma0(y))
1924
1925				and:
1926
1927				sigma0(w) = sqrt( 1/d * sum_{i=1..d} w[i]^2 )
1928
1929				=item x
1930
1931				Absolute uncentered correlation,
1932
1933				dist_x(x,y) = 1-|r_u(x,y)|
1934
1935				=item s
1936
1937				Spearman's rank correlation.
1938
1939				dist_s(x,y) = 1-r_s(x,y) ~= dist_c(ranks(x),ranks(y))
1940
1941				where r_s(x,y) is the Spearman rank correlation. Weights are ignored.
1942
1943				=item k
1944
1945				Kendall's tau (does not use weights).
1946
1947				dist_k(x,y) = 1 - tau(x,y)
1948
1949				=item (other values)
1950
1951				For other values of dist, the default (Euclidean distance) is used.
1952
1953				=back
1954
1955				=cut
1956
1957				##---------------------------------------------------------------------
1958				=pod
1959
1960				=head2 Link Methods
1961
1962				For hierarchical clustering, the 'link method' must be specified
1963				by a character flag, denoted above as $methodFlag.
1964				Known link methods are:
1965
1966				=over 4
1967
1968				=item s
1969
1970				Pairwise minimum-linkage ("single") clustering.
1971
1972				Defines the distance between two clusters as the
1973				least distance between any two of their respective elements.
1974
1975				=item m
1976
1977				Pairwise maximum-linkage ("complete") clustering.
1978
1979				Defines the distance between two clusters as the
1980				greatest distance between any two of their respective elements.
1981
1982				=item a
1983
1984				Pairwise average-linkage clustering (centroid distance using arithmetic mean).
1985
1986				Defines the distance between two clusters as the
1987				distance between their respective centroids, where each
1988				cluster centroid is defined as the arithmetic mean of
1989				that cluster's elements.
1990
1991				=item c
1992
1993				Pairwise centroid-linkage clustering (centroid distance using median).
1994
1995				Identifies the distance between two clusters as the
1996				distance between their respective centroids, where each
1997				cluster centroid is computed as the median of
1998				that cluster's elements.
1999
2000				=item (other values)
2001
2002				Behavior for other values is currently undefined.
2003
2004				=back
2005
2006				For the first three, either the distance matrix or the gene expression data is
2007				sufficient to perform the clustering algorithm. For pairwise centroid-linkage
2008				clustering, however, the gene expression data are always needed, even if the
2009				distance matrix itself is available.
2010
2011				=cut
2012
2013				##---------------------------------------------------------------------
2014				=pod
2015
2016				=head1 ACKNOWLEDGEMENTS
2017
2018				Perl by Larry Wall.
2019
2020				PDL by Karl Glazebrook, Tuomas J. Lukka, Christian Soeller, and others.
2021
2022				C Clustering Library by
2023				Michiel de Hoon,
2024				Seiya Imoto,
2025				and Satoru Miyano.
2026
2027				Original Algorithm::Cluster module by John Nolan and Michiel de Hoon.
2028
2029				=cut
2030
2031				##----------------------------------------------------------------------
2032				=pod
2033
2034				=head1 KNOWN BUGS
2035
2036				Dimensional requirements are sometimes too strict.
2037
2038				Passing weights to the Spearman and Kendall distance metrics wastes space.
2039
2040				=cut
2041
2042				##---------------------------------------------------------------------
2043				=pod
2044
2045				=head1 AUTHOR
2046
2047				Bryan Jurish E<lt>moocow@cpan.orgE<gt> wrote and maintains the PDL::Cluster distribution.
2048
2049				Michiel de Hoon wrote the underlying C clustering library for cDNA microarray data.
2050
2051				=head1 COPYRIGHT
2052
2053				PDL::Cluster is a set of wrappers around the C Clustering library for cDNA microarray data.
2054
2055				=over 4
2056
2057				=item *
2058
2059				The C clustering library for cDNA microarray data.
2060				Copyright (C) 2002-2005 Michiel Jan Laurens de Hoon.
2061
2062				This library was written at the Laboratory of DNA Information Analysis,
2063				Human Genome Center, Institute of Medical Science, University of Tokyo,
2064				4-6-1 Shirokanedai, Minato-ku, Tokyo 108-8639, Japan.
2065				Contact: michiel.dehoon 'AT' riken.jp
2066
2067				See the files F, F and F in the PDL::Cluster distribution
2068				for details.
2069
2070				=item *
2071
2072				PDL::Cluster wrappers copyright (C) Bryan Jurish 2005-2018. All rights reserved.
2073				This package is free software, and entirely without warranty.
2074				You may redistribute it and/or modify it under the same terms
2075				as Perl itself.
2076
2077				=back
2078
2079				=head1 SEE ALSO
2080
2081				perl(1), PDL(3perl), Algorithm::Cluster(3perl), cluster(1),
2082				L
2083
2084				=cut
2085				#line 2086 "Cluster.pm"
2086
2087				# Return a true value so that loading this module with use/require succeeds
2088				
2089				1;