File Coverage

Cluster.pd

Criterion	Covered	Total	%
statement	0	43	0.0
branch	0	20	0.0
condition			n/a
subroutine	0	8	0.0
pod	8	8	100.0
total	8	79	10.1

line	stmt	bran	sub	pod	code
1					#-- Mode: CPerl --
2
3					##======================================================================
4					## Header Administrivia
5					##======================================================================
6
7					our $VERSION = '1.54.004';
8					pp_setversion("'$VERSION'");
9
10					use PDL::Bad; ##-- for $PDL::Bad::Status
11
12					##------------------------------------------------------
13					## pm additions
14					pp_addpm({At=>'Top'},<<'EOPM');
15
16					#---------------------------------------------------------------------------
17					# File: PDL::Cluster.pm
18					# Author: Bryan Jurish
19					# Description: PDL wrappers for the C Clustering library.
20					#
21					# Copyright (c) 2005-2021 Bryan Jurish. All rights reserved.
22					# This program is free software. You may modify and/or
23					# distribute it under the same terms as Perl itself.
24					#
25					#---------------------------------------------------------------------------
26					# Based on the C clustering library for cDNA microarray data,
27					# Copyright (C) 2002-2005 Michiel Jan Laurens de Hoon.
28					#
29					# The C clustering library was written at the Laboratory of DNA Information
30					# Analysis, Human Genome Center, Institute of Medical Science, University of
31					# Tokyo, 4-6-1 Shirokanedai, Minato-ku, Tokyo 108-8639, Japan.
32					# Contact: michiel.dehoon 'AT' riken.jp
33					#
34					# See the files "cluster.c" and "cluster.h" in the PDL::Cluster distribution
35					# for details.
36					#---------------------------------------------------------------------------
37
38					=pod
39
40					=head1 NAME
41
42					PDL::Cluster - PDL interface to the C Clustering Library
43
44					=head1 SYNOPSIS
45
46					use PDL::Cluster;
47
48					##-----------------------------------------------------
49					## Data Format
50					$d = 42; ##-- number of features
51					$n = 1024; ##-- number of data elements
52
53					$data = random($d,$n); ##-- data matrix
54					$elt = $data->slice(",($i)"); ##-- element data vector
55					$ftr = $data->slice("($j),"); ##-- feature vector over all elements
56
57					$wts = ones($d)/$d; ##-- feature weights
58					$msk = ones($d,$n); ##-- missing-datum mask (1=ok)
59
60					##-----------------------------------------------------
61					## Library Utilties
62
63					$mean = $ftr->cmean();
64					$median = $ftr->cmedian();
65
66					calculate_weights($data,$msk,$wts, $cutoff,$expnt,
67					$weights);
68
69					##-----------------------------------------------------
70					## Distance Functions
71
72					clusterdistance($data,$msk,$wts, $n1,$n2,$idx1,$idx2,
73					$dist,
74					$distFlag, $methodFlag2);
75
76					distancematrix($data,$msk,$wts, $distmat, $distFlag);
77
78					##-----------------------------------------------------
79					## Partitioning Algorithms
80
81					getclustermean($data,$msk,$clusterids,
82					$ctrdata, $ctrmask);
83
84					getclustermedian($data,$msk,$clusterids,
85					$ctrdata, $ctrmask);
86
87					getclustermedoid($distmat,$clusterids,$centroids,
88					$errorsums);
89
90					kcluster($k, $data,$msk,$wts, $npass,
91					$clusterids, $error, $nfound,
92					$distFlag, $methodFlag);
93
94					kmedoids($k, $distmat,$npass,
95					$clusterids, $error, $nfound);
96
97					##-----------------------------------------------------
98					## Hierarchical Algorithms
99
100					treecluster($data,$msk,$wts,
101					$tree, $lnkdist,
102					$distFlag, $methodFlag);
103
104					treeclusterd($data,$msk,$wts, $distmat,
105					$tree, $lnkdist,
106					$distFlag, $methodFlag);
107
108					cuttree($tree, $nclusters,
109					$clusterids);
110
111					##-----------------------------------------------------
112					## Self-Organizing Maps
113
114					somcluster($data,$msk,$wts, $nx,$ny,$tau,$niter,
115					$clusterids,
116					$distFlag);
117
118					##-----------------------------------------------------
119					## Principal Component Analysis
120
121					pca($U, $S, $V);
122
123					##-----------------------------------------------------
124					## Extensions
125
126					rowdistances($data,$msk,$wts, $rowids1,$rowids2, $distvec, $distFlag);
127					clusterdistances($data,$msk,$wts, $rowids, $index2,
128					$dist,
129					$distFlag, $methodFlag);
130
131					clustersizes($clusterids, $clustersizes);
132					clusterelements($clustierids, $clustersizes, $eltids);
133					clusterelementmask($clusterids, $eltmask);
134
135					clusterdistancematrix($data,$msk,$wts,
136					$rowids, $clustersizes, $eltids,
137					$dist,
138					$distFlag, $methodFlag);
139
140					clusterenc($clusterids, $clens,$cvals,$crowids, $k);
141					clusterdec($clens,$cvals,$crowids, $clusterids, $k);
142					clusteroffsets($clusterids, $coffsets,$cvals,$crowids, $k);
143					clusterdistancematrixenc($data,$msk,$wts,
144					$clens1,$crowids1, $clens2,$crowids2,
145					$dist,
146					$distFlag, $methodFlag);
147
148					=cut
149
150					EOPM
151					## /pm additions
152					##------------------------------------------------------
153
154					##------------------------------------------------------
155					## Exports: None
156					#pp_export_nothing();
157
158					##------------------------------------------------------
159					## Includes
160					pp_addhdr(<<'EOH');
161					#include "ccluster.h"
162					EOH
163
164					##======================================================================
165					## XS additions
166					##======================================================================
167					pp_addxs(<<'EOXS');
168
169					char *
170					library_version()
171					CODE:
172					RETVAL = CLUSTERVERSION;
173					OUTPUT:
174					RETVAL
175
176					EOXS
177
178					##======================================================================
179					## C Support Functions
180					##======================================================================
181
182					##------------------------------------------------------
183					## Debugging Utilities
184					pp_addhdr(<<'EOH');
185
186					//#define CDEBUG 1
187					//#undef CDEBUG
188
189					void print_pp_dbl(int nrows, int ncols, double **pp) {
190					int i,j;
191					for (i=0; i
192					printf(" %d:[ ", i);
193					for (j=0; j
194					printf("%d=%lf ", j, pp[i][j]);
195					}
196					printf("]\n");
197					}
198					}
199					EOH
200
201
202					##----------------------------------------------------------------------
203					## Allocation Utilities
204					pp_addhdr(<<'EOH');
205					static
206					void **pp_alloc(int nrows)
207					{
208					return ((void *)malloc(nrowssizeof(void**)));
209					}
210					EOH
211
212					##----------------------------------------------------------------------
213					## Assignment utilities (new)
214					pp_addhdr(<<'EOH');
215					static
216					double *p2pp_dbl(int nrows, int ncols, double p, double **matrix)
217					{
218					int i;
219					if (!(p && nrows && ncols)) return NULL;
220					if (!matrix) matrix = (double **)pp_alloc(nrows);
221					for (i=0; i < nrows; i++) {
222					matrix[i] = p + (i*ncols);
223					#ifdef CDEBUG
224					printf("p2pp_dbl(nr=%d,nc=%d,p=%p) : (p+%d*%d)=%p\n", nrows,ncols,p, i,ncols,matrix[i]);
225					#endif
226					}
227					return matrix;
228					}
229					EOH
230
231					pp_addhdr(<<'EOH');
232					int *p2pp_int(int nrows, int ncols, int p, int **matrix)
233					{
234					int i;
235					if (!(p && nrows && ncols)) return NULL;
236					if (!matrix) matrix = (int **)pp_alloc(nrows);
237					for (i=0; i < nrows; i++) {
238					matrix[i] = p + (i*ncols);
239					}
240					return matrix;
241					}
242					EOH
243
244					##-- ragged conversion: just using p2pp_dbl()
245					pp_addhdr(<<'EOH');
246					static
247					double *p2pp_dbl_ragged(int nrows, int ncols, double p, double **matrix)
248					{
249					int i;
250					if (!(p && nrows && ncols)) return NULL;
251					if (!matrix) matrix = (double **)pp_alloc(nrows);
252					for (i=0; i < nrows; i++) {
253					matrix[i] = p + (i*ncols);
254					}
255					return matrix;
256					}
257					EOH
258
259
260					pp_addhdr(<<'EOH');
261					static
262					void pp2pdl_ragged_dbl(int nrows, int ncols, double *pp, double p)
263					{
264					int i,j;
265					if (!(pp && nrows && ncols)) return;
266					for (i=0; i
267					for (j=0; j
268					p[i*ncols+j] = pp[i][j];
269					p[j*ncols+i] = pp[i][j];
270					}
271					p[i*ncols+i] = 0;
272					}
273					}
274					EOH
275
276
277					pp_addhdr(<<'EOH');
278					static
279					void pp2pdl_dbl(int nrows, int ncols, double *pp, double p)
280					{
281					int i,j;
282					if (!(pp && nrows && ncols)) return;
283					for (i=0; i
284					for (j=0; j
285					p[i*ncols+j] = pp[i][j];
286					}
287					}
288					}
289					EOH
290
291					##------------------------------------------------------
292					## tree clustering solution utilities
293
294					pp_addhdr(<<'EOH');
295					static
296					Node* p2node(int nnodes, int* tree, double *lnkdist)
297					{
298					int i;
299					Node *nod = NULL;
300					if (!(nnodes && (tree \|\| lnkdist))) return NULL;
301					nod = (Node)malloc(nnodessizeof(Node));
302					for (i=0; i < nnodes; ++i) {
303					nod[i].left = tree ? tree[i*2+0] : 0;
304					nod[i].right = tree ? tree[i*2+1] : 0;
305					nod[i].distance = lnkdist ? lnkdist[i] : 0;
306					}
307					return nod;
308					}
309
310					void node2p(int nnodes, Node* nod, int* tree, double *lnkdist)
311					{
312					int i;
313					if (!(nnodes && nod && (tree \|\| lnkdist))) return;
314					for (i=0; i < nnodes; ++i) {
315					if (tree) {
316					tree[i*2+0] = nod[i].left;
317					tree[i*2+1] = nod[i].right;
318					}
319					if (lnkdist) {
320					lnkdist[i] = nod[i].distance;
321					}
322					}
323					return;
324					}
325
326					EOH
327
328					##======================================================================
329					## Library Utility Routines
330					##======================================================================
331
332					##------------------------------------------------------
333					## Mean
334					pp_def
335					('cmean',
336					Pars => 'double a(n); double [o]b()',
337					GenericTypes => [D],
338					Code => '$b() = mean($SIZE(n), $P(a));',
339					Doc => 'Computes arithmetic mean of the vector $a(). See also PDL::Primitive::avg().',
340					);
341
342					##------------------------------------------------------
343					## Median
344					pp_def
345					('cmedian',
346					Pars => 'double a(n); double [o]b()',
347					GenericTypes => [D],
348					Code => '$b() = median($SIZE(n), $P(a));',
349					Doc => 'Computes median of the vector $a(). See also PDL::Primitive::median().',
350					);
351
352
353					##------------------------------------------------------
354					## Weights
355					pp_def
356					('calculate_weights',
357					Pars => join("\n ", '',
358					q(double data(d,n);), ##-- n="rows"\|"elts", d="columns"\|"features"
359					q(int mask(d,n);), ##-- n="rows"\|"elts", d="columns"\|"features"
360					q(double weight(d);), ##-- feature-weighting factors
361					q(double cutoff();), ##-- distance cutoff
362					q(double exponent();), ##-- distance exponent
363					q(double [o]oweights(d);), ##-- output weights
364					''
365					),
366					OtherPars => join("\n ", '', 'char *distFlag;', ''),
367					Code =>
368					('
369					double datapp = (double )pp_alloc($SIZE(n));
370					int maskpp = (int )pp_alloc($SIZE(n));
371					int transpose=0;
372					int i;
373					double *owp;
374					//
375					threadloop %{
376					p2pp_dbl($SIZE(n), $SIZE(d), $P(data), datapp);
377					p2pp_int($SIZE(n), $SIZE(d), $P(mask), maskpp);
378					owp = calculate_weights($SIZE(n), $SIZE(d), datapp, maskpp,
379					$P(weight), transpose, *$COMP(distFlag),
380					$cutoff(), $exponent());
381					if (owp) {
382					loop (d) %{
383					$oweights() = owp[d];
384					%}
385					free(owp);
386					}
387					%}
388					//
389					/-- cleanup --/
390					if (datapp) free(datapp);
391					if (maskpp) free(maskpp);
392					'),
393
394					Doc=>
395					('
396					This function calculates weights for the features using the weighting scheme
397					proposed by Michael Eisen:
398
399					w[i] = 1.0 / sum_{j where dist(i,j)
400
401					where the cutoff and the exponent are specified by the user.
402					'),
403					);
404
405					##======================================================================
406					## Chapter 2: Distance Functions
407					##======================================================================
408
409					##------------------------------------------------------
410					## Cluster Distance
411					pp_def
412					('clusterdistance',
413					Pars => join("\n ", '',
414					q(double data(d,n);), ##-- n="rows"\|"elts", d="columns"\|"features"
415					q(int mask(d,n);), ##-- n="rows"\|"elts", d="columns"\|"features"
416					q(double weight(d);), ##-- normalization
417					q(int n1();),
418					q(int n2();),
419					q(int index1(n1);),
420					q(int index2(n2);),
421					q(double [o]dist();),
422					#q(int transpose=>0;),
423					''
424					),
425					OtherPars => join("\n ", '', 'char distFlag;', 'char methodFlag;', ''),
426
427					Doc =>
428					('
429					Computes distance between two clusters $index1() and $index2().
430					Each of the $index() vectors represents a single cluster whose values
431					are the row-indices in the $data() matrix of the elements assigned
432					to the respective cluster. $n1() and $n2() are the number of elements
433					in $index1() and $index2(), respectively. Each $index$i() must have
434					at least $n$i() elements allocated.
435
436					B the $methodFlag argument is interpreted differently than
437					by the treecluster() method, namely:
438
439					=over 4
440
441					=item a
442
443					Distance between the arithmetic means of the two clusters,
444					as for treecluster() "f".
445
446					=item m
447
448					Distance between the medians of the two clusters,
449					as for treecluster() "c".
450
451					=item s
452
453					Minimum pairwise distance between members of the two clusters,
454					as for treecluster() "s".
455
456					=item x
457
458					Maximum pairwise distance between members of the two clusters
459					as for treecluster() "m".
460
461					=item v
462
463					Average of the pairwise distances between members of the two clusters,
464					as for treecluster() "a".
465
466					=back
467
468					=cut
469
470					'),
471
472					Code =>
473					('
474					double datapp = (double )pp_alloc($SIZE(n));
475					int maskpp = (int )pp_alloc($SIZE(n));
476					int transpose=0;
477					double retval;
478
479					threadloop %{
480					p2pp_dbl($SIZE(n), $SIZE(d), $P(data), datapp);
481					p2pp_int($SIZE(n), $SIZE(d), $P(mask), maskpp);
482					retval = clusterdistance($SIZE(n), $SIZE(d), datapp, maskpp,
483					$P(weight), $n1(), $n2(), $P(index1), $P(index2),
484					$COMP(distFlag), $COMP(methodFlag), transpose);
485					$dist() = retval;
486					%}
487
488					/-- cleanup --/
489					if (datapp) free(datapp);
490					if (maskpp) free(maskpp);
491					'),
492					);
493
494					##----------------------------------------------------------------------
495					## Distance Matrix
496					pp_def
497					('distancematrix',
498					Pars => join("\n ", '',
499					q(double data(d,n);),
500					q(int mask(d,n);),
501					q(double weight(d);),
502					#q(int transpose();),
503					q(double [o]dists(n,n);),
504					''
505					),
506					OtherPars => join("\n ", '', 'char *distFlag;', ''),
507					Code =>
508					('
509					int transpose = 0;
510					double datapp = (double )pp_alloc($SIZE(n));
511					int maskpp = (int )pp_alloc($SIZE(n));
512					double **retval;
513					//
514					threadloop %{
515					p2pp_dbl($SIZE(n), $SIZE(d), $P(data), datapp);
516					p2pp_int($SIZE(n), $SIZE(d), $P(mask), maskpp);
517					retval = distancematrix($SIZE(n), $SIZE(d), datapp, maskpp,
518					$P(weight), *$COMP(distFlag), transpose);
519					//
520					if (!retval) barf("Cluster matrix allocation failed!");
521					pp2pdl_ragged_dbl($SIZE(n), $SIZE(n), retval, $P(dists));
522					//
523					if (retval) free(retval);
524					%}
525					//
526					/-- cleanup --/
527					if (datapp) free(datapp);
528					if (maskpp) free(maskpp);
529					'),
530
531					Doc => 'Compute triangular distance matrix over all data points.',
532					);
533
534					##======================================================================
535					## REMOVED: Random number generation
536					## + REMOVED: initran()
537
538					##======================================================================
539					## Chapter 3: Partitioning Algorithms
540					## + REMOVED: randomassign
541
542					##----------------------------------------------------------------------
543					## Cluster Centroids: Generic
544					pp_def
545					('getclustercentroids',
546					Pars => join("\n ", '',
547					q(double data(d,n);), ##-- n="rows"\|"elts", d="columns"\|"features"
548					q(int mask(d,n);), ##-- n="rows"\|"elts", d="columns"\|"features"
549					#q(int transpose();), ##-- probably dangerous
550					q(int clusterids(n);), ##-- maps elts to cluster-ids
551					q(double [o]cdata(d,k);), ##-- centroid data
552					q(int [o]cmask(d,k);), ##-- centroid data
553					''
554					),
555					OtherPars => join("\n ", '', 'char *ctrMethodFlag;', ''),
556					Code =>
557					('
558					int transpose = 0;
559					double datapp = (double )pp_alloc($SIZE(n));
560					int maskpp = (int )pp_alloc($SIZE(n));
561					double cdatapp = (double )pp_alloc($SIZE(k));
562					int cmaskpp = (int )pp_alloc($SIZE(k));
563
564					threadloop %{
565					p2pp_dbl($SIZE(n), $SIZE(d), $P(data), datapp);
566					p2pp_int($SIZE(n), $SIZE(d), $P(mask), maskpp);
567					p2pp_dbl($SIZE(k), $SIZE(d), $P(cdata), cdatapp);
568					p2pp_int($SIZE(k), $SIZE(d), $P(cmask), cmaskpp);
569
570					getclustercentroids($SIZE(k), $SIZE(n), $SIZE(d), datapp, maskpp,
571					$P(clusterids), cdatapp, cmaskpp, transpose, *$COMP(ctrMethodFlag));
572					%}
573
574					/-- cleanup --/
575					if (datapp) free(datapp);
576					if (maskpp) free(maskpp);
577					if (cdatapp) free(cdatapp);
578					if (cmaskpp) free(cmaskpp);
579					'),
580
581					Doc => 'Find cluster centroids by arithmetic mean (C) or median over each dimension (C).'
582					);
583
584
585					##----------------------------------------------------------------------
586					## Cluster Centroids: Mean
587					## + now just a wrapper for 'getclustercentroids(...,"a")'
588					pp_add_exported('','getclustermean');
589					pp_addpm(<<'EOPM');
590
591					=pod
592
593					=head2 getclustermean
594
595					=for sig
596
597					Signature: (
598					double data(d,n);
599					int mask(d,n);
600					int clusterids(n);
601					double [o]cdata(d,k);
602					int [o]cmask(d,k);
603					)
604
605					Really just a wrapper for getclustercentroids(...,"a").
606
607					=cut
608
609	0		0	1	sub getclustermean {
610	0				my ($data,$mask,$cids,$cdata,$cmask) = @_;
611					return getclustercentroids($dat,$mask,$cids,$cdata,$cmask,'a');
612					}
613
614					EOPM
615
616					##----------------------------------------------------------------------
617					## Cluster Centroids: Median
618					## + now just a wrapper for 'getclustercentroids(...,"m")'
619					pp_add_exported('','getclustermedian');
620					pp_addpm(<<'EOPM');
621
622					=pod
623
624					=head2 getclustermedian
625
626					=for sig
627
628					Signature: (
629					double data(d,n);
630					int mask(d,n);
631					int clusterids(n);
632					double [o]cdata(d,k);
633					int [o]cmask(d,k);
634					)
635
636					Really just a wrapper for getclustercentroids(...,"m").
637
638					=cut
639
640	0		0	1	sub getclustermedian {
641	0				my ($data,$mask,$cids,$cdata,$cmask) = @_;
642					return getclustercentroids($dat,$mask,$cids,$cdata,$cmask,'m');
643					}
644
645					EOPM
646
647
648					##----------------------------------------------------------------------
649					## Cluster Centroids: Medoids
650					pp_def
651					('getclustermedoids',
652					Pars => join("\n ", '',
653					q(double distance(n,n);), ##-- (in) distance matrix (ragged, lower-left-triangle)
654					q(int clusterids(n);), ##-- (in) cluster ids indexed by gene-id
655					q(int [o]centroids(k);), ##-- (out) centroid-(elt-)ids indexed by cluster-id
656					q(double [o]errors(k);), ##-- (out) maps cluster-id c -> sum(dist(x \in c, ctr(c)))
657					''
658					),
659					Code =>
660					('
661					double distpp = (double )pp_alloc($SIZE(n));
662					threadloop %{
663					p2pp_dbl_ragged($SIZE(n), $SIZE(n), $P(distance), distpp);
664					getclustermedoids($SIZE(k), $SIZE(n), distpp,
665					$P(clusterids), $P(centroids), $P(errors));
666					%}
667					//
668					/-- cleanup --/
669					if (distpp) free(distpp);
670					'),
671
672					Doc => 'The getclustermedoid routine calculates the cluster centroids, given to which
673					cluster each element belongs. The centroid is defined as the element with the
674					smallest sum of distances to the other elements.
675					'
676					);
677
678
679					##----------------------------------------------------------------------
680					## K-Means
681					pp_def
682					('kcluster',
683					Pars => join("\n ", '',
684					q(int nclusters();), ##-- number of clusters to find
685					q(double data(d,n);), ##-- n="rows"\|"elts", d="columns"\|"features"
686					q(int mask(d,n);), ##-- n="rows"\|"elts", d="columns"\|"features"
687					q(double weight(d);), ##-- weights
688					#q(int transpose();), ##-- probably dangerous
689					q(int npass();), ##-- n passes
690					q(int [o]clusterids(n);), ##-- maps elts to cluster-ids
691					q(double [o]error();) , ##-- solution error
692					q(int [o]nfound();), ##-- number of times solution found
693					''
694					),
695					OtherPars => join("\n ", '', 'char distFlag;', 'char ctrMethodFlag;', ''),
696					Code =>
697					('
698					int transpose = 0;
699					double datapp = (double )pp_alloc($SIZE(n));
700					int maskpp = (int )pp_alloc($SIZE(n));
701					//
702					threadloop %{
703					p2pp_dbl($SIZE(n), $SIZE(d), $P(data), datapp);
704					p2pp_int($SIZE(n), $SIZE(d), $P(mask), maskpp);
705					kcluster($nclusters(), $SIZE(n), $SIZE(d), datapp, maskpp,
706					$P(weight), transpose, $npass(), $COMP(ctrMethodFlag), $COMP(distFlag),
707					$P(clusterids), $P(error), $P(nfound));
708					%}
709					//
710					/-- cleanup --/
711					if (datapp) free(datapp);
712					if (maskpp) free(maskpp);
713					'),
714					Doc => ('K-Means clustering algorithm. The "ctrMethodFlag" determines how
715					clusters centroids are to be computed; see getclustercentroids() for details.
716
717					Because the C library code reads from the C if and only if
718					C is 0, before writing to it, it would be inconvenient to
719					set it to C<[io]>. However for efficiency reasons, as of 2.096, PDL
720					will not convert it (force a read-back on the conversion) for you
721					if you pass in the wrongly-typed data. This means that you should
722					be careful to pass in C data of the right size if you set C
723					to 0.
724
725					See also: kmedoids().
726					'),
727					);
728
729
730
731					##----------------------------------------------------------------------
732					## K-Means (specify maximum # / iterations) : NOT AVAILABLE
733					## + NOT AVAILABLE: kclusteri
734
735					##----------------------------------------------------------------------
736					## K-Medoids
737					pp_def
738					('kmedoids',
739					Pars => join("\n ", '',
740					q(int nclusters();), ##-- number of clusters to find
741					q(double distance(n,n);), ##-- distance matrix
742					q(int npass();), ##-- n passes
743					q(int [o]clusterids(n);), ##-- maps elts to cluster-ids
744					q(double [o]error();) , ##-- solution error
745					q(int [o]nfound();), ##-- number of times solution found
746					''
747					),
748					Code => ('
749					double distpp = (double )pp_alloc($SIZE(n));
750					threadloop %{
751					p2pp_dbl_ragged($SIZE(n), $SIZE(n), $P(distance), distpp);
752					kmedoids($nclusters(), $SIZE(n), distpp,
753					$npass(), $P(clusterids), $P(error), $P(nfound));
754					%}
755					/-- cleanup --/
756					if (distpp) free(distpp);
757					'),
758
759					Doc => 'K-Medoids clustering algorithm (uses distance matrix).
760
761					See also: kcluster().
762					'
763					);
764
765					##----------------------------------------------------------------------
766					## K-Medoids (given max #/iterations)
767					## + NOT AVAILABLE: kmedoidsi
768
769					##======================================================================
770					## Chapter 4: Hierarchical Clustering
771
772					##----------------------------------------------------------------------
773					## Hierarchical Clustering, without distances
774					pp_def
775					('treecluster',
776					Pars => join("\n ", '',
777					q(double data(d,n);), ##-- n="rows"\|"elts", d="columns"\|"features"
778					q(int mask(d,n);), ##-- n="rows"\|"elts", d="columns"\|"features"
779					q(double weight(d);), ##-- weights
780					q(int [o]tree(2,n);), ##-- result tree: uses only (2,n-1)
781					q(double [o]lnkdist(n);), ##-- link distance (n-1)
782					''
783					),
784					OtherPars => join("\n ", '', 'char distFlag;', 'char methodFlag;', ''),
785					Code => ('
786					int transpose = 0;
787					double datapp = (double )pp_alloc($SIZE(n));
788					int maskpp = (int )pp_alloc($SIZE(n));
789					double **distpp = NULL;
790					Node *nod = NULL;
791					int nmax = $SIZE(n)-1;
792					threadloop %{
793					p2pp_dbl($SIZE(n), $SIZE(d), $P(data), datapp);
794					p2pp_int($SIZE(n), $SIZE(d), $P(mask), maskpp);
795					nod = treecluster($SIZE(n), $SIZE(d), datapp, maskpp,
796					$P(weight), transpose, $COMP(distFlag), $COMP(methodFlag),
797					NULL);
798
799					node2p(nmax, nod, $P(tree), $P(lnkdist));
800					$tree(2=>0, n=>nmax) = 0;
801					$tree(2=>1, n=>nmax) = 0;
802					$lnkdist(n=>nmax) = 0;
803					%}
804					//
805					/-- cleanup --/
806					if (datapp) free(datapp);
807					if (maskpp) free(maskpp);
808					if (nod) free(nod);
809					'),
810
811					Doc => '
812					Hierachical agglomerative clustering.
813
814					$tree(2,n) represents the clustering solution.
815					Each row in the matrix describes one linking event,
816					with the two columns containing the name of the nodes that were joined.
817					The original genes are numbered 0..(n-1), nodes are numbered
818					-1..-(n-1).
819					$tree(2,n) thus actually uses only (2,n-1) cells.
820
821					$lnkdist(n) represents the distance between the two subnodes that were joined.
822					As for $tree(), $lnkdist() uses only (n-1) cells.
823					'
824					);
825
826					##----------------------------------------------------------------------
827					## Hierarchical Clustering, given distances
828					pp_def
829					('treeclusterd',
830					Pars => join("\n ", '',
831					q(double data(d,n);), ##-- n="rows"\|"elts", d="columns"\|"features"
832					q(int mask(d,n);), ##-- n="rows"\|"elts", d="columns"\|"features"
833					q(double weight(d);), ##-- weights
834					#q(int transpose();), ##-- probably dangerous
835					q(double distances(n,n);), ##-- distance matrix, should already be populated
836					q(int [o]tree(2,n);), ##-- result tree: uses only (2,n-1)
837					q(double [o]lnkdist(n);), ##-- link distance uses only (n-1)
838					''
839					),
840					OtherPars => join("\n ", '', 'char distFlag;', 'char methodFlag;', ''),
841					Code => '
842					int transpose = 0;
843					double datapp = (double )pp_alloc($SIZE(n));
844					int maskpp = (int )pp_alloc($SIZE(n));
845					double distpp = (double )pp_alloc($SIZE(n));
846					Node *nod = NULL;
847					int nmax = $SIZE(n)-1;
848					//
849					threadloop %{
850					p2pp_dbl($SIZE(n), $SIZE(d), $P(data), datapp);
851					p2pp_int($SIZE(n), $SIZE(d), $P(mask), maskpp);
852					p2pp_dbl_ragged($SIZE(n), $SIZE(n), $P(distances), distpp);
853					nod = treecluster($SIZE(n), $SIZE(d), datapp, maskpp,
854					$P(weight), transpose, $COMP(distFlag), $COMP(methodFlag),
855					distpp);
856
857					node2p(nmax, nod, $P(tree), $P(lnkdist));
858					$tree(2=>0, n=>nmax) = 0;
859					$tree(2=>1, n=>nmax) = 0;
860					$lnkdist(n=>nmax) = 0;
861					%}
862					/-- cleanup --/
863					if (datapp) free(datapp);
864					if (maskpp) free(maskpp);
865					if (distpp) free(distpp);
866					if (nod) free(nod);
867					',
868
869					Doc => '
870					Hierachical agglomerative clustering using given distance matrix.
871
872					See distancematrix() and treecluster(), above.
873					'
874					);
875
876					##----------------------------------------------------------------------
877					## Hierarchical Clustering: cut
878					pp_def('cuttree',
879					Pars => join("\n ", '',
880					q(int tree(2,n);), ##-- result tree using (n-1,2)
881					q(int nclusters();), ##-- number of desired clusters
882					q(int [o]clusterids(n);), ##-- output map: cluster-id by elt-id
883					''
884					),
885					Code => ('
886					Node *nod = p2node($SIZE(n)-1,$P(tree),NULL);
887					cuttree($SIZE(n), nod, $nclusters(), $P(clusterids));
888					if (nod) free(nod);
889					'),
890					Doc => '
891					Cluster selection for hierarchical clustering trees.
892					'
893					);
894
895
896					##======================================================================
897					## Chapter 5: SOM Clustering
898
899					##----------------------------------------------------------------------
900					## SOM clustering, without saving centroid vectors
901					pp_def
902					('somcluster',
903					Pars => join("\n ", '',
904					q(double data(d,n);), ##-- n="rows"\|"elts", d="columns"\|"features"
905					q(int mask(d,n);), ##-- n="rows"\|"elts", d="columns"\|"features"
906					q(double weight(d);), ##-- weights
907					q(int nxnodes();), ##-- number of horizontal cells in target topology
908					q(int nynodes();), ##-- number of vertical cells in target topology
909					q(double inittau();), ##-- initial value for \tau: usually ca. 0.02
910					q(int niter();), ##-- total number of iterations
911					#q(double [o]celldata(d,ny,nx);), ##-- centroid data vectors
912					q(int [o]clusterids(2,n);), ##-- output cluster indices (x,y) by elt
913					''
914					),
915					OtherPars => join("\n ", '', 'char *distFlag;', ''),
916					Code => '
917					int transpose = 0;
918					double datapp = (double )pp_alloc($SIZE(n));
919					int maskpp = (int )pp_alloc($SIZE(n));
920					threadloop %{
921					p2pp_dbl($SIZE(n), $SIZE(d), $P(data), datapp);
922					p2pp_int($SIZE(n), $SIZE(d), $P(mask), maskpp);
923					somcluster($SIZE(n), $SIZE(d), datapp, maskpp,
924					$P(weight), transpose, $nxnodes(), $nynodes(),
925					$inittau(), $niter(), *$COMP(distFlag), NULL,
926					(int (*)[2])($P(clusterids)));
927					%}
928					/-- cleanup --/
929					if (datapp) free(datapp);
930					if (maskpp) free(maskpp);
931					',
932					Doc => 'Self-Organizing Map clustering, does not return centroid data.'
933					);
934
935
936					##======================================================================
937					## Chapter 6: PCA
938
939					pp_def
940					('pca',
941					Pars => join("\n ", '',
942					q(double [o]U(d,n);), ##-- U_out = U_in · S · V^T
943					q(double [o]S(d);), ##-- Singular values (diagonal(?))
944					q(double [o]V(d,d);), ##-- orthogonal matrix of the decomposition
945					''
946					),
947					Code => '
948					double Upp = (double )pp_alloc($SIZE(n));
949					double Vpp = (double )pp_alloc($SIZE(d));
950					if ($SIZE(n) < $SIZE(d)) {
951					barf("svd(): Number of rows (=%d) must be >= number of columns (=%d)!\n", $SIZE(n), $SIZE(d));
952					}
953					//
954					threadloop %{
955					p2pp_dbl($SIZE(n), $SIZE(d), $P(U), Upp);
956					p2pp_dbl($SIZE(d), $SIZE(d), $P(V), Vpp);
957					//
958					pca($SIZE(n), $SIZE(d), Upp, Vpp, $P(S));
959					%}
960					/-- cleanup --/
961					if (Upp) free(Upp);
962					if (Vpp) free(Vpp);
963					',
964					Doc => '
965					Principal Component Analysis (SVD), operates in-place on $U() and requires ($SIZE(n) E= $SIZE(d)).
966					'
967					);
968
969					##======================================================================
970					## Extensions
971
972					##------------------------------------------------------
973					## rowdistances(): selected row-row distances
974					pp_def
975					('rowdistances',
976					Pars => join("\n ", '',
977					q(double data(d,n);), ##-- n="rows"\|"elts", d="columns"\|"features"
978					q(int mask(d,n);), ##-- n="rows"\|"elts", d="columns"\|"features"
979					q(double weight(d);), ##-- normalization
980					q(int rowids1(ncmps);), ##-- $data() row-ids: 0 <= $rowids1[$i] <= $n
981					q(int rowids2(ncmps);), ##-- $data() row-ids: 0 <= $rowids2[$i] <= $n
982					q(double [o]dist(ncmps);),
983					''
984					),
985					OtherPars => join("\n ", '', 'char *distFlag;', ''),
986					Doc => '
987					Computes pairwise distances between rows of $data().
988					$rowids1() contains the row-indices of the left (first) comparison operand,
989					and $rowids2() the row-indices of the right (second) comparison operand. Since each
990					of these are assumed to be indices into the first dimension $data(), it should be the case that:
991
992					0 <= $rowids1(i),rowids2(i) < $SIZE(n) for 0 <= i < $SIZE(ncmps)
993
994					See also clusterdistance(), clusterdistances(), clusterdistancematrixenc(), distancematrix().
995					',
996
997					Code => '
998					double datapp = (double )pp_alloc($SIZE(n));
999					int maskpp = (int )pp_alloc($SIZE(n));
1000					int rowid1, rowid2;
1001					char methodChar = \'x\'; /-- doesnt matter --/
1002					int transpose=0;
1003					//
1004					threadloop %{
1005					p2pp_dbl($SIZE(n), $SIZE(d), $P(data), datapp);
1006					p2pp_int($SIZE(n), $SIZE(d), $P(mask), maskpp);
1007					//
1008					loop(ncmps) %{
1009					rowid1 = $rowids1();
1010					rowid2 = $rowids2();
1011					$dist() = clusterdistance($SIZE(n), $SIZE(d), datapp, maskpp, $P(weight),
1012					1, 1,
1013					&rowid1, &rowid2,
1014					*$COMP(distFlag), methodChar, transpose);
1015					%}
1016					%}
1017					/-- cleanup --/
1018					if (datapp) free(datapp);
1019					if (maskpp) free(maskpp);
1020					',
1021					);
1022
1023					##------------------------------------------------------
1024					## Cluster + rows -> row distance vectors to cluster
1025					pp_def
1026					('clusterdistances',
1027					Pars => join("\n ", '',
1028					q(double data(d,n);), ##-- n="rows"\|"elts", d="columns"\|"features"
1029					q(int mask(d,n);), ##-- n="rows"\|"elts", d="columns"\|"features"
1030					q(double weight(d);), ##-- normalization
1031					q(int rowids(nr);),
1032					q(int index2(n2);),
1033					q(double [o]dist(nr);),
1034					''
1035					),
1036					OtherPars => join("\n ", '', 'char distFlag;', 'char methodFlag;', ''),
1037
1038					Doc => '
1039					Computes pairwise distance(s) from each of $rowids() as a singleton cluster
1040					with the cluster represented by $index2(), which should be an index
1041					vector as for clusterdistance(). See also clusterdistancematrixenc().
1042					',
1043
1044					Code => '
1045					double datapp = (double )pp_alloc($SIZE(n));
1046					int maskpp = (int )pp_alloc($SIZE(n));
1047					int transpose=0;
1048					//
1049					threadloop %{
1050					p2pp_dbl($SIZE(n), $SIZE(d), $P(data), datapp);
1051					p2pp_int($SIZE(n), $SIZE(d), $P(mask), maskpp);
1052					loop(nr) %{
1053					$dist() = clusterdistance($SIZE(n), $SIZE(d), datapp, maskpp, $P(weight),
1054					1, $SIZE(n2), &($rowids()), $P(index2),
1055					$COMP(distFlag), $COMP(methodFlag), transpose);
1056					%}
1057					%}
1058					/-- cleanup --/
1059					if (datapp) free(datapp);
1060					if (maskpp) free(maskpp);
1061					',
1062					);
1063
1064					##------------------------------------------------------
1065					## Cluster-Ids -> Cluster sizes
1066					pp_def
1067					('clustersizes',
1068					Pars => q(int clusterids(n); int [o]clustersizes(k);),
1069					Doc => '
1070					Computes the size (number of elements) of each cluster in $clusterids().
1071					Useful for allocating less than maximmal space for $clusterelements().
1072					',
1073					Code => '
1074					broadcastloop %{
1075					int cid, csize;
1076					loop (k) %{ $clustersizes() = 0; %}
1077					loop (n) %{
1078					cid = $clusterids();
1079					if (cid < 0 \|\| cid >= $SIZE(k)'
1080					.($PDL::Bad::Status ? ' \|\| $ISBADVAR(cid,clusterids)' : '')
1081					.') continue; /-- sanity check --/
1082					$clustersizes(k=>cid)++;
1083					%}
1084					%}
1085					$PDLSTATESETGOOD(clustersizes); /* always make sure the output is "good" */
1086					',
1087					HandleBad => 1,
1088					BadDoc => 'The output piddle should never be marked BAD.',
1089					);
1090
1091
1092					##------------------------------------------------------
1093					## clusterelements(): Cluster-Ids -> Item-Ids
1094					pp_def
1095					('clusterelements',
1096					Pars => q(int clusterids(n); int [o]clustersizes(k); int [o]eltids(mcsize,k);),
1097					Doc => '
1098					Converts the vector $clusterids() to a matrix $eltids() of element (row) indices
1099					indexed by cluster-id. $mcsize() is the maximum number of elements per cluster,
1100					at most $n. The output PDLs $clustersizes() and $eltids() can be passed to
1101					clusterdistancematrix().
1102					',
1103
1104					Code => '
1105					int cid, csize;
1106					loop (k) %{ $clustersizes() = 0; %}
1107					loop (n) %{
1108					cid = $clusterids();
1109					csize = $clustersizes(k=>cid)++;
1110					$eltids(mcsize=>csize,k=>cid) = n;
1111					%}
1112					',
1113					);
1114
1115					##------------------------------------------------------
1116					## clusterelementmask(): Cluster-Ids x Item-Ids -> bool (mask)
1117					pp_def
1118					('clusterelementmask',
1119					Pars => q(int clusterids(n); byte [o]eltmask(k,n);),
1120					Doc => '
1121					Get boolean membership mask $eltmask() based on cluster assignment in $clusterids().
1122					No value in $clusterids() may be greater than or equal to $k.
1123					On completion, $eltmask(k,n) is a true value iff $clusterids(n)=$k.
1124					',
1125
1126					Code => '
1127					int cid, csize;
1128					loop (n) %{
1129					cid = $clusterids();
1130					$eltmask(k=>cid) = 1;
1131					%}
1132					',
1133					);
1134
1135					##------------------------------------------------------
1136					## clusterdistancematrix(): all row<->cluster distances
1137					pp_def
1138					('clusterdistancematrix',
1139					Pars => join("\n ", '',
1140					q(double data(d,n);), ##-- n="rows"\|"elts", d="columns"\|"features"
1141					q(int mask(d,n);), ##-- n="rows"\|"elts", d="columns"\|"features"
1142					q(double weight(d);), ##-- normalization
1143					q(int rowids(nr);), ##-- rows to check
1144					q(int clustersizes(k);), ##-- cluster sizes
1145					q(int eltids(mcsize,k);), ##-- row-ids by cluster
1146					q(double [o]dist(k,nr);),
1147					''
1148					),
1149					OtherPars => join("\n ", '', 'char distFlag;', 'char methodFlag;', ''),
1150					Doc => '
1151					B in favor of clusterdistancematrixenc().
1152					In the future, this method is expected to become a wrapper for clusterdistancematrixenc().
1153
1154					Computes distance between each row index in $rowids()
1155					considered as a singleton cluster
1156					and each of the $k clusters whose elements are given by a single row of $eltids().
1157					$clustersizes() and $eltids() are as output by the clusterelements() method.
1158
1159					See also clusterdistance(), clusterdistances(), clustersizes(), clusterelements(), clusterdistancematrixenc().
1160					',
1161
1162					Code => '
1163					double datapp = (double )pp_alloc($SIZE(n));
1164					int maskpp = (int )pp_alloc($SIZE(n));
1165					int eltidspp = (int )pp_alloc($SIZE(k));
1166					int transpose=0;
1167					int rowid;
1168					//
1169					threadloop %{
1170					p2pp_dbl($SIZE(n), $SIZE(d), $P(data), datapp);
1171					p2pp_int($SIZE(n), $SIZE(d), $P(mask), maskpp);
1172					p2pp_int($SIZE(k), $SIZE(mcsize), $P(eltids), eltidspp);
1173					//
1174					loop(nr) %{
1175					rowid = $rowids();
1176					loop (k) %{
1177					$dist() = clusterdistance($SIZE(n), $SIZE(d), datapp, maskpp, $P(weight),
1178					1, $clustersizes(),
1179					&rowid, eltidspp[k],
1180					$COMP(distFlag), $COMP(methodFlag), transpose);
1181					%}
1182					%}
1183					%}
1184					/-- cleanup --/
1185					if (datapp) free(datapp);
1186					if (maskpp) free(maskpp);
1187					if (eltidspp) free(eltidspp);
1188					',
1189					);
1190
1191					##------------------------------------------------------
1192					## clusterenc(): ccs-like encoding cluster-to-row
1193
1194					pp_add_exported('','clusterenc');
1195
1196					pp_addpm(<<'EOPM');
1197
1198					=pod
1199
1200					=head2 clusterenc
1201
1202					=for sig
1203
1204					Signature: (
1205					int clusterids(n);
1206					int [o]clusterlens(k1);
1207					int [o]clustervals(k1);
1208					int [o]clusterrows(n);
1209					;
1210					int k1;
1211					)
1212
1213					Encodes datum-to-cluster vector $clusterids() for efficiently mapping
1214					clusters-to-data. Returned PDL $clusterlens() holds the lengths of each
1215					cluster containing at least one element. $clustervals() holds the IDs
1216					of such clusters as they appear as values in $clusterids(). $clusterrows()
1217					is such that:
1218
1219					all( rld($clusterlens, $clustervals) == $clusterids )
1220
1221					... if all available cluster-ids are in use.
1222
1223					If specified, $k1 is a perl scalar
1224					holding the number of clusters (maximum cluster index + 1); an
1225					appropriate value will guessed from $clusterids() otherwise.
1226
1227					Really just a wrapper for some lower-level PDL and PDL::Cluster calls.
1228
1229					=cut
1230
1231	0		0	1	sub clusterenc {
1232	0	0			my ($cids, $clens,$cvals,$crows, $kmax) = @_;
1233					$kmax = $cids->max+1 if (!defined($kmax));
1234
1235	0	0			##-- cluster sizes
1236	0				$clens = zeroes(long, $kmax) if (!defined($clens));
1237					clustersizes($cids,$clens);
1238
1239	0	0			##-- cluster-id values
	0
1240	0				if (!defined($cvals)) { $cvals = PDL->sequence(long,$kmax); }
1241					else { $cvals .= PDL->sequence(long,$kmax); }
1242
1243					##-- cluster-row values: handle BAD and negative values
1244					#if (!defined($crows)) { $crows = $cids->qsorti->where($cids->isgood & $cids>=0); }
1245					#else { $crows .= $cids->qsorti->where($cids->isgood & $cids>=0); }
1246
1247	0	0			##-- cluster-row values: treat BAD and negative values like anything else
	0
1248	0				if (!defined($crows)) { $crows = $cids->qsorti; }
1249					else { $crows .= $cids->qsorti; }
1250	0
1251					return ($clens,$cvals,$crows);
1252					}
1253
1254					EOPM
1255
1256
1257					##------------------------------------------------------
1258					## clusterdec(): ccs-like decoding cluster-to-row
1259
1260					pp_add_exported('','clusterdec');
1261
1262					pp_addpm(<<'EOPM');
1263
1264					=pod
1265
1266					=head2 clusterdec
1267
1268					=for sig
1269
1270					Signature: (
1271					int clusterlens(k1);
1272					int clustervals(k1);
1273					int clusterrows(n);
1274					int [o]clusterids(n);
1275					)
1276
1277					Decodes cluster-to-datum vectors ($clusterlens,$clustervals,$clusterrows)
1278					into a single datum-to-cluster vector $clusterids().
1279					$(clusterlens,$clustervals,$clusterrows) are as returned by the clusterenc() method.
1280
1281					Un-addressed row-index values in $clusterrows() will be assigned the pseudo-cluster (-1)
1282					in $clusterids().
1283
1284					Really just a wrapper for some lower-level PDL calls.
1285
1286					=cut
1287
1288	0		0	1	sub clusterdec {
1289					my ($clens,$cvals,$crows, $cids2) = @_;
1290
1291	0	0			##-- get $cids
1292	0				$cids2 = zeroes($cvals->type, $crows->dims) if (!defined($cids2));
1293					$cids2 .= -1;
1294
1295					##-- trim $crows
1296	0				#my $crows_good = $crows->slice("0:".($clens->sum-1)); ##-- assume bad indices are at END of $crows (BAD,inf,...)
1297					my $crows_good = $crows->slice(-$clens->sum.":-1"); ##-- assume bad indices are at BEGINNING of $crows (-1, ...)
1298
1299	0				##-- decode
1300					$clens->rld($cvals, $cids2->index($crows_good));
1301	0
1302					return $cids2;
1303					}
1304
1305					EOPM
1306
1307					##------------------------------------------------------
1308					## clusteroffsets(): ccs-like encoding cluster-to-row
1309
1310					pp_add_exported('','clusteroffsets');
1311
1312					pp_addpm(<<'EOPM');
1313
1314					=pod
1315
1316					=head2 clusteroffsets
1317
1318					=for sig
1319
1320					Signature: (
1321					int clusterids(n);
1322					int [o]clusteroffsets(k1+1);
1323					int [o]clustervals(k1);
1324					int [o]clusterrows(n);
1325					;
1326					int k1;
1327					)
1328
1329					Encodes datum-to-cluster vector $clusterids() for efficiently mapping
1330					clusters-to-data. Like clusterenc(), but returns cumulative offsets
1331					instead of lengths.
1332
1333					Really just a wrapper for clusterenc(), cumusumover(), and append().
1334
1335					=cut
1336
1337	0		0	1	sub clusteroffsets {
1338	0				my ($cids, $coffsets,$cvals,$crows, $kmax) = @_;
1339	0				my ($clens);
1340	0				($clens,$cvals,$crows) = clusterenc($cids,undef,$cvals,$crows,$kmax);
1341					$coffsets = $clens->append(0)->rotate(1)->cumusumover;
1342	0
1343					return ($coffsets,$cvals,$crows);
1344					}
1345
1346					EOPM
1347
1348					##------------------------------------------------------
1349					## clusterdistancematrixenc(): all cluster<->cluster distances for "encoded" cluster-to-row matrices
1350					pp_def
1351					('clusterdistancematrixenc',
1352					Pars => join("\n ", '',
1353					q(double data(d,n);), ##-- n="rows"\|"elts", d="columns"\|"features"
1354					q(int mask(d,n);), ##-- n="rows"\|"elts", d="columns"\|"features"
1355					q(double weight(d);), ##-- normalization
1356					q(int clens1(k1);), ##-- (encoded): X (dim=0) cluster sizes
1357					q(int crowids1(nc1);), ##-- (encoded): X (dim=0) clustered-element row-ids
1358					q(int clens2(k2);), ##-- (encoded): Y (dim=1) cluster sizes
1359					q(int crowids2(nc2);), ##-- (encoded): Y (dim=1) clustered-element row-ids
1360					q(double [o]dist(k1,k2);),
1361					''
1362					),
1363					OtherPars => join("\n ", '', 'char distFlag;', 'char methodFlag;', ''),
1364					Doc => '
1365					Computes cluster-distance between each pair of clusters in (sequence($k1) x sequence($k2)), where \'x\'
1366					is the Cartesian product. Cluster contents are passed as pairs ($clens(),$crowids()) as returned
1367					by the clusterenc() function (assuming that the $cvals() vector returned by clusterenc() is a flat sequence).
1368
1369					The deprecated method clusterdistancematrix() can be simulated by this function in the following
1370					manner: if a clusterdistancematrix() call was:
1371
1372					clustersizes ($cids, $csizes=zeroes(long,$k));
1373					clusterelements($cids, $celts =zeroes(long,$csizes->max)-1);
1374					clusterdistancematrix($data,$msk,$wt, $rowids, $csizes,$celts,
1375					$cdmat=zeroes(double,$k,$rowids->dim(0)),
1376					$distFlag, $methodFlag
1377					);
1378
1379					Then the corresponding use of clusterdistancematrixenc() would be:
1380
1381					($clens,$cvals,$crows) = clusterenc($cids);
1382					clusterdistancematrixenc($data,$msk,$wt,
1383					$clens, $crows, ##-- "real" clusters in output dim 0
1384					$rowids->ones, $rowids, ##-- $rowids as singleton clusters in output dim 1
1385					$cdmat=zeroes(double,$clens->dim(0),$rowids->dim(0)),
1386					$distFlag, $methodFlag);
1387
1388					If your $cvals() are not a flat sequence, you will probably need to do some index-twiddling
1389					to get things into the proper shape:
1390
1391					if ( !all($cvals==$cvals->sequence) \|\| $cvals->dim(0) != $k )
1392					{
1393					my $cdmat0 = $cdmat;
1394					my $nr = $rowids->dim(0);
1395					$cdmat = pdl(double,"inf")->slice("$k,$nr")->make_physical(); ##-- "missing" distances are infinite
1396					$cdmat->dice_axis(0,$cvals) .= $cdmat0;
1397					}
1398
1399					$distFlag and $methodFlag are interpreted as for clusterdistance().
1400
1401					See also clusterenc(), clusterdistancematrix().
1402					',
1403
1404					Code => '
1405					double datapp = (double )pp_alloc($SIZE(n));
1406					int maskpp = (int )pp_alloc($SIZE(n));
1407					int transpose=0;
1408					int crowids1p, crowids2p;
1409					//
1410					threadloop %{
1411					p2pp_dbl($SIZE(n), $SIZE(d), $P(data), datapp);
1412					p2pp_int($SIZE(n), $SIZE(d), $P(mask), maskpp);
1413					crowids1p = $P(crowids1);
1414					//
1415					loop(k1) %{
1416					crowids2p = $P(crowids2);
1417					loop (k2) %{
1418					$dist() = clusterdistance($SIZE(n), $SIZE(d), datapp, maskpp, $P(weight),
1419					$clens1(), $clens2(),
1420					crowids1p, crowids2p,
1421					$COMP(distFlag), $COMP(methodFlag), transpose);
1422					crowids2p += $clens2();
1423					%}
1424					crowids1p += $clens1();
1425					%}
1426					%}
1427					/-- cleanup --/
1428					if (datapp) free(datapp);
1429					if (maskpp) free(maskpp);
1430					',
1431					);
1432
1433
1434					##------------------------------------------------------
1435					## clusterdistancesenc(): selected cluster<->cluster distances for "encoded" cluster-to-row matrices
1436					pp_def
1437					('clusterdistancesenc',
1438					Pars => join("\n ", '',
1439					q(double data(d,n);), ##-- n="rows"\|"elts", d="columns"\|"features"
1440					q(int mask(d,n);), ##-- n="rows"\|"elts", d="columns"\|"features"
1441					q(double weight(d);), ##-- normalization
1442					q(int coffsets1(k1);), ##-- (encoded): X (dim=0) cluster offsets (k1+1)
1443					q(int crowids1(nc1);), ##-- (encoded): X (dim=0) clustered-element row-ids
1444					q(int cwhich1(ncmps);), ##-- selected left comparison operands
1445					q(int coffsets2(k2);), ##-- (encoded): Y (dim=1) cluster offsets (k2+1)
1446					q(int crowids2(nc2);), ##-- (encoded): Y (dim=1) clustered-element row-ids
1447					q(int cwhich2(ncmps);), ##-- selected right comparison operands
1448					q(double [o]dists(ncmps);), ##-- output matrix
1449					''
1450					),
1451					OtherPars => join("\n ", '', 'char distFlag;', 'char methodFlag;', ''),
1452					Doc => '
1453					Computes cluster-distance between selected pairs of co-indexed clusters in ($cwhich1,$cwhich2).
1454					Cluster contents are passed as pairs ($coffsetsX(),$crowidsX()) as returned
1455					by the clusteroffsets() function.
1456
1457					$distFlag and $methodFlag are interpreted as for clusterdistance().
1458
1459					See also clusterenc(), clusterdistancematrixenc().
1460					',
1461
1462					Code => '
1463					double datapp = (double )pp_alloc($SIZE(n));
1464					int maskpp = (int )pp_alloc($SIZE(n));
1465					int transpose=0;
1466					//
1467					threadloop %{
1468					p2pp_dbl($SIZE(n), $SIZE(d), $P(data), datapp);
1469					p2pp_int($SIZE(n), $SIZE(d), $P(mask), maskpp);
1470					//
1471					loop (ncmps) %{
1472					int c1 = $cwhich1();
1473					int c2 = $cwhich2();
1474					int succ_c1=c1+1;
1475					int succ_c2=c2+1;
1476					int beg1 = $coffsets1(k1=>c1);
1477					int beg2 = $coffsets2(k2=>c2);
1478					int len1 = $coffsets1(k1=>succ_c1) - beg1;
1479					int len2 = $coffsets2(k2=>succ_c2) - beg2;
1480					int *crowids1p = $P(crowids1) + beg1;
1481					int *crowids2p = $P(crowids2) + beg2;
1482
1483					$dists() = clusterdistance($SIZE(n), $SIZE(d), datapp, maskpp, $P(weight),
1484					len1, len2,
1485					crowids1p, crowids2p,
1486					$COMP(distFlag), $COMP(methodFlag), transpose);
1487					%}
1488					%}
1489					/-- cleanup --/
1490					if (datapp) free(datapp);
1491					if (maskpp) free(maskpp);
1492					',
1493					);
1494
1495					##----------------------------------------------------------------------
1496					## Cluster Centroids via Weighted Sum [p(datum_n\|cluster_k)]
1497					pp_def
1498					('getclusterwsum',
1499					Pars => join("\n ", '',
1500					q(double data(d,n);), ##-- n="rows"\|"elts", d="columns"\|"features"
1501					q(int mask(d,n);), ##-- n="rows"\|"elts", d="columns"\|"features"
1502					q(double clusterwts(k,n);), ##-- maps (cluster,elt) to weight [p(elt\|cluster)]
1503					## : should probably sum to 1 over each cluster (k)
1504					q(double [o]cdata(d,k);), ##-- centroid data
1505					q(int [o]cmask(d,k);), ##-- centroid mask
1506					''
1507					),
1508					Code =>
1509					('
1510					int rid, rwt, cmaskdk;
1511					loop (d) %{
1512					loop (k) %{
1513					cmaskdk = 0;
1514					loop (n) %{
1515					if ($mask()) {
1516					cmaskdk = 1;
1517					$cdata() += $clusterwts() * $data();
1518					}
1519					%}
1520					$cmask() = cmaskdk;
1521					%}
1522					%}
1523					'),
1524
1525					Doc => '
1526					Find cluster centroids by weighted sum. This can be considered an
1527					expensive generalization of the getclustermean() and getclustermedian()
1528					functions. Here, the input PDLs $data() and $mask(), as well as the
1529					output PDL $cdata() are as for getclustermean(). The matrix $clusterwts()
1530					determines the relative weight of each data row in determining the
1531					centroid of each cluster, potentially useful for "fuzzy" clustering.
1532					The equation used to compute cluster means is:
1533
1534					$cdata(d,k) = sum_{n} $clusterwts(k,n) * $data(d,n) * $mask(d,n)
1535
1536					For centroids in the same range as data elements, $clusterwts()
1537					should sum to 1 over each column (k):
1538
1539					all($clusterwts->xchg(0,1)->sumover == 1)
1540
1541					getclustermean() can be simulated by instantiating $clusterwts() with
1542					a uniform distribution over cluster elements:
1543
1544					$clusterwts = zeroes($k,$n);
1545					$clusterwts->indexND(cat($clusterids, xvals($clusterids))->xchg(0,1)) .= 1;
1546					$clusterwts /= $clusterwts->xchg(0,1)->sumover;
1547					getclusterwsum($data,$mask, $clusterwts, $cdata=zeroes($d,$k));
1548
1549					Similarly, getclustermedian() can be simulated by setting $clusterwts() to
1550					1 for cluster medians and otherwise to 0. More sophisticated centroid
1551					discovery methods can be computed by this function by setting
1552					$clusterwts(k,n) to some estimate of the conditional probability
1553					of the datum at row $n given the cluster with index $k:
1554					p(Elt==n\|Cluster==k). One
1555					way to achieve such an estimate is to use (normalized inverses of) the
1556					singleton-row-to-cluster distances as output by clusterdistancematrix().
1557
1558					'
1559					);
1560
1561					##----------------------------------------------------------------------
1562					## Attach data to nearest centroid
1563					pp_def
1564					('attachtonearest',
1565					Pars => join("\n ", '',
1566					q(double data(d,n);), ##-- n="rows"\|"elts", d="columns"\|"features"
1567					q(int mask(d,n);), ##-- n="rows"\|"elts", d="columns"\|"features"
1568					q(double weight(d);), ##-- feature weights
1569					q(int rowids(nr);), ##-- rows to attach
1570					q(double cdata(d,k);), ##-- centroid data
1571					q(int cmask(d,k);), ##-- centroid mask
1572					q(int [o]clusterids(nr);), ##-- output cluster ids
1573					q(double [o]cdist(nr);), ##-- distances to best clusters
1574					''
1575					),
1576					OtherPars => join("\n ", '', 'char distFlag;', 'char methodFlag;', ''),
1577					Code =>
1578					('
1579					double datapp = (double )pp_alloc($SIZE(n));
1580					int maskpp = (int )pp_alloc($SIZE(n));
1581					double cdatapp = (double )pp_alloc($SIZE(k));
1582					int cmaskpp = (int )pp_alloc($SIZE(k));
1583					double *tmpdatapp[2];
1584					int *tmpmaskpp[2];
1585					int transpose=0;
1586					//
1587					threadloop %{
1588					int tmprowid = 0;
1589					int tmpctrid = 1;
1590					int ni;
1591					int ki, kbest;
1592					double dist, dbest;
1593					//
1594					p2pp_dbl($SIZE(n), $SIZE(d), $P(data), datapp);
1595					p2pp_int($SIZE(n), $SIZE(d), $P(mask), maskpp);
1596					p2pp_dbl($SIZE(k), $SIZE(d), $P(cdata), cdatapp);
1597					p2pp_int($SIZE(k), $SIZE(d), $P(cmask), cmaskpp);
1598					//
1599					/-- loop over all target rows --/
1600					loop (nr) %{
1601					ni = $rowids();
1602					tmpdatapp[tmprowid] = datapp[ni];
1603					tmpmaskpp[tmprowid] = maskpp[ni];
1604					//
1605					/-- initialize --/
1606					tmpdatapp[tmpctrid] = cdatapp[0];
1607					tmpmaskpp[tmpctrid] = cmaskpp[0];
1608					kbest = 0;
1609					dbest = clusterdistance(2, $SIZE(d), tmpdatapp, tmpmaskpp, $P(weight),
1610					1, 1, &tmprowid, &tmpctrid,
1611					$COMP(distFlag), $COMP(methodFlag), transpose);
1612					//
1613					/-- loop over all centroids --/
1614					for (ki=1; ki < $SIZE(k); ki++) {
1615					tmpdatapp[tmpctrid] = cdatapp[ki];
1616					tmpmaskpp[tmpctrid] = cmaskpp[ki];
1617					//
1618					dist = clusterdistance(2, $SIZE(d), tmpdatapp, tmpmaskpp, $P(weight),
1619					1, 1, &tmprowid, &tmpctrid,
1620					$COMP(distFlag), $COMP(methodFlag), transpose);
1621					if (dist < dbest) {
1622					kbest = ki;
1623					dbest = dist;
1624					}
1625					}
1626					//
1627					/-- save best data --/
1628					$clusterids() = kbest;
1629					$cdist() = dbest;
1630					%}
1631					%}
1632					//
1633					/-- cleanup --/
1634					if (datapp) free(datapp);
1635					if (maskpp) free(maskpp);
1636					if (cdatapp) free(cdatapp);
1637					if (cmaskpp) free(cmaskpp);
1638					'),
1639
1640					Doc => '
1641					Assigns each specified data row to the nearest cluster centroid.
1642					Data elements are given by $data() and $mask(), feature weights are
1643					given by $weight(), as usual. Cluster centroids are defined by
1644					by $cdata() and $cmask(), and the indices of rows to be attached
1645					are given in the vector $rowids(). The output vector $clusterids()
1646					contains for each specified row index the identifier of the nearest
1647					cluster centroid. The vector $cdist() contains the distance to
1648					the best clusters.
1649
1650					See also: clusterdistancematrix(), attachtonearestd().
1651					'
1652					);
1653
1654
1655					##----------------------------------------------------------------------
1656					## Attach data to nearest centroid, given datum-to-centroid distance matrix
1657					pp_add_exported('','attachtonearestd');
1658
1659					pp_addpm(<<'EOPM');
1660
1661					=pod
1662
1663					=head2 attachtonearestd
1664
1665					=for sig
1666
1667					Signature: (
1668					double cdistmat(k,n);
1669					int rowids(nr);
1670					int [o]clusterids(nr);
1671					double [o]dists(nr);
1672					)
1673
1674					Assigns each specified data row to the nearest cluster centroid,
1675					as for attachtonearest(), given the datum-to-cluster distance
1676					matrix $cdistmat(). Currently just a wrapper for a few PDL calls.
1677					In scalar context returns $clusterids(), in list context returns
1678					the list ($clusterids(),$dists()).
1679
1680					=cut
1681
1682	0		0	1	sub attachtonearestd {
1683	0	0			my ($cdm,$rowids,$cids,$dists)=@_;
1684	0	0			$cids = zeroes(long, $rowids->dim(0)) if (!defined($cids));
1685					$dists = zeroes(double, $rowids->dim(0)) if (!defined($dists));
1686
1687	0				##-- dice matrix
1688					my $cdmr = $cdm->dice_axis(1,$rowids);
1689
1690	0				##-- get best
1691	0				$cdmr->minimum_ind($cids);
1692					$dists .= $cdmr->index($cids);
1693	0	0
1694					return wantarray ? ($cids,$dists) : $cids;
1695					}
1696
1697					EOPM
1698
1699					##======================================================================
1700					## Cluster assignment: checking
1701					##======================================================================
1702
1703					##----------------------------------------------------------------------
1704					## checkprototypes(): ensure no repeats of $k values in the range [0,$n(
1705					pp_def
1706					('checkprototypes',
1707					Pars => join("\n ", '',
1708					q(protos(k);), ##-- prototypes: $protos($i) \in [0,$n(
1709					q([o]cprotos(k);), ##-- protos without repetitions
1710					q(byte [t]otmp(n);), ##-- $otmp($i)==1 iff $protos($j)==$i for some $j [must be specified]
1711					''
1712					),
1713					OtherPars => q(int nsize => n),
1714					GenericTypes => [qw(B S U L)],
1715					Inplace => ['protos'],
1716					Code =>
1717					('
1718					/-- sanity check --/
1719					if ($SIZE(k) > $SIZE(n)) {
1720					barf("checkprototypes(): number of prototypes \"k\" (=%d) must be <= number of objects \"n\" (=%d)!\n",
1721					$SIZE(k), $SIZE(n));
1722					}
1723					threadloop %{
1724					loop (n) %{ $otmp() = 0; %}
1725					loop (k) %{
1726					int protoi = $protos();
1727					for (; $otmp(n=>protoi); protoi = (protoi+1)%$SIZE(n)) { ; }
1728					$cprotos() = protoi;
1729					$otmp(n=>protoi) = 1;
1730					%}
1731					%}
1732					'),
1733					Doc =>
1734					('(Deterministic)
1735
1736					Ensure that the assignment $protos() from $k objects to
1737					integer "prototype" indices in the range [0,$n( contains no repetitions of any
1738					of the $n possible prototype values. One use for this function is
1739					the restriction of (randomly generated) potential clustering solutions
1740					for $k clusters in which each cluster is represented by a
1741					"prototypical" element from a data sample of size $n.
1742
1743					Requires: $n >= $k.
1744					'),
1745					);
1746
1747
1748					##----------------------------------------------------------------------
1749					## checkpartitions(): ensure no gaps in $k values for $n objects
1750					pp_def
1751					('checkpartitions',
1752					Pars => join("\n ", '',
1753					q(part(n);), ##-- partitioning of $n objects into to $k partitions
1754					q([o]cpart(n);), ##-- partitioning using full codomain
1755					q([t]ptmp(k);), ##-- $ptmp($i)==1 iff $map($j)==$i for some $j
1756					''
1757					),
1758					OtherPars => q(int ksize => k),
1759					GenericTypes => [qw(B S U L)],
1760					Inplace => ['part'],
1761					Code =>
1762					('
1763					/-- sanity check --/
1764					if ($SIZE(k) > $SIZE(n)) {
1765					barf("checkpartitions(): number of partitions \"k\" (=%d) must be <= number of objects \"n\" (=%d)!\n",
1766					$SIZE(k), $SIZE(n));
1767					}
1768					threadloop %{
1769					int ni, ki, kj;
1770					loop (k) %{ $ptmp() = 0; %}
1771					loop (n) %{
1772					ki = $part();
1773					$cpart() = ki;
1774					$ptmp(k=>ki) += 1;
1775					%}
1776					ni = 0;
1777					for (ki=0; ki < $SIZE(k); ki++) {
1778					if (!$ptmp(k=>ki)) {
1779					for (; 1; ni = (ni+1)%$SIZE(n)) {
1780					kj = $cpart(n=>ni);
1781					if ($ptmp(k=>kj) > 1) break;
1782					}
1783					$cpart(n=>ni) = ki;
1784					$ptmp(k=>ki) += 1;
1785					$ptmp(k=>kj) -= 1;
1786					}
1787					}
1788					%}
1789					'),
1790					Doc =>
1791					('(Deterministic)
1792
1793					Ensure that the partitioning $part() of $n objects into $k bins
1794					(identified by integer values in the range [0,$k-1])
1795					contains at least one instance of each of the
1796					$k possible values. One use for this function is
1797					the restriction of (randomly generated) potential clustering solutions
1798					for $n elements into $k clusters to those which assign at least one
1799					element to each cluster.
1800
1801					Requires: $n >= $k.
1802					'),
1803					);
1804
1805					##======================================================================
1806					## Cluster assignment: generation
1807					##======================================================================
1808
1809					##----------------------------------------------------------------------
1810					## randomprototypes(): generate a random prototype solution
1811					pp_add_exported('','randomprototypes');
1812
1813					pp_addpm(<<'EOPM');
1814
1815					=pod
1816
1817					=head2 randomprototypes
1818
1819					=for sig
1820
1821					Signature: (int k; int n; [o]prototypes(k))
1822
1823					Generate a random set of $k prototype indices drawn from $n objects,
1824					ensuring that no object is used more than once. Calls checkprototypes().
1825
1826					See also: checkprototypes(), randomassign(), checkpartitions(), randompartition().
1827
1828					=cut
1829
1830	0		0	1	sub randomprototypes {
1831	0	0			my ($k,$n,$protos) = @_;
1832	0				$protos = zeroes(long, $k) if (!defined($protos));
1833	0				$protos .= PDL->random($k)*$n;
1834	0				checkprototypes($protos->inplace, $n);
1835					return $protos;
1836					}
1837
1838					EOPM
1839
1840
1841					##----------------------------------------------------------------------
1842					## randompartition(): generate a random partition solution
1843					pp_add_exported('','randompartition');
1844
1845					pp_addpm(<<'EOPM');
1846
1847					=pod
1848
1849					=head2 randompartition
1850
1851					=for sig
1852
1853					Signature: (int k; int n; [o]partition(n))
1854
1855					Generate a partitioning of $n objects into $k clusters,
1856					ensuring that every cluster contains at least one object.
1857					Calls checkpartitions().
1858					This method is identical in functionality to randomassign(),
1859					but may be faster if $k is significantly smaller than $n.
1860
1861					See also: randomassign(), checkpartitions(), checkprototypes(), randomprototypes().
1862
1863					=cut
1864
1865	0		0	1	sub randompartition {
1866	0	0			my ($k,$n,$part) = @_;
1867	0				$part = zeroes(long, $n) if (!defined($part));
1868	0				$part .= PDL->random($n)*$k;
1869	0				checkpartitions($part->inplace, $k);
1870					return $part;
1871					}
1872
1873					EOPM
1874
1875
1876
1877
1878					##======================================================================
1879					## Footer Administrivia
1880					##======================================================================
1881
1882					##------------------------------------------------------
1883					## pm additions
1884					pp_addpm(<<'EOPM');
1885
1886
1887					##---------------------------------------------------------------------
1888					=pod
1889
1890					=head1 COMMON ARGUMENTS
1891
1892					Many of the functions described above require one or
1893					more of the following parameters:
1894
1895					=over 4
1896
1897					=item d
1898
1899					The number of features defined for each data element.
1900
1901					=item n
1902
1903					The number of data elements to be clustered.
1904
1905					=item k
1906
1907					=item nclusters
1908
1909					The number of desired clusters.
1910
1911					=item data(d,n)
1912
1913					A matrix representing the data to be clustered, double-valued.
1914
1915					=item mask(d,n)
1916
1917					A matrix indicating which data values are missing. If
1918					mask(i,j) == 0, then data(i,j) is treated as missing.
1919
1920					=item weights(d)
1921
1922					The (feature-) weights that are used to calculate the distance.
1923
1924					B Not all distance metrics make use of weights;
1925					you must provide some nonetheless.
1926
1927					=item clusterids(n)
1928
1929					A clustering solution. $clusterids() maps data elements
1930					(row indices in $data()) to values in the range [0,$k-1].
1931
1932					=back
1933
1934					=cut
1935
1936					##---------------------------------------------------------------------
1937					=pod
1938
1939					=head2 Distance Metrics
1940
1941					Distances between data elements (and cluster centroids, where applicable)
1942					are computed using one of a number of built-in metrics. Which metric
1943					is to be used for a given computation is indicated by a character
1944					flag denoted above with $distFlag(). In the following, w[i] represents
1945					a weighting factor in the $weights() matrix, and $W represents the total
1946					of all weights.
1947
1948					Currently implemented distance
1949					metrics and the corresponding flags are:
1950
1951					=over 4
1952
1953					=item e
1954
1955					Pseudo-Euclidean distance:
1956
1957					dist_e(x,y) = 1/W * sum_{i=1..d} w[i] * (x[i] - y[i])^2
1958
1959					Note that this is not the "true" Euclidean distance, which is defined as:
1960
1961					dist_E(x,y) = sqrt( sum_{i=1..d} (x[i] - y[i])^2 )
1962
1963
1964					=item b
1965
1966					City-block ("Manhattan") distance:
1967
1968					dist_b(x,y) = 1/W * sum_{i=1..d} w[i] * \|x[i] - y[i]\|
1969
1970
1971
1972					=item c
1973
1974					Pearson correlation distance:
1975
1976					dist_c(x,y) = 1-r(x,y)
1977
1978					where r is the Pearson correlation coefficient:
1979
1980					r(x,y) = 1/d * sum_{i=1..d} (x[i]-mean(x))/stddev(x) * (y[i]-mean(y))/stddev(y)
1981
1982					=item a
1983
1984					Absolute value of the correlation,
1985
1986					dist_a(x,y) = 1-\|r(x,y)\|
1987
1988					where r(x,y) is the Pearson correlation coefficient.
1989
1990					=item u
1991
1992					Uncentered correlation (cosine of the angle):
1993
1994					dist_u(x,y) = 1-r_u(x,y)
1995
1996					where:
1997
1998					r_u(x,y) = 1/d * sum_{i=1..d} (x[i]/sigma0(x)) * (y[i]/sigma0(y))
1999
2000					and:
2001
2002					sigma0(w) = sqrt( 1/d * sum_{i=1..d} w[i]^2 )
2003
2004					=item x
2005
2006					Absolute uncentered correlation,
2007
2008					dist_x(x,y) = 1-\|r_u(x,y)\|
2009
2010					=item s
2011
2012					Spearman's rank correlation.
2013
2014					dist_s(x,y) = 1-r_s(x,y) ~= dist_c(ranks(x),ranks(y))
2015
2016					where r_s(x,y) is the Spearman rank correlation. Weights are ignored.
2017
2018					=item k
2019
2020					Kendall's tau (does not use weights).
2021
2022					dist_k(x,y) = 1 - tau(x,y)
2023
2024					=item (other values)
2025
2026					For other values of dist, the default (Euclidean distance) is used.
2027
2028					=back
2029
2030					=cut
2031
2032
2033					##---------------------------------------------------------------------
2034					=pod
2035
2036					=head2 Link Methods
2037
2038					For hierarchical clustering, the 'link method' must be specified
2039					by a character flag, denoted above as $methodFlag.
2040					Known link methods are:
2041
2042					=over 4
2043
2044					=item s
2045
2046					Pairwise minimum-linkage ("single") clustering.
2047
2048					Defines the distance between two clusters as the
2049					least distance between any two of their respective elements.
2050
2051					=item m
2052
2053					Pairwise maximum-linkage ("complete") clustering.
2054
2055					Defines the distance between two clusters as the
2056					greatest distance between any two of their respective elements.
2057
2058					=item a
2059
2060					Pairwise average-linkage clustering (centroid distance using arithmetic mean).
2061
2062					Defines the distance between two clusters as the
2063					distance between their respective centroids, where each
2064					cluster centroid is defined as the arithmetic mean of
2065					that cluster's elements.
2066
2067					=item c
2068
2069					Pairwise centroid-linkage clustering (centroid distance using median).
2070
2071					Identifies the distance between two clusters as the
2072					distance between their respective centroids, where each
2073					cluster centroid is computed as the median of
2074					that cluster's elements.
2075
2076					=item (other values)
2077
2078					Behavior for other values is currently undefined.
2079
2080					=back
2081
2082					For the first three, either the distance matrix or the gene expression data is
2083					sufficient to perform the clustering algorithm. For pairwise centroid-linkage
2084					clustering, however, the gene expression data are always needed, even if the
2085					distance matrix itself is available.
2086
2087					=cut
2088
2089					##---------------------------------------------------------------------
2090					=pod
2091
2092					=head1 ACKNOWLEDGEMENTS
2093
2094					Perl by Larry Wall.
2095
2096					PDL by Karl Glazebrook, Tuomas J. Lukka, Christian Soeller, and others.
2097
2098					C Clustering Library by
2099					Michiel de Hoon,
2100					Seiya Imoto,
2101					and Satoru Miyano.
2102
2103					Orignal Algorithm::Cluster module by John Nolan and Michiel de Hoon.
2104
2105					=cut
2106
2107					##----------------------------------------------------------------------
2108					=pod
2109
2110					=head1 KNOWN BUGS
2111
2112					Dimensional requirements are sometimes too strict.
2113
2114					Passing weights to Spearman and Kendall link methods wastes space.
2115
2116					=cut
2117
2118
2119					##---------------------------------------------------------------------
2120					=pod
2121
2122					=head1 AUTHOR
2123
2124					Bryan Jurish Emoocow@cpan.orgE wrote and maintains the PDL::Cluster distribution.
2125
2126					Michiel de Hoon wrote the underlying C clustering library for cDNA microarray data.
2127
2128					=head1 COPYRIGHT
2129
2130					PDL::Cluster is a set of wrappers around the C Clustering library for cDNA microarray data.
2131
2132					=over 4
2133
2134					=item *
2135
2136					The C clustering library for cDNA microarray data.
2137					Copyright (C) 2002-2005 Michiel Jan Laurens de Hoon.
2138
2139					This library was written at the Laboratory of DNA Information Analysis,
2140					Human Genome Center, Institute of Medical Science, University of Tokyo,
2141					4-6-1 Shirokanedai, Minato-ku, Tokyo 108-8639, Japan.
2142					Contact: michiel.dehoon 'AT' riken.jp
2143
2144					See the files F, F and F in the PDL::Cluster distribution
2145					for details.
2146
2147					=item *
2148
2149					PDL::Cluster wrappers copyright (C) Bryan Jurish 2005-2018. All rights reserved.
2150					This package is free software, and entirely without warranty.
2151					You may redistribute it and/or modify it under the same terms
2152					as Perl itself.
2153
2154					=back
2155
2156					=head1 SEE ALSO
2157
2158					perl(1), PDL(3perl), Algorithm::Cluster(3perl), cluster(1),
2159					L
2160
2161					=cut
2162
2163					EOPM
2164
2165					# Always make sure that you finish your PP declarations with
2166					# pp_done
2167					pp_done();
2168					##----------------------------------------------------------------------