File Coverage

blib/lib/Algorithm/KMeans.pm

Criterion	Covered	Total	%
statement	22	24	91.6
branch			n/a
condition			n/a
subroutine	8	8	100.0
pod			n/a
total	30	32	93.7

line	stmt	sub	time	code
1				package Algorithm::KMeans;
2
3				#------------------------------------------------------------------------------------
4				# Copyright (c) 2014 Avinash Kak. All rights reserved. This program is free
5				# software. You may modify and/or distribute it under the same terms as Perl itself.
6				# This copyright notice must remain attached to the file.
7				#
8				# Algorithm::KMeans is a Perl module for clustering multidimensional data.
9				# -----------------------------------------------------------------------------------
10
11	1	1	15887	use 5.10.0;
	1		3
	1		40
12	1	1	3	use strict;
	1		1
	1		55
13	1	1	4	use warnings;
	1		5
	1		37
14	1	1	4	use Carp;
	1		1
	1		74
15	1	1	5	use File::Basename;
	1		1
	1		66
16	1	1	555	use Math::Random;
	1		4937
	1		93
17	1	1	536	use Graphics::GnuplotIF;
	1		8773
	1		53
18	1	1	966	use Math::GSL::Matrix;
	0
	0
19
20
21				our $VERSION = '2.03';
22
23				# from Perl docs:
24				my $_num_regex = '^[+-]?\ (\d+(\.\d)?\|\.\d+)([eE][+-]?\d+)?$';
25
26				# Constructor:
27				sub new {
28				my ($class, %args) = @_;
29				my @params = keys %args;
30				croak "\nYou have used a wrong name for a keyword argument " .
31				"--- perhaps a misspelling\n"
32				if check_for_illegal_params(@params) == 0;
33				bless {
34				_datafile => $args{datafile} \|\| croak("datafile required"),
35				_mask => $args{mask} \|\| croak("mask required"),
36				_K => $args{K} \|\| 0,
37				_K_min => $args{Kmin} \|\| 'unknown',
38				_K_max => $args{Kmax} \|\| 'unknown',
39				_cluster_seeding => $args{cluster_seeding} \|\| croak("must choose smart or random ".
40				"for cluster seeding"),
41				_var_normalize => $args{do_variance_normalization} \|\| 0,
42				_use_mahalanobis_metric => $args{use_mahalanobis_metric} \|\| 0,
43				_clusters_2_files => $args{write_clusters_to_files} \|\| 0,
44				_terminal_output => $args{terminal_output} \|\| 0,
45				_debug => $args{debug} \|\| 0,
46				_N => 0,
47				_K_best => 'unknown',
48				_original_data => {},
49				_data => {},
50				_data_id_tags => [],
51				_QoC_values => {},
52				_clusters => [],
53				_cluster_centers => [],
54				_clusters_hash => {},
55				_cluster_centers_hash => {},
56				_cluster_covariances_hash => {},
57				_data_dimensions => 0,
58
59				}, $class;
60				}
61
62				sub read_data_from_file {
63				my $self = shift;
64				my $datafile = $self->{_datafile};
65				my $mask = $self->{_mask};
66				my @mask = split //, $mask;
67				$self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
68				print "data dimensionality: $self->{_data_dimensions} \n"if $self->{_terminal_output};
69				open INPUT, $datafile or die "unable to open file $datafile: $!\n";
70				chomp( my @raw_data = );
71				close INPUT;
72				# Transform strings into number data
73				foreach my $record (@raw_data) {
74				next unless $record;
75				next if $record =~ /^#/;
76				my @data_fields;
77				my @fields = split /\s+/, $record;
78				die "\nABORTED: Mask size does not correspond to row record size\n" if $#fields != $#mask;
79				my $record_id;
80				foreach my $i (0..@fields-1) {
81				if ($mask[$i] eq '0') {
82				next;
83				} elsif ($mask[$i] eq 'N') {
84				$record_id = $fields[$i];
85				} elsif ($mask[$i] eq '1') {
86				push @data_fields, $fields[$i];
87				} else {
88				die "misformed mask for reading the data file\n";
89				}
90				}
91				my @nums = map {/$_num_regex/;$_} @data_fields;
92				$self->{_original_data}->{ $record_id } = \@nums;
93				}
94				if ($self->{_var_normalize}) {
95				$self->{_data} = variance_normalization( $self->{_original_data} );
96				} else {
97				$self->{_data} = deep_copy_hash( $self->{_original_data} );
98				}
99				my @all_data_ids = keys %{$self->{_data}};
100				$self->{_data_id_tags} = \@all_data_ids;
101				$self->{_N} = scalar @all_data_ids;
102				if ( defined($self->{_K}) && ($self->{_K} > 0) ) {
103				carp "\n\nWARNING: YOUR K VALUE IS TOO LARGE.\n The number of data " .
104				"points must satisfy the relation N > 2xK**2 where K is " .
105				"the number of clusters requested for the clusters to be " .
106				"meaningful $!"
107				if ( $self->{_N} < (2 * $self->{_K} ** 2) );
108				print "\n\n\n";
109				}
110				srand(123456789);
111				}
112
113				sub kmeans {
114				my $self = shift;
115				my $K = $self->{_K};
116				if ( ($K == 0) \|\|
117				( ($self->{_K_min} ne 'unknown') &&
118				($self->{_K_max} ne 'unknown') ) ) {
119				eval {
120				$self->iterate_through_K();
121				};
122				die "in kmeans(): $@" if ($@);
123				} elsif ( $K =~ /\d+/) {
124				eval {
125				$self->cluster_for_fixed_K_multiple_random_tries($K)
126				if $self->{_cluster_seeding} eq 'random';
127				$self->cluster_for_fixed_K_single_smart_try($K)
128				if $self->{_cluster_seeding} eq 'smart';
129				};
130				die "in kmeans(): $@" if ($@);
131				} else {
132				croak "Incorrect call syntax used. See documentation.\n";
133				}
134				if ((defined $self->{_clusters}) && (defined $self->{_cluster_centers})){
135				foreach my $i (0..@{$self->{_clusters}}-1) {
136				$self->{_clusters_hash}->{"cluster$i"} = $self->{_clusters}->[$i];
137				$self->{_cluster_centers_hash}->{"cluster$i"} = $self->{_cluster_centers}->[$i];
138				$self->{_cluster_covariances_hash}->{"cluster$i"} =
139				$self->estimate_cluster_covariance($self->{_clusters}->[$i]);
140				}
141				$self->write_clusters_to_files( $self->{_clusters} ) if $self->{_clusters_2_files};
142				return ($self->{_clusters_hash}, $self->{_cluster_centers_hash});
143				} else {
144				croak "Clustering failed. Try again.";
145				}
146				}
147
148				# This is the subroutine to call if you prefer purely random choices for the initial
149				# seeding of the cluster centers.
150				sub cluster_for_fixed_K_multiple_random_tries {
151				my $self = shift;
152				my $K = shift;
153				my @all_data_ids = @{$self->{_data_id_tags}};
154				my $QoC;
155				my @QoC_values;
156				my @array_of_clusters;
157				my @array_of_cluster_centers;
158				my $clusters;
159				my $new_clusters;
160				my $cluster_centers;
161				my $new_cluster_centers;
162				print "Clustering for K = $K\n" if $self->{_terminal_output};
163				foreach my $trial (1..20) {
164				print ". ";
165				my ($new_clusters, $new_cluster_centers);
166				eval {
167				($new_clusters, $new_cluster_centers) = $self->cluster_for_given_K($K);
168				};
169				next if $@;
170				next if @$new_clusters <= 1;
171				my $newQoC = $self->cluster_quality( $new_clusters, $new_cluster_centers );
172				if ( (!defined $QoC) \|\| ($newQoC < $QoC) ) {
173				$QoC = $newQoC;
174				$clusters = deep_copy_AoA( $new_clusters );
175				$cluster_centers = deep_copy_AoA( $new_cluster_centers );
176				}
177				}
178				$self->{_clusters} = $clusters;
179				$self->{_cluster_centers} = $cluster_centers;
180				$self->{_QoC_values}->{"$K"} = $QoC;
181				if ($self->{_terminal_output}) {
182				print "\nDisplaying final clusters for best K (= $K) :\n";
183				display_clusters( $clusters );
184				$self->display_cluster_centers( $clusters );
185				print "\nQoC value (the smaller, the better): $QoC\n";
186				}
187				}
188
189				# For the smart try, we do initial cluster seeding by subjecting the data to
190				# principal components analysis in order to discover the direction of maximum
191				# variance in the data space. Subsequently, we try to find the K largest peaks along
192				# this direction. The coordinates of these peaks serve as the seeds for the K
193				# clusters.
194				sub cluster_for_fixed_K_single_smart_try {
195				my $self = shift;
196				my $K = shift;
197				my @all_data_ids = @{$self->{_data_id_tags}};
198				print "Clustering for K = $K\n" if $self->{_terminal_output};
199				my ($clusters, $cluster_centers);
200				eval {
201				($clusters, $cluster_centers) = $self->cluster_for_given_K($K);
202				};
203				if ($@) {
204				die "In cluster_for_fixed_K_single_smart_try: insufficient data for clustering " .
205				"with $self->{_K} clusters --- $@";
206				}
207				my $QoC = $self->cluster_quality( $clusters, $cluster_centers );
208				$self->{_clusters} = $clusters;
209				$self->{_cluster_centers} = $cluster_centers;
210				$self->{_QoC_values}->{"$K"} = $QoC;
211				if ($self->{_terminal_output}) {
212				print "\nDisplaying final clusters for best K (= $K) :\n";
213				display_clusters( $clusters );
214				$self->display_cluster_centers( $clusters );
215				print "\nQoC value (the smaller, the better): $QoC\n";
216				}
217				}
218
219				# The following subroutine is the top-level routine to call if you want the system to
220				# figure out on its own what value to use for K, the number of clusters. If you call
221				# the KMeans constructor with the K=0 option, the method below will try every
222				# possible value of K between 2 and the maximum possible depending on the number of
223				# data points available. For example, if the number of data points is 10,000, it will
224				# try all possible values of K between 2 and 70. For how the maximum value is set for
225				# K, see the comments made under Description. Note also how this method makes 20
226				# different tries for each value of K as a defense against the problem of the final
227				# result corresponding to some local minimum in the values of the QoC metric. Out of
228				# these 20 tries for each K, it retains the clusters and the cluster centers for only
229				# that try that yields the smallest value for the QoC metric. After estimating the
230				# "best" QoC values for all possible K in this manner, it then finds the K for which
231				# the QoC is the minimum. This is taken to be the best value for K. Finally, the
232				# output clusters are written out to separate files.
233				#
234				# If the KMeans constructor is invoked with the (Kmin, Kmax) options, then, instead
235				# of iterating through 2 and the maximum permissible value for K, the iterations are
236				# carried out only between Kmin and Kmax.
237				sub iterate_through_K {
238				my $self = shift;
239				my @all_data_ids = @{$self->{_data_id_tags}};
240				my $N = $self->{_N};
241				croak "You need more than 8 data samples. The number of data points must satisfy " .
242				"the relation N > 2xK**2 where K is the number of clusters. The smallest " .
243				"value for K is 2.\n" if $N <= 8;
244				my $K_statistical_max = int( sqrt( $N / 2.0 ) );
245				my $Kmin = $self->{_K_min} eq 'unknown'
246				? 2
247				: $self->{_K_min};
248				my $Kmax = $self->{_K_max} eq 'unknown'
249				? int( sqrt( $N / 2.0 ) )
250				: $self->{_K_max};
251				croak "\n\nYour Kmax value is too high. Ideally, it should not exceed sqrt(N/2) " .
252				"where N is the number of data points" if $Kmax > $K_statistical_max;
253				print "Value of Kmax is: $Kmax\n" if $self->{_terminal_output};
254				my @QoC_values;
255				my @array_of_clusters;
256				my @array_of_cluster_centers;
257				foreach my $K ($Kmin..$Kmax) {
258				my $QoC;
259				my $clusters;
260				my $cluster_centers;
261				print "Clustering for K = $K\n" if $self->{_terminal_output};
262				if ($self->{_cluster_seeding} eq 'random') {
263				foreach my $trial (1..21) {
264				print ". ";
265				my ($new_clusters, $new_cluster_centers);
266				if ($self->{_use_mahalanobis_metric}) {
267				eval {
268				($new_clusters, $new_cluster_centers) = $self->cluster_for_given_K($K);
269				};
270				next if $@;
271				} else {
272				($new_clusters, $new_cluster_centers) = $self->cluster_for_given_K($K);
273				}
274				my $newQoC = $self->cluster_quality( $new_clusters, $new_cluster_centers );
275				if ( (!defined $QoC) \|\| ($newQoC < $QoC) ) {
276				$QoC = $newQoC;
277				$clusters = deep_copy_AoA( $new_clusters );
278				$cluster_centers = deep_copy_AoA( $new_cluster_centers );
279				}
280				}
281				print "\n";
282				} elsif ($self->{_cluster_seeding} eq 'smart') {
283				eval {
284				($clusters, $cluster_centers) = $self->cluster_for_given_K($K);
285				};
286				if ($@) {
287				$Kmax = $K - 1;
288				last;
289				}
290				$QoC = $self->cluster_quality($clusters,$cluster_centers);
291				} else {
292				die "You must either choose 'smart' for cluster_seeding or 'random'. " .
293				"Fix your constructor call."
294				}
295				push @QoC_values, $QoC;
296				push @array_of_clusters, $clusters;
297				push @array_of_cluster_centers, $cluster_centers;
298				}
299				my ($min, $max) = minmax( \@QoC_values );
300				my $K_best_relative_to_Kmin = get_index_at_value($min, \@QoC_values );
301				my $K_best = $K_best_relative_to_Kmin + $Kmin;
302				if ($self->{_terminal_output}) {
303				print "\nDisplaying final clusters for best K (= $K_best) :\n";
304				display_clusters( $array_of_clusters[$K_best_relative_to_Kmin] );
305				$self->display_cluster_centers($array_of_clusters[$K_best_relative_to_Kmin]);
306				print "\nBest clustering achieved for K=$K_best with QoC = $min\n" if defined $min;
307				my @printableQoC = grep {$_} @QoC_values;
308				print "\nQoC values array (the smaller the value, the better it is) for different " .
309				"K starting with K=$Kmin: @printableQoC\n";
310				}
311				$self->{_K_best} = $K_best;
312				foreach my $i (0..@QoC_values-1) {
313				my $k = $i + $Kmin;
314				$self->{_QoC_values}->{"$k"} = $QoC_values[$i];
315				}
316				$self->{_clusters} = $array_of_clusters[$K_best_relative_to_Kmin];
317				$self->{_cluster_centers} =
318				$array_of_cluster_centers[$K_best_relative_to_Kmin];
319				}
320
321				# This is the function to call if you already know what value you want to use for K,
322				# the number of expected clusters. The purpose of this function is to do the
323				# initialization of the cluster centers and to carry out the initial assignment of
324				# the data to the clusters with the initial cluster centers. The initialization
325				# consists of 3 steps: Construct a random sequence of K integers between 0 and N-1
326				# where N is the number of data points to be clustered; 2) Call
327				# get_initial_cluster_centers() to index into the data array with the random integers
328				# to get a list of K data points that would serve as the initial cluster centers; and
329				# (3) Call assign_data_to_clusters_initial() to assign the rest of the data to each
330				# of the K clusters on the basis of the proximity to the cluster centers.
331				sub cluster_for_given_K {
332				my $self = shift;
333				my $K = shift;
334				my @all_data_ids = @{$self->{_data_id_tags}};
335				my $cluster_centers;
336				if ($self->{_cluster_seeding} eq 'smart') {
337				$cluster_centers = $self->get_initial_cluster_centers_smart($K);
338				} elsif ($self->{_cluster_seeding} eq 'random') {
339				$cluster_centers = $self->get_initial_cluster_centers($K);
340				} else {
341				die "You must either choose 'smart' for cluster_seeding or 'random'. " .
342				"Fix your constructor call."
343				}
344				my $clusters;
345				if ($self->{_use_mahalanobis_metric}) {
346				my $clusters_and_determinants =
347				$self->assign_data_to_clusters_initial_mahalanobis($cluster_centers);
348				$clusters = $clusters_and_determinants->[0];
349				my @determinants = @{$clusters_and_determinants->[1]};
350				my ($min,$max) = minmax(\@determinants);
351				die "In cluster_for_given_K(): min determinant of covariance matrix for at " .
352				"least one cluster is too small" if $min / $max < 0.001;
353				} else {
354				$clusters = $self->assign_data_to_clusters_initial($cluster_centers);
355				}
356				my $cluster_nonexistent_flag = 0;
357				foreach my $trial (0..2) {
358				if ($self->{_use_mahalanobis_metric}) {
359				($clusters, $cluster_centers) = $self->assign_data_to_clusters_mahalanobis($clusters, $K);
360				} else {
361				($clusters, $cluster_centers) = $self->assign_data_to_clusters( $clusters, $K );
362				}
363				my $num_of_clusters_returned = @$clusters;
364				foreach my $cluster (@$clusters) {
365				$cluster_nonexistent_flag = 1 if ((!defined $cluster) \|\| (@$cluster == 0));
366				}
367				last unless $cluster_nonexistent_flag;
368				}
369				return ($clusters, $cluster_centers);
370				}
371
372				# This function is used when you set the "cluster_seeding" option to 'random' in the
373				# constructor. Returns a set of K random integers. These serve as indices to reach
374				# into the data array. A data element whose index is one of the random numbers
375				# returned by this routine serves as an initial cluster center. Note the quality
376				# check it runs on the list of K random integers constructed. We first make sure
377				# that all K random integers are different. Subsequently, we carry out a quality
378				# assessment of the K random integers constructed. This quality measure consists of
379				# the ratio of the values spanned by the random integers to the value of N, the total
380				# number of data points to be clustered. Currently, if this ratio is less than 0.3,
381				# we discard the K integers and try again.
382				sub initialize_cluster_centers {
383				my $self = shift;
384				my $K = shift;
385				my $data_store_size = $self->{_N};
386				my @cluster_center_indices;
387				while (1) {
388				foreach my $i (0..$K-1) {
389				$cluster_center_indices[$i] = int rand( $data_store_size );
390				next if $i == 0;
391				foreach my $j (0..$i-1) {
392				while ( $cluster_center_indices[$j] ==
393				$cluster_center_indices[$i] ) {
394				my $old = $cluster_center_indices[$i];
395				$cluster_center_indices[$i] = int rand($data_store_size);
396				}
397				}
398				}
399				my ($min,$max) = minmax(\@cluster_center_indices );
400				my $quality = ($max - $min) / $data_store_size;
401				last if $quality > 0.3;
402				}
403				return @cluster_center_indices;
404				}
405
406				# This function is used when you set the "cluster_seeding" option to 'random' in the
407				# constructor. This routine merely reaches into the data array with the random
408				# integers, as constructed by the previous routine, serving as indices and fetching
409				# values corresponding to those indices. The fetched data samples serve as the
410				# initial cluster centers.
411				sub get_initial_cluster_centers {
412				my $self = shift;
413				my $K = shift;
414				my @cluster_center_indices = $self->initialize_cluster_centers($K);
415				my @result;
416				foreach my $i (@cluster_center_indices) {
417				my $tag = $self->{_data_id_tags}[$i];
418				push @result, $self->{_data}->{$tag};
419				}
420				return \@result;
421				}
422
423				# This method is invoked when you choose the 'smart' option for the "cluster_seeding"
424				# option in the constructor. It subjects the data to a principal components analysis
425				# to figure out the direction of maximal variance. Subsequently, it tries to locate
426				# K peaks in a smoothed histogram of the data points projected onto the maximal
427				# variance direction.
428				sub get_initial_cluster_centers_smart {
429				my $self = shift;
430				my $K = shift;
431				if ($self->{_data_dimensions} == 1) {
432				my @one_d_data;
433				foreach my $j (0..$self->{_N}-1) {
434				my $tag = $self->{_data_id_tags}[$j];
435				push @one_d_data, $self->{_data}->{$tag}->[0];
436				}
437				my @peak_points = find_peak_points_in_given_direction(\@one_d_data,$K);
438				print "highest points at data values: @peak_points\n" if $self->{_debug};
439				my @cluster_centers;
440				foreach my $peakpoint (@peak_points) {
441				push @cluster_centers, [$peakpoint];
442				}
443				return \@cluster_centers;
444				}
445				my ($num_rows,$num_cols) = ($self->{_data_dimensions},$self->{_N});
446				my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
447				my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
448				# All the record labels are stored in the array $self->{_data_id_tags}. The
449				# actual data for clustering is stored in a hash at $self->{_data} whose keys are
450				# the record labels; the value associated with each key is the array holding the
451				# corresponding numerical multidimensional data.
452				foreach my $j (0..$num_cols-1) {
453				my $tag = $self->{_data_id_tags}[$j];
454				my $data = $self->{_data}->{$tag};
455				$matrix->set_col($j, $data);
456				}
457				if ($self->{_debug}) {
458				print "\nDisplaying the original data as a matrix:";
459				display_matrix( $matrix );
460				}
461				foreach my $j (0..$num_cols-1) {
462				$mean_vec += $matrix->col($j);
463				}
464				$mean_vec *= 1.0 / $num_cols;
465				if ($self->{_debug}) {
466				print "Displaying the mean vector for the data:";
467				display_matrix( $mean_vec );
468				}
469				foreach my $j (0..$num_cols-1) {
470				my @new_col = ($matrix->col($j) - $mean_vec)->as_list;
471				$matrix->set_col($j, \@new_col);
472				}
473				if ($self->{_debug}) {
474				print "Displaying mean subtracted data as a matrix:";
475				display_matrix( $matrix );
476				}
477				my $transposed = transpose( $matrix );
478				if ($self->{_debug}) {
479				print "Displaying transposed data matrix:";
480				display_matrix( $transposed );
481				}
482				my $covariance = matrix_multiply( $matrix, $transposed );
483				$covariance *= 1.0 / $num_cols;
484				if ($self->{_debug}) {
485				print "\nDisplaying the Covariance Matrix for your data:";
486				display_matrix( $covariance );
487				}
488				my ($eigenvalues, $eigenvectors) = $covariance->eigenpair;
489				my $num_of_eigens = @$eigenvalues;
490				my $largest_eigen_index = 0;
491				my $smallest_eigen_index = 0;
492				print "Eigenvalue 0: $eigenvalues->[0]\n" if $self->{_debug};
493				foreach my $i (1..$num_of_eigens-1) {
494				$largest_eigen_index = $i if $eigenvalues->[$i] > $eigenvalues->[$largest_eigen_index];
495				$smallest_eigen_index = $i if $eigenvalues->[$i] < $eigenvalues->[$smallest_eigen_index];
496				print "Eigenvalue $i: $eigenvalues->[$i]\n" if $self->{_debug};
497				}
498				print "\nlargest eigen index: $largest_eigen_index\n" if $self->{_debug};
499				print "\nsmallest eigen index: $smallest_eigen_index\n\n" if $self->{_debug};
500				foreach my $i (0..$num_of_eigens-1) {
501				my @vec = $eigenvectors->[$i]->as_list;
502				print "Eigenvector $i: @vec\n" if $self->{_debug};
503				}
504				my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list;
505				print "\nLargest eigenvector: @largest_eigen_vec\n" if $self->{_debug};
506				my @max_var_direction;
507				# Each element of the array @largest_eigen_vec is a Math::Complex object
508				foreach my $k (0..@largest_eigen_vec-1) {
509				my ($mag, $theta) = $largest_eigen_vec[$k] =~ /\[(\d*\.\d+),(\S+)\]/;
510				if ($theta eq '0') {
511				$max_var_direction[$k] = $mag;
512				} elsif ($theta eq 'pi') {
513				$max_var_direction[$k] = -1.0 * $mag;
514				} else {
515				die "eigendecomposition of covariance matrix produced a complex " .
516				"eigenvector --- something is wrong";
517				}
518				}
519				print "\nMaximum Variance Direction: @max_var_direction\n\n" if $self->{_debug};
520				# We now project all data points on the largest eigenvector.
521				# Each projection will yield a single point on the eigenvector.
522				my @projections;
523				foreach my $j (0..$self->{_N}-1) {
524				my $tag = $self->{_data_id_tags}[$j];
525				my $data = $self->{_data}->{$tag};
526				die "Dimensionality of the largest eigenvector does not "
527				. "match the dimensionality of the data"
528				unless @max_var_direction == $self->{_data_dimensions};
529				my $projection = vector_multiply($data, \@max_var_direction);
530				push @projections, $projection;
531				}
532				print "All projection points: @projections\n" if $self->{_debug};
533				my @peak_points = find_peak_points_in_given_direction(\@projections, $K);
534				print "highest points at points along largest eigenvec: @peak_points\n" if $self->{_debug};
535				my @cluster_centers;
536				foreach my $peakpoint (@peak_points) {
537				my @actual_peak_coords = map {$peakpoint * $_} @max_var_direction;
538				push @cluster_centers, \@actual_peak_coords;
539				}
540				return \@cluster_centers;
541				}
542
543				# This method is invoked when you choose the 'smart' option for the "cluster_seeding"
544				# option in the constructor. It is called by the previous method to locate K peaks
545				# in a smoothed histogram of the data points projected onto the maximal variance
546				# direction.
547				sub find_peak_points_in_given_direction {
548				my $dataref = shift;
549				my $how_many = shift;
550				my @data = @$dataref;
551				my ($min, $max) = minmax(\@data);
552				my $num_points = @data;
553				my @sorted_data = sort {$a <=> $b} @data;
554				my $scale = $max - $min;
555				foreach my $index (0..$#sorted_data-1) {
556				$sorted_data[$index] = ($sorted_data[$index] - $min) / $scale;
557				}
558				my $avg_diff = 0;
559				foreach my $index (0..$#sorted_data-1) {
560				my $diff = $sorted_data[$index+1] - $sorted_data[$index];
561				$avg_diff += ($diff - $avg_diff) / ($index + 1);
562				}
563				my $delta = 1.0 / 1000.0;
564				my @accumulator = (0) x 1000;
565				foreach my $index (0..@sorted_data-1) {
566				my $cell_index = int($sorted_data[$index] / $delta);
567				my $smoothness = 40;
568				for my $index ($cell_index-$smoothness..$cell_index+$smoothness) {
569				next if $index < 0 \|\| $index > 999;
570				$accumulator[$index]++;
571				}
572				}
573				my $peaks_array = non_maximum_suppression( \@accumulator );
574				my $peaks_index_hash = get_value_index_hash( $peaks_array );
575				my @K_highest_peak_locations;
576				my $k = 0;
577				foreach my $peak (sort {$b <=> $a} keys %$peaks_index_hash) {
578				my $unscaled_peak_point = $min + $peaks_index_hash->{$peak} * $scale * $delta;
579				push @K_highest_peak_locations, $unscaled_peak_point
580				if $k < $how_many;
581				last if ++$k == $how_many;
582				}
583				return @K_highest_peak_locations;
584				}
585
586				# The purpose of this routine is to form initial clusters by assigning the data
587				# samples to the initial clusters formed by the previous routine on the basis of the
588				# best proximity of the data samples to the different cluster centers.
589				sub assign_data_to_clusters_initial {
590				my $self = shift;
591				my @cluster_centers = @{ shift @_ };
592				my @clusters;
593				foreach my $ele (@{$self->{_data_id_tags}}) {
594				my $best_cluster;
595				my @dist_from_clust_centers;
596				foreach my $center (@cluster_centers) {
597				push @dist_from_clust_centers, $self->distance($ele, $center);
598				}
599				my ($min, $best_center_index) = minimum( \@dist_from_clust_centers );
600				push @{$clusters[$best_center_index]}, $ele;
601				}
602				return \@clusters;
603				}
604
605				sub assign_data_to_clusters_initial_mahalanobis {
606				my $self = shift;
607				my @cluster_centers = @{ shift @_ };
608				my @clusters;
609				foreach my $ele (@{$self->{_data_id_tags}}) {
610				my $best_cluster;
611				my @dist_from_clust_centers;
612				foreach my $center (@cluster_centers) {
613				push @dist_from_clust_centers, $self->distance($ele, $center);
614				}
615				my ($min, $best_center_index) = minimum( \@dist_from_clust_centers );
616				push @{$clusters[$best_center_index]}, $ele if defined $best_center_index;
617				}
618				# Since a cluster center may not correspond to any particular sample, it is possible
619				# for one of the elements of the array @clusters to be null using the above
620				# strategy for populating the initial clusters. Let's say there are five cluster
621				# centers in the array @cluster_centers. The $best_center_index may populate the
622				# the elements of the array @clusters for the indices 0, 1, 2, 4, which would leave
623				# $clusters[3] as undefined. So, in what follows, we must first check if all of
624				# the elements of @clusters are defined.
625				my @determinants;
626				foreach my $cluster(@clusters) {
627				die "The clustering program started with bad initialization. Please start over"
628				unless defined $cluster;
629				my $covariance = $self->estimate_cluster_covariance($cluster);
630				my $determinant = $covariance->det();
631				push @determinants, $determinant;
632				}
633				return [\@clusters, \@determinants];
634				}
635
636				# This is the main routine that along with the update_cluster_centers() routine
637				# constitute the two key steps of the K-Means algorithm. In most cases, the infinite
638				# while() loop will terminate automatically when the cluster assignments of the data
639				# points remain unchanged. For the sake of safety, we keep track of the number of
640				# iterations. If this number reaches 100, we exit the while() loop anyway. In most
641				# cases, this limit will not be reached.
642				sub assign_data_to_clusters {
643				my $self = shift;
644				my $clusters = shift;
645				my $K = shift;
646				my $final_cluster_centers;
647				my $iteration_index = 0;
648				while (1) {
649				my $new_clusters;
650				my $assignment_changed_flag = 0;
651				my $current_cluster_center_index = 0;
652				my $cluster_size_zero_condition = 0;
653				my $how_many = @$clusters;
654				my $cluster_centers = $self->update_cluster_centers( deep_copy_AoA_with_nulls( $clusters ) );
655				$iteration_index++;
656				foreach my $cluster (@$clusters) {
657				my $current_cluster_center = $cluster_centers->[$current_cluster_center_index];
658				foreach my $ele (@$cluster) {
659				my @dist_from_clust_centers;
660				foreach my $center (@$cluster_centers) {
661				push @dist_from_clust_centers,
662				$self->distance($ele, $center);
663				}
664				my ($min, $best_center_index) = minimum( \@dist_from_clust_centers );
665				my $best_cluster_center = $cluster_centers->[$best_center_index];
666				if (vector_equal($current_cluster_center, $best_cluster_center)){
667				push @{$new_clusters->[$current_cluster_center_index]}, $ele;
668				} else {
669				$assignment_changed_flag = 1;
670				push @{$new_clusters->[$best_center_index]}, $ele;
671				}
672				}
673				$current_cluster_center_index++;
674				}
675				next if ((@$new_clusters != @$clusters) && ($iteration_index < 100));
676				# Now make sure that none of the K clusters is an empty cluster:
677				foreach my $newcluster (@$new_clusters) {
678				$cluster_size_zero_condition = 1 if ((!defined $newcluster) or (@$newcluster == 0));
679				}
680				push @$new_clusters, (undef) x ($K - @$new_clusters) if @$new_clusters < $K;
681				# During clustering for a fixed K, should a cluster inadvertently
682				# become empty, steal a member from the largest cluster to hopefully
683				# spawn a new cluster:
684				my $largest_cluster;
685				foreach my $local_cluster (@$new_clusters) {
686				next if !defined $local_cluster;
687				$largest_cluster = $local_cluster if !defined $largest_cluster;
688				if (@$local_cluster > @$largest_cluster) {
689				$largest_cluster = $local_cluster;
690				}
691				}
692				foreach my $local_cluster (@$new_clusters) {
693				if ( (!defined $local_cluster) \|\| (@$local_cluster == 0) ) {
694				push @$local_cluster, pop @$largest_cluster;
695				}
696				}
697				next if (($cluster_size_zero_condition) && ($iteration_index < 100));
698				last if $iteration_index == 100;
699				$clusters = deep_copy_AoA( $new_clusters );
700				last if $assignment_changed_flag == 0;
701				}
702				$final_cluster_centers = $self->update_cluster_centers( $clusters );
703				return ($clusters, $final_cluster_centers);
704				}
705
706				sub assign_data_to_clusters_mahalanobis {
707				my $self = shift;
708				my $clusters = shift;
709				my $K = shift;
710				my $final_cluster_centers;
711				my $iteration_index = 0;
712				while (1) {
713				my $new_clusters;
714				my $assignment_changed_flag = 0;
715				my $current_cluster_center_index = 0;
716				my $cluster_size_zero_condition = 0;
717				my $how_many = @$clusters;
718				my $cluster_centers_and_covariances =
719				$self->update_cluster_centers_and_covariances_mahalanobis(deep_copy_AoA_with_nulls($clusters));
720				my $cluster_centers = $cluster_centers_and_covariances->[0];
721				my $cluster_covariances = $cluster_centers_and_covariances->[1];
722				$iteration_index++;
723				foreach my $cluster (@$clusters) {
724				my $current_cluster_center = $cluster_centers->[$current_cluster_center_index];
725				my $current_cluster_covariance = $cluster_covariances->[$current_cluster_center_index];
726				foreach my $ele (@$cluster) {
727				my @mahalanobis_dist_from_clust_centers;
728				foreach my $i (0..@$clusters-1) {
729				my $center = $cluster_centers->[$i];
730				my $covariance = $cluster_covariances->[$i];
731				my $maha_distance;
732				eval {
733				$maha_distance = $self->distance_mahalanobis($ele, $center, $covariance);
734				};
735				next if $@;
736				push @mahalanobis_dist_from_clust_centers, $maha_distance;
737				}
738				my ($min, $best_center_index) = minimum( \@mahalanobis_dist_from_clust_centers );
739				my $best_cluster_center = $cluster_centers->[$best_center_index];
740				if (vector_equal($current_cluster_center, $best_cluster_center)){
741				push @{$new_clusters->[$current_cluster_center_index]}, $ele;
742				} else {
743				$assignment_changed_flag = 1;
744				push @{$new_clusters->[$best_center_index]}, $ele;
745				}
746				}
747				$current_cluster_center_index++;
748				}
749				next if ((@$new_clusters != @$clusters) && ($iteration_index < 100));
750				# Now make sure that none of the K clusters is an empty cluster:
751				foreach my $newcluster (@$new_clusters) {
752				$cluster_size_zero_condition = 1 if ((!defined $newcluster) or (@$newcluster == 0));
753				}
754				push @$new_clusters, (undef) x ($K - @$new_clusters) if @$new_clusters < $K;
755				# During clustering for a fixed K, should a cluster inadvertently
756				# become empty, steal a member from the largest cluster to hopefully
757				# spawn a new cluster:
758				my $largest_cluster;
759				foreach my $local_cluster (@$new_clusters) {
760				next if !defined $local_cluster;
761				$largest_cluster = $local_cluster if !defined $largest_cluster;
762				if (@$local_cluster > @$largest_cluster) {
763				$largest_cluster = $local_cluster;
764				}
765				}
766				foreach my $local_cluster (@$new_clusters) {
767				if ( (!defined $local_cluster) \|\| (@$local_cluster == 0) ) {
768				push @$local_cluster, pop @$largest_cluster;
769				}
770				}
771				next if (($cluster_size_zero_condition) && ($iteration_index < 100));
772				last if $iteration_index == 100;
773				# Now do a deep copy of new_clusters into clusters
774				$clusters = deep_copy_AoA( $new_clusters );
775				last if $assignment_changed_flag == 0;
776				}
777				$final_cluster_centers = $self->update_cluster_centers( $clusters );
778				return ($clusters, $final_cluster_centers);
779				}
780
781				sub update_cluster_centers_and_covariances_mahalanobis {
782				my $self = shift;
783				my @clusters = @{ shift @_ };
784				my @new_cluster_centers;
785				my @new_cluster_covariances;
786				# During clustering for a fixed K, should a cluster inadvertently become empty,
787				# steal a member from the largest cluster to hopefully spawn a new cluster:
788				my $largest_cluster;
789				foreach my $cluster (@clusters) {
790				next if !defined $cluster;
791				$largest_cluster = $cluster if !defined $largest_cluster;
792				if (@$cluster > @$largest_cluster) {
793				$largest_cluster = $cluster;
794				}
795				}
796				foreach my $cluster (@clusters) {
797				if ( (!defined $cluster) \|\| (@$cluster == 0) ) {
798				push @$cluster, pop @$largest_cluster;
799				}
800				}
801				foreach my $cluster (@clusters) {
802				die "Cluster became empty --- untenable condition " .
803				"for a given K. Try again. \n" if !defined $cluster;
804				my $cluster_size = @$cluster;
805				die "Cluster size is zero --- untenable.\n" if $cluster_size == 0;
806				my @new_cluster_center = @{$self->add_point_coords( $cluster )};
807				@new_cluster_center = map {my $x = $_/$cluster_size; $x} @new_cluster_center;
808				push @new_cluster_centers, \@new_cluster_center;
809				# for covariance calculation:
810				my ($num_rows,$num_cols) = ($self->{_data_dimensions}, scalar(@$cluster));
811				my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
812				my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
813				# All the record labels are stored in the array $self->{_data_id_tags}. The
814				# actual data for clustering is stored in a hash at $self->{_data} whose keys are
815				# the record labels; the value associated with each key is the array holding the
816				# corresponding numerical multidimensional data.
817				foreach my $j (0..$num_cols-1) {
818				my $tag = $cluster->[$j];
819				my $data = $self->{_data}->{$tag};
820				my @diff_from_mean = vector_subtract($data, \@new_cluster_center);
821				$matrix->set_col($j, \@diff_from_mean);
822				}
823				my $transposed = transpose( $matrix );
824				my $covariance = matrix_multiply( $matrix, $transposed );
825				$covariance *= 1.0 / $num_cols;
826				if ($self->{_debug}) {
827				print "\nDisplaying the Covariance Matrix for cluster:";
828				display_matrix( $covariance );
829				}
830				push @new_cluster_covariances, $covariance;
831				}
832				return [\@new_cluster_centers, \@new_cluster_covariances];
833				}
834
835				# After each new assignment of the data points to the clusters on the basis of the
836				# current values for the cluster centers, we call the routine shown here for updating
837				# the values of the cluster centers.
838				sub update_cluster_centers {
839				my $self = shift;
840				my @clusters = @{ shift @_ };
841				my @new_cluster_centers;
842				# During clustering for a fixed K, should a cluster inadvertently become empty,
843				# steal a member from the largest cluster to hopefully spawn a new cluster:
844				my $largest_cluster;
845				foreach my $cluster (@clusters) {
846				next if !defined $cluster;
847				$largest_cluster = $cluster if !defined $largest_cluster;
848				if (@$cluster > @$largest_cluster) {
849				$largest_cluster = $cluster;
850				}
851				}
852				foreach my $cluster (@clusters) {
853				if ( (!defined $cluster) \|\| (@$cluster == 0) ) {
854				push @$cluster, pop @$largest_cluster;
855				}
856				}
857				foreach my $cluster (@clusters) {
858				die "Cluster became empty --- untenable condition " .
859				"for a given K. Try again. \n" if !defined $cluster;
860				my $cluster_size = @$cluster;
861				die "Cluster size is zero --- untenable.\n" if $cluster_size == 0;
862				my @new_cluster_center = @{$self->add_point_coords( $cluster )};
863				@new_cluster_center = map {my $x = $_/$cluster_size; $x}
864				@new_cluster_center;
865				push @new_cluster_centers, \@new_cluster_center;
866				}
867				return \@new_cluster_centers;
868				}
869
870				sub which_cluster_for_new_data_element {
871				my $self = shift;
872				my $ele = shift;
873				die "The dimensionality of the new data element is not correct: $!"
874				unless @$ele == $self->{_data_dimensions};
875				my %distance_to_new_ele_hash;
876				foreach my $cluster_id (sort keys %{$self->{_cluster_centers_hash}}) {
877				$distance_to_new_ele_hash{$cluster_id} = $self->distance2($ele,
878				$self->{_cluster_centers_hash}->{$cluster_id});
879				}
880				my @values = values %distance_to_new_ele_hash;
881				my ($min,$max) = minmax(\@values);
882				my $answer;
883				foreach my $cluster_id (keys %distance_to_new_ele_hash) {
884				$answer = $cluster_id if $distance_to_new_ele_hash{$cluster_id} == $min;
885				}
886				return $answer;
887				}
888
889				sub which_cluster_for_new_data_element_mahalanobis {
890				my $self = shift;
891				my $ele = shift;
892				die "The dimensionality of the new data element is not correct: $!"
893				unless @$ele == $self->{_data_dimensions};
894				my %distance_to_new_ele_hash;
895				foreach my $cluster_id (sort keys %{$self->{_cluster_centers_hash}}) {
896				$distance_to_new_ele_hash{$cluster_id} =
897				$self->distance_mahalanobis2($ele, $self->{_cluster_centers_hash}->{$cluster_id},
898				$self->{_cluster_covariances_hash}->{$cluster_id});
899				}
900				my @values = values %distance_to_new_ele_hash;
901				my ($min,$max) = minmax(\@values);
902				my $answer;
903				foreach my $cluster_id (keys %distance_to_new_ele_hash) {
904				$answer = $cluster_id if $distance_to_new_ele_hash{$cluster_id} == $min;
905				}
906				return $answer;
907				}
908
909				# The following function returns the value of QoC for a given partitioning of the
910				# data into K clusters. It calculates two things: the average value for the distance
911				# between a data point and the center of the cluster in which the data point resides,
912				# and the average value for the distances between the cluster centers. We obviously
913				# want to minimize the former and maximize the latter. All of the "from center"
914				# distances within each cluster are stored in the variable
915				# $sum_of_distances_for_one_cluster. When this variable, after it is divided by the
916				# number of data elements in the cluster, is summed over all the clusters, we get a
917				# value that is stored in $avg_dist_for_cluster. The inter-cluster-center distances
918				# are stored in the variable $inter_cluster_center_dist.
919				sub cluster_quality {
920				my $self = shift;
921				my $clusters = shift;
922				my $cluster_centers = shift;
923				my $K = @$cluster_centers; # Number of clusters
924				my $cluster_radius = 0;
925				foreach my $i (0..@$clusters-1) {
926				my $sum_of_distances_for_one_cluster = 0;
927				foreach my $ele (@{$clusters->[$i]}) {
928				$sum_of_distances_for_one_cluster +=
929				$self->distance( $ele, $cluster_centers->[$i] );
930				}
931				$cluster_radius +=
932				$sum_of_distances_for_one_cluster / @{$clusters->[$i]};
933				}
934				my $inter_cluster_center_dist = 0;
935				foreach my $i (0..@$cluster_centers-1) {
936				foreach my $j (0..@$cluster_centers-1) {
937				$inter_cluster_center_dist +=
938				$self->distance2( $cluster_centers->[$i],
939				$cluster_centers->[$j] );
940				}
941				}
942				my $avg_inter_cluster_center_dist = $inter_cluster_center_dist /
943				( $K * ($K-1) / 2.0 );
944				return $cluster_radius / $avg_inter_cluster_center_dist;
945				}
946
947				# The following routine is for computing the distance between a data point specified
948				# by its symbolic name in the master datafile and a point (such as the center of a
949				# cluster) expressed as a vector of coordinates:
950				sub distance {
951				my $self = shift;
952				my $ele1_id = shift @_; # symbolic name of data sample
953				my @ele1 = @{$self->{_data}->{$ele1_id}};
954				my @ele2 = @{shift @_};
955				die "wrong data types for distance calculation\n" if @ele1 != @ele2;
956				my $how_many = @ele1;
957				my $squared_sum = 0;
958				foreach my $i (0..$how_many-1) {
959				$squared_sum += ($ele1[$i] - $ele2[$i])**2;
960				}
961				my $dist = sqrt $squared_sum;
962				return $dist;
963				}
964
965				# The following routine does the same as above but now both arguments are expected to
966				# be arrays of numbers:
967				sub distance2 {
968				my $self = shift;
969				my @ele1 = @{shift @_};
970				my @ele2 = @{shift @_};
971				die "wrong data types for distance calculation\n" if @ele1 != @ele2;
972				my $how_many = @ele1;
973				my $squared_sum = 0;
974				foreach my $i (0..$how_many-1) {
975				$squared_sum += ($ele1[$i] - $ele2[$i])**2;
976				}
977				return sqrt $squared_sum;
978				}
979
980				# Requires three args: $ele for the symbolic tag of the element, $center for the
981				# coordinates of the center of the cluster, and $covariance for the covariance of
982				# cluster. Our goal is to find the distance of the element ele from the cluster
983				# whose mean and covariance are provided.
984				sub distance_mahalanobis {
985				my $self = shift;
986				my $ele = shift;
987				my $cluster_center = shift;
988				my $cluster_covariance = shift;
989				my $det_of_covar = $cluster_covariance->det();
990				my $ele_data = $self->{_data}->{$ele};
991				my @diff_from_mean = vector_subtract($ele_data, $cluster_center);
992				my $vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
993				$vec->set_col(0, \@diff_from_mean);
994				my $transposed = transpose( $vec );
995				my $covariance_inverse;
996				if ($cluster_covariance->det() > 0.001) {
997				$covariance_inverse = $cluster_covariance->inverse;
998				} else {
999				die "covariance matrix may not have an inverse";
1000				}
1001				my $determinant = $covariance_inverse->det();
1002				my $distance = $transposed * $covariance_inverse * $vec;
1003				my @distance = $distance->as_list;
1004				$distance = $distance[0];
1005				return sqrt $distance;
1006				}
1007				# Same as the previous method except that the first argument can be the actual
1008				# coordinates of the data element. Our goal is to find the Mahalanobis distance
1009				# from a given data element to a cluster whose center and covariance are known. As
1010				# for the previous method, it requires three arguments.
1011				sub distance_mahalanobis2 {
1012				my $self = shift;
1013				my $ele = shift; # is now a ref to the array of coords for a point
1014				my $cluster_center = shift;
1015				my $cluster_covariance = shift;
1016				return undef unless defined $cluster_covariance;
1017				my $det_of_covar = $cluster_covariance->det();
1018				my @diff_from_mean = vector_subtract($ele, $cluster_center);
1019				my $vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
1020				$vec->set_col(0, \@diff_from_mean);
1021				my $transposed = transpose( $vec );
1022				my $covariance_inverse;
1023				if ($cluster_covariance->det() > 0.001) {
1024				$covariance_inverse = $cluster_covariance->inverse;
1025				} else {
1026				die "covariance matrix may not have an inverse";
1027				}
1028				my $determinant = $covariance_inverse->det();
1029				my $distance = $transposed * $covariance_inverse * $vec;
1030				my @distance = $distance->as_list;
1031				$distance = $distance[0];
1032				return sqrt $distance;
1033				}
1034
1035				sub estimate_cluster_covariance {
1036				my $self = shift;
1037				my $cluster = shift;
1038				my $cluster_size = @$cluster;
1039				my @cluster_center = @{$self->add_point_coords( $cluster )};
1040				@cluster_center = map {my $x = $_/$cluster_size; $x} @cluster_center;
1041				# for covariance calculation:
1042				my ($num_rows,$num_cols) = ($self->{_data_dimensions}, scalar(@$cluster));
1043				my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
1044				my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
1045				# All the record labels are stored in the array $self->{_data_id_tags}. The
1046				# actual data for clustering is stored in a hash at $self->{_data} whose keys are
1047				# the record labels; the value associated with each key is the array holding the
1048				# corresponding numerical multidimensional data.
1049				foreach my $j (0..$num_cols-1) {
1050				my $tag = $cluster->[$j];
1051				my $data = $self->{_data}->{$tag};
1052				my @diff_from_mean = vector_subtract($data, \@cluster_center);
1053				$matrix->set_col($j, \@diff_from_mean);
1054				}
1055				my $transposed = transpose( $matrix );
1056				my $covariance = $matrix * $transposed;
1057				$covariance *= 1.0 / $num_cols;
1058				if ($self->{_debug}) {
1059				print "\nDisplaying the Covariance Matrix for cluster:";
1060				display_matrix( $covariance );
1061				}
1062				return $covariance;
1063				}
1064
1065				sub write_clusters_to_files {
1066				my $self = shift;
1067				my @clusters = @{$self->{_clusters}};
1068				unlink glob "cluster*.dat";
1069				foreach my $i (0..@clusters-1) {
1070				my $filename = "cluster" . $i . ".dat";
1071				print "\nWriting cluster $i to file $filename\n" if $self->{_terminal_output};
1072				open FILEHANDLE, "\| sort > $filename" or die "Unable to open file: $!";
1073				foreach my $ele (@{$clusters[$i]}) {
1074				print FILEHANDLE "$ele ";
1075				}
1076				close FILEHANDLE;
1077				}
1078				}
1079
1080				sub get_K_best {
1081				my $self = shift;
1082				croak "You need to run the clusterer with K=0 option " .
1083				"before you can call this method" if $self->{_K_best} eq 'unknown';
1084				print "\nThe best value of K: $self->{_K_best}\n" if $self->{_terminal_output};
1085				return $self->{_K_best};
1086				}
1087
1088				sub show_QoC_values {
1089				my $self = shift;
1090				croak "\nYou need to run the clusterer with K=0 option before you can call this method"
1091				if $self->{_K_best} eq 'unknown';
1092				print "\nShown below are K on the left and the QoC values on the right (the smaller " .
1093				"the QoC, the better it is)\n";
1094				foreach my $key (sort keys %{$self->{_QoC_values}} ) {
1095				print " $key => $self->{_QoC_values}->{$key}\n" if defined $self->{_QoC_values}->{$key};
1096				}
1097				}
1098
1099				sub DESTROY {
1100				unlink "__temp_" . basename($_[0]->{_datafile});
1101				unlink "__temp_data_" . basename($_[0]->{_datafile});
1102				unlink "__temp_normed_data_" . basename($_[0]->{_datafile});
1103				}
1104
1105				################################## Visualization Code ###################################
1106
1107				# It makes sense to call visualize_clusters() only AFTER you have called kmeans().
1108				#
1109				# The visualize_clusters() implementation automatically figures out whether it
1110				# should do a 2D plot or a 3D plot. If the number of on bits in the mask that is
1111				# supplied as one of the arguments is greater than 2, it does a 3D plot for the
1112				# first three data coordinates. That is, the clusters will be displayed in the 3D
1113				# space formed by the first three data coordinates. On the other hand, if the number
1114				# of on bits in the mask is exactly 2, it does a 2D plot. Should it happen that
1115				# only one on bit is specified for the mask, visualize_clusters() aborts.
1116				#
1117				# The visualization code consists of first accessing each of clusters created by the
1118				# kmeans() subroutine. Note that the clusters contain only the symbolic names for
1119				# the individual records in the source data file. We therefore next reach into the
1120				# $self->{_original_data} hash and get the data coordinates associated with each
1121				# symbolic label in a cluster. The numerical data thus generated is then written
1122				# out to a temp file. When doing so we must remember to insert TWO BLANK LINES
1123				# between the data blocks corresponding to the different clusters. This constraint
1124				# is imposed on us by Gnuplot when plotting data from the same file since we want to
1125				# use different point styles for the data points in different cluster files.
1126				#
1127				# Subsequently, we call upon the Perl interface provided by the Graphics::GnuplotIF
1128				# module to plot the data clusters.
1129				sub visualize_clusters {
1130				my $self = shift;
1131				my $v_mask;
1132				my $pause_time;
1133				if (@_ == 1) {
1134				$v_mask = shift \|\| croak "visualization mask missing";
1135				} elsif (@_ == 2) {
1136				$v_mask = shift \|\| croak "visualization mask missing";
1137				$pause_time = shift;
1138				} else {
1139				croak "visualize_clusters() called with wrong args";
1140				}
1141				my $master_datafile = $self->{_datafile};
1142				my @v_mask = split //, $v_mask;
1143				my $visualization_mask_width = @v_mask;
1144				my $original_data_mask = $self->{_mask};
1145				my @mask = split //, $original_data_mask;
1146				my $data_field_width = scalar grep {$_ eq '1'} @mask;
1147				croak "\n\nABORTED: The width of the visualization mask (including " .
1148				"all its 1s and 0s) must equal the width of the original mask " .
1149				"used for reading the data file (counting only the 1's)"
1150				if $visualization_mask_width != $data_field_width;
1151				my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
1152				my %visualization_data;
1153				while ( my ($record_id, $data) = each %{$self->{_original_data}} ) {
1154				my @fields = @$data;
1155				croak "\nABORTED: Visualization mask size exceeds data record size\n"
1156				if $#v_mask > $#fields;
1157				my @data_fields;
1158				foreach my $i (0..@fields-1) {
1159				if ($v_mask[$i] eq '0') {
1160				next;
1161				} elsif ($v_mask[$i] eq '1') {
1162				push @data_fields, $fields[$i];
1163				} else {
1164				croak "Misformed visualization mask. It can only have 1s and 0s\n";
1165				}
1166				}
1167				$visualization_data{ $record_id } = \@data_fields;
1168				}
1169				my @all_data_ids = @{$self->{_data_id_tags}};
1170				my $K = scalar @{$self->{_clusters}};
1171				my $filename = basename($master_datafile);
1172				my $temp_file = "__temp_" . $filename;
1173				unlink $temp_file if -e $temp_file;
1174				open OUTPUT, ">$temp_file"
1175				or die "Unable to open a temp file in this directory: $!\n";
1176				foreach my $cluster (@{$self->{_clusters}}) {
1177				foreach my $item (@$cluster) {
1178				print OUTPUT "@{$visualization_data{$item}}";
1179				print OUTPUT "\n";
1180				}
1181				print OUTPUT "\n\n";
1182				}
1183				close OUTPUT;
1184				my $plot;
1185				my $hardcopy_plot;
1186				if (!defined $pause_time) {
1187				$plot = Graphics::GnuplotIF->new( persist => 1 );
1188				$hardcopy_plot = Graphics::GnuplotIF->new();
1189				$hardcopy_plot->gnuplot_cmd('set terminal png', "set output \"clustering_results.png\"");
1190				} else {
1191				$plot = Graphics::GnuplotIF->new();
1192				}
1193				$plot->gnuplot_cmd( "set noclip" );
1194				$plot->gnuplot_cmd( "set pointsize 2" );
1195				my $arg_string = "";
1196				if ($visualization_data_field_width > 2) {
1197				foreach my $i (0..$K-1) {
1198				my $j = $i + 1;
1199				$arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster $i\" with points lt $j pt $j, ";
1200				}
1201				} elsif ($visualization_data_field_width == 2) {
1202				foreach my $i (0..$K-1) {
1203				my $j = $i + 1;
1204				$arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster $i\" with points lt $j pt $j, ";
1205				}
1206				} elsif ($visualization_data_field_width == 1 ) {
1207				foreach my $i (0..$K-1) {
1208				my $j = $i + 1;
1209				$arg_string .= "\"$temp_file\" index $i using 1 title \"Cluster $i\" with points lt $j pt $j, ";
1210				}
1211				}
1212				$arg_string = $arg_string =~ /^(.*),[ ]+$/;
1213				$arg_string = $1;
1214				if ($visualization_data_field_width > 2) {
1215				$plot->gnuplot_cmd( "splot $arg_string" );
1216				$hardcopy_plot->gnuplot_cmd( "splot $arg_string" ) unless defined $pause_time;
1217				$plot->gnuplot_pause( $pause_time ) if defined $pause_time;
1218				} elsif ($visualization_data_field_width == 2) {
1219				$plot->gnuplot_cmd( "plot $arg_string" );
1220				$hardcopy_plot->gnuplot_cmd( "plot $arg_string" ) unless defined $pause_time;
1221				$plot->gnuplot_pause( $pause_time ) if defined $pause_time;
1222				} elsif ($visualization_data_field_width == 1) {
1223				croak "No provision for plotting 1-D data\n";
1224				}
1225				}
1226
1227				# It makes sense to call visualize_data() only AFTER you have called the method
1228				# read_data_from_file().
1229				#
1230				# The visualize_data() is meant for the visualization of the original data in its
1231				# various 2D or 3D subspaces. The method can also be used to visualize the normed
1232				# data in a similar manner. Recall the normed data is the original data after each
1233				# data dimension is normalized by the standard-deviation along that dimension.
1234				#
1235				# Whether you see the original data or the normed data depends on the second
1236				# argument supplied in the method call. It must be either the string 'original' or
1237				# the string 'normed'.
1238				sub visualize_data {
1239				my $self = shift;
1240				my $v_mask = shift \|\| croak "visualization mask missing";
1241				my $datatype = shift; # must be either 'original' or 'normed'
1242				croak "\n\nABORTED: You called visualize_data() for normed data " .
1243				"but without first turning on data normalization in the " .
1244				"in the KMeans constructor"
1245				if ($datatype eq 'normed') && ! $self->{_var_normalize};
1246				my $master_datafile = $self->{_datafile};
1247				my @v_mask = split //, $v_mask;
1248				my $visualization_mask_width = @v_mask;
1249				my $original_data_mask = $self->{_mask};
1250				my @mask = split //, $original_data_mask;
1251				my $data_field_width = scalar grep {$_ eq '1'} @mask;
1252				croak "\n\nABORTED: The width of the visualization mask (including " .
1253				"all its 1s and 0s) must equal the width of the original mask " .
1254				"used for reading the data file (counting only the 1's)"
1255				if $visualization_mask_width != $data_field_width;
1256				my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
1257				my %visualization_data;
1258				my $data_source;
1259				if ($datatype eq 'original') {
1260				$data_source = $self->{_original_data};
1261				} elsif ($datatype eq 'normed') {
1262				$data_source = $self->{_data};
1263				} else {
1264				croak "\n\nABORTED: improper call to visualize_data()";
1265				}
1266				while ( my ($record_id, $data) = each %{$data_source} ) {
1267				my @fields = @$data;
1268				croak "\nABORTED: Visualization mask size exceeds data record size\n"
1269				if $#v_mask > $#fields;
1270				my @data_fields;
1271				foreach my $i (0..@fields-1) {
1272				if ($v_mask[$i] eq '0') {
1273				next;
1274				} elsif ($v_mask[$i] eq '1') {
1275				push @data_fields, $fields[$i];
1276				} else {
1277				croak "Misformed visualization mask. It can only have 1s and 0s\n";
1278				}
1279				}
1280				$visualization_data{ $record_id } = \@data_fields;
1281				}
1282				my $filename = basename($master_datafile);
1283				my $temp_file;
1284				if ($datatype eq 'original') {
1285				$temp_file = "__temp_data_" . $filename;
1286				} elsif ($datatype eq 'normed') {
1287				$temp_file = "__temp_normed_data_" . $filename;
1288				} else {
1289				croak "ABORTED: Improper call to visualize_data()";
1290				}
1291				unlink $temp_file if -e $temp_file;
1292				open OUTPUT, ">$temp_file"
1293				or die "Unable to open a temp file in this directory: $!\n";
1294				foreach my $datapoint (values %visualization_data) {
1295				print OUTPUT "@$datapoint";
1296				print OUTPUT "\n";
1297				}
1298				close OUTPUT;
1299				my $plot = Graphics::GnuplotIF->new( persist => 1 );
1300				$plot->gnuplot_cmd( "set noclip" );
1301				$plot->gnuplot_cmd( "set pointsize 2" );
1302				my $plot_title = $datatype eq 'original' ? '"data"' : '"normed data"';
1303				my $arg_string ;
1304				if ($visualization_data_field_width > 2) {
1305				$arg_string = "\"$temp_file\" using 1:2:3 title $plot_title with points lt -1 pt 1";
1306				} elsif ($visualization_data_field_width == 2) {
1307				$arg_string = "\"$temp_file\" using 1:2 title $plot_title with points lt -1 pt 1";
1308				} elsif ($visualization_data_field_width == 1 ) {
1309				$arg_string = "\"$temp_file\" using 1 notitle with points lt -1 pt 1";
1310				}
1311				if ($visualization_data_field_width > 2) {
1312				$plot->gnuplot_cmd( "splot $arg_string" );
1313				} elsif ($visualization_data_field_width == 2) {
1314				$plot->gnuplot_cmd( "plot $arg_string" );
1315				} elsif ($visualization_data_field_width == 1) {
1316				croak "No provision for plotting 1-D data\n";
1317				}
1318				}
1319
1320				########################### Generating Synthetic Data for Clustering ##############################
1321
1322				# The data generated corresponds to a multivariate distribution. The mean and the
1323				# covariance of each Gaussian in the distribution are specified individually in a
1324				# parameter file. See the example parameter file param.txt in the examples
1325				# directory. Just edit this file for your own needs.
1326				#
1327				# The multivariate random numbers are generated by calling the Math::Random module.
1328				# As you would expect, that module will insist that the covariance matrix you
1329				# specify be symmetric and positive definite.
1330				sub cluster_data_generator {
1331				my $class = shift;
1332				croak "illegal call of a class method" unless $class eq 'Algorithm::KMeans';
1333				my %args = @_;
1334				my $input_parameter_file = $args{input_parameter_file};
1335				my $output_file = $args{output_datafile};
1336				my $N = $args{number_data_points_per_cluster};
1337				my @all_params;
1338				my $param_string;
1339				if (defined $input_parameter_file) {
1340				open INPUT, $input_parameter_file \|\| "unable to open parameter file: $!";
1341				@all_params = ;
1342				@all_params = grep { $_ !~ /^[ ]*#/ } @all_params;
1343				chomp @all_params;
1344				$param_string = join ' ', @all_params;
1345				} else {
1346				# Just for testing. Used in t/test.t
1347				$param_string = "cluster 5 0 0 1 0 0 0 1 0 0 0 1 " .
1348				"cluster 0 5 0 1 0 0 0 1 0 0 0 1 " .
1349				"cluster 0 0 5 1 0 0 0 1 0 0 0 1";
1350				}
1351				my @cluster_strings = split /[ ]cluster[ ]/, $param_string;
1352				@cluster_strings = grep $_, @cluster_strings;
1353				my $K = @cluster_strings;
1354				croak "Too many clusters requested" if $K > 12;
1355				my @point_labels = ('a'..'z');
1356				print "Number of Gaussians used for the synthetic data: $K\n";
1357				my @means;
1358				my @covariances;
1359				my $data_dimension;
1360				foreach my $i (0..$K-1) {
1361				my @num_strings = split / /, $cluster_strings[$i];
1362				my @cluster_mean = map {/$_num_regex/;$_} split / /, $num_strings[0];
1363				$data_dimension = @cluster_mean;
1364				push @means, \@cluster_mean;
1365				my @covariance_nums = map {/$_num_regex/;$_} split / /, $num_strings[1];
1366				croak "dimensionality error" if @covariance_nums !=
1367				($data_dimension ** 2);
1368				my $cluster_covariance;
1369				foreach my $j (0..$data_dimension-1) {
1370				foreach my $k (0..$data_dimension-1) {
1371				$cluster_covariance->[$j]->[$k] =
1372				$covariance_nums[$j*$data_dimension + $k];
1373				}
1374				}
1375				push @covariances, $cluster_covariance;
1376				}
1377				random_seed_from_phrase( 'hellojello' );
1378				my @data_dump;
1379				foreach my $i (0..$K-1) {
1380				my @m = @{shift @means};
1381				my @covar = @{shift @covariances};
1382				my @new_data = Math::Random::random_multivariate_normal( $N, @m, @covar );
1383				my $p = 0;
1384				my $label = $point_labels[$i];
1385				@new_data = map {unshift @$_, $label.$i; $i++; $_} @new_data;
1386				push @data_dump, @new_data;
1387				}
1388				fisher_yates_shuffle( \@data_dump );
1389				open OUTPUT, ">$output_file";
1390				foreach my $ele (@data_dump) {
1391				foreach my $coord ( @$ele ) {
1392				print OUTPUT "$coord ";
1393				}
1394				print OUTPUT "\n";
1395				}
1396				print "Data written out to file $output_file\n";
1397				close OUTPUT;
1398				}
1399
1400				sub add_point_coords {
1401				my $self = shift;
1402				my @arr_of_ids = @{shift @_}; # array of data element names
1403				my @result;
1404				my $data_dimensionality = $self->{_data_dimensions};
1405				foreach my $i (0..$data_dimensionality-1) {
1406				$result[$i] = 0.0;
1407				}
1408				foreach my $id (@arr_of_ids) {
1409				my $ele = $self->{_data}->{$id};
1410				my $i = 0;
1411				foreach my $component (@$ele) {
1412				$result[$i] += $component;
1413				$i++;
1414				}
1415				}
1416				return \@result;
1417				}
1418
1419				sub add_point_coords_from_original_data {
1420				my $self = shift;
1421				my @arr_of_ids = @{shift @_}; # array of data element names
1422				my @result;
1423				my $data_dimensionality = $self->{_data_dimensions};
1424				foreach my $i (0..$data_dimensionality-1) {
1425				$result[$i] = 0.0;
1426				}
1427				foreach my $id (@arr_of_ids) {
1428				my $ele = $self->{_original_data}->{$id};
1429				my $i = 0;
1430				foreach my $component (@$ele) {
1431				$result[$i] += $component;
1432				$i++;
1433				}
1434				}
1435				return \@result;
1436				}
1437
1438				################################### Support Routines ########################################
1439
1440				sub get_index_at_value {
1441				my $value = shift;
1442				my @array = @{shift @_};
1443				foreach my $i (0..@array-1) {
1444				return $i if $value == $array[$i];
1445				}
1446				return -1;
1447				}
1448
1449				# This routine is really not necessary in light of the new `~~' operator in Perl.
1450				# Will use the new operator in the next version.
1451				sub vector_equal {
1452				my $vec1 = shift;
1453				my $vec2 = shift;
1454				die "wrong data types for vector-equal predicate\n" if @$vec1 != @$vec2;
1455				foreach my $i (0..@$vec1-1){
1456				return 0 if $vec1->[$i] != $vec2->[$i];
1457				}
1458				return 1;
1459				}
1460
1461				# Returns the minimum value and its positional index in an array
1462				sub minimum {
1463				my $arr = shift;
1464				my $min;
1465				my $index;
1466				foreach my $i (0..@{$arr}-1) {
1467				if ( (!defined $min) \|\| ($arr->[$i] < $min) ) {
1468				$index = $i;
1469				$min = $arr->[$i];
1470				}
1471				}
1472				return ($min, $index);
1473				}
1474
1475				sub minmax {
1476				my $arr = shift;
1477				my $min;
1478				my $max;
1479				foreach my $i (0..@{$arr}-1) {
1480				next if !defined $arr->[$i];
1481				if ( (!defined $min) && (!defined $max) ) {
1482				$min = $arr->[$i];
1483				$max = $arr->[$i];
1484				} elsif ( $arr->[$i] < $min ) {
1485				$min = $arr->[$i];
1486				} elsif ( $arr->[$i] > $max ) {
1487				$max = $arr->[$i];
1488				}
1489				}
1490				return ($min, $max);
1491				}
1492
1493				# Meant only for constructing a deep copy of an array of arrays:
1494				sub deep_copy_AoA {
1495				my $ref_in = shift;
1496				my $ref_out;
1497				foreach my $i (0..@{$ref_in}-1) {
1498				foreach my $j (0..@{$ref_in->[$i]}-1) {
1499				$ref_out->[$i]->[$j] = $ref_in->[$i]->[$j];
1500				}
1501				}
1502				return $ref_out;
1503				}
1504
1505				# Meant only for constructing a deep copy of an array of arrays for the case when
1506				# some elements of the top-level array may be undefined:
1507				sub deep_copy_AoA_with_nulls {
1508				my $ref_in = shift;
1509				my $ref_out;
1510				foreach my $i (0..@{$ref_in}-1) {
1511				if ( !defined $ref_in->[$i] ) {
1512				$ref_out->[$i] = undef;
1513				next;
1514				}
1515				foreach my $j (0..@{$ref_in->[$i]}-1) {
1516				$ref_out->[$i]->[$j] = $ref_in->[$i]->[$j];
1517				}
1518				}
1519				return $ref_out;
1520				}
1521
1522				# Meant only for constructing a deep copy of a hash in which each value is an
1523				# anonymous array of numbers:
1524				sub deep_copy_hash {
1525				my $ref_in = shift;
1526				my $ref_out;
1527				while ( my ($key, $value) = each( %$ref_in ) ) {
1528				$ref_out->{$key} = deep_copy_array( $value );
1529				}
1530				return $ref_out;
1531				}
1532
1533				# Meant only for an array of numbers:
1534				sub deep_copy_array {
1535				my $ref_in = shift;
1536				my $ref_out;
1537				foreach my $i (0..@{$ref_in}-1) {
1538				$ref_out->[$i] = $ref_in->[$i];
1539				}
1540				return $ref_out;
1541				}
1542
1543				sub display_cluster_centers {
1544				my $self = shift;
1545				my @clusters = @{shift @_};
1546				my $i = 0;
1547				foreach my $cluster (@clusters) {
1548				my $cluster_size = @$cluster;
1549				my @cluster_center =
1550				@{$self->add_point_coords_from_original_data( $cluster )};
1551				@cluster_center = map {my $x = $_/$cluster_size; $x} @cluster_center;
1552				print "\ncluster $i ($cluster_size records):\n";
1553				print "cluster center $i: " .
1554				"@{[map {my $x = sprintf('%.4f', $_); $x} @cluster_center]}\n";
1555				$i++;
1556				}
1557				}
1558
1559				# For displaying the individual clusters on a terminal screen. Each cluster is
1560				# displayed through the symbolic names associated with the data points.
1561				sub display_clusters {
1562				my @clusters = @{shift @_};
1563				my $i = 0;
1564				foreach my $cluster (@clusters) {
1565				@$cluster = sort @$cluster;
1566				my $cluster_size = @$cluster;
1567				print "\n\nCluster $i ($cluster_size records):\n";
1568				foreach my $ele (@$cluster) {
1569				print " $ele";
1570				}
1571				$i++
1572				}
1573				print "\n\n";
1574				}
1575
1576				# from perl docs:
1577				sub fisher_yates_shuffle {
1578				my $arr = shift;
1579				my $i = @$arr;
1580				while (--$i) {
1581				my $j = int rand( $i + 1 );
1582				@$arr[$i, $j] = @$arr[$j, $i];
1583				}
1584				}
1585
1586				sub variance_normalization {
1587				my %data_hash = %{shift @_};
1588				my @all_data_points = values %data_hash;
1589				my $dimensions = @{$all_data_points[0]};
1590				my @data_projections;
1591				foreach my $data_point (@all_data_points) {
1592				my $i = 0;
1593				foreach my $proj (@$data_point) {
1594				push @{$data_projections[$i++]}, $proj;
1595				}
1596				}
1597				my @variance_vec;
1598				foreach my $vec (@data_projections) {
1599				my ($mean, $variance) = mean_and_variance( $vec );
1600				push @variance_vec, $variance;
1601				}
1602				my %new_data_hash;
1603				while (my ($label, $data) = each(%data_hash) ) {
1604				my @new_data;
1605				foreach my $i (0..@{$data}-1) {
1606				my $new = $data->[$i] / sqrt($variance_vec[$i]);
1607				push @new_data, $data->[$i] / sqrt($variance_vec[$i]);
1608				}
1609				$new_data_hash{$label} = \@new_data;
1610				}
1611				return \%new_data_hash;
1612				}
1613
1614				sub mean_and_variance {
1615				my @data = @{shift @_};
1616				my ($mean, $variance);
1617				foreach my $i (1..@data) {
1618				if ($i == 1) {
1619				$mean = $data[0];
1620				$variance = 0;
1621				} else {
1622				$mean = ( (($i-1)/$i) * $mean ) + $data[$i-1] / $i;
1623				$variance = ( (($i-1)/$i) * $variance ) + ($data[$i-1]-$mean)**2 / ($i-1);
1624				}
1625				}
1626				return ($mean, $variance);
1627				}
1628
1629				sub check_for_illegal_params {
1630				my @params = @_;
1631				my @legal_params = qw / datafile
1632				mask
1633				K
1634				Kmin
1635				Kmax
1636				terminal_output
1637				write_clusters_to_files
1638				do_variance_normalization
1639				cluster_seeding
1640				use_mahalanobis_metric
1641				debug
1642				/;
1643				my $found_match_flag;
1644				foreach my $param (@params) {
1645				foreach my $legal (@legal_params) {
1646				$found_match_flag = 0;
1647				if ($param eq $legal) {
1648				$found_match_flag = 1;
1649				last;
1650				}
1651				}
1652				last if $found_match_flag == 0;
1653				}
1654				return $found_match_flag;
1655				}
1656
1657				sub get_value_index_hash {
1658				my $arr = shift;
1659				my %hash;
1660				foreach my $index (0..@$arr-1) {
1661				$hash{$arr->[$index]} = $index if $arr->[$index] > 0;
1662				}
1663				return \%hash;
1664				}
1665
1666				sub non_maximum_suppression {
1667				my $arr = shift;
1668				my @output = (0) x @$arr;
1669				my @final_output = (0) x @$arr;
1670				my %hash;
1671				my @array_of_runs = ([$arr->[0]]);
1672				foreach my $index (1..@$arr-1) {
1673				if ($arr->[$index] == $arr->[$index-1]) {
1674				push @{$array_of_runs[-1]}, $arr->[$index];
1675				} else {
1676				push @array_of_runs, [$arr->[$index]];
1677				}
1678				}
1679				my $runstart_index = 0;
1680				foreach my $run_index (1..@array_of_runs-2) {
1681				$runstart_index += @{$array_of_runs[$run_index-1]};
1682				if ($array_of_runs[$run_index]->[0] >
1683				$array_of_runs[$run_index-1]->[0] &&
1684				$array_of_runs[$run_index]->[0] >
1685				$array_of_runs[$run_index+1]->[0]) {
1686				my $run_center = @{$array_of_runs[$run_index]} / 2;
1687				my $assignment_index = $runstart_index + $run_center;
1688				$output[$assignment_index] = $arr->[$assignment_index];
1689				}
1690				}
1691				if ($array_of_runs[-1]->[0] > $array_of_runs[-2]->[0]) {
1692				$runstart_index += @{$array_of_runs[-2]};
1693				my $run_center = @{$array_of_runs[-1]} / 2;
1694				my $assignment_index = $runstart_index + $run_center;
1695				$output[$assignment_index] = $arr->[$assignment_index];
1696				}
1697				if ($array_of_runs[0]->[0] > $array_of_runs[1]->[0]) {
1698				my $run_center = @{$array_of_runs[0]} / 2;
1699				$output[$run_center] = $arr->[$run_center];
1700				}
1701				return \@output;
1702				}
1703
1704				sub display_matrix {
1705				my $matrix = shift;
1706				my $nrows = $matrix->rows();
1707				my $ncols = $matrix->cols();
1708				print "\n\nDisplaying matrix of size $nrows rows and $ncols columns:\n";
1709				foreach my $i (0..$nrows-1) {
1710				my $row = $matrix->row($i);
1711				my @row_as_list = $row->as_list;
1712				print "@row_as_list\n";
1713				}
1714				print "\n\n";
1715				}
1716
1717				sub transpose {
1718				my $matrix = shift;
1719				my $num_rows = $matrix->rows();
1720				my $num_cols = $matrix->cols();
1721				my $transpose = Math::GSL::Matrix->new($num_cols, $num_rows);
1722				foreach my $i (0..$num_rows-1) {
1723				my @row = $matrix->row($i)->as_list;
1724				$transpose->set_col($i, \@row );
1725				}
1726				return $transpose;
1727				}
1728
1729				sub vector_subtract {
1730				my $vec1 = shift;
1731				my $vec2 = shift;
1732				die "wrong data types for vector subtract calculation\n" if @$vec1 != @$vec2;
1733				my @result;
1734				foreach my $i (0..@$vec1-1){
1735				push @result, $vec1->[$i] - $vec2->[$i];
1736				}
1737				return @result;
1738				}
1739
1740				sub vector_multiply {
1741				my $vec1 = shift;
1742				my $vec2 = shift;
1743				die "vec_multiply called with two vectors of different sizes"
1744				unless @$vec1 == @$vec2;
1745				my $result = 0;
1746				foreach my $i (0..@$vec1-1) {
1747				$result += $vec1->[$i] * $vec2->[$i];
1748				}
1749				return $result;
1750				}
1751
1752				sub matrix_multiply {
1753				my $matrix1 = shift;
1754				my $matrix2 = shift;
1755				my ($nrows1, $ncols1) = ($matrix1->rows(), $matrix1->cols());
1756				my ($nrows2, $ncols2) = ($matrix2->rows(), $matrix2->cols());
1757				die "matrix multiplication called with non-matching matrix arguments"
1758				unless $nrows1 == $ncols2 && $ncols1 == $nrows2;
1759				if ($nrows1 == 1) {
1760				my @row = $matrix1->row(0)->as_list;
1761				my @col = $matrix2->col(0)->as_list;
1762				my $result;
1763				foreach my $j (0..$ncols1-1) {
1764				$result += $row[$j] * $col[$j];
1765				}
1766				return $result;
1767				}
1768				my $product = Math::GSL::Matrix->new($nrows1, $nrows1);
1769				foreach my $i (0..$nrows1-1) {
1770				my $row = $matrix1->row($i);
1771				my @product_row;
1772				foreach my $j (0..$ncols2-1) {
1773				my $col = $matrix2->col($j);
1774				my $row_times_col = matrix_multiply($row, $col);
1775				push @product_row, $row_times_col;
1776				}
1777				$product->set_row($i, \@product_row);
1778				}
1779				return $product;
1780				}
1781
1782				1;
1783
1784				=pod
1785
1786				=head1 NAME
1787
1788				Algorithm::KMeans - for clustering multidimensional data
1789
1790				=head1 SYNOPSIS
1791
1792				# You now have four different choices for clustering your data with this module:
1793				#
1794				# 1) With Euclidean distances and with random cluster seeding
1795				#
1796				# 2) With Mahalanobis distances and with random cluster seeding
1797				#
1798				# 3) With Euclidean distances and with smart cluster seeding
1799				#
1800				# 4) With Mahalanobis distances and with smart cluster seeding
1801				#
1802				# Despite the qualifier 'smart' in 'smart cluster seeding', it may not always
1803				# produce results that are superior to those obtained with random seeding. (If you
1804				# also factor in the choice regarding variance normalization, you actually have
1805				# eight different choices for data clustering with this module.)
1806				#
1807				# In all cases, you'd obviously begin with
1808
1809				use Algorithm::KMeans;
1810
1811				# You'd then name the data file:
1812
1813				my $datafile = "mydatafile.dat";
1814
1815				# Next, set the mask to indicate which columns of the datafile to use for
1816				# clustering and which column contains a symbolic ID for each data record. For
1817				# example, if the symbolic name is in the first column, you want the second column
1818				# to be ignored, and you want the next three columns to be used for 3D clustering,
1819				# you'd set the mask to:
1820
1821				my $mask = "N0111";
1822
1823				# Now construct an instance of the clusterer. The parameter K controls the number
1824				# of clusters. If you know how many clusters you want (let's say 3), call
1825
1826				my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
1827				mask => $mask,
1828				K => 3,
1829				cluster_seeding => 'random',
1830				terminal_output => 1,
1831				write_clusters_to_files => 1,
1832				);
1833
1834				# By default, this constructor call will set you up for clustering based on
1835				# Euclidean distances. If you want the module to use Mahalanobis distances, your
1836				# constructor call will look like:
1837
1838				my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
1839				mask => $mask,
1840				K => 3,
1841				cluster_seeding => 'random',
1842				use_mahalanobis_metric => 1,
1843				terminal_output => 1,
1844				write_clusters_to_files => 1,
1845				);
1846
1847				# For both constructor calls shown above, you can use smart seeding of the clusters
1848				# by changing 'random' to 'smart' for the cluster_seeding option. See the
1849				# explanation of smart seeding in the Methods section of this documentation.
1850
1851				# If your data is such that its variability along the different dimensions of the
1852				# data space is significantly different, you may get better clustering if you first
1853				# normalize your data by setting the constructor parameter
1854				# do_variance_normalization as shown below:
1855
1856				my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
1857				mask => $mask,
1858				K => 3,
1859				cluster_seeding => 'smart', # or 'random'
1860				terminal_output => 1,
1861				do_variance_normalization => 1,
1862				write_clusters_to_files => 1,
1863				);
1864
1865				# But bear in mind that such data normalization may actually decrease the
1866				# performance of the clusterer if the variability in the data is more a result of
1867				# the separation between the means than a consequence of intra-cluster variance.
1868
1869				# Set K to 0 if you want the module to figure out the optimum number of clusters
1870				# from the data. (It is best to run this option with the terminal_output set to 1
1871				# so that you can see the different value of QoC for the different K):
1872
1873				my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
1874				mask => $mask,
1875				K => 0,
1876				cluster_seeding => 'random', # or 'smart'
1877				terminal_output => 1,
1878				write_clusters_to_files => 1,
1879				);
1880
1881				# Although not shown above, you can obviously set the 'do_variance_normalization'
1882				# flag here also if you wish.
1883
1884				# For very large data files, setting K to 0 will result in searching through too
1885				# many values for K. For such cases, you can range limit the values of K to search
1886				# through by
1887
1888				my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
1889				mask => "N111",
1890				Kmin => 3,
1891				Kmax => 10,
1892				cluster_seeding => 'random', # or 'smart'
1893				terminal_output => 1,
1894				write_clusters_to_files => 1,
1895				);
1896
1897				# FOR ALL CASES ABOVE, YOU'D NEED TO MAKE THE FOLLOWING CALLS ON THE CLUSTERER
1898				# INSTANCE TO ACTUALLY CLUSTER THE DATA:
1899
1900				$clusterer->read_data_from_file();
1901				$clusterer->kmeans();
1902
1903				# If you want to directly access the clusters and the cluster centers in your own
1904				# top-level script, replace the above two statements with:
1905
1906				$clusterer->read_data_from_file();
1907				my ($clusters_hash, $cluster_centers_hash) = $clusterer->kmeans();
1908
1909				# You can subsequently access the clusters directly in your own code, as in:
1910
1911				foreach my $cluster_id (sort keys %{$clusters_hash}) {
1912				print "\n$cluster_id => @{$clusters_hash->{$cluster_id}}\n";
1913				}
1914				foreach my $cluster_id (sort keys %{$cluster_centers_hash}) {
1915				print "\n$cluster_id => @{$cluster_centers_hash->{$cluster_id}}\n";
1916				}
1917
1918
1919				# CLUSTER VISUALIZATION:
1920
1921				# You must first set the mask for cluster visualization. This mask tells the module
1922				# which 2D or 3D subspace of the original data space you wish to visualize the
1923				# clusters in:
1924
1925				my $visualization_mask = "111";
1926				$clusterer->visualize_clusters($visualization_mask);
1927
1928
1929				# SYNTHETIC DATA GENERATION:
1930
1931				# The module has been provided with a class method for generating multivariate data
1932				# for experimenting with clustering. The data generation is controlled by the
1933				# contents of the parameter file that is supplied as an argument to the data
1934				# generator method. The mean and covariance matrix entries in the parameter file
1935				# must be according to the syntax shown in the param.txt file in the examples
1936				# directory. It is best to edit this file as needed:
1937
1938				my $parameter_file = "param.txt";
1939				my $out_datafile = "mydatafile.dat";
1940				Algorithm::KMeans->cluster_data_generator(
1941				input_parameter_file => $parameter_file,
1942				output_datafile => $out_datafile,
1943				number_data_points_per_cluster => $N );
1944
1945				=head1 CHANGES
1946
1947				Version 2.03 incorporates minor code cleanup. The main implementation of the module
1948				remains unchanged.
1949
1950				Version 2.02 downshifts the version of Perl that is required for this module. The
1951				module should work with versions 5.10 and higher of Perl. The implementation code
1952				for the module remains unchanged.
1953
1954				Version 2.01 removes many errors in the documentation. The changes made to the module
1955				in Version 2.0 were not reflected properly in the documentation page for that
1956				version. The implementation code remains unchanged.
1957
1958				Version 2.0 includes significant additional functionality: (1) You now have the
1959				option to cluster using the Mahalanobis distance metric (the default is the Euclidean
1960				metric); and (2) With the two C methods that have been added to the
1961				module, you can now determine the best cluster for a new data sample after you have
1962				created the clusters with the previously available data. Finding the best cluster
1963				for a new data sample can be done using either the Euclidean metric or the
1964				Mahalanobis metric.
1965
1966				Version 1.40 includes a C option for seeding the clusters. This option,
1967				supplied through the constructor parameter C, means that the
1968				clusterer will (1) Subject the data to principal components analysis in order to
1969				determine the maximum variance direction; (2) Project the data onto this direction;
1970				(3) Find peaks in a smoothed histogram of the projected points; and (4) Use the
1971				locations of the highest peaks as initial guesses for the cluster centers. If you
1972				don't want to use this option, set C to C. That should work
1973				as in the previous version of the module.
1974
1975				Version 1.30 includes a bug fix for the case when the datafile contains empty lines,
1976				that is, lines with no data records. Another bug fix in Version 1.30 deals with the
1977				case when you want the module to figure out how many clusters to form (this is the
1978				C option in the constructor call) and the number of data records is close to the
1979				minimum.
1980
1981				Version 1.21 includes fixes to handle the possibility that, when clustering the data
1982				for a fixed number of clusters, a cluster may become empty during iterative
1983				calculation of cluster assignments of the data elements and the updating of the
1984				cluster centers. The code changes are in the C and
1985				C subroutines.
1986
1987				Version 1.20 includes an option to normalize the data with respect to its variability
1988				along the different coordinates before clustering is carried out.
1989
1990				Version 1.1.1 allows for range limiting the values of C to search through. C
1991				stands for the number of clusters to form. This version also declares the module
1992				dependencies in the C file.
1993
1994				Version 1.1 is a an object-oriented version of the implementation presented in
1995				version 1.0. The current version should lend itself more easily to code extension.
1996				You could, for example, create your own class by subclassing from the class presented
1997				here and, in your subclass, use your own criteria for the similarity distance between
1998				the data points and for the QoC (Quality of Clustering) metric, and, possibly a
1999				different rule to stop the iterations. Version 1.1 also allows you to directly
2000				access the clusters formed and the cluster centers in your calling script.
2001
2002
2003				=head1 SPECIAL USAGE NOTE
2004
2005				If you were directly accessing in your own scripts the clusters produced by the older
2006				versions of this module, you'd need to make changes to your code if you wish to use
2007				Version 2.0 or higher. Instead of returning arrays of clusters and cluster centers,
2008				Versions 2.0 and higher return hashes. This change was made necessary by the logic
2009				required for implementing the two new C methods that were introduced
2010				in Version 2.0. These methods return the best cluster for a new data sample from the
2011				clusters you created using the existing data. One of the C methods is
2012				based on the Euclidean metric for finding the cluster that is closest to the new data
2013				sample, and the other on the Mahalanobis metric. Another point of incompatibility
2014				with the previous versions is that you must now explicitly set the C
2015				parameter in the call to the constructor to either C or C. This
2016				parameter does not have a default associated with it starting with Version 2.0.
2017
2018
2019				=head1 DESCRIPTION
2020
2021				Clustering with K-Means takes place iteratively and involves two steps: 1) assignment
2022				of data samples to clusters on the basis of how far the data samples are from the
2023				cluster centers; and 2) Recalculation of the cluster centers (and cluster covariances
2024				if you are using the Mahalanobis distance metric for clustering).
2025
2026				Obviously, before the two-step approach can proceed, we need to initialize the the
2027				cluster centers. How this initialization is carried out is important. The module
2028				gives you two very different ways for carrying out this initialization. One option,
2029				called the C option, consists of subjecting the data to principal components
2030				analysis to discover the direction of maximum variance in the data space. The data
2031				points are then projected on to this direction and a histogram constructed from the
2032				projections. Centers of the smoothed histogram are used to seed the clustering
2033				operation. The other option is to choose the cluster centers purely randomly. You
2034				get the first option if you set C to C in the constructor,
2035				and you get the second option if you set it to C.
2036
2037				How to specify the number of clusters, C, is one of the most vexing issues in any
2038				approach to clustering. In some case, we can set C on the basis of prior
2039				knowledge. But, more often than not, no such prior knowledge is available. When the
2040				programmer does not explicitly specify a value for C, the approach taken in the
2041				current implementation is to try all possible values between 2 and some largest
2042				possible value that makes statistical sense. We then choose that value for C
2043				which yields the best value for the QoC (Quality of Clustering) metric. It is
2044				generally believed that the largest value for C should not exceed C
2045				where C is the number of data samples to be clustered.
2046
2047				What to use for the QoC metric is obviously a critical issue unto itself. In the
2048				current implementation, the value of QoC is the ratio of the average radius of the
2049				clusters and the average distance between the cluster centers.
2050
2051				Every iterative algorithm requires a stopping criterion. The criterion implemented
2052				here is that we stop iterations when there is no re-assignment of the data points
2053				during the assignment step.
2054
2055				Ordinarily, the output produced by a K-Means clusterer will correspond to a local
2056				minimum for the QoC values, as opposed to a global minimum. The current
2057				implementation protects against that when the module constructor is called with the
2058				C option for C by trying different randomly selected initial
2059				cluster centers and then selecting the one that gives the best overall QoC value.
2060
2061				A K-Means clusterer will generally produce good results if the overlap between the
2062				clusters is minimal and if each cluster exhibits variability that is uniform in all
2063				directions. When the data variability is different along the different directions in
2064				the data space, the results you obtain with a K-Means clusterer may be improved by
2065				first normalizing the data appropriately, as can be done in this module when you set
2066				the C option in the constructor. However, as pointed out
2067				elsewhere in this documentation, such normalization may actually decrease the
2068				performance of the clusterer if the overall data variability along any dimension is
2069				more a result of separation between the means than a consequence of intra-cluster
2070				variability.
2071
2072
2073				=head1 METHODS
2074
2075				The module provides the following methods for clustering, for cluster visualization,
2076				for data visualization, for the generation of data for testing a clustering
2077				algorithm, and for determining the cluster membership of a new data sample:
2078
2079				=over 4
2080
2081				=item B
2082
2083				my $clusterer = Algorithm::KMeans->new(datafile => $datafile,
2084				mask => $mask,
2085				K => $K,
2086				cluster_seeding => 'random', # also try 'smart'
2087				use_mahalanobis_metric => 1, # also try '0'
2088				terminal_output => 1,
2089				write_clusters_to_files => 1,
2090				);
2091
2092				A call to C constructs a new instance of the C class. When
2093				C<$K> is a non-zero positive integer, the module will construct exactly that many
2094				clusters. However, when C<$K> is 0, the module will find the best number of clusters
2095				to partition the data into. As explained in the Description, setting
2096				C to C causes PCA (principal components analysis) to be used
2097				for discovering the best choices for the initial cluster centers. If you want purely
2098				random decisions to be made for the initial choices for the cluster centers, set
2099				C to C.
2100
2101				The data file is expected to contain entries in the following format
2102
2103				c20 0 10.7087017086940 9.63528386251712 10.9512155258108 ...
2104				c7 0 12.8025925026787 10.6126270065785 10.5228482095349 ...
2105				b9 0 7.60118206283120 5.05889245193079 5.82841781759102 ...
2106				....
2107				....
2108
2109				where the first column contains the symbolic ID tag for each data record and the rest
2110				of the columns the numerical information. As to which columns are actually used for
2111				clustering is decided by the string value of the mask. For example, if we wanted to
2112				cluster on the basis of the entries in just the 3rd, the 4th, and the 5th columns
2113				above, the mask value would be C where the character C indicates that the
2114				ID tag is in the first column, the character C<0> that the second column is to be
2115				ignored, and the C<1>'s that follow that the 3rd, the 4th, and the 5th columns are to
2116				be used for clustering.
2117
2118				If you wish for the clusterer to search through a C<(Kmin,Kmax)> range of values for
2119				C, the constructor should be called in the following fashion:
2120
2121				my $clusterer = Algorithm::KMeans->new(datafile => $datafile,
2122				mask => $mask,
2123				Kmin => 3,
2124				Kmax => 10,
2125				cluster_seeding => 'smart', # try 'random' also
2126				terminal_output => 1,
2127				);
2128
2129				where obviously you can choose any reasonable values for C and C. If you
2130				choose a value for C that is statistically too large, the module will let you
2131				know. Again, you may choose C for C, the default value being
2132				C.
2133
2134				If you believe that the variability of the data is very different along the different
2135				dimensions of the data space, you may get better clustering by first normalizing the
2136				data coordinates by the standard-deviations along those directions. When you set the
2137				constructor option C as shown below, the module uses the
2138				overall data standard-deviation along a direction for the normalization in that
2139				direction. (As mentioned elsewhere in the documentation, such a normalization could
2140				backfire on you if the data variability along a dimension is more a result of the
2141				separation between the means than a consequence of the intra-cluster variability.):
2142
2143				my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
2144				mask => "N111",
2145				K => 2,
2146				cluster_seeding => 'smart', # try 'random' also
2147				terminal_output => 1,
2148				do_variance_normalization => 1,
2149				);
2150
2151				=back
2152
2153				=head2 Constructor Parameters
2154
2155				=over 8
2156
2157				=item C:
2158
2159				This parameter names the data file that contains the multidimensional data records
2160				you want the module to cluster.
2161
2162				=item C:
2163
2164				This parameter supplies the mask to be applied to the columns of your data file. See
2165				the explanation in Synopsis for what this mask looks like.
2166
2167				=item C:
2168
2169				This parameter supplies the number of clusters you are looking for. If you set this
2170				option to 0, that means that you want the module to search for the best value for
2171				C. (Keep in mind the fact that searching for the best C may take a long time
2172				for large data files.)
2173
2174				=item C:
2175
2176				If you supply an integer value for C, the search for the best C will begin
2177				with that value.
2178
2179				=item C:
2180
2181				If you supply an integer value for C, the search for the best C will end at
2182				that value.
2183
2184				=item C:
2185
2186				This parameter must be set to either C or C. Depending on your data,
2187				you may get superior clustering with the C option. The choice C means
2188				that the clusterer will (1) subject the data to principal components analysis to
2189				determine the maximum variance direction; (2) project the data onto this direction;
2190				(3) find peaks in a smoothed histogram of the projected points; and (4) use the
2191				locations of the highest peaks as seeds for cluster centers. If the C option
2192				produces bizarre results, try C.
2193
2194				=item C:
2195
2196				When set to 1, this option causes Mahalanobis distances to be used for clustering.
2197				The default is 0 for this parameter. By default, the module uses the Euclidean
2198				distances for clustering. In general, Mahalanobis distance based clustering will
2199				fail if you seek too many clusters, your data dimensionality is high, and you do not
2200				have a sufficient number of samples in your data file. A necessary requirement for
2201				the module to be able to compute Mahalanobis distances is that the cluster covariance
2202				matrices be non-singular. (Let's say your data dimensionality is C and the module
2203				is considering a cluster that has only C samples in it where C is less than
2204				C. In this case, the covariance matrix will be singular since its rank will not
2205				exceed C. For the covariance matrix to be non-singular, it must be of full rank,
2206				that is, its rank must be C.)
2207
2208				=item C:
2209
2210				When set, the module will first normalize the data variance along the different
2211				dimensions of the data space before attempting clustering. Depending on your data,
2212				this option may or may not result in better clustering.
2213
2214				=item C:
2215
2216				This boolean parameter, when not supplied in the call to C, defaults to 0.
2217				When set, you will see in your terminal window the different clusters as lists of the
2218				symbolic IDs and their cluster centers. You will also see the QoC (Quality of
2219				Clustering) values for the clusters displayed.
2220
2221				=item C:
2222
2223				This parameter is also boolean. When set to 1, the clusters are written out to files
2224				that are named in the following manner:
2225
2226				cluster0.dat
2227				cluster1.dat
2228				cluster2.dat
2229				...
2230				...
2231
2232				Before the clusters are written to these files, the module destroys all files with
2233				such names in the directory in which you call the module.
2234
2235
2236				=back
2237
2238				=over
2239
2240				=item B
2241
2242				$clusterer->read_data_from_file()
2243
2244				=item B
2245
2246				$clusterer->kmeans();
2247
2248				or
2249
2250				my ($clusters_hash, $cluster_centers_hash) = $clusterer->kmeans();
2251
2252				The first call above works solely by side-effect. The second call also returns the
2253				clusters and the cluster centers. See the C script in the
2254				C directory for how you can in your own code extract the clusters and the
2255				cluster centers from the variables C<$clusters_hash> and C<$cluster_centers_hash>.
2256
2257				=item B
2258
2259				$clusterer->get_K_best();
2260
2261				This call makes sense only if you supply either the C option to the constructor,
2262				or if you specify values for the C and C options. The C and the
2263				C<(Kmin,Kmax)> options cause the module to determine the best value for C.
2264				Remember, C is the number of clusters the data is partitioned into.
2265
2266				=item B
2267
2268				$clusterer->show_QoC_values();
2269
2270				presents a table with C values in the left column and the corresponding QoC
2271				(Quality-of-Clustering) values in the right column. Note that this call makes sense
2272				only if you either supply the C option to the constructor, or if you specify
2273				values for the C and C options.
2274
2275				=item B
2276
2277				$clusterer->visualize_clusters( $visualization_mask )
2278
2279				The visualization mask here does not have to be identical to the one used for
2280				clustering, but must be a subset of that mask. This is convenient for visualizing
2281				the clusters in two- or three-dimensional subspaces of the original space.
2282
2283				=item B
2284
2285				$clusterer->visualize_data($visualization_mask, 'original');
2286
2287				$clusterer->visualize_data($visualization_mask, 'normed');
2288
2289				This method requires a second argument and, as shown, it must be either the string
2290				C or the string C, the former for the visualization of the raw data
2291				and the latter for the visualization of the data after its different dimensions are
2292				normalized by the standard-deviations along those directions. If you call the method
2293				with the second argument set to C, but do so without turning on the
2294				C option in the KMeans constructor, it will let you know.
2295
2296
2297				=item B
2298
2299				If you wish to determine the cluster membership of a new data sample after you have
2300				created the clusters with the existing data samples, you would need to call this
2301				method. The C script in the C directory
2302				shows how to use this method.
2303
2304
2305				=item B
2306
2307				This does the same thing as the previous method, except that it determines the
2308				cluster membership using the Mahalanobis distance metric. As for the previous
2309				method, see the C script in the C directory
2310				for how to use this method.
2311
2312
2313				=item B
2314
2315				Algorithm::KMeans->cluster_data_generator(
2316				input_parameter_file => $parameter_file,
2317				output_datafile => $out_datafile,
2318				number_data_points_per_cluster => 20 );
2319
2320				for generating multivariate data for clustering if you wish to play with synthetic
2321				data for clustering. The input parameter file contains the means and the variances
2322				for the different Gaussians you wish to use for the synthetic data. See the file
2323				C provided in the examples directory. It will be easiest for you to just
2324				edit this file for your data generation needs. In addition to the format of the
2325				parameter file, the main constraint you need to observe in specifying the parameters
2326				is that the dimensionality of the covariance matrix must correspond to the
2327				dimensionality of the mean vectors. The multivariate random numbers are generated by
2328				calling the C module. As you would expect, this module requires that
2329				the covariance matrices you specify in your parameter file be symmetric and positive
2330				definite. Should the covariances in your parameter file not obey this condition, the
2331				C module will let you know.
2332
2333				=back
2334
2335				=head1 HOW THE CLUSTERS ARE OUTPUT
2336
2337				When the option C is set in the call to the constructor, the
2338				clusters are displayed on the terminal screen.
2339
2340				When the option C is set in the call to the constructor, the
2341				module dumps the clusters in files named
2342
2343				cluster0.dat
2344				cluster1.dat
2345				cluster2.dat
2346				...
2347				...
2348
2349				in the directory in which you execute the module. The number of such files will
2350				equal the number of clusters formed. All such existing files in the directory are
2351				destroyed before any fresh ones are created. Each cluster file contains the symbolic
2352				ID tags of the data samples in that cluster.
2353
2354				The module also leaves in your directory a printable `.png' file that is a point plot
2355				of the different clusters. The name of this file is always C.
2356
2357				Also, as mentioned previously, a call to C in your own code will return the
2358				clusters and the cluster centers.
2359
2360				=head1 REQUIRED
2361
2362				This module requires the following three modules:
2363
2364				Math::Random
2365				Graphics::GnuplotIF
2366				Math::GSL
2367
2368				With regard to the third item above, what is actually required is the
2369				C module. However, that module is a part of the C
2370				distribution. The acronym GSL stands for the GNU Scientific Library. C is
2371				a Perl interface to the GSL C-based library.
2372
2373
2374				=head1 THE C DIRECTORY
2375
2376				The C directory contains several scripts to help you become familiar with
2377				this module. The following script is an example of how the module can be expected to
2378				be used most of the time. It calls for clustering to be carried out with a fixed
2379				C:
2380
2381				cluster_and_visualize.pl
2382
2383				The more time you spend with this script, the more comfortable you will become with
2384				the use of this module. The script file contains a large comment block that mentions
2385				six locations in the script where you have to make decisions about how to use the
2386				module.
2387
2388				See the following script if you do not know what value to use for C for clustering
2389				your data:
2390
2391				find_best_K_and_cluster.pl
2392
2393				This script uses the C option in the constructor that causes the module to
2394				search for the best C for your data. Since this search is virtually unbounded ---
2395				limited only by the number of samples in your data file --- the script may take a
2396				long time to run for a large data file. Hence the next script.
2397
2398				If your datafile is too large, you may need to range limit the values of C that
2399				are searched through, as in the following script:
2400
2401				find_best_K_in_range_and_cluster.pl
2402
2403				If you also want to include data normalization (it may reduce the performance of the
2404				clusterer in some cases), see the following script:
2405
2406				cluster_after_data_normalization.pl
2407
2408				When you include the data normalization step and you would like to visualize the data
2409				before and after normalization, see the following script:
2410
2411				cluster_and_visualize_with_data_visualization.pl*
2412
2413				After you are done clustering, let's say you want to find the cluster membership of a
2414				new data sample. To see how you can do that, see the script:
2415
2416				which_cluster_for_new_data.pl
2417
2418				This script returns two answers for which cluster a new data sample belongs to: one
2419				using the Euclidean metric to calculate the distances between the new data sample and
2420				the cluster centers, and the other using the Mahalanobis metric. If the clusters are
2421				strongly elliptical in shape, you are likely to get better results with the
2422				Mahalanobis metric. (To see that you can get two different answers using the two
2423				different distance metrics, run the C script on the
2424				data in the file C. To make this run, note that you have to comment
2425				out and uncomment the lines at four different locations in the script.)
2426
2427				The C directory also contains the following support scripts:
2428
2429				For generating the data for experiments with clustering:
2430
2431				data_generator.pl
2432
2433				For cleaning up the examples directory:
2434
2435				cleanup_directory.pl
2436
2437				The examples directory also includes a parameter file, C, for generating
2438				synthetic data for clustering. Just edit this file if you would like to generate
2439				your own multivariate data for clustering. The parameter file is for the 3D case,
2440				but you can generate data with any dimensionality through appropriate entries in the
2441				parameter file.
2442
2443				=head1 EXPORT
2444
2445				None by design.
2446
2447				=head1 CAVEATS
2448
2449				K-Means based clustering usually does not work well when the clusters are strongly
2450				overlapping and when the extent of variability along the different dimensions is
2451				different for the different clusters. The module does give you the ability to
2452				normalize the variability in your data with the constructor option
2453				C. However, as described elsewhere, this may actually
2454				reduce the performance of the clusterer if the data variability along a direction is
2455				more a result of the separation between the means than because of intra-cluster
2456				variability. For better clustering with difficult-to-cluster data, you could try
2457				using the author's C module.
2458
2459				=head1 BUGS
2460
2461				Please notify the author if you encounter any bugs. When sending email, please place
2462				the string 'KMeans' in the subject line.
2463
2464				=head1 INSTALLATION
2465
2466				Download the archive from CPAN in any directory of your choice. Unpack the archive
2467				with a command that on a Linux machine would look like:
2468
2469				tar zxvf Algorithm-KMeans-2.03.tar.gz
2470
2471				This will create an installation directory for you whose name will be
2472				C. Enter this directory and execute the following commands
2473				for a standard install of the module if you have root privileges:
2474
2475				perl Makefile.PL
2476				make
2477				make test
2478				sudo make install
2479
2480				If you do not have root privileges, you can carry out a non-standard install the
2481				module in any directory of your choice by:
2482
2483				perl Makefile.PL prefix=/some/other/directory/
2484				make
2485				make test
2486				make install
2487
2488				With a non-standard install, you may also have to set your PERL5LIB environment
2489				variable so that this module can find the required other modules. How you do that
2490				would depend on what platform you are working on. In order to install this module in
2491				a Linux machine on which I use tcsh for the shell, I set the PERL5LIB environment
2492				variable by
2493
2494				setenv PERL5LIB /some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
2495
2496				If I used bash, I'd need to declare:
2497
2498				export PERL5LIB=/some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
2499
2500
2501				=head1 THANKS
2502
2503				I thank Slaven for pointing out that I needed to downshift the required version of Perl
2504				for this module. Fortunately, I had access to an old machine still running Perl
2505				5.10.1. The current version, 2.02, is based on my testing the module on that machine.
2506
2507				I added two C methods in Version 2.0 as a result of an email from
2508				Jerome White who expressed a need for such methods in order to determine the best
2509				cluster for a new data record after you have successfully clustered your existing
2510				data. Thanks Jerome for your feedback!
2511
2512				It was an email from Nadeem Bulsara that prompted me to create Version 1.40 of this
2513				module. Working with Version 1.30, Nadeem noticed that occasionally the module would
2514				produce variable clustering results on the same dataset. I believe that this
2515				variability was caused (at least partly) by the purely random mode that was used in
2516				Version 1.30 for the seeding of the cluster centers. Version 1.40 now includes a
2517				C mode. With the new mode the clusterer uses a PCA (Principal Components
2518				Analysis) of the data to make good guesses for the cluster centers. However,
2519				depending on how the data is jumbled up, it is possible that the new mode will not
2520				produce uniformly good results in all cases. So you can still use the old mode by
2521				setting C to C in the constructor. Thanks Nadeem for your
2522				feedback!
2523
2524				Version 1.30 resulted from Martin Kalin reporting problems with a very small data
2525				set. Thanks Martin!
2526
2527				Version 1.21 came about in response to the problems encountered by Luis Fernando
2528				D'Haro with version 1.20. Although the module would yield the clusters for some of
2529				its runs, more frequently than not the module would abort with an "empty cluster"
2530				message for his data. Luis Fernando has also suggested other improvements (such as
2531				clustering directly from the contents of a hash) that I intend to make in future
2532				versions of this module. Thanks Luis Fernando.
2533
2534				Chad Aeschliman was kind enough to test out the interface of this module and to give
2535				suggestions for its improvement. His key slogan: "If you cannot figure out how to
2536				use a module in under 10 minutes, it's not going to be used." That should explain
2537				the longish Synopsis included here.
2538
2539				=head1 AUTHOR
2540
2541				Avinash Kak, kak@purdue.edu
2542
2543				If you send email, please place the string "KMeans" in your subject line to get past
2544				my spam filter.
2545
2546				=head1 COPYRIGHT
2547
2548				This library is free software; you can redistribute it and/or modify it under the
2549				same terms as Perl itself.
2550
2551				Copyright 2014 Avinash Kak
2552
2553				=cut
2554