File Coverage

blib/lib/Algorithm/KMeans.pm
Criterion Covered Total %
statement 22 24 91.6
branch n/a
condition n/a
subroutine 8 8 100.0
pod n/a
total 30 32 93.7


line stmt bran cond sub pod time code
1             package Algorithm::KMeans;
2              
3             #------------------------------------------------------------------------------------
4             # Copyright (c) 2014 Avinash Kak. All rights reserved. This program is free
5             # software. You may modify and/or distribute it under the same terms as Perl itself.
6             # This copyright notice must remain attached to the file.
7             #
8             # Algorithm::KMeans is a Perl module for clustering multidimensional data.
9             # -----------------------------------------------------------------------------------
10              
11 1     1   12262 use 5.10.0;
  1         4  
  1         37  
12 1     1   4 use strict;
  1         1  
  1         21  
13 1     1   3 use warnings;
  1         5  
  1         34  
14 1     1   3 use Carp;
  1         1  
  1         71  
15 1     1   4 use File::Basename;
  1         1  
  1         66  
16 1     1   569 use Math::Random;
  1         4655  
  1         76  
17 1     1   513 use Graphics::GnuplotIF;
  1         9001  
  1         52  
18 1     1   986 use Math::GSL::Matrix;
  0            
  0            
19              
20              
21             our $VERSION = '2.04';
22              
23             # from Perl docs:
24             my $_num_regex = '^[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?$';
25              
26             # Constructor:
27             sub new {
28             my ($class, %args) = @_;
29             my @params = keys %args;
30             croak "\nYou have used a wrong name for a keyword argument " .
31             "--- perhaps a misspelling\n"
32             if check_for_illegal_params(@params) == 0;
33             bless {
34             _datafile => $args{datafile} || croak("datafile required"),
35             _mask => $args{mask} || croak("mask required"),
36             _K => $args{K} || 0,
37             _K_min => $args{Kmin} || 'unknown',
38             _K_max => $args{Kmax} || 'unknown',
39             _cluster_seeding => $args{cluster_seeding} || croak("must choose smart or random ".
40             "for cluster seeding"),
41             _var_normalize => $args{do_variance_normalization} || 0,
42             _use_mahalanobis_metric => $args{use_mahalanobis_metric} || 0,
43             _clusters_2_files => $args{write_clusters_to_files} || 0,
44             _terminal_output => $args{terminal_output} || 0,
45             _debug => $args{debug} || 0,
46             _N => 0,
47             _K_best => 'unknown',
48             _original_data => {},
49             _data => {},
50             _data_id_tags => [],
51             _QoC_values => {},
52             _clusters => [],
53             _cluster_centers => [],
54             _clusters_hash => {},
55             _cluster_centers_hash => {},
56             _cluster_covariances_hash => {},
57             _data_dimensions => 0,
58              
59             }, $class;
60             }
61              
62             sub read_data_from_file {
63             my $self = shift;
64             my $filename = $self->{_datafile};
65             $self->read_data_from_file_csv() if $filename =~ /.csv$/;
66             $self->read_data_from_file_dat() if $filename =~ /.dat$/;
67             }
68              
69             sub read_data_from_file_csv {
70             my $self = shift;
71             my $numregex = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
72             my $filename = $self->{_datafile} || die "you did not specify a file with the data to be clustered";
73             my $mask = $self->{_mask};
74             my @mask = split //, $mask;
75             $self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
76             print "data dimensionality: $self->{_data_dimensions} \n"if $self->{_terminal_output};
77             open FILEIN, $filename or die "Unable to open $filename: $!";
78             die("Aborted. get_training_data_csv() is only for CSV files") unless $filename =~ /\.csv$/;
79             local $/ = undef;
80             my @all_data = split /\s+/, <FILEIN>;
81             my %data_hash = ();
82             my @data_tags = ();
83             foreach my $record (@all_data) {
84             my @splits = split /,/, $record;
85             my $record_name = shift @splits;
86             $data_hash{$record_name} = \@splits;
87             push @data_tags, $record_name;
88             }
89             $self->{_original_data} = \%data_hash;
90             $self->{_data_id_tags} = \@data_tags;
91             $self->{_N} = scalar @data_tags;
92             if ($self->{_var_normalize}) {
93             $self->{_data} = variance_normalization( $self->{_original_data} );
94             } else {
95             $self->{_data} = deep_copy_hash( $self->{_original_data} );
96             }
97             # Need to make the following call to set the global mean and covariance:
98             # my $covariance = $self->estimate_mean_and_covariance(\@data_tags);
99             # Need to make the following call to set the global eigenvec eigenval sets:
100             # $self->eigen_analysis_of_covariance($covariance);
101             if ( defined($self->{_K}) && ($self->{_K} > 0) ) {
102             carp "\n\nWARNING: YOUR K VALUE IS TOO LARGE.\n The number of data " .
103             "points must satisfy the relation N > 2xK**2 where K is " .
104             "the number of clusters requested for the clusters to be " .
105             "meaningful $!"
106             if ( $self->{_N} < (2 * $self->{_K} ** 2) );
107             print "\n\n\n";
108             }
109             }
110              
111             sub read_data_from_file_dat {
112             my $self = shift;
113             my $datafile = $self->{_datafile};
114             my $mask = $self->{_mask};
115             my @mask = split //, $mask;
116             $self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
117             print "data dimensionality: $self->{_data_dimensions} \n"if $self->{_terminal_output};
118             open INPUT, $datafile or die "unable to open file $datafile: $!\n";
119             chomp( my @raw_data = <INPUT> );
120             close INPUT;
121             # Transform strings into number data
122             foreach my $record (@raw_data) {
123             next unless $record;
124             next if $record =~ /^#/;
125             my @data_fields;
126             my @fields = split /\s+/, $record;
127             die "\nABORTED: Mask size does not correspond to row record size\n" if $#fields != $#mask;
128             my $record_id;
129             foreach my $i (0..@fields-1) {
130             if ($mask[$i] eq '0') {
131             next;
132             } elsif ($mask[$i] eq 'N') {
133             $record_id = $fields[$i];
134             } elsif ($mask[$i] eq '1') {
135             push @data_fields, $fields[$i];
136             } else {
137             die "misformed mask for reading the data file\n";
138             }
139             }
140             my @nums = map {/$_num_regex/;$_} @data_fields;
141             $self->{_original_data}->{ $record_id } = \@nums;
142             }
143             if ($self->{_var_normalize}) {
144             $self->{_data} = variance_normalization( $self->{_original_data} );
145             } else {
146             $self->{_data} = deep_copy_hash( $self->{_original_data} );
147             }
148             my @all_data_ids = keys %{$self->{_data}};
149             $self->{_data_id_tags} = \@all_data_ids;
150             $self->{_N} = scalar @all_data_ids;
151             if ( defined($self->{_K}) && ($self->{_K} > 0) ) {
152             carp "\n\nWARNING: YOUR K VALUE IS TOO LARGE.\n The number of data " .
153             "points must satisfy the relation N > 2xK**2 where K is " .
154             "the number of clusters requested for the clusters to be " .
155             "meaningful $!"
156             if ( $self->{_N} < (2 * $self->{_K} ** 2) );
157             print "\n\n\n";
158             }
159             }
160              
161             sub kmeans {
162             my $self = shift;
163             my $K = $self->{_K};
164             if ( ($K == 0) ||
165             ( ($self->{_K_min} ne 'unknown') &&
166             ($self->{_K_max} ne 'unknown') ) ) {
167             eval {
168             $self->iterate_through_K();
169             };
170             die "in kmeans(): $@" if ($@);
171             } elsif ( $K =~ /\d+/) {
172             eval {
173             $self->cluster_for_fixed_K_multiple_random_tries($K)
174             if $self->{_cluster_seeding} eq 'random';
175             $self->cluster_for_fixed_K_single_smart_try($K)
176             if $self->{_cluster_seeding} eq 'smart';
177             };
178             die "in kmeans(): $@" if ($@);
179             } else {
180             croak "Incorrect call syntax used. See documentation.\n";
181             }
182             if ((defined $self->{_clusters}) && (defined $self->{_cluster_centers})){
183             foreach my $i (0..@{$self->{_clusters}}-1) {
184             $self->{_clusters_hash}->{"cluster$i"} = $self->{_clusters}->[$i];
185             $self->{_cluster_centers_hash}->{"cluster$i"} = $self->{_cluster_centers}->[$i];
186             $self->{_cluster_covariances_hash}->{"cluster$i"} =
187             $self->estimate_cluster_covariance($self->{_clusters}->[$i]);
188             }
189             $self->write_clusters_to_files( $self->{_clusters} ) if $self->{_clusters_2_files};
190             return ($self->{_clusters_hash}, $self->{_cluster_centers_hash});
191             } else {
192             croak "Clustering failed. Try again.";
193             }
194             }
195              
196             # This is the subroutine to call if you prefer purely random choices for the initial
197             # seeding of the cluster centers.
198             sub cluster_for_fixed_K_multiple_random_tries {
199             my $self = shift;
200             my $K = shift;
201             my @all_data_ids = @{$self->{_data_id_tags}};
202             my $QoC;
203             my @QoC_values;
204             my @array_of_clusters;
205             my @array_of_cluster_centers;
206             my $clusters;
207             my $new_clusters;
208             my $cluster_centers;
209             my $new_cluster_centers;
210             print "Clustering for K = $K\n" if $self->{_terminal_output};
211             foreach my $trial (1..20) {
212             print ". ";
213             my ($new_clusters, $new_cluster_centers);
214             eval {
215             ($new_clusters, $new_cluster_centers) = $self->cluster_for_given_K($K);
216             };
217             next if $@;
218             next if @$new_clusters <= 1;
219             my $newQoC = $self->cluster_quality( $new_clusters, $new_cluster_centers );
220             if ( (!defined $QoC) || ($newQoC < $QoC) ) {
221             $QoC = $newQoC;
222             $clusters = deep_copy_AoA( $new_clusters );
223             $cluster_centers = deep_copy_AoA( $new_cluster_centers );
224             }
225             }
226             die "\n\nThe constructor options you have chosen do not work with the data. Try\n" .
227             "turning off the Mahalanobis option if you are using it.\n"
228             unless defined $clusters;
229             $self->{_clusters} = $clusters;
230             $self->{_cluster_centers} = $cluster_centers;
231             $self->{_QoC_values}->{"$K"} = $QoC;
232             if ($self->{_terminal_output}) {
233             print "\nDisplaying final clusters for best K (= $K) :\n";
234             display_clusters( $clusters );
235             $self->display_cluster_centers( $clusters );
236             print "\nQoC value (the smaller, the better): $QoC\n";
237             }
238             }
239              
240             # For the smart try, we do initial cluster seeding by subjecting the data to
241             # principal components analysis in order to discover the direction of maximum
242             # variance in the data space. Subsequently, we try to find the K largest peaks along
243             # this direction. The coordinates of these peaks serve as the seeds for the K
244             # clusters.
245             sub cluster_for_fixed_K_single_smart_try {
246             my $self = shift;
247             my $K = shift;
248             my @all_data_ids = @{$self->{_data_id_tags}};
249             print "Clustering for K = $K\n" if $self->{_terminal_output};
250             my ($clusters, $cluster_centers);
251             eval {
252             ($clusters, $cluster_centers) = $self->cluster_for_given_K($K);
253             };
254             if ($@) {
255             die "In cluster_for_fixed_K_single_smart_try: insufficient data for clustering " .
256             "with $self->{_K} clusters --- $@";
257             }
258             my $QoC = $self->cluster_quality( $clusters, $cluster_centers );
259             $self->{_clusters} = $clusters;
260             $self->{_cluster_centers} = $cluster_centers;
261             $self->{_QoC_values}->{"$K"} = $QoC;
262             if ($self->{_terminal_output}) {
263             print "\nDisplaying final clusters for best K (= $K) :\n";
264             display_clusters( $clusters );
265             $self->display_cluster_centers( $clusters );
266             print "\nQoC value (the smaller, the better): $QoC\n";
267             }
268             }
269              
270             # The following subroutine is the top-level routine to call if you want the system to
271             # figure out on its own what value to use for K, the number of clusters. If you call
272             # the KMeans constructor with the K=0 option, the method below will try every
273             # possible value of K between 2 and the maximum possible depending on the number of
274             # data points available. For example, if the number of data points is 10,000, it will
275             # try all possible values of K between 2 and 70. For how the maximum value is set for
276             # K, see the comments made under Description. Note also how this method makes 20
277             # different tries for each value of K as a defense against the problem of the final
278             # result corresponding to some local minimum in the values of the QoC metric. Out of
279             # these 20 tries for each K, it retains the clusters and the cluster centers for only
280             # that try that yields the smallest value for the QoC metric. After estimating the
281             # "best" QoC values for all possible K in this manner, it then finds the K for which
282             # the QoC is the minimum. This is taken to be the best value for K. Finally, the
283             # output clusters are written out to separate files.
284             #
285             # If the KMeans constructor is invoked with the (Kmin, Kmax) options, then, instead
286             # of iterating through 2 and the maximum permissible value for K, the iterations are
287             # carried out only between Kmin and Kmax.
288             sub iterate_through_K {
289             my $self = shift;
290             my @all_data_ids = @{$self->{_data_id_tags}};
291             my $N = $self->{_N};
292             croak "You need more than 8 data samples. The number of data points must satisfy " .
293             "the relation N > 2xK**2 where K is the number of clusters. The smallest " .
294             "value for K is 2.\n" if $N <= 8;
295             my $K_statistical_max = int( sqrt( $N / 2.0 ) );
296             my $Kmin = $self->{_K_min} eq 'unknown'
297             ? 2
298             : $self->{_K_min};
299             my $Kmax = $self->{_K_max} eq 'unknown'
300             ? int( sqrt( $N / 2.0 ) )
301             : $self->{_K_max};
302             croak "\n\nYour Kmax value is too high. Ideally, it should not exceed sqrt(N/2) " .
303             "where N is the number of data points" if $Kmax > $K_statistical_max;
304             print "Value of Kmax is: $Kmax\n" if $self->{_terminal_output};
305             my @QoC_values;
306             my @array_of_clusters;
307             my @array_of_cluster_centers;
308             foreach my $K ($Kmin..$Kmax) {
309             my $QoC;
310             my $clusters;
311             my $cluster_centers;
312             print "Clustering for K = $K\n" if $self->{_terminal_output};
313             if ($self->{_cluster_seeding} eq 'random') {
314             foreach my $trial (1..21) {
315             print ". ";
316             my ($new_clusters, $new_cluster_centers);
317             if ($self->{_use_mahalanobis_metric}) {
318             eval {
319             ($new_clusters, $new_cluster_centers) = $self->cluster_for_given_K($K);
320             };
321             next if $@;
322             } else {
323             ($new_clusters, $new_cluster_centers) = $self->cluster_for_given_K($K);
324             }
325             my $newQoC = $self->cluster_quality( $new_clusters, $new_cluster_centers );
326             if ( (!defined $QoC) || ($newQoC < $QoC) ) {
327             $QoC = $newQoC;
328             $clusters = deep_copy_AoA( $new_clusters );
329             $cluster_centers = deep_copy_AoA( $new_cluster_centers );
330             }
331             }
332             print "\n";
333             } elsif ($self->{_cluster_seeding} eq 'smart') {
334             eval {
335             ($clusters, $cluster_centers) = $self->cluster_for_given_K($K);
336             };
337             if ($@) {
338             $Kmax = $K - 1;
339             last;
340             }
341             $QoC = $self->cluster_quality($clusters,$cluster_centers);
342             } else {
343             die "You must either choose 'smart' for cluster_seeding or 'random'. " .
344             "Fix your constructor call."
345             }
346             push @QoC_values, $QoC;
347             push @array_of_clusters, $clusters;
348             push @array_of_cluster_centers, $cluster_centers;
349             }
350             my ($min, $max) = minmax( \@QoC_values );
351             my $K_best_relative_to_Kmin = get_index_at_value($min, \@QoC_values );
352             my $K_best = $K_best_relative_to_Kmin + $Kmin;
353             if ($self->{_terminal_output}) {
354             print "\nDisplaying final clusters for best K (= $K_best) :\n";
355             display_clusters( $array_of_clusters[$K_best_relative_to_Kmin] );
356             $self->display_cluster_centers($array_of_clusters[$K_best_relative_to_Kmin]);
357             print "\nBest clustering achieved for K=$K_best with QoC = $min\n" if defined $min;
358             my @printableQoC = grep {$_} @QoC_values;
359             print "\nQoC values array (the smaller the value, the better it is) for different " .
360             "K starting with K=$Kmin: @printableQoC\n";
361             }
362             $self->{_K_best} = $K_best;
363             foreach my $i (0..@QoC_values-1) {
364             my $k = $i + $Kmin;
365             $self->{_QoC_values}->{"$k"} = $QoC_values[$i];
366             }
367             $self->{_clusters} = $array_of_clusters[$K_best_relative_to_Kmin];
368             $self->{_cluster_centers} =
369             $array_of_cluster_centers[$K_best_relative_to_Kmin];
370             }
371              
372             # This is the function to call if you already know what value you want to use for K,
373             # the number of expected clusters. The purpose of this function is to do the
374             # initialization of the cluster centers and to carry out the initial assignment of
375             # the data to the clusters with the initial cluster centers. The initialization
376             # consists of 3 steps: Construct a random sequence of K integers between 0 and N-1
377             # where N is the number of data points to be clustered; 2) Call
378             # get_initial_cluster_centers() to index into the data array with the random integers
379             # to get a list of K data points that would serve as the initial cluster centers; and
380             # (3) Call assign_data_to_clusters_initial() to assign the rest of the data to each
381             # of the K clusters on the basis of the proximity to the cluster centers.
382             sub cluster_for_given_K {
383             my $self = shift;
384             my $K = shift;
385             my @all_data_ids = @{$self->{_data_id_tags}};
386             my $cluster_centers;
387             if ($self->{_cluster_seeding} eq 'smart') {
388             $cluster_centers = $self->get_initial_cluster_centers_smart($K);
389             } elsif ($self->{_cluster_seeding} eq 'random') {
390             $cluster_centers = $self->get_initial_cluster_centers($K);
391             } else {
392             die "You must either choose 'smart' for cluster_seeding or 'random'. " .
393             "Fix your constructor call."
394             }
395             my $clusters;
396             if ($self->{_use_mahalanobis_metric}) {
397             my $clusters_and_determinants =
398             $self->assign_data_to_clusters_initial_mahalanobis($cluster_centers);
399             $clusters = $clusters_and_determinants->[0];
400             my @determinants = @{$clusters_and_determinants->[1]};
401             my ($min,$max) = minmax(\@determinants);
402             die "In cluster_for_given_K(): min determinant of covariance matrix for at " .
403             "least one cluster is too small" if $min / $max < 0.001;
404             } else {
405             $clusters = $self->assign_data_to_clusters_initial($cluster_centers);
406             }
407             my $cluster_nonexistent_flag = 0;
408             foreach my $trial (0..2) {
409             if ($self->{_use_mahalanobis_metric}) {
410             ($clusters, $cluster_centers) = $self->assign_data_to_clusters_mahalanobis($clusters, $K);
411             } else {
412             ($clusters, $cluster_centers) = $self->assign_data_to_clusters( $clusters, $K );
413             }
414             my $num_of_clusters_returned = @$clusters;
415             foreach my $cluster (@$clusters) {
416             $cluster_nonexistent_flag = 1 if ((!defined $cluster) || (@$cluster == 0));
417             }
418             last unless $cluster_nonexistent_flag;
419             }
420             return ($clusters, $cluster_centers);
421             }
422              
423             # This function is used when you set the "cluster_seeding" option to 'random' in the
424             # constructor. Returns a set of K random integers. These serve as indices to reach
425             # into the data array. A data element whose index is one of the random numbers
426             # returned by this routine serves as an initial cluster center. Note the quality
427             # check it runs on the list of K random integers constructed. We first make sure
428             # that all K random integers are different. Subsequently, we carry out a quality
429             # assessment of the K random integers constructed. This quality measure consists of
430             # the ratio of the values spanned by the random integers to the value of N, the total
431             # number of data points to be clustered. Currently, if this ratio is less than 0.3,
432             # we discard the K integers and try again.
433             sub initialize_cluster_centers {
434             my $self = shift;
435             my $K = shift;
436             my $data_store_size = $self->{_N};
437             my @cluster_center_indices;
438             while (1) {
439             foreach my $i (0..$K-1) {
440             $cluster_center_indices[$i] = int rand( $data_store_size );
441             next if $i == 0;
442             foreach my $j (0..$i-1) {
443             while ( $cluster_center_indices[$j] ==
444             $cluster_center_indices[$i] ) {
445             my $old = $cluster_center_indices[$i];
446             $cluster_center_indices[$i] = int rand($data_store_size);
447             }
448             }
449             }
450             my ($min,$max) = minmax(\@cluster_center_indices );
451             my $quality = ($max - $min) / $data_store_size;
452             last if $quality > 0.3;
453             }
454             return @cluster_center_indices;
455             }
456              
457             # This function is used when you set the "cluster_seeding" option to 'random' in the
458             # constructor. This routine merely reaches into the data array with the random
459             # integers, as constructed by the previous routine, serving as indices and fetching
460             # values corresponding to those indices. The fetched data samples serve as the
461             # initial cluster centers.
462             sub get_initial_cluster_centers {
463             my $self = shift;
464             my $K = shift;
465             my @cluster_center_indices = $self->initialize_cluster_centers($K);
466             my @result;
467             foreach my $i (@cluster_center_indices) {
468             my $tag = $self->{_data_id_tags}[$i];
469             push @result, $self->{_data}->{$tag};
470             }
471             return \@result;
472             }
473              
474             # This method is invoked when you choose the 'smart' option for the "cluster_seeding"
475             # option in the constructor. It subjects the data to a principal components analysis
476             # to figure out the direction of maximal variance. Subsequently, it tries to locate
477             # K peaks in a smoothed histogram of the data points projected onto the maximal
478             # variance direction.
479             sub get_initial_cluster_centers_smart {
480             my $self = shift;
481             my $K = shift;
482             if ($self->{_data_dimensions} == 1) {
483             my @one_d_data;
484             foreach my $j (0..$self->{_N}-1) {
485             my $tag = $self->{_data_id_tags}[$j];
486             push @one_d_data, $self->{_data}->{$tag}->[0];
487             }
488             my @peak_points = find_peak_points_in_given_direction(\@one_d_data,$K);
489             print "highest points at data values: @peak_points\n" if $self->{_debug};
490             my @cluster_centers;
491             foreach my $peakpoint (@peak_points) {
492             push @cluster_centers, [$peakpoint];
493             }
494             return \@cluster_centers;
495             }
496             my ($num_rows,$num_cols) = ($self->{_data_dimensions},$self->{_N});
497             my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
498             my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
499             # All the record labels are stored in the array $self->{_data_id_tags}. The
500             # actual data for clustering is stored in a hash at $self->{_data} whose keys are
501             # the record labels; the value associated with each key is the array holding the
502             # corresponding numerical multidimensional data.
503             foreach my $j (0..$num_cols-1) {
504             my $tag = $self->{_data_id_tags}[$j];
505             my $data = $self->{_data}->{$tag};
506             $matrix->set_col($j, $data);
507             }
508             if ($self->{_debug}) {
509             print "\nDisplaying the original data as a matrix:";
510             display_matrix( $matrix );
511             }
512             foreach my $j (0..$num_cols-1) {
513             $mean_vec += $matrix->col($j);
514             }
515             $mean_vec *= 1.0 / $num_cols;
516             if ($self->{_debug}) {
517             print "Displaying the mean vector for the data:";
518             display_matrix( $mean_vec );
519             }
520             foreach my $j (0..$num_cols-1) {
521             my @new_col = ($matrix->col($j) - $mean_vec)->as_list;
522             $matrix->set_col($j, \@new_col);
523             }
524             if ($self->{_debug}) {
525             print "Displaying mean subtracted data as a matrix:";
526             display_matrix( $matrix );
527             }
528             my $transposed = transpose( $matrix );
529             if ($self->{_debug}) {
530             print "Displaying transposed data matrix:";
531             display_matrix( $transposed );
532             }
533             my $covariance = matrix_multiply( $matrix, $transposed );
534             $covariance *= 1.0 / $num_cols;
535             if ($self->{_debug}) {
536             print "\nDisplaying the Covariance Matrix for your data:";
537             display_matrix( $covariance );
538             }
539             my ($eigenvalues, $eigenvectors) = $covariance->eigenpair;
540             my $num_of_eigens = @$eigenvalues;
541             my $largest_eigen_index = 0;
542             my $smallest_eigen_index = 0;
543             print "Eigenvalue 0: $eigenvalues->[0]\n" if $self->{_debug};
544             foreach my $i (1..$num_of_eigens-1) {
545             $largest_eigen_index = $i if $eigenvalues->[$i] > $eigenvalues->[$largest_eigen_index];
546             $smallest_eigen_index = $i if $eigenvalues->[$i] < $eigenvalues->[$smallest_eigen_index];
547             print "Eigenvalue $i: $eigenvalues->[$i]\n" if $self->{_debug};
548             }
549             print "\nlargest eigen index: $largest_eigen_index\n" if $self->{_debug};
550             print "\nsmallest eigen index: $smallest_eigen_index\n\n" if $self->{_debug};
551             foreach my $i (0..$num_of_eigens-1) {
552             my @vec = $eigenvectors->[$i]->as_list;
553             print "Eigenvector $i: @vec\n" if $self->{_debug};
554             }
555             my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list;
556             print "\nLargest eigenvector: @largest_eigen_vec\n" if $self->{_debug};
557             my @max_var_direction;
558             # Each element of the array @largest_eigen_vec is a Math::Complex object
559             foreach my $k (0..@largest_eigen_vec-1) {
560             my ($mag, $theta) = $largest_eigen_vec[$k] =~ /\[(\d*\.\d+),(\S+)\]/;
561             if ($theta eq '0') {
562             $max_var_direction[$k] = $mag;
563             } elsif ($theta eq 'pi') {
564             $max_var_direction[$k] = -1.0 * $mag;
565             } else {
566             die "eigendecomposition of covariance matrix produced a complex " .
567             "eigenvector --- something is wrong";
568             }
569             }
570             print "\nMaximum Variance Direction: @max_var_direction\n\n" if $self->{_debug};
571             # We now project all data points on the largest eigenvector.
572             # Each projection will yield a single point on the eigenvector.
573             my @projections;
574             foreach my $j (0..$self->{_N}-1) {
575             my $tag = $self->{_data_id_tags}[$j];
576             my $data = $self->{_data}->{$tag};
577             die "Dimensionality of the largest eigenvector does not "
578             . "match the dimensionality of the data"
579             unless @max_var_direction == $self->{_data_dimensions};
580             my $projection = vector_multiply($data, \@max_var_direction);
581             push @projections, $projection;
582             }
583             print "All projection points: @projections\n" if $self->{_debug};
584             my @peak_points = find_peak_points_in_given_direction(\@projections, $K);
585             print "highest points at points along largest eigenvec: @peak_points\n" if $self->{_debug};
586             my @cluster_centers;
587             foreach my $peakpoint (@peak_points) {
588             my @actual_peak_coords = map {$peakpoint * $_} @max_var_direction;
589             push @cluster_centers, \@actual_peak_coords;
590             }
591             return \@cluster_centers;
592             }
593              
# This method is invoked when you choose the 'smart' option for the "cluster_seeding"
# option in the constructor. It is called by the previous method to locate K peaks
# in a smoothed histogram of the data points projected onto the maximal variance
# direction.
#
# Args:    $dataref  -- ref to the array of 1-D projection values
#          $how_many -- how many peaks (K) to return
# Returns: a list of up to $how_many peak locations expressed in the original
#          (unscaled) coordinates along the projection axis.
sub find_peak_points_in_given_direction {
    my $dataref = shift;
    my $how_many = shift;
    my @data = @$dataref;
    my ($min, $max) = minmax(\@data);
    my @sorted_data = sort {$a <=> $b} @data;
    my $scale = $max - $min;
    # Guard against the degenerate case in which every projection coincides
    # (the original divided by zero here):
    $scale = 1 if $scale == 0;
    # Normalize ALL the points into the unit interval [0,1].  BUG FIX: the
    # original loop ran only to $#sorted_data-1, which left the largest value
    # unscaled and therefore binned into a nonexistent histogram cell.
    foreach my $index (0..$#sorted_data) {
        $sorted_data[$index] = ($sorted_data[$index] - $min) / $scale;
    }
    # Build a 1000-cell histogram; each data point votes for its own cell and
    # for $smoothness cells on either side, which smooths the histogram:
    my $delta = 1.0 / 1000.0;
    my @accumulator = (0) x 1000;
    foreach my $index (0..$#sorted_data) {
        my $cell_index = int($sorted_data[$index] / $delta);
        my $smoothness = 40;
        for my $cell ($cell_index-$smoothness..$cell_index+$smoothness) {
            next if $cell < 0 || $cell > 999;
            $accumulator[$cell]++;
        }
    }
    my $peaks_array = non_maximum_suppression( \@accumulator );
    my $peaks_index_hash = get_value_index_hash( $peaks_array );
    # Walk the surviving peaks from highest to lowest and keep the first
    # $how_many, mapping each histogram cell back to data coordinates:
    my @K_highest_peak_locations;
    my $k = 0;
    foreach my $peak (sort {$b <=> $a} keys %$peaks_index_hash) {
        my $unscaled_peak_point = $min + $peaks_index_hash->{$peak} * $scale * $delta;
        push @K_highest_peak_locations, $unscaled_peak_point
            if $k < $how_many;
        last if ++$k == $how_many;
    }
    return @K_highest_peak_locations;
}
636              
# Forms the initial clusters by assigning every data sample to whichever of
# the initial cluster centers (produced by the seeding routine) lies closest
# to it in the Euclidean sense.  Returns a ref to an array of clusters, each
# cluster being an array of symbolic record tags.
sub assign_data_to_clusters_initial {
    my ($self, $centers_ref) = @_;
    my @centers = @$centers_ref;
    my @clusters;
    foreach my $tag (@{$self->{_data_id_tags}}) {
        my @distances = map { $self->distance($tag, $_) } @centers;
        my ($closest_dist, $closest_index) = minimum( \@distances );
        push @{$clusters[$closest_index]}, $tag;
    }
    return \@clusters;
}
655              
# Same as assign_data_to_clusters_initial() except that, because the
# Mahalanobis metric needs a covariance matrix for every cluster, this
# version also estimates each initial cluster's covariance and returns the
# determinants of those matrices alongside the clusters:
# [\@clusters, \@determinants].
sub assign_data_to_clusters_initial_mahalanobis {
    my ($self, $centers_ref) = @_;
    my @centers = @$centers_ref;
    my @clusters;
    foreach my $tag (@{$self->{_data_id_tags}}) {
        my @distances = map { $self->distance($tag, $_) } @centers;
        my ($closest_dist, $closest_index) = minimum( \@distances );
        push @{$clusters[$closest_index]}, $tag if defined $closest_index;
    }
    # A cluster center need not coincide with any data sample, so one of the
    # slots in @clusters may have received no members at all.  Say there are
    # five centers: $closest_index may have populated slots 0, 1, 2 and 4,
    # leaving $clusters[3] undefined.  Verify every slot before proceeding.
    my @determinants;
    foreach my $cluster (@clusters) {
        die "The clustering program started with bad initialization. Please start over"
            unless defined $cluster;
        my $covariance = $self->estimate_cluster_covariance($cluster);
        push @determinants, $covariance->det();
    }
    return [\@clusters, \@determinants];
}
686              
# This is the main routine that along with the update_cluster_centers() routine
# constitute the two key steps of the K-Means algorithm. In most cases, the infinite
# while() loop will terminate automatically when the cluster assignments of the data
# points remain unchanged. For the sake of safety, we keep track of the number of
# iterations. If this number reaches 100, we exit the while() loop anyway. In most
# cases, this limit will not be reached.
#
# Args:    $clusters -- ref to array of arrays of symbolic record tags
#          $K        -- the desired number of clusters
# Returns: ($clusters, $final_cluster_centers) after convergence.
sub assign_data_to_clusters {
    my $self = shift;
    my $clusters = shift;
    my $K = shift;
    my $final_cluster_centers;
    my $iteration_index = 0;
    while (1) {
        my $new_clusters;
        my $assignment_changed_flag = 0;
        my $current_cluster_center_index = 0;
        my $cluster_size_zero_condition = 0;
        my $how_many = @$clusters;
        # Recompute the centers from the current assignment; the deep copy
        # keeps update_cluster_centers() from mutating $clusters itself:
        my $cluster_centers = $self->update_cluster_centers( deep_copy_AoA_with_nulls( $clusters ) );
        $iteration_index++;
        foreach my $cluster (@$clusters) {
            my $current_cluster_center = $cluster_centers->[$current_cluster_center_index];
            foreach my $ele (@$cluster) {
                # Find the center closest to this element:
                my @dist_from_clust_centers;
                foreach my $center (@$cluster_centers) {
                    push @dist_from_clust_centers,
                        $self->distance($ele, $center);
                }
                my ($min, $best_center_index) = minimum( \@dist_from_clust_centers );
                my $best_cluster_center = $cluster_centers->[$best_center_index];
                if (vector_equal($current_cluster_center, $best_cluster_center)){
                    # The element stays in its current cluster:
                    push @{$new_clusters->[$current_cluster_center_index]}, $ele;
                } else {
                    # The element migrates to a different cluster:
                    $assignment_changed_flag = 1;
                    push @{$new_clusters->[$best_center_index]}, $ele;
                }
            }
            $current_cluster_center_index++;
        }
        # If reassignment dropped a cluster entirely, try another iteration:
        next if ((@$new_clusters != @$clusters) && ($iteration_index < 100));
        # Now make sure that none of the K clusters is an empty cluster:
        foreach my $newcluster (@$new_clusters) {
            $cluster_size_zero_condition = 1 if ((!defined $newcluster) or (@$newcluster == 0));
        }
        # Pad with undefs so that @$new_clusters has exactly $K slots:
        push @$new_clusters, (undef) x ($K - @$new_clusters) if @$new_clusters < $K;
        # During clustering for a fixed K, should a cluster inadvertently
        # become empty, steal a member from the largest cluster to hopefully
        # spawn a new cluster:
        my $largest_cluster;
        foreach my $local_cluster (@$new_clusters) {
            next if !defined $local_cluster;
            $largest_cluster = $local_cluster if !defined $largest_cluster;
            if (@$local_cluster > @$largest_cluster) {
                $largest_cluster = $local_cluster;
            }
        }
        foreach my $local_cluster (@$new_clusters) {
            if ( (!defined $local_cluster) || (@$local_cluster == 0) ) {
                push @$local_cluster, pop @$largest_cluster;
            }
        }
        next if (($cluster_size_zero_condition) && ($iteration_index < 100));
        last if $iteration_index == 100;
        # Accept the new assignment and check for convergence:
        $clusters = deep_copy_AoA( $new_clusters );
        last if $assignment_changed_flag == 0;
    }
    $final_cluster_centers = $self->update_cluster_centers( $clusters );
    return ($clusters, $final_cluster_centers);
}
756              
# Mahalanobis-metric counterpart of assign_data_to_clusters().  On every pass
# the cluster centers AND covariances are re-estimated, and each data element
# is reassigned to the cluster whose center is nearest in the Mahalanobis
# sense.  The loop terminates when the assignments stop changing, or after
# 100 iterations, whichever comes first.
sub assign_data_to_clusters_mahalanobis {
    my $self = shift;
    my $clusters = shift;
    my $K = shift;
    my $final_cluster_centers;
    my $iteration_index = 0;
    while (1) {
        my $new_clusters;
        my $assignment_changed_flag = 0;
        my $current_cluster_center_index = 0;
        my $cluster_size_zero_condition = 0;
        my $how_many = @$clusters;
        my $cluster_centers_and_covariances =
            $self->update_cluster_centers_and_covariances_mahalanobis(deep_copy_AoA_with_nulls($clusters));
        my $cluster_centers = $cluster_centers_and_covariances->[0];
        my $cluster_covariances = $cluster_centers_and_covariances->[1];
        $iteration_index++;
        foreach my $cluster (@$clusters) {
            my $current_cluster_center = $cluster_centers->[$current_cluster_center_index];
            my $current_cluster_covariance = $cluster_covariances->[$current_cluster_center_index];
            foreach my $ele (@$cluster) {
                # Compute the Mahalanobis distance from $ele to every cluster.
                # A cluster whose covariance cannot be inverted is skipped; we
                # record the surviving cluster indices in a parallel array so
                # that a skipped cluster cannot shift the position returned by
                # minimum() onto the wrong cluster.  (BUG FIX: the original
                # pushed into a single array, so a failed eval for cluster $i
                # misaligned $best_center_index with the cluster numbering.)
                my @mahalanobis_dist_from_clust_centers;
                my @candidate_cluster_indices;
                foreach my $i (0..@$clusters-1) {
                    my $center = $cluster_centers->[$i];
                    my $covariance = $cluster_covariances->[$i];
                    my $maha_distance;
                    eval {
                        $maha_distance = $self->distance_mahalanobis($ele, $center, $covariance);
                    };
                    next if $@;
                    push @mahalanobis_dist_from_clust_centers, $maha_distance;
                    push @candidate_cluster_indices, $i;
                }
                my ($min, $min_pos) = minimum( \@mahalanobis_dist_from_clust_centers );
                die "The Mahalanobis metric may not be appropriate for the data"
                    unless defined $min_pos;
                my $best_center_index = $candidate_cluster_indices[$min_pos];
                my $best_cluster_center = $cluster_centers->[$best_center_index];
                if (vector_equal($current_cluster_center, $best_cluster_center)){
                    push @{$new_clusters->[$current_cluster_center_index]}, $ele;
                } else {
                    $assignment_changed_flag = 1;
                    push @{$new_clusters->[$best_center_index]}, $ele;
                }
            }
            $current_cluster_center_index++;
        }
        next if ((@$new_clusters != @$clusters) && ($iteration_index < 100));
        # Now make sure that none of the K clusters is an empty cluster:
        foreach my $newcluster (@$new_clusters) {
            $cluster_size_zero_condition = 1 if ((!defined $newcluster) or (@$newcluster == 0));
        }
        push @$new_clusters, (undef) x ($K - @$new_clusters) if @$new_clusters < $K;
        # During clustering for a fixed K, should a cluster inadvertently
        # become empty, steal a member from the largest cluster to hopefully
        # spawn a new cluster:
        my $largest_cluster;
        foreach my $local_cluster (@$new_clusters) {
            next if !defined $local_cluster;
            $largest_cluster = $local_cluster if !defined $largest_cluster;
            if (@$local_cluster > @$largest_cluster) {
                $largest_cluster = $local_cluster;
            }
        }
        foreach my $local_cluster (@$new_clusters) {
            if ( (!defined $local_cluster) || (@$local_cluster == 0) ) {
                push @$local_cluster, pop @$largest_cluster;
            }
        }
        next if (($cluster_size_zero_condition) && ($iteration_index < 100));
        last if $iteration_index == 100;
        # Now do a deep copy of new_clusters into clusters
        $clusters = deep_copy_AoA( $new_clusters );
        last if $assignment_changed_flag == 0;
    }
    $final_cluster_centers = $self->update_cluster_centers( $clusters );
    return ($clusters, $final_cluster_centers);
}
833              
# Mahalanobis analog of update_cluster_centers(): recomputes, for every
# cluster, both its center (the coordinate-wise mean of its members) and its
# covariance matrix.  Returns a ref to a two-element array:
# [\@new_cluster_centers, \@new_cluster_covariances].
sub update_cluster_centers_and_covariances_mahalanobis {
    my $self = shift;
    my @clusters = @{ shift @_ };
    my @new_cluster_centers;
    my @new_cluster_covariances;
    # During clustering for a fixed K, should a cluster inadvertently become empty,
    # steal a member from the largest cluster to hopefully spawn a new cluster:
    my $largest_cluster;
    foreach my $cluster (@clusters) {
        next if !defined $cluster;
        $largest_cluster = $cluster if !defined $largest_cluster;
        if (@$cluster > @$largest_cluster) {
            $largest_cluster = $cluster;
        }
    }
    foreach my $cluster (@clusters) {
        if ( (!defined $cluster) || (@$cluster == 0) ) {
            push @$cluster, pop @$largest_cluster;
        }
    }
    foreach my $cluster (@clusters) {
        die "Cluster became empty --- untenable condition " .
            "for a given K. Try again. \n" if !defined $cluster;
        my $cluster_size = @$cluster;
        die "Cluster size is zero --- untenable.\n" if $cluster_size == 0;
        # The new center is the coordinate-wise mean of the cluster's members:
        my @new_cluster_center = @{$self->add_point_coords( $cluster )};
        @new_cluster_center = map {my $x = $_/$cluster_size; $x} @new_cluster_center;
        push @new_cluster_centers, \@new_cluster_center;
        # for covariance calculation:
        my ($num_rows,$num_cols) = ($self->{_data_dimensions}, scalar(@$cluster));
        my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
        # NOTE(review): $mean_vec appears unused below -- confirm before removing.
        my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
        # All the record labels are stored in the array $self->{_data_id_tags}. The
        # actual data for clustering is stored in a hash at $self->{_data} whose keys are
        # the record labels; the value associated with each key is the array holding the
        # corresponding numerical multidimensional data.
        foreach my $j (0..$num_cols-1) {
            my $tag = $cluster->[$j];
            my $data = $self->{_data}->{$tag};
            my @diff_from_mean = vector_subtract($data, \@new_cluster_center);
            $matrix->set_col($j, \@diff_from_mean);
        }
        # Covariance = (1/N) * M * M', where column j of M is (x_j - mean):
        my $transposed = transpose( $matrix );
        my $covariance = matrix_multiply( $matrix, $transposed );
        $covariance *= 1.0 / $num_cols;
        if ($self->{_debug}) {
            print "\nDisplaying the Covariance Matrix for cluster:";
            display_matrix( $covariance );
        }
        push @new_cluster_covariances, $covariance;
    }
    return [\@new_cluster_centers, \@new_cluster_covariances];
}
887              
# After each new assignment of the data points to the clusters on the basis of
# the current values for the cluster centers, this routine recomputes the
# center of every cluster as the coordinate-wise mean of its members.
sub update_cluster_centers {
    my ($self, $clusters_ref) = @_;
    my @clusters = @$clusters_ref;
    # During clustering for a fixed K, should a cluster inadvertently become
    # empty, steal a member from the largest cluster to hopefully spawn a new
    # cluster.  First locate the largest cluster:
    my $largest_cluster;
    for my $cluster (@clusters) {
        next unless defined $cluster;
        $largest_cluster = $cluster
            if !defined($largest_cluster) || @$cluster > @$largest_cluster;
    }
    # Then donate one of its members to every empty (or undefined) cluster:
    for my $cluster (@clusters) {
        if ( !defined($cluster) || @$cluster == 0 ) {
            push @$cluster, pop @$largest_cluster;
        }
    }
    my @new_cluster_centers;
    for my $cluster (@clusters) {
        die "Cluster became empty --- untenable condition " .
            "for a given K. Try again. \n" if !defined $cluster;
        my $cluster_size = @$cluster;
        die "Cluster size is zero --- untenable.\n" if $cluster_size == 0;
        # The center is the sum of the member coordinates divided by the count:
        my @center = map { $_ / $cluster_size } @{$self->add_point_coords( $cluster )};
        push @new_cluster_centers, \@center;
    }
    return \@new_cluster_centers;
}
922              
# Returns the ID of the cluster whose center is closest, in the Euclidean
# sense, to a new data element supplied as a ref to an array of coordinates.
sub which_cluster_for_new_data_element {
    my ($self, $ele) = @_;
    die "The dimensionality of the new data element is not correct: $!"
        unless @$ele == $self->{_data_dimensions};
    my %dist_for;
    for my $cluster_id (sort keys %{$self->{_cluster_centers_hash}}) {
        $dist_for{$cluster_id} =
            $self->distance2($ele, $self->{_cluster_centers_hash}->{$cluster_id});
    }
    my @all_distances = values %dist_for;
    my ($min, $max) = minmax(\@all_distances);
    # Report whichever cluster achieved the minimum distance:
    my $answer;
    for my $cluster_id (keys %dist_for) {
        $answer = $cluster_id if $dist_for{$cluster_id} == $min;
    }
    return $answer;
}
941              
# Mahalanobis version of which_cluster_for_new_data_element(): returns the ID
# of the cluster nearest to the new element when distances are measured with
# each cluster's own covariance matrix.
sub which_cluster_for_new_data_element_mahalanobis {
    my ($self, $ele) = @_;
    die "The dimensionality of the new data element is not correct: $!"
        unless @$ele == $self->{_data_dimensions};
    my %dist_for;
    for my $cluster_id (sort keys %{$self->{_cluster_centers_hash}}) {
        $dist_for{$cluster_id} =
            $self->distance_mahalanobis2($ele, $self->{_cluster_centers_hash}->{$cluster_id},
                                         $self->{_cluster_covariances_hash}->{$cluster_id});
    }
    my @all_distances = values %dist_for;
    my ($min, $max) = minmax(\@all_distances);
    # Report whichever cluster achieved the minimum distance:
    my $answer;
    for my $cluster_id (keys %dist_for) {
        $answer = $cluster_id if $dist_for{$cluster_id} == $min;
    }
    return $answer;
}
961              
# The following function returns the value of QoC for a given partitioning of the
# data into K clusters. It calculates two things: the average value for the distance
# between a data point and the center of the cluster in which the data point resides,
# and the average value for the distances between the cluster centers. We obviously
# want to minimize the former and maximize the latter. All of the "from center"
# distances within each cluster are stored in the variable
# $sum_of_distances_for_one_cluster. When this variable, after it is divided by the
# number of data elements in the cluster, is summed over all the clusters, we get a
# value that is stored in $cluster_radius. The inter-cluster-center distances
# are summed in the variable $inter_cluster_center_dist.
sub cluster_quality {
    my $self = shift;
    my $clusters = shift;
    my $cluster_centers = shift;
    my $K = @$cluster_centers;      # Number of clusters
    # With fewer than two clusters there are no inter-center distances and
    # the divisor below would be zero:
    die "cluster_quality() requires at least two clusters" if $K < 2;
    my $cluster_radius = 0;
    foreach my $i (0..@$clusters-1) {
        my $sum_of_distances_for_one_cluster = 0;
        foreach my $ele (@{$clusters->[$i]}) {
            $sum_of_distances_for_one_cluster +=
                $self->distance( $ele, $cluster_centers->[$i] );
        }
        $cluster_radius +=
            $sum_of_distances_for_one_cluster / @{$clusters->[$i]};
    }
    # Sum the distances between the K*(K-1)/2 DISTINCT pairs of cluster
    # centers.  (BUG FIX: the original iterated over all ordered pairs,
    # counting every distance twice and including the zero self-distances,
    # while still dividing by K*(K-1)/2 -- doubling the average.  The factor
    # is uniform in K, so the relative ordering of QoC values is preserved.)
    my $inter_cluster_center_dist = 0;
    foreach my $i (0..@$cluster_centers-1) {
        foreach my $j ($i+1..@$cluster_centers-1) {
            $inter_cluster_center_dist +=
                $self->distance2( $cluster_centers->[$i],
                                  $cluster_centers->[$j] );
        }
    }
    my $avg_inter_cluster_center_dist = $inter_cluster_center_dist /
                                        ( $K * ($K-1) / 2.0 );
    return $cluster_radius / $avg_inter_cluster_center_dist;
}
999              
# Computes the Euclidean distance between a data point specified by its
# symbolic name in the master datafile and a point (such as the center of a
# cluster) expressed as a vector of coordinates.
sub distance {
    my ($self, $ele1_id, $point_ref) = @_;
    my @point1 = @{$self->{_data}->{$ele1_id}};   # look up the named sample
    my @point2 = @$point_ref;
    die "wrong data types for distance calculation\n" if @point1 != @point2;
    my $squared_sum = 0;
    $squared_sum += ($point1[$_] - $point2[$_])**2 for 0..$#point1;
    return sqrt $squared_sum;
}
1017              
# Same as distance() above, except that both points are supplied directly as
# refs to arrays of coordinates (no symbolic-name lookup).
sub distance2 {
    my ($self, $ref1, $ref2) = @_;
    my @point1 = @$ref1;
    my @point2 = @$ref2;
    die "wrong data types for distance calculation\n" if @point1 != @point2;
    my $squared_sum = 0;
    $squared_sum += ($point1[$_] - $point2[$_])**2 for 0..$#point1;
    return sqrt $squared_sum;
}
1032              
# Requires three args: $ele for the symbolic tag of the element, $cluster_center
# for the coordinates of the center of the cluster, and $cluster_covariance for
# the covariance of the cluster.  Returns the Mahalanobis distance of the
# element from the cluster whose mean and covariance are provided.  Dies when
# the covariance matrix is (near) singular, since the metric needs its inverse.
sub distance_mahalanobis {
    my $self = shift;
    my $ele = shift;
    my $cluster_center = shift;
    my $cluster_covariance = shift;
    # Compute the determinant once.  (The original called det() three times
    # and computed the determinant of the inverse without ever using it.)
    my $det_of_covar = $cluster_covariance->det();
    die "covariance matrix may not have an inverse" unless $det_of_covar > 0.001;
    my $covariance_inverse = $cluster_covariance->inverse;
    my $ele_data = $self->{_data}->{$ele};
    my @diff_from_mean = vector_subtract($ele_data, $cluster_center);
    my $vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
    $vec->set_col(0, \@diff_from_mean);
    my $transposed = transpose( $vec );
    # Mahalanobis distance: sqrt( (x - mu)' * Sigma^{-1} * (x - mu) )
    my $distance = $transposed * $covariance_inverse * $vec;
    my @distance = $distance->as_list;
    return sqrt $distance[0];
}
# Same as the previous method except that the first argument is the actual
# coordinate array of the data element (a ref), not its symbolic tag.  Returns
# the Mahalanobis distance from the element to a cluster whose center and
# covariance are known; returns undef when no covariance is supplied, and dies
# when the covariance matrix is (near) singular.
sub distance_mahalanobis2 {
    my $self = shift;
    my $ele = shift;                 # ref to the array of coords for a point
    my $cluster_center = shift;
    my $cluster_covariance = shift;
    return undef unless defined $cluster_covariance;
    # Compute the determinant once.  (The original called det() three times
    # and computed the determinant of the inverse without ever using it.)
    my $det_of_covar = $cluster_covariance->det();
    die "covariance matrix may not have an inverse" unless $det_of_covar > 0.001;
    my $covariance_inverse = $cluster_covariance->inverse;
    my @diff_from_mean = vector_subtract($ele, $cluster_center);
    my $vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
    $vec->set_col(0, \@diff_from_mean);
    my $transposed = transpose( $vec );
    # Mahalanobis distance: sqrt( (x - mu)' * Sigma^{-1} * (x - mu) )
    my $distance = $transposed * $covariance_inverse * $vec;
    my @distance = $distance->as_list;
    return sqrt $distance[0];
}
1087              
# Estimates the covariance matrix of a single cluster.  The cluster is given
# as a ref to an array of symbolic record tags; the corresponding coordinate
# vectors live in the hash at $self->{_data}.  Returns a d x d
# Math::GSL::Matrix, where d is the data dimensionality.
sub estimate_cluster_covariance {
    my ($self, $cluster) = @_;
    my $cluster_size = @$cluster;
    # The cluster mean is the coordinate-wise average of the members:
    my @cluster_center = map { $_ / $cluster_size } @{$self->add_point_coords( $cluster )};
    my ($num_rows, $num_cols) = ($self->{_data_dimensions}, scalar(@$cluster));
    # Column j of $matrix holds the mean-centered coordinates of member j:
    my $matrix = Math::GSL::Matrix->new($num_rows, $num_cols);
    for my $col (0..$num_cols-1) {
        my $tag = $cluster->[$col];
        my @centered = vector_subtract($self->{_data}->{$tag}, \@cluster_center);
        $matrix->set_col($col, \@centered);
    }
    # Covariance = (1/N) * M * M' :
    my $covariance = $matrix * transpose($matrix);
    $covariance *= 1.0 / $num_cols;
    if ($self->{_debug}) {
        print "\nDisplaying the Covariance Matrix for cluster:";
        display_matrix( $covariance );
    }
    return $covariance;
}
1117              
# Writes each cluster to its own file (cluster0.txt, cluster1.txt, ...), each
# file containing the symbolic record tags of that cluster's members, piped
# through the system sort.  NOTE(review): the tags are printed space-separated
# with no newlines, so sort sees a single line -- confirm whether per-line
# output was intended.
sub write_clusters_to_files {
    my $self = shift;
    my @clusters = @{$self->{_clusters}};
    # Remove the output of any previous run so that stale files cannot survive
    # when the current run produces fewer clusters.  (BUG FIX: the original
    # unlinked only "cluster*.dat" although the files written below are
    # "cluster*.txt"; the .dat pattern is kept for legacy cleanup.)
    unlink glob "cluster*.dat";
    unlink glob "cluster*.txt";
    foreach my $i (0..@clusters-1) {
        my $filename = "cluster" . $i . ".txt";
        print "\nWriting cluster $i to file $filename\n" if $self->{_terminal_output};
        # Lexical filehandle + three-arg pipe open instead of a bareword handle:
        open my $fh, '|-', "sort > $filename" or die "Unable to open file: $!";
        foreach my $ele (@{$clusters[$i]}) {
            print {$fh} "$ele ";
        }
        # A write pipe can fail at close time; check it:
        close $fh or die "Unable to close pipe to sort: $!";
    }
}
1132              
# Accessor for the best K found by an automatic (K=0) clustering run; croaks
# if no such run has been made yet.
sub get_K_best {
    my $self = shift;
    my $k_best = $self->{_K_best};
    croak "You need to run the clusterer with K=0 option " .
          "before you can call this method" if $k_best eq 'unknown';
    print "\nThe best value of K: $k_best\n" if $self->{_terminal_output};
    return $k_best;
}
1140              
# Prints the QoC value recorded for each K tried during an automatic (K=0)
# run; the smaller the QoC, the better the clustering for that K.
sub show_QoC_values {
    my $self = shift;
    croak "\nYou need to run the clusterer with K=0 option before you can call this method"
        if $self->{_K_best} eq 'unknown';
    print "\nShown below are K on the left and the QoC values on the right (the smaller " .
          "the QoC, the better it is)\n";
    my $qoc_of = $self->{_QoC_values};
    for my $k (sort keys %$qoc_of) {
        print " $k => $qoc_of->{$k}\n" if defined $qoc_of->{$k};
    }
}
1151              
# Destructor: removes the temporary files derived from the user's data file.
sub DESTROY {
    my $self = shift;
    my $base = basename($self->{_datafile});
    unlink "__temp_$base", "__temp_data_$base", "__temp_normed_data_$base";
}
1157              
1158             ################################## Visualization Code ###################################
1159              
# Plot the clusters produced by kmeans().  The visualization mask selects
# which data coordinates to display: three or more '1' bits produce a 3D
# plot of the first three selected coordinates, exactly two produce a 2D
# plot, and 1-D data is rejected.  An optional second argument is a pause
# time in seconds; when absent the plot window persists and a PNG hardcopy
# is written to clustering_results.png.
#
# The numerical data for the clusters is written to a temp file with TWO
# blank lines between clusters -- that is how Gnuplot separates data blocks
# so each cluster can get its own point style.
sub visualize_clusters {
    my $self = shift;
    my ($v_mask, $pause_time);
    if (@_ == 1) {
        $v_mask = shift || croak "visualization mask missing";
    } elsif (@_ == 2) {
        $v_mask = shift || croak "visualization mask missing";
        $pause_time = shift;
    } else {
        croak "visualize_clusters() called with wrong args";
    }
    my $master_datafile = $self->{_datafile};
    my @v_mask = split //, $v_mask;
    my $visualization_mask_width = @v_mask;
    my @mask = split //, $self->{_mask};
    my $data_field_width = scalar grep {$_ eq '1'} @mask;
    croak "\n\nABORTED: The width of the visualization mask (including " .
          "all its 1s and 0s) must equal the width of the original mask " .
          "used for reading the data file (counting only the 1's)"
        if $visualization_mask_width != $data_field_width;
    my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
    # Project every record onto the coordinates selected by the mask:
    my %visualization_data;
    while ( my ($record_id, $data) = each %{$self->{_original_data}} ) {
        my @fields = @$data;
        croak "\nABORTED: Visualization mask size exceeds data record size\n"
            if $#v_mask > $#fields;
        my @data_fields;
        foreach my $i (0..@fields-1) {
            if ($v_mask[$i] eq '0') {
                next;
            } elsif ($v_mask[$i] eq '1') {
                push @data_fields, $fields[$i];
            } else {
                croak "Misformed visualization mask. It can only have 1s and 0s\n";
            }
        }
        $visualization_data{ $record_id } = \@data_fields;
    }
    my $K = scalar @{$self->{_clusters}};
    my $temp_file = "__temp_" . basename($master_datafile);
    unlink $temp_file if -e $temp_file;
    # Three-arg open with a lexical handle (the original used an unchecked
    # two-arg open on a bareword filehandle):
    open my $out, '>', $temp_file
        or die "Unable to open a temp file in this directory: $!\n";
    foreach my $cluster (@{$self->{_clusters}}) {
        foreach my $item (@$cluster) {
            print {$out} "@{$visualization_data{$item}}";
            print {$out} "\n";
        }
        print {$out} "\n\n";   # blank lines delimit Gnuplot data blocks
    }
    close $out;
    my ($plot, $hardcopy_plot);
    if (!defined $pause_time) {
        $plot = Graphics::GnuplotIF->new( persist => 1 );
        $hardcopy_plot = Graphics::GnuplotIF->new();
        $hardcopy_plot->gnuplot_cmd('set terminal png', "set output \"clustering_results.png\"");
    } else {
        $plot = Graphics::GnuplotIF->new();
    }
    $plot->gnuplot_cmd( "set noclip" );
    $plot->gnuplot_cmd( "set pointsize 2" );
    # One plot clause per cluster, each with its own line type and point type:
    my $arg_string = "";
    if ($visualization_data_field_width > 2) {
        foreach my $i (0..$K-1) {
            my $j = $i + 1;
            $arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster $i\" with points lt $j pt $j, ";
        }
    } elsif ($visualization_data_field_width == 2) {
        foreach my $i (0..$K-1) {
            my $j = $i + 1;
            $arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster $i\" with points lt $j pt $j, ";
        }
    } elsif ($visualization_data_field_width == 1 ) {
        foreach my $i (0..$K-1) {
            my $j = $i + 1;
            $arg_string .= "\"$temp_file\" index $i using 1 title \"Cluster $i\" with points lt $j pt $j, ";
        }
    }
    # Strip the trailing ", ".  The original assigned the boolean result of
    # a match to $arg_string and then recovered the string from $1, which
    # relies on a possibly-stale capture variable when the match fails.
    $arg_string =~ s/,[ ]+$//;
    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd( "splot $arg_string" );
        $hardcopy_plot->gnuplot_cmd( "splot $arg_string" ) unless defined $pause_time;
        $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd( "plot $arg_string" );
        $hardcopy_plot->gnuplot_cmd( "plot $arg_string" ) unless defined $pause_time;
        $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
    } elsif ($visualization_data_field_width == 1) {
        croak "No provision for plotting 1-D data\n";
    }
}
1279              
# Visualize the raw data (before clustering) in the 2D or 3D subspace
# selected by the visualization mask.  Call only AFTER read_data_from_file().
# The second argument must be the string 'original' or 'normed'; 'normed'
# shows the data after each dimension is divided by its standard deviation
# and requires that variance normalization was enabled in the constructor.
sub visualize_data {
    my $self = shift;
    my $v_mask = shift || croak "visualization mask missing";
    my $datatype = shift;    # must be either 'original' or 'normed'
    # (Fixed the duplicated "in the in the" of the original message.)
    croak "\n\nABORTED: You called visualize_data() for normed data " .
          "but without first turning on data normalization in the " .
          "KMeans constructor"
        if ($datatype eq 'normed') && ! $self->{_var_normalize};
    my $master_datafile = $self->{_datafile};
    my @v_mask = split //, $v_mask;
    my $visualization_mask_width = @v_mask;
    my @mask = split //, $self->{_mask};
    my $data_field_width = scalar grep {$_ eq '1'} @mask;
    croak "\n\nABORTED: The width of the visualization mask (including " .
          "all its 1s and 0s) must equal the width of the original mask " .
          "used for reading the data file (counting only the 1's)"
        if $visualization_mask_width != $data_field_width;
    my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
    # Pick the data source and temp-file name according to $datatype:
    my ($data_source, $temp_file);
    if ($datatype eq 'original') {
        $data_source = $self->{_original_data};
        $temp_file   = "__temp_data_" . basename($master_datafile);
    } elsif ($datatype eq 'normed') {
        $data_source = $self->{_data};
        $temp_file   = "__temp_normed_data_" . basename($master_datafile);
    } else {
        croak "\n\nABORTED: improper call to visualize_data()";
    }
    # Project every record onto the coordinates selected by the mask:
    my %visualization_data;
    while ( my ($record_id, $data) = each %{$data_source} ) {
        my @fields = @$data;
        croak "\nABORTED: Visualization mask size exceeds data record size\n"
            if $#v_mask > $#fields;
        my @data_fields;
        foreach my $i (0..@fields-1) {
            if ($v_mask[$i] eq '0') {
                next;
            } elsif ($v_mask[$i] eq '1') {
                push @data_fields, $fields[$i];
            } else {
                croak "Misformed visualization mask. It can only have 1s and 0s\n";
            }
        }
        $visualization_data{ $record_id } = \@data_fields;
    }
    unlink $temp_file if -e $temp_file;
    # Three-arg open with a lexical handle (the original used an unchecked
    # two-arg open on a bareword filehandle):
    open my $out, '>', $temp_file
        or die "Unable to open a temp file in this directory: $!\n";
    foreach my $datapoint (values %visualization_data) {
        print {$out} "@$datapoint";
        print {$out} "\n";
    }
    close $out;
    my $plot = Graphics::GnuplotIF->new( persist => 1 );
    $plot->gnuplot_cmd( "set noclip" );
    $plot->gnuplot_cmd( "set pointsize 2" );
    my $plot_title = $datatype eq 'original' ? '"data"' : '"normed data"';
    my $arg_string;
    if ($visualization_data_field_width > 2) {
        $arg_string = "\"$temp_file\" using 1:2:3 title $plot_title with points lt -1 pt 1";
    } elsif ($visualization_data_field_width == 2) {
        $arg_string = "\"$temp_file\" using 1:2 title $plot_title with points lt -1 pt 1";
    } elsif ($visualization_data_field_width == 1 ) {
        $arg_string = "\"$temp_file\" using 1 notitle with points lt -1 pt 1";
    }
    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd( "splot $arg_string" );
    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd( "plot $arg_string" );
    } elsif ($visualization_data_field_width == 1) {
        croak "No provision for plotting 1-D data\n";
    }
}
1372              
1373             ########################### Generating Synthetic Data for Clustering ##############################
1374              
# Generate synthetic multivariate-Gaussian data for clustering experiments.
# The mean and covariance of each Gaussian come from a parameter file (see
# param.txt in the examples directory); Math::Random requires each
# covariance matrix to be symmetric and positive definite.  Class method:
#   Algorithm::KMeans->cluster_data_generator(
#       input_parameter_file => ..., output_datafile => ...,
#       number_data_points_per_cluster => N );
sub cluster_data_generator {
    my $class = shift;
    croak "illegal call of a class method" unless $class eq 'Algorithm::KMeans';
    my %args = @_;
    my $input_parameter_file = $args{input_parameter_file};
    my $output_file = $args{output_datafile};
    my $N = $args{number_data_points_per_cluster};
    my @all_params;
    my $param_string;
    if (defined $input_parameter_file) {
        # Low-precedence 'or' so a failed open actually dies; the original's
        # '||' bound to the filename and silently ignored open failures.
        open my $param_fh, '<', $input_parameter_file
            or croak "unable to open parameter file: $!";
        @all_params = <$param_fh>;
        close $param_fh;
        @all_params = grep { $_ !~ /^[ ]*#/ } @all_params;   # drop comment lines
        chomp @all_params;
        $param_string = join ' ', @all_params;
    } else {
        # Just for testing. Used in t/test.t
        $param_string = "cluster 5 0 0 1 0 0 0 1 0 0 0 1 " .
                        "cluster 0 5 0 1 0 0 0 1 0 0 0 1 " .
                        "cluster 0 0 5 1 0 0 0 1 0 0 0 1";
    }
    my @cluster_strings = split /[ ]*cluster[ ]*/, $param_string;
    @cluster_strings = grep $_, @cluster_strings;
    my $K = @cluster_strings;
    croak "Too many clusters requested" if $K > 12;
    my @point_labels = ('a'..'z');
    print "Number of Gaussians used for the synthetic data: $K\n";
    my @means;
    my @covariances;
    my $data_dimension;
    foreach my $i (0..$K-1) {
        my @num_strings = split / /, $cluster_strings[$i];
        my @cluster_mean = map {/$_num_regex/;$_} split / /, $num_strings[0];
        $data_dimension = @cluster_mean;
        push @means, \@cluster_mean;
        my @covariance_nums = map {/$_num_regex/;$_} split / /, $num_strings[1];
        croak "dimensionality error" if @covariance_nums !=
                                        ($data_dimension ** 2);
        # Reshape the flat covariance list into a row-major 2-D array:
        my $cluster_covariance;
        foreach my $j (0..$data_dimension-1) {
            foreach my $k (0..$data_dimension-1) {
                $cluster_covariance->[$j]->[$k] =
                         $covariance_nums[$j*$data_dimension + $k];
            }
        }
        push @covariances, $cluster_covariance;
    }
    random_seed_from_phrase( 'hellojello' );
    my @data_dump;
    foreach my $i (0..$K-1) {
        my @m = @{shift @means};
        my @covar = @{shift @covariances};
        my @new_data = Math::Random::random_multivariate_normal( $N, @m, @covar );
        my $p = 0;
        my $label = $point_labels[$i];
        # Tag each point with a unique symbolic name like 'a0', 'a1', ...
        # The original incremented the outer loop variable $i here instead of
        # the counter $p it had just declared; $i is a read-only foreach
        # alias, so that would die at runtime.
        @new_data = map {unshift @$_, $label.$p; $p++; $_} @new_data;
        push @data_dump, @new_data;
    }
    # Shuffle so the clusters are interleaved in the output file:
    fisher_yates_shuffle( \@data_dump );
    open my $out, '>', $output_file
        or croak "unable to open output file $output_file: $!";
    foreach my $ele (@data_dump) {
        foreach my $coord ( @$ele ) {
            print {$out} "$coord ";
        }
        print {$out} "\n";
    }
    print "Data written out to file $output_file\n";
    close $out;
}
1452              
# Component-wise sum of the data vectors (from $self->{_data}) for the
# records named in the argument.  Arg: ref to an array of record IDs.
# Returns a ref to the sum vector, zero-initialized to the data
# dimensionality.
sub add_point_coords {
    my ($self, $ids) = @_;
    my @sum = (0.0) x $self->{_data_dimensions};
    for my $id (@$ids) {
        my $point = $self->{_data}->{$id};
        $sum[$_] += $point->[$_] for 0 .. $#{$point};
    }
    return \@sum;
}
1471              
# Same as add_point_coords() but sums vectors taken from
# $self->{_original_data} (the data before variance normalization).
sub add_point_coords_from_original_data {
    my ($self, $ids) = @_;
    my @sum = (0.0) x $self->{_data_dimensions};
    for my $id (@$ids) {
        my $point = $self->{_original_data}->{$id};
        $sum[$_] += $point->[$_] for 0 .. $#{$point};
    }
    return \@sum;
}
1490              
1491             ################################### Support Routines ########################################
1492              
# Linear search: return the position of the first element numerically equal
# to $value, or -1 if no element matches.
sub get_index_at_value {
    my ($value, $aref) = @_;
    for my $pos (0 .. $#{$aref}) {
        return $pos if $aref->[$pos] == $value;
    }
    return -1;
}
1501              
# Element-wise numeric equality of two vectors.  (A smartmatch `~~' version
# was once planned, but smartmatch was deprecated in Perl 5.18 and has since
# been removed, so the explicit comparison stays.)
sub vector_equal {
    my ($u, $v) = @_;
    die "wrong data types for vector-equal predicate\n" unless @$u == @$v;
    for my $k (0 .. $#{$u}) {
        return 0 unless $u->[$k] == $v->[$k];
    }
    return 1;
}
1513              
# Returns the minimum value and its positional index in an array ref.
sub minimum {
    my $aref = shift;
    my ($best, $where);
    my $pos = 0;
    for my $val (@$aref) {
        if ( !defined($best) || $val < $best ) {
            $best  = $val;
            $where = $pos;
        }
        $pos++;
    }
    return ($best, $where);
}
1527              
# Returns the (min, max) of an array ref, skipping undefined entries.
sub minmax {
    my $aref = shift;
    my ($lo, $hi);
    for my $val (@$aref) {
        next unless defined $val;
        if (!defined $lo) {          # first defined value seeds both bounds
            $lo = $hi = $val;
        } elsif ($val < $lo) {
            $lo = $val;
        } elsif ($val > $hi) {
            $hi = $val;
        }
    }
    return ($lo, $hi);
}
1545              
# Deep copy of an array of arrays.  Note: an empty input yields undef (not
# an empty array ref), matching the lazily-autovivified result.
sub deep_copy_AoA {
    my $src = shift;
    my $copy;
    for my $r (0 .. $#{$src}) {
        for my $c (0 .. $#{$src->[$r]}) {
            $copy->[$r][$c] = $src->[$r][$c];
        }
    }
    return $copy;
}
1557              
# Deep copy of an array of arrays where some top-level entries may be
# undef; undef entries are preserved as undef in the copy.
sub deep_copy_AoA_with_nulls {
    my $src = shift;
    my $copy;
    for my $r (0 .. $#{$src}) {
        unless (defined $src->[$r]) {
            $copy->[$r] = undef;
            next;
        }
        for my $c (0 .. $#{$src->[$r]}) {
            $copy->[$r][$c] = $src->[$r][$c];
        }
    }
    return $copy;
}
1574              
# Deep copy of a hash whose values are refs to arrays of numbers; each
# value array is duplicated via deep_copy_array().
sub deep_copy_hash {
    my $src = shift;
    my $copy;
    $copy->{$_} = deep_copy_array( $src->{$_} ) for keys %$src;
    return $copy;
}
1585              
# Copy of a flat array of numbers.  An empty input yields undef, matching
# the lazily-autovivified result of the original.
sub deep_copy_array {
    my $src = shift;
    my $copy;
    push @$copy, $_ for @$src;
    return $copy;
}
1595              
# Print each cluster's size and its center (the per-dimension mean of the
# cluster's points in the original, un-normalized data) to four decimals.
sub display_cluster_centers {
    my ($self, $clusters_ref) = @_;
    my $idx = 0;
    for my $cluster (@$clusters_ref) {
        my $n = scalar @$cluster;
        my $sum_ref = $self->add_point_coords_from_original_data( $cluster );
        my @center = map { $_ / $n } @$sum_ref;
        print "\ncluster $idx ($n records):\n";
        print "cluster center $idx: " .
              "@{[map {sprintf('%.4f', $_)} @center]}\n";
        $idx++;
    }
}
1611              
# Print each cluster's size and the sorted symbolic names of its members.
# NOTE: sorts each cluster array IN PLACE, which the caller observes.
sub display_clusters {
    my $clusters_ref = shift;
    my $idx = 0;
    for my $cluster (@$clusters_ref) {
        @$cluster = sort @$cluster;
        my $count = scalar @$cluster;
        print "\n\nCluster $idx ($count records):\n";
        print " $_" for @$cluster;
        $idx++;
    }
    print "\n\n";
}
1628              
# In-place Fisher-Yates shuffle of an array ref (the perlfaq idiom).
sub fisher_yates_shuffle {
    my $deck = shift;
    for (my $top = @$deck - 1; $top > 0; $top--) {
        my $pick = int rand($top + 1);
        ($deck->[$top], $deck->[$pick]) = ($deck->[$pick], $deck->[$top]);
    }
}
1638              
# Normalize each data dimension by its standard deviation.  Arg: ref to a
# hash mapping record labels to refs of numeric vectors.  Returns a ref to
# a new hash of the same shape with every component divided by the standard
# deviation of its dimension across all records.
#
# NOTE(review): a dimension with zero variance triggers a division-by-zero
# death here -- callers presumably guarantee non-constant data; confirm.
sub variance_normalization {
    my %data_hash = %{shift @_};
    my @all_data_points = values %data_hash;
    my $dimensions = @{$all_data_points[0]};
    # Transpose: collect the i-th component of every point into projection i.
    my @data_projections;
    foreach my $data_point (@all_data_points) {
        my $i = 0;
        foreach my $proj (@$data_point) {
            push @{$data_projections[$i++]}, $proj;
        }
    }
    my @variance_vec;
    foreach my $vec (@data_projections) {
        my ($mean, $variance) = mean_and_variance( $vec );
        push @variance_vec, $variance;
    }
    my %new_data_hash;
    while (my ($label, $data) = each(%data_hash) ) {
        my @new_data;
        foreach my $i (0..@{$data}-1) {
            # (The original computed this quotient into an unused lexical
            # before pushing the same expression; the dead line is removed.)
            push @new_data, $data->[$i] / sqrt($variance_vec[$i]);
        }
        $new_data_hash{$label} = \@new_data;
    }
    return \%new_data_hash;
}
1666              
# One-pass incremental mean and variance of a numeric array ref.  The
# recurrence updates the mean first and divides the squared deviation by
# (n-1), which works out to the population (divide-by-N) variance.  An
# empty input returns (undef, undef).
sub mean_and_variance {
    my @samples = @{shift @_};
    my ($mu, $var);
    if (@samples) {
        ($mu, $var) = ($samples[0], 0);
        for my $n (2 .. scalar @samples) {
            $mu  = ( (($n-1)/$n) * $mu ) + $samples[$n-1] / $n;
            $var = ( (($n-1)/$n) * $var ) + ($samples[$n-1]-$mu)**2 / ($n-1);
        }
    }
    return ($mu, $var);
}
1681              
# Validate constructor keyword arguments.  Returns 1 when every supplied
# keyword is legal, 0 as soon as one is not, and undef when no keywords are
# supplied (preserved from the original flag-based loop; the caller's
# '== 0' test then treats undef as a failure).  The original rescanned the
# legal list per parameter with a fragile reset-inside-loop flag; a hash
# lookup is O(1) per parameter and easier to audit.
sub check_for_illegal_params {
    my @params = @_;
    my %legal = map { $_ => 1 } qw/ datafile
                                    mask
                                    K
                                    Kmin
                                    Kmax
                                    terminal_output
                                    write_clusters_to_files
                                    do_variance_normalization
                                    cluster_seeding
                                    use_mahalanobis_metric
                                    debug
                                  /;
    my $found_match_flag;
    foreach my $param (@params) {
        $found_match_flag = $legal{$param} ? 1 : 0;
        last unless $found_match_flag;
    }
    return $found_match_flag;
}
1709              
# Map each strictly-positive array value to its index (zero and negative
# values are skipped; a duplicated value keeps its last index).
sub get_value_index_hash {
    my $aref = shift;
    my %index_of;
    for my $pos (0 .. $#{$aref}) {
        my $val = $aref->[$pos];
        $index_of{$val} = $pos if $val > 0;
    }
    return \%index_of;
}
1718              
# 1-D non-maximum suppression over a histogram-like array ref: the input is
# split into runs of equal consecutive values, and only the center element
# of each run that is strictly higher than its neighboring run(s) keeps its
# value in the returned array; every other slot is zero.  (The original
# declared @final_output and %hash but never used them; both are removed.)
#
# NOTE(review): inputs with fewer than two runs make the boundary checks
# index past @array_of_runs -- callers appear to pass multi-run histograms
# only; confirm before reusing elsewhere.
sub non_maximum_suppression {
    my $arr = shift;
    my @output = (0) x @$arr;
    # Partition the input into runs of identical consecutive values:
    my @array_of_runs = ([$arr->[0]]);
    foreach my $index (1..@$arr-1) {
        if ($arr->[$index] == $arr->[$index-1]) {
            push @{$array_of_runs[-1]}, $arr->[$index];
        } else {
            push @array_of_runs, [$arr->[$index]];
        }
    }
    # Interior runs must beat both neighbors; the fractional center index is
    # truncated by Perl's array subscripting.
    my $runstart_index = 0;
    foreach my $run_index (1..@array_of_runs-2) {
        $runstart_index += @{$array_of_runs[$run_index-1]};
        if ($array_of_runs[$run_index]->[0] >
            $array_of_runs[$run_index-1]->[0] &&
            $array_of_runs[$run_index]->[0] >
            $array_of_runs[$run_index+1]->[0]) {
            my $run_center = @{$array_of_runs[$run_index]} / 2;
            my $assignment_index = $runstart_index + $run_center;
            $output[$assignment_index] = $arr->[$assignment_index];
        }
    }
    # Boundary runs only need to beat their single neighbor:
    if ($array_of_runs[-1]->[0] > $array_of_runs[-2]->[0]) {
        $runstart_index += @{$array_of_runs[-2]};
        my $run_center = @{$array_of_runs[-1]} / 2;
        my $assignment_index = $runstart_index + $run_center;
        $output[$assignment_index] = $arr->[$assignment_index];
    }
    if ($array_of_runs[0]->[0] > $array_of_runs[1]->[0]) {
        my $run_center = @{$array_of_runs[0]} / 2;
        $output[$run_center] = $arr->[$run_center];
    }
    return \@output;
}
1756              
# Print the dimensions and rows of a Math::GSL::Matrix to the terminal.
sub display_matrix {
    my $m = shift;
    my ($nrows, $ncols) = ($m->rows(), $m->cols());
    print "\n\nDisplaying matrix of size $nrows rows and $ncols columns:\n";
    for my $r (0 .. $nrows - 1) {
        my @vals = $m->row($r)->as_list;
        print "@vals\n";
    }
    print "\n\n";
}
1769              
# Return a new Math::GSL::Matrix that is the transpose of the argument:
# row i of the input becomes column i of the result.
sub transpose {
    my $m = shift;
    my ($nrows, $ncols) = ($m->rows(), $m->cols());
    my $t = Math::GSL::Matrix->new($ncols, $nrows);
    for my $r (0 .. $nrows - 1) {
        $t->set_col($r, [ $m->row($r)->as_list ]);
    }
    return $t;
}
1781              
# Element-wise difference of two equal-length vector refs; returns a LIST
# (not a ref), matching the original's calling convention.
sub vector_subtract {
    my ($u, $v) = @_;
    die "wrong data types for vector subtract calculation\n" if @$u != @$v;
    return map { $u->[$_] - $v->[$_] } 0 .. $#{$u};
}
1792              
# Dot product of two equal-length vector refs.
sub vector_multiply {
    my ($u, $v) = @_;
    die "vec_multiply called with two vectors of different sizes"
        unless @$u == @$v;
    my $dot = 0;
    $dot += $u->[$_] * $v->[$_] for 0 .. $#{$u};
    return $dot;
}
1804              
# Multiply two Math::GSL::Matrix objects.  When the first argument is a
# single-row matrix (as happens when this routine recurses on a row and a
# column), the scalar dot product is returned instead of a matrix.
#
# NOTE(review): the dimension check demands $nrows1 == $ncols2 in addition
# to $ncols1 == $nrows2, i.e. it only accepts operand pairs whose product
# is square ($nrows1 x $nrows1).  General matrix multiplication needs only
# $ncols1 == $nrows2 -- confirm against callers before relaxing.
sub matrix_multiply {
    my $matrix1 = shift;
    my $matrix2 = shift;
    my ($nrows1, $ncols1) = ($matrix1->rows(), $matrix1->cols());
    my ($nrows2, $ncols2) = ($matrix2->rows(), $matrix2->cols());
    die "matrix multiplication called with non-matching matrix arguments"
        unless $nrows1 == $ncols2 && $ncols1 == $nrows2;
    # Base case: a 1-row matrix times a column reduces to a dot product,
    # returned as a plain scalar rather than a 1x1 matrix.
    if ($nrows1 == 1) {
        my @row = $matrix1->row(0)->as_list;
        my @col = $matrix2->col(0)->as_list;
        my $result;
        foreach my $j (0..$ncols1-1) {
            $result += $row[$j] * $col[$j];
        }
        return $result;
    }
    my $product = Math::GSL::Matrix->new($nrows1, $nrows1);
    foreach my $i (0..$nrows1-1) {
        my $row = $matrix1->row($i);
        my @product_row;
        foreach my $j (0..$ncols2-1) {
            my $col = $matrix2->col($j);
            # The recursive call hits the 1-row base case above and yields
            # the scalar entry (i, j) of the product.
            my $row_times_col = matrix_multiply($row, $col);
            push @product_row, $row_times_col;
        }
        $product->set_row($i, \@product_row);
    }
    return $product;
}
1834              
1835             1;
1836              
1837             =pod
1838              
1839             =head1 NAME
1840              
1841             Algorithm::KMeans - for clustering multidimensional data
1842              
1843             =head1 SYNOPSIS
1844              
1845             # You now have four different choices for clustering your data with this module:
1846             #
1847             # 1) With Euclidean distances and with random cluster seeding
1848             #
1849             # 2) With Mahalanobis distances and with random cluster seeding
1850             #
1851             # 3) With Euclidean distances and with smart cluster seeding
1852             #
1853             # 4) With Mahalanobis distances and with smart cluster seeding
1854             #
1855             # Despite the qualifier 'smart' in 'smart cluster seeding', it may not always
1856             # produce results that are superior to those obtained with random seeding. (If you
1857             # also factor in the choice regarding variance normalization, you actually have
1858             # eight different choices for data clustering with this module.)
1859             #
1860             # In all cases, you'd obviously begin with
1861              
1862             use Algorithm::KMeans;
1863              
1864             # You'd then name the data file:
1865              
1866             my $datafile = "mydatafile.csv";
1867              
1868             # Next, set the mask to indicate which columns of the datafile to use for
1869             # clustering and which column contains a symbolic ID for each data record. For
1870             # example, if the symbolic name is in the first column, you want the second column
1871             # to be ignored, and you want the next three columns to be used for 3D clustering,
1872             # you'd set the mask to:
1873              
1874             my $mask = "N0111";
1875              
1876             # Now construct an instance of the clusterer. The parameter K controls the number
1877             # of clusters. If you know how many clusters you want (let's say 3), call
1878              
1879             my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
1880             mask => $mask,
1881             K => 3,
1882             cluster_seeding => 'random',
1883             terminal_output => 1,
1884             write_clusters_to_files => 1,
1885             );
1886              
1887             # By default, this constructor call will set you up for clustering based on
1888             # Euclidean distances. If you want the module to use Mahalanobis distances, your
1889             # constructor call will look like:
1890              
1891             my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
1892             mask => $mask,
1893             K => 3,
1894             cluster_seeding => 'random',
1895             use_mahalanobis_metric => 1,
1896             terminal_output => 1,
1897             write_clusters_to_files => 1,
1898             );
1899              
1900             # For both constructor calls shown above, you can use smart seeding of the clusters
1901             # by changing 'random' to 'smart' for the cluster_seeding option. See the
1902             # explanation of smart seeding in the Methods section of this documentation.
1903              
1904             # If your data is such that its variability along the different dimensions of the
1905             # data space is significantly different, you may get better clustering if you first
1906             # normalize your data by setting the constructor parameter
1907             # do_variance_normalization as shown below:
1908              
1909             my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
1910             mask => $mask,
1911             K => 3,
1912             cluster_seeding => 'smart', # or 'random'
1913             terminal_output => 1,
1914             do_variance_normalization => 1,
1915             write_clusters_to_files => 1,
1916             );
1917              
1918             # But bear in mind that such data normalization may actually decrease the
1919             # performance of the clusterer if the variability in the data is more a result of
1920             # the separation between the means than a consequence of intra-cluster variance.
1921              
1922             # Set K to 0 if you want the module to figure out the optimum number of clusters
1923             # from the data. (It is best to run this option with the terminal_output set to 1
1924             # so that you can see the different value of QoC for the different K):
1925              
1926             my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
1927             mask => $mask,
1928             K => 0,
1929             cluster_seeding => 'random', # or 'smart'
1930             terminal_output => 1,
1931             write_clusters_to_files => 1,
1932             );
1933              
1934             # Although not shown above, you can obviously set the 'do_variance_normalization'
1935             # flag here also if you wish.
1936              
1937             # For very large data files, setting K to 0 will result in searching through too
1938             # many values for K. For such cases, you can range limit the values of K to search
1939             # through by
1940              
1941             my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
1942             mask => "N111",
1943             Kmin => 3,
1944             Kmax => 10,
1945             cluster_seeding => 'random', # or 'smart'
1946             terminal_output => 1,
1947             write_clusters_to_files => 1,
1948             );
1949              
1950             # FOR ALL CASES ABOVE, YOU'D NEED TO MAKE THE FOLLOWING CALLS ON THE CLUSTERER
1951             # INSTANCE TO ACTUALLY CLUSTER THE DATA:
1952              
1953             $clusterer->read_data_from_file();
1954             $clusterer->kmeans();
1955              
1956             # If you want to directly access the clusters and the cluster centers in your own
1957             # top-level script, replace the above two statements with:
1958              
1959             $clusterer->read_data_from_file();
1960             my ($clusters_hash, $cluster_centers_hash) = $clusterer->kmeans();
1961              
1962             # You can subsequently access the clusters directly in your own code, as in:
1963              
1964             foreach my $cluster_id (sort keys %{$clusters_hash}) {
1965             print "\n$cluster_id => @{$clusters_hash->{$cluster_id}}\n";
1966             }
1967             foreach my $cluster_id (sort keys %{$cluster_centers_hash}) {
1968             print "\n$cluster_id => @{$cluster_centers_hash->{$cluster_id}}\n";
1969             }
1970              
1971              
1972             # CLUSTER VISUALIZATION:
1973              
1974             # You must first set the mask for cluster visualization. This mask tells the module
1975             # which 2D or 3D subspace of the original data space you wish to visualize the
1976             # clusters in:
1977              
1978             my $visualization_mask = "111";
1979             $clusterer->visualize_clusters($visualization_mask);
1980              
1981              
1982             # SYNTHETIC DATA GENERATION:
1983              
1984             # The module has been provided with a class method for generating multivariate data
1985             # for experimenting with clustering. The data generation is controlled by the
1986             # contents of the parameter file that is supplied as an argument to the data
1987             # generator method. The mean and covariance matrix entries in the parameter file
1988             # must be according to the syntax shown in the param.txt file in the examples
1989             # directory. It is best to edit this file as needed:
1990              
1991             my $parameter_file = "param.txt";
1992             my $out_datafile = "mydatafile.dat";
1993             Algorithm::KMeans->cluster_data_generator(
1994             input_parameter_file => $parameter_file,
1995             output_datafile => $out_datafile,
1996             number_data_points_per_cluster => $N );
1997              
1998             =head1 CHANGES
1999              
2000             Version 2.04 allows you to use CSV data files for clustering.
2001              
2002             Version 2.03 incorporates minor code cleanup. The main implementation of the module
2003             remains unchanged.
2004              
2005             Version 2.02 downshifts the version of Perl that is required for this module. The
2006             module should work with versions 5.10 and higher of Perl. The implementation code
2007             for the module remains unchanged.
2008              
2009             Version 2.01 removes many errors in the documentation. The changes made to the module
2010             in Version 2.0 were not reflected properly in the documentation page for that
2011             version. The implementation code remains unchanged.
2012              
2013             Version 2.0 includes significant additional functionality: (1) You now have the
2014             option to cluster using the Mahalanobis distance metric (the default is the Euclidean
2015             metric); and (2) With the two C<which_cluster_for_new_data_element()> methods that have been added to the
2016             module, you can now determine the best cluster for a new data sample after you have
2017             created the clusters with the previously available data. Finding the best cluster
2018             for a new data sample can be done using either the Euclidean metric or the
2019             Mahalanobis metric.
2020              
2021             Version 1.40 includes a C<smart> option for seeding the clusters. This option,
2022             supplied through the constructor parameter C<cluster_seeding>, means that the
2023             clusterer will (1) Subject the data to principal components analysis in order to
2024             determine the maximum variance direction; (2) Project the data onto this direction;
2025             (3) Find peaks in a smoothed histogram of the projected points; and (4) Use the
2026             locations of the highest peaks as initial guesses for the cluster centers. If you
2027             don't want to use this option, set C<cluster_seeding> to C<random>. That should work
2028             as in the previous version of the module.
2029              
2030             Version 1.30 includes a bug fix for the case when the datafile contains empty lines,
2031             that is, lines with no data records. Another bug fix in Version 1.30 deals with the
2032             case when you want the module to figure out how many clusters to form (this is the
2033             C<K=0> option in the constructor call) and the number of data records is close to the
2034             minimum.
2035              
2036             Version 1.21 includes fixes to handle the possibility that, when clustering the data
2037             for a fixed number of clusters, a cluster may become empty during iterative
2038             calculation of cluster assignments of the data elements and the updating of the
2039             cluster centers. The code changes are in the C<assign_data_to_clusters()> and
2040             C<update_cluster_centers()> subroutines.
2041              
2042             Version 1.20 includes an option to normalize the data with respect to its variability
2043             along the different coordinates before clustering is carried out.
2044              
2045             Version 1.1.1 allows for range limiting the values of C<K> to search through. C<K>
2046             stands for the number of clusters to form. This version also declares the module
2047             dependencies in the C<Makefile.PL> file.
2048              
2049             Version 1.1 is an object-oriented version of the implementation presented in
2050             version 1.0. The current version should lend itself more easily to code extension.
2051             You could, for example, create your own class by subclassing from the class presented
2052             here and, in your subclass, use your own criteria for the similarity distance between
2053             the data points and for the QoC (Quality of Clustering) metric, and, possibly a
2054             different rule to stop the iterations. Version 1.1 also allows you to directly
2055             access the clusters formed and the cluster centers in your calling script.
2056              
2057              
2058             =head1 SPECIAL USAGE NOTE
2059              
2060             If you were directly accessing in your own scripts the clusters produced by the older
2061             versions of this module, you'd need to make changes to your code if you wish to use
2062             Version 2.0 or higher. Instead of returning arrays of clusters and cluster centers,
2063             Versions 2.0 and higher return hashes. This change was made necessary by the logic
2064             required for implementing the two new C<which_cluster_for_new_data_element> methods that were introduced
2065             in Version 2.0. These methods return the best cluster for a new data sample from the
2066             clusters you created using the existing data. One of the C<which_cluster_for_new_data_element> methods is
2067             based on the Euclidean metric for finding the cluster that is closest to the new data
2068             sample, and the other on the Mahalanobis metric. Another point of incompatibility
2069             with the previous versions is that you must now explicitly set the C<cluster_seeding>
2070             parameter in the call to the constructor to either C<random> or C<smart>. This
2071             parameter does not have a default associated with it starting with Version 2.0.
2072              
2073              
2074             =head1 DESCRIPTION
2075              
2076             Clustering with K-Means takes place iteratively and involves two steps: 1) assignment
2077             of data samples to clusters on the basis of how far the data samples are from the
2078             cluster centers; and 2) Recalculation of the cluster centers (and cluster covariances
2079             if you are using the Mahalanobis distance metric for clustering).
2080              
2081             Obviously, before the two-step approach can proceed, we need to initialize the
2082             cluster centers. How this initialization is carried out is important. The module
2083             gives you two very different ways for carrying out this initialization. One option,
2084             called the C<smart> option, consists of subjecting the data to principal components
2085             analysis to discover the direction of maximum variance in the data space. The data
2086             points are then projected on to this direction and a histogram constructed from the
2087             projections. Centers of the smoothed histogram are used to seed the clustering
2088             operation. The other option is to choose the cluster centers purely randomly. You
2089             get the first option if you set C<cluster_seeding> to C<smart> in the constructor,
2090             and you get the second option if you set it to C<random>.
2091              
2092             How to specify the number of clusters, C<K>, is one of the most vexing issues in any
2093             approach to clustering. In some cases, we can set C<K> on the basis of prior
2094             knowledge. But, more often than not, no such prior knowledge is available. When the
2095             programmer does not explicitly specify a value for C<K>, the approach taken in the
2096             current implementation is to try all possible values between 2 and some largest
2097             possible value that makes statistical sense. We then choose that value for C<K>
2098             which yields the best value for the QoC (Quality of Clustering) metric. It is
2099             generally believed that the largest value for C<K> should not exceed C<sqrt(N/2)>
2100             where C<N> is the number of data samples to be clustered.
2101              
2102             What to use for the QoC metric is obviously a critical issue unto itself. In the
2103             current implementation, the value of QoC is the ratio of the average radius of the
2104             clusters and the average distance between the cluster centers.
2105              
2106             Every iterative algorithm requires a stopping criterion. The criterion implemented
2107             here is that we stop iterations when there is no re-assignment of the data points
2108             during the assignment step.
2109              
2110             Ordinarily, the output produced by a K-Means clusterer will correspond to a local
2111             minimum for the QoC values, as opposed to a global minimum. The current
2112             implementation protects against that when the module constructor is called with the
2113             C<random> option for C<cluster_seeding> by trying different randomly selected initial
2114             cluster centers and then selecting the one that gives the best overall QoC value.
2115              
2116             A K-Means clusterer will generally produce good results if the overlap between the
2117             clusters is minimal and if each cluster exhibits variability that is uniform in all
2118             directions. When the data variability is different along the different directions in
2119             the data space, the results you obtain with a K-Means clusterer may be improved by
2120             first normalizing the data appropriately, as can be done in this module when you set
2121             the C<do_variance_normalization> option in the constructor. However, as pointed out
2122             elsewhere in this documentation, such normalization may actually decrease the
2123             performance of the clusterer if the overall data variability along any dimension is
2124             more a result of separation between the means than a consequence of intra-cluster
2125             variability.
2126              
2127              
2128             =head1 METHODS
2129              
2130             The module provides the following methods for clustering, for cluster visualization,
2131             for data visualization, for the generation of data for testing a clustering
2132             algorithm, and for determining the cluster membership of a new data sample:
2133              
2134             =over 4
2135              
2136             =item B<new()>
2137              
2138             my $clusterer = Algorithm::KMeans->new(datafile => $datafile,
2139             mask => $mask,
2140             K => $K,
2141             cluster_seeding => 'random', # also try 'smart'
2142             use_mahalanobis_metric => 1, # also try '0'
2143             terminal_output => 1,
2144             write_clusters_to_files => 1,
2145             );
2146              
2147             A call to C<new()> constructs a new instance of the C<Algorithm::KMeans> class. When
2148             C<$K> is a non-zero positive integer, the module will construct exactly that many
2149             clusters. However, when C<$K> is 0, the module will find the best number of clusters
2150             to partition the data into. As explained in the Description, setting
2151             C<cluster_seeding> to C<smart> causes PCA (principal components analysis) to be used
2152             for discovering the best choices for the initial cluster centers. If you want purely
2153             random decisions to be made for the initial choices for the cluster centers, set
2154             C<cluster_seeding> to C<random>.
2155              
2156             The data file is expected to contain entries in the following format
2157              
2158             c20 0 10.7087017086940 9.63528386251712 10.9512155258108 ...
2159             c7 0 12.8025925026787 10.6126270065785 10.5228482095349 ...
2160             b9 0 7.60118206283120 5.05889245193079 5.82841781759102 ...
2161             ....
2162             ....
2163              
2164             where the first column contains the symbolic ID tag for each data record and the rest
2165             of the columns the numerical information. As to which columns are actually used for
2166             clustering is decided by the string value of the mask. For example, if we wanted to
2167             cluster on the basis of the entries in just the 3rd, the 4th, and the 5th columns
2168             above, the mask value would be C<N0111> where the character C<N> indicates that the
2169             ID tag is in the first column, the character C<0> that the second column is to be
2170             ignored, and the C<1>'s that follow that the 3rd, the 4th, and the 5th columns are to
2171             be used for clustering.
2172              
2173             If you wish for the clusterer to search through a C<(Kmin,Kmax)> range of values for
2174             C<K>, the constructor should be called in the following fashion:
2175              
2176             my $clusterer = Algorithm::KMeans->new(datafile => $datafile,
2177             mask => $mask,
2178             Kmin => 3,
2179             Kmax => 10,
2180             cluster_seeding => 'smart', # try 'random' also
2181             terminal_output => 1,
2182             );
2183              
2184             where obviously you can choose any reasonable values for C<Kmin> and C<Kmax>. If you
2185             choose a value for C<Kmax> that is statistically too large, the module will let you
2186             know. Again, you may choose either C<random> or C<smart> for the
2187             C<cluster_seeding> option.
2188              
2189             If you believe that the variability of the data is very different along the different
2190             dimensions of the data space, you may get better clustering by first normalizing the
2191             data coordinates by the standard-deviations along those directions. When you set the
2192             constructor option C<do_variance_normalization> as shown below, the module uses the
2193             overall data standard-deviation along a direction for the normalization in that
2194             direction. (As mentioned elsewhere in the documentation, such a normalization could
2195             backfire on you if the data variability along a dimension is more a result of the
2196             separation between the means than a consequence of the intra-cluster variability.):
2197              
2198             my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
2199             mask => "N111",
2200             K => 2,
2201             cluster_seeding => 'smart', # try 'random' also
2202             terminal_output => 1,
2203             do_variance_normalization => 1,
2204             );
2205              
2206             =back
2207              
2208             =head2 Constructor Parameters
2209              
2210             =over 8
2211              
2212             =item C<datafile>:
2213              
2214             This parameter names the data file that contains the multidimensional data records
2215             you want the module to cluster.
2216              
2217             =item C<mask>:
2218              
2219             This parameter supplies the mask to be applied to the columns of your data file. See
2220             the explanation in Synopsis for what this mask looks like.
2221              
2222             =item C<K>:
2223              
2224             This parameter supplies the number of clusters you are looking for. If you set this
2225             option to 0, that means that you want the module to search for the best value for
2226             C<K>. (Keep in mind the fact that searching for the best C<K> may take a long time
2227             for large data files.)
2228              
2229             =item C<Kmin>:
2230              
2231             If you supply an integer value for C<Kmin>, the search for the best C<K> will begin
2232             with that value.
2233              
2234             =item C<Kmax>:
2235              
2236             If you supply an integer value for C<Kmax>, the search for the best C<K> will end at
2237             that value.
2238              
2239             =item C<cluster_seeding>:
2240              
2241             This parameter must be set to either C<random> or C<smart>. Depending on your data,
2242             you may get superior clustering with the C<smart> option. The choice C<smart> means
2243             that the clusterer will (1) subject the data to principal components analysis to
2244             determine the maximum variance direction; (2) project the data onto this direction;
2245             (3) find peaks in a smoothed histogram of the projected points; and (4) use the
2246             locations of the highest peaks as seeds for cluster centers. If the C<smart> option
2247             produces bizarre results, try C<random>.
2248              
2249             =item C<use_mahalanobis_metric>:
2250              
2251             When set to 1, this option causes Mahalanobis distances to be used for clustering.
2252             The default is 0 for this parameter. By default, the module uses the Euclidean
2253             distances for clustering. In general, Mahalanobis distance based clustering will
2254             fail if your data resides on a lower-dimensional hyperplane in the data space, if you
2255             seek too many clusters, and if you do not have a sufficient number of samples in your
2256             data file. A necessary requirement for the module to be able to compute Mahalanobis
2257             distances is that the cluster covariance matrices be non-singular. (Let's say your
2258             data dimensionality is C<D> and the module is considering a cluster that has only
2259             C<M> samples in it where C<M> is less than C<D>. In this case, the covariance matrix
2260             will be singular since its rank will not exceed C<M>. For the covariance matrix to
2261             be non-singular, it must be of full rank, that is, its rank must be C<D>.)
2262              
2263             =item C<do_variance_normalization>:
2264              
2265             When set, the module will first normalize the data variance along the different
2266             dimensions of the data space before attempting clustering. Depending on your data,
2267             this option may or may not result in better clustering.
2268              
2269             =item C<terminal_output>:
2270              
2271             This boolean parameter, when not supplied in the call to C<new()>, defaults to 0.
2272             When set, you will see in your terminal window the different clusters as lists of the
2273             symbolic IDs and their cluster centers. You will also see the QoC (Quality of
2274             Clustering) values for the clusters displayed.
2275              
2276             =item C<write_clusters_to_files>:
2277              
2278             This parameter is also boolean. When set to 1, the clusters are written out to files
2279             that are named in the following manner:
2280              
2281             cluster0.txt
2282             cluster1.txt
2283             cluster2.txt
2284             ...
2285             ...
2286              
2287             Before the clusters are written to these files, the module destroys all files with
2288             such names in the directory in which you call the module.
2289              
2290              
2291             =back
2292              
2293             =over
2294              
2295             =item B<read_data_from_file()>
2296              
2297             $clusterer->read_data_from_file()
2298              
2299             =item B<kmeans()>
2300              
2301             $clusterer->kmeans();
2302              
2303             or
2304              
2305             my ($clusters_hash, $cluster_centers_hash) = $clusterer->kmeans();
2306              
2307             The first call above works solely by side-effect. The second call also returns the
2308             clusters and the cluster centers. See the C<cluster_and_visualize.pl> script in the
2309             C<examples> directory for how you can in your own code extract the clusters and the
2310             cluster centers from the variables C<$clusters_hash> and C<$cluster_centers_hash>.
2311              
2312             =item B<get_K_best()>
2313              
2314             $clusterer->get_K_best();
2315              
2316             This call makes sense only if you supply either the C<K=0> option to the constructor,
2317             or if you specify values for the C<Kmin> and C<Kmax> options. The C<K=0> and the
2318             C<(Kmin,Kmax)> options cause the module to determine the best value for C<K>.
2319             Remember, C<K> is the number of clusters the data is partitioned into.
2320              
2321             =item B<show_QoC_values()>
2322              
2323             $clusterer->show_QoC_values();
2324              
2325             presents a table with C<K> values in the left column and the corresponding QoC
2326             (Quality-of-Clustering) values in the right column. Note that this call makes sense
2327             only if you either supply the C<K=0> option to the constructor, or if you specify
2328             values for the C<Kmin> and C<Kmax> options.
2329              
2330             =item B<visualize_clusters()>
2331              
2332             $clusterer->visualize_clusters( $visualization_mask )
2333              
2334             The visualization mask here does not have to be identical to the one used for
2335             clustering, but must be a subset of that mask. This is convenient for visualizing
2336             the clusters in two- or three-dimensional subspaces of the original space.
2337              
2338             =item B<visualize_data()>
2339              
2340             $clusterer->visualize_data($visualization_mask, 'original');
2341              
2342             $clusterer->visualize_data($visualization_mask, 'normed');
2343              
2344             This method requires a second argument and, as shown, it must be either the string
2345             C<original> or the string C<normed>, the former for the visualization of the raw data
2346             and the latter for the visualization of the data after its different dimensions are
2347             normalized by the standard-deviations along those directions. If you call the method
2348             with the second argument set to C<normed>, but do so without turning on the
2349             C<do_variance_normalization> option in the KMeans constructor, it will let you know.
2350              
2351              
2352             =item B<which_cluster_for_new_data_element()>
2353              
2354             If you wish to determine the cluster membership of a new data sample after you have
2355             created the clusters with the existing data samples, you would need to call this
2356             method. The C<which_cluster_for_new_data.pl> script in the C<examples> directory
2357             shows how to use this method.
2358              
2359              
2360             =item B<which_cluster_for_new_data_element_mahalanobis()>
2361              
2362             This does the same thing as the previous method, except that it determines the
2363             cluster membership using the Mahalanobis distance metric. As for the previous
2364             method, see the C<which_cluster_for_new_data.pl> script in the C<examples> directory
2365             for how to use this method.
2366              
2367              
2368             =item B<cluster_data_generator()>
2369              
2370             Algorithm::KMeans->cluster_data_generator(
2371             input_parameter_file => $parameter_file,
2372             output_datafile => $out_datafile,
2373             number_data_points_per_cluster => 20 );
2374              
2375             for generating multivariate data for clustering if you wish to play with synthetic
2376             data for clustering. The input parameter file contains the means and the variances
2377             for the different Gaussians you wish to use for the synthetic data. See the file
2378             C<param.txt> provided in the examples directory. It will be easiest for you to just
2379             edit this file for your data generation needs. In addition to the format of the
2380             parameter file, the main constraint you need to observe in specifying the parameters
2381             is that the dimensionality of the covariance matrix must correspond to the
2382             dimensionality of the mean vectors. The multivariate random numbers are generated by
2383             calling the C<Math::Random> module. As you would expect, this module requires that
2384             the covariance matrices you specify in your parameter file be symmetric and positive
2385             definite. Should the covariances in your parameter file not obey this condition, the
2386             C<Math::Random> module will let you know.
2387              
2388             =back
2389              
2390             =head1 HOW THE CLUSTERS ARE OUTPUT
2391              
2392             When the option C<terminal_output> is set in the call to the constructor, the
2393             clusters are displayed on the terminal screen.
2394              
2395             When the option C<write_clusters_to_files> is set in the call to the constructor, the
2396             module dumps the clusters in files named
2397              
2398             cluster0.txt
2399             cluster1.txt
2400             cluster2.txt
2401             ...
2402             ...
2403              
2404             in the directory in which you execute the module. The number of such files will
2405             equal the number of clusters formed. All such existing files in the directory are
2406             destroyed before any fresh ones are created. Each cluster file contains the symbolic
2407             ID tags of the data samples in that cluster.
2408              
2409             The module also leaves in your directory a printable `.png' file that is a point plot
2410             of the different clusters. The name of this file is always C<cluster_plot.png>.
2411              
2412             Also, as mentioned previously, a call to C<kmeans()> in your own code will return the
2413             clusters and the cluster centers.
2414              
2415             =head1 REQUIRED
2416              
2417             This module requires the following three modules:
2418              
2419             Math::Random
2420             Graphics::GnuplotIF
2421             Math::GSL
2422              
2423             With regard to the third item above, what is actually required is the
2424             C<Math::GSL::Matrix> module. However, that module is a part of the C<Math::GSL>
2425             distribution. The acronym GSL stands for the GNU Scientific Library. C<Math::GSL> is
2426             a Perl interface to the GSL C-based library.
2427              
2428              
2429             =head1 THE C<examples> DIRECTORY
2430              
2431             The C<examples> directory contains several scripts to help you become familiar with
2432             this module. The following script is an example of how the module can be expected to
2433             be used most of the time. It calls for clustering to be carried out with a fixed
2434             C<K>:
2435              
2436             cluster_and_visualize.pl
2437              
2438             The more time you spend with this script, the more comfortable you will become with
2439             the use of this module. The script file contains a large comment block that mentions
2440             six locations in the script where you have to make decisions about how to use the
2441             module.
2442              
2443             See the following script if you do not know what value to use for C<K> for clustering
2444             your data:
2445              
2446             find_best_K_and_cluster.pl
2447              
2448             This script uses the C<K=0> option in the constructor that causes the module to
2449             search for the best C<K> for your data. Since this search is virtually unbounded ---
2450             limited only by the number of samples in your data file --- the script may take a
2451             long time to run for a large data file. Hence the next script.
2452              
2453             If your datafile is too large, you may need to range limit the values of C<K> that
2454             are searched through, as in the following script:
2455              
2456             find_best_K_in_range_and_cluster.pl
2457              
2458             If you also want to include data normalization (it may reduce the performance of the
2459             clusterer in some cases), see the following script:
2460              
2461             cluster_after_data_normalization.pl
2462              
2463             When you include the data normalization step and you would like to visualize the data
2464             before and after normalization, see the following script:
2465              
2466             cluster_and_visualize_with_data_visualization.pl*
2467              
2468             After you are done clustering, let's say you want to find the cluster membership of a
2469             new data sample. To see how you can do that, see the script:
2470              
2471             which_cluster_for_new_data.pl
2472              
2473             This script returns two answers for which cluster a new data sample belongs to: one
2474             using the Euclidean metric to calculate the distances between the new data sample and
2475             the cluster centers, and the other using the Mahalanobis metric. If the clusters are
2476             strongly elliptical in shape, you are likely to get better results with the
2477             Mahalanobis metric. (To see that you can get two different answers using the two
2478             different distance metrics, run the C<which_cluster_for_new_data.pl> script on the
2479             data in the file C<mydatafile3.dat>. To make this run, note that you have to comment
2480             out and uncomment the lines at four different locations in the script.)
2481              
2482             The C<examples> directory also contains the following support scripts:
2483              
2484             For generating the data for experiments with clustering:
2485              
2486             data_generator.pl
2487              
2488             For cleaning up the examples directory:
2489              
2490             cleanup_directory.pl
2491              
2492             The examples directory also includes a parameter file, C<param.txt>, for generating
2493             synthetic data for clustering. Just edit this file if you would like to generate
2494             your own multivariate data for clustering. The parameter file is for the 3D case,
2495             but you can generate data with any dimensionality through appropriate entries in the
2496             parameter file.
2497              
2498             =head1 EXPORT
2499              
2500             None by design.
2501              
2502             =head1 CAVEATS
2503              
2504             K-Means based clustering usually does not work well when the clusters are strongly
2505             overlapping and when the extent of variability along the different dimensions is
2506             different for the different clusters. The module does give you the ability to
2507             normalize the variability in your data with the constructor option
2508             C<do_variance_normalization>. However, as described elsewhere, this may actually
2509             reduce the performance of the clusterer if the data variability along a direction is
2510             more a result of the separation between the means than because of intra-cluster
2511             variability. For better clustering with difficult-to-cluster data, you could try
2512             using the author's C<Algorithm::ExpectationMaximization> module.
2513              
2514             =head1 BUGS
2515              
2516             Please notify the author if you encounter any bugs. When sending email, please place
2517             the string 'KMeans' in the subject line.
2518              
2519             =head1 INSTALLATION
2520              
2521             Download the archive from CPAN in any directory of your choice. Unpack the archive
2522             with a command that on a Linux machine would look like:
2523              
2524             tar zxvf Algorithm-KMeans-2.04.tar.gz
2525              
2526             This will create an installation directory for you whose name will be
2527             C<Algorithm-KMeans-2.04>. Enter this directory and execute the following commands
2528             for a standard install of the module if you have root privileges:
2529              
2530             perl Makefile.PL
2531             make
2532             make test
2533             sudo make install
2534              
2535             If you do not have root privileges, you can carry out a non-standard install of the
2536             module in any directory of your choice by:
2537              
2538             perl Makefile.PL prefix=/some/other/directory/
2539             make
2540             make test
2541             make install
2542              
2543             With a non-standard install, you may also have to set your PERL5LIB environment
2544             variable so that this module can find the required other modules. How you do that
2545             would depend on what platform you are working on. In order to install this module in
2546             a Linux machine on which I use tcsh for the shell, I set the PERL5LIB environment
2547             variable by
2548              
2549             setenv PERL5LIB /some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
2550              
2551             If I used bash, I'd need to declare:
2552              
2553             export PERL5LIB=/some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
2554              
2555              
2556             =head1 THANKS
2557              
2558             I thank Slaven for pointing out that I needed to downshift the required version of Perl
2559             for this module. Fortunately, I had access to an old machine still running Perl
2560             5.10.1. The current version, 2.02, is based on my testing the module on that machine.
2561              
2562             I added two C<which_cluster_for_new_data()> methods in Version 2.0 as a result of an email from
2563             Jerome White who expressed a need for such methods in order to determine the best
2564             cluster for a new data record after you have successfully clustered your existing
2565             data. Thanks Jerome for your feedback!
2566              
2567             It was an email from Nadeem Bulsara that prompted me to create Version 1.40 of this
2568             module. Working with Version 1.30, Nadeem noticed that occasionally the module would
2569             produce variable clustering results on the same dataset. I believe that this
2570             variability was caused (at least partly) by the purely random mode that was used in
2571             Version 1.30 for the seeding of the cluster centers. Version 1.40 now includes a
2572             C<smart> mode. With the new mode the clusterer uses a PCA (Principal Components
2573             Analysis) of the data to make good guesses for the cluster centers. However,
2574             depending on how the data is jumbled up, it is possible that the new mode will not
2575             produce uniformly good results in all cases. So you can still use the old mode by
2576             setting C<cluster_seeding> to C<random> in the constructor. Thanks Nadeem for your
2577             feedback!
2578              
2579             Version 1.30 resulted from Martin Kalin reporting problems with a very small data
2580             set. Thanks Martin!
2581              
2582             Version 1.21 came about in response to the problems encountered by Luis Fernando
2583             D'Haro with version 1.20. Although the module would yield the clusters for some of
2584             its runs, more frequently than not the module would abort with an "empty cluster"
2585             message for his data. Luis Fernando has also suggested other improvements (such as
2586             clustering directly from the contents of a hash) that I intend to make in future
2587             versions of this module. Thanks Luis Fernando.
2588              
2589             Chad Aeschliman was kind enough to test out the interface of this module and to give
2590             suggestions for its improvement. His key slogan: "If you cannot figure out how to
2591             use a module in under 10 minutes, it's not going to be used." That should explain
2592             the longish Synopsis included here.
2593              
2594             =head1 AUTHOR
2595              
2596             Avinash Kak, kak@purdue.edu
2597              
2598             If you send email, please place the string "KMeans" in your subject line to get past
2599             my spam filter.
2600              
2601             =head1 COPYRIGHT
2602              
2603             This library is free software; you can redistribute it and/or modify it under the
2604             same terms as Perl itself.
2605              
2606             Copyright 2014 Avinash Kak
2607              
2608             =cut
2609