File Coverage

blib/lib/Algorithm/KMeans.pm
Criterion Covered Total %
statement 19 21 90.4
branch n/a
condition n/a
subroutine 7 7 100.0
pod n/a
total 26 28 92.8


line stmt bran cond sub pod time code
1             package Algorithm::KMeans;
2              
3             #------------------------------------------------------------------------------------
4             # Copyright (c) 2014 Avinash Kak. All rights reserved. This program is free
5             # software. You may modify and/or distribute it under the same terms as Perl itself.
6             # This copyright notice must remain attached to the file.
7             #
8             # Algorithm::KMeans is a Perl module for clustering multidimensional data.
9             # -----------------------------------------------------------------------------------
10              
11             #use 5.10.0;
12 1     1   20289 use strict;
  1         2  
  1         44  
13 1     1   6 use warnings;
  1         1  
  1         43  
14 1     1   8 use Carp;
  1         6  
  1         109  
15 1     1   7 use File::Basename;
  1         2  
  1         107  
16 1     1   754 use Math::Random;
  1         13847  
  1         138  
17 1     1   817 use Graphics::GnuplotIF;
  1         14462  
  1         78  
18 1     1   1334 use Math::GSL::Matrix;
  0            
  0            
19              
20              
21             our $VERSION = '2.05';
22              
23             # from Perl docs:
24             my $_num_regex = '^[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?$';
25              
26             # Constructor:
27             sub new {
28             my ($class, %args) = @_;
29             my @params = keys %args;
30             croak "\nYou have used a wrong name for a keyword argument " .
31             "--- perhaps a misspelling\n"
32             if check_for_illegal_params(@params) == 0;
33             bless {
34             _datafile => $args{datafile} || croak("datafile required"),
35             _mask => $args{mask} || croak("mask required"),
36             _K => $args{K} || 0,
37             _K_min => $args{Kmin} || 'unknown',
38             _K_max => $args{Kmax} || 'unknown',
39             _cluster_seeding => $args{cluster_seeding} || croak("must choose smart or random ".
40             "for cluster seeding"),
41             _var_normalize => $args{do_variance_normalization} || 0,
42             _use_mahalanobis_metric => $args{use_mahalanobis_metric} || 0,
43             _clusters_2_files => $args{write_clusters_to_files} || 0,
44             _terminal_output => $args{terminal_output} || 0,
45             _debug => $args{debug} || 0,
46             _N => 0,
47             _K_best => 'unknown',
48             _original_data => {},
49             _data => {},
50             _data_id_tags => [],
51             _QoC_values => {},
52             _clusters => [],
53             _cluster_centers => [],
54             _clusters_hash => {},
55             _cluster_centers_hash => {},
56             _cluster_covariances_hash => {},
57             _data_dimensions => 0,
58              
59             }, $class;
60             }
61              
62             sub read_data_from_file {
63             my $self = shift;
64             my $filename = $self->{_datafile};
65             $self->read_data_from_file_csv() if $filename =~ /.csv$/;
66             $self->read_data_from_file_dat() if $filename =~ /.dat$/;
67             }
68              
69             sub read_data_from_file_csv {
70             my $self = shift;
71             my $numregex = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
72             my $filename = $self->{_datafile} || die "you did not specify a file with the data to be clustered";
73             my $mask = $self->{_mask};
74             my @mask = split //, $mask;
75             $self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
76             print "data dimensionality: $self->{_data_dimensions} \n"if $self->{_terminal_output};
77             open FILEIN, $filename or die "Unable to open $filename: $!";
78             die("Aborted. get_training_data_csv() is only for CSV files") unless $filename =~ /\.csv$/;
79             local $/ = undef;
21             80             my @all_data = split /\s+/, <FILEIN>;
81             my %data_hash = ();
82             my @data_tags = ();
83             foreach my $record (@all_data) {
84             my @splits = split /,/, $record;
85             die "\nYour mask size (including `N' and 1's and 0's) does not match\n" .
86             "the size of at least one of the data records in the file: $!"
87             unless scalar(@mask) == scalar(@splits);
88             my $record_name = shift @splits;
89             $data_hash{$record_name} = \@splits;
90             push @data_tags, $record_name;
91             }
92             $self->{_original_data} = \%data_hash;
93             $self->{_data_id_tags} = \@data_tags;
94             $self->{_N} = scalar @data_tags;
95             if ($self->{_var_normalize}) {
96             $self->{_data} = variance_normalization( $self->{_original_data} );
97             } else {
98             $self->{_data} = deep_copy_hash( $self->{_original_data} );
99             }
100             # Need to make the following call to set the global mean and covariance:
101             # my $covariance = $self->estimate_mean_and_covariance(\@data_tags);
102             # Need to make the following call to set the global eigenvec eigenval sets:
103             # $self->eigen_analysis_of_covariance($covariance);
104             if ( defined($self->{_K}) && ($self->{_K} > 0) ) {
105             carp "\n\nWARNING: YOUR K VALUE IS TOO LARGE.\n The number of data " .
106             "points must satisfy the relation N > 2xK**2 where K is " .
107             "the number of clusters requested for the clusters to be " .
108             "meaningful $!"
109             if ( $self->{_N} < (2 * $self->{_K} ** 2) );
110             print "\n\n\n";
111             }
112             }
113              
114             sub read_data_from_file_dat {
115             my $self = shift;
116             my $datafile = $self->{_datafile};
117             my $mask = $self->{_mask};
118             my @mask = split //, $mask;
119             $self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
120             print "data dimensionality: $self->{_data_dimensions} \n"if $self->{_terminal_output};
121             open INPUT, $datafile or die "unable to open file $datafile: $!\n";
114             122             chomp( my @raw_data = <INPUT> );
123             close INPUT;
124             # Transform strings into number data
125             foreach my $record (@raw_data) {
126             next unless $record;
127             next if $record =~ /^#/;
128             my @data_fields;
129             my @fields = split /\s+/, $record;
130             die "\nABORTED: Mask size does not correspond to row record size\n" if $#fields != $#mask;
131             my $record_id;
132             foreach my $i (0..@fields-1) {
133             if ($mask[$i] eq '0') {
134             next;
135             } elsif ($mask[$i] eq 'N') {
136             $record_id = $fields[$i];
137             } elsif ($mask[$i] eq '1') {
138             push @data_fields, $fields[$i];
139             } else {
140             die "misformed mask for reading the data file\n";
141             }
142             }
143             my @nums = map {/$_num_regex/;$_} @data_fields;
144             $self->{_original_data}->{ $record_id } = \@nums;
145             }
146             if ($self->{_var_normalize}) {
147             $self->{_data} = variance_normalization( $self->{_original_data} );
148             } else {
149             $self->{_data} = deep_copy_hash( $self->{_original_data} );
150             }
151             my @all_data_ids = keys %{$self->{_data}};
152             $self->{_data_id_tags} = \@all_data_ids;
153             $self->{_N} = scalar @all_data_ids;
154             if ( defined($self->{_K}) && ($self->{_K} > 0) ) {
155             carp "\n\nWARNING: YOUR K VALUE IS TOO LARGE.\n The number of data " .
156             "points must satisfy the relation N > 2xK**2 where K is " .
157             "the number of clusters requested for the clusters to be " .
158             "meaningful $!"
159             if ( $self->{_N} < (2 * $self->{_K} ** 2) );
160             print "\n\n\n";
161             }
162             }
163              
164             sub kmeans {
165             my $self = shift;
166             my $K = $self->{_K};
167             if ( ($K == 0) ||
168             ( ($self->{_K_min} ne 'unknown') &&
169             ($self->{_K_max} ne 'unknown') ) ) {
170             eval {
171             $self->iterate_through_K();
172             };
173             die "in kmeans(): $@" if ($@);
174             } elsif ( $K =~ /\d+/) {
175             eval {
176             $self->cluster_for_fixed_K_multiple_random_tries($K)
177             if $self->{_cluster_seeding} eq 'random';
178             $self->cluster_for_fixed_K_single_smart_try($K)
179             if $self->{_cluster_seeding} eq 'smart';
180             };
181             die "in kmeans(): $@" if ($@);
182             } else {
183             croak "Incorrect call syntax used. See documentation.\n";
184             }
185             if ((defined $self->{_clusters}) && (defined $self->{_cluster_centers})){
186             foreach my $i (0..@{$self->{_clusters}}-1) {
187             $self->{_clusters_hash}->{"cluster$i"} = $self->{_clusters}->[$i];
188             $self->{_cluster_centers_hash}->{"cluster$i"} = $self->{_cluster_centers}->[$i];
189             $self->{_cluster_covariances_hash}->{"cluster$i"} =
190             $self->estimate_cluster_covariance($self->{_clusters}->[$i]);
191             }
192             $self->write_clusters_to_files( $self->{_clusters} ) if $self->{_clusters_2_files};
193             return ($self->{_clusters_hash}, $self->{_cluster_centers_hash});
194             } else {
195             croak "Clustering failed. Try again.";
196             }
197             }
198              
199             # This is the subroutine to call if you prefer purely random choices for the initial
200             # seeding of the cluster centers.
201             sub cluster_for_fixed_K_multiple_random_tries {
202             my $self = shift;
203             my $K = shift;
204             my @all_data_ids = @{$self->{_data_id_tags}};
205             my $QoC;
206             my @QoC_values;
207             my @array_of_clusters;
208             my @array_of_cluster_centers;
209             my $clusters;
210             my $new_clusters;
211             my $cluster_centers;
212             my $new_cluster_centers;
213             print "Clustering for K = $K\n" if $self->{_terminal_output};
214             foreach my $trial (1..20) {
215             print ". ";
216             my ($new_clusters, $new_cluster_centers);
217             eval {
218             ($new_clusters, $new_cluster_centers) = $self->cluster_for_given_K($K);
219             };
220             next if $@;
221             next if @$new_clusters <= 1;
222             my $newQoC = $self->cluster_quality( $new_clusters, $new_cluster_centers );
223             if ( (!defined $QoC) || ($newQoC < $QoC) ) {
224             $QoC = $newQoC;
225             $clusters = deep_copy_AoA( $new_clusters );
226             $cluster_centers = deep_copy_AoA( $new_cluster_centers );
227             }
228             }
229             die "\n\nThe constructor options you have chosen do not work with the data. Try\n" .
230             "turning off the Mahalanobis option if you are using it.\n"
231             unless defined $clusters;
232             $self->{_clusters} = $clusters;
233             $self->{_cluster_centers} = $cluster_centers;
234             $self->{_QoC_values}->{"$K"} = $QoC;
235             if ($self->{_terminal_output}) {
236             print "\nDisplaying final clusters for best K (= $K) :\n";
237             display_clusters( $clusters );
238             $self->display_cluster_centers( $clusters );
239             print "\nQoC value (the smaller, the better): $QoC\n";
240             }
241             }
242              
243             # For the smart try, we do initial cluster seeding by subjecting the data to
244             # principal components analysis in order to discover the direction of maximum
245             # variance in the data space. Subsequently, we try to find the K largest peaks along
246             # this direction. The coordinates of these peaks serve as the seeds for the K
247             # clusters.
248             sub cluster_for_fixed_K_single_smart_try {
249             my $self = shift;
250             my $K = shift;
251             my @all_data_ids = @{$self->{_data_id_tags}};
252             print "Clustering for K = $K\n" if $self->{_terminal_output};
253             my ($clusters, $cluster_centers);
254             eval {
255             ($clusters, $cluster_centers) = $self->cluster_for_given_K($K);
256             };
257             if ($@) {
258             die "In cluster_for_fixed_K_single_smart_try: insufficient data for clustering " .
259             "with $self->{_K} clusters --- $@";
260             }
261             my $QoC = $self->cluster_quality( $clusters, $cluster_centers );
262             $self->{_clusters} = $clusters;
263             $self->{_cluster_centers} = $cluster_centers;
264             $self->{_QoC_values}->{"$K"} = $QoC;
265             if ($self->{_terminal_output}) {
266             print "\nDisplaying final clusters for best K (= $K) :\n";
267             display_clusters( $clusters );
268             $self->display_cluster_centers( $clusters );
269             print "\nQoC value (the smaller, the better): $QoC\n";
270             }
271             }
272              
273             # The following subroutine is the top-level routine to call if you want the system to
274             # figure out on its own what value to use for K, the number of clusters. If you call
275             # the KMeans constructor with the K=0 option, the method below will try every
276             # possible value of K between 2 and the maximum possible depending on the number of
277             # data points available. For example, if the number of data points is 10,000, it will
278             # try all possible values of K between 2 and 70. For how the maximum value is set for
279             # K, see the comments made under Description. Note also how this method makes 20
280             # different tries for each value of K as a defense against the problem of the final
281             # result corresponding to some local minimum in the values of the QoC metric. Out of
282             # these 20 tries for each K, it retains the clusters and the cluster centers for only
283             # that try that yields the smallest value for the QoC metric. After estimating the
284             # "best" QoC values for all possible K in this manner, it then finds the K for which
285             # the QoC is the minimum. This is taken to be the best value for K. Finally, the
286             # output clusters are written out to separate files.
287             #
288             # If the KMeans constructor is invoked with the (Kmin, Kmax) options, then, instead
289             # of iterating through 2 and the maximum permissible value for K, the iterations are
290             # carried out only between Kmin and Kmax.
291             sub iterate_through_K {
292             my $self = shift;
293             my @all_data_ids = @{$self->{_data_id_tags}};
294             my $N = $self->{_N};
295             croak "You need more than 8 data samples. The number of data points must satisfy " .
296             "the relation N > 2xK**2 where K is the number of clusters. The smallest " .
297             "value for K is 2.\n" if $N <= 8;
298             my $K_statistical_max = int( sqrt( $N / 2.0 ) );
299             my $Kmin = $self->{_K_min} eq 'unknown'
300             ? 2
301             : $self->{_K_min};
302             my $Kmax = $self->{_K_max} eq 'unknown'
303             ? int( sqrt( $N / 2.0 ) )
304             : $self->{_K_max};
305             croak "\n\nYour Kmax value is too high. Ideally, it should not exceed sqrt(N/2) " .
306             "where N is the number of data points" if $Kmax > $K_statistical_max;
307             print "Value of Kmax is: $Kmax\n" if $self->{_terminal_output};
308             my @QoC_values;
309             my @array_of_clusters;
310             my @array_of_cluster_centers;
311             foreach my $K ($Kmin..$Kmax) {
312             my $QoC;
313             my $clusters;
314             my $cluster_centers;
315             print "Clustering for K = $K\n" if $self->{_terminal_output};
316             if ($self->{_cluster_seeding} eq 'random') {
317             foreach my $trial (1..21) {
318             print ". ";
319             my ($new_clusters, $new_cluster_centers);
320             if ($self->{_use_mahalanobis_metric}) {
321             eval {
322             ($new_clusters, $new_cluster_centers) = $self->cluster_for_given_K($K);
323             };
324             next if $@;
325             } else {
326             ($new_clusters, $new_cluster_centers) = $self->cluster_for_given_K($K);
327             }
328             my $newQoC = $self->cluster_quality( $new_clusters, $new_cluster_centers );
329             if ( (!defined $QoC) || ($newQoC < $QoC) ) {
330             $QoC = $newQoC;
331             $clusters = deep_copy_AoA( $new_clusters );
332             $cluster_centers = deep_copy_AoA( $new_cluster_centers );
333             }
334             }
335             print "\n";
336             } elsif ($self->{_cluster_seeding} eq 'smart') {
337             eval {
338             ($clusters, $cluster_centers) = $self->cluster_for_given_K($K);
339             };
340             if ($@) {
341             $Kmax = $K - 1;
342             last;
343             }
344             $QoC = $self->cluster_quality($clusters,$cluster_centers);
345             } else {
346             die "You must either choose 'smart' for cluster_seeding or 'random'. " .
347             "Fix your constructor call."
348             }
349             push @QoC_values, $QoC;
350             push @array_of_clusters, $clusters;
351             push @array_of_cluster_centers, $cluster_centers;
352             }
353             my ($min, $max) = minmax( \@QoC_values );
354             my $K_best_relative_to_Kmin = get_index_at_value($min, \@QoC_values );
355             my $K_best = $K_best_relative_to_Kmin + $Kmin;
356             if ($self->{_terminal_output}) {
357             print "\nDisplaying final clusters for best K (= $K_best) :\n";
358             display_clusters( $array_of_clusters[$K_best_relative_to_Kmin] );
359             $self->display_cluster_centers($array_of_clusters[$K_best_relative_to_Kmin]);
360             print "\nBest clustering achieved for K=$K_best with QoC = $min\n" if defined $min;
361             my @printableQoC = grep {$_} @QoC_values;
362             print "\nQoC values array (the smaller the value, the better it is) for different " .
363             "K starting with K=$Kmin: @printableQoC\n";
364             }
365             $self->{_K_best} = $K_best;
366             foreach my $i (0..@QoC_values-1) {
367             my $k = $i + $Kmin;
368             $self->{_QoC_values}->{"$k"} = $QoC_values[$i];
369             }
370             $self->{_clusters} = $array_of_clusters[$K_best_relative_to_Kmin];
371             $self->{_cluster_centers} =
372             $array_of_cluster_centers[$K_best_relative_to_Kmin];
373             }
374              
375             # This is the function to call if you already know what value you want to use for K,
376             # the number of expected clusters. The purpose of this function is to do the
377             # initialization of the cluster centers and to carry out the initial assignment of
378             # the data to the clusters with the initial cluster centers. The initialization
379             # consists of 3 steps: Construct a random sequence of K integers between 0 and N-1
380             # where N is the number of data points to be clustered; 2) Call
381             # get_initial_cluster_centers() to index into the data array with the random integers
382             # to get a list of K data points that would serve as the initial cluster centers; and
383             # (3) Call assign_data_to_clusters_initial() to assign the rest of the data to each
384             # of the K clusters on the basis of the proximity to the cluster centers.
385             sub cluster_for_given_K {
386             my $self = shift;
387             my $K = shift;
388             my @all_data_ids = @{$self->{_data_id_tags}};
389             my $cluster_centers;
390             if ($self->{_cluster_seeding} eq 'smart') {
391             $cluster_centers = $self->get_initial_cluster_centers_smart($K);
392             } elsif ($self->{_cluster_seeding} eq 'random') {
393             $cluster_centers = $self->get_initial_cluster_centers($K);
394             } else {
395             die "You must either choose 'smart' for cluster_seeding or 'random'. " .
396             "Fix your constructor call."
397             }
398             my $clusters;
399             if ($self->{_use_mahalanobis_metric}) {
400             my $clusters_and_determinants =
401             $self->assign_data_to_clusters_initial_mahalanobis($cluster_centers);
402             $clusters = $clusters_and_determinants->[0];
403             my @determinants = @{$clusters_and_determinants->[1]};
404             my ($min,$max) = minmax(\@determinants);
405             die "In cluster_for_given_K(): min determinant of covariance matrix for at " .
406             "least one cluster is too small" if $min / $max < 0.001;
407             } else {
408             $clusters = $self->assign_data_to_clusters_initial($cluster_centers);
409             }
410             my $cluster_nonexistent_flag = 0;
411             foreach my $trial (0..2) {
412             if ($self->{_use_mahalanobis_metric}) {
413             ($clusters, $cluster_centers) = $self->assign_data_to_clusters_mahalanobis($clusters, $K);
414             } else {
415             ($clusters, $cluster_centers) = $self->assign_data_to_clusters( $clusters, $K );
416             }
417             my $num_of_clusters_returned = @$clusters;
418             foreach my $cluster (@$clusters) {
419             $cluster_nonexistent_flag = 1 if ((!defined $cluster) || (@$cluster == 0));
420             }
421             last unless $cluster_nonexistent_flag;
422             }
423             return ($clusters, $cluster_centers);
424             }
425              
426             # This function is used when you set the "cluster_seeding" option to 'random' in the
427             # constructor. Returns a set of K random integers. These serve as indices to reach
428             # into the data array. A data element whose index is one of the random numbers
429             # returned by this routine serves as an initial cluster center. Note the quality
430             # check it runs on the list of K random integers constructed. We first make sure
431             # that all K random integers are different. Subsequently, we carry out a quality
432             # assessment of the K random integers constructed. This quality measure consists of
433             # the ratio of the values spanned by the random integers to the value of N, the total
434             # number of data points to be clustered. Currently, if this ratio is less than 0.3,
435             # we discard the K integers and try again.
436             sub initialize_cluster_centers {
437             my $self = shift;
438             my $K = shift;
439             my $data_store_size = $self->{_N};
440             my @cluster_center_indices;
441             while (1) {
442             foreach my $i (0..$K-1) {
443             $cluster_center_indices[$i] = int rand( $data_store_size );
444             next if $i == 0;
445             foreach my $j (0..$i-1) {
446             while ( $cluster_center_indices[$j] ==
447             $cluster_center_indices[$i] ) {
448             my $old = $cluster_center_indices[$i];
449             $cluster_center_indices[$i] = int rand($data_store_size);
450             }
451             }
452             }
453             my ($min,$max) = minmax(\@cluster_center_indices );
454             my $quality = ($max - $min) / $data_store_size;
455             last if $quality > 0.3;
456             }
457             return @cluster_center_indices;
458             }
459              
460             # This function is used when you set the "cluster_seeding" option to 'random' in the
461             # constructor. This routine merely reaches into the data array with the random
462             # integers, as constructed by the previous routine, serving as indices and fetching
463             # values corresponding to those indices. The fetched data samples serve as the
464             # initial cluster centers.
465             sub get_initial_cluster_centers {
466             my $self = shift;
467             my $K = shift;
468             my @cluster_center_indices = $self->initialize_cluster_centers($K);
469             my @result;
470             foreach my $i (@cluster_center_indices) {
471             my $tag = $self->{_data_id_tags}[$i];
472             push @result, $self->{_data}->{$tag};
473             }
474             return \@result;
475             }
476              
477             # This method is invoked when you choose the 'smart' option for the "cluster_seeding"
478             # option in the constructor. It subjects the data to a principal components analysis
479             # to figure out the direction of maximal variance. Subsequently, it tries to locate
480             # K peaks in a smoothed histogram of the data points projected onto the maximal
481             # variance direction.
482             sub get_initial_cluster_centers_smart {
483             my $self = shift;
484             my $K = shift;
485             if ($self->{_data_dimensions} == 1) {
486             my @one_d_data;
487             foreach my $j (0..$self->{_N}-1) {
488             my $tag = $self->{_data_id_tags}[$j];
489             push @one_d_data, $self->{_data}->{$tag}->[0];
490             }
491             my @peak_points = find_peak_points_in_given_direction(\@one_d_data,$K);
492             print "highest points at data values: @peak_points\n" if $self->{_debug};
493             my @cluster_centers;
494             foreach my $peakpoint (@peak_points) {
495             push @cluster_centers, [$peakpoint];
496             }
497             return \@cluster_centers;
498             }
499             my ($num_rows,$num_cols) = ($self->{_data_dimensions},$self->{_N});
500             my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
501             my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
502             # All the record labels are stored in the array $self->{_data_id_tags}. The
503             # actual data for clustering is stored in a hash at $self->{_data} whose keys are
504             # the record labels; the value associated with each key is the array holding the
505             # corresponding numerical multidimensional data.
506             foreach my $j (0..$num_cols-1) {
507             my $tag = $self->{_data_id_tags}[$j];
508             my $data = $self->{_data}->{$tag};
509             $matrix->set_col($j, $data);
510             }
511             if ($self->{_debug}) {
512             print "\nDisplaying the original data as a matrix:";
513             display_matrix( $matrix );
514             }
515             foreach my $j (0..$num_cols-1) {
516             $mean_vec += $matrix->col($j);
517             }
518             $mean_vec *= 1.0 / $num_cols;
519             if ($self->{_debug}) {
520             print "Displaying the mean vector for the data:";
521             display_matrix( $mean_vec );
522             }
523             foreach my $j (0..$num_cols-1) {
524             my @new_col = ($matrix->col($j) - $mean_vec)->as_list;
525             $matrix->set_col($j, \@new_col);
526             }
527             if ($self->{_debug}) {
528             print "Displaying mean subtracted data as a matrix:";
529             display_matrix( $matrix );
530             }
531             my $transposed = transpose( $matrix );
532             if ($self->{_debug}) {
533             print "Displaying transposed data matrix:";
534             display_matrix( $transposed );
535             }
536             my $covariance = matrix_multiply( $matrix, $transposed );
537             $covariance *= 1.0 / $num_cols;
538             if ($self->{_debug}) {
539             print "\nDisplaying the Covariance Matrix for your data:";
540             display_matrix( $covariance );
541             }
542             my ($eigenvalues, $eigenvectors) = $covariance->eigenpair;
543             my $num_of_eigens = @$eigenvalues;
544             my $largest_eigen_index = 0;
545             my $smallest_eigen_index = 0;
546             print "Eigenvalue 0: $eigenvalues->[0]\n" if $self->{_debug};
547             foreach my $i (1..$num_of_eigens-1) {
548             $largest_eigen_index = $i if $eigenvalues->[$i] > $eigenvalues->[$largest_eigen_index];
549             $smallest_eigen_index = $i if $eigenvalues->[$i] < $eigenvalues->[$smallest_eigen_index];
550             print "Eigenvalue $i: $eigenvalues->[$i]\n" if $self->{_debug};
551             }
552             print "\nlargest eigen index: $largest_eigen_index\n" if $self->{_debug};
553             print "\nsmallest eigen index: $smallest_eigen_index\n\n" if $self->{_debug};
554             foreach my $i (0..$num_of_eigens-1) {
555             my @vec = $eigenvectors->[$i]->as_list;
556             print "Eigenvector $i: @vec\n" if $self->{_debug};
557             }
558             my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list;
559             print "\nLargest eigenvector: @largest_eigen_vec\n" if $self->{_debug};
560             my @max_var_direction;
561             # Each element of the array @largest_eigen_vec is a Math::Complex object
562             foreach my $k (0..@largest_eigen_vec-1) {
563             my ($mag, $theta) = $largest_eigen_vec[$k] =~ /\[(\d*\.\d+),(\S+)\]/;
564             if ($theta eq '0') {
565             $max_var_direction[$k] = $mag;
566             } elsif ($theta eq 'pi') {
567             $max_var_direction[$k] = -1.0 * $mag;
568             } else {
569             die "eigendecomposition of covariance matrix produced a complex " .
570             "eigenvector --- something is wrong";
571             }
572             }
573             print "\nMaximum Variance Direction: @max_var_direction\n\n" if $self->{_debug};
574             # We now project all data points on the largest eigenvector.
575             # Each projection will yield a single point on the eigenvector.
576             my @projections;
577             foreach my $j (0..$self->{_N}-1) {
578             my $tag = $self->{_data_id_tags}[$j];
579             my $data = $self->{_data}->{$tag};
580             die "Dimensionality of the largest eigenvector does not "
581             . "match the dimensionality of the data"
582             unless @max_var_direction == $self->{_data_dimensions};
583             my $projection = vector_multiply($data, \@max_var_direction);
584             push @projections, $projection;
585             }
586             print "All projection points: @projections\n" if $self->{_debug};
587             my @peak_points = find_peak_points_in_given_direction(\@projections, $K);
588             print "highest points at points along largest eigenvec: @peak_points\n" if $self->{_debug};
589             my @cluster_centers;
590             foreach my $peakpoint (@peak_points) {
591             my @actual_peak_coords = map {$peakpoint * $_} @max_var_direction;
592             push @cluster_centers, \@actual_peak_coords;
593             }
594             return \@cluster_centers;
595             }
596              
# This function is invoked when you choose the 'smart' option for the "cluster_seeding"
# option in the constructor.  It locates $how_many peaks in a smoothed histogram of
# the data points projected onto the maximal variance direction, and returns the
# unscaled 1-D locations of those peaks along that direction.
sub find_peak_points_in_given_direction {
    my $dataref  = shift;       # ref to array of 1-D projection values
    my $how_many = shift;       # number of peaks wanted (K)
    my @data = @$dataref;
    my ($min, $max) = minmax(\@data);
    my @sorted_data = sort {$a <=> $b} @data;
    my $scale = $max - $min;
    # Guard against a degenerate projection (all points identical), which would
    # otherwise cause a division by zero below:
    die "All projection points are identical --- cannot locate peaks" if $scale == 0;
    # Normalize every data point into the unit interval [0,1].  BUGFIX: the loop
    # previously ran 0..$#sorted_data-1, which left the largest point unnormalized
    # and therefore excluded from the histogram built below.
    foreach my $index (0..$#sorted_data) {
        $sorted_data[$index] = ($sorted_data[$index] - $min) / $scale;
    }
    # Build a 1000-cell histogram over [0,1]; each point votes into a +/- 40 cell
    # window around its own cell, which smooths the histogram.
    my $delta = 1.0 / 1000.0;
    my @accumulator = (0) x 1000;
    foreach my $point (@sorted_data) {
        my $cell_index = int($point / $delta);
        my $smoothness = 40;
        for my $cell ($cell_index-$smoothness..$cell_index+$smoothness) {
            next if $cell < 0 || $cell > 999;
            $accumulator[$cell]++;
        }
    }
    my $peaks_array = non_maximum_suppression( \@accumulator );
    my $peaks_index_hash = get_value_index_hash( $peaks_array );
    # Walk the surviving peaks from highest to lowest and keep the first K, mapping
    # each histogram cell back to the original (unscaled) coordinate system:
    my @K_highest_peak_locations;
    my $k = 0;
    foreach my $peak (sort {$b <=> $a} keys %$peaks_index_hash) {
        my $unscaled_peak_point = $min + $peaks_index_hash->{$peak} * $scale * $delta;
        push @K_highest_peak_locations, $unscaled_peak_point if $k < $how_many;
        last if ++$k == $how_many;
    }
    return @K_highest_peak_locations;
}
639              
# Form the initial clusters: each data sample (identified by its symbolic tag) is
# assigned to whichever of the seed cluster centers it is closest to in the
# Euclidean sense.  Returns a ref to an array of clusters, each cluster being an
# array of data tags.
sub assign_data_to_clusters_initial {
    my ($self, $centers_ref) = @_;
    my @clusters;
    for my $tag (@{$self->{_data_id_tags}}) {
        my @distances = map { $self->distance($tag, $_) } @$centers_ref;
        my (undef, $winner) = minimum( \@distances );
        push @{ $clusters[$winner] }, $tag;
    }
    return \@clusters;
}
658              
# Mahalanobis variant of assign_data_to_clusters_initial(): forms the initial
# clusters by nearest Euclidean center, then estimates a covariance matrix (and its
# determinant) for each cluster.  Since a seed center may attract no samples at all,
# a slot of the cluster array can come out undefined --- that is fatal here because
# no covariance can be estimated for an empty cluster.
# Returns [\@clusters, \@determinants].
sub assign_data_to_clusters_initial_mahalanobis {
    my ($self, $centers_ref) = @_;
    my @clusters;
    for my $tag (@{$self->{_data_id_tags}}) {
        my @distances = map { $self->distance($tag, $_) } @$centers_ref;
        my (undef, $winner) = minimum( \@distances );
        push @{ $clusters[$winner] }, $tag if defined $winner;
    }
    my @determinants;
    for my $cluster (@clusters) {
        die "The clustering program started with bad initialization. Please start over"
            unless defined $cluster;
        my $covariance = $self->estimate_cluster_covariance($cluster);
        push @determinants, $covariance->det();
    }
    return [\@clusters, \@determinants];
}
689              
# This is the main routine that, along with the update_cluster_centers() routine,
# constitutes the two key steps of the K-Means algorithm: recompute the cluster
# centers, then re-assign every data point to its nearest center, repeating until
# the assignments no longer change.  For safety, the number of iterations is capped
# at 100.  In most cases that limit is never reached.
# Args: ref to the current clusters (arrays of data tags) and the number K.
# Returns the pair ($clusters, $final_cluster_centers).
sub assign_data_to_clusters {
    my $self = shift;
    my $clusters = shift;
    my $K = shift;
    my $final_cluster_centers;
    my $iteration_index = 0;
    while (1) {
        my $new_clusters;
        my $assignment_changed_flag = 0;
        my $current_cluster_center_index = 0;
        my $cluster_size_zero_condition = 0;
        my $how_many = @$clusters;
        # Centers are recomputed from a deep copy because update_cluster_centers()
        # can mutate the clusters it is handed (it steals members to repair empties).
        my $cluster_centers = $self->update_cluster_centers( deep_copy_AoA_with_nulls( $clusters ) );
        $iteration_index++;
        foreach my $cluster (@$clusters) {
            my $current_cluster_center = $cluster_centers->[$current_cluster_center_index];
            foreach my $ele (@$cluster) {
                # Find the center nearest to this element:
                my @dist_from_clust_centers;
                foreach my $center (@$cluster_centers) {
                    push @dist_from_clust_centers,
                        $self->distance($ele, $center);
                }
                my ($min, $best_center_index) = minimum( \@dist_from_clust_centers );
                my $best_cluster_center = $cluster_centers->[$best_center_index];
                if (vector_equal($current_cluster_center, $best_cluster_center)){
                    # Element stays in its current cluster:
                    push @{$new_clusters->[$current_cluster_center_index]}, $ele;
                } else {
                    # Element migrates --- so we have not converged yet:
                    $assignment_changed_flag = 1;
                    push @{$new_clusters->[$best_center_index]}, $ele;
                }
            }
            $current_cluster_center_index++;
        }
        # If the re-assignment produced a different number of clusters, iterate
        # again (unless the iteration budget is exhausted):
        next if ((@$new_clusters != @$clusters) && ($iteration_index < 100));
        # Now make sure that none of the K clusters is an empty cluster:
        foreach my $newcluster (@$new_clusters) {
            $cluster_size_zero_condition = 1 if ((!defined $newcluster) or (@$newcluster == 0));
        }
        # Pad to length K with undefs so the repair step below sees every slot:
        push @$new_clusters, (undef) x ($K - @$new_clusters) if @$new_clusters < $K;
        # During clustering for a fixed K, should a cluster inadvertently
        # become empty, steal a member from the largest cluster to hopefully
        # spawn a new cluster:
        my $largest_cluster;
        foreach my $local_cluster (@$new_clusters) {
            next if !defined $local_cluster;
            $largest_cluster = $local_cluster if !defined $largest_cluster;
            if (@$local_cluster > @$largest_cluster) {
                $largest_cluster = $local_cluster;
            }
        }
        foreach my $local_cluster (@$new_clusters) {
            if ( (!defined $local_cluster) || (@$local_cluster == 0) ) {
                push @$local_cluster, pop @$largest_cluster;
            }
        }
        next if (($cluster_size_zero_condition) && ($iteration_index < 100));
        last if $iteration_index == 100;
        $clusters = deep_copy_AoA( $new_clusters );
        # Converged: no element changed cluster during this pass.
        last if $assignment_changed_flag == 0;
    }
    $final_cluster_centers = $self->update_cluster_centers( $clusters );
    return ($clusters, $final_cluster_centers);
}
759              
# Mahalanobis version of assign_data_to_clusters(): alternates re-estimation of the
# per-cluster centers and covariance matrices with re-assignment of the data points
# by Mahalanobis distance, until the assignments stop changing (capped at 100
# iterations for safety).  Returns ($clusters, $final_cluster_centers).
sub assign_data_to_clusters_mahalanobis {
    my $self = shift;
    my $clusters = shift;
    my $K = shift;
    my $final_cluster_centers;
    my $iteration_index = 0;
    while (1) {
        my $new_clusters;
        my $assignment_changed_flag = 0;
        my $current_cluster_center_index = 0;
        my $cluster_size_zero_condition = 0;
        my $how_many = @$clusters;
        # Re-estimate centers and covariances from a deep copy, since the update
        # routine can mutate the clusters it is handed:
        my $cluster_centers_and_covariances =
            $self->update_cluster_centers_and_covariances_mahalanobis(deep_copy_AoA_with_nulls($clusters));
        my $cluster_centers = $cluster_centers_and_covariances->[0];
        my $cluster_covariances = $cluster_centers_and_covariances->[1];
        $iteration_index++;
        foreach my $cluster (@$clusters) {
            my $current_cluster_center = $cluster_centers->[$current_cluster_center_index];
            my $current_cluster_covariance = $cluster_covariances->[$current_cluster_center_index];
            foreach my $ele (@$cluster) {
                my @mahalanobis_dist_from_clust_centers;
                foreach my $i (0..@$clusters-1) {
                    my $center = $cluster_centers->[$i];
                    my $covariance = $cluster_covariances->[$i];
                    my $maha_distance;
                    # distance_mahalanobis() dies on a near-singular covariance; such
                    # a cluster is simply skipped as a candidate destination.
                    # NOTE(review): skipping shortens the distance array, so the
                    # index returned by minimum() below no longer aligns with the
                    # cluster indices whenever a skip occurs --- a latent
                    # misassignment bug; confirm and fix alignment before relying on
                    # this path.
                    eval {
                        $maha_distance = $self->distance_mahalanobis($ele, $center, $covariance);
                    };
                    next if $@;
                    push @mahalanobis_dist_from_clust_centers, $maha_distance;
                }
                my ($min, $best_center_index) = minimum( \@mahalanobis_dist_from_clust_centers );
                die "The Mahalanobis metric may not be appropriate for the data"
                    unless defined $best_center_index;
                my $best_cluster_center = $cluster_centers->[$best_center_index];
                if (vector_equal($current_cluster_center, $best_cluster_center)){
                    # Element stays in its current cluster:
                    push @{$new_clusters->[$current_cluster_center_index]}, $ele;
                } else {
                    # Element migrates --- so we have not converged yet:
                    $assignment_changed_flag = 1;
                    push @{$new_clusters->[$best_center_index]}, $ele;
                }
            }
            $current_cluster_center_index++;
        }
        # If the re-assignment produced a different number of clusters, iterate
        # again (unless the iteration budget is exhausted):
        next if ((@$new_clusters != @$clusters) && ($iteration_index < 100));
        # Now make sure that none of the K clusters is an empty cluster:
        foreach my $newcluster (@$new_clusters) {
            $cluster_size_zero_condition = 1 if ((!defined $newcluster) or (@$newcluster == 0));
        }
        # Pad to length K with undefs so the repair step below sees every slot:
        push @$new_clusters, (undef) x ($K - @$new_clusters) if @$new_clusters < $K;
        # During clustering for a fixed K, should a cluster inadvertently
        # become empty, steal a member from the largest cluster to hopefully
        # spawn a new cluster:
        my $largest_cluster;
        foreach my $local_cluster (@$new_clusters) {
            next if !defined $local_cluster;
            $largest_cluster = $local_cluster if !defined $largest_cluster;
            if (@$local_cluster > @$largest_cluster) {
                $largest_cluster = $local_cluster;
            }
        }
        foreach my $local_cluster (@$new_clusters) {
            if ( (!defined $local_cluster) || (@$local_cluster == 0) ) {
                push @$local_cluster, pop @$largest_cluster;
            }
        }
        next if (($cluster_size_zero_condition) && ($iteration_index < 100));
        last if $iteration_index == 100;
        # Now do a deep copy of new_clusters into clusters
        $clusters = deep_copy_AoA( $new_clusters );
        # Converged: no element changed cluster during this pass.
        last if $assignment_changed_flag == 0;
    }
    $final_cluster_centers = $self->update_cluster_centers( $clusters );
    return ($clusters, $final_cluster_centers);
}
836              
# For the Mahalanobis variant: from the current cluster memberships, recompute each
# cluster's center (the mean of its member coordinates) and estimate each cluster's
# covariance matrix as (1/N) * M * M', where the columns of M are the member
# coordinates with the cluster mean subtracted out.
# Returns [\@new_cluster_centers, \@new_cluster_covariances].
sub update_cluster_centers_and_covariances_mahalanobis {
    my $self = shift;
    my @clusters = @{ shift @_ };
    my @new_cluster_centers;
    my @new_cluster_covariances;
    # During clustering for a fixed K, should a cluster inadvertently become empty,
    # steal a member from the largest cluster to hopefully spawn a new cluster:
    my $largest_cluster;
    foreach my $cluster (@clusters) {
        next if !defined $cluster;
        $largest_cluster = $cluster if !defined $largest_cluster;
        if (@$cluster > @$largest_cluster) {
            $largest_cluster = $cluster;
        }
    }
    foreach my $cluster (@clusters) {
        if ( (!defined $cluster) || (@$cluster == 0) ) {
            push @$cluster, pop @$largest_cluster;
        }
    }
    foreach my $cluster (@clusters) {
        die "Cluster became empty --- untenable condition " .
            "for a given K. Try again. \n" if !defined $cluster;
        my $cluster_size = @$cluster;
        die "Cluster size is zero --- untenable.\n" if $cluster_size == 0;
        # The new center is the mean of the coordinates of the cluster's members:
        my @new_cluster_center = @{$self->add_point_coords( $cluster )};
        @new_cluster_center = map {my $x = $_/$cluster_size; $x} @new_cluster_center;
        push @new_cluster_centers, \@new_cluster_center;
        # For the covariance calculation, build a (dimensions x cluster_size) matrix
        # whose columns are mean-subtracted member coordinates.  (An unused
        # $mean_vec scratch matrix has been removed here.)
        my ($num_rows,$num_cols) = ($self->{_data_dimensions}, scalar(@$cluster));
        my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
        # All the record labels are stored in the array $self->{_data_id_tags}. The
        # actual data for clustering is stored in a hash at $self->{_data} whose keys
        # are the record labels; the value associated with each key is the array
        # holding the corresponding numerical multidimensional data.
        foreach my $j (0..$num_cols-1) {
            my $tag = $cluster->[$j];
            my $data = $self->{_data}->{$tag};
            my @diff_from_mean = vector_subtract($data, \@new_cluster_center);
            $matrix->set_col($j, \@diff_from_mean);
        }
        my $transposed = transpose( $matrix );
        my $covariance = matrix_multiply( $matrix, $transposed );
        $covariance *= 1.0 / $num_cols;
        if ($self->{_debug}) {
            print "\nDisplaying the Covariance Matrix for cluster:";
            display_matrix( $covariance );
        }
        push @new_cluster_covariances, $covariance;
    }
    return [\@new_cluster_centers, \@new_cluster_covariances];
}
890              
# After each re-assignment of the data points, recompute every cluster center as the
# mean of the coordinates of that cluster's members.  If a cluster has gone empty
# (or undefined), a member is first stolen from the largest cluster so that all K
# clusters stay populated.  Returns a ref to the array of new center vectors.
sub update_cluster_centers {
    my ($self, $clusters_ref) = @_;
    my @clusters = @$clusters_ref;
    # Locate the most populous cluster --- the donor for any empty cluster:
    my $largest_cluster;
    for my $cluster (@clusters) {
        next unless defined $cluster;
        $largest_cluster = $cluster
            if !defined($largest_cluster) || @$cluster > @$largest_cluster;
    }
    # Repair empty/undefined clusters by stealing one member from the donor:
    for my $cluster (@clusters) {
        push @$cluster, pop @$largest_cluster
            if !defined($cluster) || @$cluster == 0;
    }
    my @new_centers;
    for my $cluster (@clusters) {
        die "Cluster became empty --- untenable condition " .
            "for a given K. Try again. \n" if !defined $cluster;
        my $size = @$cluster;
        die "Cluster size is zero --- untenable.\n" if $size == 0;
        # Sum the member coordinates, then divide by the member count:
        my $coord_sums = $self->add_point_coords($cluster);
        push @new_centers, [ map { $_ / $size } @$coord_sums ];
    }
    return \@new_centers;
}
925              
# Given a new data element as a ref to its coordinate array, return the id of the
# existing cluster whose center is the closest in the Euclidean sense.  Dies when
# the element's dimensionality does not match the clustered data.
sub which_cluster_for_new_data_element {
    my ($self, $ele) = @_;
    die "The dimensionality of the new data element is not correct: $!"
        unless @$ele == $self->{_data_dimensions};
    my %dist_for;
    for my $cluster_id (sort keys %{$self->{_cluster_centers_hash}}) {
        $dist_for{$cluster_id} =
            $self->distance2($ele, $self->{_cluster_centers_hash}->{$cluster_id});
    }
    my @all_distances = values %dist_for;
    my ($min, $max) = minmax(\@all_distances);
    my ($closest) = grep { $dist_for{$_} == $min } keys %dist_for;
    return $closest;
}
944              
# Same as which_cluster_for_new_data_element(), but the proximity of the new data
# element to each cluster is measured with the Mahalanobis distance using that
# cluster's stored covariance matrix.
sub which_cluster_for_new_data_element_mahalanobis {
    my ($self, $ele) = @_;
    die "The dimensionality of the new data element is not correct: $!"
        unless @$ele == $self->{_data_dimensions};
    my %dist_for;
    for my $cluster_id (sort keys %{$self->{_cluster_centers_hash}}) {
        $dist_for{$cluster_id} =
            $self->distance_mahalanobis2($ele,
                $self->{_cluster_centers_hash}->{$cluster_id},
                $self->{_cluster_covariances_hash}->{$cluster_id});
    }
    my @all_distances = values %dist_for;
    my ($min, $max) = minmax(\@all_distances);
    my ($closest) = grep { $dist_for{$_} == $min } keys %dist_for;
    return $closest;
}
964              
# Returns the QoC ("quality of clustering") value for a given partitioning of the
# data into K clusters.  It is the ratio of (a) the sum over all clusters of the
# average member-to-center distance ($cluster_radius below) to (b) the average
# distance between cluster centers.  We want (a) small and (b) large, so a smaller
# QoC is better.
sub cluster_quality {
    my $self = shift;
    my $clusters = shift;
    my $cluster_centers = shift;
    my $K = @$cluster_centers; # Number of clusters
    my $cluster_radius = 0;
    foreach my $i (0..@$clusters-1) {
        my $sum_of_distances_for_one_cluster = 0;
        foreach my $ele (@{$clusters->[$i]}) {
            $sum_of_distances_for_one_cluster +=
                $self->distance( $ele, $cluster_centers->[$i] );
        }
        # Accumulate the per-cluster average member-to-center distance:
        $cluster_radius +=
            $sum_of_distances_for_one_cluster / @{$clusters->[$i]};
    }
    my $inter_cluster_center_dist = 0;
    # NOTE(review): this double loop visits every ORDERED pair (i,j), including
    # i == j (which contributes zero), so the sum is twice the sum over distinct
    # pairs.  Divided by K(K-1)/2 below, the "average" is therefore twice the true
    # average inter-center distance.  Since QoC values appear to be used only for
    # comparison across different K, the constant factor is presumably harmless ---
    # confirm before changing.
    foreach my $i (0..@$cluster_centers-1) {
        foreach my $j (0..@$cluster_centers-1) {
            $inter_cluster_center_dist +=
                $self->distance2( $cluster_centers->[$i],
                                  $cluster_centers->[$j] );
        }
    }
    # NOTE(review): divides by zero when K == 1 --- callers must use K >= 2.
    my $avg_inter_cluster_center_dist = $inter_cluster_center_dist /
        ( $K * ($K-1) / 2.0 );
    return $cluster_radius / $avg_inter_cluster_center_dist;
}
1002              
# Euclidean distance between a data sample, named by its symbolic tag in the master
# datafile, and a point (such as a cluster center) given as a ref to a coordinate
# vector.  Dies if the two vectors differ in length.
sub distance {
    my ($self, $tag, $point_ref) = @_;
    my @sample = @{ $self->{_data}->{$tag} };
    my @point  = @$point_ref;
    die "wrong data types for distance calculation\n" if @sample != @point;
    my $sum_sq = 0;
    $sum_sq += ($sample[$_] - $point[$_]) ** 2 for 0 .. $#sample;
    return sqrt $sum_sq;
}
1020              
# Euclidean distance between two points, each given as a ref to an array of
# coordinates.  Dies if the two vectors differ in length.
sub distance2 {
    my ($self, $p_ref, $q_ref) = @_;
    my @p = @$p_ref;
    my @q = @$q_ref;
    die "wrong data types for distance calculation\n" if @p != @q;
    my $sum_sq = 0;
    $sum_sq += ($p[$_] - $q[$_]) ** 2 for 0 .. $#p;
    return sqrt $sum_sq;
}
1035              
# Mahalanobis distance from a data element, given by its symbolic tag, to a cluster
# whose center coordinates ($cluster_center) and covariance matrix
# ($cluster_covariance) are supplied.  Dies when the covariance matrix is
# numerically singular, since the metric requires its inverse.
# (Dead code removed: the determinant was previously computed three times, with two
# of the results --- $det_of_covar and $determinant --- never used.)
sub distance_mahalanobis {
    my ($self, $ele, $cluster_center, $cluster_covariance) = @_;
    my $ele_data = $self->{_data}->{$ele};
    my @diff_from_mean = vector_subtract($ele_data, $cluster_center);
    my $vec = Math::GSL::Matrix->new($self->{_data_dimensions}, 1);
    $vec->set_col(0, \@diff_from_mean);
    my $transposed = transpose( $vec );
    # The 0.001 threshold is a heuristic guard against near-singular matrices:
    die "covariance matrix may not have an inverse"
        unless $cluster_covariance->det() > 0.001;
    my $covariance_inverse = $cluster_covariance->inverse;
    # squared distance = (x - mu)' * C^{-1} * (x - mu)
    my $distance = $transposed * $covariance_inverse * $vec;
    my ($squared) = $distance->as_list;
    return sqrt $squared;
}
# Same as distance_mahalanobis() except that the first argument is a ref to the
# actual coordinates of the data element rather than its symbolic tag.  Returns
# undef when no covariance matrix is supplied; dies when the covariance matrix is
# numerically singular.
# (Dead code removed: the determinant was previously computed three times, with two
# of the results --- $det_of_covar and $determinant --- never used.)
sub distance_mahalanobis2 {
    my ($self, $ele, $cluster_center, $cluster_covariance) = @_;
    # Keep 'return undef' (not a bare return) so scalar-context callers see undef:
    return undef unless defined $cluster_covariance;
    my @diff_from_mean = vector_subtract($ele, $cluster_center);
    my $vec = Math::GSL::Matrix->new($self->{_data_dimensions}, 1);
    $vec->set_col(0, \@diff_from_mean);
    my $transposed = transpose( $vec );
    # The 0.001 threshold is a heuristic guard against near-singular matrices:
    die "covariance matrix may not have an inverse"
        unless $cluster_covariance->det() > 0.001;
    my $covariance_inverse = $cluster_covariance->inverse;
    # squared distance = (x - mu)' * C^{-1} * (x - mu)
    my $distance = $transposed * $covariance_inverse * $vec;
    my ($squared) = $distance->as_list;
    return sqrt $squared;
}
1090              
# Estimate the covariance matrix of one cluster (given as a ref to an array of data
# tags): compute the cluster mean, form the (dimensions x N) matrix M whose columns
# are the mean-subtracted member coordinates, and return (1/N) * M * M'.
# (Dead code removed: an unused $mean_vec scratch matrix.)
sub estimate_cluster_covariance {
    my ($self, $cluster) = @_;
    my $cluster_size = @$cluster;
    # The cluster mean: coordinate sums divided by the member count.
    my @cluster_center = @{$self->add_point_coords( $cluster )};
    @cluster_center = map {my $x = $_/$cluster_size; $x} @cluster_center;
    my ($num_rows,$num_cols) = ($self->{_data_dimensions}, scalar(@$cluster));
    my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
    # All the record labels are stored in the array $self->{_data_id_tags}. The
    # actual data for clustering is stored in a hash at $self->{_data} whose keys are
    # the record labels; the value associated with each key is the array holding the
    # corresponding numerical multidimensional data.
    foreach my $j (0..$num_cols-1) {
        my $tag = $cluster->[$j];
        my $data = $self->{_data}->{$tag};
        my @diff_from_mean = vector_subtract($data, \@cluster_center);
        $matrix->set_col($j, \@diff_from_mean);
    }
    my $transposed = transpose( $matrix );
    my $covariance = $matrix * $transposed;
    $covariance *= 1.0 / $num_cols;
    if ($self->{_debug}) {
        print "\nDisplaying the Covariance Matrix for cluster:";
        display_matrix( $covariance );
    }
    return $covariance;
}
1120              
# Write each cluster out to its own file cluster0.txt, cluster1.txt, ..., piping the
# member tags through sort(1).  Cluster files left over from a previous run are
# removed first so that stale clusters do not linger when K shrinks.
sub write_clusters_to_files {
    my $self = shift;
    my @clusters = @{$self->{_clusters}};
    # BUGFIX: previously only "cluster*.dat" was removed although the files written
    # here are named "cluster<i>.txt", so stale .txt files survived between runs.
    unlink glob "cluster*.dat";
    unlink glob "cluster*.txt";
    foreach my $i (0..@clusters-1) {
        my $filename = "cluster" . $i . ".txt";
        print "\nWriting cluster $i to file $filename\n" if $self->{_terminal_output};
        # Lexical filehandle instead of the old bareword FILEHANDLE; the pipe
        # command string is built only from our own loop index, not user input.
        open my $out, '|-', "sort > $filename" or die "Unable to open file: $!";
        foreach my $ele (@{$clusters[$i]}) {
            print {$out} "$ele ";
        }
        # close() on a pipe also reaps the child; check it so write errors surface.
        close $out or die "Unable to close file: $!";
    }
}
1135              
# Accessor for the best K found by a K=0 clustering run.  Croaks if the clusterer
# has not yet been run with the K=0 option.
sub get_K_best {
    my $self = shift;
    my $best = $self->{_K_best};
    croak "You need to run the clusterer with K=0 option " .
          "before you can call this method" if $best eq 'unknown';
    print "\nThe best value of K: $best\n" if $self->{_terminal_output};
    return $best;
}
1143              
# Print the QoC value recorded for each K tried during a K=0 run (the smaller the
# QoC, the better).  Croaks if the clusterer has not yet been run with K=0.
sub show_QoC_values {
    my $self = shift;
    croak "\nYou need to run the clusterer with K=0 option before you can call this method"
        if $self->{_K_best} eq 'unknown';
    print "\nShown below are K on the left and the QoC values on the right (the smaller " .
          "the QoC, the better it is)\n";
    my $qoc_for = $self->{_QoC_values};
    for my $K (sort keys %$qoc_for) {
        next unless defined $qoc_for->{$K};
        print " $K => $qoc_for->{$K}\n";
    }
}
1154              
# Destructor: remove the temporary files this instance created alongside the
# original datafile (unlink on a non-existent file is harmless).
sub DESTROY {
    my $self = shift;
    my $base = basename($self->{_datafile});
    unlink "__temp_$base", "__temp_data_$base", "__temp_normed_data_$base";
}
1160              
1161             ################################## Visualization Code ###################################
1162              
1163             # It makes sense to call visualize_clusters() only AFTER you have called kmeans().
1164             #
1165             # The visualize_clusters() implementation automatically figures out whether it
1166             # should do a 2D plot or a 3D plot. If the number of on bits in the mask that is
1167             # supplied as one of the arguments is greater than 2, it does a 3D plot for the
1168             # first three data coordinates. That is, the clusters will be displayed in the 3D
1169             # space formed by the first three data coordinates. On the other hand, if the number
1170             # of on bits in the mask is exactly 2, it does a 2D plot. Should it happen that
1171             # only one on bit is specified for the mask, visualize_clusters() aborts.
1172             #
1173             # The visualization code consists of first accessing each of clusters created by the
1174             # kmeans() subroutine. Note that the clusters contain only the symbolic names for
1175             # the individual records in the source data file. We therefore next reach into the
1176             # $self->{_original_data} hash and get the data coordinates associated with each
1177             # symbolic label in a cluster. The numerical data thus generated is then written
1178             # out to a temp file. When doing so we must remember to insert TWO BLANK LINES
1179             # between the data blocks corresponding to the different clusters. This constraint
1180             # is imposed on us by Gnuplot when plotting data from the same file since we want to
1181             # use different point styles for the data points in different cluster files.
1182             #
1183             # Subsequently, we call upon the Perl interface provided by the Graphics::GnuplotIF
1184             # module to plot the data clusters.
# Plot the clusters produced by a previous call to kmeans().  The
# visualization mask selects which data coordinates to display: three or
# more '1' bits give a 3D plot of the first three selected coordinates,
# exactly two give a 2D plot, and a single coordinate is rejected.  An
# optional second argument is a pause time in seconds; when supplied the
# plot is shown transiently and no hardcopy PNG is produced.
#
# Fixes relative to the original: three-argument open with a lexical
# filehandle (was a 2-arg bareword open), checked close, the trailing-comma
# strip no longer goes through a match-then-$1 round trip that leaves
# $arg_string undefined when the match fails, and the unused @all_data_ids
# local has been removed.
sub visualize_clusters {
    my $self = shift;
    my $v_mask;
    my $pause_time;
    if (@_ == 1) {
        $v_mask = shift || croak "visualization mask missing";
    } elsif (@_ == 2) {
        $v_mask = shift || croak "visualization mask missing";
        $pause_time = shift;
    } else {
        croak "visualize_clusters() called with wrong args";
    }
    my $master_datafile = $self->{_datafile};
    my @v_mask = split //, $v_mask;
    my $visualization_mask_width = @v_mask;
    my $original_data_mask = $self->{_mask};
    my @mask = split //, $original_data_mask;
    my $data_field_width = scalar grep {$_ eq '1'} @mask;
    croak "\n\nABORTED: The width of the visualization mask (including " .
          "all its 1s and 0s) must equal the width of the original mask " .
          "used for reading the data file (counting only the 1's)"
        if $visualization_mask_width != $data_field_width;
    my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
    # Project each record onto the coordinates selected by the mask:
    my %visualization_data;
    while ( my ($record_id, $data) = each %{$self->{_original_data}} ) {
        my @fields = @$data;
        croak "\nABORTED: Visualization mask size exceeds data record size\n"
            if $#v_mask > $#fields;
        my @data_fields;
        foreach my $i (0..@fields-1) {
            if ($v_mask[$i] eq '0') {
                next;
            } elsif ($v_mask[$i] eq '1') {
                push @data_fields, $fields[$i];
            } else {
                croak "Misformed visualization mask. It can only have 1s and 0s\n";
            }
        }
        $visualization_data{ $record_id } = \@data_fields;
    }
    my $K = scalar @{$self->{_clusters}};
    my $filename = basename($master_datafile);
    my $temp_file = "__temp_" . $filename;
    unlink $temp_file if -e $temp_file;
    # Gnuplot requires two blank lines between the data blocks so each
    # cluster is addressable as a separate "index" with its own point style.
    open my $OUTPUT, '>', $temp_file
        or die "Unable to open a temp file in this directory: $!\n";
    foreach my $cluster (@{$self->{_clusters}}) {
        foreach my $item (@$cluster) {
            print {$OUTPUT} "@{$visualization_data{$item}}\n";
        }
        print {$OUTPUT} "\n\n";
    }
    close $OUTPUT or die "Could not close $temp_file: $!\n";
    my $plot;
    my $hardcopy_plot;
    if (!defined $pause_time) {
        $plot = Graphics::GnuplotIF->new( persist => 1 );
        $hardcopy_plot = Graphics::GnuplotIF->new();
        $hardcopy_plot->gnuplot_cmd('set terminal png', "set output \"clustering_results.png\"");
    } else {
        $plot = Graphics::GnuplotIF->new();
    }
    $plot->gnuplot_cmd( "set noclip" );
    $plot->gnuplot_cmd( "set pointsize 2" );
    # One plot clause per cluster, each with a distinct line/point type:
    my $using = $visualization_data_field_width > 2  ? '1:2:3'
              : $visualization_data_field_width == 2 ? '1:2'
              :                                        '1';
    my @plot_clauses;
    foreach my $i (0..$K-1) {
        my $j = $i + 1;
        push @plot_clauses,
            "\"$temp_file\" index $i using $using title \"Cluster $i\" with points lt $j pt $j";
    }
    my $arg_string = join ', ', @plot_clauses;
    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd( "splot $arg_string" );
        $hardcopy_plot->gnuplot_cmd( "splot $arg_string" ) unless defined $pause_time;
        $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd( "plot $arg_string" );
        $hardcopy_plot->gnuplot_cmd( "plot $arg_string" ) unless defined $pause_time;
        $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
    } elsif ($visualization_data_field_width == 1) {
        croak "No provision for plotting 1-D data\n";
    }
}
1282              
1283             # It makes sense to call visualize_data() only AFTER you have called the method
1284             # read_data_from_file().
1285             #
1286             # The visualize_data() is meant for the visualization of the original data in its
1287             # various 2D or 3D subspaces. The method can also be used to visualize the normed
1288             # data in a similar manner. Recall the normed data is the original data after each
1289             # data dimension is normalized by the standard-deviation along that dimension.
1290             #
1291             # Whether you see the original data or the normed data depends on the second
1292             # argument supplied in the method call. It must be either the string 'original' or
1293             # the string 'normed'.
# Plot the original or the variance-normalized data (the second argument
# must be the string 'original' or 'normed') in the 2D or 3D subspace
# selected by the visualization mask.  Meant to be called only after
# read_data_from_file(); displaying normed data additionally requires that
# variance normalization was turned on in the constructor.
#
# Fixes relative to the original: three-argument open with a lexical
# filehandle (was a 2-arg bareword open) and a checked close.
sub visualize_data {
    my $self = shift;
    my $v_mask = shift || croak "visualization mask missing";
    my $datatype = shift;    # must be either 'original' or 'normed'
    croak "\n\nABORTED: You called visualize_data() for normed data " .
          "but without first turning on data normalization in the " .
          "in the KMeans constructor"
        if ($datatype eq 'normed') && ! $self->{_var_normalize};
    my $master_datafile = $self->{_datafile};
    my @v_mask = split //, $v_mask;
    my $visualization_mask_width = @v_mask;
    my $original_data_mask = $self->{_mask};
    my @mask = split //, $original_data_mask;
    my $data_field_width = scalar grep {$_ eq '1'} @mask;
    croak "\n\nABORTED: The width of the visualization mask (including " .
          "all its 1s and 0s) must equal the width of the original mask " .
          "used for reading the data file (counting only the 1's)"
        if $visualization_mask_width != $data_field_width;
    my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
    my $data_source;
    if ($datatype eq 'original') {
        $data_source = $self->{_original_data};
    } elsif ($datatype eq 'normed') {
        $data_source = $self->{_data};
    } else {
        croak "\n\nABORTED: improper call to visualize_data()";
    }
    # Project every record onto the masked coordinates:
    my %visualization_data;
    while ( my ($record_id, $data) = each %{$data_source} ) {
        my @fields = @$data;
        croak "\nABORTED: Visualization mask size exceeds data record size\n"
            if $#v_mask > $#fields;
        my @data_fields;
        foreach my $i (0..@fields-1) {
            if ($v_mask[$i] eq '0') {
                next;
            } elsif ($v_mask[$i] eq '1') {
                push @data_fields, $fields[$i];
            } else {
                croak "Misformed visualization mask. It can only have 1s and 0s\n";
            }
        }
        $visualization_data{ $record_id } = \@data_fields;
    }
    my $filename = basename($master_datafile);
    # $datatype is already validated above, so a ternary suffices here:
    my $temp_file = $datatype eq 'original' ? "__temp_data_" . $filename
                                            : "__temp_normed_data_" . $filename;
    unlink $temp_file if -e $temp_file;
    open my $OUTPUT, '>', $temp_file
        or die "Unable to open a temp file in this directory: $!\n";
    foreach my $datapoint (values %visualization_data) {
        print {$OUTPUT} "@$datapoint\n";
    }
    close $OUTPUT or die "Could not close $temp_file: $!\n";
    my $plot = Graphics::GnuplotIF->new( persist => 1 );
    $plot->gnuplot_cmd( "set noclip" );
    $plot->gnuplot_cmd( "set pointsize 2" );
    my $plot_title = $datatype eq 'original' ? '"data"' : '"normed data"';
    my $arg_string;
    if ($visualization_data_field_width > 2) {
        $arg_string = "\"$temp_file\" using 1:2:3 title $plot_title with points lt -1 pt 1";
    } elsif ($visualization_data_field_width == 2) {
        $arg_string = "\"$temp_file\" using 1:2 title $plot_title with points lt -1 pt 1";
    } elsif ($visualization_data_field_width == 1 ) {
        $arg_string = "\"$temp_file\" using 1 notitle with points lt -1 pt 1";
    }
    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd( "splot $arg_string" );
    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd( "plot $arg_string" );
    } elsif ($visualization_data_field_width == 1) {
        croak "No provision for plotting 1-D data\n";
    }
}
1375              
1376             ########################### Generating Synthetic Data for Clustering ##############################
1377              
1378             # The data generated corresponds to a multivariate distribution. The mean and the
1379             # covariance of each Gaussian in the distribution are specified individually in a
1380             # parameter file. See the example parameter file param.txt in the examples
1381             # directory. Just edit this file for your own needs.
1382             #
1383             # The multivariate random numbers are generated by calling the Math::Random module.
1384             # As you would expect, that module will insist that the covariance matrix you
1385             # specify be symmetric and positive definite.
# Class method: generate synthetic multivariate-Gaussian data for exercising
# the clusterer.  Cluster means and covariances come from a parameter file
# (see param.txt in the examples directory); without one, a fixed
# three-cluster configuration is used (as in t/test.t).  Writes the labeled,
# shuffled points to output_datafile.
#
# Fixes relative to the original:
#  - the parameter-file open used `open INPUT, $f || "msg"` where || binds
#    to the filename, so a failed open was never reported; it now uses a
#    checked three-argument open with a lexical filehandle;
#  - the readline of the parameter file had been lost (empty assignment);
#  - point labels are now built with a dedicated per-cluster counter; the
#    original incremented the foreach loop alias $i inside the map, which
#    dies on the read-only range element (the declared-but-unused $p was
#    evidently intended for this);
#  - the output-file open is checked.
sub cluster_data_generator {
    my $class = shift;
    croak "illegal call of a class method" unless $class eq 'Algorithm::KMeans';
    my %args = @_;
    my $input_parameter_file = $args{input_parameter_file};
    my $output_file = $args{output_datafile};
    my $N = $args{number_data_points_per_cluster};
    my @all_params;
    my $param_string;
    if (defined $input_parameter_file) {
        open my $INPUT, '<', $input_parameter_file
            or croak "unable to open parameter file: $!";
        @all_params = <$INPUT>;
        close $INPUT;
        @all_params = grep { $_ !~ /^[ ]*#/ } @all_params;   # drop comment lines
        chomp @all_params;
        $param_string = join ' ', @all_params;
    } else {
        # Just for testing. Used in t/test.t
        $param_string = "cluster 5 0 0 1 0 0 0 1 0 0 0 1 " .
                        "cluster 0 5 0 1 0 0 0 1 0 0 0 1 " .
                        "cluster 0 0 5 1 0 0 0 1 0 0 0 1";
    }
    my @cluster_strings = split /[ ]*cluster[ ]*/, $param_string;
    @cluster_strings = grep $_, @cluster_strings;
    my $K = @cluster_strings;
    croak "Too many clusters requested" if $K > 12;
    my @point_labels = ('a'..'z');
    print "Number of Gaussians used for the synthetic data: $K\n";
    my @means;
    my @covariances;
    my $data_dimension;
    foreach my $i (0..$K-1) {
        # NOTE(review): $num_strings[0] is treated as the blank-separated
        # mean and $num_strings[1] as the covariance entries -- confirm this
        # split matches the layout of examples/param.txt.
        my @num_strings = split / /, $cluster_strings[$i];
        my @cluster_mean = map {/$_num_regex/;$_} split / /, $num_strings[0];
        $data_dimension = @cluster_mean;
        push @means, \@cluster_mean;
        my @covariance_nums = map {/$_num_regex/;$_} split / /, $num_strings[1];
        croak "dimensionality error" if @covariance_nums !=
                                        ($data_dimension ** 2);
        my $cluster_covariance;
        foreach my $j (0..$data_dimension-1) {
            foreach my $k (0..$data_dimension-1) {
                $cluster_covariance->[$j]->[$k] =
                         $covariance_nums[$j*$data_dimension + $k];
            }
        }
        push @covariances, $cluster_covariance;
    }
    random_seed_from_phrase( 'hellojello' );   # deterministic output across runs
    my @data_dump;
    foreach my $i (0..$K-1) {
        my @m = @{shift @means};
        my @covar = @{shift @covariances};
        my @new_data = Math::Random::random_multivariate_normal( $N, @m, @covar );
        my $point_counter = 0;
        my $label = $point_labels[$i];
        # Prefix each point with a symbolic name such as a0, a1, ... :
        @new_data = map { unshift @$_, $label . $point_counter++; $_ } @new_data;
        push @data_dump, @new_data;
    }
    fisher_yates_shuffle( \@data_dump );
    open my $OUTPUT, '>', $output_file
        or die "Unable to open output file $output_file: $!\n";
    foreach my $ele (@data_dump) {
        print {$OUTPUT} "$_ " for @$ele;
        print {$OUTPUT} "\n";
    }
    print "Data written out to file $output_file\n";
    close $OUTPUT or die "Could not close $output_file: $!\n";
}
1455              
# Component-wise sum of the (normalized) coordinate vectors of the data
# elements whose symbolic names appear in the given arrayref.  Returns a
# reference to the resulting sum vector, initialized to zeros of length
# $self->{_data_dimensions}.
sub add_point_coords {
    my ($self, $ids_ref) = @_;
    my @sum = (0.0) x $self->{_data_dimensions};
    foreach my $tag (@$ids_ref) {
        my $point = $self->{_data}->{$tag};
        $sum[$_] += $point->[$_] for 0..$#{$point};
    }
    return \@sum;
}
1474              
# Same as add_point_coords(), but sums the *original* (unnormalized)
# coordinates stored in $self->{_original_data}.  Returns a reference to the
# component-wise sum vector.
sub add_point_coords_from_original_data {
    my ($self, $ids_ref) = @_;
    my @sum = (0.0) x $self->{_data_dimensions};
    foreach my $tag (@$ids_ref) {
        my $point = $self->{_original_data}->{$tag};
        $sum[$_] += $point->[$_] for 0..$#{$point};
    }
    return \@sum;
}
1493              
1494             ################################### Support Routines ########################################
1495              
# Return the position of the first element numerically equal to $value in
# the arrayref, or -1 when no element matches.
sub get_index_at_value {
    my ($value, $aref) = @_;
    my $pos = 0;
    for my $elem (@$aref) {
        return $pos if $value == $elem;
        $pos++;
    }
    return -1;
}
1504              
1505             # This routine is really not necessary in light of the new `~~' operator in Perl.
1506             # Will use the new operator in the next version.
# Numeric element-wise equality predicate for two arrayref vectors.
# Dies when the lengths differ; returns 1 when all components match, else 0.
sub vector_equal {
    my ($vec1, $vec2) = @_;
    die "wrong data types for vector-equal predicate\n" if @$vec1 != @$vec2;
    my $idx = 0;
    for my $component (@$vec1) {
        return 0 unless $component == $vec2->[$idx++];
    }
    return 1;
}
1516              
1517             # Returns the minimum value and its positional index in an array
# Return the smallest value in the arrayref together with its positional
# index, as the list ($min, $index).  An empty array yields (undef, undef).
sub minimum {
    my $arr = shift;
    my ($min, $index);
    my $pos = 0;
    for my $val (@$arr) {
        ($min, $index) = ($val, $pos) if !defined($min) || $val < $min;
        $pos++;
    }
    return ($min, $index);
}
1530              
# Return the smallest and largest defined values in the arrayref as the
# list ($min, $max).  Undefined entries are skipped; if no entry is defined
# the result is (undef, undef).
sub minmax {
    my $arr = shift;
    my ($min, $max);
    for my $val (@$arr) {
        next unless defined $val;
        if (!defined $min) {
            # First defined value seeds both bounds.
            ($min, $max) = ($val, $val);
        } elsif ($val < $min) {
            $min = $val;
        } elsif ($val > $max) {
            $max = $val;
        }
    }
    return ($min, $max);
}
1548              
1549             # Meant only for constructing a deep copy of an array of arrays:
# Deep copy of an array of arrays (of plain scalars).  Mirrors the
# element-by-element construction so that, like the original, an empty
# top-level array yields undef rather than an empty arrayref.
sub deep_copy_AoA {
    my $ref_in = shift;
    my $ref_out;
    for my $row (0..$#{$ref_in}) {
        for my $col (0..$#{$ref_in->[$row]}) {
            $ref_out->[$row][$col] = $ref_in->[$row][$col];
        }
    }
    return $ref_out;
}
1560              
1561             # Meant only for constructing a deep copy of an array of arrays for the case when
1562             # some elements of the top-level array may be undefined:
# Deep copy of an array of arrays in which some top-level slots may be
# undef; undef slots are preserved as undef in the copy.
sub deep_copy_AoA_with_nulls {
    my $ref_in = shift;
    my $ref_out;
    for my $row (0..$#{$ref_in}) {
        if (!defined $ref_in->[$row]) {
            $ref_out->[$row] = undef;
            next;
        }
        for my $col (0..$#{$ref_in->[$row]}) {
            $ref_out->[$row][$col] = $ref_in->[$row][$col];
        }
    }
    return $ref_out;
}
1577              
1578             # Meant only for constructing a deep copy of a hash in which each value is an
1579             # anonymous array of numbers:
# Deep copy of a hash whose values are arrayrefs of numbers; each value
# array is duplicated via deep_copy_array().
sub deep_copy_hash {
    my $ref_in = shift;
    my $ref_out;
    foreach my $label (keys %$ref_in) {
        $ref_out->{$label} = deep_copy_array( $ref_in->{$label} );
    }
    return $ref_out;
}
1588              
1589             # Meant only for an array of numbers:
# Copy of a flat array of numbers.  An empty input yields undef, matching
# the element-by-element construction of the original.
sub deep_copy_array {
    my $ref_in = shift;
    my $ref_out;
    $ref_out->[$_] = $ref_in->[$_] for 0..$#{$ref_in};
    return $ref_out;
}
1598              
# Print each cluster's size and its center, where the center is the mean of
# the original (unnormalized) coordinates of the cluster's members, shown
# with four decimal places.
sub display_cluster_centers {
    my ($self, $clusters_ref) = @_;
    my $cluster_index = 0;
    foreach my $cluster (@$clusters_ref) {
        my $cluster_size = scalar @$cluster;
        my $coord_sums = $self->add_point_coords_from_original_data( $cluster );
        my @center = map { $_ / $cluster_size } @$coord_sums;
        print "\ncluster $cluster_index ($cluster_size records):\n";
        print "cluster center $cluster_index: " .
              "@{[map {sprintf('%.4f', $_)} @center]}\n";
        $cluster_index++;
    }
}
1614              
1615             # For displaying the individual clusters on a terminal screen. Each cluster is
1616             # displayed through the symbolic names associated with the data points.
# Print the members of each cluster via their symbolic names.  Note that
# each cluster's array is sorted in place as a side effect, just as in the
# rest of the module.
sub display_clusters {
    my $clusters_ref = shift;
    my $cluster_index = 0;
    foreach my $cluster (@$clusters_ref) {
        @$cluster = sort @$cluster;
        my $size = scalar @$cluster;
        print "\n\nCluster $cluster_index ($size records):\n";
        print " $_" for @$cluster;
        $cluster_index++;
    }
    print "\n\n";
}
1631              
1632             # from perl docs:
# In-place Fisher-Yates shuffle of the arrayref (from the Perl FAQ).
# Same sequence of rand() draws as the original while-loop form.
sub fisher_yates_shuffle {
    my $arr = shift;
    for (my $pos = @$arr - 1; $pos > 0; $pos--) {
        my $swap = int rand( $pos + 1 );
        ($arr->[$pos], $arr->[$swap]) = ($arr->[$swap], $arr->[$pos]);
    }
}
1641              
# Normalize each coordinate of every data point by the standard deviation of
# that coordinate over the whole data set.  Takes a hashref mapping record
# labels to coordinate arrayrefs and returns a new hashref of the same
# shape.  (As in the original, a zero-variance axis will cause a
# division-by-zero error.)
#
# Fixes relative to the original: the dead `my $new = ...` statement that
# recomputed every value and discarded it is gone, sqrt() is now taken once
# per axis instead of once per element, and the unused $dimensions local
# has been removed.
sub variance_normalization {
    my %data_hash = %{shift @_};
    my @all_data_points = values %data_hash;
    # Regroup the data column-wise: one array per coordinate axis.
    my @data_projections;
    foreach my $data_point (@all_data_points) {
        my $i = 0;
        foreach my $proj (@$data_point) {
            push @{$data_projections[$i++]}, $proj;
        }
    }
    # Standard deviation along each axis:
    my @std_devs;
    foreach my $axis_data (@data_projections) {
        my ($mean, $variance) = mean_and_variance( $axis_data );
        push @std_devs, sqrt $variance;
    }
    my %new_data_hash;
    while (my ($label, $data) = each %data_hash) {
        my @new_data = map { $data->[$_] / $std_devs[$_] } 0..$#{$data};
        $new_data_hash{$label} = \@new_data;
    }
    return \%new_data_hash;
}
1669              
# Single-pass computation of the mean and the *population* variance of an
# array of numbers, using the incremental recurrences
#     mean_i = ((i-1)/i) * mean_{i-1} + x_i / i
#     var_i  = ((i-1)/i) * var_{i-1}  + (x_i - mean_i)^2 / (i-1)
# Note that the variance update deliberately uses the already-updated mean;
# that ordering is what makes the recurrence yield the population variance
# (e.g. data (1,3) gives variance 1, not the sample variance 2).
# Returns the list ($mean, $variance).  A one-element array gives variance
# 0; an empty array returns (undef, undef).
sub mean_and_variance {
    my @data = @{shift @_};
    my ($mean, $variance);
    foreach my $i (1..@data) {
        if ($i == 1) {
            # Seed with the first sample; variance of one sample is 0.
            $mean = $data[0];
            $variance = 0;
        } else {
            # Order matters: $mean must be updated before it is used in the
            # variance update below.
            $mean = ( (($i-1)/$i) * $mean ) + $data[$i-1] / $i;
            $variance = ( (($i-1)/$i) * $variance ) + ($data[$i-1]-$mean)**2 / ($i-1);
        }
    }
    return ($mean, $variance);
}
1684              
# Verify that every constructor keyword is one of the legal parameter
# names.  Returns 1 when all supplied parameters are legal (including the
# no-parameters case) and 0 as soon as an unrecognized parameter is seen.
#
# Fix: the original returned undef for an empty parameter list, which the
# caller's `== 0` test misread as an illegal-parameter error (with an
# uninitialized-value warning); the convoluted nested flag loop is replaced
# by a lookup hash.
sub check_for_illegal_params {
    my @params = @_;
    my @legal_params = qw / datafile
                            mask
                            K
                            Kmin
                            Kmax
                            terminal_output
                            write_clusters_to_files
                            do_variance_normalization
                            cluster_seeding
                            use_mahalanobis_metric
                            debug
                          /;
    my %is_legal = map { $_ => 1 } @legal_params;
    foreach my $param (@params) {
        return 0 unless $is_legal{$param};
    }
    return 1;
}
1712              
# Build a hash mapping each positive value in the arrayref to its index;
# non-positive values are skipped, and a repeated value keeps its last
# index.
sub get_value_index_hash {
    my $arr = shift;
    my %hash;
    my $idx = 0;
    for my $val (@$arr) {
        $hash{$val} = $idx if $val > 0;
        $idx++;
    }
    return \%hash;
}
1721              
# 1-D non-maximum suppression over a histogram-like array.  Returns a new
# array of the same size that is zero everywhere except at the center of
# each run of identical values that is a strict local maximum relative to
# its neighboring runs; at such a center the original value is kept.
# NOTE(review): this assumes the input decomposes into at least two runs --
# on a constant array $array_of_runs[-2] and $array_of_runs[1] would wrap
# or be absent; confirm callers never pass one.  A fractional $run_center
# (odd-length runs) is truncated by Perl's array indexing.
sub non_maximum_suppression {
    my $arr = shift;
    my @output = (0) x @$arr;
    my @final_output = (0) x @$arr;   # NOTE(review): never written after this -- apparently vestigial
    my %hash;                         # NOTE(review): unused -- apparently vestigial
    # Decompose the input into runs of equal consecutive values:
    my @array_of_runs = ([$arr->[0]]);
    foreach my $index (1..@$arr-1) {
        if ($arr->[$index] == $arr->[$index-1]) {
            push @{$array_of_runs[-1]}, $arr->[$index];
        } else {
            push @array_of_runs, [$arr->[$index]];
        }
    }
    # Interior runs: keep the value at the center of every run strictly
    # greater than both neighboring runs.  $runstart_index tracks the
    # starting offset of the current run in the flat array.
    my $runstart_index = 0;
    foreach my $run_index (1..@array_of_runs-2) {
        $runstart_index += @{$array_of_runs[$run_index-1]};
        if ($array_of_runs[$run_index]->[0] >
            $array_of_runs[$run_index-1]->[0] &&
            $array_of_runs[$run_index]->[0] >
            $array_of_runs[$run_index+1]->[0]) {
            my $run_center = @{$array_of_runs[$run_index]} / 2;
            my $assignment_index = $runstart_index + $run_center;
            $output[$assignment_index] = $arr->[$assignment_index];
        }
    }
    # Boundary runs only have one interior neighbor to beat:
    if ($array_of_runs[-1]->[0] > $array_of_runs[-2]->[0]) {
        $runstart_index += @{$array_of_runs[-2]};
        my $run_center = @{$array_of_runs[-1]} / 2;
        my $assignment_index = $runstart_index + $run_center;
        $output[$assignment_index] = $arr->[$assignment_index];
    }
    if ($array_of_runs[0]->[0] > $array_of_runs[1]->[0]) {
        my $run_center = @{$array_of_runs[0]} / 2;
        $output[$run_center] = $arr->[$run_center];
    }
    return \@output;
}
1759              
# Print a Math::GSL::Matrix row by row, preceded by its dimensions.
sub display_matrix {
    my $matrix = shift;
    my ($nrows, $ncols) = ($matrix->rows(), $matrix->cols());
    print "\n\nDisplaying matrix of size $nrows rows and $ncols columns:\n";
    foreach my $row_index (0..$nrows-1) {
        my @row_values = $matrix->row($row_index)->as_list;
        print "@row_values\n";
    }
    print "\n\n";
}
1772              
# Return a new Math::GSL::Matrix that is the transpose of the argument:
# row i of the input becomes column i of the result.
sub transpose {
    my $matrix = shift;
    my ($num_rows, $num_cols) = ($matrix->rows(), $matrix->cols());
    my $transposed = Math::GSL::Matrix->new($num_cols, $num_rows);
    foreach my $row_index (0..$num_rows-1) {
        $transposed->set_col( $row_index, [ $matrix->row($row_index)->as_list ] );
    }
    return $transposed;
}
1784              
# Element-wise difference of two equal-length arrayref vectors, returned as
# a list.  Dies when the lengths differ.
sub vector_subtract {
    my ($vec1, $vec2) = @_;
    die "wrong data types for vector subtract calculation\n" if @$vec1 != @$vec2;
    return map { $vec1->[$_] - $vec2->[$_] } 0..$#{$vec1};
}
1795              
# Dot product of two equal-length arrayref vectors.  Dies when the lengths
# differ.
sub vector_multiply {
    my ($vec1, $vec2) = @_;
    die "vec_multiply called with two vectors of different sizes"
        unless @$vec1 == @$vec2;
    my $dot = 0;
    $dot += $vec1->[$_] * $vec2->[$_] for 0..$#{$vec1};
    return $dot;
}
1807              
# Multiply two Math::GSL::Matrix objects.  As a special case, a 1xN matrix
# times an Nx1 matrix returns the plain scalar dot product rather than a
# 1x1 matrix; the general branch below relies on that special case via the
# recursive row-times-column call.
# NOTE(review): the dimension check demands nrows1 == ncols2 in addition to
# ncols1 == nrows2, i.e. the product must be square -- stricter than general
# matrix multiplication (which needs only ncols1 == nrows2).  The product
# is accordingly allocated as nrows1 x nrows1.  Confirm callers in this
# module only ever need square products before generalizing.
sub matrix_multiply {
    my $matrix1 = shift;
    my $matrix2 = shift;
    my ($nrows1, $ncols1) = ($matrix1->rows(), $matrix1->cols());
    my ($nrows2, $ncols2) = ($matrix2->rows(), $matrix2->cols());
    die "matrix multiplication called with non-matching matrix arguments"
        unless $nrows1 == $ncols2 && $ncols1 == $nrows2;
    # Scalar special case: row vector times column vector.
    if ($nrows1 == 1) {
        my @row = $matrix1->row(0)->as_list;
        my @col = $matrix2->col(0)->as_list;
        my $result;
        foreach my $j (0..$ncols1-1) {
            $result += $row[$j] * $col[$j];
        }
        return $result;
    }
    my $product = Math::GSL::Matrix->new($nrows1, $nrows1);
    foreach my $i (0..$nrows1-1) {
        my $row = $matrix1->row($i);
        my @product_row;
        foreach my $j (0..$ncols2-1) {
            my $col = $matrix2->col($j);
            # Each entry comes back through the scalar special case above.
            my $row_times_col = matrix_multiply($row, $col);
            push @product_row, $row_times_col;
        }
        $product->set_row($i, \@product_row);
    }
    return $product;
}
1837              
1838             1;
1839              
1840             =pod
1841              
1842             =head1 NAME
1843              
1844             Algorithm::KMeans - for clustering multidimensional data
1845              
1846             =head1 SYNOPSIS
1847              
1848             # You now have four different choices for clustering your data with this module:
1849             #
1850             # 1) With Euclidean distances and with random cluster seeding
1851             #
1852             # 2) With Mahalanobis distances and with random cluster seeding
1853             #
1854             # 3) With Euclidean distances and with smart cluster seeding
1855             #
1856             # 4) With Mahalanobis distances and with smart cluster seeding
1857             #
1858             # Despite the qualifier 'smart' in 'smart cluster seeding', it may not always
1859             # produce results that are superior to those obtained with random seeding. (If you
1860             # also factor in the choice regarding variance normalization, you actually have
1861             # eight different choices for data clustering with this module.)
1862             #
1863             # In all cases, you'd obviously begin with
1864              
1865             use Algorithm::KMeans;
1866              
1867             # You'd then name the data file:
1868              
1869             my $datafile = "mydatafile.csv";
1870              
1871             # Next, set the mask to indicate which columns of the datafile to use for
1872             # clustering and which column contains a symbolic ID for each data record. For
1873             # example, if the symbolic name is in the first column, you want the second column
1874             # to be ignored, and you want the next three columns to be used for 3D clustering,
1875             # you'd set the mask to:
1876              
1877             my $mask = "N0111";
1878              
1879             # Now construct an instance of the clusterer. The parameter K controls the number
1880             # of clusters. If you know how many clusters you want (let's say 3), call
1881              
1882             my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
1883             mask => $mask,
1884             K => 3,
1885             cluster_seeding => 'random',
1886             terminal_output => 1,
1887             write_clusters_to_files => 1,
1888             );
1889              
1890             # By default, this constructor call will set you up for clustering based on
1891             # Euclidean distances. If you want the module to use Mahalanobis distances, your
1892             # constructor call will look like:
1893              
1894             my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
1895             mask => $mask,
1896             K => 3,
1897             cluster_seeding => 'random',
1898             use_mahalanobis_metric => 1,
1899             terminal_output => 1,
1900             write_clusters_to_files => 1,
1901             );
1902              
1903             # For both constructor calls shown above, you can use smart seeding of the clusters
1904             # by changing 'random' to 'smart' for the cluster_seeding option. See the
1905             # explanation of smart seeding in the Methods section of this documentation.
1906              
1907             # If your data is such that its variability along the different dimensions of the
1908             # data space is significantly different, you may get better clustering if you first
1909             # normalize your data by setting the constructor parameter
1910             # do_variance_normalization as shown below:
1911              
1912             my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
1913             mask => $mask,
1914             K => 3,
1915             cluster_seeding => 'smart', # or 'random'
1916             terminal_output => 1,
1917             do_variance_normalization => 1,
1918             write_clusters_to_files => 1,
1919             );
1920              
1921             # But bear in mind that such data normalization may actually decrease the
1922             # performance of the clusterer if the variability in the data is more a result of
1923             # the separation between the means than a consequence of intra-cluster variance.
1924              
1925             # Set K to 0 if you want the module to figure out the optimum number of clusters
1926             # from the data. (It is best to run this option with the terminal_output set to 1
1927             # so that you can see the different value of QoC for the different K):
1928              
1929             my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
1930             mask => $mask,
1931             K => 0,
1932             cluster_seeding => 'random', # or 'smart'
1933             terminal_output => 1,
1934             write_clusters_to_files => 1,
1935             );
1936              
1937             # Although not shown above, you can obviously set the 'do_variance_normalization'
1938             # flag here also if you wish.
1939              
1940             # For very large data files, setting K to 0 will result in searching through too
1941             # many values for K. For such cases, you can range limit the values of K to search
1942             # through by
1943              
1944             my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
1945             mask => "N111",
1946             Kmin => 3,
1947             Kmax => 10,
1948             cluster_seeding => 'random', # or 'smart'
1949             terminal_output => 1,
1950             write_clusters_to_files => 1,
1951             );
1952              
1953             # FOR ALL CASES ABOVE, YOU'D NEED TO MAKE THE FOLLOWING CALLS ON THE CLUSTERER
1954             # INSTANCE TO ACTUALLY CLUSTER THE DATA:
1955              
1956             $clusterer->read_data_from_file();
1957             $clusterer->kmeans();
1958              
1959             # If you want to directly access the clusters and the cluster centers in your own
1960             # top-level script, replace the above two statements with:
1961              
1962             $clusterer->read_data_from_file();
1963             my ($clusters_hash, $cluster_centers_hash) = $clusterer->kmeans();
1964              
1965             # You can subsequently access the clusters directly in your own code, as in:
1966              
1967             foreach my $cluster_id (sort keys %{$clusters_hash}) {
1968             print "\n$cluster_id => @{$clusters_hash->{$cluster_id}}\n";
1969             }
1970             foreach my $cluster_id (sort keys %{$cluster_centers_hash}) {
1971             print "\n$cluster_id => @{$cluster_centers_hash->{$cluster_id}}\n";
1972             }
1973              
1974              
1975             # CLUSTER VISUALIZATION:
1976              
1977             # You must first set the mask for cluster visualization. This mask tells the module
1978             # which 2D or 3D subspace of the original data space you wish to visualize the
1979             # clusters in:
1980              
1981             my $visualization_mask = "111";
1982             $clusterer->visualize_clusters($visualization_mask);
1983              
1984              
1985             # SYNTHETIC DATA GENERATION:
1986              
1987             # The module has been provided with a class method for generating multivariate data
1988             # for experimenting with clustering. The data generation is controlled by the
1989             # contents of the parameter file that is supplied as an argument to the data
1990             # generator method. The mean and covariance matrix entries in the parameter file
1991             # must be according to the syntax shown in the param.txt file in the examples
1992             # directory. It is best to edit this file as needed:
1993              
1994             my $parameter_file = "param.txt";
1995             my $out_datafile = "mydatafile.dat";
1996             Algorithm::KMeans->cluster_data_generator(
1997             input_parameter_file => $parameter_file,
1998             output_datafile => $out_datafile,
1999             number_data_points_per_cluster => $N );
2000              
2001             =head1 CHANGES
2002              
2003             Version 2.05 removes the restriction on the version of Perl that is required. This
2004             is based on Srezic's recommendation. He had no problem building and testing the
2005             previous version with Perl 5.8.9. Version 2.05 also includes a small augmentation of
2006             the code in the method C<read_data_from_file_csv()> for guarding against user errors
2007             in the specification of the mask that tells the module which columns of the data file
2008             are to be used for clustering.
2009              
2010             Version 2.04 allows you to use CSV data files for clustering.
2011              
2012             Version 2.03 incorporates minor code cleanup. The main implementation of the module
2013             remains unchanged.
2014              
2015             Version 2.02 downshifts the version of Perl that is required for this module. The
2016             module should work with versions 5.10 and higher of Perl. The implementation code
2017             for the module remains unchanged.
2018              
2019             Version 2.01 removes many errors in the documentation. The changes made to the module
2020             in Version 2.0 were not reflected properly in the documentation page for that
2021             version. The implementation code remains unchanged.
2022              
2023             Version 2.0 includes significant additional functionality: (1) You now have the
2024             option to cluster using the Mahalanobis distance metric (the default is the Euclidean
2025             metric); and (2) With the two C<which_cluster_for_new_data_element()> methods that have been added to the
2026             module, you can now determine the best cluster for a new data sample after you have
2027             created the clusters with the previously available data. Finding the best cluster
2028             for a new data sample can be done using either the Euclidean metric or the
2029             Mahalanobis metric.
2030              
2031             Version 1.40 includes a C<smart> option for seeding the clusters. This option,
2032             supplied through the constructor parameter C<cluster_seeding>, means that the
2033             clusterer will (1) Subject the data to principal components analysis in order to
2034             determine the maximum variance direction; (2) Project the data onto this direction;
2035             (3) Find peaks in a smoothed histogram of the projected points; and (4) Use the
2036             locations of the highest peaks as initial guesses for the cluster centers. If you
2037             don't want to use this option, set C<cluster_seeding> to C<random>. That should work
2038             as in the previous version of the module.
2039              
2040             Version 1.30 includes a bug fix for the case when the datafile contains empty lines,
2041             that is, lines with no data records. Another bug fix in Version 1.30 deals with the
2042             case when you want the module to figure out how many clusters to form (this is the
2043             C<K=0> option in the constructor call) and the number of data records is close to the
2044             minimum.
2045              
2046             Version 1.21 includes fixes to handle the possibility that, when clustering the data
2047             for a fixed number of clusters, a cluster may become empty during iterative
2048             calculation of cluster assignments of the data elements and the updating of the
2049             cluster centers. The code changes are in the C<assign_data_to_clusters()> and
2050             C<update_cluster_centers()> subroutines.
2051              
2052             Version 1.20 includes an option to normalize the data with respect to its variability
2053             along the different coordinates before clustering is carried out.
2054              
2055             Version 1.1.1 allows for range limiting the values of C<K> to search through. C<K>
2056             stands for the number of clusters to form. This version also declares the module
2057             dependencies in the C<Makefile.PL> file.
2058              
2059             Version 1.1 is an object-oriented version of the implementation presented in
2060             version 1.0. The current version should lend itself more easily to code extension.
2061             You could, for example, create your own class by subclassing from the class presented
2062             here and, in your subclass, use your own criteria for the similarity distance between
2063             the data points and for the QoC (Quality of Clustering) metric, and, possibly a
2064             different rule to stop the iterations. Version 1.1 also allows you to directly
2065             access the clusters formed and the cluster centers in your calling script.
2066              
2067              
2068             =head1 SPECIAL USAGE NOTE
2069              
2070             If you were directly accessing in your own scripts the clusters produced by the older
2071             versions of this module, you'd need to make changes to your code if you wish to use
2072             Version 2.0 or higher. Instead of returning arrays of clusters and cluster centers,
2073             Versions 2.0 and higher return hashes. This change was made necessary by the logic
2074             required for implementing the two new C<which_cluster_for_new_data_element()> methods that were introduced
2075             in Version 2.0. These methods return the best cluster for a new data sample from the
2076             clusters you created using the existing data. One of the C<which_cluster_for_new_data_element()> methods is
2077             based on the Euclidean metric for finding the cluster that is closest to the new data
2078             sample, and the other on the Mahalanobis metric. Another point of incompatibility
2079             with the previous versions is that you must now explicitly set the C<cluster_seeding>
2080             parameter in the call to the constructor to either C<random> or C<smart>. This
2081             parameter does not have a default associated with it starting with Version 2.0.
2082              
2083              
2084             =head1 DESCRIPTION
2085              
2086             Clustering with K-Means takes place iteratively and involves two steps: 1) assignment
2087             of data samples to clusters on the basis of how far the data samples are from the
2088             cluster centers; and 2) Recalculation of the cluster centers (and cluster covariances
2089             if you are using the Mahalanobis distance metric for clustering).
2090              
2091             Obviously, before the two-step approach can proceed, we need to initialize the
2092             cluster centers. How this initialization is carried out is important. The module
2093             gives you two very different ways for carrying out this initialization. One option,
2094             called the C<smart> option, consists of subjecting the data to principal components
2095             analysis to discover the direction of maximum variance in the data space. The data
2096             points are then projected on to this direction and a histogram constructed from the
2097             projections. Centers of the smoothed histogram are used to seed the clustering
2098             operation. The other option is to choose the cluster centers purely randomly. You
2099             get the first option if you set C<cluster_seeding> to C<smart> in the constructor,
2100             and you get the second option if you set it to C<random>.
2101              
2102             How to specify the number of clusters, C<K>, is one of the most vexing issues in any
2103             approach to clustering. In some cases, we can set C<K> on the basis of prior
2104             knowledge. But, more often than not, no such prior knowledge is available. When the
2105             programmer does not explicitly specify a value for C<K>, the approach taken in the
2106             current implementation is to try all possible values between 2 and some largest
2107             possible value that makes statistical sense. We then choose that value for C<K>
2108             which yields the best value for the QoC (Quality of Clustering) metric. It is
2109             generally believed that the largest value for C<K> should not exceed C<sqrt(N/2)>
2110             where C<N> is the number of data samples to be clustered.
2111              
2112             What to use for the QoC metric is obviously a critical issue unto itself. In the
2113             current implementation, the value of QoC is the ratio of the average radius of the
2114             clusters and the average distance between the cluster centers.
2115              
2116             Every iterative algorithm requires a stopping criterion. The criterion implemented
2117             here is that we stop iterations when there is no re-assignment of the data points
2118             during the assignment step.
2119              
2120             Ordinarily, the output produced by a K-Means clusterer will correspond to a local
2121             minimum for the QoC values, as opposed to a global minimum. The current
2122             implementation protects against that when the module constructor is called with the
2123             C<random> option for C<cluster_seeding> by trying different randomly selected initial
2124             cluster centers and then selecting the one that gives the best overall QoC value.
2125              
2126             A K-Means clusterer will generally produce good results if the overlap between the
2127             clusters is minimal and if each cluster exhibits variability that is uniform in all
2128             directions. When the data variability is different along the different directions in
2129             the data space, the results you obtain with a K-Means clusterer may be improved by
2130             first normalizing the data appropriately, as can be done in this module when you set
2131             the C<do_variance_normalization> option in the constructor. However, as pointed out
2132             elsewhere in this documentation, such normalization may actually decrease the
2133             performance of the clusterer if the overall data variability along any dimension is
2134             more a result of separation between the means than a consequence of intra-cluster
2135             variability.
2136              
2137              
2138             =head1 METHODS
2139              
2140             The module provides the following methods for clustering, for cluster visualization,
2141             for data visualization, for the generation of data for testing a clustering
2142             algorithm, and for determining the cluster membership of a new data sample:
2143              
2144             =over 4
2145              
2146             =item B<new()>
2147              
2148             my $clusterer = Algorithm::KMeans->new(datafile => $datafile,
2149             mask => $mask,
2150             K => $K,
2151             cluster_seeding => 'random', # also try 'smart'
2152             use_mahalanobis_metric => 1, # also try '0'
2153             terminal_output => 1,
2154             write_clusters_to_files => 1,
2155             );
2156              
2157             A call to C constructs a new instance of the C class. When
2158             C<$K> is a non-zero positive integer, the module will construct exactly that many
2159             clusters. However, when C<$K> is 0, the module will find the best number of clusters
2160             to partition the data into. As explained in the Description, setting
2161             C<cluster_seeding> to C<smart> causes PCA (principal components analysis) to be used
2162             for discovering the best choices for the initial cluster centers. If you want purely
2163             random decisions to be made for the initial choices for the cluster centers, set
2164             C<cluster_seeding> to C<random>.
2165              
2166             The data file is expected to contain entries in the following format
2167              
2168             c20 0 10.7087017086940 9.63528386251712 10.9512155258108 ...
2169             c7 0 12.8025925026787 10.6126270065785 10.5228482095349 ...
2170             b9 0 7.60118206283120 5.05889245193079 5.82841781759102 ...
2171             ....
2172             ....
2173              
2174             where the first column contains the symbolic ID tag for each data record and the rest
2175             of the columns the numerical information. As to which columns are actually used for
2176             clustering is decided by the string value of the mask. For example, if we wanted to
2177             cluster on the basis of the entries in just the 3rd, the 4th, and the 5th columns
2178             above, the mask value would be C<N0111> where the character C<N> indicates that the
2179             ID tag is in the first column, the character C<0> that the second column is to be
2180             ignored, and the C<1>'s that follow that the 3rd, the 4th, and the 5th columns are to
2181             be used for clustering.
2182              
2183             If you wish for the clusterer to search through a C<(Kmin,Kmax)> range of values for
2184             C<K>, the constructor should be called in the following fashion:
2185              
2186             my $clusterer = Algorithm::KMeans->new(datafile => $datafile,
2187             mask => $mask,
2188             Kmin => 3,
2189             Kmax => 10,
2190             cluster_seeding => 'smart', # try 'random' also
2191             terminal_output => 1,
2192             );
2193              
2194             where obviously you can choose any reasonable values for C<Kmin> and C<Kmax>. If you
2195             choose a value for C<Kmax> that is statistically too large, the module will let you
2196             know. Again, you may choose C<smart> for C<cluster_seeding>, the default value being
2197             C<random>.
2198              
2199             If you believe that the variability of the data is very different along the different
2200             dimensions of the data space, you may get better clustering by first normalizing the
2201             data coordinates by the standard-deviations along those directions. When you set the
2202             constructor option C<do_variance_normalization> as shown below, the module uses the
2203             overall data standard-deviation along a direction for the normalization in that
2204             direction. (As mentioned elsewhere in the documentation, such a normalization could
2205             backfire on you if the data variability along a dimension is more a result of the
2206             separation between the means than a consequence of the intra-cluster variability.):
2207              
2208             my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
2209             mask => "N111",
2210             K => 2,
2211             cluster_seeding => 'smart', # try 'random' also
2212             terminal_output => 1,
2213             do_variance_normalization => 1,
2214             );
2215              
2216             =back
2217              
2218             =head2 Constructor Parameters
2219              
2220             =over 8
2221              
2222             =item C<datafile>:
2223              
2224             This parameter names the data file that contains the multidimensional data records
2225             you want the module to cluster.
2226              
2227             =item C<mask>:
2228              
2229             This parameter supplies the mask to be applied to the columns of your data file. See
2230             the explanation in Synopsis for what this mask looks like.
2231              
2232             =item C<K>:
2233              
2234             This parameter supplies the number of clusters you are looking for. If you set this
2235             option to 0, that means that you want the module to search for the best value for
2236             C<K>. (Keep in mind the fact that searching for the best C<K> may take a long time
2237             for large data files.)
2238              
2239             =item C<Kmin>:
2240              
2241             If you supply an integer value for C<Kmin>, the search for the best C<K> will begin
2242             with that value.
2243              
2244             =item C<Kmax>:
2245              
2246             If you supply an integer value for C<Kmax>, the search for the best C<K> will end at
2247             that value.
2248              
2249             =item C<cluster_seeding>:
2250              
2251             This parameter must be set to either C<random> or C<smart>. Depending on your data,
2252             you may get superior clustering with the C<random> option. The choice C<smart> means
2253             that the clusterer will (1) subject the data to principal components analysis to
2254             determine the maximum variance direction; (2) project the data onto this direction;
2255             (3) find peaks in a smoothed histogram of the projected points; and (4) use the
2256             locations of the highest peaks as seeds for cluster centers. If the C<smart> option
2257             produces bizarre results, try C<random>.
2258              
2259             =item C<use_mahalanobis_metric>:
2260              
2261             When set to 1, this option causes Mahalanobis distances to be used for clustering.
2262             The default is 0 for this parameter. By default, the module uses the Euclidean
2263             distances for clustering. In general, Mahalanobis distance based clustering will
2264             fail if your data resides on a lower-dimensional hyperplane in the data space, if you
2265             seek too many clusters, and if you do not have a sufficient number of samples in your
2266             data file. A necessary requirement for the module to be able to compute Mahalanobis
2267             distances is that the cluster covariance matrices be non-singular. (Let's say your
2268             data dimensionality is C<D> and the module is considering a cluster that has only
2269             C<d> samples in it where C<d> is less than C<D>. In this case, the covariance matrix
2270             will be singular since its rank will not exceed C<d>. For the covariance matrix to
2271             be non-singular, it must be of full rank, that is, its rank must be C<D>.)
2272              
2273             =item C<do_variance_normalization>:
2274              
2275             When set, the module will first normalize the data variance along the different
2276             dimensions of the data space before attempting clustering. Depending on your data,
2277             this option may or may not result in better clustering.
2278              
2279             =item C<terminal_output>:
2280              
2281             This boolean parameter, when not supplied in the call to C<new()>, defaults to 0.
2282             When set, you will see in your terminal window the different clusters as lists of the
2283             symbolic IDs and their cluster centers. You will also see the QoC (Quality of
2284             Clustering) values for the clusters displayed.
2285              
2286             =item C<write_clusters_to_files>:
2287              
2288             This parameter is also boolean. When set to 1, the clusters are written out to files
2289             that are named in the following manner:
2290              
2291             cluster0.txt
2292             cluster1.txt
2293             cluster2.txt
2294             ...
2295             ...
2296              
2297             Before the clusters are written to these files, the module destroys all files with
2298             such names in the directory in which you call the module.
2299              
2300              
2301             =back
2302              
2303             =over
2304              
2305             =item B<read_data_from_file()>
2306              
2307             $clusterer->read_data_from_file()
2308              
2309             =item B<kmeans()>
2310              
2311             $clusterer->kmeans();
2312              
2313             or
2314              
2315             my ($clusters_hash, $cluster_centers_hash) = $clusterer->kmeans();
2316              
2317             The first call above works solely by side-effect. The second call also returns the
2318             clusters and the cluster centers. See the C<cluster_and_visualize.pl> script in the
2319             C<examples> directory for how you can in your own code extract the clusters and the
2320             cluster centers from the variables C<$clusters_hash> and C<$cluster_centers_hash>.
2321              
2322             =item B<get_K_best()>
2323              
2324             $clusterer->get_K_best();
2325              
2326             This call makes sense only if you supply either the C<K=0> option to the constructor,
2327             or if you specify values for the C<Kmin> and C<Kmax> options. The C<K=0> and the
2328             C<(Kmin,Kmax)> options cause the module to determine the best value for C<K>.
2329             Remember, C<K> is the number of clusters the data is partitioned into.
2330              
2331             =item B<show_QoC_values()>
2332              
2333             $clusterer->show_QoC_values();
2334              
2335             presents a table with C<K> values in the left column and the corresponding QoC
2336             (Quality-of-Clustering) values in the right column. Note that this call makes sense
2337             only if you either supply the C<K=0> option to the constructor, or if you specify
2338             values for the C<Kmin> and C<Kmax> options.
2339              
2340             =item B<visualize_clusters()>
2341              
2342             $clusterer->visualize_clusters( $visualization_mask )
2343              
2344             The visualization mask here does not have to be identical to the one used for
2345             clustering, but must be a subset of that mask. This is convenient for visualizing
2346             the clusters in two- or three-dimensional subspaces of the original space.
2347              
2348             =item B<visualize_data()>
2349              
2350             $clusterer->visualize_data($visualization_mask, 'original');
2351              
2352             $clusterer->visualize_data($visualization_mask, 'normed');
2353              
2354             This method requires a second argument and, as shown, it must be either the string
2355             C<original> or the string C<normed>, the former for the visualization of the raw data
2356             and the latter for the visualization of the data after its different dimensions are
2357             normalized by the standard-deviations along those directions. If you call the method
2358             with the second argument set to C<normed>, but do so without turning on the
2359             C<do_variance_normalization> option in the KMeans constructor, it will let you know.
2360              
2361              
2362             =item B<which_cluster_for_new_data_element()>
2363              
2364             If you wish to determine the cluster membership of a new data sample after you have
2365             created the clusters with the existing data samples, you would need to call this
2366             method. The C<which_cluster_for_new_data.pl> script in the C<examples> directory
2367             shows how to use this method.
2368              
2369              
2370             =item B<which_cluster_for_new_data_element_mahalanobis()>
2371              
2372             This does the same thing as the previous method, except that it determines the
2373             cluster membership using the Mahalanobis distance metric. As for the previous
2374             method, see the C<which_cluster_for_new_data.pl> script in the C<examples> directory
2375             for how to use this method.
2376              
2377              
2378             =item B<cluster_data_generator()>
2379              
2380             Algorithm::KMeans->cluster_data_generator(
2381             input_parameter_file => $parameter_file,
2382             output_datafile => $out_datafile,
2383             number_data_points_per_cluster => 20 );
2384              
2385             for generating multivariate data for clustering if you wish to play with synthetic
2386             data for clustering. The input parameter file contains the means and the variances
2387             for the different Gaussians you wish to use for the synthetic data. See the file
2388             C<param.txt> provided in the examples directory. It will be easiest for you to just
2389             edit this file for your data generation needs. In addition to the format of the
2390             parameter file, the main constraint you need to observe in specifying the parameters
2391             is that the dimensionality of the covariance matrix must correspond to the
2392             dimensionality of the mean vectors. The multivariate random numbers are generated by
2393             calling the C<Math::Random> module. As you would expect, this module requires that
2394             the covariance matrices you specify in your parameter file be symmetric and positive
2395             definite. Should the covariances in your parameter file not obey this condition, the
2396             C<Math::Random> module will let you know.
2397              
2398             =back
2399              
2400             =head1 HOW THE CLUSTERS ARE OUTPUT
2401              
2402             When the option C<terminal_output> is set in the call to the constructor, the
2403             clusters are displayed on the terminal screen.
2404              
2405             When the option C<write_clusters_to_files> is set in the call to the constructor, the
2406             module dumps the clusters in files named
2407              
2408             cluster0.txt
2409             cluster1.txt
2410             cluster2.txt
2411             ...
2412             ...
2413              
2414             in the directory in which you execute the module. The number of such files will
2415             equal the number of clusters formed. All such existing files in the directory are
2416             destroyed before any fresh ones are created. Each cluster file contains the symbolic
2417             ID tags of the data samples in that cluster.
2418              
2419             The module also leaves in your directory a printable `.png' file that is a point plot
2420             of the different clusters. The name of this file is always C<cluster_plot.png>.
2421              
2422             Also, as mentioned previously, a call to C<kmeans()> in your own code will return the
2423             clusters and the cluster centers.
2424              
2425             =head1 REQUIRED
2426              
2427             This module requires the following three modules:
2428              
2429             Math::Random
2430             Graphics::GnuplotIF
2431             Math::GSL
2432              
2433             With regard to the third item above, what is actually required is the
2434             C<Math::GSL::Matrix> module. However, that module is a part of the C<Math::GSL>
2435             distribution. The acronym GSL stands for the GNU Scientific Library. C<Math::GSL> is
2436             a Perl interface to the GSL C-based library.
2437              
2438              
2439             =head1 THE C<examples> DIRECTORY
2440              
2441             The C<examples> directory contains several scripts to help you become familiar with
2442             this module. The following script is an example of how the module can be expected to
2443             be used most of the time. It calls for clustering to be carried out with a fixed
2444             C<K>:
2445              
2446             cluster_and_visualize.pl
2447              
2448             The more time you spend with this script, the more comfortable you will become with
2449             the use of this module. The script file contains a large comment block that mentions
2450             six locations in the script where you have to make decisions about how to use the
2451             module.
2452              
2453             See the following script if you do not know what value to use for C<K> for clustering
2454             your data:
2455              
2456             find_best_K_and_cluster.pl
2457              
2458             This script uses the C<K=0> option in the constructor that causes the module to
2459             search for the best C<K> for your data. Since this search is virtually unbounded ---
2460             limited only by the number of samples in your data file --- the script may take a
2461             long time to run for a large data file. Hence the next script.
2462              
2463             If your datafile is too large, you may need to range limit the values of C<K> that
2464             are searched through, as in the following script:
2465              
2466             find_best_K_in_range_and_cluster.pl
2467              
2468             If you also want to include data normalization (it may reduce the performance of the
2469             clusterer in some cases), see the following script:
2470              
2471             cluster_after_data_normalization.pl
2472              
2473             When you include the data normalization step and you would like to visualize the data
2474             before and after normalization, see the following script:
2475              
2476             cluster_and_visualize_with_data_visualization.pl
2477              
2478             After you are done clustering, let's say you want to find the cluster membership of a
2479             new data sample. To see how you can do that, see the script:
2480              
2481             which_cluster_for_new_data.pl
2482              
2483             This script returns two answers for which cluster a new data sample belongs to: one
2484             using the Euclidean metric to calculate the distances between the new data sample and
2485             the cluster centers, and the other using the Mahalanobis metric. If the clusters are
2486             strongly elliptical in shape, you are likely to get better results with the
2487             Mahalanobis metric. (To see that you can get two different answers using the two
2488             different distance metrics, run the C<which_cluster_for_new_data.pl> script on the
2489             data in the file C<mydatafile3.dat>. To make this run, note that you have to comment
2490             out and uncomment the lines at four different locations in the script.)
2491              
2492             The C<examples> directory also contains the following support scripts:
2493              
2494             For generating the data for experiments with clustering:
2495              
2496             data_generator.pl
2497              
2498             For cleaning up the examples directory:
2499              
2500             cleanup_directory.pl
2501              
2502             The examples directory also includes a parameter file, C<param.txt>, for generating
2503             synthetic data for clustering. Just edit this file if you would like to generate
2504             your own multivariate data for clustering. The parameter file is for the 3D case,
2505             but you can generate data with any dimensionality through appropriate entries in the
2506             parameter file.
2507              
2508             =head1 EXPORT
2509              
2510             None by design.
2511              
2512             =head1 CAVEATS
2513              
2514             K-Means based clustering usually does not work well when the clusters are strongly
2515             overlapping and when the extent of variability along the different dimensions is
2516             different for the different clusters. The module does give you the ability to
2517             normalize the variability in your data with the constructor option
2518             C<do_variance_normalization>. However, as described elsewhere, this may actually
2519             reduce the performance of the clusterer if the data variability along a direction is
2520             more a result of the separation between the means than because of intra-cluster
2521             variability. For better clustering with difficult-to-cluster data, you could try
2522             using the author's C<Algorithm::ExpectationMaximization> module.
2523              
2524             =head1 BUGS
2525              
2526             Please notify the author if you encounter any bugs. When sending email, please place
2527             the string 'KMeans' in the subject line.
2528              
2529             =head1 INSTALLATION
2530              
2531             Download the archive from CPAN in any directory of your choice. Unpack the archive
2532             with a command that on a Linux machine would look like:
2533              
2534             tar zxvf Algorithm-KMeans-2.05.tar.gz
2535              
2536             This will create an installation directory for you whose name will be
2537             C<Algorithm-KMeans-2.05>. Enter this directory and execute the following commands
2538             for a standard install of the module if you have root privileges:
2539              
2540             perl Makefile.PL
2541             make
2542             make test
2543             sudo make install
2544              
2545             If you do not have root privileges, you can carry out a non-standard install of the
2546             module in any directory of your choice by:
2547              
2548             perl Makefile.PL prefix=/some/other/directory/
2549             make
2550             make test
2551             make install
2552              
2553             With a non-standard install, you may also have to set your PERL5LIB environment
2554             variable so that this module can find the required other modules. How you do that
2555             would depend on what platform you are working on. In order to install this module in
2556             a Linux machine on which I use tcsh for the shell, I set the PERL5LIB environment
2557             variable by
2558              
2559             setenv PERL5LIB /some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
2560              
2561             If I used bash, I'd need to declare:
2562              
2563             export PERL5LIB=/some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
2564              
2565              
2566             =head1 THANKS
2567              
2568             I thank Slaven for pointing out that I needed to downshift the required version of Perl
2569             for this module. Fortunately, I had access to an old machine still running Perl
2570             5.10.1. The current version, 2.02, is based on my testing the module on that machine.
2571              
2572             I added two C<which_cluster_for_new_data()> methods in Version 2.0 as a result of an email from
2573             Jerome White who expressed a need for such methods in order to determine the best
2574             cluster for a new data record after you have successfully clustered your existing
2575             data. Thanks Jerome for your feedback!
2576              
2577             It was an email from Nadeem Bulsara that prompted me to create Version 1.40 of this
2578             module. Working with Version 1.30, Nadeem noticed that occasionally the module would
2579             produce variable clustering results on the same dataset. I believe that this
2580             variability was caused (at least partly) by the purely random mode that was used in
2581             Version 1.30 for the seeding of the cluster centers. Version 1.40 now includes a
2582             C<smart> mode. With the new mode the clusterer uses a PCA (Principal Components
2583             Analysis) of the data to make good guesses for the cluster centers. However,
2584             depending on how the data is jumbled up, it is possible that the new mode will not
2585             produce uniformly good results in all cases. So you can still use the old mode by
2586             setting C<cluster_seeding> to C<random> in the constructor. Thanks Nadeem for your
2587             feedback!
2588              
2589             Version 1.30 resulted from Martin Kalin reporting problems with a very small data
2590             set. Thanks Martin!
2591              
2592             Version 1.21 came about in response to the problems encountered by Luis Fernando
2593             D'Haro with version 1.20. Although the module would yield the clusters for some of
2594             its runs, more frequently than not the module would abort with an "empty cluster"
2595             message for his data. Luis Fernando has also suggested other improvements (such as
2596             clustering directly from the contents of a hash) that I intend to make in future
2597             versions of this module. Thanks Luis Fernando.
2598              
2599             Chad Aeschliman was kind enough to test out the interface of this module and to give
2600             suggestions for its improvement. His key slogan: "If you cannot figure out how to
2601             use a module in under 10 minutes, it's not going to be used." That should explain
2602             the longish Synopsis included here.
2603              
2604             =head1 AUTHOR
2605              
2606             Avinash Kak, kak@purdue.edu
2607              
2608             If you send email, please place the string "KMeans" in your subject line to get past
2609             my spam filter.
2610              
2611             =head1 COPYRIGHT
2612              
2613             This library is free software; you can redistribute it and/or modify it under the
2614             same terms as Perl itself.
2615              
2616             Copyright 2014 Avinash Kak
2617              
2618             =cut
2619