File Coverage

blib/lib/Algorithm/KMeans.pm
Criterion Covered Total %
statement 22 24 91.6
branch n/a
condition n/a
subroutine 8 8 100.0
pod n/a
total 30 32 93.7


line stmt bran cond sub pod time code
1             package Algorithm::KMeans;
2              
3             #------------------------------------------------------------------------------------
4             # Copyright (c) 2014 Avinash Kak. All rights reserved. This program is free
5             # software. You may modify and/or distribute it under the same terms as Perl itself.
6             # This copyright notice must remain attached to the file.
7             #
8             # Algorithm::KMeans is a Perl module for clustering multidimensional data.
9             # -----------------------------------------------------------------------------------
10              
11 1     1   12262 use 5.10.0;
  1         4  
  1         37  
12 1     1   4 use strict;
  1         1  
  1         21  
13 1     1   3 use warnings;
  1         5  
  1         34  
14 1     1   3 use Carp;
  1         1  
  1         71  
15 1     1   4 use File::Basename;
  1         1  
  1         66  
16 1     1   569 use Math::Random;
  1         4655  
  1         76  
17 1     1   513 use Graphics::GnuplotIF;
  1         9001  
  1         52  
18 1     1   986 use Math::GSL::Matrix;
  0            
  0            
19              
20              
21             our $VERSION = '2.04';
22              
23             # from Perl docs:
24             my $_num_regex = '^[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?$';
25              
26             # Constructor:
27             sub new {
28             my ($class, %args) = @_;
29             my @params = keys %args;
30             croak "\nYou have used a wrong name for a keyword argument " .
31             "--- perhaps a misspelling\n"
32             if check_for_illegal_params(@params) == 0;
33             bless {
34             _datafile => $args{datafile} || croak("datafile required"),
35             _mask => $args{mask} || croak("mask required"),
36             _K => $args{K} || 0,
37             _K_min => $args{Kmin} || 'unknown',
38             _K_max => $args{Kmax} || 'unknown',
39             _cluster_seeding => $args{cluster_seeding} || croak("must choose smart or random ".
40             "for cluster seeding"),
41             _var_normalize => $args{do_variance_normalization} || 0,
42             _use_mahalanobis_metric => $args{use_mahalanobis_metric} || 0,
43             _clusters_2_files => $args{write_clusters_to_files} || 0,
44             _terminal_output => $args{terminal_output} || 0,
45             _debug => $args{debug} || 0,
46             _N => 0,
47             _K_best => 'unknown',
48             _original_data => {},
49             _data => {},
50             _data_id_tags => [],
51             _QoC_values => {},
52             _clusters => [],
53             _cluster_centers => [],
54             _clusters_hash => {},
55             _cluster_centers_hash => {},
56             _cluster_covariances_hash => {},
57             _data_dimensions => 0,
58              
59             }, $class;
60             }
61              
62             sub read_data_from_file {
63             my $self = shift;
64             my $filename = $self->{_datafile};
65             $self->read_data_from_file_csv() if $filename =~ /.csv$/;
66             $self->read_data_from_file_dat() if $filename =~ /.dat$/;
67             }
68              
69             sub read_data_from_file_csv {
70             my $self = shift;
71             my $numregex = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
72             my $filename = $self->{_datafile} || die "you did not specify a file with the data to be clustered";
73             my $mask = $self->{_mask};
74             my @mask = split //, $mask;
75             $self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
76             print "data dimensionality: $self->{_data_dimensions} \n"if $self->{_terminal_output};
77             open FILEIN, $filename or die "Unable to open $filename: $!";
78             die("Aborted. get_training_data_csv() is only for CSV files") unless $filename =~ /\.csv$/;
79             local $/ = undef;
80             my @all_data = split /\s+/, <FILEIN>;
81             my %data_hash = ();
82             my @data_tags = ();
83             foreach my $record (@all_data) {
84             my @splits = split /,/, $record;
85             my $record_name = shift @splits;
86             $data_hash{$record_name} = \@splits;
87             push @data_tags, $record_name;
88             }
89             $self->{_original_data} = \%data_hash;
90             $self->{_data_id_tags} = \@data_tags;
91             $self->{_N} = scalar @data_tags;
92             if ($self->{_var_normalize}) {
93             $self->{_data} = variance_normalization( $self->{_original_data} );
94             } else {
95             $self->{_data} = deep_copy_hash( $self->{_original_data} );
96             }
97             # Need to make the following call to set the global mean and covariance:
98             # my $covariance = $self->estimate_mean_and_covariance(\@data_tags);
99             # Need to make the following call to set the global eigenvec eigenval sets:
100             # $self->eigen_analysis_of_covariance($covariance);
101             if ( defined($self->{_K}) && ($self->{_K} > 0) ) {
102             carp "\n\nWARNING: YOUR K VALUE IS TOO LARGE.\n The number of data " .
103             "points must satisfy the relation N > 2xK**2 where K is " .
104             "the number of clusters requested for the clusters to be " .
105             "meaningful $!"
106             if ( $self->{_N} < (2 * $self->{_K} ** 2) );
107             print "\n\n\n";
108             }
109             }
110              
111             sub read_data_from_file_dat {
112             my $self = shift;
113             my $datafile = $self->{_datafile};
114             my $mask = $self->{_mask};
115             my @mask = split //, $mask;
116             $self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
117             print "data dimensionality: $self->{_data_dimensions} \n"if $self->{_terminal_output};
118             open INPUT, $datafile or die "unable to open file $datafile: $!\n";
119             chomp( my @raw_data = <INPUT> );
120             close INPUT;
121             # Transform strings into number data
122             foreach my $record (@raw_data) {
123             next unless $record;
124             next if $record =~ /^#/;
125             my @data_fields;
126             my @fields = split /\s+/, $record;
127             die "\nABORTED: Mask size does not correspond to row record size\n" if $#fields != $#mask;
128             my $record_id;
129             foreach my $i (0..@fields-1) {
130             if ($mask[$i] eq '0') {
131             next;
132             } elsif ($mask[$i] eq 'N') {
133             $record_id = $fields[$i];
134             } elsif ($mask[$i] eq '1') {
135             push @data_fields, $fields[$i];
136             } else {
137             die "misformed mask for reading the data file\n";
138             }
139             }
140             my @nums = map {/$_num_regex/;$_} @data_fields;
141             $self->{_original_data}->{ $record_id } = \@nums;
142             }
143             if ($self->{_var_normalize}) {
144             $self->{_data} = variance_normalization( $self->{_original_data} );
145             } else {
146             $self->{_data} = deep_copy_hash( $self->{_original_data} );
147             }
148             my @all_data_ids = keys %{$self->{_data}};
149             $self->{_data_id_tags} = \@all_data_ids;
150             $self->{_N} = scalar @all_data_ids;
151             if ( defined($self->{_K}) && ($self->{_K} > 0) ) {
152             carp "\n\nWARNING: YOUR K VALUE IS TOO LARGE.\n The number of data " .
153             "points must satisfy the relation N > 2xK**2 where K is " .
154             "the number of clusters requested for the clusters to be " .
155             "meaningful $!"
156             if ( $self->{_N} < (2 * $self->{_K} ** 2) );
157             print "\n\n\n";
158             }
159             }
160              
161             sub kmeans {
162             my $self = shift;
163             my $K = $self->{_K};
164             if ( ($K == 0) ||
165             ( ($self->{_K_min} ne 'unknown') &&
166             ($self->{_K_max} ne 'unknown') ) ) {
167             eval {
168             $self->iterate_through_K();
169             };
170             die "in kmeans(): $@" if ($@);
171             } elsif ( $K =~ /\d+/) {
172             eval {
173             $self->cluster_for_fixed_K_multiple_random_tries($K)
174             if $self->{_cluster_seeding} eq 'random';
175             $self->cluster_for_fixed_K_single_smart_try($K)
176             if $self->{_cluster_seeding} eq 'smart';
177             };
178             die "in kmeans(): $@" if ($@);
179             } else {
180             croak "Incorrect call syntax used. See documentation.\n";
181             }
182             if ((defined $self->{_clusters}) && (defined $self->{_cluster_centers})){
183             foreach my $i (0..@{$self->{_clusters}}-1) {
184             $self->{_clusters_hash}->{"cluster$i"} = $self->{_clusters}->[$i];
185             $self->{_cluster_centers_hash}->{"cluster$i"} = $self->{_cluster_centers}->[$i];
186             $self->{_cluster_covariances_hash}->{"cluster$i"} =
187             $self->estimate_cluster_covariance($self->{_clusters}->[$i]);
188             }
189             $self->write_clusters_to_files( $self->{_clusters} ) if $self->{_clusters_2_files};
190             return ($self->{_clusters_hash}, $self->{_cluster_centers_hash});
191             } else {
192             croak "Clustering failed. Try again.";
193             }
194             }
195              
196             # This is the subroutine to call if you prefer purely random choices for the initial
197             # seeding of the cluster centers.
198             sub cluster_for_fixed_K_multiple_random_tries {
199             my $self = shift;
200             my $K = shift;
201             my @all_data_ids = @{$self->{_data_id_tags}};
202             my $QoC;
203             my @QoC_values;
204             my @array_of_clusters;
205             my @array_of_cluster_centers;
206             my $clusters;
207             my $new_clusters;
208             my $cluster_centers;
209             my $new_cluster_centers;
210             print "Clustering for K = $K\n" if $self->{_terminal_output};
211             foreach my $trial (1..20) {
212             print ". ";
213             my ($new_clusters, $new_cluster_centers);
214             eval {
215             ($new_clusters, $new_cluster_centers) = $self->cluster_for_given_K($K);
216             };
217             next if $@;
218             next if @$new_clusters <= 1;
219             my $newQoC = $self->cluster_quality( $new_clusters, $new_cluster_centers );
220             if ( (!defined $QoC) || ($newQoC < $QoC) ) {
221             $QoC = $newQoC;
222             $clusters = deep_copy_AoA( $new_clusters );
223             $cluster_centers = deep_copy_AoA( $new_cluster_centers );
224             }
225             }
226             die "\n\nThe constructor options you have chosen do not work with the data. Try\n" .
227             "turning off the Mahalanobis option if you are using it.\n"
228             unless defined $clusters;
229             $self->{_clusters} = $clusters;
230             $self->{_cluster_centers} = $cluster_centers;
231             $self->{_QoC_values}->{"$K"} = $QoC;
232             if ($self->{_terminal_output}) {
233             print "\nDisplaying final clusters for best K (= $K) :\n";
234             display_clusters( $clusters );
235             $self->display_cluster_centers( $clusters );
236             print "\nQoC value (the smaller, the better): $QoC\n";
237             }
238             }
239              
240             # For the smart try, we do initial cluster seeding by subjecting the data to
241             # principal components analysis in order to discover the direction of maximum
242             # variance in the data space. Subsequently, we try to find the K largest peaks along
243             # this direction. The coordinates of these peaks serve as the seeds for the K
244             # clusters.
245             sub cluster_for_fixed_K_single_smart_try {
246             my $self = shift;
247             my $K = shift;
248             my @all_data_ids = @{$self->{_data_id_tags}};
249             print "Clustering for K = $K\n" if $self->{_terminal_output};
250             my ($clusters, $cluster_centers);
251             eval {
252             ($clusters, $cluster_centers) = $self->cluster_for_given_K($K);
253             };
254             if ($@) {
255             die "In cluster_for_fixed_K_single_smart_try: insufficient data for clustering " .
256             "with $self->{_K} clusters --- $@";
257             }
258             my $QoC = $self->cluster_quality( $clusters, $cluster_centers );
259             $self->{_clusters} = $clusters;
260             $self->{_cluster_centers} = $cluster_centers;
261             $self->{_QoC_values}->{"$K"} = $QoC;
262             if ($self->{_terminal_output}) {
263             print "\nDisplaying final clusters for best K (= $K) :\n";
264             display_clusters( $clusters );
265             $self->display_cluster_centers( $clusters );
266             print "\nQoC value (the smaller, the better): $QoC\n";
267             }
268             }
269              
270             # The following subroutine is the top-level routine to call if you want the system to
271             # figure out on its own what value to use for K, the number of clusters. If you call
272             # the KMeans constructor with the K=0 option, the method below will try every
273             # possible value of K between 2 and the maximum possible depending on the number of
274             # data points available. For example, if the number of data points is 10,000, it will
275             # try all possible values of K between 2 and 70. For how the maximum value is set for
276             # K, see the comments made under Description. Note also how this method makes 20
277             # different tries for each value of K as a defense against the problem of the final
278             # result corresponding to some local minimum in the values of the QoC metric. Out of
279             # these 20 tries for each K, it retains the clusters and the cluster centers for only
280             # that try that yields the smallest value for the QoC metric. After estimating the
281             # "best" QoC values for all possible K in this manner, it then finds the K for which
282             # the QoC is the minimum. This is taken to be the best value for K. Finally, the
283             # output clusters are written out to separate files.
284             #
285             # If the KMeans constructor is invoked with the (Kmin, Kmax) options, then, instead
286             # of iterating through 2 and the maximum permissible value for K, the iterations are
287             # carried out only between Kmin and Kmax.
288             sub iterate_through_K {
289             my $self = shift;
290             my @all_data_ids = @{$self->{_data_id_tags}};
291             my $N = $self->{_N};
292             croak "You need more than 8 data samples. The number of data points must satisfy " .
293             "the relation N > 2xK**2 where K is the number of clusters. The smallest " .
294             "value for K is 2.\n" if $N <= 8;
295             my $K_statistical_max = int( sqrt( $N / 2.0 ) );
296             my $Kmin = $self->{_K_min} eq 'unknown'
297             ? 2
298             : $self->{_K_min};
299             my $Kmax = $self->{_K_max} eq 'unknown'
300             ? int( sqrt( $N / 2.0 ) )
301             : $self->{_K_max};
302             croak "\n\nYour Kmax value is too high. Ideally, it should not exceed sqrt(N/2) " .
303             "where N is the number of data points" if $Kmax > $K_statistical_max;
304             print "Value of Kmax is: $Kmax\n" if $self->{_terminal_output};
305             my @QoC_values;
306             my @array_of_clusters;
307             my @array_of_cluster_centers;
308             foreach my $K ($Kmin..$Kmax) {
309             my $QoC;
310             my $clusters;
311             my $cluster_centers;
312             print "Clustering for K = $K\n" if $self->{_terminal_output};
313             if ($self->{_cluster_seeding} eq 'random') {
314             foreach my $trial (1..21) {
315             print ". ";
316             my ($new_clusters, $new_cluster_centers);
317             if ($self->{_use_mahalanobis_metric}) {
318             eval {
319             ($new_clusters, $new_cluster_centers) = $self->cluster_for_given_K($K);
320             };
321             next if $@;
322             } else {
323             ($new_clusters, $new_cluster_centers) = $self->cluster_for_given_K($K);
324             }
325             my $newQoC = $self->cluster_quality( $new_clusters, $new_cluster_centers );
326             if ( (!defined $QoC) || ($newQoC < $QoC) ) {
327             $QoC = $newQoC;
328             $clusters = deep_copy_AoA( $new_clusters );
329             $cluster_centers = deep_copy_AoA( $new_cluster_centers );
330             }
331             }
332             print "\n";
333             } elsif ($self->{_cluster_seeding} eq 'smart') {
334             eval {
335             ($clusters, $cluster_centers) = $self->cluster_for_given_K($K);
336             };
337             if ($@) {
338             $Kmax = $K - 1;
339             last;
340             }
341             $QoC = $self->cluster_quality($clusters,$cluster_centers);
342             } else {
343             die "You must either choose 'smart' for cluster_seeding or 'random'. " .
344             "Fix your constructor call."
345             }
346             push @QoC_values, $QoC;
347             push @array_of_clusters, $clusters;
348             push @array_of_cluster_centers, $cluster_centers;
349             }
350             my ($min, $max) = minmax( \@QoC_values );
351             my $K_best_relative_to_Kmin = get_index_at_value($min, \@QoC_values );
352             my $K_best = $K_best_relative_to_Kmin + $Kmin;
353             if ($self->{_terminal_output}) {
354             print "\nDisplaying final clusters for best K (= $K_best) :\n";
355             display_clusters( $array_of_clusters[$K_best_relative_to_Kmin] );
356             $self->display_cluster_centers($array_of_clusters[$K_best_relative_to_Kmin]);
357             print "\nBest clustering achieved for K=$K_best with QoC = $min\n" if defined $min;
358             my @printableQoC = grep {$_} @QoC_values;
359             print "\nQoC values array (the smaller the value, the better it is) for different " .
360             "K starting with K=$Kmin: @printableQoC\n";
361             }
362             $self->{_K_best} = $K_best;
363             foreach my $i (0..@QoC_values-1) {
364             my $k = $i + $Kmin;
365             $self->{_QoC_values}->{"$k"} = $QoC_values[$i];
366             }
367             $self->{_clusters} = $array_of_clusters[$K_best_relative_to_Kmin];
368             $self->{_cluster_centers} =
369             $array_of_cluster_centers[$K_best_relative_to_Kmin];
370             }
371              
372             # This is the function to call if you already know what value you want to use for K,
373             # the number of expected clusters. The purpose of this function is to do the
374             # initialization of the cluster centers and to carry out the initial assignment of
375             # the data to the clusters with the initial cluster centers. The initialization
376             # consists of 3 steps: Construct a random sequence of K integers between 0 and N-1
377             # where N is the number of data points to be clustered; 2) Call
378             # get_initial_cluster_centers() to index into the data array with the random integers
379             # to get a list of K data points that would serve as the initial cluster centers; and
380             # (3) Call assign_data_to_clusters_initial() to assign the rest of the data to each
381             # of the K clusters on the basis of the proximity to the cluster centers.
382             sub cluster_for_given_K {
383             my $self = shift;
384             my $K = shift;
385             my @all_data_ids = @{$self->{_data_id_tags}};
386             my $cluster_centers;
387             if ($self->{_cluster_seeding} eq 'smart') {
388             $cluster_centers = $self->get_initial_cluster_centers_smart($K);
389             } elsif ($self->{_cluster_seeding} eq 'random') {
390             $cluster_centers = $self->get_initial_cluster_centers($K);
391             } else {
392             die "You must either choose 'smart' for cluster_seeding or 'random'. " .
393             "Fix your constructor call."
394             }
395             my $clusters;
396             if ($self->{_use_mahalanobis_metric}) {
397             my $clusters_and_determinants =
398             $self->assign_data_to_clusters_initial_mahalanobis($cluster_centers);
399             $clusters = $clusters_and_determinants->[0];
400             my @determinants = @{$clusters_and_determinants->[1]};
401             my ($min,$max) = minmax(\@determinants);
402             die "In cluster_for_given_K(): min determinant of covariance matrix for at " .
403             "least one cluster is too small" if $min / $max < 0.001;
404             } else {
405             $clusters = $self->assign_data_to_clusters_initial($cluster_centers);
406             }
407             my $cluster_nonexistent_flag = 0;
408             foreach my $trial (0..2) {
409             if ($self->{_use_mahalanobis_metric}) {
410             ($clusters, $cluster_centers) = $self->assign_data_to_clusters_mahalanobis($clusters, $K);
411             } else {
412             ($clusters, $cluster_centers) = $self->assign_data_to_clusters( $clusters, $K );
413             }
414             my $num_of_clusters_returned = @$clusters;
415             foreach my $cluster (@$clusters) {
416             $cluster_nonexistent_flag = 1 if ((!defined $cluster) || (@$cluster == 0));
417             }
418             last unless $cluster_nonexistent_flag;
419             }
420             return ($clusters, $cluster_centers);
421             }
422              
423             # This function is used when you set the "cluster_seeding" option to 'random' in the
424             # constructor. Returns a set of K random integers. These serve as indices to reach
425             # into the data array. A data element whose index is one of the random numbers
426             # returned by this routine serves as an initial cluster center. Note the quality
427             # check it runs on the list of K random integers constructed. We first make sure
428             # that all K random integers are different. Subsequently, we carry out a quality
429             # assessment of the K random integers constructed. This quality measure consists of
430             # the ratio of the values spanned by the random integers to the value of N, the total
431             # number of data points to be clustered. Currently, if this ratio is less than 0.3,
432             # we discard the K integers and try again.
433             sub initialize_cluster_centers {
434             my $self = shift;
435             my $K = shift;
436             my $data_store_size = $self->{_N};
437             my @cluster_center_indices;
438             while (1) {
439             foreach my $i (0..$K-1) {
440             $cluster_center_indices[$i] = int rand( $data_store_size );
441             next if $i == 0;
442             foreach my $j (0..$i-1) {
443             while ( $cluster_center_indices[$j] ==
444             $cluster_center_indices[$i] ) {
445             my $old = $cluster_center_indices[$i];
446             $cluster_center_indices[$i] = int rand($data_store_size);
447             }
448             }
449             }
450             my ($min,$max) = minmax(\@cluster_center_indices );
451             my $quality = ($max - $min) / $data_store_size;
452             last if $quality > 0.3;
453             }
454             return @cluster_center_indices;
455             }
456              
457             # This function is used when you set the "cluster_seeding" option to 'random' in the
458             # constructor. This routine merely reaches into the data array with the random
459             # integers, as constructed by the previous routine, serving as indices and fetching
460             # values corresponding to those indices. The fetched data samples serve as the
461             # initial cluster centers.
462             sub get_initial_cluster_centers {
463             my $self = shift;
464             my $K = shift;
465             my @cluster_center_indices = $self->initialize_cluster_centers($K);
466             my @result;
467             foreach my $i (@cluster_center_indices) {
468             my $tag = $self->{_data_id_tags}[$i];
469             push @result, $self->{_data}->{$tag};
470             }
471             return \@result;
472             }
473              
474             # This method is invoked when you choose the 'smart' option for the "cluster_seeding"
475             # option in the constructor. It subjects the data to a principal components analysis
476             # to figure out the direction of maximal variance. Subsequently, it tries to locate
477             # K peaks in a smoothed histogram of the data points projected onto the maximal
478             # variance direction.
479             sub get_initial_cluster_centers_smart {
480             my $self = shift;
481             my $K = shift;
482             if ($self->{_data_dimensions} == 1) {
483             my @one_d_data;
484             foreach my $j (0..$self->{_N}-1) {
485             my $tag = $self->{_data_id_tags}[$j];
486             push @one_d_data, $self->{_data}->{$tag}->[0];
487             }
488             my @peak_points = find_peak_points_in_given_direction(\@one_d_data,$K);
489             print "highest points at data values: @peak_points\n" if $self->{_debug};
490             my @cluster_centers;
491             foreach my $peakpoint (@peak_points) {
492             push @cluster_centers, [$peakpoint];
493             }
494             return \@cluster_centers;
495             }
496             my ($num_rows,$num_cols) = ($self->{_data_dimensions},$self->{_N});
497             my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
498             my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
499             # All the record labels are stored in the array $self->{_data_id_tags}. The
500             # actual data for clustering is stored in a hash at $self->{_data} whose keys are
501             # the record labels; the value associated with each key is the array holding the
502             # corresponding numerical multidimensional data.
503             foreach my $j (0..$num_cols-1) {
504             my $tag = $self->{_data_id_tags}[$j];
505             my $data = $self->{_data}->{$tag};
506             $matrix->set_col($j, $data);
507             }
508             if ($self->{_debug}) {
509             print "\nDisplaying the original data as a matrix:";
510             display_matrix( $matrix );
511             }
512             foreach my $j (0..$num_cols-1) {
513             $mean_vec += $matrix->col($j);
514             }
515             $mean_vec *= 1.0 / $num_cols;
516             if ($self->{_debug}) {
517             print "Displaying the mean vector for the data:";
518             display_matrix( $mean_vec );
519             }
520             foreach my $j (0..$num_cols-1) {
521             my @new_col = ($matrix->col($j) - $mean_vec)->as_list;
522             $matrix->set_col($j, \@new_col);
523             }
524             if ($self->{_debug}) {
525             print "Displaying mean subtracted data as a matrix:";
526             display_matrix( $matrix );
527             }
528             my $transposed = transpose( $matrix );
529             if ($self->{_debug}) {
530             print "Displaying transposed data matrix:";
531             display_matrix( $transposed );
532             }
533             my $covariance = matrix_multiply( $matrix, $transposed );
534             $covariance *= 1.0 / $num_cols;
535             if ($self->{_debug}) {
536             print "\nDisplaying the Covariance Matrix for your data:";
537             display_matrix( $covariance );
538             }
539             my ($eigenvalues, $eigenvectors) = $covariance->eigenpair;
540             my $num_of_eigens = @$eigenvalues;
541             my $largest_eigen_index = 0;
542             my $smallest_eigen_index = 0;
543             print "Eigenvalue 0: $eigenvalues->[0]\n" if $self->{_debug};
544             foreach my $i (1..$num_of_eigens-1) {
545             $largest_eigen_index = $i if $eigenvalues->[$i] > $eigenvalues->[$largest_eigen_index];
546             $smallest_eigen_index = $i if $eigenvalues->[$i] < $eigenvalues->[$smallest_eigen_index];
547             print "Eigenvalue $i: $eigenvalues->[$i]\n" if $self->{_debug};
548             }
549             print "\nlargest eigen index: $largest_eigen_index\n" if $self->{_debug};
550             print "\nsmallest eigen index: $smallest_eigen_index\n\n" if $self->{_debug};
551             foreach my $i (0..$num_of_eigens-1) {
552             my @vec = $eigenvectors->[$i]->as_list;
553             print "Eigenvector $i: @vec\n" if $self->{_debug};
554             }
555             my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list;
556             print "\nLargest eigenvector: @largest_eigen_vec\n" if $self->{_debug};
557             my @max_var_direction;
558             # Each element of the array @largest_eigen_vec is a Math::Complex object
559             foreach my $k (0..@largest_eigen_vec-1) {
560             my ($mag, $theta) = $largest_eigen_vec[$k] =~ /\[(\d*\.\d+),(\S+)\]/;
561             if ($theta eq '0') {
562             $max_var_direction[$k] = $mag;
563             } elsif ($theta eq 'pi') {
564             $max_var_direction[$k] = -1.0 * $mag;
565             } else {
566             die "eigendecomposition of covariance matrix produced a complex " .
567             "eigenvector --- something is wrong";
568             }
569             }
570             print "\nMaximum Variance Direction: @max_var_direction\n\n" if $self->{_debug};
571             # We now project all data points on the largest eigenvector.
572             # Each projection will yield a single point on the eigenvector.
573             my @projections;
574             foreach my $j (0..$self->{_N}-1) {
575             my $tag = $self->{_data_id_tags}[$j];
576             my $data = $self->{_data}->{$tag};
577             die "Dimensionality of the largest eigenvector does not "
578             . "match the dimensionality of the data"
579             unless @max_var_direction == $self->{_data_dimensions};
580             my $projection = vector_multiply($data, \@max_var_direction);
581             push @projections, $projection;
582             }
583             print "All projection points: @projections\n" if $self->{_debug};
584             my @peak_points = find_peak_points_in_given_direction(\@projections, $K);
585             print "highest points at points along largest eigenvec: @peak_points\n" if $self->{_debug};
586             my @cluster_centers;
587             foreach my $peakpoint (@peak_points) {
588             my @actual_peak_coords = map {$peakpoint * $_} @max_var_direction;
589             push @cluster_centers, \@actual_peak_coords;
590             }
591             return \@cluster_centers;
592             }
593              
# This method is invoked when you choose the 'smart' option for the "cluster_seeding"
# option in the constructor. It is called by the previous method to locate K peaks
# in a smoothed histogram of the data points projected onto the maximal variance
# direction.
#
# Args:    $dataref  -- ref to the array of 1-D projection values
#          $how_many -- how many peaks (K) to return
# Returns: a list of up to $how_many peak locations expressed in the original
#          (unscaled) coordinates along the projection axis.
sub find_peak_points_in_given_direction {
    my $dataref = shift;
    my $how_many = shift;
    my @data = @$dataref;
    my ($min, $max) = minmax(\@data);
    my @sorted_data = sort {$a <=> $b} @data;
    my $scale = $max - $min;
    # Guard against the degenerate case in which every projection coincides
    # (the original divided by zero here):
    $scale = 1 if $scale == 0;
    # Normalize ALL the points into the unit interval [0,1].  BUG FIX: the
    # original loop ran only to $#sorted_data-1, which left the largest value
    # unscaled and therefore binned into a nonexistent histogram cell.
    foreach my $index (0..$#sorted_data) {
        $sorted_data[$index] = ($sorted_data[$index] - $min) / $scale;
    }
    # Build a 1000-cell histogram; each data point votes for its own cell and
    # for $smoothness cells on either side, which smooths the histogram:
    my $delta = 1.0 / 1000.0;
    my @accumulator = (0) x 1000;
    foreach my $index (0..$#sorted_data) {
        my $cell_index = int($sorted_data[$index] / $delta);
        my $smoothness = 40;
        for my $cell ($cell_index-$smoothness..$cell_index+$smoothness) {
            next if $cell < 0 || $cell > 999;
            $accumulator[$cell]++;
        }
    }
    my $peaks_array = non_maximum_suppression( \@accumulator );
    my $peaks_index_hash = get_value_index_hash( $peaks_array );
    # Walk the surviving peaks from highest to lowest and keep the first
    # $how_many, mapping each histogram cell back to data coordinates:
    my @K_highest_peak_locations;
    my $k = 0;
    foreach my $peak (sort {$b <=> $a} keys %$peaks_index_hash) {
        my $unscaled_peak_point = $min + $peaks_index_hash->{$peak} * $scale * $delta;
        push @K_highest_peak_locations, $unscaled_peak_point
            if $k < $how_many;
        last if ++$k == $how_many;
    }
    return @K_highest_peak_locations;
}
636              
# Forms the initial clusters by assigning every data sample to whichever of
# the initial cluster centers (produced by the seeding routine) lies closest
# to it in the Euclidean sense.  Returns a ref to an array of clusters, each
# cluster being an array of symbolic record tags.
sub assign_data_to_clusters_initial {
    my ($self, $centers_ref) = @_;
    my @centers = @$centers_ref;
    my @clusters;
    foreach my $tag (@{$self->{_data_id_tags}}) {
        my @distances = map { $self->distance($tag, $_) } @centers;
        my ($closest_dist, $closest_index) = minimum( \@distances );
        push @{$clusters[$closest_index]}, $tag;
    }
    return \@clusters;
}
655              
# Same as assign_data_to_clusters_initial() except that, because the
# Mahalanobis metric needs a covariance matrix for every cluster, this
# version also estimates each initial cluster's covariance and returns the
# determinants of those matrices alongside the clusters:
# [\@clusters, \@determinants].
sub assign_data_to_clusters_initial_mahalanobis {
    my ($self, $centers_ref) = @_;
    my @centers = @$centers_ref;
    my @clusters;
    foreach my $tag (@{$self->{_data_id_tags}}) {
        my @distances = map { $self->distance($tag, $_) } @centers;
        my ($closest_dist, $closest_index) = minimum( \@distances );
        push @{$clusters[$closest_index]}, $tag if defined $closest_index;
    }
    # A cluster center need not coincide with any data sample, so one of the
    # slots in @clusters may have received no members at all.  Say there are
    # five centers: $closest_index may have populated slots 0, 1, 2 and 4,
    # leaving $clusters[3] undefined.  Verify every slot before proceeding.
    my @determinants;
    foreach my $cluster (@clusters) {
        die "The clustering program started with bad initialization. Please start over"
            unless defined $cluster;
        my $covariance = $self->estimate_cluster_covariance($cluster);
        push @determinants, $covariance->det();
    }
    return [\@clusters, \@determinants];
}
686              
# This is the main routine that along with the update_cluster_centers() routine
# constitute the two key steps of the K-Means algorithm. In most cases, the infinite
# while() loop will terminate automatically when the cluster assignments of the data
# points remain unchanged. For the sake of safety, we keep track of the number of
# iterations. If this number reaches 100, we exit the while() loop anyway. In most
# cases, this limit will not be reached.
#
# Args:    $clusters -- ref to array of arrays of symbolic record tags
#          $K        -- the desired number of clusters
# Returns: ($clusters, $final_cluster_centers) after convergence.
sub assign_data_to_clusters {
    my $self = shift;
    my $clusters = shift;
    my $K = shift;
    my $final_cluster_centers;
    my $iteration_index = 0;
    while (1) {
        my $new_clusters;
        my $assignment_changed_flag = 0;
        my $current_cluster_center_index = 0;
        my $cluster_size_zero_condition = 0;
        my $how_many = @$clusters;
        # Recompute the centers from the current assignment; the deep copy
        # keeps update_cluster_centers() from mutating $clusters itself:
        my $cluster_centers = $self->update_cluster_centers( deep_copy_AoA_with_nulls( $clusters ) );
        $iteration_index++;
        foreach my $cluster (@$clusters) {
            my $current_cluster_center = $cluster_centers->[$current_cluster_center_index];
            foreach my $ele (@$cluster) {
                # Find the center closest to this element:
                my @dist_from_clust_centers;
                foreach my $center (@$cluster_centers) {
                    push @dist_from_clust_centers,
                        $self->distance($ele, $center);
                }
                my ($min, $best_center_index) = minimum( \@dist_from_clust_centers );
                my $best_cluster_center = $cluster_centers->[$best_center_index];
                if (vector_equal($current_cluster_center, $best_cluster_center)){
                    # The element stays in its current cluster:
                    push @{$new_clusters->[$current_cluster_center_index]}, $ele;
                } else {
                    # The element migrates to a different cluster:
                    $assignment_changed_flag = 1;
                    push @{$new_clusters->[$best_center_index]}, $ele;
                }
            }
            $current_cluster_center_index++;
        }
        # If reassignment dropped a cluster entirely, try another iteration:
        next if ((@$new_clusters != @$clusters) && ($iteration_index < 100));
        # Now make sure that none of the K clusters is an empty cluster:
        foreach my $newcluster (@$new_clusters) {
            $cluster_size_zero_condition = 1 if ((!defined $newcluster) or (@$newcluster == 0));
        }
        # Pad with undefs so that @$new_clusters has exactly $K slots:
        push @$new_clusters, (undef) x ($K - @$new_clusters) if @$new_clusters < $K;
        # During clustering for a fixed K, should a cluster inadvertently
        # become empty, steal a member from the largest cluster to hopefully
        # spawn a new cluster:
        my $largest_cluster;
        foreach my $local_cluster (@$new_clusters) {
            next if !defined $local_cluster;
            $largest_cluster = $local_cluster if !defined $largest_cluster;
            if (@$local_cluster > @$largest_cluster) {
                $largest_cluster = $local_cluster;
            }
        }
        foreach my $local_cluster (@$new_clusters) {
            if ( (!defined $local_cluster) || (@$local_cluster == 0) ) {
                push @$local_cluster, pop @$largest_cluster;
            }
        }
        next if (($cluster_size_zero_condition) && ($iteration_index < 100));
        last if $iteration_index == 100;
        # Accept the new assignment and check for convergence:
        $clusters = deep_copy_AoA( $new_clusters );
        last if $assignment_changed_flag == 0;
    }
    $final_cluster_centers = $self->update_cluster_centers( $clusters );
    return ($clusters, $final_cluster_centers);
}
756              
# Mahalanobis-metric counterpart of assign_data_to_clusters().  On every pass
# the cluster centers AND covariances are re-estimated, and each data element
# is reassigned to the cluster whose center is nearest in the Mahalanobis
# sense.  The loop terminates when the assignments stop changing, or after
# 100 iterations, whichever comes first.
sub assign_data_to_clusters_mahalanobis {
    my $self = shift;
    my $clusters = shift;
    my $K = shift;
    my $final_cluster_centers;
    my $iteration_index = 0;
    while (1) {
        my $new_clusters;
        my $assignment_changed_flag = 0;
        my $current_cluster_center_index = 0;
        my $cluster_size_zero_condition = 0;
        my $how_many = @$clusters;
        my $cluster_centers_and_covariances =
            $self->update_cluster_centers_and_covariances_mahalanobis(deep_copy_AoA_with_nulls($clusters));
        my $cluster_centers = $cluster_centers_and_covariances->[0];
        my $cluster_covariances = $cluster_centers_and_covariances->[1];
        $iteration_index++;
        foreach my $cluster (@$clusters) {
            my $current_cluster_center = $cluster_centers->[$current_cluster_center_index];
            my $current_cluster_covariance = $cluster_covariances->[$current_cluster_center_index];
            foreach my $ele (@$cluster) {
                # Compute the Mahalanobis distance from $ele to every cluster.
                # A cluster whose covariance cannot be inverted is skipped; we
                # record the surviving cluster indices in a parallel array so
                # that a skipped cluster cannot shift the position returned by
                # minimum() onto the wrong cluster.  (BUG FIX: the original
                # pushed into a single array, so a failed eval for cluster $i
                # misaligned $best_center_index with the cluster numbering.)
                my @mahalanobis_dist_from_clust_centers;
                my @candidate_cluster_indices;
                foreach my $i (0..@$clusters-1) {
                    my $center = $cluster_centers->[$i];
                    my $covariance = $cluster_covariances->[$i];
                    my $maha_distance;
                    eval {
                        $maha_distance = $self->distance_mahalanobis($ele, $center, $covariance);
                    };
                    next if $@;
                    push @mahalanobis_dist_from_clust_centers, $maha_distance;
                    push @candidate_cluster_indices, $i;
                }
                my ($min, $min_pos) = minimum( \@mahalanobis_dist_from_clust_centers );
                die "The Mahalanobis metric may not be appropriate for the data"
                    unless defined $min_pos;
                my $best_center_index = $candidate_cluster_indices[$min_pos];
                my $best_cluster_center = $cluster_centers->[$best_center_index];
                if (vector_equal($current_cluster_center, $best_cluster_center)){
                    push @{$new_clusters->[$current_cluster_center_index]}, $ele;
                } else {
                    $assignment_changed_flag = 1;
                    push @{$new_clusters->[$best_center_index]}, $ele;
                }
            }
            $current_cluster_center_index++;
        }
        next if ((@$new_clusters != @$clusters) && ($iteration_index < 100));
        # Now make sure that none of the K clusters is an empty cluster:
        foreach my $newcluster (@$new_clusters) {
            $cluster_size_zero_condition = 1 if ((!defined $newcluster) or (@$newcluster == 0));
        }
        push @$new_clusters, (undef) x ($K - @$new_clusters) if @$new_clusters < $K;
        # During clustering for a fixed K, should a cluster inadvertently
        # become empty, steal a member from the largest cluster to hopefully
        # spawn a new cluster:
        my $largest_cluster;
        foreach my $local_cluster (@$new_clusters) {
            next if !defined $local_cluster;
            $largest_cluster = $local_cluster if !defined $largest_cluster;
            if (@$local_cluster > @$largest_cluster) {
                $largest_cluster = $local_cluster;
            }
        }
        foreach my $local_cluster (@$new_clusters) {
            if ( (!defined $local_cluster) || (@$local_cluster == 0) ) {
                push @$local_cluster, pop @$largest_cluster;
            }
        }
        next if (($cluster_size_zero_condition) && ($iteration_index < 100));
        last if $iteration_index == 100;
        # Now do a deep copy of new_clusters into clusters
        $clusters = deep_copy_AoA( $new_clusters );
        last if $assignment_changed_flag == 0;
    }
    $final_cluster_centers = $self->update_cluster_centers( $clusters );
    return ($clusters, $final_cluster_centers);
}
833              
# Mahalanobis analog of update_cluster_centers(): recomputes, for every
# cluster, both its center (the coordinate-wise mean of its members) and its
# covariance matrix.  Returns a ref to a two-element array:
# [\@new_cluster_centers, \@new_cluster_covariances].
sub update_cluster_centers_and_covariances_mahalanobis {
    my $self = shift;
    my @clusters = @{ shift @_ };
    my @new_cluster_centers;
    my @new_cluster_covariances;
    # During clustering for a fixed K, should a cluster inadvertently become empty,
    # steal a member from the largest cluster to hopefully spawn a new cluster:
    my $largest_cluster;
    foreach my $cluster (@clusters) {
        next if !defined $cluster;
        $largest_cluster = $cluster if !defined $largest_cluster;
        if (@$cluster > @$largest_cluster) {
            $largest_cluster = $cluster;
        }
    }
    foreach my $cluster (@clusters) {
        if ( (!defined $cluster) || (@$cluster == 0) ) {
            push @$cluster, pop @$largest_cluster;
        }
    }
    foreach my $cluster (@clusters) {
        die "Cluster became empty --- untenable condition " .
            "for a given K. Try again. \n" if !defined $cluster;
        my $cluster_size = @$cluster;
        die "Cluster size is zero --- untenable.\n" if $cluster_size == 0;
        # The new center is the coordinate-wise mean of the cluster's members:
        my @new_cluster_center = @{$self->add_point_coords( $cluster )};
        @new_cluster_center = map {my $x = $_/$cluster_size; $x} @new_cluster_center;
        push @new_cluster_centers, \@new_cluster_center;
        # for covariance calculation:
        my ($num_rows,$num_cols) = ($self->{_data_dimensions}, scalar(@$cluster));
        my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
        # NOTE(review): $mean_vec appears unused below -- confirm before removing.
        my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
        # All the record labels are stored in the array $self->{_data_id_tags}. The
        # actual data for clustering is stored in a hash at $self->{_data} whose keys are
        # the record labels; the value associated with each key is the array holding the
        # corresponding numerical multidimensional data.
        foreach my $j (0..$num_cols-1) {
            my $tag = $cluster->[$j];
            my $data = $self->{_data}->{$tag};
            my @diff_from_mean = vector_subtract($data, \@new_cluster_center);
            $matrix->set_col($j, \@diff_from_mean);
        }
        # Covariance = (1/N) * M * M', where column j of M is (x_j - mean):
        my $transposed = transpose( $matrix );
        my $covariance = matrix_multiply( $matrix, $transposed );
        $covariance *= 1.0 / $num_cols;
        if ($self->{_debug}) {
            print "\nDisplaying the Covariance Matrix for cluster:";
            display_matrix( $covariance );
        }
        push @new_cluster_covariances, $covariance;
    }
    return [\@new_cluster_centers, \@new_cluster_covariances];
}
887              
# After each new assignment of the data points to the clusters on the basis of
# the current values for the cluster centers, this routine recomputes the
# center of every cluster as the coordinate-wise mean of its members.
sub update_cluster_centers {
    my ($self, $clusters_ref) = @_;
    my @clusters = @$clusters_ref;
    # During clustering for a fixed K, should a cluster inadvertently become
    # empty, steal a member from the largest cluster to hopefully spawn a new
    # cluster.  First locate the largest cluster:
    my $largest_cluster;
    for my $cluster (@clusters) {
        next unless defined $cluster;
        $largest_cluster = $cluster
            if !defined($largest_cluster) || @$cluster > @$largest_cluster;
    }
    # Then donate one of its members to every empty (or undefined) cluster:
    for my $cluster (@clusters) {
        if ( !defined($cluster) || @$cluster == 0 ) {
            push @$cluster, pop @$largest_cluster;
        }
    }
    my @new_cluster_centers;
    for my $cluster (@clusters) {
        die "Cluster became empty --- untenable condition " .
            "for a given K. Try again. \n" if !defined $cluster;
        my $cluster_size = @$cluster;
        die "Cluster size is zero --- untenable.\n" if $cluster_size == 0;
        # The center is the sum of the member coordinates divided by the count:
        my @center = map { $_ / $cluster_size } @{$self->add_point_coords( $cluster )};
        push @new_cluster_centers, \@center;
    }
    return \@new_cluster_centers;
}
922              
# Returns the ID of the cluster whose center is closest, in the Euclidean
# sense, to a new data element supplied as a ref to an array of coordinates.
sub which_cluster_for_new_data_element {
    my ($self, $ele) = @_;
    die "The dimensionality of the new data element is not correct: $!"
        unless @$ele == $self->{_data_dimensions};
    my %dist_for;
    for my $cluster_id (sort keys %{$self->{_cluster_centers_hash}}) {
        $dist_for{$cluster_id} =
            $self->distance2($ele, $self->{_cluster_centers_hash}->{$cluster_id});
    }
    my @all_distances = values %dist_for;
    my ($min, $max) = minmax(\@all_distances);
    # Report whichever cluster achieved the minimum distance:
    my $answer;
    for my $cluster_id (keys %dist_for) {
        $answer = $cluster_id if $dist_for{$cluster_id} == $min;
    }
    return $answer;
}
941              
# Mahalanobis version of which_cluster_for_new_data_element(): returns the ID
# of the cluster nearest to the new element when distances are measured with
# each cluster's own covariance matrix.
sub which_cluster_for_new_data_element_mahalanobis {
    my ($self, $ele) = @_;
    die "The dimensionality of the new data element is not correct: $!"
        unless @$ele == $self->{_data_dimensions};
    my %dist_for;
    for my $cluster_id (sort keys %{$self->{_cluster_centers_hash}}) {
        $dist_for{$cluster_id} =
            $self->distance_mahalanobis2($ele, $self->{_cluster_centers_hash}->{$cluster_id},
                                         $self->{_cluster_covariances_hash}->{$cluster_id});
    }
    my @all_distances = values %dist_for;
    my ($min, $max) = minmax(\@all_distances);
    # Report whichever cluster achieved the minimum distance:
    my $answer;
    for my $cluster_id (keys %dist_for) {
        $answer = $cluster_id if $dist_for{$cluster_id} == $min;
    }
    return $answer;
}
961              
# The following function returns the value of QoC for a given partitioning of the
# data into K clusters. It calculates two things: the average value for the distance
# between a data point and the center of the cluster in which the data point resides,
# and the average value for the distances between the cluster centers. We obviously
# want to minimize the former and maximize the latter. All of the "from center"
# distances within each cluster are stored in the variable
# $sum_of_distances_for_one_cluster. When this variable, after it is divided by the
# number of data elements in the cluster, is summed over all the clusters, we get a
# value that is stored in $cluster_radius. The inter-cluster-center distances
# are summed in the variable $inter_cluster_center_dist.
sub cluster_quality {
    my $self = shift;
    my $clusters = shift;
    my $cluster_centers = shift;
    my $K = @$cluster_centers;      # Number of clusters
    # With fewer than two clusters there are no inter-center distances and
    # the divisor below would be zero:
    die "cluster_quality() requires at least two clusters" if $K < 2;
    my $cluster_radius = 0;
    foreach my $i (0..@$clusters-1) {
        my $sum_of_distances_for_one_cluster = 0;
        foreach my $ele (@{$clusters->[$i]}) {
            $sum_of_distances_for_one_cluster +=
                $self->distance( $ele, $cluster_centers->[$i] );
        }
        $cluster_radius +=
            $sum_of_distances_for_one_cluster / @{$clusters->[$i]};
    }
    # Sum the distances between the K*(K-1)/2 DISTINCT pairs of cluster
    # centers.  (BUG FIX: the original iterated over all ordered pairs,
    # counting every distance twice and including the zero self-distances,
    # while still dividing by K*(K-1)/2 -- doubling the average.  The factor
    # is uniform in K, so the relative ordering of QoC values is preserved.)
    my $inter_cluster_center_dist = 0;
    foreach my $i (0..@$cluster_centers-1) {
        foreach my $j ($i+1..@$cluster_centers-1) {
            $inter_cluster_center_dist +=
                $self->distance2( $cluster_centers->[$i],
                                  $cluster_centers->[$j] );
        }
    }
    my $avg_inter_cluster_center_dist = $inter_cluster_center_dist /
                                        ( $K * ($K-1) / 2.0 );
    return $cluster_radius / $avg_inter_cluster_center_dist;
}
999              
# Computes the Euclidean distance between a data point specified by its
# symbolic name in the master datafile and a point (such as the center of a
# cluster) expressed as a vector of coordinates.
sub distance {
    my ($self, $ele1_id, $point_ref) = @_;
    my @point1 = @{$self->{_data}->{$ele1_id}};   # look up the named sample
    my @point2 = @$point_ref;
    die "wrong data types for distance calculation\n" if @point1 != @point2;
    my $squared_sum = 0;
    $squared_sum += ($point1[$_] - $point2[$_])**2 for 0..$#point1;
    return sqrt $squared_sum;
}
1017              
# Same as distance() above, except that both points are supplied directly as
# refs to arrays of coordinates (no symbolic-name lookup).
sub distance2 {
    my ($self, $ref1, $ref2) = @_;
    my @point1 = @$ref1;
    my @point2 = @$ref2;
    die "wrong data types for distance calculation\n" if @point1 != @point2;
    my $squared_sum = 0;
    $squared_sum += ($point1[$_] - $point2[$_])**2 for 0..$#point1;
    return sqrt $squared_sum;
}
1032              
# Requires three args: $ele for the symbolic tag of the element, $cluster_center
# for the coordinates of the center of the cluster, and $cluster_covariance for
# the covariance of the cluster.  Returns the Mahalanobis distance of the
# element from the cluster whose mean and covariance are provided.  Dies when
# the covariance matrix is (near) singular, since the metric needs its inverse.
sub distance_mahalanobis {
    my $self = shift;
    my $ele = shift;
    my $cluster_center = shift;
    my $cluster_covariance = shift;
    # Compute the determinant once.  (The original called det() three times
    # and computed the determinant of the inverse without ever using it.)
    my $det_of_covar = $cluster_covariance->det();
    die "covariance matrix may not have an inverse" unless $det_of_covar > 0.001;
    my $covariance_inverse = $cluster_covariance->inverse;
    my $ele_data = $self->{_data}->{$ele};
    my @diff_from_mean = vector_subtract($ele_data, $cluster_center);
    my $vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
    $vec->set_col(0, \@diff_from_mean);
    my $transposed = transpose( $vec );
    # Mahalanobis distance: sqrt( (x - mu)' * Sigma^{-1} * (x - mu) )
    my $distance = $transposed * $covariance_inverse * $vec;
    my @distance = $distance->as_list;
    return sqrt $distance[0];
}
# Same as the previous method except that the first argument is the actual
# coordinate array of the data element (a ref), not its symbolic tag.  Returns
# the Mahalanobis distance from the element to a cluster whose center and
# covariance are known; returns undef when no covariance is supplied, and dies
# when the covariance matrix is (near) singular.
sub distance_mahalanobis2 {
    my $self = shift;
    my $ele = shift;                 # ref to the array of coords for a point
    my $cluster_center = shift;
    my $cluster_covariance = shift;
    return undef unless defined $cluster_covariance;
    # Compute the determinant once.  (The original called det() three times
    # and computed the determinant of the inverse without ever using it.)
    my $det_of_covar = $cluster_covariance->det();
    die "covariance matrix may not have an inverse" unless $det_of_covar > 0.001;
    my $covariance_inverse = $cluster_covariance->inverse;
    my @diff_from_mean = vector_subtract($ele, $cluster_center);
    my $vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
    $vec->set_col(0, \@diff_from_mean);
    my $transposed = transpose( $vec );
    # Mahalanobis distance: sqrt( (x - mu)' * Sigma^{-1} * (x - mu) )
    my $distance = $transposed * $covariance_inverse * $vec;
    my @distance = $distance->as_list;
    return sqrt $distance[0];
}
1087              
# Estimates the covariance matrix of a single cluster.  The cluster is given
# as a ref to an array of symbolic record tags; the corresponding coordinate
# vectors live in the hash at $self->{_data}.  Returns a d x d
# Math::GSL::Matrix, where d is the data dimensionality.
sub estimate_cluster_covariance {
    my ($self, $cluster) = @_;
    my $cluster_size = @$cluster;
    # The cluster mean is the coordinate-wise average of the members:
    my @cluster_center = map { $_ / $cluster_size } @{$self->add_point_coords( $cluster )};
    my ($num_rows, $num_cols) = ($self->{_data_dimensions}, scalar(@$cluster));
    # Column j of $matrix holds the mean-centered coordinates of member j:
    my $matrix = Math::GSL::Matrix->new($num_rows, $num_cols);
    for my $col (0..$num_cols-1) {
        my $tag = $cluster->[$col];
        my @centered = vector_subtract($self->{_data}->{$tag}, \@cluster_center);
        $matrix->set_col($col, \@centered);
    }
    # Covariance = (1/N) * M * M' :
    my $covariance = $matrix * transpose($matrix);
    $covariance *= 1.0 / $num_cols;
    if ($self->{_debug}) {
        print "\nDisplaying the Covariance Matrix for cluster:";
        display_matrix( $covariance );
    }
    return $covariance;
}
1117              
# Writes each cluster to its own file (cluster0.txt, cluster1.txt, ...), each
# file containing the symbolic record tags of that cluster's members, piped
# through the system sort.  NOTE(review): the tags are printed space-separated
# with no newlines, so sort sees a single line -- confirm whether per-line
# output was intended.
sub write_clusters_to_files {
    my $self = shift;
    my @clusters = @{$self->{_clusters}};
    # Remove the output of any previous run so that stale files cannot survive
    # when the current run produces fewer clusters.  (BUG FIX: the original
    # unlinked only "cluster*.dat" although the files written below are
    # "cluster*.txt"; the .dat pattern is kept for legacy cleanup.)
    unlink glob "cluster*.dat";
    unlink glob "cluster*.txt";
    foreach my $i (0..@clusters-1) {
        my $filename = "cluster" . $i . ".txt";
        print "\nWriting cluster $i to file $filename\n" if $self->{_terminal_output};
        # Lexical filehandle + three-arg pipe open instead of a bareword handle:
        open my $fh, '|-', "sort > $filename" or die "Unable to open file: $!";
        foreach my $ele (@{$clusters[$i]}) {
            print {$fh} "$ele ";
        }
        # A write pipe can fail at close time; check it:
        close $fh or die "Unable to close pipe to sort: $!";
    }
}
1132              
# Accessor for the best K found by an automatic (K=0) clustering run; croaks
# if no such run has been made yet.
sub get_K_best {
    my $self = shift;
    my $k_best = $self->{_K_best};
    croak "You need to run the clusterer with K=0 option " .
          "before you can call this method" if $k_best eq 'unknown';
    print "\nThe best value of K: $k_best\n" if $self->{_terminal_output};
    return $k_best;
}
1140              
# Prints the QoC value recorded for each K tried during an automatic (K=0)
# run; the smaller the QoC, the better the clustering for that K.
sub show_QoC_values {
    my $self = shift;
    croak "\nYou need to run the clusterer with K=0 option before you can call this method"
        if $self->{_K_best} eq 'unknown';
    print "\nShown below are K on the left and the QoC values on the right (the smaller " .
          "the QoC, the better it is)\n";
    my $qoc_of = $self->{_QoC_values};
    for my $k (sort keys %$qoc_of) {
        print " $k => $qoc_of->{$k}\n" if defined $qoc_of->{$k};
    }
}
1151              
# Destructor: removes the temporary files derived from the user's data file.
sub DESTROY {
    my $self = shift;
    my $base = basename($self->{_datafile});
    unlink "__temp_$base", "__temp_data_$base", "__temp_normed_data_$base";
}
1157              
1158             ################################## Visualization Code ###################################
1159              
# Plot the clusters produced by kmeans().  The visualization mask selects
# which data coordinates to display: three or more '1' bits produce a 3D
# plot of the first three selected coordinates, exactly two produce a 2D
# plot, and 1-D data is rejected.  An optional second argument is a pause
# time in seconds; when absent the plot window persists and a PNG hardcopy
# is written to clustering_results.png.
#
# The numerical data for the clusters is written to a temp file with TWO
# blank lines between clusters -- that is how Gnuplot separates data blocks
# so each cluster can get its own point style.
sub visualize_clusters {
    my $self = shift;
    my ($v_mask, $pause_time);
    if (@_ == 1) {
        $v_mask = shift || croak "visualization mask missing";
    } elsif (@_ == 2) {
        $v_mask = shift || croak "visualization mask missing";
        $pause_time = shift;
    } else {
        croak "visualize_clusters() called with wrong args";
    }
    my $master_datafile = $self->{_datafile};
    my @v_mask = split //, $v_mask;
    my $visualization_mask_width = @v_mask;
    my @mask = split //, $self->{_mask};
    my $data_field_width = scalar grep {$_ eq '1'} @mask;
    croak "\n\nABORTED: The width of the visualization mask (including " .
          "all its 1s and 0s) must equal the width of the original mask " .
          "used for reading the data file (counting only the 1's)"
        if $visualization_mask_width != $data_field_width;
    my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
    # Project every record onto the coordinates selected by the mask:
    my %visualization_data;
    while ( my ($record_id, $data) = each %{$self->{_original_data}} ) {
        my @fields = @$data;
        croak "\nABORTED: Visualization mask size exceeds data record size\n"
            if $#v_mask > $#fields;
        my @data_fields;
        foreach my $i (0..@fields-1) {
            if ($v_mask[$i] eq '0') {
                next;
            } elsif ($v_mask[$i] eq '1') {
                push @data_fields, $fields[$i];
            } else {
                croak "Misformed visualization mask. It can only have 1s and 0s\n";
            }
        }
        $visualization_data{ $record_id } = \@data_fields;
    }
    my $K = scalar @{$self->{_clusters}};
    my $temp_file = "__temp_" . basename($master_datafile);
    unlink $temp_file if -e $temp_file;
    # Three-arg open with a lexical handle (the original used an unchecked
    # two-arg open on a bareword filehandle):
    open my $out, '>', $temp_file
        or die "Unable to open a temp file in this directory: $!\n";
    foreach my $cluster (@{$self->{_clusters}}) {
        foreach my $item (@$cluster) {
            print {$out} "@{$visualization_data{$item}}";
            print {$out} "\n";
        }
        print {$out} "\n\n";   # blank lines delimit Gnuplot data blocks
    }
    close $out;
    my ($plot, $hardcopy_plot);
    if (!defined $pause_time) {
        $plot = Graphics::GnuplotIF->new( persist => 1 );
        $hardcopy_plot = Graphics::GnuplotIF->new();
        $hardcopy_plot->gnuplot_cmd('set terminal png', "set output \"clustering_results.png\"");
    } else {
        $plot = Graphics::GnuplotIF->new();
    }
    $plot->gnuplot_cmd( "set noclip" );
    $plot->gnuplot_cmd( "set pointsize 2" );
    # One plot clause per cluster, each with its own line type and point type:
    my $arg_string = "";
    if ($visualization_data_field_width > 2) {
        foreach my $i (0..$K-1) {
            my $j = $i + 1;
            $arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster $i\" with points lt $j pt $j, ";
        }
    } elsif ($visualization_data_field_width == 2) {
        foreach my $i (0..$K-1) {
            my $j = $i + 1;
            $arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster $i\" with points lt $j pt $j, ";
        }
    } elsif ($visualization_data_field_width == 1 ) {
        foreach my $i (0..$K-1) {
            my $j = $i + 1;
            $arg_string .= "\"$temp_file\" index $i using 1 title \"Cluster $i\" with points lt $j pt $j, ";
        }
    }
    # Strip the trailing ", ".  The original assigned the boolean result of
    # a match to $arg_string and then recovered the string from $1, which
    # relies on a possibly-stale capture variable when the match fails.
    $arg_string =~ s/,[ ]+$//;
    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd( "splot $arg_string" );
        $hardcopy_plot->gnuplot_cmd( "splot $arg_string" ) unless defined $pause_time;
        $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd( "plot $arg_string" );
        $hardcopy_plot->gnuplot_cmd( "plot $arg_string" ) unless defined $pause_time;
        $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
    } elsif ($visualization_data_field_width == 1) {
        croak "No provision for plotting 1-D data\n";
    }
}
1279              
# Visualize the raw data (before clustering) in the 2D or 3D subspace
# selected by the visualization mask.  Call only AFTER read_data_from_file().
# The second argument must be the string 'original' or 'normed'; 'normed'
# shows the data after each dimension is divided by its standard deviation
# and requires that variance normalization was enabled in the constructor.
sub visualize_data {
    my $self = shift;
    my $v_mask = shift || croak "visualization mask missing";
    my $datatype = shift;    # must be either 'original' or 'normed'
    # (Fixed the duplicated "in the in the" of the original message.)
    croak "\n\nABORTED: You called visualize_data() for normed data " .
          "but without first turning on data normalization in the " .
          "KMeans constructor"
        if ($datatype eq 'normed') && ! $self->{_var_normalize};
    my $master_datafile = $self->{_datafile};
    my @v_mask = split //, $v_mask;
    my $visualization_mask_width = @v_mask;
    my @mask = split //, $self->{_mask};
    my $data_field_width = scalar grep {$_ eq '1'} @mask;
    croak "\n\nABORTED: The width of the visualization mask (including " .
          "all its 1s and 0s) must equal the width of the original mask " .
          "used for reading the data file (counting only the 1's)"
        if $visualization_mask_width != $data_field_width;
    my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
    # Pick the data source and temp-file name according to $datatype:
    my ($data_source, $temp_file);
    if ($datatype eq 'original') {
        $data_source = $self->{_original_data};
        $temp_file   = "__temp_data_" . basename($master_datafile);
    } elsif ($datatype eq 'normed') {
        $data_source = $self->{_data};
        $temp_file   = "__temp_normed_data_" . basename($master_datafile);
    } else {
        croak "\n\nABORTED: improper call to visualize_data()";
    }
    # Project every record onto the coordinates selected by the mask:
    my %visualization_data;
    while ( my ($record_id, $data) = each %{$data_source} ) {
        my @fields = @$data;
        croak "\nABORTED: Visualization mask size exceeds data record size\n"
            if $#v_mask > $#fields;
        my @data_fields;
        foreach my $i (0..@fields-1) {
            if ($v_mask[$i] eq '0') {
                next;
            } elsif ($v_mask[$i] eq '1') {
                push @data_fields, $fields[$i];
            } else {
                croak "Misformed visualization mask. It can only have 1s and 0s\n";
            }
        }
        $visualization_data{ $record_id } = \@data_fields;
    }
    unlink $temp_file if -e $temp_file;
    # Three-arg open with a lexical handle (the original used an unchecked
    # two-arg open on a bareword filehandle):
    open my $out, '>', $temp_file
        or die "Unable to open a temp file in this directory: $!\n";
    foreach my $datapoint (values %visualization_data) {
        print {$out} "@$datapoint";
        print {$out} "\n";
    }
    close $out;
    my $plot = Graphics::GnuplotIF->new( persist => 1 );
    $plot->gnuplot_cmd( "set noclip" );
    $plot->gnuplot_cmd( "set pointsize 2" );
    my $plot_title = $datatype eq 'original' ? '"data"' : '"normed data"';
    my $arg_string;
    if ($visualization_data_field_width > 2) {
        $arg_string = "\"$temp_file\" using 1:2:3 title $plot_title with points lt -1 pt 1";
    } elsif ($visualization_data_field_width == 2) {
        $arg_string = "\"$temp_file\" using 1:2 title $plot_title with points lt -1 pt 1";
    } elsif ($visualization_data_field_width == 1 ) {
        $arg_string = "\"$temp_file\" using 1 notitle with points lt -1 pt 1";
    }
    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd( "splot $arg_string" );
    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd( "plot $arg_string" );
    } elsif ($visualization_data_field_width == 1) {
        croak "No provision for plotting 1-D data\n";
    }
}
1372              
1373             ########################### Generating Synthetic Data for Clustering ##############################
1374              
# Generate synthetic multivariate-Gaussian data for clustering experiments.
# The mean and covariance of each Gaussian come from a parameter file (see
# param.txt in the examples directory); Math::Random requires each
# covariance matrix to be symmetric and positive definite.  Class method:
#   Algorithm::KMeans->cluster_data_generator(
#       input_parameter_file => ..., output_datafile => ...,
#       number_data_points_per_cluster => N );
sub cluster_data_generator {
    my $class = shift;
    croak "illegal call of a class method" unless $class eq 'Algorithm::KMeans';
    my %args = @_;
    my $input_parameter_file = $args{input_parameter_file};
    my $output_file = $args{output_datafile};
    my $N = $args{number_data_points_per_cluster};
    my @all_params;
    my $param_string;
    if (defined $input_parameter_file) {
        # Low-precedence 'or' so a failed open actually dies; the original's
        # '||' bound to the filename and silently ignored open failures.
        open my $param_fh, '<', $input_parameter_file
            or croak "unable to open parameter file: $!";
        @all_params = <$param_fh>;
        close $param_fh;
        @all_params = grep { $_ !~ /^[ ]*#/ } @all_params;   # drop comment lines
        chomp @all_params;
        $param_string = join ' ', @all_params;
    } else {
        # Just for testing. Used in t/test.t
        $param_string = "cluster 5 0 0 1 0 0 0 1 0 0 0 1 " .
                        "cluster 0 5 0 1 0 0 0 1 0 0 0 1 " .
                        "cluster 0 0 5 1 0 0 0 1 0 0 0 1";
    }
    my @cluster_strings = split /[ ]*cluster[ ]*/, $param_string;
    @cluster_strings = grep $_, @cluster_strings;
    my $K = @cluster_strings;
    croak "Too many clusters requested" if $K > 12;
    my @point_labels = ('a'..'z');
    print "Number of Gaussians used for the synthetic data: $K\n";
    my @means;
    my @covariances;
    my $data_dimension;
    foreach my $i (0..$K-1) {
        my @num_strings = split / /, $cluster_strings[$i];
        my @cluster_mean = map {/$_num_regex/;$_} split / /, $num_strings[0];
        $data_dimension = @cluster_mean;
        push @means, \@cluster_mean;
        my @covariance_nums = map {/$_num_regex/;$_} split / /, $num_strings[1];
        croak "dimensionality error" if @covariance_nums !=
                                        ($data_dimension ** 2);
        # Reshape the flat covariance list into a row-major 2-D array:
        my $cluster_covariance;
        foreach my $j (0..$data_dimension-1) {
            foreach my $k (0..$data_dimension-1) {
                $cluster_covariance->[$j]->[$k] =
                         $covariance_nums[$j*$data_dimension + $k];
            }
        }
        push @covariances, $cluster_covariance;
    }
    random_seed_from_phrase( 'hellojello' );
    my @data_dump;
    foreach my $i (0..$K-1) {
        my @m = @{shift @means};
        my @covar = @{shift @covariances};
        my @new_data = Math::Random::random_multivariate_normal( $N, @m, @covar );
        my $p = 0;
        my $label = $point_labels[$i];
        # Tag each point with a unique symbolic name like 'a0', 'a1', ...
        # The original incremented the outer loop variable $i here instead of
        # the counter $p it had just declared; $i is a read-only foreach
        # alias, so that would die at runtime.
        @new_data = map {unshift @$_, $label.$p; $p++; $_} @new_data;
        push @data_dump, @new_data;
    }
    # Shuffle so the clusters are interleaved in the output file:
    fisher_yates_shuffle( \@data_dump );
    open my $out, '>', $output_file
        or croak "unable to open output file $output_file: $!";
    foreach my $ele (@data_dump) {
        foreach my $coord ( @$ele ) {
            print {$out} "$coord ";
        }
        print {$out} "\n";
    }
    print "Data written out to file $output_file\n";
    close $out;
}
1452              
# Component-wise sum of the data vectors (from $self->{_data}) for the
# records named in the argument.  Arg: ref to an array of record IDs.
# Returns a ref to the sum vector, zero-initialized to the data
# dimensionality.
sub add_point_coords {
    my ($self, $ids) = @_;
    my @sum = (0.0) x $self->{_data_dimensions};
    for my $id (@$ids) {
        my $point = $self->{_data}->{$id};
        $sum[$_] += $point->[$_] for 0 .. $#{$point};
    }
    return \@sum;
}
1471              
# Same as add_point_coords() but sums vectors taken from
# $self->{_original_data} (the data before variance normalization).
sub add_point_coords_from_original_data {
    my ($self, $ids) = @_;
    my @sum = (0.0) x $self->{_data_dimensions};
    for my $id (@$ids) {
        my $point = $self->{_original_data}->{$id};
        $sum[$_] += $point->[$_] for 0 .. $#{$point};
    }
    return \@sum;
}
1490              
1491             ################################### Support Routines ########################################
1492              
# Linear search: return the position of the first element numerically equal
# to $value, or -1 if no element matches.
sub get_index_at_value {
    my ($value, $aref) = @_;
    for my $pos (0 .. $#{$aref}) {
        return $pos if $aref->[$pos] == $value;
    }
    return -1;
}
1501              
# Element-wise numeric equality of two vectors.  (A smartmatch `~~' version
# was once planned, but smartmatch was deprecated in Perl 5.18 and has since
# been removed, so the explicit comparison stays.)
sub vector_equal {
    my ($u, $v) = @_;
    die "wrong data types for vector-equal predicate\n" unless @$u == @$v;
    for my $k (0 .. $#{$u}) {
        return 0 unless $u->[$k] == $v->[$k];
    }
    return 1;
}
1513              
# Returns the minimum value and its positional index in an array ref.
sub minimum {
    my $aref = shift;
    my ($best, $where);
    my $pos = 0;
    for my $val (@$aref) {
        if ( !defined($best) || $val < $best ) {
            $best  = $val;
            $where = $pos;
        }
        $pos++;
    }
    return ($best, $where);
}
1527              
# Returns the (min, max) of an array ref, skipping undefined entries.
sub minmax {
    my $aref = shift;
    my ($lo, $hi);
    for my $val (@$aref) {
        next unless defined $val;
        if (!defined $lo) {          # first defined value seeds both bounds
            $lo = $hi = $val;
        } elsif ($val < $lo) {
            $lo = $val;
        } elsif ($val > $hi) {
            $hi = $val;
        }
    }
    return ($lo, $hi);
}
1545              
# Deep copy of an array of arrays.  Note: an empty input yields undef (not
# an empty array ref), matching the lazily-autovivified result.
sub deep_copy_AoA {
    my $src = shift;
    my $copy;
    for my $r (0 .. $#{$src}) {
        for my $c (0 .. $#{$src->[$r]}) {
            $copy->[$r][$c] = $src->[$r][$c];
        }
    }
    return $copy;
}
1557              
# Deep copy of an array of arrays where some top-level entries may be
# undef; undef entries are preserved as undef in the copy.
sub deep_copy_AoA_with_nulls {
    my $src = shift;
    my $copy;
    for my $r (0 .. $#{$src}) {
        unless (defined $src->[$r]) {
            $copy->[$r] = undef;
            next;
        }
        for my $c (0 .. $#{$src->[$r]}) {
            $copy->[$r][$c] = $src->[$r][$c];
        }
    }
    return $copy;
}
1574              
# Deep copy of a hash whose values are refs to arrays of numbers; each
# value array is duplicated via deep_copy_array().
sub deep_copy_hash {
    my $src = shift;
    my $copy;
    $copy->{$_} = deep_copy_array( $src->{$_} ) for keys %$src;
    return $copy;
}
1585              
# Copy of a flat array of numbers.  An empty input yields undef, matching
# the lazily-autovivified result of the original.
sub deep_copy_array {
    my $src = shift;
    my $copy;
    push @$copy, $_ for @$src;
    return $copy;
}
1595              
# Print each cluster's size and its center (the per-dimension mean of the
# cluster's points in the original, un-normalized data) to four decimals.
sub display_cluster_centers {
    my ($self, $clusters_ref) = @_;
    my $idx = 0;
    for my $cluster (@$clusters_ref) {
        my $n = scalar @$cluster;
        my $sum_ref = $self->add_point_coords_from_original_data( $cluster );
        my @center = map { $_ / $n } @$sum_ref;
        print "\ncluster $idx ($n records):\n";
        print "cluster center $idx: " .
              "@{[map {sprintf('%.4f', $_)} @center]}\n";
        $idx++;
    }
}
1611              
# Print each cluster's size and the sorted symbolic names of its members.
# NOTE: sorts each cluster array IN PLACE, which the caller observes.
sub display_clusters {
    my $clusters_ref = shift;
    my $idx = 0;
    for my $cluster (@$clusters_ref) {
        @$cluster = sort @$cluster;
        my $count = scalar @$cluster;
        print "\n\nCluster $idx ($count records):\n";
        print " $_" for @$cluster;
        $idx++;
    }
    print "\n\n";
}
1628              
# In-place Fisher-Yates shuffle of an array ref (the perlfaq idiom).
sub fisher_yates_shuffle {
    my $deck = shift;
    for (my $top = @$deck - 1; $top > 0; $top--) {
        my $pick = int rand($top + 1);
        ($deck->[$top], $deck->[$pick]) = ($deck->[$pick], $deck->[$top]);
    }
}
1638              
# Normalize each data dimension by its standard deviation.  Arg: ref to a
# hash mapping record labels to refs of numeric vectors.  Returns a ref to
# a new hash of the same shape with every component divided by the standard
# deviation of its dimension across all records.
#
# NOTE(review): a dimension with zero variance triggers a division-by-zero
# death here -- callers presumably guarantee non-constant data; confirm.
sub variance_normalization {
    my %data_hash = %{shift @_};
    my @all_data_points = values %data_hash;
    my $dimensions = @{$all_data_points[0]};
    # Transpose: collect the i-th component of every point into projection i.
    my @data_projections;
    foreach my $data_point (@all_data_points) {
        my $i = 0;
        foreach my $proj (@$data_point) {
            push @{$data_projections[$i++]}, $proj;
        }
    }
    my @variance_vec;
    foreach my $vec (@data_projections) {
        my ($mean, $variance) = mean_and_variance( $vec );
        push @variance_vec, $variance;
    }
    my %new_data_hash;
    while (my ($label, $data) = each(%data_hash) ) {
        my @new_data;
        foreach my $i (0..@{$data}-1) {
            # (The original computed this quotient into an unused lexical
            # before pushing the same expression; the dead line is removed.)
            push @new_data, $data->[$i] / sqrt($variance_vec[$i]);
        }
        $new_data_hash{$label} = \@new_data;
    }
    return \%new_data_hash;
}
1666              
# One-pass incremental mean and variance of a numeric array ref.  The
# recurrence updates the mean first and divides the squared deviation by
# (n-1), which works out to the population (divide-by-N) variance.  An
# empty input returns (undef, undef).
sub mean_and_variance {
    my @samples = @{shift @_};
    my ($mu, $var);
    if (@samples) {
        ($mu, $var) = ($samples[0], 0);
        for my $n (2 .. scalar @samples) {
            $mu  = ( (($n-1)/$n) * $mu ) + $samples[$n-1] / $n;
            $var = ( (($n-1)/$n) * $var ) + ($samples[$n-1]-$mu)**2 / ($n-1);
        }
    }
    return ($mu, $var);
}
1681              
# Validate constructor keyword arguments.  Returns 1 when every supplied
# keyword is legal, 0 as soon as one is not, and undef when no keywords are
# supplied (preserved from the original flag-based loop; the caller's
# '== 0' test then treats undef as a failure).  The original rescanned the
# legal list per parameter with a fragile reset-inside-loop flag; a hash
# lookup is O(1) per parameter and easier to audit.
sub check_for_illegal_params {
    my @params = @_;
    my %legal = map { $_ => 1 } qw/ datafile
                                    mask
                                    K
                                    Kmin
                                    Kmax
                                    terminal_output
                                    write_clusters_to_files
                                    do_variance_normalization
                                    cluster_seeding
                                    use_mahalanobis_metric
                                    debug
                                  /;
    my $found_match_flag;
    foreach my $param (@params) {
        $found_match_flag = $legal{$param} ? 1 : 0;
        last unless $found_match_flag;
    }
    return $found_match_flag;
}
1709              
# Map each strictly-positive array value to its index (zero and negative
# values are skipped; a duplicated value keeps its last index).
sub get_value_index_hash {
    my $aref = shift;
    my %index_of;
    for my $pos (0 .. $#{$aref}) {
        my $val = $aref->[$pos];
        $index_of{$val} = $pos if $val > 0;
    }
    return \%index_of;
}
1718              
# 1-D non-maximum suppression over a histogram-like array ref: the input is
# split into runs of equal consecutive values, and only the center element
# of each run that is strictly higher than its neighboring run(s) keeps its
# value in the returned array; every other slot is zero.  (The original
# declared @final_output and %hash but never used them; both are removed.)
#
# NOTE(review): inputs with fewer than two runs make the boundary checks
# index past @array_of_runs -- callers appear to pass multi-run histograms
# only; confirm before reusing elsewhere.
sub non_maximum_suppression {
    my $arr = shift;
    my @output = (0) x @$arr;
    # Partition the input into runs of identical consecutive values:
    my @array_of_runs = ([$arr->[0]]);
    foreach my $index (1..@$arr-1) {
        if ($arr->[$index] == $arr->[$index-1]) {
            push @{$array_of_runs[-1]}, $arr->[$index];
        } else {
            push @array_of_runs, [$arr->[$index]];
        }
    }
    # Interior runs must beat both neighbors; the fractional center index is
    # truncated by Perl's array subscripting.
    my $runstart_index = 0;
    foreach my $run_index (1..@array_of_runs-2) {
        $runstart_index += @{$array_of_runs[$run_index-1]};
        if ($array_of_runs[$run_index]->[0] >
            $array_of_runs[$run_index-1]->[0] &&
            $array_of_runs[$run_index]->[0] >
            $array_of_runs[$run_index+1]->[0]) {
            my $run_center = @{$array_of_runs[$run_index]} / 2;
            my $assignment_index = $runstart_index + $run_center;
            $output[$assignment_index] = $arr->[$assignment_index];
        }
    }
    # Boundary runs only need to beat their single neighbor:
    if ($array_of_runs[-1]->[0] > $array_of_runs[-2]->[0]) {
        $runstart_index += @{$array_of_runs[-2]};
        my $run_center = @{$array_of_runs[-1]} / 2;
        my $assignment_index = $runstart_index + $run_center;
        $output[$assignment_index] = $arr->[$assignment_index];
    }
    if ($array_of_runs[0]->[0] > $array_of_runs[1]->[0]) {
        my $run_center = @{$array_of_runs[0]} / 2;
        $output[$run_center] = $arr->[$run_center];
    }
    return \@output;
}
1756              
# Print the dimensions and rows of a Math::GSL::Matrix to the terminal.
sub display_matrix {
    my $m = shift;
    my ($nrows, $ncols) = ($m->rows(), $m->cols());
    print "\n\nDisplaying matrix of size $nrows rows and $ncols columns:\n";
    for my $r (0 .. $nrows - 1) {
        my @vals = $m->row($r)->as_list;
        print "@vals\n";
    }
    print "\n\n";
}
1769              
# Return a new Math::GSL::Matrix that is the transpose of the argument:
# row i of the input becomes column i of the result.
sub transpose {
    my $m = shift;
    my ($nrows, $ncols) = ($m->rows(), $m->cols());
    my $t = Math::GSL::Matrix->new($ncols, $nrows);
    for my $r (0 .. $nrows - 1) {
        $t->set_col($r, [ $m->row($r)->as_list ]);
    }
    return $t;
}
1781              
# Element-wise difference of two equal-length vector refs; returns a LIST
# (not a ref), matching the original's calling convention.
sub vector_subtract {
    my ($u, $v) = @_;
    die "wrong data types for vector subtract calculation\n" if @$u != @$v;
    return map { $u->[$_] - $v->[$_] } 0 .. $#{$u};
}
1792              
# Dot product of two equal-length vector refs.
sub vector_multiply {
    my ($u, $v) = @_;
    die "vec_multiply called with two vectors of different sizes"
        unless @$u == @$v;
    my $dot = 0;
    $dot += $u->[$_] * $v->[$_] for 0 .. $#{$u};
    return $dot;
}
1804              
# Multiply two Math::GSL::Matrix objects.  When the first argument is a
# single-row matrix (as happens when this routine recurses on a row and a
# column), the scalar dot product is returned instead of a matrix.
#
# NOTE(review): the dimension check demands $nrows1 == $ncols2 in addition
# to $ncols1 == $nrows2, i.e. it only accepts operand pairs whose product
# is square ($nrows1 x $nrows1).  General matrix multiplication needs only
# $ncols1 == $nrows2 -- confirm against callers before relaxing.
sub matrix_multiply {
    my $matrix1 = shift;
    my $matrix2 = shift;
    my ($nrows1, $ncols1) = ($matrix1->rows(), $matrix1->cols());
    my ($nrows2, $ncols2) = ($matrix2->rows(), $matrix2->cols());
    die "matrix multiplication called with non-matching matrix arguments"
        unless $nrows1 == $ncols2 && $ncols1 == $nrows2;
    # Base case: a 1-row matrix times a column reduces to a dot product,
    # returned as a plain scalar rather than a 1x1 matrix.
    if ($nrows1 == 1) {
        my @row = $matrix1->row(0)->as_list;
        my @col = $matrix2->col(0)->as_list;
        my $result;
        foreach my $j (0..$ncols1-1) {
            $result += $row[$j] * $col[$j];
        }
        return $result;
    }
    my $product = Math::GSL::Matrix->new($nrows1, $nrows1);
    foreach my $i (0..$nrows1-1) {
        my $row = $matrix1->row($i);
        my @product_row;
        foreach my $j (0..$ncols2-1) {
            my $col = $matrix2->col($j);
            # The recursive call hits the 1-row base case above and yields
            # the scalar entry (i, j) of the product.
            my $row_times_col = matrix_multiply($row, $col);
            push @product_row, $row_times_col;
        }
        $product->set_row($i, \@product_row);
    }
    return $product;
}
1834              
1835             1;
1836              
1837             =pod
1838              
1839             =head1 NAME
1840              
1841             Algorithm::KMeans - for clustering multidimensional data
1842              
1843             =head1 SYNOPSIS
1844              
1845             # You now have four different choices for clustering your data with this module:
1846             #
1847             # 1) With Euclidean distances and with random cluster seeding
1848             #
1849             # 2) With Mahalanobis distances and with random cluster seeding
1850             #
1851             # 3) With Euclidean distances and with smart cluster seeding
1852             #
1853             # 4) With Mahalanobis distances and with smart cluster seeding
1854             #
1855             # Despite the qualifier 'smart' in 'smart cluster seeding', it may not always
1856             # produce results that are superior to those obtained with random seeding. (If you
1857             # also factor in the choice regarding variance normalization, you actually have
1858             # eight different choices for data clustering with this module.)
1859             #
1860             # In all cases, you'd obviously begin with
1861              
1862             use Algorithm::KMeans;
1863              
1864             # You'd then name the data file:
1865              
1866             my $datafile = "mydatafile.csv";
1867              
1868             # Next, set the mask to indicate which columns of the datafile to use for
1869             # clustering and which column contains a symbolic ID for each data record. For
1870             # example, if the symbolic name is in the first column, you want the second column
1871             # to be ignored, and you want the next three columns to be used for 3D clustering,
1872             # you'd set the mask to:
1873              
1874             my $mask = "N0111";
1875              
1876             # Now construct an instance of the clusterer. The parameter K controls the number
1877             # of clusters. If you know how many clusters you want (let's say 3), call
1878              
1879             my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
1880             mask => $mask,
1881             K => 3,
1882             cluster_seeding => 'random',
1883             terminal_output => 1,
1884             write_clusters_to_files => 1,
1885             );
1886              
1887             # By default, this constructor call will set you up for clustering based on
1888             # Euclidean distances. If you want the module to use Mahalanobis distances, your
1889             # constructor call will look like:
1890              
1891             my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
1892             mask => $mask,
1893             K => 3,
1894             cluster_seeding => 'random',
1895             use_mahalanobis_metric => 1,
1896             terminal_output => 1,
1897             write_clusters_to_files => 1,
1898             );
1899              
1900             # For both constructor calls shown above, you can use smart seeding of the clusters
1901             # by changing 'random' to 'smart' for the cluster_seeding option. See the
1902             # explanation of smart seeding in the Methods section of this documentation.
1903              
1904             # If your data is such that its variability along the different dimensions of the
1905             # data space is significantly different, you may get better clustering if you first
1906             # normalize your data by setting the constructor parameter
1907             # do_variance_normalization as shown below:
1908              
1909             my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
1910             mask => $mask,
1911             K => 3,
1912             cluster_seeding => 'smart', # or 'random'
1913             terminal_output => 1,
1914             do_variance_normalization => 1,
1915             write_clusters_to_files => 1,
1916             );
1917              
1918             # But bear in mind that such data normalization may actually decrease the
1919             # performance of the clusterer if the variability in the data is more a result of
1920             # the separation between the means than a consequence of intra-cluster variance.
1921              
1922             # Set K to 0 if you want the module to figure out the optimum number of clusters
1923             # from the data. (It is best to run this option with the terminal_output set to 1
1924             # so that you can see the different value of QoC for the different K):
1925              
1926             my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
1927             mask => $mask,
1928             K => 0,
1929             cluster_seeding => 'random', # or 'smart'
1930             terminal_output => 1,
1931             write_clusters_to_files => 1,
1932             );
1933              
1934             # Although not shown above, you can obviously set the 'do_variance_normalization'
1935             # flag here also if you wish.
1936              
1937             # For very large data files, setting K to 0 will result in searching through too
1938             # many values for K. For such cases, you can range limit the values of K to search
1939             # through by
1940              
1941             my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
1942             mask => "N111",
1943             Kmin => 3,
1944             Kmax => 10,
1945             cluster_seeding => 'random', # or 'smart'
1946             terminal_output => 1,
1947             write_clusters_to_files => 1,
1948             );
1949              
1950             # FOR ALL CASES ABOVE, YOU'D NEED TO MAKE THE FOLLOWING CALLS ON THE CLUSTERER
1951             # INSTANCE TO ACTUALLY CLUSTER THE DATA:
1952              
1953             $clusterer->read_data_from_file();
1954             $clusterer->kmeans();
1955              
1956             # If you want to directly access the clusters and the cluster centers in your own
1957             # top-level script, replace the above two statements with:
1958              
1959             $clusterer->read_data_from_file();
1960             my ($clusters_hash, $cluster_centers_hash) = $clusterer->kmeans();
1961              
1962             # You can subsequently access the clusters directly in your own code, as in:
1963              
1964             foreach my $cluster_id (sort keys %{$clusters_hash}) {
1965             print "\n$cluster_id => @{$clusters_hash->{$cluster_id}}\n";
1966             }
1967             foreach my $cluster_id (sort keys %{$cluster_centers_hash}) {
1968             print "\n$cluster_id => @{$cluster_centers_hash->{$cluster_id}}\n";
1969             }
1970              
1971              
1972             # CLUSTER VISUALIZATION:
1973              
1974             # You must first set the mask for cluster visualization. This mask tells the module
1975             # which 2D or 3D subspace of the original data space you wish to visualize the
1976             # clusters in:
1977              
1978             my $visualization_mask = "111";
1979             $clusterer->visualize_clusters($visualization_mask);
1980              
1981              
1982             # SYNTHETIC DATA GENERATION:
1983              
1984             # The module has been provided with a class method for generating multivariate data
1985             # for experimenting with clustering. The data generation is controlled by the
1986             # contents of the parameter file that is supplied as an argument to the data
1987             # generator method. The mean and covariance matrix entries in the parameter file
1988             # must be according to the syntax shown in the param.txt file in the examples
1989             # directory. It is best to edit this file as needed:
1990              
1991             my $parameter_file = "param.txt";
1992             my $out_datafile = "mydatafile.dat";
1993             Algorithm::KMeans->cluster_data_generator(
1994             input_parameter_file => $parameter_file,
1995             output_datafile => $out_datafile,
1996             number_data_points_per_cluster => $N );
1997              
1998             =head1 CHANGES
1999              
2000             Version 2.04 allows you to use CSV data files for clustering.
2001              
2002             Version 2.03 incorporates minor code cleanup. The main implementation of the module
2003             remains unchanged.
2004              
2005             Version 2.02 downshifts the version of Perl that is required for this module. The
2006             module should work with versions 5.10 and higher of Perl. The implementation code
2007             for the module remains unchanged.
2008              
2009             Version 2.01 removes many errors in the documentation. The changes made to the module
2010             in Version 2.0 were not reflected properly in the documentation page for that
2011             version. The implementation code remains unchanged.
2012              
2013             Version 2.0 includes significant additional functionality: (1) You now have the
2014             option to cluster using the Mahalanobis distance metric (the default is the Euclidean
2015             metric); and (2) With the two C<which_cluster_for_new_data_element()> methods that have been added to the
2016             module, you can now determine the best cluster for a new data sample after you have
2017             created the clusters with the previously available data. Finding the best cluster
2018             for a new data sample can be done using either the Euclidean metric or the
2019             Mahalanobis metric.
2020              
2021             Version 1.40 includes a C<smart> option for seeding the clusters. This option,
2022             supplied through the constructor parameter C<cluster_seeding>, means that the
2023             clusterer will (1) Subject the data to principal components analysis in order to
2024             determine the maximum variance direction; (2) Project the data onto this direction;
2025             (3) Find peaks in a smoothed histogram of the projected points; and (4) Use the
2026             locations of the highest peaks as initial guesses for the cluster centers. If you
2027             don't want to use this option, set C<cluster_seeding> to C<random>. That should work
2028             as in the previous version of the module.
2029              
2030             Version 1.30 includes a bug fix for the case when the datafile contains empty lines,
2031             that is, lines with no data records. Another bug fix in Version 1.30 deals with the
2032             case when you want the module to figure out how many clusters to form (this is the
2033             C<K=0> option in the constructor call) and the number of data records is close to the
2034             minimum.
2035              
2036             Version 1.21 includes fixes to handle the possibility that, when clustering the data
2037             for a fixed number of clusters, a cluster may become empty during iterative
2038             calculation of cluster assignments of the data elements and the updating of the
2039             cluster centers. The code changes are in the C<assign_data_to_clusters()> and
2040             C<update_cluster_centers()> subroutines.
2041              
2042             Version 1.20 includes an option to normalize the data with respect to its variability
2043             along the different coordinates before clustering is carried out.
2044              
2045             Version 1.1.1 allows for range limiting the values of C<K> to search through. C<K>
2046             stands for the number of clusters to form. This version also declares the module
2047             dependencies in the C<Makefile.PL> file.
2048              
2049             Version 1.1 is an object-oriented version of the implementation presented in
2050             version 1.0. The current version should lend itself more easily to code extension.
2051             You could, for example, create your own class by subclassing from the class presented
2052             here and, in your subclass, use your own criteria for the similarity distance between
2053             the data points and for the QoC (Quality of Clustering) metric, and, possibly a
2054             different rule to stop the iterations. Version 1.1 also allows you to directly
2055             access the clusters formed and the cluster centers in your calling script.
2056              
2057              
2058             =head1 SPECIAL USAGE NOTE
2059              
2060             If you were directly accessing in your own scripts the clusters produced by the older
2061             versions of this module, you'd need to make changes to your code if you wish to use
2062             Version 2.0 or higher. Instead of returning arrays of clusters and cluster centers,
2063             Versions 2.0 and higher return hashes. This change was made necessary by the logic
2064             required for implementing the two new C<which_cluster_for_new_data_element> methods that were introduced
2065             in Version 2.0. These methods return the best cluster for a new data sample from the
2066             clusters you created using the existing data. One of the C<which_cluster_for_new_data_element> methods is
2067             based on the Euclidean metric for finding the cluster that is closest to the new data
2068             sample, and the other on the Mahalanobis metric. Another point of incompatibility
2069             with the previous versions is that you must now explicitly set the C<cluster_seeding>
2070             parameter in the call to the constructor to either C<random> or C<smart>. This
2071             parameter does not have a default associated with it starting with Version 2.0.
2072              
2073              
2074             =head1 DESCRIPTION
2075              
2076             Clustering with K-Means takes place iteratively and involves two steps: 1) assignment
2077             of data samples to clusters on the basis of how far the data samples are from the
2078             cluster centers; and 2) Recalculation of the cluster centers (and cluster covariances
2079             if you are using the Mahalanobis distance metric for clustering).
2080              
2081             Obviously, before the two-step approach can proceed, we need to initialize the
2082             cluster centers. How this initialization is carried out is important. The module
2083             gives you two very different ways for carrying out this initialization. One option,
2084             called the C<smart> option, consists of subjecting the data to principal components
2085             analysis to discover the direction of maximum variance in the data space. The data
2086             points are then projected on to this direction and a histogram constructed from the
2087             projections. Centers of the smoothed histogram are used to seed the clustering
2088             operation. The other option is to choose the cluster centers purely randomly. You
2089             get the first option if you set C<cluster_seeding> to C<smart> in the constructor,
2090             and you get the second option if you set it to C<random>.
2091              
2092             How to specify the number of clusters, C<K>, is one of the most vexing issues in any
2093             approach to clustering. In some cases, we can set C<K> on the basis of prior
2094             knowledge. But, more often than not, no such prior knowledge is available. When the
2095             programmer does not explicitly specify a value for C<K>, the approach taken in the
2096             current implementation is to try all possible values between 2 and some largest
2097             possible value that makes statistical sense. We then choose that value for C<K>
2098             which yields the best value for the QoC (Quality of Clustering) metric. It is
2099             generally believed that the largest value for C<K> should not exceed C<sqrt(N/2)>
2100             where C<N> is the number of data samples to be clustered.
2101              
2102             What to use for the QoC metric is obviously a critical issue unto itself. In the
2103             current implementation, the value of QoC is the ratio of the average radius of the
2104             clusters and the average distance between the cluster centers.
2105              
2106             Every iterative algorithm requires a stopping criterion. The criterion implemented
2107             here is that we stop iterations when there is no re-assignment of the data points
2108             during the assignment step.
2109              
2110             Ordinarily, the output produced by a K-Means clusterer will correspond to a local
2111             minimum for the QoC values, as opposed to a global minimum. The current
2112             implementation protects against that when the module constructor is called with the
2113             C<random> option for C<cluster_seeding> by trying different randomly selected initial
2114             cluster centers and then selecting the one that gives the best overall QoC value.
2115              
2116             A K-Means clusterer will generally produce good results if the overlap between the
2117             clusters is minimal and if each cluster exhibits variability that is uniform in all
2118             directions. When the data variability is different along the different directions in
2119             the data space, the results you obtain with a K-Means clusterer may be improved by
2120             first normalizing the data appropriately, as can be done in this module when you set
2121             the C<do_variance_normalization> option in the constructor. However, as pointed out
2122             elsewhere in this documentation, such normalization may actually decrease the
2123             performance of the clusterer if the overall data variability along any dimension is
2124             more a result of separation between the means than a consequence of intra-cluster
2125             variability.
2126              
2127              
2128             =head1 METHODS
2129              
2130             The module provides the following methods for clustering, for cluster visualization,
2131             for data visualization, for the generation of data for testing a clustering
2132             algorithm, and for determining the cluster membership of a new data sample:
2133              
2134             =over 4
2135              
2136             =item B<new()>
2137              
2138             my $clusterer = Algorithm::KMeans->new(datafile => $datafile,
2139             mask => $mask,
2140             K => $K,
2141             cluster_seeding => 'random', # also try 'smart'
2142             use_mahalanobis_metric => 1, # also try '0'
2143             terminal_output => 1,
2144             write_clusters_to_files => 1,
2145             );
2146              
2147             A call to C<new()> constructs a new instance of the C<Algorithm::KMeans> class. When
2148             C<$K> is a non-zero positive integer, the module will construct exactly that many
2149             clusters. However, when C<$K> is 0, the module will find the best number of clusters
2150             to partition the data into. As explained in the Description, setting
2151             C<cluster_seeding> to C<smart> causes PCA (principal components analysis) to be used
2152             for discovering the best choices for the initial cluster centers. If you want purely
2153             random decisions to be made for the initial choices for the cluster centers, set
2154             C<cluster_seeding> to C<random>.
2155              
2156             The data file is expected to contain entries in the following format
2157              
2158             c20 0 10.7087017086940 9.63528386251712 10.9512155258108 ...
2159             c7 0 12.8025925026787 10.6126270065785 10.5228482095349 ...
2160             b9 0 7.60118206283120 5.05889245193079 5.82841781759102 ...
2161             ....
2162             ....
2163              
2164             where the first column contains the symbolic ID tag for each data record and the rest
2165             of the columns the numerical information. As to which columns are actually used for
2166             clustering is decided by the string value of the mask. For example, if we wanted to
2167             cluster on the basis of the entries in just the 3rd, the 4th, and the 5th columns
2168             above, the mask value would be C<N0111> where the character C<N> indicates that the
2169             ID tag is in the first column, the character C<0> that the second column is to be
2170             ignored, and the C<1>'s that follow that the 3rd, the 4th, and the 5th columns are to
2171             be used for clustering.
2172              
2173             If you wish for the clusterer to search through a C<(Kmin,Kmax)> range of values for
2174             C<K>, the constructor should be called in the following fashion:
2175              
2176             my $clusterer = Algorithm::KMeans->new(datafile => $datafile,
2177             mask => $mask,
2178             Kmin => 3,
2179             Kmax => 10,
2180             cluster_seeding => 'smart', # try 'random' also
2181             terminal_output => 1,
2182             );
2183              
2184             where obviously you can choose any reasonable values for C<Kmin> and C<Kmax>. If you
2185             choose a value for C<Kmax> that is statistically too large, the module will let you
2186             know. Again, you may choose either C<random> or C<smart> for the
2187             C<cluster_seeding> option.
2188              
2189             If you believe that the variability of the data is very different along the different
2190             dimensions of the data space, you may get better clustering by first normalizing the
2191             data coordinates by the standard-deviations along those directions. When you set the
2192             constructor option C<do_variance_normalization> as shown below, the module uses the
2193             overall data standard-deviation along a direction for the normalization in that
2194             direction. (As mentioned elsewhere in the documentation, such a normalization could
2195             backfire on you if the data variability along a dimension is more a result of the
2196             separation between the means than a consequence of the intra-cluster variability.):
2197              
2198             my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
2199             mask => "N111",
2200             K => 2,
2201             cluster_seeding => 'smart', # try 'random' also
2202             terminal_output => 1,
2203             do_variance_normalization => 1,
2204             );
2205              
2206             =back
2207              
2208             =head2 Constructor Parameters
2209              
2210             =over 8
2211              
2212             =item C<datafile>:
2213              
2214             This parameter names the data file that contains the multidimensional data records
2215             you want the module to cluster.
2216              
2217             =item C<mask>:
2218              
2219             This parameter supplies the mask to be applied to the columns of your data file. See
2220             the explanation in Synopsis for what this mask looks like.
2221              
2222             =item C<K>:
2223              
2224             This parameter supplies the number of clusters you are looking for. If you set this
2225             option to 0, that means that you want the module to search for the best value for
2226             C<K>. (Keep in mind the fact that searching for the best C<K> may take a long time
2227             for large data files.)
2228              
2229             =item C<Kmin>:
2230              
2231             If you supply an integer value for C<Kmin>, the search for the best C<K> will begin
2232             with that value.
2233              
2234             =item C<Kmax>:
2235              
2236             If you supply an integer value for C<Kmax>, the search for the best C<K> will end at
2237             that value.
2238              
2239             =item C<cluster_seeding>:
2240              
2241             This parameter must be set to either C<random> or C<smart>. Depending on your data,
2242             you may get superior clustering with the C<smart> option. The choice C<smart> means
2243             that the clusterer will (1) subject the data to principal components analysis to
2244             determine the maximum variance direction; (2) project the data onto this direction;
2245             (3) find peaks in a smoothed histogram of the projected points; and (4) use the
2246             locations of the highest peaks as seeds for cluster centers. If the C<smart> option
2247             produces bizarre results, try C<random>.
2248              
2249             =item C<use_mahalanobis_metric>:
2250              
2251             When set to 1, this option causes Mahalanobis distances to be used for clustering.
2252             The default is 0 for this parameter. By default, the module uses the Euclidean
2253             distances for clustering. In general, Mahalanobis distance based clustering will
2254             fail if your data resides on a lower-dimensional hyperplane in the data space, if you
2255             seek too many clusters, and if you do not have a sufficient number of samples in your
2256             data file. A necessary requirement for the module to be able to compute Mahalanobis
2257             distances is that the cluster covariance matrices be non-singular. (Let's say your
2258             data dimensionality is C<D> and the module is considering a cluster that has only
2259             C<M> samples in it where C<M> is less than C<D>. In this case, the covariance matrix
2260             will be singular since its rank will not exceed C<M>. For the covariance matrix to
2261             be non-singular, it must be of full rank, that is, its rank must be C<D>.)
2262              
2263             =item C<do_variance_normalization>:
2264              
2265             When set, the module will first normalize the data variance along the different
2266             dimensions of the data space before attempting clustering. Depending on your data,
2267             this option may or may not result in better clustering.
2268              
2269             =item C<terminal_output>:
2270              
2271             This boolean parameter, when not supplied in the call to C<new()>, defaults to 0.
2272             When set, you will see in your terminal window the different clusters as lists of the
2273             symbolic IDs and their cluster centers. You will also see the QoC (Quality of
2274             Clustering) values for the clusters displayed.
2275              
2276             =item C<write_clusters_to_files>:
2277              
2278             This parameter is also boolean. When set to 1, the clusters are written out to files
2279             that are named in the following manner:
2280              
2281             cluster0.txt
2282             cluster1.txt
2283             cluster2.txt
2284             ...
2285             ...
2286              
2287             Before the clusters are written to these files, the module destroys all files with
2288             such names in the directory in which you call the module.
2289              
2290              
2291             =back
2292              
2293             =over
2294              
2295             =item B<read_data_from_file()>
2296              
2297             $clusterer->read_data_from_file()
2298              
2299             =item B<kmeans()>
2300              
2301             $clusterer->kmeans();
2302              
2303             or
2304              
2305             my ($clusters_hash, $cluster_centers_hash) = $clusterer->kmeans();
2306              
2307             The first call above works solely by side-effect. The second call also returns the
2308             clusters and the cluster centers. See the C<cluster_and_visualize.pl> script in the
2309             C<examples> directory for how you can in your own code extract the clusters and the
2310             cluster centers from the variables C<$clusters_hash> and C<$cluster_centers_hash>.
2311              
2312             =item B<get_K_best()>
2313              
2314             $clusterer->get_K_best();
2315              
2316             This call makes sense only if you supply either the C<K=0> option to the constructor,
2317             or if you specify values for the C<Kmin> and C<Kmax> options. The C<K=0> and the
2318             C<(Kmin,Kmax)> options cause the module to determine the best value for C<K>.
2319             Remember, C<K> is the number of clusters the data is partitioned into.
2320              
2321             =item B<show_QoC_values()>
2322              
2323             $clusterer->show_QoC_values();
2324              
2325             presents a table with C<K> values in the left column and the corresponding QoC
2326             (Quality-of-Clustering) values in the right column. Note that this call makes sense
2327             only if you either supply the C<K=0> option to the constructor, or if you specify
2328             values for the C<Kmin> and C<Kmax> options.
2329              
2330             =item B<visualize_clusters()>
2331              
2332             $clusterer->visualize_clusters( $visualization_mask )
2333              
2334             The visualization mask here does not have to be identical to the one used for
2335             clustering, but must be a subset of that mask. This is convenient for visualizing
2336             the clusters in two- or three-dimensional subspaces of the original space.
2337              
2338             =item B<visualize_data()>
2339              
2340             $clusterer->visualize_data($visualization_mask, 'original');
2341              
2342             $clusterer->visualize_data($visualization_mask, 'normed');
2343              
2344             This method requires a second argument and, as shown, it must be either the string
2345             C<original> or the string C<normed>, the former for the visualization of the raw data
2346             and the latter for the visualization of the data after its different dimensions are
2347             normalized by the standard-deviations along those directions. If you call the method
2348             with the second argument set to C<normed>, but do so without turning on the
2349             C<do_variance_normalization> option in the KMeans constructor, it will let you know.
2350              
2351              
2352             =item B<which_cluster_for_new_data_element()>
2353              
2354             If you wish to determine the cluster membership of a new data sample after you have
2355             created the clusters with the existing data samples, you would need to call this
2356             method. The C<which_cluster_for_new_data.pl> script in the C<examples> directory
2357             shows how to use this method.
2358              
2359              
2360             =item B<which_cluster_for_new_data_element_mahalanobis()>
2361              
2362             This does the same thing as the previous method, except that it determines the
2363             cluster membership using the Mahalanobis distance metric. As for the previous
2364             method, see the C<which_cluster_for_new_data.pl> script in the C<examples> directory
2365             for how to use this method.
2366              
2367              
2368             =item B<cluster_data_generator()>
2369              
2370             Algorithm::KMeans->cluster_data_generator(
2371             input_parameter_file => $parameter_file,
2372             output_datafile => $out_datafile,
2373             number_data_points_per_cluster => 20 );
2374              
2375             for generating multivariate data for clustering if you wish to play with synthetic
2376             data for clustering. The input parameter file contains the means and the variances
2377             for the different Gaussians you wish to use for the synthetic data. See the file
2378             C<param.txt> provided in the examples directory. It will be easiest for you to just
2379             edit this file for your data generation needs. In addition to the format of the
2380             parameter file, the main constraint you need to observe in specifying the parameters
2381             is that the dimensionality of the covariance matrix must correspond to the
2382             dimensionality of the mean vectors. The multivariate random numbers are generated by
2383             calling the C<Math::Random> module. As you would expect, this module requires that
2384             the covariance matrices you specify in your parameter file be symmetric and positive
2385             definite. Should the covariances in your parameter file not obey this condition, the
2386             C<Math::Random> module will let you know.
2387              
2388             =back
2389              
2390             =head1 HOW THE CLUSTERS ARE OUTPUT
2391              
2392             When the option C<terminal_output> is set in the call to the constructor, the
2393             clusters are displayed on the terminal screen.
2394              
2395             When the option C<write_clusters_to_files> is set in the call to the constructor, the
2396             module dumps the clusters in files named
2397              
2398             cluster0.txt
2399             cluster1.txt
2400             cluster2.txt
2401             ...
2402             ...
2403              
2404             in the directory in which you execute the module. The number of such files will
2405             equal the number of clusters formed. All such existing files in the directory are
2406             destroyed before any fresh ones are created. Each cluster file contains the symbolic
2407             ID tags of the data samples in that cluster.
2408              
2409             The module also leaves in your directory a printable `.png' file that is a point plot
2410             of the different clusters. The name of this file is always C<cluster_plot.png>.
2411              
2412             Also, as mentioned previously, a call to C<kmeans()> in your own code will return the
2413             clusters and the cluster centers.
2414              
2415             =head1 REQUIRED
2416              
2417             This module requires the following three modules:
2418              
2419             Math::Random
2420             Graphics::GnuplotIF
2421             Math::GSL
2422              
2423             With regard to the third item above, what is actually required is the
2424             C<Math::GSL::Matrix> module. However, that module is a part of the C<Math::GSL>
2425             distribution. The acronym GSL stands for the GNU Scientific Library. C<Math::GSL> is
2426             a Perl interface to the GSL C-based library.
2427              
2428              
2429             =head1 THE C<examples> DIRECTORY
2430              
2431             The C<examples> directory contains several scripts to help you become familiar with
2432             this module. The following script is an example of how the module can be expected to
2433             be used most of the time. It calls for clustering to be carried out with a fixed
2434             C<K>:
2435              
2436             cluster_and_visualize.pl
2437              
2438             The more time you spend with this script, the more comfortable you will become with
2439             the use of this module. The script file contains a large comment block that mentions
2440             six locations in the script where you have to make decisions about how to use the
2441             module.
2442              
2443             See the following script if you do not know what value to use for C<K> for clustering
2444             your data:
2445              
2446             find_best_K_and_cluster.pl
2447              
2448             This script uses the C<K=0> option in the constructor that causes the module to
2449             search for the best C<K> for your data. Since this search is virtually unbounded ---
2450             limited only by the number of samples in your data file --- the script may take a
2451             long time to run for a large data file. Hence the next script.
2452              
2453             If your datafile is too large, you may need to range limit the values of C<K> that
2454             are searched through, as in the following script:
2455              
2456             find_best_K_in_range_and_cluster.pl
2457              
2458             If you also want to include data normalization (it may reduce the performance of the
2459             clusterer in some cases), see the following script:
2460              
2461             cluster_after_data_normalization.pl
2462              
2463             When you include the data normalization step and you would like to visualize the data
2464             before and after normalization, see the following script:
2465              
2466             cluster_and_visualize_with_data_visualization.pl*
2467              
2468             After you are done clustering, let's say you want to find the cluster membership of a
2469             new data sample. To see how you can do that, see the script:
2470              
2471             which_cluster_for_new_data.pl
2472              
2473             This script returns two answers for which cluster a new data sample belongs to: one
2474             using the Euclidean metric to calculate the distances between the new data sample and
2475             the cluster centers, and the other using the Mahalanobis metric. If the clusters are
2476             strongly elliptical in shape, you are likely to get better results with the
2477             Mahalanobis metric. (To see that you can get two different answers using the two
2478             different distance metrics, run the C<which_cluster_for_new_data.pl> script on the
2479             data in the file C<mydatafile3.dat>. To make this run, note that you have to comment
2480             out and uncomment the lines at four different locations in the script.)
2481              
2482             The C<examples> directory also contains the following support scripts:
2483              
2484             For generating the data for experiments with clustering:
2485              
2486             data_generator.pl
2487              
2488             For cleaning up the examples directory:
2489              
2490             cleanup_directory.pl
2491              
2492             The examples directory also includes a parameter file, C<param.txt>, for generating
2493             synthetic data for clustering. Just edit this file if you would like to generate
2494             your own multivariate data for clustering. The parameter file is for the 3D case,
2495             but you can generate data with any dimensionality through appropriate entries in the
2496             parameter file.
2497              
2498             =head1 EXPORT
2499              
2500             None by design.
2501              
2502             =head1 CAVEATS
2503              
2504             K-Means based clustering usually does not work well when the clusters are strongly
2505             overlapping and when the extent of variability along the different dimensions is
2506             different for the different clusters. The module does give you the ability to
2507             normalize the variability in your data with the constructor option
2508             C<do_variance_normalization>. However, as described elsewhere, this may actually
2509             reduce the performance of the clusterer if the data variability along a direction is
2510             more a result of the separation between the means than because of intra-cluster
2511             variability. For better clustering with difficult-to-cluster data, you could try
2512             using the author's C<Algorithm::ExpectationMaximization> module.
2513              
2514             =head1 BUGS
2515              
2516             Please notify the author if you encounter any bugs. When sending email, please place
2517             the string 'KMeans' in the subject line.
2518              
2519             =head1 INSTALLATION
2520              
2521             Download the archive from CPAN in any directory of your choice. Unpack the archive
2522             with a command that on a Linux machine would look like:
2523              
2524             tar zxvf Algorithm-KMeans-2.04.tar.gz
2525              
2526             This will create an installation directory for you whose name will be
2527             C<Algorithm-KMeans-2.04>. Enter this directory and execute the following commands
2528             for a standard install of the module if you have root privileges:
2529              
2530             perl Makefile.PL
2531             make
2532             make test
2533             sudo make install
2534              
2535             If you do not have root privileges, you can carry out a non-standard install of the
2536             module in any directory of your choice by:
2537              
2538             perl Makefile.PL prefix=/some/other/directory/
2539             make
2540             make test
2541             make install
2542              
2543             With a non-standard install, you may also have to set your PERL5LIB environment
2544             variable so that this module can find the required other modules. How you do that
2545             would depend on what platform you are working on. In order to install this module in
2546             a Linux machine on which I use tcsh for the shell, I set the PERL5LIB environment
2547             variable by
2548              
2549             setenv PERL5LIB /some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
2550              
2551             If I used bash, I'd need to declare:
2552              
2553             export PERL5LIB=/some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
2554              
2555              
2556             =head1 THANKS
2557              
2558             I thank Slaven for pointing out that I needed to downshift the required version of Perl
2559             for this module. Fortunately, I had access to an old machine still running Perl
2560             5.10.1. The current version, 2.02, is based on my testing the module on that machine.
2561              
2562             I added two C<which_cluster_for_new_data()> methods in Version 2.0 as a result of an email from
2563             Jerome White who expressed a need for such methods in order to determine the best
2564             cluster for a new data record after you have successfully clustered your existing
2565             data. Thanks Jerome for your feedback!
2566              
2567             It was an email from Nadeem Bulsara that prompted me to create Version 1.40 of this
2568             module. Working with Version 1.30, Nadeem noticed that occasionally the module would
2569             produce variable clustering results on the same dataset. I believe that this
2570             variability was caused (at least partly) by the purely random mode that was used in
2571             Version 1.30 for the seeding of the cluster centers. Version 1.40 now includes a
2572             C<smart> mode. With the new mode the clusterer uses a PCA (Principal Components
2573             Analysis) of the data to make good guesses for the cluster centers. However,
2574             depending on how the data is jumbled up, it is possible that the new mode will not
2575             produce uniformly good results in all cases. So you can still use the old mode by
2576             setting C<cluster_seeding> to C<random> in the constructor. Thanks Nadeem for your
2577             feedback!
2578              
2579             Version 1.30 resulted from Martin Kalin reporting problems with a very small data
2580             set. Thanks Martin!
2581              
2582             Version 1.21 came about in response to the problems encountered by Luis Fernando
2583             D'Haro with version 1.20. Although the module would yield the clusters for some of
2584             its runs, more frequently than not the module would abort with an "empty cluster"
2585             message for his data. Luis Fernando has also suggested other improvements (such as
2586             clustering directly from the contents of a hash) that I intend to make in future
2587             versions of this module. Thanks Luis Fernando.
2588              
2589             Chad Aeschliman was kind enough to test out the interface of this module and to give
2590             suggestions for its improvement. His key slogan: "If you cannot figure out how to
2591             use a module in under 10 minutes, it's not going to be used." That should explain
2592             the longish Synopsis included here.
2593              
2594             =head1 AUTHOR
2595              
2596             Avinash Kak, kak@purdue.edu
2597              
2598             If you send email, please place the string "KMeans" in your subject line to get past
2599             my spam filter.
2600              
2601             =head1 COPYRIGHT
2602              
2603             This library is free software; you can redistribute it and/or modify it under the
2604             same terms as Perl itself.
2605              
2606             Copyright 2014 Avinash Kak
2607              
2608             =cut
2609