File Coverage

blib/lib/Algorithm/KMeans.pm
Criterion Covered Total %
statement 19 21 90.4
branch n/a
condition n/a
subroutine 7 7 100.0
pod n/a
total 26 28 92.8


line stmt bran cond sub pod time code
1             package Algorithm::KMeans;
2              
3             #------------------------------------------------------------------------------------
4             # Copyright (c) 2014 Avinash Kak. All rights reserved. This program is free
5             # software. You may modify and/or distribute it under the same terms as Perl itself.
6             # This copyright notice must remain attached to the file.
7             #
8             # Algorithm::KMeans is a Perl module for clustering multidimensional data.
9             # -----------------------------------------------------------------------------------
10              
11             #use 5.10.0;
12 1     1   20289 use strict;
  1         2  
  1         44  
13 1     1   6 use warnings;
  1         1  
  1         43  
14 1     1   8 use Carp;
  1         6  
  1         109  
15 1     1   7 use File::Basename;
  1         2  
  1         107  
16 1     1   754 use Math::Random;
  1         13847  
  1         138  
17 1     1   817 use Graphics::GnuplotIF;
  1         14462  
  1         78  
18 1     1   1334 use Math::GSL::Matrix;
  0            
  0            
19              
20              
21             our $VERSION = '2.05';
22              
23             # from Perl docs:
24             my $_num_regex = '^[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?$';
25              
26             # Constructor:
27             sub new {
28             my ($class, %args) = @_;
29             my @params = keys %args;
30             croak "\nYou have used a wrong name for a keyword argument " .
31             "--- perhaps a misspelling\n"
32             if check_for_illegal_params(@params) == 0;
33             bless {
34             _datafile => $args{datafile} || croak("datafile required"),
35             _mask => $args{mask} || croak("mask required"),
36             _K => $args{K} || 0,
37             _K_min => $args{Kmin} || 'unknown',
38             _K_max => $args{Kmax} || 'unknown',
39             _cluster_seeding => $args{cluster_seeding} || croak("must choose smart or random ".
40             "for cluster seeding"),
41             _var_normalize => $args{do_variance_normalization} || 0,
42             _use_mahalanobis_metric => $args{use_mahalanobis_metric} || 0,
43             _clusters_2_files => $args{write_clusters_to_files} || 0,
44             _terminal_output => $args{terminal_output} || 0,
45             _debug => $args{debug} || 0,
46             _N => 0,
47             _K_best => 'unknown',
48             _original_data => {},
49             _data => {},
50             _data_id_tags => [],
51             _QoC_values => {},
52             _clusters => [],
53             _cluster_centers => [],
54             _clusters_hash => {},
55             _cluster_centers_hash => {},
56             _cluster_covariances_hash => {},
57             _data_dimensions => 0,
58              
59             }, $class;
60             }
61              
62             sub read_data_from_file {
63             my $self = shift;
64             my $filename = $self->{_datafile};
65             $self->read_data_from_file_csv() if $filename =~ /.csv$/;
66             $self->read_data_from_file_dat() if $filename =~ /.dat$/;
67             }
68              
69             sub read_data_from_file_csv {
70             my $self = shift;
71             my $numregex = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
72             my $filename = $self->{_datafile} || die "you did not specify a file with the data to be clustered";
73             my $mask = $self->{_mask};
74             my @mask = split //, $mask;
75             $self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
76             print "data dimensionality: $self->{_data_dimensions} \n"if $self->{_terminal_output};
77             open FILEIN, $filename or die "Unable to open $filename: $!";
78             die("Aborted. get_training_data_csv() is only for CSV files") unless $filename =~ /\.csv$/;
79             local $/ = undef;
21             80             my @all_data = split /\s+/, <FILEIN>;
81             my %data_hash = ();
82             my @data_tags = ();
83             foreach my $record (@all_data) {
84             my @splits = split /,/, $record;
85             die "\nYour mask size (including `N' and 1's and 0's) does not match\n" .
86             "the size of at least one of the data records in the file: $!"
87             unless scalar(@mask) == scalar(@splits);
88             my $record_name = shift @splits;
89             $data_hash{$record_name} = \@splits;
90             push @data_tags, $record_name;
91             }
92             $self->{_original_data} = \%data_hash;
93             $self->{_data_id_tags} = \@data_tags;
94             $self->{_N} = scalar @data_tags;
95             if ($self->{_var_normalize}) {
96             $self->{_data} = variance_normalization( $self->{_original_data} );
97             } else {
98             $self->{_data} = deep_copy_hash( $self->{_original_data} );
99             }
100             # Need to make the following call to set the global mean and covariance:
101             # my $covariance = $self->estimate_mean_and_covariance(\@data_tags);
102             # Need to make the following call to set the global eigenvec eigenval sets:
103             # $self->eigen_analysis_of_covariance($covariance);
104             if ( defined($self->{_K}) && ($self->{_K} > 0) ) {
105             carp "\n\nWARNING: YOUR K VALUE IS TOO LARGE.\n The number of data " .
106             "points must satisfy the relation N > 2xK**2 where K is " .
107             "the number of clusters requested for the clusters to be " .
108             "meaningful $!"
109             if ( $self->{_N} < (2 * $self->{_K} ** 2) );
110             print "\n\n\n";
111             }
112             }
113              
114             sub read_data_from_file_dat {
115             my $self = shift;
116             my $datafile = $self->{_datafile};
117             my $mask = $self->{_mask};
118             my @mask = split //, $mask;
119             $self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
120             print "data dimensionality: $self->{_data_dimensions} \n"if $self->{_terminal_output};
121             open INPUT, $datafile or die "unable to open file $datafile: $!\n";
114             122             chomp( my @raw_data = <INPUT> );
123             close INPUT;
124             # Transform strings into number data
125             foreach my $record (@raw_data) {
126             next unless $record;
127             next if $record =~ /^#/;
128             my @data_fields;
129             my @fields = split /\s+/, $record;
130             die "\nABORTED: Mask size does not correspond to row record size\n" if $#fields != $#mask;
131             my $record_id;
132             foreach my $i (0..@fields-1) {
133             if ($mask[$i] eq '0') {
134             next;
135             } elsif ($mask[$i] eq 'N') {
136             $record_id = $fields[$i];
137             } elsif ($mask[$i] eq '1') {
138             push @data_fields, $fields[$i];
139             } else {
140             die "misformed mask for reading the data file\n";
141             }
142             }
143             my @nums = map {/$_num_regex/;$_} @data_fields;
144             $self->{_original_data}->{ $record_id } = \@nums;
145             }
146             if ($self->{_var_normalize}) {
147             $self->{_data} = variance_normalization( $self->{_original_data} );
148             } else {
149             $self->{_data} = deep_copy_hash( $self->{_original_data} );
150             }
151             my @all_data_ids = keys %{$self->{_data}};
152             $self->{_data_id_tags} = \@all_data_ids;
153             $self->{_N} = scalar @all_data_ids;
154             if ( defined($self->{_K}) && ($self->{_K} > 0) ) {
155             carp "\n\nWARNING: YOUR K VALUE IS TOO LARGE.\n The number of data " .
156             "points must satisfy the relation N > 2xK**2 where K is " .
157             "the number of clusters requested for the clusters to be " .
158             "meaningful $!"
159             if ( $self->{_N} < (2 * $self->{_K} ** 2) );
160             print "\n\n\n";
161             }
162             }
163              
164             sub kmeans {
165             my $self = shift;
166             my $K = $self->{_K};
167             if ( ($K == 0) ||
168             ( ($self->{_K_min} ne 'unknown') &&
169             ($self->{_K_max} ne 'unknown') ) ) {
170             eval {
171             $self->iterate_through_K();
172             };
173             die "in kmeans(): $@" if ($@);
174             } elsif ( $K =~ /\d+/) {
175             eval {
176             $self->cluster_for_fixed_K_multiple_random_tries($K)
177             if $self->{_cluster_seeding} eq 'random';
178             $self->cluster_for_fixed_K_single_smart_try($K)
179             if $self->{_cluster_seeding} eq 'smart';
180             };
181             die "in kmeans(): $@" if ($@);
182             } else {
183             croak "Incorrect call syntax used. See documentation.\n";
184             }
185             if ((defined $self->{_clusters}) && (defined $self->{_cluster_centers})){
186             foreach my $i (0..@{$self->{_clusters}}-1) {
187             $self->{_clusters_hash}->{"cluster$i"} = $self->{_clusters}->[$i];
188             $self->{_cluster_centers_hash}->{"cluster$i"} = $self->{_cluster_centers}->[$i];
189             $self->{_cluster_covariances_hash}->{"cluster$i"} =
190             $self->estimate_cluster_covariance($self->{_clusters}->[$i]);
191             }
192             $self->write_clusters_to_files( $self->{_clusters} ) if $self->{_clusters_2_files};
193             return ($self->{_clusters_hash}, $self->{_cluster_centers_hash});
194             } else {
195             croak "Clustering failed. Try again.";
196             }
197             }
198              
199             # This is the subroutine to call if you prefer purely random choices for the initial
200             # seeding of the cluster centers.
201             sub cluster_for_fixed_K_multiple_random_tries {
202             my $self = shift;
203             my $K = shift;
204             my @all_data_ids = @{$self->{_data_id_tags}};
205             my $QoC;
206             my @QoC_values;
207             my @array_of_clusters;
208             my @array_of_cluster_centers;
209             my $clusters;
210             my $new_clusters;
211             my $cluster_centers;
212             my $new_cluster_centers;
213             print "Clustering for K = $K\n" if $self->{_terminal_output};
214             foreach my $trial (1..20) {
215             print ". ";
216             my ($new_clusters, $new_cluster_centers);
217             eval {
218             ($new_clusters, $new_cluster_centers) = $self->cluster_for_given_K($K);
219             };
220             next if $@;
221             next if @$new_clusters <= 1;
222             my $newQoC = $self->cluster_quality( $new_clusters, $new_cluster_centers );
223             if ( (!defined $QoC) || ($newQoC < $QoC) ) {
224             $QoC = $newQoC;
225             $clusters = deep_copy_AoA( $new_clusters );
226             $cluster_centers = deep_copy_AoA( $new_cluster_centers );
227             }
228             }
229             die "\n\nThe constructor options you have chosen do not work with the data. Try\n" .
230             "turning off the Mahalanobis option if you are using it.\n"
231             unless defined $clusters;
232             $self->{_clusters} = $clusters;
233             $self->{_cluster_centers} = $cluster_centers;
234             $self->{_QoC_values}->{"$K"} = $QoC;
235             if ($self->{_terminal_output}) {
236             print "\nDisplaying final clusters for best K (= $K) :\n";
237             display_clusters( $clusters );
238             $self->display_cluster_centers( $clusters );
239             print "\nQoC value (the smaller, the better): $QoC\n";
240             }
241             }
242              
243             # For the smart try, we do initial cluster seeding by subjecting the data to
244             # principal components analysis in order to discover the direction of maximum
245             # variance in the data space. Subsequently, we try to find the K largest peaks along
246             # this direction. The coordinates of these peaks serve as the seeds for the K
247             # clusters.
248             sub cluster_for_fixed_K_single_smart_try {
249             my $self = shift;
250             my $K = shift;
251             my @all_data_ids = @{$self->{_data_id_tags}};
252             print "Clustering for K = $K\n" if $self->{_terminal_output};
253             my ($clusters, $cluster_centers);
254             eval {
255             ($clusters, $cluster_centers) = $self->cluster_for_given_K($K);
256             };
257             if ($@) {
258             die "In cluster_for_fixed_K_single_smart_try: insufficient data for clustering " .
259             "with $self->{_K} clusters --- $@";
260             }
261             my $QoC = $self->cluster_quality( $clusters, $cluster_centers );
262             $self->{_clusters} = $clusters;
263             $self->{_cluster_centers} = $cluster_centers;
264             $self->{_QoC_values}->{"$K"} = $QoC;
265             if ($self->{_terminal_output}) {
266             print "\nDisplaying final clusters for best K (= $K) :\n";
267             display_clusters( $clusters );
268             $self->display_cluster_centers( $clusters );
269             print "\nQoC value (the smaller, the better): $QoC\n";
270             }
271             }
272              
273             # The following subroutine is the top-level routine to call if you want the system to
274             # figure out on its own what value to use for K, the number of clusters. If you call
275             # the KMeans constructor with the K=0 option, the method below will try every
276             # possible value of K between 2 and the maximum possible depending on the number of
277             # data points available. For example, if the number of data points is 10,000, it will
278             # try all possible values of K between 2 and 70. For how the maximum value is set for
279             # K, see the comments made under Description. Note also how this method makes 20
280             # different tries for each value of K as a defense against the problem of the final
281             # result corresponding to some local minimum in the values of the QoC metric. Out of
282             # these 20 tries for each K, it retains the clusters and the cluster centers for only
283             # that try that yields the smallest value for the QoC metric. After estimating the
284             # "best" QoC values for all possible K in this manner, it then finds the K for which
285             # the QoC is the minimum. This is taken to be the best value for K. Finally, the
286             # output clusters are written out to separate files.
287             #
288             # If the KMeans constructor is invoked with the (Kmin, Kmax) options, then, instead
289             # of iterating through 2 and the maximum permissible value for K, the iterations are
290             # carried out only between Kmin and Kmax.
291             sub iterate_through_K {
292             my $self = shift;
293             my @all_data_ids = @{$self->{_data_id_tags}};
294             my $N = $self->{_N};
295             croak "You need more than 8 data samples. The number of data points must satisfy " .
296             "the relation N > 2xK**2 where K is the number of clusters. The smallest " .
297             "value for K is 2.\n" if $N <= 8;
298             my $K_statistical_max = int( sqrt( $N / 2.0 ) );
299             my $Kmin = $self->{_K_min} eq 'unknown'
300             ? 2
301             : $self->{_K_min};
302             my $Kmax = $self->{_K_max} eq 'unknown'
303             ? int( sqrt( $N / 2.0 ) )
304             : $self->{_K_max};
305             croak "\n\nYour Kmax value is too high. Ideally, it should not exceed sqrt(N/2) " .
306             "where N is the number of data points" if $Kmax > $K_statistical_max;
307             print "Value of Kmax is: $Kmax\n" if $self->{_terminal_output};
308             my @QoC_values;
309             my @array_of_clusters;
310             my @array_of_cluster_centers;
311             foreach my $K ($Kmin..$Kmax) {
312             my $QoC;
313             my $clusters;
314             my $cluster_centers;
315             print "Clustering for K = $K\n" if $self->{_terminal_output};
316             if ($self->{_cluster_seeding} eq 'random') {
317             foreach my $trial (1..21) {
318             print ". ";
319             my ($new_clusters, $new_cluster_centers);
320             if ($self->{_use_mahalanobis_metric}) {
321             eval {
322             ($new_clusters, $new_cluster_centers) = $self->cluster_for_given_K($K);
323             };
324             next if $@;
325             } else {
326             ($new_clusters, $new_cluster_centers) = $self->cluster_for_given_K($K);
327             }
328             my $newQoC = $self->cluster_quality( $new_clusters, $new_cluster_centers );
329             if ( (!defined $QoC) || ($newQoC < $QoC) ) {
330             $QoC = $newQoC;
331             $clusters = deep_copy_AoA( $new_clusters );
332             $cluster_centers = deep_copy_AoA( $new_cluster_centers );
333             }
334             }
335             print "\n";
336             } elsif ($self->{_cluster_seeding} eq 'smart') {
337             eval {
338             ($clusters, $cluster_centers) = $self->cluster_for_given_K($K);
339             };
340             if ($@) {
341             $Kmax = $K - 1;
342             last;
343             }
344             $QoC = $self->cluster_quality($clusters,$cluster_centers);
345             } else {
346             die "You must either choose 'smart' for cluster_seeding or 'random'. " .
347             "Fix your constructor call."
348             }
349             push @QoC_values, $QoC;
350             push @array_of_clusters, $clusters;
351             push @array_of_cluster_centers, $cluster_centers;
352             }
353             my ($min, $max) = minmax( \@QoC_values );
354             my $K_best_relative_to_Kmin = get_index_at_value($min, \@QoC_values );
355             my $K_best = $K_best_relative_to_Kmin + $Kmin;
356             if ($self->{_terminal_output}) {
357             print "\nDisplaying final clusters for best K (= $K_best) :\n";
358             display_clusters( $array_of_clusters[$K_best_relative_to_Kmin] );
359             $self->display_cluster_centers($array_of_clusters[$K_best_relative_to_Kmin]);
360             print "\nBest clustering achieved for K=$K_best with QoC = $min\n" if defined $min;
361             my @printableQoC = grep {$_} @QoC_values;
362             print "\nQoC values array (the smaller the value, the better it is) for different " .
363             "K starting with K=$Kmin: @printableQoC\n";
364             }
365             $self->{_K_best} = $K_best;
366             foreach my $i (0..@QoC_values-1) {
367             my $k = $i + $Kmin;
368             $self->{_QoC_values}->{"$k"} = $QoC_values[$i];
369             }
370             $self->{_clusters} = $array_of_clusters[$K_best_relative_to_Kmin];
371             $self->{_cluster_centers} =
372             $array_of_cluster_centers[$K_best_relative_to_Kmin];
373             }
374              
375             # This is the function to call if you already know what value you want to use for K,
376             # the number of expected clusters. The purpose of this function is to do the
377             # initialization of the cluster centers and to carry out the initial assignment of
378             # the data to the clusters with the initial cluster centers. The initialization
379             # consists of 3 steps: Construct a random sequence of K integers between 0 and N-1
380             # where N is the number of data points to be clustered; 2) Call
381             # get_initial_cluster_centers() to index into the data array with the random integers
382             # to get a list of K data points that would serve as the initial cluster centers; and
383             # (3) Call assign_data_to_clusters_initial() to assign the rest of the data to each
384             # of the K clusters on the basis of the proximity to the cluster centers.
385             sub cluster_for_given_K {
386             my $self = shift;
387             my $K = shift;
388             my @all_data_ids = @{$self->{_data_id_tags}};
389             my $cluster_centers;
390             if ($self->{_cluster_seeding} eq 'smart') {
391             $cluster_centers = $self->get_initial_cluster_centers_smart($K);
392             } elsif ($self->{_cluster_seeding} eq 'random') {
393             $cluster_centers = $self->get_initial_cluster_centers($K);
394             } else {
395             die "You must either choose 'smart' for cluster_seeding or 'random'. " .
396             "Fix your constructor call."
397             }
398             my $clusters;
399             if ($self->{_use_mahalanobis_metric}) {
400             my $clusters_and_determinants =
401             $self->assign_data_to_clusters_initial_mahalanobis($cluster_centers);
402             $clusters = $clusters_and_determinants->[0];
403             my @determinants = @{$clusters_and_determinants->[1]};
404             my ($min,$max) = minmax(\@determinants);
405             die "In cluster_for_given_K(): min determinant of covariance matrix for at " .
406             "least one cluster is too small" if $min / $max < 0.001;
407             } else {
408             $clusters = $self->assign_data_to_clusters_initial($cluster_centers);
409             }
410             my $cluster_nonexistent_flag = 0;
411             foreach my $trial (0..2) {
412             if ($self->{_use_mahalanobis_metric}) {
413             ($clusters, $cluster_centers) = $self->assign_data_to_clusters_mahalanobis($clusters, $K);
414             } else {
415             ($clusters, $cluster_centers) = $self->assign_data_to_clusters( $clusters, $K );
416             }
417             my $num_of_clusters_returned = @$clusters;
418             foreach my $cluster (@$clusters) {
419             $cluster_nonexistent_flag = 1 if ((!defined $cluster) || (@$cluster == 0));
420             }
421             last unless $cluster_nonexistent_flag;
422             }
423             return ($clusters, $cluster_centers);
424             }
425              
426             # This function is used when you set the "cluster_seeding" option to 'random' in the
427             # constructor. Returns a set of K random integers. These serve as indices to reach
428             # into the data array. A data element whose index is one of the random numbers
429             # returned by this routine serves as an initial cluster center. Note the quality
430             # check it runs on the list of K random integers constructed. We first make sure
431             # that all K random integers are different. Subsequently, we carry out a quality
432             # assessment of the K random integers constructed. This quality measure consists of
433             # the ratio of the values spanned by the random integers to the value of N, the total
434             # number of data points to be clustered. Currently, if this ratio is less than 0.3,
435             # we discard the K integers and try again.
436             sub initialize_cluster_centers {
437             my $self = shift;
438             my $K = shift;
439             my $data_store_size = $self->{_N};
440             my @cluster_center_indices;
441             while (1) {
442             foreach my $i (0..$K-1) {
443             $cluster_center_indices[$i] = int rand( $data_store_size );
444             next if $i == 0;
445             foreach my $j (0..$i-1) {
446             while ( $cluster_center_indices[$j] ==
447             $cluster_center_indices[$i] ) {
448             my $old = $cluster_center_indices[$i];
449             $cluster_center_indices[$i] = int rand($data_store_size);
450             }
451             }
452             }
453             my ($min,$max) = minmax(\@cluster_center_indices );
454             my $quality = ($max - $min) / $data_store_size;
455             last if $quality > 0.3;
456             }
457             return @cluster_center_indices;
458             }
459              
460             # This function is used when you set the "cluster_seeding" option to 'random' in the
461             # constructor. This routine merely reaches into the data array with the random
462             # integers, as constructed by the previous routine, serving as indices and fetching
463             # values corresponding to those indices. The fetched data samples serve as the
464             # initial cluster centers.
465             sub get_initial_cluster_centers {
466             my $self = shift;
467             my $K = shift;
468             my @cluster_center_indices = $self->initialize_cluster_centers($K);
469             my @result;
470             foreach my $i (@cluster_center_indices) {
471             my $tag = $self->{_data_id_tags}[$i];
472             push @result, $self->{_data}->{$tag};
473             }
474             return \@result;
475             }
476              
477             # This method is invoked when you choose the 'smart' option for the "cluster_seeding"
478             # option in the constructor. It subjects the data to a principal components analysis
479             # to figure out the direction of maximal variance. Subsequently, it tries to locate
480             # K peaks in a smoothed histogram of the data points projected onto the maximal
481             # variance direction.
482             sub get_initial_cluster_centers_smart {
483             my $self = shift;
484             my $K = shift;
485             if ($self->{_data_dimensions} == 1) {
486             my @one_d_data;
487             foreach my $j (0..$self->{_N}-1) {
488             my $tag = $self->{_data_id_tags}[$j];
489             push @one_d_data, $self->{_data}->{$tag}->[0];
490             }
491             my @peak_points = find_peak_points_in_given_direction(\@one_d_data,$K);
492             print "highest points at data values: @peak_points\n" if $self->{_debug};
493             my @cluster_centers;
494             foreach my $peakpoint (@peak_points) {
495             push @cluster_centers, [$peakpoint];
496             }
497             return \@cluster_centers;
498             }
499             my ($num_rows,$num_cols) = ($self->{_data_dimensions},$self->{_N});
500             my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
501             my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
502             # All the record labels are stored in the array $self->{_data_id_tags}. The
503             # actual data for clustering is stored in a hash at $self->{_data} whose keys are
504             # the record labels; the value associated with each key is the array holding the
505             # corresponding numerical multidimensional data.
506             foreach my $j (0..$num_cols-1) {
507             my $tag = $self->{_data_id_tags}[$j];
508             my $data = $self->{_data}->{$tag};
509             $matrix->set_col($j, $data);
510             }
511             if ($self->{_debug}) {
512             print "\nDisplaying the original data as a matrix:";
513             display_matrix( $matrix );
514             }
515             foreach my $j (0..$num_cols-1) {
516             $mean_vec += $matrix->col($j);
517             }
518             $mean_vec *= 1.0 / $num_cols;
519             if ($self->{_debug}) {
520             print "Displaying the mean vector for the data:";
521             display_matrix( $mean_vec );
522             }
523             foreach my $j (0..$num_cols-1) {
524             my @new_col = ($matrix->col($j) - $mean_vec)->as_list;
525             $matrix->set_col($j, \@new_col);
526             }
527             if ($self->{_debug}) {
528             print "Displaying mean subtracted data as a matrix:";
529             display_matrix( $matrix );
530             }
531             my $transposed = transpose( $matrix );
532             if ($self->{_debug}) {
533             print "Displaying transposed data matrix:";
534             display_matrix( $transposed );
535             }
536             my $covariance = matrix_multiply( $matrix, $transposed );
537             $covariance *= 1.0 / $num_cols;
538             if ($self->{_debug}) {
539             print "\nDisplaying the Covariance Matrix for your data:";
540             display_matrix( $covariance );
541             }
542             my ($eigenvalues, $eigenvectors) = $covariance->eigenpair;
543             my $num_of_eigens = @$eigenvalues;
544             my $largest_eigen_index = 0;
545             my $smallest_eigen_index = 0;
546             print "Eigenvalue 0: $eigenvalues->[0]\n" if $self->{_debug};
547             foreach my $i (1..$num_of_eigens-1) {
548             $largest_eigen_index = $i if $eigenvalues->[$i] > $eigenvalues->[$largest_eigen_index];
549             $smallest_eigen_index = $i if $eigenvalues->[$i] < $eigenvalues->[$smallest_eigen_index];
550             print "Eigenvalue $i: $eigenvalues->[$i]\n" if $self->{_debug};
551             }
552             print "\nlargest eigen index: $largest_eigen_index\n" if $self->{_debug};
553             print "\nsmallest eigen index: $smallest_eigen_index\n\n" if $self->{_debug};
554             foreach my $i (0..$num_of_eigens-1) {
555             my @vec = $eigenvectors->[$i]->as_list;
556             print "Eigenvector $i: @vec\n" if $self->{_debug};
557             }
558             my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list;
559             print "\nLargest eigenvector: @largest_eigen_vec\n" if $self->{_debug};
560             my @max_var_direction;
561             # Each element of the array @largest_eigen_vec is a Math::Complex object
562             foreach my $k (0..@largest_eigen_vec-1) {
563             my ($mag, $theta) = $largest_eigen_vec[$k] =~ /\[(\d*\.\d+),(\S+)\]/;
564             if ($theta eq '0') {
565             $max_var_direction[$k] = $mag;
566             } elsif ($theta eq 'pi') {
567             $max_var_direction[$k] = -1.0 * $mag;
568             } else {
569             die "eigendecomposition of covariance matrix produced a complex " .
570             "eigenvector --- something is wrong";
571             }
572             }
573             print "\nMaximum Variance Direction: @max_var_direction\n\n" if $self->{_debug};
574             # We now project all data points on the largest eigenvector.
575             # Each projection will yield a single point on the eigenvector.
576             my @projections;
577             foreach my $j (0..$self->{_N}-1) {
578             my $tag = $self->{_data_id_tags}[$j];
579             my $data = $self->{_data}->{$tag};
580             die "Dimensionality of the largest eigenvector does not "
581             . "match the dimensionality of the data"
582             unless @max_var_direction == $self->{_data_dimensions};
583             my $projection = vector_multiply($data, \@max_var_direction);
584             push @projections, $projection;
585             }
586             print "All projection points: @projections\n" if $self->{_debug};
587             my @peak_points = find_peak_points_in_given_direction(\@projections, $K);
588             print "highest points at points along largest eigenvec: @peak_points\n" if $self->{_debug};
589             my @cluster_centers;
590             foreach my $peakpoint (@peak_points) {
591             my @actual_peak_coords = map {$peakpoint * $_} @max_var_direction;
592             push @cluster_centers, \@actual_peak_coords;
593             }
594             return \@cluster_centers;
595             }
596              
# This function is invoked when you choose the 'smart' option for the "cluster_seeding"
# option in the constructor.  It locates $how_many peaks in a smoothed histogram of
# the data points projected onto the maximal variance direction, and returns the
# unscaled 1-D locations of those peaks along that direction.
sub find_peak_points_in_given_direction {
    my $dataref  = shift;       # ref to array of 1-D projection values
    my $how_many = shift;       # number of peaks wanted (K)
    my @data = @$dataref;
    my ($min, $max) = minmax(\@data);
    my @sorted_data = sort {$a <=> $b} @data;
    my $scale = $max - $min;
    # Guard against a degenerate projection (all points identical), which would
    # otherwise cause a division by zero below:
    die "All projection points are identical --- cannot locate peaks" if $scale == 0;
    # Normalize every data point into the unit interval [0,1].  BUGFIX: the loop
    # previously ran 0..$#sorted_data-1, which left the largest point unnormalized
    # and therefore excluded from the histogram built below.
    foreach my $index (0..$#sorted_data) {
        $sorted_data[$index] = ($sorted_data[$index] - $min) / $scale;
    }
    # Build a 1000-cell histogram over [0,1]; each point votes into a +/- 40 cell
    # window around its own cell, which smooths the histogram.
    my $delta = 1.0 / 1000.0;
    my @accumulator = (0) x 1000;
    foreach my $point (@sorted_data) {
        my $cell_index = int($point / $delta);
        my $smoothness = 40;
        for my $cell ($cell_index-$smoothness..$cell_index+$smoothness) {
            next if $cell < 0 || $cell > 999;
            $accumulator[$cell]++;
        }
    }
    my $peaks_array = non_maximum_suppression( \@accumulator );
    my $peaks_index_hash = get_value_index_hash( $peaks_array );
    # Walk the surviving peaks from highest to lowest and keep the first K, mapping
    # each histogram cell back to the original (unscaled) coordinate system:
    my @K_highest_peak_locations;
    my $k = 0;
    foreach my $peak (sort {$b <=> $a} keys %$peaks_index_hash) {
        my $unscaled_peak_point = $min + $peaks_index_hash->{$peak} * $scale * $delta;
        push @K_highest_peak_locations, $unscaled_peak_point if $k < $how_many;
        last if ++$k == $how_many;
    }
    return @K_highest_peak_locations;
}
639              
# Form the initial clusters: each data sample (identified by its symbolic tag) is
# assigned to whichever of the seed cluster centers it is closest to in the
# Euclidean sense.  Returns a ref to an array of clusters, each cluster being an
# array of data tags.
sub assign_data_to_clusters_initial {
    my ($self, $centers_ref) = @_;
    my @clusters;
    for my $tag (@{$self->{_data_id_tags}}) {
        my @distances = map { $self->distance($tag, $_) } @$centers_ref;
        my (undef, $winner) = minimum( \@distances );
        push @{ $clusters[$winner] }, $tag;
    }
    return \@clusters;
}
658              
# Mahalanobis variant of assign_data_to_clusters_initial(): forms the initial
# clusters by nearest Euclidean center, then estimates a covariance matrix (and its
# determinant) for each cluster.  Since a seed center may attract no samples at all,
# a slot of the cluster array can come out undefined --- that is fatal here because
# no covariance can be estimated for an empty cluster.
# Returns [\@clusters, \@determinants].
sub assign_data_to_clusters_initial_mahalanobis {
    my ($self, $centers_ref) = @_;
    my @clusters;
    for my $tag (@{$self->{_data_id_tags}}) {
        my @distances = map { $self->distance($tag, $_) } @$centers_ref;
        my (undef, $winner) = minimum( \@distances );
        push @{ $clusters[$winner] }, $tag if defined $winner;
    }
    my @determinants;
    for my $cluster (@clusters) {
        die "The clustering program started with bad initialization. Please start over"
            unless defined $cluster;
        my $covariance = $self->estimate_cluster_covariance($cluster);
        push @determinants, $covariance->det();
    }
    return [\@clusters, \@determinants];
}
689              
# This is the main routine that, along with the update_cluster_centers() routine,
# constitutes the two key steps of the K-Means algorithm: recompute the cluster
# centers, then re-assign every data point to its nearest center, repeating until
# the assignments no longer change.  For safety, the number of iterations is capped
# at 100.  In most cases that limit is never reached.
# Args: ref to the current clusters (arrays of data tags) and the number K.
# Returns the pair ($clusters, $final_cluster_centers).
sub assign_data_to_clusters {
    my $self = shift;
    my $clusters = shift;
    my $K = shift;
    my $final_cluster_centers;
    my $iteration_index = 0;
    while (1) {
        my $new_clusters;
        my $assignment_changed_flag = 0;
        my $current_cluster_center_index = 0;
        my $cluster_size_zero_condition = 0;
        my $how_many = @$clusters;
        # Centers are recomputed from a deep copy because update_cluster_centers()
        # can mutate the clusters it is handed (it steals members to repair empties).
        my $cluster_centers = $self->update_cluster_centers( deep_copy_AoA_with_nulls( $clusters ) );
        $iteration_index++;
        foreach my $cluster (@$clusters) {
            my $current_cluster_center = $cluster_centers->[$current_cluster_center_index];
            foreach my $ele (@$cluster) {
                # Find the center nearest to this element:
                my @dist_from_clust_centers;
                foreach my $center (@$cluster_centers) {
                    push @dist_from_clust_centers,
                        $self->distance($ele, $center);
                }
                my ($min, $best_center_index) = minimum( \@dist_from_clust_centers );
                my $best_cluster_center = $cluster_centers->[$best_center_index];
                if (vector_equal($current_cluster_center, $best_cluster_center)){
                    # Element stays in its current cluster:
                    push @{$new_clusters->[$current_cluster_center_index]}, $ele;
                } else {
                    # Element migrates --- so we have not converged yet:
                    $assignment_changed_flag = 1;
                    push @{$new_clusters->[$best_center_index]}, $ele;
                }
            }
            $current_cluster_center_index++;
        }
        # If the re-assignment produced a different number of clusters, iterate
        # again (unless the iteration budget is exhausted):
        next if ((@$new_clusters != @$clusters) && ($iteration_index < 100));
        # Now make sure that none of the K clusters is an empty cluster:
        foreach my $newcluster (@$new_clusters) {
            $cluster_size_zero_condition = 1 if ((!defined $newcluster) or (@$newcluster == 0));
        }
        # Pad to length K with undefs so the repair step below sees every slot:
        push @$new_clusters, (undef) x ($K - @$new_clusters) if @$new_clusters < $K;
        # During clustering for a fixed K, should a cluster inadvertently
        # become empty, steal a member from the largest cluster to hopefully
        # spawn a new cluster:
        my $largest_cluster;
        foreach my $local_cluster (@$new_clusters) {
            next if !defined $local_cluster;
            $largest_cluster = $local_cluster if !defined $largest_cluster;
            if (@$local_cluster > @$largest_cluster) {
                $largest_cluster = $local_cluster;
            }
        }
        foreach my $local_cluster (@$new_clusters) {
            if ( (!defined $local_cluster) || (@$local_cluster == 0) ) {
                push @$local_cluster, pop @$largest_cluster;
            }
        }
        next if (($cluster_size_zero_condition) && ($iteration_index < 100));
        last if $iteration_index == 100;
        $clusters = deep_copy_AoA( $new_clusters );
        # Converged: no element changed cluster during this pass.
        last if $assignment_changed_flag == 0;
    }
    $final_cluster_centers = $self->update_cluster_centers( $clusters );
    return ($clusters, $final_cluster_centers);
}
759              
# Mahalanobis version of assign_data_to_clusters(): alternates re-estimation of the
# per-cluster centers and covariance matrices with re-assignment of the data points
# by Mahalanobis distance, until the assignments stop changing (capped at 100
# iterations for safety).  Returns ($clusters, $final_cluster_centers).
sub assign_data_to_clusters_mahalanobis {
    my $self = shift;
    my $clusters = shift;
    my $K = shift;
    my $final_cluster_centers;
    my $iteration_index = 0;
    while (1) {
        my $new_clusters;
        my $assignment_changed_flag = 0;
        my $current_cluster_center_index = 0;
        my $cluster_size_zero_condition = 0;
        my $how_many = @$clusters;
        # Re-estimate centers and covariances from a deep copy, since the update
        # routine can mutate the clusters it is handed:
        my $cluster_centers_and_covariances =
            $self->update_cluster_centers_and_covariances_mahalanobis(deep_copy_AoA_with_nulls($clusters));
        my $cluster_centers = $cluster_centers_and_covariances->[0];
        my $cluster_covariances = $cluster_centers_and_covariances->[1];
        $iteration_index++;
        foreach my $cluster (@$clusters) {
            my $current_cluster_center = $cluster_centers->[$current_cluster_center_index];
            my $current_cluster_covariance = $cluster_covariances->[$current_cluster_center_index];
            foreach my $ele (@$cluster) {
                my @mahalanobis_dist_from_clust_centers;
                foreach my $i (0..@$clusters-1) {
                    my $center = $cluster_centers->[$i];
                    my $covariance = $cluster_covariances->[$i];
                    my $maha_distance;
                    # distance_mahalanobis() dies on a near-singular covariance; such
                    # a cluster is simply skipped as a candidate destination.
                    # NOTE(review): skipping shortens the distance array, so the
                    # index returned by minimum() below no longer aligns with the
                    # cluster indices whenever a skip occurs --- a latent
                    # misassignment bug; confirm and fix alignment before relying on
                    # this path.
                    eval {
                        $maha_distance = $self->distance_mahalanobis($ele, $center, $covariance);
                    };
                    next if $@;
                    push @mahalanobis_dist_from_clust_centers, $maha_distance;
                }
                my ($min, $best_center_index) = minimum( \@mahalanobis_dist_from_clust_centers );
                die "The Mahalanobis metric may not be appropriate for the data"
                    unless defined $best_center_index;
                my $best_cluster_center = $cluster_centers->[$best_center_index];
                if (vector_equal($current_cluster_center, $best_cluster_center)){
                    # Element stays in its current cluster:
                    push @{$new_clusters->[$current_cluster_center_index]}, $ele;
                } else {
                    # Element migrates --- so we have not converged yet:
                    $assignment_changed_flag = 1;
                    push @{$new_clusters->[$best_center_index]}, $ele;
                }
            }
            $current_cluster_center_index++;
        }
        # If the re-assignment produced a different number of clusters, iterate
        # again (unless the iteration budget is exhausted):
        next if ((@$new_clusters != @$clusters) && ($iteration_index < 100));
        # Now make sure that none of the K clusters is an empty cluster:
        foreach my $newcluster (@$new_clusters) {
            $cluster_size_zero_condition = 1 if ((!defined $newcluster) or (@$newcluster == 0));
        }
        # Pad to length K with undefs so the repair step below sees every slot:
        push @$new_clusters, (undef) x ($K - @$new_clusters) if @$new_clusters < $K;
        # During clustering for a fixed K, should a cluster inadvertently
        # become empty, steal a member from the largest cluster to hopefully
        # spawn a new cluster:
        my $largest_cluster;
        foreach my $local_cluster (@$new_clusters) {
            next if !defined $local_cluster;
            $largest_cluster = $local_cluster if !defined $largest_cluster;
            if (@$local_cluster > @$largest_cluster) {
                $largest_cluster = $local_cluster;
            }
        }
        foreach my $local_cluster (@$new_clusters) {
            if ( (!defined $local_cluster) || (@$local_cluster == 0) ) {
                push @$local_cluster, pop @$largest_cluster;
            }
        }
        next if (($cluster_size_zero_condition) && ($iteration_index < 100));
        last if $iteration_index == 100;
        # Now do a deep copy of new_clusters into clusters
        $clusters = deep_copy_AoA( $new_clusters );
        # Converged: no element changed cluster during this pass.
        last if $assignment_changed_flag == 0;
    }
    $final_cluster_centers = $self->update_cluster_centers( $clusters );
    return ($clusters, $final_cluster_centers);
}
836              
# For the Mahalanobis variant: from the current cluster memberships, recompute each
# cluster's center (the mean of its member coordinates) and estimate each cluster's
# covariance matrix as (1/N) * M * M', where the columns of M are the member
# coordinates with the cluster mean subtracted out.
# Returns [\@new_cluster_centers, \@new_cluster_covariances].
sub update_cluster_centers_and_covariances_mahalanobis {
    my $self = shift;
    my @clusters = @{ shift @_ };
    my @new_cluster_centers;
    my @new_cluster_covariances;
    # During clustering for a fixed K, should a cluster inadvertently become empty,
    # steal a member from the largest cluster to hopefully spawn a new cluster:
    my $largest_cluster;
    foreach my $cluster (@clusters) {
        next if !defined $cluster;
        $largest_cluster = $cluster if !defined $largest_cluster;
        if (@$cluster > @$largest_cluster) {
            $largest_cluster = $cluster;
        }
    }
    foreach my $cluster (@clusters) {
        if ( (!defined $cluster) || (@$cluster == 0) ) {
            push @$cluster, pop @$largest_cluster;
        }
    }
    foreach my $cluster (@clusters) {
        die "Cluster became empty --- untenable condition " .
            "for a given K. Try again. \n" if !defined $cluster;
        my $cluster_size = @$cluster;
        die "Cluster size is zero --- untenable.\n" if $cluster_size == 0;
        # The new center is the mean of the coordinates of the cluster's members:
        my @new_cluster_center = @{$self->add_point_coords( $cluster )};
        @new_cluster_center = map {my $x = $_/$cluster_size; $x} @new_cluster_center;
        push @new_cluster_centers, \@new_cluster_center;
        # For the covariance calculation, build a (dimensions x cluster_size) matrix
        # whose columns are mean-subtracted member coordinates.  (An unused
        # $mean_vec scratch matrix has been removed here.)
        my ($num_rows,$num_cols) = ($self->{_data_dimensions}, scalar(@$cluster));
        my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
        # All the record labels are stored in the array $self->{_data_id_tags}. The
        # actual data for clustering is stored in a hash at $self->{_data} whose keys
        # are the record labels; the value associated with each key is the array
        # holding the corresponding numerical multidimensional data.
        foreach my $j (0..$num_cols-1) {
            my $tag = $cluster->[$j];
            my $data = $self->{_data}->{$tag};
            my @diff_from_mean = vector_subtract($data, \@new_cluster_center);
            $matrix->set_col($j, \@diff_from_mean);
        }
        my $transposed = transpose( $matrix );
        my $covariance = matrix_multiply( $matrix, $transposed );
        $covariance *= 1.0 / $num_cols;
        if ($self->{_debug}) {
            print "\nDisplaying the Covariance Matrix for cluster:";
            display_matrix( $covariance );
        }
        push @new_cluster_covariances, $covariance;
    }
    return [\@new_cluster_centers, \@new_cluster_covariances];
}
890              
# After each re-assignment of the data points, recompute every cluster center as the
# mean of the coordinates of that cluster's members.  If a cluster has gone empty
# (or undefined), a member is first stolen from the largest cluster so that all K
# clusters stay populated.  Returns a ref to the array of new center vectors.
sub update_cluster_centers {
    my ($self, $clusters_ref) = @_;
    my @clusters = @$clusters_ref;
    # Locate the most populous cluster --- the donor for any empty cluster:
    my $largest_cluster;
    for my $cluster (@clusters) {
        next unless defined $cluster;
        $largest_cluster = $cluster
            if !defined($largest_cluster) || @$cluster > @$largest_cluster;
    }
    # Repair empty/undefined clusters by stealing one member from the donor:
    for my $cluster (@clusters) {
        push @$cluster, pop @$largest_cluster
            if !defined($cluster) || @$cluster == 0;
    }
    my @new_centers;
    for my $cluster (@clusters) {
        die "Cluster became empty --- untenable condition " .
            "for a given K. Try again. \n" if !defined $cluster;
        my $size = @$cluster;
        die "Cluster size is zero --- untenable.\n" if $size == 0;
        # Sum the member coordinates, then divide by the member count:
        my $coord_sums = $self->add_point_coords($cluster);
        push @new_centers, [ map { $_ / $size } @$coord_sums ];
    }
    return \@new_centers;
}
925              
# Given a new data element as a ref to its coordinate array, return the id of the
# existing cluster whose center is the closest in the Euclidean sense.  Dies when
# the element's dimensionality does not match the clustered data.
sub which_cluster_for_new_data_element {
    my ($self, $ele) = @_;
    die "The dimensionality of the new data element is not correct: $!"
        unless @$ele == $self->{_data_dimensions};
    my %dist_for;
    for my $cluster_id (sort keys %{$self->{_cluster_centers_hash}}) {
        $dist_for{$cluster_id} =
            $self->distance2($ele, $self->{_cluster_centers_hash}->{$cluster_id});
    }
    my @all_distances = values %dist_for;
    my ($min, $max) = minmax(\@all_distances);
    my ($closest) = grep { $dist_for{$_} == $min } keys %dist_for;
    return $closest;
}
944              
# Same as which_cluster_for_new_data_element(), but the proximity of the new data
# element to each cluster is measured with the Mahalanobis distance using that
# cluster's stored covariance matrix.
sub which_cluster_for_new_data_element_mahalanobis {
    my ($self, $ele) = @_;
    die "The dimensionality of the new data element is not correct: $!"
        unless @$ele == $self->{_data_dimensions};
    my %dist_for;
    for my $cluster_id (sort keys %{$self->{_cluster_centers_hash}}) {
        $dist_for{$cluster_id} =
            $self->distance_mahalanobis2($ele,
                $self->{_cluster_centers_hash}->{$cluster_id},
                $self->{_cluster_covariances_hash}->{$cluster_id});
    }
    my @all_distances = values %dist_for;
    my ($min, $max) = minmax(\@all_distances);
    my ($closest) = grep { $dist_for{$_} == $min } keys %dist_for;
    return $closest;
}
964              
# Returns the QoC ("quality of clustering") value for a given partitioning of the
# data into K clusters.  It is the ratio of (a) the sum over all clusters of the
# average member-to-center distance ($cluster_radius below) to (b) the average
# distance between cluster centers.  We want (a) small and (b) large, so a smaller
# QoC is better.
sub cluster_quality {
    my $self = shift;
    my $clusters = shift;
    my $cluster_centers = shift;
    my $K = @$cluster_centers; # Number of clusters
    my $cluster_radius = 0;
    foreach my $i (0..@$clusters-1) {
        my $sum_of_distances_for_one_cluster = 0;
        foreach my $ele (@{$clusters->[$i]}) {
            $sum_of_distances_for_one_cluster +=
                $self->distance( $ele, $cluster_centers->[$i] );
        }
        # Accumulate the per-cluster average member-to-center distance:
        $cluster_radius +=
            $sum_of_distances_for_one_cluster / @{$clusters->[$i]};
    }
    my $inter_cluster_center_dist = 0;
    # NOTE(review): this double loop visits every ORDERED pair (i,j), including
    # i == j (which contributes zero), so the sum is twice the sum over distinct
    # pairs.  Divided by K(K-1)/2 below, the "average" is therefore twice the true
    # average inter-center distance.  Since QoC values appear to be used only for
    # comparison across different K, the constant factor is presumably harmless ---
    # confirm before changing.
    foreach my $i (0..@$cluster_centers-1) {
        foreach my $j (0..@$cluster_centers-1) {
            $inter_cluster_center_dist +=
                $self->distance2( $cluster_centers->[$i],
                                  $cluster_centers->[$j] );
        }
    }
    # NOTE(review): divides by zero when K == 1 --- callers must use K >= 2.
    my $avg_inter_cluster_center_dist = $inter_cluster_center_dist /
        ( $K * ($K-1) / 2.0 );
    return $cluster_radius / $avg_inter_cluster_center_dist;
}
1002              
# Euclidean distance between a data sample, named by its symbolic tag in the master
# datafile, and a point (such as a cluster center) given as a ref to a coordinate
# vector.  Dies if the two vectors differ in length.
sub distance {
    my ($self, $tag, $point_ref) = @_;
    my @sample = @{ $self->{_data}->{$tag} };
    my @point  = @$point_ref;
    die "wrong data types for distance calculation\n" if @sample != @point;
    my $sum_sq = 0;
    $sum_sq += ($sample[$_] - $point[$_]) ** 2 for 0 .. $#sample;
    return sqrt $sum_sq;
}
1020              
# Euclidean distance between two points, each given as a ref to an array of
# coordinates.  Dies if the two vectors differ in length.
sub distance2 {
    my ($self, $p_ref, $q_ref) = @_;
    my @p = @$p_ref;
    my @q = @$q_ref;
    die "wrong data types for distance calculation\n" if @p != @q;
    my $sum_sq = 0;
    $sum_sq += ($p[$_] - $q[$_]) ** 2 for 0 .. $#p;
    return sqrt $sum_sq;
}
1035              
# Mahalanobis distance from a data element, given by its symbolic tag, to a cluster
# whose center coordinates ($cluster_center) and covariance matrix
# ($cluster_covariance) are supplied.  Dies when the covariance matrix is
# numerically singular, since the metric requires its inverse.
# (Dead code removed: the determinant was previously computed three times, with two
# of the results --- $det_of_covar and $determinant --- never used.)
sub distance_mahalanobis {
    my ($self, $ele, $cluster_center, $cluster_covariance) = @_;
    my $ele_data = $self->{_data}->{$ele};
    my @diff_from_mean = vector_subtract($ele_data, $cluster_center);
    my $vec = Math::GSL::Matrix->new($self->{_data_dimensions}, 1);
    $vec->set_col(0, \@diff_from_mean);
    my $transposed = transpose( $vec );
    # The 0.001 threshold is a heuristic guard against near-singular matrices:
    die "covariance matrix may not have an inverse"
        unless $cluster_covariance->det() > 0.001;
    my $covariance_inverse = $cluster_covariance->inverse;
    # squared distance = (x - mu)' * C^{-1} * (x - mu)
    my $distance = $transposed * $covariance_inverse * $vec;
    my ($squared) = $distance->as_list;
    return sqrt $squared;
}
# Same as distance_mahalanobis() except that the first argument is a ref to the
# actual coordinates of the data element rather than its symbolic tag.  Returns
# undef when no covariance matrix is supplied; dies when the covariance matrix is
# numerically singular.
# (Dead code removed: the determinant was previously computed three times, with two
# of the results --- $det_of_covar and $determinant --- never used.)
sub distance_mahalanobis2 {
    my ($self, $ele, $cluster_center, $cluster_covariance) = @_;
    # Keep 'return undef' (not a bare return) so scalar-context callers see undef:
    return undef unless defined $cluster_covariance;
    my @diff_from_mean = vector_subtract($ele, $cluster_center);
    my $vec = Math::GSL::Matrix->new($self->{_data_dimensions}, 1);
    $vec->set_col(0, \@diff_from_mean);
    my $transposed = transpose( $vec );
    # The 0.001 threshold is a heuristic guard against near-singular matrices:
    die "covariance matrix may not have an inverse"
        unless $cluster_covariance->det() > 0.001;
    my $covariance_inverse = $cluster_covariance->inverse;
    # squared distance = (x - mu)' * C^{-1} * (x - mu)
    my $distance = $transposed * $covariance_inverse * $vec;
    my ($squared) = $distance->as_list;
    return sqrt $squared;
}
1090              
# Estimate the covariance matrix of one cluster (given as a ref to an array of data
# tags): compute the cluster mean, form the (dimensions x N) matrix M whose columns
# are the mean-subtracted member coordinates, and return (1/N) * M * M'.
# (Dead code removed: an unused $mean_vec scratch matrix.)
sub estimate_cluster_covariance {
    my ($self, $cluster) = @_;
    my $cluster_size = @$cluster;
    # The cluster mean: coordinate sums divided by the member count.
    my @cluster_center = @{$self->add_point_coords( $cluster )};
    @cluster_center = map {my $x = $_/$cluster_size; $x} @cluster_center;
    my ($num_rows,$num_cols) = ($self->{_data_dimensions}, scalar(@$cluster));
    my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
    # All the record labels are stored in the array $self->{_data_id_tags}. The
    # actual data for clustering is stored in a hash at $self->{_data} whose keys are
    # the record labels; the value associated with each key is the array holding the
    # corresponding numerical multidimensional data.
    foreach my $j (0..$num_cols-1) {
        my $tag = $cluster->[$j];
        my $data = $self->{_data}->{$tag};
        my @diff_from_mean = vector_subtract($data, \@cluster_center);
        $matrix->set_col($j, \@diff_from_mean);
    }
    my $transposed = transpose( $matrix );
    my $covariance = $matrix * $transposed;
    $covariance *= 1.0 / $num_cols;
    if ($self->{_debug}) {
        print "\nDisplaying the Covariance Matrix for cluster:";
        display_matrix( $covariance );
    }
    return $covariance;
}
1120              
# Write each cluster out to its own file cluster0.txt, cluster1.txt, ..., piping the
# member tags through sort(1).  Cluster files left over from a previous run are
# removed first so that stale clusters do not linger when K shrinks.
sub write_clusters_to_files {
    my $self = shift;
    my @clusters = @{$self->{_clusters}};
    # BUGFIX: previously only "cluster*.dat" was removed although the files written
    # here are named "cluster<i>.txt", so stale .txt files survived between runs.
    unlink glob "cluster*.dat";
    unlink glob "cluster*.txt";
    foreach my $i (0..@clusters-1) {
        my $filename = "cluster" . $i . ".txt";
        print "\nWriting cluster $i to file $filename\n" if $self->{_terminal_output};
        # Lexical filehandle instead of the old bareword FILEHANDLE; the pipe
        # command string is built only from our own loop index, not user input.
        open my $out, '|-', "sort > $filename" or die "Unable to open file: $!";
        foreach my $ele (@{$clusters[$i]}) {
            print {$out} "$ele ";
        }
        # close() on a pipe also reaps the child; check it so write errors surface.
        close $out or die "Unable to close file: $!";
    }
}
1135              
# Accessor for the best K found by a K=0 clustering run.  Croaks if the clusterer
# has not yet been run with the K=0 option.
sub get_K_best {
    my $self = shift;
    my $best = $self->{_K_best};
    croak "You need to run the clusterer with K=0 option " .
          "before you can call this method" if $best eq 'unknown';
    print "\nThe best value of K: $best\n" if $self->{_terminal_output};
    return $best;
}
1143              
# Print the QoC value recorded for each K tried during a K=0 run (the smaller the
# QoC, the better).  Croaks if the clusterer has not yet been run with K=0.
sub show_QoC_values {
    my $self = shift;
    croak "\nYou need to run the clusterer with K=0 option before you can call this method"
        if $self->{_K_best} eq 'unknown';
    print "\nShown below are K on the left and the QoC values on the right (the smaller " .
          "the QoC, the better it is)\n";
    my $qoc_for = $self->{_QoC_values};
    for my $K (sort keys %$qoc_for) {
        next unless defined $qoc_for->{$K};
        print " $K => $qoc_for->{$K}\n";
    }
}
1154              
# Destructor: remove the temporary files this instance created alongside the
# original datafile (unlink on a non-existent file is harmless).
sub DESTROY {
    my $self = shift;
    my $base = basename($self->{_datafile});
    unlink "__temp_$base", "__temp_data_$base", "__temp_normed_data_$base";
}
1160              
1161             ################################## Visualization Code ###################################
1162              
1163             # It makes sense to call visualize_clusters() only AFTER you have called kmeans().
1164             #
1165             # The visualize_clusters() implementation automatically figures out whether it
1166             # should do a 2D plot or a 3D plot. If the number of on bits in the mask that is
1167             # supplied as one of the arguments is greater than 2, it does a 3D plot for the
1168             # first three data coordinates. That is, the clusters will be displayed in the 3D
1169             # space formed by the first three data coordinates. On the other hand, if the number
1170             # of on bits in the mask is exactly 2, it does a 2D plot. Should it happen that
1171             # only one on bit is specified for the mask, visualize_clusters() aborts.
1172             #
1173             # The visualization code consists of first accessing each of clusters created by the
1174             # kmeans() subroutine. Note that the clusters contain only the symbolic names for
1175             # the individual records in the source data file. We therefore next reach into the
1176             # $self->{_original_data} hash and get the data coordinates associated with each
1177             # symbolic label in a cluster. The numerical data thus generated is then written
1178             # out to a temp file. When doing so we must remember to insert TWO BLANK LINES
1179             # between the data blocks corresponding to the different clusters. This constraint
1180             # is imposed on us by Gnuplot when plotting data from the same file since we want to
1181             # use different point styles for the data points in different cluster files.
1182             #
1183             # Subsequently, we call upon the Perl interface provided by the Graphics::GnuplotIF
1184             # module to plot the data clusters.
# Plot the clusters produced by a previous call to kmeans().  The
# visualization mask selects which data coordinates to display: three or
# more '1' bits give a 3D plot of the first three selected coordinates,
# exactly two give a 2D plot, and a single coordinate is rejected.  An
# optional second argument is a pause time in seconds; when supplied the
# plot is shown transiently and no hardcopy PNG is produced.
#
# Fixes relative to the original: three-argument open with a lexical
# filehandle (was a 2-arg bareword open), checked close, the trailing-comma
# strip no longer goes through a match-then-$1 round trip that leaves
# $arg_string undefined when the match fails, and the unused @all_data_ids
# local has been removed.
sub visualize_clusters {
    my $self = shift;
    my $v_mask;
    my $pause_time;
    if (@_ == 1) {
        $v_mask = shift || croak "visualization mask missing";
    } elsif (@_ == 2) {
        $v_mask = shift || croak "visualization mask missing";
        $pause_time = shift;
    } else {
        croak "visualize_clusters() called with wrong args";
    }
    my $master_datafile = $self->{_datafile};
    my @v_mask = split //, $v_mask;
    my $visualization_mask_width = @v_mask;
    my $original_data_mask = $self->{_mask};
    my @mask = split //, $original_data_mask;
    my $data_field_width = scalar grep {$_ eq '1'} @mask;
    croak "\n\nABORTED: The width of the visualization mask (including " .
          "all its 1s and 0s) must equal the width of the original mask " .
          "used for reading the data file (counting only the 1's)"
        if $visualization_mask_width != $data_field_width;
    my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
    # Project each record onto the coordinates selected by the mask:
    my %visualization_data;
    while ( my ($record_id, $data) = each %{$self->{_original_data}} ) {
        my @fields = @$data;
        croak "\nABORTED: Visualization mask size exceeds data record size\n"
            if $#v_mask > $#fields;
        my @data_fields;
        foreach my $i (0..@fields-1) {
            if ($v_mask[$i] eq '0') {
                next;
            } elsif ($v_mask[$i] eq '1') {
                push @data_fields, $fields[$i];
            } else {
                croak "Misformed visualization mask. It can only have 1s and 0s\n";
            }
        }
        $visualization_data{ $record_id } = \@data_fields;
    }
    my $K = scalar @{$self->{_clusters}};
    my $filename = basename($master_datafile);
    my $temp_file = "__temp_" . $filename;
    unlink $temp_file if -e $temp_file;
    # Gnuplot requires two blank lines between the data blocks so each
    # cluster is addressable as a separate "index" with its own point style.
    open my $OUTPUT, '>', $temp_file
        or die "Unable to open a temp file in this directory: $!\n";
    foreach my $cluster (@{$self->{_clusters}}) {
        foreach my $item (@$cluster) {
            print {$OUTPUT} "@{$visualization_data{$item}}\n";
        }
        print {$OUTPUT} "\n\n";
    }
    close $OUTPUT or die "Could not close $temp_file: $!\n";
    my $plot;
    my $hardcopy_plot;
    if (!defined $pause_time) {
        $plot = Graphics::GnuplotIF->new( persist => 1 );
        $hardcopy_plot = Graphics::GnuplotIF->new();
        $hardcopy_plot->gnuplot_cmd('set terminal png', "set output \"clustering_results.png\"");
    } else {
        $plot = Graphics::GnuplotIF->new();
    }
    $plot->gnuplot_cmd( "set noclip" );
    $plot->gnuplot_cmd( "set pointsize 2" );
    # One plot clause per cluster, each with a distinct line/point type:
    my $using = $visualization_data_field_width > 2  ? '1:2:3'
              : $visualization_data_field_width == 2 ? '1:2'
              :                                        '1';
    my @plot_clauses;
    foreach my $i (0..$K-1) {
        my $j = $i + 1;
        push @plot_clauses,
            "\"$temp_file\" index $i using $using title \"Cluster $i\" with points lt $j pt $j";
    }
    my $arg_string = join ', ', @plot_clauses;
    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd( "splot $arg_string" );
        $hardcopy_plot->gnuplot_cmd( "splot $arg_string" ) unless defined $pause_time;
        $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd( "plot $arg_string" );
        $hardcopy_plot->gnuplot_cmd( "plot $arg_string" ) unless defined $pause_time;
        $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
    } elsif ($visualization_data_field_width == 1) {
        croak "No provision for plotting 1-D data\n";
    }
}
1282              
1283             # It makes sense to call visualize_data() only AFTER you have called the method
1284             # read_data_from_file().
1285             #
1286             # The visualize_data() is meant for the visualization of the original data in its
1287             # various 2D or 3D subspaces. The method can also be used to visualize the normed
1288             # data in a similar manner. Recall the normed data is the original data after each
1289             # data dimension is normalized by the standard-deviation along that dimension.
1290             #
1291             # Whether you see the original data or the normed data depends on the second
1292             # argument supplied in the method call. It must be either the string 'original' or
1293             # the string 'normed'.
# Plot the original or the variance-normalized data (the second argument
# must be the string 'original' or 'normed') in the 2D or 3D subspace
# selected by the visualization mask.  Meant to be called only after
# read_data_from_file(); displaying normed data additionally requires that
# variance normalization was turned on in the constructor.
#
# Fixes relative to the original: three-argument open with a lexical
# filehandle (was a 2-arg bareword open) and a checked close.
sub visualize_data {
    my $self = shift;
    my $v_mask = shift || croak "visualization mask missing";
    my $datatype = shift;    # must be either 'original' or 'normed'
    croak "\n\nABORTED: You called visualize_data() for normed data " .
          "but without first turning on data normalization in the " .
          "in the KMeans constructor"
        if ($datatype eq 'normed') && ! $self->{_var_normalize};
    my $master_datafile = $self->{_datafile};
    my @v_mask = split //, $v_mask;
    my $visualization_mask_width = @v_mask;
    my $original_data_mask = $self->{_mask};
    my @mask = split //, $original_data_mask;
    my $data_field_width = scalar grep {$_ eq '1'} @mask;
    croak "\n\nABORTED: The width of the visualization mask (including " .
          "all its 1s and 0s) must equal the width of the original mask " .
          "used for reading the data file (counting only the 1's)"
        if $visualization_mask_width != $data_field_width;
    my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
    my $data_source;
    if ($datatype eq 'original') {
        $data_source = $self->{_original_data};
    } elsif ($datatype eq 'normed') {
        $data_source = $self->{_data};
    } else {
        croak "\n\nABORTED: improper call to visualize_data()";
    }
    # Project every record onto the masked coordinates:
    my %visualization_data;
    while ( my ($record_id, $data) = each %{$data_source} ) {
        my @fields = @$data;
        croak "\nABORTED: Visualization mask size exceeds data record size\n"
            if $#v_mask > $#fields;
        my @data_fields;
        foreach my $i (0..@fields-1) {
            if ($v_mask[$i] eq '0') {
                next;
            } elsif ($v_mask[$i] eq '1') {
                push @data_fields, $fields[$i];
            } else {
                croak "Misformed visualization mask. It can only have 1s and 0s\n";
            }
        }
        $visualization_data{ $record_id } = \@data_fields;
    }
    my $filename = basename($master_datafile);
    # $datatype is already validated above, so a ternary suffices here:
    my $temp_file = $datatype eq 'original' ? "__temp_data_" . $filename
                                            : "__temp_normed_data_" . $filename;
    unlink $temp_file if -e $temp_file;
    open my $OUTPUT, '>', $temp_file
        or die "Unable to open a temp file in this directory: $!\n";
    foreach my $datapoint (values %visualization_data) {
        print {$OUTPUT} "@$datapoint\n";
    }
    close $OUTPUT or die "Could not close $temp_file: $!\n";
    my $plot = Graphics::GnuplotIF->new( persist => 1 );
    $plot->gnuplot_cmd( "set noclip" );
    $plot->gnuplot_cmd( "set pointsize 2" );
    my $plot_title = $datatype eq 'original' ? '"data"' : '"normed data"';
    my $arg_string;
    if ($visualization_data_field_width > 2) {
        $arg_string = "\"$temp_file\" using 1:2:3 title $plot_title with points lt -1 pt 1";
    } elsif ($visualization_data_field_width == 2) {
        $arg_string = "\"$temp_file\" using 1:2 title $plot_title with points lt -1 pt 1";
    } elsif ($visualization_data_field_width == 1 ) {
        $arg_string = "\"$temp_file\" using 1 notitle with points lt -1 pt 1";
    }
    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd( "splot $arg_string" );
    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd( "plot $arg_string" );
    } elsif ($visualization_data_field_width == 1) {
        croak "No provision for plotting 1-D data\n";
    }
}
1375              
1376             ########################### Generating Synthetic Data for Clustering ##############################
1377              
1378             # The data generated corresponds to a multivariate distribution. The mean and the
1379             # covariance of each Gaussian in the distribution are specified individually in a
1380             # parameter file. See the example parameter file param.txt in the examples
1381             # directory. Just edit this file for your own needs.
1382             #
1383             # The multivariate random numbers are generated by calling the Math::Random module.
1384             # As you would expect, that module will insist that the covariance matrix you
1385             # specify be symmetric and positive definite.
# Class method: generate synthetic multivariate-Gaussian data for exercising
# the clusterer.  Cluster means and covariances come from a parameter file
# (see param.txt in the examples directory); without one, a fixed
# three-cluster configuration is used (as in t/test.t).  Writes the labeled,
# shuffled points to output_datafile.
#
# Fixes relative to the original:
#  - the parameter-file open used `open INPUT, $f || "msg"` where || binds
#    to the filename, so a failed open was never reported; it now uses a
#    checked three-argument open with a lexical filehandle;
#  - the readline of the parameter file had been lost (empty assignment);
#  - point labels are now built with a dedicated per-cluster counter; the
#    original incremented the foreach loop alias $i inside the map, which
#    dies on the read-only range element (the declared-but-unused $p was
#    evidently intended for this);
#  - the output-file open is checked.
sub cluster_data_generator {
    my $class = shift;
    croak "illegal call of a class method" unless $class eq 'Algorithm::KMeans';
    my %args = @_;
    my $input_parameter_file = $args{input_parameter_file};
    my $output_file = $args{output_datafile};
    my $N = $args{number_data_points_per_cluster};
    my @all_params;
    my $param_string;
    if (defined $input_parameter_file) {
        open my $INPUT, '<', $input_parameter_file
            or croak "unable to open parameter file: $!";
        @all_params = <$INPUT>;
        close $INPUT;
        @all_params = grep { $_ !~ /^[ ]*#/ } @all_params;   # drop comment lines
        chomp @all_params;
        $param_string = join ' ', @all_params;
    } else {
        # Just for testing. Used in t/test.t
        $param_string = "cluster 5 0 0 1 0 0 0 1 0 0 0 1 " .
                        "cluster 0 5 0 1 0 0 0 1 0 0 0 1 " .
                        "cluster 0 0 5 1 0 0 0 1 0 0 0 1";
    }
    my @cluster_strings = split /[ ]*cluster[ ]*/, $param_string;
    @cluster_strings = grep $_, @cluster_strings;
    my $K = @cluster_strings;
    croak "Too many clusters requested" if $K > 12;
    my @point_labels = ('a'..'z');
    print "Number of Gaussians used for the synthetic data: $K\n";
    my @means;
    my @covariances;
    my $data_dimension;
    foreach my $i (0..$K-1) {
        # NOTE(review): $num_strings[0] is treated as the blank-separated
        # mean and $num_strings[1] as the covariance entries -- confirm this
        # split matches the layout of examples/param.txt.
        my @num_strings = split / /, $cluster_strings[$i];
        my @cluster_mean = map {/$_num_regex/;$_} split / /, $num_strings[0];
        $data_dimension = @cluster_mean;
        push @means, \@cluster_mean;
        my @covariance_nums = map {/$_num_regex/;$_} split / /, $num_strings[1];
        croak "dimensionality error" if @covariance_nums !=
                                        ($data_dimension ** 2);
        my $cluster_covariance;
        foreach my $j (0..$data_dimension-1) {
            foreach my $k (0..$data_dimension-1) {
                $cluster_covariance->[$j]->[$k] =
                         $covariance_nums[$j*$data_dimension + $k];
            }
        }
        push @covariances, $cluster_covariance;
    }
    random_seed_from_phrase( 'hellojello' );   # deterministic output across runs
    my @data_dump;
    foreach my $i (0..$K-1) {
        my @m = @{shift @means};
        my @covar = @{shift @covariances};
        my @new_data = Math::Random::random_multivariate_normal( $N, @m, @covar );
        my $point_counter = 0;
        my $label = $point_labels[$i];
        # Prefix each point with a symbolic name such as a0, a1, ... :
        @new_data = map { unshift @$_, $label . $point_counter++; $_ } @new_data;
        push @data_dump, @new_data;
    }
    fisher_yates_shuffle( \@data_dump );
    open my $OUTPUT, '>', $output_file
        or die "Unable to open output file $output_file: $!\n";
    foreach my $ele (@data_dump) {
        print {$OUTPUT} "$_ " for @$ele;
        print {$OUTPUT} "\n";
    }
    print "Data written out to file $output_file\n";
    close $OUTPUT or die "Could not close $output_file: $!\n";
}
1455              
# Component-wise sum of the (normalized) coordinate vectors of the data
# elements whose symbolic names appear in the given arrayref.  Returns a
# reference to the resulting sum vector, initialized to zeros of length
# $self->{_data_dimensions}.
sub add_point_coords {
    my ($self, $ids_ref) = @_;
    my @sum = (0.0) x $self->{_data_dimensions};
    foreach my $tag (@$ids_ref) {
        my $point = $self->{_data}->{$tag};
        $sum[$_] += $point->[$_] for 0..$#{$point};
    }
    return \@sum;
}
1474              
# Same as add_point_coords(), but sums the *original* (unnormalized)
# coordinates stored in $self->{_original_data}.  Returns a reference to the
# component-wise sum vector.
sub add_point_coords_from_original_data {
    my ($self, $ids_ref) = @_;
    my @sum = (0.0) x $self->{_data_dimensions};
    foreach my $tag (@$ids_ref) {
        my $point = $self->{_original_data}->{$tag};
        $sum[$_] += $point->[$_] for 0..$#{$point};
    }
    return \@sum;
}
1493              
1494             ################################### Support Routines ########################################
1495              
# Return the position of the first element numerically equal to $value in
# the arrayref, or -1 when no element matches.
sub get_index_at_value {
    my ($value, $aref) = @_;
    my $pos = 0;
    for my $elem (@$aref) {
        return $pos if $value == $elem;
        $pos++;
    }
    return -1;
}
1504              
1505             # This routine is really not necessary in light of the new `~~' operator in Perl.
1506             # Will use the new operator in the next version.
# Numeric element-wise equality predicate for two arrayref vectors.
# Dies when the lengths differ; returns 1 when all components match, else 0.
sub vector_equal {
    my ($vec1, $vec2) = @_;
    die "wrong data types for vector-equal predicate\n" if @$vec1 != @$vec2;
    my $idx = 0;
    for my $component (@$vec1) {
        return 0 unless $component == $vec2->[$idx++];
    }
    return 1;
}
1516              
1517             # Returns the minimum value and its positional index in an array
# Return the smallest value in the arrayref together with its positional
# index, as the list ($min, $index).  An empty array yields (undef, undef).
sub minimum {
    my $arr = shift;
    my ($min, $index);
    my $pos = 0;
    for my $val (@$arr) {
        ($min, $index) = ($val, $pos) if !defined($min) || $val < $min;
        $pos++;
    }
    return ($min, $index);
}
1530              
# Return the smallest and largest defined values in the arrayref as the
# list ($min, $max).  Undefined entries are skipped; if no entry is defined
# the result is (undef, undef).
sub minmax {
    my $arr = shift;
    my ($min, $max);
    for my $val (@$arr) {
        next unless defined $val;
        if (!defined $min) {
            # First defined value seeds both bounds.
            ($min, $max) = ($val, $val);
        } elsif ($val < $min) {
            $min = $val;
        } elsif ($val > $max) {
            $max = $val;
        }
    }
    return ($min, $max);
}
1548              
1549             # Meant only for constructing a deep copy of an array of arrays:
# Deep copy of an array of arrays (of plain scalars).  Mirrors the
# element-by-element construction so that, like the original, an empty
# top-level array yields undef rather than an empty arrayref.
sub deep_copy_AoA {
    my $ref_in = shift;
    my $ref_out;
    for my $row (0..$#{$ref_in}) {
        for my $col (0..$#{$ref_in->[$row]}) {
            $ref_out->[$row][$col] = $ref_in->[$row][$col];
        }
    }
    return $ref_out;
}
1560              
1561             # Meant only for constructing a deep copy of an array of arrays for the case when
1562             # some elements of the top-level array may be undefined:
# Deep copy of an array of arrays in which some top-level slots may be
# undef; undef slots are preserved as undef in the copy.
sub deep_copy_AoA_with_nulls {
    my $ref_in = shift;
    my $ref_out;
    for my $row (0..$#{$ref_in}) {
        if (!defined $ref_in->[$row]) {
            $ref_out->[$row] = undef;
            next;
        }
        for my $col (0..$#{$ref_in->[$row]}) {
            $ref_out->[$row][$col] = $ref_in->[$row][$col];
        }
    }
    return $ref_out;
}
1577              
1578             # Meant only for constructing a deep copy of a hash in which each value is an
1579             # anonymous array of numbers:
# Deep copy of a hash whose values are arrayrefs of numbers; each value
# array is duplicated via deep_copy_array().
sub deep_copy_hash {
    my $ref_in = shift;
    my $ref_out;
    foreach my $label (keys %$ref_in) {
        $ref_out->{$label} = deep_copy_array( $ref_in->{$label} );
    }
    return $ref_out;
}
1588              
1589             # Meant only for an array of numbers:
# Copy of a flat array of numbers.  An empty input yields undef, matching
# the element-by-element construction of the original.
sub deep_copy_array {
    my $ref_in = shift;
    my $ref_out;
    $ref_out->[$_] = $ref_in->[$_] for 0..$#{$ref_in};
    return $ref_out;
}
1598              
# Print each cluster's size and its center, where the center is the mean of
# the original (unnormalized) coordinates of the cluster's members, shown
# with four decimal places.
sub display_cluster_centers {
    my ($self, $clusters_ref) = @_;
    my $cluster_index = 0;
    foreach my $cluster (@$clusters_ref) {
        my $cluster_size = scalar @$cluster;
        my $coord_sums = $self->add_point_coords_from_original_data( $cluster );
        my @center = map { $_ / $cluster_size } @$coord_sums;
        print "\ncluster $cluster_index ($cluster_size records):\n";
        print "cluster center $cluster_index: " .
              "@{[map {sprintf('%.4f', $_)} @center]}\n";
        $cluster_index++;
    }
}
1614              
1615             # For displaying the individual clusters on a terminal screen. Each cluster is
1616             # displayed through the symbolic names associated with the data points.
# Print the members of each cluster via their symbolic names.  Note that
# each cluster's array is sorted in place as a side effect, just as in the
# rest of the module.
sub display_clusters {
    my $clusters_ref = shift;
    my $cluster_index = 0;
    foreach my $cluster (@$clusters_ref) {
        @$cluster = sort @$cluster;
        my $size = scalar @$cluster;
        print "\n\nCluster $cluster_index ($size records):\n";
        print " $_" for @$cluster;
        $cluster_index++;
    }
    print "\n\n";
}
1631              
1632             # from perl docs:
# In-place Fisher-Yates shuffle of the arrayref (from the Perl FAQ).
# Same sequence of rand() draws as the original while-loop form.
sub fisher_yates_shuffle {
    my $arr = shift;
    for (my $pos = @$arr - 1; $pos > 0; $pos--) {
        my $swap = int rand( $pos + 1 );
        ($arr->[$pos], $arr->[$swap]) = ($arr->[$swap], $arr->[$pos]);
    }
}
1641              
# Normalize each coordinate of every data point by the standard deviation of
# that coordinate over the whole data set.  Takes a hashref mapping record
# labels to coordinate arrayrefs and returns a new hashref of the same
# shape.  (As in the original, a zero-variance axis will cause a
# division-by-zero error.)
#
# Fixes relative to the original: the dead `my $new = ...` statement that
# recomputed every value and discarded it is gone, sqrt() is now taken once
# per axis instead of once per element, and the unused $dimensions local
# has been removed.
sub variance_normalization {
    my %data_hash = %{shift @_};
    my @all_data_points = values %data_hash;
    # Regroup the data column-wise: one array per coordinate axis.
    my @data_projections;
    foreach my $data_point (@all_data_points) {
        my $i = 0;
        foreach my $proj (@$data_point) {
            push @{$data_projections[$i++]}, $proj;
        }
    }
    # Standard deviation along each axis:
    my @std_devs;
    foreach my $axis_data (@data_projections) {
        my ($mean, $variance) = mean_and_variance( $axis_data );
        push @std_devs, sqrt $variance;
    }
    my %new_data_hash;
    while (my ($label, $data) = each %data_hash) {
        my @new_data = map { $data->[$_] / $std_devs[$_] } 0..$#{$data};
        $new_data_hash{$label} = \@new_data;
    }
    return \%new_data_hash;
}
1669              
# Single-pass computation of the mean and the *population* variance of an
# array of numbers, using the incremental recurrences
#     mean_i = ((i-1)/i) * mean_{i-1} + x_i / i
#     var_i  = ((i-1)/i) * var_{i-1}  + (x_i - mean_i)^2 / (i-1)
# Note that the variance update deliberately uses the already-updated mean;
# that ordering is what makes the recurrence yield the population variance
# (e.g. data (1,3) gives variance 1, not the sample variance 2).
# Returns the list ($mean, $variance).  A one-element array gives variance
# 0; an empty array returns (undef, undef).
sub mean_and_variance {
    my @data = @{shift @_};
    my ($mean, $variance);
    foreach my $i (1..@data) {
        if ($i == 1) {
            # Seed with the first sample; variance of one sample is 0.
            $mean = $data[0];
            $variance = 0;
        } else {
            # Order matters: $mean must be updated before it is used in the
            # variance update below.
            $mean = ( (($i-1)/$i) * $mean ) + $data[$i-1] / $i;
            $variance = ( (($i-1)/$i) * $variance ) + ($data[$i-1]-$mean)**2 / ($i-1);
        }
    }
    return ($mean, $variance);
}
1684              
# Verify that every constructor keyword is one of the legal parameter
# names.  Returns 1 when all supplied parameters are legal (including the
# no-parameters case) and 0 as soon as an unrecognized parameter is seen.
#
# Fix: the original returned undef for an empty parameter list, which the
# caller's `== 0` test misread as an illegal-parameter error (with an
# uninitialized-value warning); the convoluted nested flag loop is replaced
# by a lookup hash.
sub check_for_illegal_params {
    my @params = @_;
    my @legal_params = qw / datafile
                            mask
                            K
                            Kmin
                            Kmax
                            terminal_output
                            write_clusters_to_files
                            do_variance_normalization
                            cluster_seeding
                            use_mahalanobis_metric
                            debug
                          /;
    my %is_legal = map { $_ => 1 } @legal_params;
    foreach my $param (@params) {
        return 0 unless $is_legal{$param};
    }
    return 1;
}
1712              
# Build a hash mapping each positive value in the arrayref to its index;
# non-positive values are skipped, and a repeated value keeps its last
# index.
sub get_value_index_hash {
    my $arr = shift;
    my %hash;
    my $idx = 0;
    for my $val (@$arr) {
        $hash{$val} = $idx if $val > 0;
        $idx++;
    }
    return \%hash;
}
1721              
# 1-D non-maximum suppression over a histogram-like array.  Returns a new
# array of the same size that is zero everywhere except at the center of
# each run of identical values that is a strict local maximum relative to
# its neighboring runs; at such a center the original value is kept.
# NOTE(review): this assumes the input decomposes into at least two runs --
# on a constant array $array_of_runs[-2] and $array_of_runs[1] would wrap
# or be absent; confirm callers never pass one.  A fractional $run_center
# (odd-length runs) is truncated by Perl's array indexing.
sub non_maximum_suppression {
    my $arr = shift;
    my @output = (0) x @$arr;
    my @final_output = (0) x @$arr;   # NOTE(review): never written after this -- apparently vestigial
    my %hash;                         # NOTE(review): unused -- apparently vestigial
    # Decompose the input into runs of equal consecutive values:
    my @array_of_runs = ([$arr->[0]]);
    foreach my $index (1..@$arr-1) {
        if ($arr->[$index] == $arr->[$index-1]) {
            push @{$array_of_runs[-1]}, $arr->[$index];
        } else {
            push @array_of_runs, [$arr->[$index]];
        }
    }
    # Interior runs: keep the value at the center of every run strictly
    # greater than both neighboring runs.  $runstart_index tracks the
    # starting offset of the current run in the flat array.
    my $runstart_index = 0;
    foreach my $run_index (1..@array_of_runs-2) {
        $runstart_index += @{$array_of_runs[$run_index-1]};
        if ($array_of_runs[$run_index]->[0] >
            $array_of_runs[$run_index-1]->[0] &&
            $array_of_runs[$run_index]->[0] >
            $array_of_runs[$run_index+1]->[0]) {
            my $run_center = @{$array_of_runs[$run_index]} / 2;
            my $assignment_index = $runstart_index + $run_center;
            $output[$assignment_index] = $arr->[$assignment_index];
        }
    }
    # Boundary runs only have one interior neighbor to beat:
    if ($array_of_runs[-1]->[0] > $array_of_runs[-2]->[0]) {
        $runstart_index += @{$array_of_runs[-2]};
        my $run_center = @{$array_of_runs[-1]} / 2;
        my $assignment_index = $runstart_index + $run_center;
        $output[$assignment_index] = $arr->[$assignment_index];
    }
    if ($array_of_runs[0]->[0] > $array_of_runs[1]->[0]) {
        my $run_center = @{$array_of_runs[0]} / 2;
        $output[$run_center] = $arr->[$run_center];
    }
    return \@output;
}
1759              
# Print a Math::GSL::Matrix row by row, preceded by its dimensions.
sub display_matrix {
    my $matrix = shift;
    my ($nrows, $ncols) = ($matrix->rows(), $matrix->cols());
    print "\n\nDisplaying matrix of size $nrows rows and $ncols columns:\n";
    foreach my $row_index (0..$nrows-1) {
        my @row_values = $matrix->row($row_index)->as_list;
        print "@row_values\n";
    }
    print "\n\n";
}
1772              
# Return a new Math::GSL::Matrix that is the transpose of the argument:
# row i of the input becomes column i of the result.
sub transpose {
    my $matrix = shift;
    my ($num_rows, $num_cols) = ($matrix->rows(), $matrix->cols());
    my $transposed = Math::GSL::Matrix->new($num_cols, $num_rows);
    foreach my $row_index (0..$num_rows-1) {
        $transposed->set_col( $row_index, [ $matrix->row($row_index)->as_list ] );
    }
    return $transposed;
}
1784              
# Element-wise difference of two equal-length arrayref vectors, returned as
# a list.  Dies when the lengths differ.
sub vector_subtract {
    my ($vec1, $vec2) = @_;
    die "wrong data types for vector subtract calculation\n" if @$vec1 != @$vec2;
    return map { $vec1->[$_] - $vec2->[$_] } 0..$#{$vec1};
}
1795              
# Dot product of two equal-length arrayref vectors.  Dies when the lengths
# differ.
sub vector_multiply {
    my ($vec1, $vec2) = @_;
    die "vec_multiply called with two vectors of different sizes"
        unless @$vec1 == @$vec2;
    my $dot = 0;
    $dot += $vec1->[$_] * $vec2->[$_] for 0..$#{$vec1};
    return $dot;
}
1807              
# Multiply two Math::GSL::Matrix objects.  As a special case, a 1xN matrix
# times an Nx1 matrix returns the plain scalar dot product rather than a
# 1x1 matrix; the general branch below relies on that special case via the
# recursive row-times-column call.
# NOTE(review): the dimension check demands nrows1 == ncols2 in addition to
# ncols1 == nrows2, i.e. the product must be square -- stricter than general
# matrix multiplication (which needs only ncols1 == nrows2).  The product
# is accordingly allocated as nrows1 x nrows1.  Confirm callers in this
# module only ever need square products before generalizing.
sub matrix_multiply {
    my $matrix1 = shift;
    my $matrix2 = shift;
    my ($nrows1, $ncols1) = ($matrix1->rows(), $matrix1->cols());
    my ($nrows2, $ncols2) = ($matrix2->rows(), $matrix2->cols());
    die "matrix multiplication called with non-matching matrix arguments"
        unless $nrows1 == $ncols2 && $ncols1 == $nrows2;
    # Scalar special case: row vector times column vector.
    if ($nrows1 == 1) {
        my @row = $matrix1->row(0)->as_list;
        my @col = $matrix2->col(0)->as_list;
        my $result;
        foreach my $j (0..$ncols1-1) {
            $result += $row[$j] * $col[$j];
        }
        return $result;
    }
    my $product = Math::GSL::Matrix->new($nrows1, $nrows1);
    foreach my $i (0..$nrows1-1) {
        my $row = $matrix1->row($i);
        my @product_row;
        foreach my $j (0..$ncols2-1) {
            my $col = $matrix2->col($j);
            # Each entry comes back through the scalar special case above.
            my $row_times_col = matrix_multiply($row, $col);
            push @product_row, $row_times_col;
        }
        $product->set_row($i, \@product_row);
    }
    return $product;
}
1837              
1838             1;
1839              
1840             =pod
1841              
1842             =head1 NAME
1843              
1844             Algorithm::KMeans - for clustering multidimensional data
1845              
1846             =head1 SYNOPSIS
1847              
1848             # You now have four different choices for clustering your data with this module:
1849             #
1850             # 1) With Euclidean distances and with random cluster seeding
1851             #
1852             # 2) With Mahalanobis distances and with random cluster seeding
1853             #
1854             # 3) With Euclidean distances and with smart cluster seeding
1855             #
1856             # 4) With Mahalanobis distances and with smart cluster seeding
1857             #
1858             # Despite the qualifier 'smart' in 'smart cluster seeding', it may not always
1859             # produce results that are superior to those obtained with random seeding. (If you
1860             # also factor in the choice regarding variance normalization, you actually have
1861             # eight different choices for data clustering with this module.)
1862             #
1863             # In all cases, you'd obviously begin with
1864              
1865             use Algorithm::KMeans;
1866              
1867             # You'd then name the data file:
1868              
1869             my $datafile = "mydatafile.csv";
1870              
1871             # Next, set the mask to indicate which columns of the datafile to use for
1872             # clustering and which column contains a symbolic ID for each data record. For
1873             # example, if the symbolic name is in the first column, you want the second column
1874             # to be ignored, and you want the next three columns to be used for 3D clustering,
1875             # you'd set the mask to:
1876              
1877             my $mask = "N0111";
1878              
1879             # Now construct an instance of the clusterer. The parameter K controls the number
1880             # of clusters. If you know how many clusters you want (let's say 3), call
1881              
1882             my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
1883             mask => $mask,
1884             K => 3,
1885             cluster_seeding => 'random',
1886             terminal_output => 1,
1887             write_clusters_to_files => 1,
1888             );
1889              
1890             # By default, this constructor call will set you up for clustering based on
1891             # Euclidean distances. If you want the module to use Mahalanobis distances, your
1892             # constructor call will look like:
1893              
1894             my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
1895             mask => $mask,
1896             K => 3,
1897             cluster_seeding => 'random',
1898             use_mahalanobis_metric => 1,
1899             terminal_output => 1,
1900             write_clusters_to_files => 1,
1901             );
1902              
1903             # For both constructor calls shown above, you can use smart seeding of the clusters
1904             # by changing 'random' to 'smart' for the cluster_seeding option. See the
1905             # explanation of smart seeding in the Methods section of this documentation.
1906              
1907             # If your data is such that its variability along the different dimensions of the
1908             # data space is significantly different, you may get better clustering if you first
1909             # normalize your data by setting the constructor parameter
1910             # do_variance_normalization as shown below:
1911              
1912             my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
1913             mask => $mask,
1914             K => 3,
1915             cluster_seeding => 'smart', # or 'random'
1916             terminal_output => 1,
1917             do_variance_normalization => 1,
1918             write_clusters_to_files => 1,
1919             );
1920              
1921             # But bear in mind that such data normalization may actually decrease the
1922             # performance of the clusterer if the variability in the data is more a result of
1923             # the separation between the means than a consequence of intra-cluster variance.
1924              
1925             # Set K to 0 if you want the module to figure out the optimum number of clusters
1926             # from the data. (It is best to run this option with the terminal_output set to 1
1927             # so that you can see the different value of QoC for the different K):
1928              
1929             my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
1930             mask => $mask,
1931             K => 0,
1932             cluster_seeding => 'random', # or 'smart'
1933             terminal_output => 1,
1934             write_clusters_to_files => 1,
1935             );
1936              
1937             # Although not shown above, you can obviously set the 'do_variance_normalization'
1938             # flag here also if you wish.
1939              
1940             # For very large data files, setting K to 0 will result in searching through too
1941             # many values for K. For such cases, you can range limit the values of K to search
1942             # through by
1943              
1944             my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
1945             mask => "N111",
1946             Kmin => 3,
1947             Kmax => 10,
1948             cluster_seeding => 'random', # or 'smart'
1949             terminal_output => 1,
1950             write_clusters_to_files => 1,
1951             );
1952              
1953             # FOR ALL CASES ABOVE, YOU'D NEED TO MAKE THE FOLLOWING CALLS ON THE CLUSTERER
1954             # INSTANCE TO ACTUALLY CLUSTER THE DATA:
1955              
1956             $clusterer->read_data_from_file();
1957             $clusterer->kmeans();
1958              
1959             # If you want to directly access the clusters and the cluster centers in your own
1960             # top-level script, replace the above two statements with:
1961              
1962             $clusterer->read_data_from_file();
1963             my ($clusters_hash, $cluster_centers_hash) = $clusterer->kmeans();
1964              
1965             # You can subsequently access the clusters directly in your own code, as in:
1966              
1967             foreach my $cluster_id (sort keys %{$clusters_hash}) {
1968             print "\n$cluster_id => @{$clusters_hash->{$cluster_id}}\n";
1969             }
1970             foreach my $cluster_id (sort keys %{$cluster_centers_hash}) {
1971             print "\n$cluster_id => @{$cluster_centers_hash->{$cluster_id}}\n";
1972             }
1973              
1974              
1975             # CLUSTER VISUALIZATION:
1976              
1977             # You must first set the mask for cluster visualization. This mask tells the module
1978             # which 2D or 3D subspace of the original data space you wish to visualize the
1979             # clusters in:
1980              
1981             my $visualization_mask = "111";
1982             $clusterer->visualize_clusters($visualization_mask);
1983              
1984              
1985             # SYNTHETIC DATA GENERATION:
1986              
1987             # The module has been provided with a class method for generating multivariate data
1988             # for experimenting with clustering. The data generation is controlled by the
1989             # contents of the parameter file that is supplied as an argument to the data
1990             # generator method. The mean and covariance matrix entries in the parameter file
1991             # must be according to the syntax shown in the param.txt file in the examples
1992             # directory. It is best to edit this file as needed:
1993              
1994             my $parameter_file = "param.txt";
1995             my $out_datafile = "mydatafile.dat";
1996             Algorithm::KMeans->cluster_data_generator(
1997             input_parameter_file => $parameter_file,
1998             output_datafile => $out_datafile,
1999             number_data_points_per_cluster => $N );
2000              
2001             =head1 CHANGES
2002              
2003             Version 2.05 removes the restriction on the version of Perl that is required. This
2004             is based on Srezic's recommendation. He had no problem building and testing the
2005             previous version with Perl 5.8.9. Version 2.05 also includes a small augmentation of
2006             the code in the method C<read_data_from_file_csv()> for guarding against user errors
2007             in the specification of the mask that tells the module which columns of the data file
2008             are to be used for clustering.
2009              
2010             Version 2.04 allows you to use CSV data files for clustering.
2011              
2012             Version 2.03 incorporates minor code cleanup. The main implementation of the module
2013             remains unchanged.
2014              
2015             Version 2.02 downshifts the version of Perl that is required for this module. The
2016             module should work with versions 5.10 and higher of Perl. The implementation code
2017             for the module remains unchanged.
2018              
2019             Version 2.01 removes many errors in the documentation. The changes made to the module
2020             in Version 2.0 were not reflected properly in the documentation page for that
2021             version. The implementation code remains unchanged.
2022              
2023             Version 2.0 includes significant additional functionality: (1) You now have the
2024             option to cluster using the Mahalanobis distance metric (the default is the Euclidean
2025             metric); and (2) With the two C<which_cluster_for_new_data_element()> methods that have been added to the
2026             module, you can now determine the best cluster for a new data sample after you have
2027             created the clusters with the previously available data. Finding the best cluster
2028             for a new data sample can be done using either the Euclidean metric or the
2029             Mahalanobis metric.
2030              
2031             Version 1.40 includes a C<smart> option for seeding the clusters. This option,
2032             supplied through the constructor parameter C<cluster_seeding>, means that the
2033             clusterer will (1) Subject the data to principal components analysis in order to
2034             determine the maximum variance direction; (2) Project the data onto this direction;
2035             (3) Find peaks in a smoothed histogram of the projected points; and (4) Use the
2036             locations of the highest peaks as initial guesses for the cluster centers. If you
2037             don't want to use this option, set C<cluster_seeding> to C<random>. That should work
2038             as in the previous version of the module.
2039              
2040             Version 1.30 includes a bug fix for the case when the datafile contains empty lines,
2041             that is, lines with no data records. Another bug fix in Version 1.30 deals with the
2042             case when you want the module to figure out how many clusters to form (this is the
2043             C<K=0> option in the constructor call) and the number of data records is close to the
2044             minimum.
2045              
2046             Version 1.21 includes fixes to handle the possibility that, when clustering the data
2047             for a fixed number of clusters, a cluster may become empty during iterative
2048             calculation of cluster assignments of the data elements and the updating of the
2049             cluster centers. The code changes are in the C<assign_data_to_clusters()> and
2050             C<update_cluster_centers()> subroutines.
2051              
2052             Version 1.20 includes an option to normalize the data with respect to its variability
2053             along the different coordinates before clustering is carried out.
2054              
2055             Version 1.1.1 allows for range limiting the values of C<K> to search through. C<K>
2056             stands for the number of clusters to form. This version also declares the module
2057             dependencies in the C<Makefile.PL> file.
2058              
2059             Version 1.1 is an object-oriented version of the implementation presented in
2060             version 1.0. The current version should lend itself more easily to code extension.
2061             You could, for example, create your own class by subclassing from the class presented
2062             here and, in your subclass, use your own criteria for the similarity distance between
2063             the data points and for the QoC (Quality of Clustering) metric, and, possibly a
2064             different rule to stop the iterations. Version 1.1 also allows you to directly
2065             access the clusters formed and the cluster centers in your calling script.
2066              
2067              
2068             =head1 SPECIAL USAGE NOTE
2069              
2070             If you were directly accessing in your own scripts the clusters produced by the older
2071             versions of this module, you'd need to make changes to your code if you wish to use
2072             Version 2.0 or higher. Instead of returning arrays of clusters and cluster centers,
2073             Versions 2.0 and higher return hashes. This change was made necessary by the logic
2074             required for implementing the two new C<which_cluster_for_new_data_element()> methods that were introduced
2075             in Version 2.0. These methods return the best cluster for a new data sample from the
2076             clusters you created using the existing data. One of the C<which_cluster_for_new_data_element()> methods is
2077             based on the Euclidean metric for finding the cluster that is closest to the new data
2078             sample, and the other on the Mahalanobis metric. Another point of incompatibility
2079             with the previous versions is that you must now explicitly set the C<cluster_seeding>
2080             parameter in the call to the constructor to either C<random> or C<smart>. This
2081             parameter does not have a default associated with it starting with Version 2.0.
2082              
2083              
2084             =head1 DESCRIPTION
2085              
2086             Clustering with K-Means takes place iteratively and involves two steps: 1) assignment
2087             of data samples to clusters on the basis of how far the data samples are from the
2088             cluster centers; and 2) Recalculation of the cluster centers (and cluster covariances
2089             if you are using the Mahalanobis distance metric for clustering).
2090              
2091             Obviously, before the two-step approach can proceed, we need to initialize the
2092             cluster centers. How this initialization is carried out is important. The module
2093             gives you two very different ways for carrying out this initialization. One option,
2094             called the C<smart> option, consists of subjecting the data to principal components
2095             analysis to discover the direction of maximum variance in the data space. The data
2096             points are then projected on to this direction and a histogram constructed from the
2097             projections. Centers of the smoothed histogram are used to seed the clustering
2098             operation. The other option is to choose the cluster centers purely randomly. You
2099             get the first option if you set C<cluster_seeding> to C<smart> in the constructor,
2100             and you get the second option if you set it to C<random>.
2101              
2102             How to specify the number of clusters, C<K>, is one of the most vexing issues in any
2103             approach to clustering. In some cases, we can set C<K> on the basis of prior
2104             knowledge. But, more often than not, no such prior knowledge is available. When the
2105             programmer does not explicitly specify a value for C<K>, the approach taken in the
2106             current implementation is to try all possible values between 2 and some largest
2107             possible value that makes statistical sense. We then choose that value for C<K>
2108             which yields the best value for the QoC (Quality of Clustering) metric. It is
2109             generally believed that the largest value for C<K> should not exceed C<sqrt(N/2)>
2110             where C<N> is the number of data samples to be clustered.
2111              
2112             What to use for the QoC metric is obviously a critical issue unto itself. In the
2113             current implementation, the value of QoC is the ratio of the average radius of the
2114             clusters and the average distance between the cluster centers.
2115              
2116             Every iterative algorithm requires a stopping criterion. The criterion implemented
2117             here is that we stop iterations when there is no re-assignment of the data points
2118             during the assignment step.
2119              
2120             Ordinarily, the output produced by a K-Means clusterer will correspond to a local
2121             minimum for the QoC values, as opposed to a global minimum. The current
2122             implementation protects against that when the module constructor is called with the
2123             C<random> option for C<cluster_seeding> by trying different randomly selected initial
2124             cluster centers and then selecting the one that gives the best overall QoC value.
2125              
2126             A K-Means clusterer will generally produce good results if the overlap between the
2127             clusters is minimal and if each cluster exhibits variability that is uniform in all
2128             directions. When the data variability is different along the different directions in
2129             the data space, the results you obtain with a K-Means clusterer may be improved by
2130             first normalizing the data appropriately, as can be done in this module when you set
2131             the C<do_variance_normalization> option in the constructor. However, as pointed out
2132             elsewhere in this documentation, such normalization may actually decrease the
2133             performance of the clusterer if the overall data variability along any dimension is
2134             more a result of separation between the means than a consequence of intra-cluster
2135             variability.
2136              
2137              
2138             =head1 METHODS
2139              
2140             The module provides the following methods for clustering, for cluster visualization,
2141             for data visualization, for the generation of data for testing a clustering
2142             algorithm, and for determining the cluster membership of a new data sample:
2143              
2144             =over 4
2145              
2146             =item B<new()>
2147              
2148             my $clusterer = Algorithm::KMeans->new(datafile => $datafile,
2149             mask => $mask,
2150             K => $K,
2151             cluster_seeding => 'random', # also try 'smart'
2152             use_mahalanobis_metric => 1, # also try '0'
2153             terminal_output => 1,
2154             write_clusters_to_files => 1,
2155             );
2156              
2157             A call to C constructs a new instance of the C class. When
2158             C<$K> is a non-zero positive integer, the module will construct exactly that many
2159             clusters. However, when C<$K> is 0, the module will find the best number of clusters
2160             to partition the data into. As explained in the Description, setting
2161             C<cluster_seeding> to C<smart> causes PCA (principal components analysis) to be used
2162             for discovering the best choices for the initial cluster centers. If you want purely
2163             random decisions to be made for the initial choices for the cluster centers, set
2164             C<cluster_seeding> to C<random>.
2165              
2166             The data file is expected to contain entries in the following format
2167              
2168             c20 0 10.7087017086940 9.63528386251712 10.9512155258108 ...
2169             c7 0 12.8025925026787 10.6126270065785 10.5228482095349 ...
2170             b9 0 7.60118206283120 5.05889245193079 5.82841781759102 ...
2171             ....
2172             ....
2173              
2174             where the first column contains the symbolic ID tag for each data record and the rest
2175             of the columns the numerical information. As to which columns are actually used for
2176             clustering is decided by the string value of the mask. For example, if we wanted to
2177             cluster on the basis of the entries in just the 3rd, the 4th, and the 5th columns
2178             above, the mask value would be C<N0111> where the character C<N> indicates that the
2179             ID tag is in the first column, the character C<0> that the second column is to be
2180             ignored, and the C<1>'s that follow that the 3rd, the 4th, and the 5th columns are to
2181             be used for clustering.
2182              
2183             If you wish for the clusterer to search through a C<(Kmin,Kmax)> range of values for
2184             C<K>, the constructor should be called in the following fashion:
2185              
2186             my $clusterer = Algorithm::KMeans->new(datafile => $datafile,
2187             mask => $mask,
2188             Kmin => 3,
2189             Kmax => 10,
2190             cluster_seeding => 'smart', # try 'random' also
2191             terminal_output => 1,
2192             );
2193              
2194             where obviously you can choose any reasonable values for C<Kmin> and C<Kmax>. If you
2195             choose a value for C<Kmax> that is statistically too large, the module will let you
2196             know. Again, you may choose C<smart> for C<cluster_seeding>, the default value being
2197             C<random>.
2198              
2199             If you believe that the variability of the data is very different along the different
2200             dimensions of the data space, you may get better clustering by first normalizing the
2201             data coordinates by the standard-deviations along those directions. When you set the
2202             constructor option C<do_variance_normalization> as shown below, the module uses the
2203             overall data standard-deviation along a direction for the normalization in that
2204             direction. (As mentioned elsewhere in the documentation, such a normalization could
2205             backfire on you if the data variability along a dimension is more a result of the
2206             separation between the means than a consequence of the intra-cluster variability.):
2207              
2208             my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
2209             mask => "N111",
2210             K => 2,
2211             cluster_seeding => 'smart', # try 'random' also
2212             terminal_output => 1,
2213             do_variance_normalization => 1,
2214             );
2215              
2216             =back
2217              
2218             =head2 Constructor Parameters
2219              
2220             =over 8
2221              
2222             =item C<datafile>:
2223              
2224             This parameter names the data file that contains the multidimensional data records
2225             you want the module to cluster.
2226              
2227             =item C<mask>:
2228              
2229             This parameter supplies the mask to be applied to the columns of your data file. See
2230             the explanation in Synopsis for what this mask looks like.
2231              
2232             =item C<K>:
2233              
2234             This parameter supplies the number of clusters you are looking for. If you set this
2235             option to 0, that means that you want the module to search for the best value for
2236             C<K>. (Keep in mind the fact that searching for the best C<K> may take a long time
2237             for large data files.)
2238              
2239             =item C<Kmin>:
2240              
2241             If you supply an integer value for C<Kmin>, the search for the best C<K> will begin
2242             with that value.
2243              
2244             =item C<Kmax>:
2245              
2246             If you supply an integer value for C<Kmax>, the search for the best C<K> will end at
2247             that value.
2248              
2249             =item C<cluster_seeding>:
2250              
2251             This parameter must be set to either C<random> or C<smart>. Depending on your data,
2252             you may get superior clustering with the C<random> option. The choice C<smart> means
2253             that the clusterer will (1) subject the data to principal components analysis to
2254             determine the maximum variance direction; (2) project the data onto this direction;
2255             (3) find peaks in a smoothed histogram of the projected points; and (4) use the
2256             locations of the highest peaks as seeds for cluster centers. If the C<smart> option
2257             produces bizarre results, try C<random>.
2258              
2259             =item C<use_mahalanobis_metric>:
2260              
2261             When set to 1, this option causes Mahalanobis distances to be used for clustering.
2262             The default is 0 for this parameter. By default, the module uses the Euclidean
2263             distances for clustering. In general, Mahalanobis distance based clustering will
2264             fail if your data resides on a lower-dimensional hyperplane in the data space, if you
2265             seek too many clusters, and if you do not have a sufficient number of samples in your
2266             data file. A necessary requirement for the module to be able to compute Mahalanobis
2267             distances is that the cluster covariance matrices be non-singular. (Let's say your
2268             data dimensionality is C<D> and the module is considering a cluster that has only
2269             C<d> samples in it where C<d> is less than C<D>. In this case, the covariance matrix
2270             will be singular since its rank will not exceed C<d>. For the covariance matrix to
2271             be non-singular, it must be of full rank, that is, its rank must be C<D>.)
2272              
2273             =item C<do_variance_normalization>:
2274              
2275             When set, the module will first normalize the data variance along the different
2276             dimensions of the data space before attempting clustering. Depending on your data,
2277             this option may or may not result in better clustering.
2278              
2279             =item C<terminal_output>:
2280              
2281             This boolean parameter, when not supplied in the call to C<new()>, defaults to 0.
2282             When set, you will see in your terminal window the different clusters as lists of the
2283             symbolic IDs and their cluster centers. You will also see the QoC (Quality of
2284             Clustering) values for the clusters displayed.
2285              
2286             =item C<write_clusters_to_files>:
2287              
2288             This parameter is also boolean. When set to 1, the clusters are written out to files
2289             that are named in the following manner:
2290              
2291             cluster0.txt
2292             cluster1.txt
2293             cluster2.txt
2294             ...
2295             ...
2296              
2297             Before the clusters are written to these files, the module destroys all files with
2298             such names in the directory in which you call the module.
2299              
2300              
2301             =back
2302              
2303             =over
2304              
2305             =item B<read_data_from_file()>
2306              
2307             $clusterer->read_data_from_file()
2308              
2309             =item B<kmeans()>
2310              
2311             $clusterer->kmeans();
2312              
2313             or
2314              
2315             my ($clusters_hash, $cluster_centers_hash) = $clusterer->kmeans();
2316              
2317             The first call above works solely by side-effect. The second call also returns the
2318             clusters and the cluster centers. See the C<cluster_and_visualize.pl> script in the
2319             C<examples> directory for how you can in your own code extract the clusters and the
2320             cluster centers from the variables C<$clusters_hash> and C<$cluster_centers_hash>.
2321              
2322             =item B<get_K_best()>
2323              
2324             $clusterer->get_K_best();
2325              
2326             This call makes sense only if you supply either the C<K=0> option to the constructor,
2327             or if you specify values for the C<Kmin> and C<Kmax> options. The C<K=0> and the
2328             C<(Kmin,Kmax)> options cause the module to determine the best value for C<K>.
2329             Remember, C<K> is the number of clusters the data is partitioned into.
2330              
2331             =item B<show_QoC_values()>
2332              
2333             $clusterer->show_QoC_values();
2334              
2335             presents a table with C<K> values in the left column and the corresponding QoC
2336             (Quality-of-Clustering) values in the right column. Note that this call makes sense
2337             only if you either supply the C<K=0> option to the constructor, or if you specify
2338             values for the C<Kmin> and C<Kmax> options.
2339              
2340             =item B<visualize_clusters()>
2341              
2342             $clusterer->visualize_clusters( $visualization_mask )
2343              
2344             The visualization mask here does not have to be identical to the one used for
2345             clustering, but must be a subset of that mask. This is convenient for visualizing
2346             the clusters in two- or three-dimensional subspaces of the original space.
2347              
2348             =item B<visualize_data()>
2349              
2350             $clusterer->visualize_data($visualization_mask, 'original');
2351              
2352             $clusterer->visualize_data($visualization_mask, 'normed');
2353              
2354             This method requires a second argument and, as shown, it must be either the string
2355             C<original> or the string C<normed>, the former for the visualization of the raw data
2356             and the latter for the visualization of the data after its different dimensions are
2357             normalized by the standard-deviations along those directions. If you call the method
2358             with the second argument set to C<normed>, but do so without turning on the
2359             C<do_variance_normalization> option in the KMeans constructor, it will let you know.
2360              
2361              
2362             =item B<which_cluster_for_new_data_element()>
2363              
2364             If you wish to determine the cluster membership of a new data sample after you have
2365             created the clusters with the existing data samples, you would need to call this
2366             method. The C<which_cluster_for_new_data.pl> script in the C<examples> directory
2367             shows how to use this method.
2368              
2369              
2370             =item B<which_cluster_for_new_data_element_mahalanobis()>
2371              
2372             This does the same thing as the previous method, except that it determines the
2373             cluster membership using the Mahalanobis distance metric. As for the previous
2374             method, see the C<which_cluster_for_new_data.pl> script in the C<examples> directory
2375             for how to use this method.
2376              
2377              
2378             =item B<cluster_data_generator()>
2379              
2380             Algorithm::KMeans->cluster_data_generator(
2381             input_parameter_file => $parameter_file,
2382             output_datafile => $out_datafile,
2383             number_data_points_per_cluster => 20 );
2384              
2385             for generating multivariate data for clustering if you wish to play with synthetic
2386             data for clustering. The input parameter file contains the means and the variances
2387             for the different Gaussians you wish to use for the synthetic data. See the file
2388             C<param.txt> provided in the examples directory. It will be easiest for you to just
2389             edit this file for your data generation needs. In addition to the format of the
2390             parameter file, the main constraint you need to observe in specifying the parameters
2391             is that the dimensionality of the covariance matrix must correspond to the
2392             dimensionality of the mean vectors. The multivariate random numbers are generated by
2393             calling the C<Math::Random> module. As you would expect, this module requires that
2394             the covariance matrices you specify in your parameter file be symmetric and positive
2395             definite. Should the covariances in your parameter file not obey this condition, the
2396             C<Math::Random> module will let you know.
2397              
2398             =back
2399              
2400             =head1 HOW THE CLUSTERS ARE OUTPUT
2401              
2402             When the option C<terminal_output> is set in the call to the constructor, the
2403             clusters are displayed on the terminal screen.
2404              
2405             When the option C<write_clusters_to_files> is set in the call to the constructor, the
2406             module dumps the clusters in files named
2407              
2408             cluster0.txt
2409             cluster1.txt
2410             cluster2.txt
2411             ...
2412             ...
2413              
2414             in the directory in which you execute the module. The number of such files will
2415             equal the number of clusters formed. All such existing files in the directory are
2416             destroyed before any fresh ones are created. Each cluster file contains the symbolic
2417             ID tags of the data samples in that cluster.
2418              
2419             The module also leaves in your directory a printable `.png' file that is a point plot
2420             of the different clusters. The name of this file is always C<cluster_plot.png>.
2421              
2422             Also, as mentioned previously, a call to C<kmeans()> in your own code will return the
2423             clusters and the cluster centers.
2424              
2425             =head1 REQUIRED
2426              
2427             This module requires the following three modules:
2428              
2429             Math::Random
2430             Graphics::GnuplotIF
2431             Math::GSL
2432              
2433             With regard to the third item above, what is actually required is the
2434             C<Math::GSL::Matrix> module. However, that module is a part of the C<Math::GSL>
2435             distribution. The acronym GSL stands for the GNU Scientific Library. C<Math::GSL> is
2436             a Perl interface to the GSL C-based library.
2437              
2438              
2439             =head1 THE C<examples> DIRECTORY
2440              
2441             The C<examples> directory contains several scripts to help you become familiar with
2442             this module. The following script is an example of how the module can be expected to
2443             be used most of the time. It calls for clustering to be carried out with a fixed
2444             C<K>:
2445              
2446             cluster_and_visualize.pl
2447              
2448             The more time you spend with this script, the more comfortable you will become with
2449             the use of this module. The script file contains a large comment block that mentions
2450             six locations in the script where you have to make decisions about how to use the
2451             module.
2452              
2453             See the following script if you do not know what value to use for C<K> for clustering
2454             your data:
2455              
2456             find_best_K_and_cluster.pl
2457              
2458             This script uses the C<K=0> option in the constructor that causes the module to
2459             search for the best C<K> for your data. Since this search is virtually unbounded ---
2460             limited only by the number of samples in your data file --- the script may take a
2461             long time to run for a large data file. Hence the next script.
2462              
2463             If your datafile is too large, you may need to range limit the values of C<K> that
2464             are searched through, as in the following script:
2465              
2466             find_best_K_in_range_and_cluster.pl
2467              
2468             If you also want to include data normalization (it may reduce the performance of the
2469             clusterer in some cases), see the following script:
2470              
2471             cluster_after_data_normalization.pl
2472              
2473             When you include the data normalization step and you would like to visualize the data
2474             before and after normalization, see the following script:
2475              
2476             cluster_and_visualize_with_data_visualization.pl
2477              
2478             After you are done clustering, let's say you want to find the cluster membership of a
2479             new data sample. To see how you can do that, see the script:
2480              
2481             which_cluster_for_new_data.pl
2482              
2483             This script returns two answers for which cluster a new data sample belongs to: one
2484             using the Euclidean metric to calculate the distances between the new data sample and
2485             the cluster centers, and the other using the Mahalanobis metric. If the clusters are
2486             strongly elliptical in shape, you are likely to get better results with the
2487             Mahalanobis metric. (To see that you can get two different answers using the two
2488             different distance metrics, run the C<which_cluster_for_new_data.pl> script on the
2489             data in the file C<mydatafile3.dat>. To make this run, note that you have to comment
2490             out and uncomment the lines at four different locations in the script.)
2491              
2492             The C<examples> directory also contains the following support scripts:
2493              
2494             For generating the data for experiments with clustering:
2495              
2496             data_generator.pl
2497              
2498             For cleaning up the examples directory:
2499              
2500             cleanup_directory.pl
2501              
2502             The examples directory also includes a parameter file, C<param.txt>, for generating
2503             synthetic data for clustering. Just edit this file if you would like to generate
2504             your own multivariate data for clustering. The parameter file is for the 3D case,
2505             but you can generate data with any dimensionality through appropriate entries in the
2506             parameter file.
2507              
2508             =head1 EXPORT
2509              
2510             None by design.
2511              
2512             =head1 CAVEATS
2513              
2514             K-Means based clustering usually does not work well when the clusters are strongly
2515             overlapping and when the extent of variability along the different dimensions is
2516             different for the different clusters. The module does give you the ability to
2517             normalize the variability in your data with the constructor option
2518             C<do_variance_normalization>. However, as described elsewhere, this may actually
2519             reduce the performance of the clusterer if the data variability along a direction is
2520             more a result of the separation between the means than because of intra-cluster
2521             variability. For better clustering with difficult-to-cluster data, you could try
2522             using the author's C<Algorithm::ExpectationMaximization> module.
2523              
2524             =head1 BUGS
2525              
2526             Please notify the author if you encounter any bugs. When sending email, please place
2527             the string 'KMeans' in the subject line.
2528              
2529             =head1 INSTALLATION
2530              
2531             Download the archive from CPAN in any directory of your choice. Unpack the archive
2532             with a command that on a Linux machine would look like:
2533              
2534             tar zxvf Algorithm-KMeans-2.05.tar.gz
2535              
2536             This will create an installation directory for you whose name will be
2537             C<Algorithm-KMeans-2.05>. Enter this directory and execute the following commands
2538             for a standard install of the module if you have root privileges:
2539              
2540             perl Makefile.PL
2541             make
2542             make test
2543             sudo make install
2544              
2545             If you do not have root privileges, you can carry out a non-standard install of the
2546             module in any directory of your choice by:
2547              
2548             perl Makefile.PL prefix=/some/other/directory/
2549             make
2550             make test
2551             make install
2552              
2553             With a non-standard install, you may also have to set your PERL5LIB environment
2554             variable so that this module can find the required other modules. How you do that
2555             would depend on what platform you are working on. In order to install this module in
2556             a Linux machine on which I use tcsh for the shell, I set the PERL5LIB environment
2557             variable by
2558              
2559             setenv PERL5LIB /some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
2560              
2561             If I used bash, I'd need to declare:
2562              
2563             export PERL5LIB=/some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
2564              
2565              
2566             =head1 THANKS
2567              
2568             I thank Slaven for pointing out that I needed to downshift the required version of Perl
2569             for this module. Fortunately, I had access to an old machine still running Perl
2570             5.10.1. The current version, 2.02, is based on my testing the module on that machine.
2571              
2572             I added two C<which_cluster_for_new_data()> methods in Version 2.0 as a result of an email from
2573             Jerome White who expressed a need for such methods in order to determine the best
2574             cluster for a new data record after you have successfully clustered your existing
2575             data. Thanks Jerome for your feedback!
2576              
2577             It was an email from Nadeem Bulsara that prompted me to create Version 1.40 of this
2578             module. Working with Version 1.30, Nadeem noticed that occasionally the module would
2579             produce variable clustering results on the same dataset. I believe that this
2580             variability was caused (at least partly) by the purely random mode that was used in
2581             Version 1.30 for the seeding of the cluster centers. Version 1.40 now includes a
2582             C<smart> mode. With the new mode the clusterer uses a PCA (Principal Components
2583             Analysis) of the data to make good guesses for the cluster centers. However,
2584             depending on how the data is jumbled up, it is possible that the new mode will not
2585             produce uniformly good results in all cases. So you can still use the old mode by
2586             setting C<cluster_seeding> to C<random> in the constructor. Thanks Nadeem for your
2587             feedback!
2588              
2589             Version 1.30 resulted from Martin Kalin reporting problems with a very small data
2590             set. Thanks Martin!
2591              
2592             Version 1.21 came about in response to the problems encountered by Luis Fernando
2593             D'Haro with version 1.20. Although the module would yield the clusters for some of
2594             its runs, more frequently than not the module would abort with an "empty cluster"
2595             message for his data. Luis Fernando has also suggested other improvements (such as
2596             clustering directly from the contents of a hash) that I intend to make in future
2597             versions of this module. Thanks Luis Fernando.
2598              
2599             Chad Aeschliman was kind enough to test out the interface of this module and to give
2600             suggestions for its improvement. His key slogan: "If you cannot figure out how to
2601             use a module in under 10 minutes, it's not going to be used." That should explain
2602             the longish Synopsis included here.
2603              
2604             =head1 AUTHOR
2605              
2606             Avinash Kak, kak@purdue.edu
2607              
2608             If you send email, please place the string "KMeans" in your subject line to get past
2609             my spam filter.
2610              
2611             =head1 COPYRIGHT
2612              
2613             This library is free software; you can redistribute it and/or modify it under the
2614             same terms as Perl itself.
2615              
2616             Copyright 2014 Avinash Kak
2617              
2618             =cut
2619