File Coverage

blib/lib/Algorithm/KMeans.pm
Criterion Covered Total %
statement 22 24 91.6
branch n/a
condition n/a
subroutine 8 8 100.0
pod n/a
total 30 32 93.7


line stmt bran cond sub pod time code
1             package Algorithm::KMeans;
2              
3             #------------------------------------------------------------------------------------
4             # Copyright (c) 2014 Avinash Kak. All rights reserved. This program is free
5             # software. You may modify and/or distribute it under the same terms as Perl itself.
6             # This copyright notice must remain attached to the file.
7             #
8             # Algorithm::KMeans is a Perl module for clustering multidimensional data.
9             # -----------------------------------------------------------------------------------
10              
11 1     1   15887 use 5.10.0;
  1         3  
  1         40  
12 1     1   3 use strict;
  1         1  
  1         55  
13 1     1   4 use warnings;
  1         5  
  1         37  
14 1     1   4 use Carp;
  1         1  
  1         74  
15 1     1   5 use File::Basename;
  1         1  
  1         66  
16 1     1   555 use Math::Random;
  1         4937  
  1         93  
17 1     1   536 use Graphics::GnuplotIF;
  1         8773  
  1         53  
18 1     1   966 use Math::GSL::Matrix;
  0            
  0            
19              
20              
21             our $VERSION = '2.03';
22              
23             # from Perl docs:
24             my $_num_regex = '^[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?$';
25              
26             # Constructor:
27             sub new {
28             my ($class, %args) = @_;
29             my @params = keys %args;
30             croak "\nYou have used a wrong name for a keyword argument " .
31             "--- perhaps a misspelling\n"
32             if check_for_illegal_params(@params) == 0;
33             bless {
34             _datafile => $args{datafile} || croak("datafile required"),
35             _mask => $args{mask} || croak("mask required"),
36             _K => $args{K} || 0,
37             _K_min => $args{Kmin} || 'unknown',
38             _K_max => $args{Kmax} || 'unknown',
39             _cluster_seeding => $args{cluster_seeding} || croak("must choose smart or random ".
40             "for cluster seeding"),
41             _var_normalize => $args{do_variance_normalization} || 0,
42             _use_mahalanobis_metric => $args{use_mahalanobis_metric} || 0,
43             _clusters_2_files => $args{write_clusters_to_files} || 0,
44             _terminal_output => $args{terminal_output} || 0,
45             _debug => $args{debug} || 0,
46             _N => 0,
47             _K_best => 'unknown',
48             _original_data => {},
49             _data => {},
50             _data_id_tags => [],
51             _QoC_values => {},
52             _clusters => [],
53             _cluster_centers => [],
54             _clusters_hash => {},
55             _cluster_centers_hash => {},
56             _cluster_covariances_hash => {},
57             _data_dimensions => 0,
58              
59             }, $class;
60             }
61              
62             sub read_data_from_file {
63             my $self = shift;
64             my $datafile = $self->{_datafile};
65             my $mask = $self->{_mask};
66             my @mask = split //, $mask;
67             $self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
68             print "data dimensionality: $self->{_data_dimensions} \n"if $self->{_terminal_output};
69             open INPUT, $datafile or die "unable to open file $datafile: $!\n";
70             chomp( my @raw_data = );
71             close INPUT;
72             # Transform strings into number data
73             foreach my $record (@raw_data) {
74             next unless $record;
75             next if $record =~ /^#/;
76             my @data_fields;
77             my @fields = split /\s+/, $record;
78             die "\nABORTED: Mask size does not correspond to row record size\n" if $#fields != $#mask;
79             my $record_id;
80             foreach my $i (0..@fields-1) {
81             if ($mask[$i] eq '0') {
82             next;
83             } elsif ($mask[$i] eq 'N') {
84             $record_id = $fields[$i];
85             } elsif ($mask[$i] eq '1') {
86             push @data_fields, $fields[$i];
87             } else {
88             die "misformed mask for reading the data file\n";
89             }
90             }
91             my @nums = map {/$_num_regex/;$_} @data_fields;
92             $self->{_original_data}->{ $record_id } = \@nums;
93             }
94             if ($self->{_var_normalize}) {
95             $self->{_data} = variance_normalization( $self->{_original_data} );
96             } else {
97             $self->{_data} = deep_copy_hash( $self->{_original_data} );
98             }
99             my @all_data_ids = keys %{$self->{_data}};
100             $self->{_data_id_tags} = \@all_data_ids;
101             $self->{_N} = scalar @all_data_ids;
102             if ( defined($self->{_K}) && ($self->{_K} > 0) ) {
103             carp "\n\nWARNING: YOUR K VALUE IS TOO LARGE.\n The number of data " .
104             "points must satisfy the relation N > 2xK**2 where K is " .
105             "the number of clusters requested for the clusters to be " .
106             "meaningful $!"
107             if ( $self->{_N} < (2 * $self->{_K} ** 2) );
108             print "\n\n\n";
109             }
110             srand(123456789);
111             }
112              
113             sub kmeans {
114             my $self = shift;
115             my $K = $self->{_K};
116             if ( ($K == 0) ||
117             ( ($self->{_K_min} ne 'unknown') &&
118             ($self->{_K_max} ne 'unknown') ) ) {
119             eval {
120             $self->iterate_through_K();
121             };
122             die "in kmeans(): $@" if ($@);
123             } elsif ( $K =~ /\d+/) {
124             eval {
125             $self->cluster_for_fixed_K_multiple_random_tries($K)
126             if $self->{_cluster_seeding} eq 'random';
127             $self->cluster_for_fixed_K_single_smart_try($K)
128             if $self->{_cluster_seeding} eq 'smart';
129             };
130             die "in kmeans(): $@" if ($@);
131             } else {
132             croak "Incorrect call syntax used. See documentation.\n";
133             }
134             if ((defined $self->{_clusters}) && (defined $self->{_cluster_centers})){
135             foreach my $i (0..@{$self->{_clusters}}-1) {
136             $self->{_clusters_hash}->{"cluster$i"} = $self->{_clusters}->[$i];
137             $self->{_cluster_centers_hash}->{"cluster$i"} = $self->{_cluster_centers}->[$i];
138             $self->{_cluster_covariances_hash}->{"cluster$i"} =
139             $self->estimate_cluster_covariance($self->{_clusters}->[$i]);
140             }
141             $self->write_clusters_to_files( $self->{_clusters} ) if $self->{_clusters_2_files};
142             return ($self->{_clusters_hash}, $self->{_cluster_centers_hash});
143             } else {
144             croak "Clustering failed. Try again.";
145             }
146             }
147              
148             # This is the subroutine to call if you prefer purely random choices for the initial
149             # seeding of the cluster centers.
150             sub cluster_for_fixed_K_multiple_random_tries {
151             my $self = shift;
152             my $K = shift;
153             my @all_data_ids = @{$self->{_data_id_tags}};
154             my $QoC;
155             my @QoC_values;
156             my @array_of_clusters;
157             my @array_of_cluster_centers;
158             my $clusters;
159             my $new_clusters;
160             my $cluster_centers;
161             my $new_cluster_centers;
162             print "Clustering for K = $K\n" if $self->{_terminal_output};
163             foreach my $trial (1..20) {
164             print ". ";
165             my ($new_clusters, $new_cluster_centers);
166             eval {
167             ($new_clusters, $new_cluster_centers) = $self->cluster_for_given_K($K);
168             };
169             next if $@;
170             next if @$new_clusters <= 1;
171             my $newQoC = $self->cluster_quality( $new_clusters, $new_cluster_centers );
172             if ( (!defined $QoC) || ($newQoC < $QoC) ) {
173             $QoC = $newQoC;
174             $clusters = deep_copy_AoA( $new_clusters );
175             $cluster_centers = deep_copy_AoA( $new_cluster_centers );
176             }
177             }
178             $self->{_clusters} = $clusters;
179             $self->{_cluster_centers} = $cluster_centers;
180             $self->{_QoC_values}->{"$K"} = $QoC;
181             if ($self->{_terminal_output}) {
182             print "\nDisplaying final clusters for best K (= $K) :\n";
183             display_clusters( $clusters );
184             $self->display_cluster_centers( $clusters );
185             print "\nQoC value (the smaller, the better): $QoC\n";
186             }
187             }
188              
189             # For the smart try, we do initial cluster seeding by subjecting the data to
190             # principal components analysis in order to discover the direction of maximum
191             # variance in the data space. Subsequently, we try to find the K largest peaks along
192             # this direction. The coordinates of these peaks serve as the seeds for the K
193             # clusters.
194             sub cluster_for_fixed_K_single_smart_try {
195             my $self = shift;
196             my $K = shift;
197             my @all_data_ids = @{$self->{_data_id_tags}};
198             print "Clustering for K = $K\n" if $self->{_terminal_output};
199             my ($clusters, $cluster_centers);
200             eval {
201             ($clusters, $cluster_centers) = $self->cluster_for_given_K($K);
202             };
203             if ($@) {
204             die "In cluster_for_fixed_K_single_smart_try: insufficient data for clustering " .
205             "with $self->{_K} clusters --- $@";
206             }
207             my $QoC = $self->cluster_quality( $clusters, $cluster_centers );
208             $self->{_clusters} = $clusters;
209             $self->{_cluster_centers} = $cluster_centers;
210             $self->{_QoC_values}->{"$K"} = $QoC;
211             if ($self->{_terminal_output}) {
212             print "\nDisplaying final clusters for best K (= $K) :\n";
213             display_clusters( $clusters );
214             $self->display_cluster_centers( $clusters );
215             print "\nQoC value (the smaller, the better): $QoC\n";
216             }
217             }
218              
219             # The following subroutine is the top-level routine to call if you want the system to
220             # figure out on its own what value to use for K, the number of clusters. If you call
221             # the KMeans constructor with the K=0 option, the method below will try every
222             # possible value of K between 2 and the maximum possible depending on the number of
223             # data points available. For example, if the number of data points is 10,000, it will
224             # try all possible values of K between 2 and 70. For how the maximum value is set for
225             # K, see the comments made under Description. Note also how this method makes 20
226             # different tries for each value of K as a defense against the problem of the final
227             # result corresponding to some local minimum in the values of the QoC metric. Out of
228             # these 20 tries for each K, it retains the clusters and the cluster centers for only
229             # that try that yields the smallest value for the QoC metric. After estimating the
230             # "best" QoC values for all possible K in this manner, it then finds the K for which
231             # the QoC is the minimum. This is taken to be the best value for K. Finally, the
232             # output clusters are written out to separate files.
233             #
234             # If the KMeans constructor is invoked with the (Kmin, Kmax) options, then, instead
235             # of iterating through 2 and the maximum permissible value for K, the iterations are
236             # carried out only between Kmin and Kmax.
237             sub iterate_through_K {
238             my $self = shift;
239             my @all_data_ids = @{$self->{_data_id_tags}};
240             my $N = $self->{_N};
241             croak "You need more than 8 data samples. The number of data points must satisfy " .
242             "the relation N > 2xK**2 where K is the number of clusters. The smallest " .
243             "value for K is 2.\n" if $N <= 8;
244             my $K_statistical_max = int( sqrt( $N / 2.0 ) );
245             my $Kmin = $self->{_K_min} eq 'unknown'
246             ? 2
247             : $self->{_K_min};
248             my $Kmax = $self->{_K_max} eq 'unknown'
249             ? int( sqrt( $N / 2.0 ) )
250             : $self->{_K_max};
251             croak "\n\nYour Kmax value is too high. Ideally, it should not exceed sqrt(N/2) " .
252             "where N is the number of data points" if $Kmax > $K_statistical_max;
253             print "Value of Kmax is: $Kmax\n" if $self->{_terminal_output};
254             my @QoC_values;
255             my @array_of_clusters;
256             my @array_of_cluster_centers;
257             foreach my $K ($Kmin..$Kmax) {
258             my $QoC;
259             my $clusters;
260             my $cluster_centers;
261             print "Clustering for K = $K\n" if $self->{_terminal_output};
262             if ($self->{_cluster_seeding} eq 'random') {
263             foreach my $trial (1..21) {
264             print ". ";
265             my ($new_clusters, $new_cluster_centers);
266             if ($self->{_use_mahalanobis_metric}) {
267             eval {
268             ($new_clusters, $new_cluster_centers) = $self->cluster_for_given_K($K);
269             };
270             next if $@;
271             } else {
272             ($new_clusters, $new_cluster_centers) = $self->cluster_for_given_K($K);
273             }
274             my $newQoC = $self->cluster_quality( $new_clusters, $new_cluster_centers );
275             if ( (!defined $QoC) || ($newQoC < $QoC) ) {
276             $QoC = $newQoC;
277             $clusters = deep_copy_AoA( $new_clusters );
278             $cluster_centers = deep_copy_AoA( $new_cluster_centers );
279             }
280             }
281             print "\n";
282             } elsif ($self->{_cluster_seeding} eq 'smart') {
283             eval {
284             ($clusters, $cluster_centers) = $self->cluster_for_given_K($K);
285             };
286             if ($@) {
287             $Kmax = $K - 1;
288             last;
289             }
290             $QoC = $self->cluster_quality($clusters,$cluster_centers);
291             } else {
292             die "You must either choose 'smart' for cluster_seeding or 'random'. " .
293             "Fix your constructor call."
294             }
295             push @QoC_values, $QoC;
296             push @array_of_clusters, $clusters;
297             push @array_of_cluster_centers, $cluster_centers;
298             }
299             my ($min, $max) = minmax( \@QoC_values );
300             my $K_best_relative_to_Kmin = get_index_at_value($min, \@QoC_values );
301             my $K_best = $K_best_relative_to_Kmin + $Kmin;
302             if ($self->{_terminal_output}) {
303             print "\nDisplaying final clusters for best K (= $K_best) :\n";
304             display_clusters( $array_of_clusters[$K_best_relative_to_Kmin] );
305             $self->display_cluster_centers($array_of_clusters[$K_best_relative_to_Kmin]);
306             print "\nBest clustering achieved for K=$K_best with QoC = $min\n" if defined $min;
307             my @printableQoC = grep {$_} @QoC_values;
308             print "\nQoC values array (the smaller the value, the better it is) for different " .
309             "K starting with K=$Kmin: @printableQoC\n";
310             }
311             $self->{_K_best} = $K_best;
312             foreach my $i (0..@QoC_values-1) {
313             my $k = $i + $Kmin;
314             $self->{_QoC_values}->{"$k"} = $QoC_values[$i];
315             }
316             $self->{_clusters} = $array_of_clusters[$K_best_relative_to_Kmin];
317             $self->{_cluster_centers} =
318             $array_of_cluster_centers[$K_best_relative_to_Kmin];
319             }
320              
321             # This is the function to call if you already know what value you want to use for K,
322             # the number of expected clusters. The purpose of this function is to do the
323             # initialization of the cluster centers and to carry out the initial assignment of
324             # the data to the clusters with the initial cluster centers. The initialization
325             # consists of 3 steps: Construct a random sequence of K integers between 0 and N-1
326             # where N is the number of data points to be clustered; 2) Call
327             # get_initial_cluster_centers() to index into the data array with the random integers
328             # to get a list of K data points that would serve as the initial cluster centers; and
329             # (3) Call assign_data_to_clusters_initial() to assign the rest of the data to each
330             # of the K clusters on the basis of the proximity to the cluster centers.
331             sub cluster_for_given_K {
332             my $self = shift;
333             my $K = shift;
334             my @all_data_ids = @{$self->{_data_id_tags}};
335             my $cluster_centers;
336             if ($self->{_cluster_seeding} eq 'smart') {
337             $cluster_centers = $self->get_initial_cluster_centers_smart($K);
338             } elsif ($self->{_cluster_seeding} eq 'random') {
339             $cluster_centers = $self->get_initial_cluster_centers($K);
340             } else {
341             die "You must either choose 'smart' for cluster_seeding or 'random'. " .
342             "Fix your constructor call."
343             }
344             my $clusters;
345             if ($self->{_use_mahalanobis_metric}) {
346             my $clusters_and_determinants =
347             $self->assign_data_to_clusters_initial_mahalanobis($cluster_centers);
348             $clusters = $clusters_and_determinants->[0];
349             my @determinants = @{$clusters_and_determinants->[1]};
350             my ($min,$max) = minmax(\@determinants);
351             die "In cluster_for_given_K(): min determinant of covariance matrix for at " .
352             "least one cluster is too small" if $min / $max < 0.001;
353             } else {
354             $clusters = $self->assign_data_to_clusters_initial($cluster_centers);
355             }
356             my $cluster_nonexistent_flag = 0;
357             foreach my $trial (0..2) {
358             if ($self->{_use_mahalanobis_metric}) {
359             ($clusters, $cluster_centers) = $self->assign_data_to_clusters_mahalanobis($clusters, $K);
360             } else {
361             ($clusters, $cluster_centers) = $self->assign_data_to_clusters( $clusters, $K );
362             }
363             my $num_of_clusters_returned = @$clusters;
364             foreach my $cluster (@$clusters) {
365             $cluster_nonexistent_flag = 1 if ((!defined $cluster) || (@$cluster == 0));
366             }
367             last unless $cluster_nonexistent_flag;
368             }
369             return ($clusters, $cluster_centers);
370             }
371              
372             # This function is used when you set the "cluster_seeding" option to 'random' in the
373             # constructor. Returns a set of K random integers. These serve as indices to reach
374             # into the data array. A data element whose index is one of the random numbers
375             # returned by this routine serves as an initial cluster center. Note the quality
376             # check it runs on the list of K random integers constructed. We first make sure
377             # that all K random integers are different. Subsequently, we carry out a quality
378             # assessment of the K random integers constructed. This quality measure consists of
379             # the ratio of the values spanned by the random integers to the value of N, the total
380             # number of data points to be clustered. Currently, if this ratio is less than 0.3,
381             # we discard the K integers and try again.
382             sub initialize_cluster_centers {
383             my $self = shift;
384             my $K = shift;
385             my $data_store_size = $self->{_N};
386             my @cluster_center_indices;
387             while (1) {
388             foreach my $i (0..$K-1) {
389             $cluster_center_indices[$i] = int rand( $data_store_size );
390             next if $i == 0;
391             foreach my $j (0..$i-1) {
392             while ( $cluster_center_indices[$j] ==
393             $cluster_center_indices[$i] ) {
394             my $old = $cluster_center_indices[$i];
395             $cluster_center_indices[$i] = int rand($data_store_size);
396             }
397             }
398             }
399             my ($min,$max) = minmax(\@cluster_center_indices );
400             my $quality = ($max - $min) / $data_store_size;
401             last if $quality > 0.3;
402             }
403             return @cluster_center_indices;
404             }
405              
406             # This function is used when you set the "cluster_seeding" option to 'random' in the
407             # constructor. This routine merely reaches into the data array with the random
408             # integers, as constructed by the previous routine, serving as indices and fetching
409             # values corresponding to those indices. The fetched data samples serve as the
410             # initial cluster centers.
411             sub get_initial_cluster_centers {
412             my $self = shift;
413             my $K = shift;
414             my @cluster_center_indices = $self->initialize_cluster_centers($K);
415             my @result;
416             foreach my $i (@cluster_center_indices) {
417             my $tag = $self->{_data_id_tags}[$i];
418             push @result, $self->{_data}->{$tag};
419             }
420             return \@result;
421             }
422              
423             # This method is invoked when you choose the 'smart' option for the "cluster_seeding"
424             # option in the constructor. It subjects the data to a principal components analysis
425             # to figure out the direction of maximal variance. Subsequently, it tries to locate
426             # K peaks in a smoothed histogram of the data points projected onto the maximal
427             # variance direction.
428             sub get_initial_cluster_centers_smart {
429             my $self = shift;
430             my $K = shift;
431             if ($self->{_data_dimensions} == 1) {
432             my @one_d_data;
433             foreach my $j (0..$self->{_N}-1) {
434             my $tag = $self->{_data_id_tags}[$j];
435             push @one_d_data, $self->{_data}->{$tag}->[0];
436             }
437             my @peak_points = find_peak_points_in_given_direction(\@one_d_data,$K);
438             print "highest points at data values: @peak_points\n" if $self->{_debug};
439             my @cluster_centers;
440             foreach my $peakpoint (@peak_points) {
441             push @cluster_centers, [$peakpoint];
442             }
443             return \@cluster_centers;
444             }
445             my ($num_rows,$num_cols) = ($self->{_data_dimensions},$self->{_N});
446             my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
447             my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
448             # All the record labels are stored in the array $self->{_data_id_tags}. The
449             # actual data for clustering is stored in a hash at $self->{_data} whose keys are
450             # the record labels; the value associated with each key is the array holding the
451             # corresponding numerical multidimensional data.
452             foreach my $j (0..$num_cols-1) {
453             my $tag = $self->{_data_id_tags}[$j];
454             my $data = $self->{_data}->{$tag};
455             $matrix->set_col($j, $data);
456             }
457             if ($self->{_debug}) {
458             print "\nDisplaying the original data as a matrix:";
459             display_matrix( $matrix );
460             }
461             foreach my $j (0..$num_cols-1) {
462             $mean_vec += $matrix->col($j);
463             }
464             $mean_vec *= 1.0 / $num_cols;
465             if ($self->{_debug}) {
466             print "Displaying the mean vector for the data:";
467             display_matrix( $mean_vec );
468             }
469             foreach my $j (0..$num_cols-1) {
470             my @new_col = ($matrix->col($j) - $mean_vec)->as_list;
471             $matrix->set_col($j, \@new_col);
472             }
473             if ($self->{_debug}) {
474             print "Displaying mean subtracted data as a matrix:";
475             display_matrix( $matrix );
476             }
477             my $transposed = transpose( $matrix );
478             if ($self->{_debug}) {
479             print "Displaying transposed data matrix:";
480             display_matrix( $transposed );
481             }
482             my $covariance = matrix_multiply( $matrix, $transposed );
483             $covariance *= 1.0 / $num_cols;
484             if ($self->{_debug}) {
485             print "\nDisplaying the Covariance Matrix for your data:";
486             display_matrix( $covariance );
487             }
488             my ($eigenvalues, $eigenvectors) = $covariance->eigenpair;
489             my $num_of_eigens = @$eigenvalues;
490             my $largest_eigen_index = 0;
491             my $smallest_eigen_index = 0;
492             print "Eigenvalue 0: $eigenvalues->[0]\n" if $self->{_debug};
493             foreach my $i (1..$num_of_eigens-1) {
494             $largest_eigen_index = $i if $eigenvalues->[$i] > $eigenvalues->[$largest_eigen_index];
495             $smallest_eigen_index = $i if $eigenvalues->[$i] < $eigenvalues->[$smallest_eigen_index];
496             print "Eigenvalue $i: $eigenvalues->[$i]\n" if $self->{_debug};
497             }
498             print "\nlargest eigen index: $largest_eigen_index\n" if $self->{_debug};
499             print "\nsmallest eigen index: $smallest_eigen_index\n\n" if $self->{_debug};
500             foreach my $i (0..$num_of_eigens-1) {
501             my @vec = $eigenvectors->[$i]->as_list;
502             print "Eigenvector $i: @vec\n" if $self->{_debug};
503             }
504             my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list;
505             print "\nLargest eigenvector: @largest_eigen_vec\n" if $self->{_debug};
506             my @max_var_direction;
507             # Each element of the array @largest_eigen_vec is a Math::Complex object
508             foreach my $k (0..@largest_eigen_vec-1) {
509             my ($mag, $theta) = $largest_eigen_vec[$k] =~ /\[(\d*\.\d+),(\S+)\]/;
510             if ($theta eq '0') {
511             $max_var_direction[$k] = $mag;
512             } elsif ($theta eq 'pi') {
513             $max_var_direction[$k] = -1.0 * $mag;
514             } else {
515             die "eigendecomposition of covariance matrix produced a complex " .
516             "eigenvector --- something is wrong";
517             }
518             }
519             print "\nMaximum Variance Direction: @max_var_direction\n\n" if $self->{_debug};
520             # We now project all data points on the largest eigenvector.
521             # Each projection will yield a single point on the eigenvector.
522             my @projections;
523             foreach my $j (0..$self->{_N}-1) {
524             my $tag = $self->{_data_id_tags}[$j];
525             my $data = $self->{_data}->{$tag};
526             die "Dimensionality of the largest eigenvector does not "
527             . "match the dimensionality of the data"
528             unless @max_var_direction == $self->{_data_dimensions};
529             my $projection = vector_multiply($data, \@max_var_direction);
530             push @projections, $projection;
531             }
532             print "All projection points: @projections\n" if $self->{_debug};
533             my @peak_points = find_peak_points_in_given_direction(\@projections, $K);
534             print "highest points at points along largest eigenvec: @peak_points\n" if $self->{_debug};
535             my @cluster_centers;
536             foreach my $peakpoint (@peak_points) {
537             my @actual_peak_coords = map {$peakpoint * $_} @max_var_direction;
538             push @cluster_centers, \@actual_peak_coords;
539             }
540             return \@cluster_centers;
541             }
542              
543             # This method is invoked when you choose the 'smart' option for the "cluster_seeding"
544             # option in the constructor. It is called by the previous method to locate K peaks
545             # in a smoothed histogram of the data points projected onto the maximal variance
546             # direction.
547             sub find_peak_points_in_given_direction {
548             my $dataref = shift;
549             my $how_many = shift;
550             my @data = @$dataref;
551             my ($min, $max) = minmax(\@data);
552             my $num_points = @data;
553             my @sorted_data = sort {$a <=> $b} @data;
554             my $scale = $max - $min;
555             foreach my $index (0..$#sorted_data-1) {
556             $sorted_data[$index] = ($sorted_data[$index] - $min) / $scale;
557             }
558             my $avg_diff = 0;
559             foreach my $index (0..$#sorted_data-1) {
560             my $diff = $sorted_data[$index+1] - $sorted_data[$index];
561             $avg_diff += ($diff - $avg_diff) / ($index + 1);
562             }
563             my $delta = 1.0 / 1000.0;
564             my @accumulator = (0) x 1000;
565             foreach my $index (0..@sorted_data-1) {
566             my $cell_index = int($sorted_data[$index] / $delta);
567             my $smoothness = 40;
568             for my $index ($cell_index-$smoothness..$cell_index+$smoothness) {
569             next if $index < 0 || $index > 999;
570             $accumulator[$index]++;
571             }
572             }
573             my $peaks_array = non_maximum_suppression( \@accumulator );
574             my $peaks_index_hash = get_value_index_hash( $peaks_array );
575             my @K_highest_peak_locations;
576             my $k = 0;
577             foreach my $peak (sort {$b <=> $a} keys %$peaks_index_hash) {
578             my $unscaled_peak_point = $min + $peaks_index_hash->{$peak} * $scale * $delta;
579             push @K_highest_peak_locations, $unscaled_peak_point
580             if $k < $how_many;
581             last if ++$k == $how_many;
582             }
583             return @K_highest_peak_locations;
584             }
585              
586             # The purpose of this routine is to form initial clusters by assigning the data
587             # samples to the initial clusters formed by the previous routine on the basis of the
588             # best proximity of the data samples to the different cluster centers.
589             sub assign_data_to_clusters_initial {
590             my $self = shift;
591             my @cluster_centers = @{ shift @_ };
592             my @clusters;
593             foreach my $ele (@{$self->{_data_id_tags}}) {
594             my $best_cluster;
595             my @dist_from_clust_centers;
596             foreach my $center (@cluster_centers) {
597             push @dist_from_clust_centers, $self->distance($ele, $center);
598             }
599             my ($min, $best_center_index) = minimum( \@dist_from_clust_centers );
600             push @{$clusters[$best_center_index]}, $ele;
601             }
602             return \@clusters;
603             }
604              
605             sub assign_data_to_clusters_initial_mahalanobis {
606             my $self = shift;
607             my @cluster_centers = @{ shift @_ };
608             my @clusters;
609             foreach my $ele (@{$self->{_data_id_tags}}) {
610             my $best_cluster;
611             my @dist_from_clust_centers;
612             foreach my $center (@cluster_centers) {
613             push @dist_from_clust_centers, $self->distance($ele, $center);
614             }
615             my ($min, $best_center_index) = minimum( \@dist_from_clust_centers );
616             push @{$clusters[$best_center_index]}, $ele if defined $best_center_index;
617             }
618             # Since a cluster center may not correspond to any particular sample, it is possible
619             # for one of the elements of the array @clusters to be null using the above
620             # strategy for populating the initial clusters. Let's say there are five cluster
621             # centers in the array @cluster_centers. The $best_center_index may populate the
622             # the elements of the array @clusters for the indices 0, 1, 2, 4, which would leave
623             # $clusters[3] as undefined. So, in what follows, we must first check if all of
624             # the elements of @clusters are defined.
625             my @determinants;
626             foreach my $cluster(@clusters) {
627             die "The clustering program started with bad initialization. Please start over"
628             unless defined $cluster;
629             my $covariance = $self->estimate_cluster_covariance($cluster);
630             my $determinant = $covariance->det();
631             push @determinants, $determinant;
632             }
633             return [\@clusters, \@determinants];
634             }
635              
636             # This is the main routine that along with the update_cluster_centers() routine
637             # constitute the two key steps of the K-Means algorithm. In most cases, the infinite
638             # while() loop will terminate automatically when the cluster assignments of the data
639             # points remain unchanged. For the sake of safety, we keep track of the number of
640             # iterations. If this number reaches 100, we exit the while() loop anyway. In most
641             # cases, this limit will not be reached.
642             sub assign_data_to_clusters {
643             my $self = shift;
644             my $clusters = shift;
645             my $K = shift;
646             my $final_cluster_centers;
647             my $iteration_index = 0;
648             while (1) {
649             my $new_clusters;
650             my $assignment_changed_flag = 0;
651             my $current_cluster_center_index = 0;
652             my $cluster_size_zero_condition = 0;
653             my $how_many = @$clusters;
654             my $cluster_centers = $self->update_cluster_centers( deep_copy_AoA_with_nulls( $clusters ) );
655             $iteration_index++;
656             foreach my $cluster (@$clusters) {
657             my $current_cluster_center = $cluster_centers->[$current_cluster_center_index];
658             foreach my $ele (@$cluster) {
659             my @dist_from_clust_centers;
660             foreach my $center (@$cluster_centers) {
661             push @dist_from_clust_centers,
662             $self->distance($ele, $center);
663             }
664             my ($min, $best_center_index) = minimum( \@dist_from_clust_centers );
665             my $best_cluster_center = $cluster_centers->[$best_center_index];
666             if (vector_equal($current_cluster_center, $best_cluster_center)){
667             push @{$new_clusters->[$current_cluster_center_index]}, $ele;
668             } else {
669             $assignment_changed_flag = 1;
670             push @{$new_clusters->[$best_center_index]}, $ele;
671             }
672             }
673             $current_cluster_center_index++;
674             }
675             next if ((@$new_clusters != @$clusters) && ($iteration_index < 100));
676             # Now make sure that none of the K clusters is an empty cluster:
677             foreach my $newcluster (@$new_clusters) {
678             $cluster_size_zero_condition = 1 if ((!defined $newcluster) or (@$newcluster == 0));
679             }
680             push @$new_clusters, (undef) x ($K - @$new_clusters) if @$new_clusters < $K;
681             # During clustering for a fixed K, should a cluster inadvertently
682             # become empty, steal a member from the largest cluster to hopefully
683             # spawn a new cluster:
684             my $largest_cluster;
685             foreach my $local_cluster (@$new_clusters) {
686             next if !defined $local_cluster;
687             $largest_cluster = $local_cluster if !defined $largest_cluster;
688             if (@$local_cluster > @$largest_cluster) {
689             $largest_cluster = $local_cluster;
690             }
691             }
692             foreach my $local_cluster (@$new_clusters) {
693             if ( (!defined $local_cluster) || (@$local_cluster == 0) ) {
694             push @$local_cluster, pop @$largest_cluster;
695             }
696             }
697             next if (($cluster_size_zero_condition) && ($iteration_index < 100));
698             last if $iteration_index == 100;
699             $clusters = deep_copy_AoA( $new_clusters );
700             last if $assignment_changed_flag == 0;
701             }
702             $final_cluster_centers = $self->update_cluster_centers( $clusters );
703             return ($clusters, $final_cluster_centers);
704             }
705              
706             sub assign_data_to_clusters_mahalanobis {
707             my $self = shift;
708             my $clusters = shift;
709             my $K = shift;
710             my $final_cluster_centers;
711             my $iteration_index = 0;
712             while (1) {
713             my $new_clusters;
714             my $assignment_changed_flag = 0;
715             my $current_cluster_center_index = 0;
716             my $cluster_size_zero_condition = 0;
717             my $how_many = @$clusters;
718             my $cluster_centers_and_covariances =
719             $self->update_cluster_centers_and_covariances_mahalanobis(deep_copy_AoA_with_nulls($clusters));
720             my $cluster_centers = $cluster_centers_and_covariances->[0];
721             my $cluster_covariances = $cluster_centers_and_covariances->[1];
722             $iteration_index++;
723             foreach my $cluster (@$clusters) {
724             my $current_cluster_center = $cluster_centers->[$current_cluster_center_index];
725             my $current_cluster_covariance = $cluster_covariances->[$current_cluster_center_index];
726             foreach my $ele (@$cluster) {
727             my @mahalanobis_dist_from_clust_centers;
728             foreach my $i (0..@$clusters-1) {
729             my $center = $cluster_centers->[$i];
730             my $covariance = $cluster_covariances->[$i];
731             my $maha_distance;
732             eval {
733             $maha_distance = $self->distance_mahalanobis($ele, $center, $covariance);
734             };
735             next if $@;
736             push @mahalanobis_dist_from_clust_centers, $maha_distance;
737             }
738             my ($min, $best_center_index) = minimum( \@mahalanobis_dist_from_clust_centers );
739             my $best_cluster_center = $cluster_centers->[$best_center_index];
740             if (vector_equal($current_cluster_center, $best_cluster_center)){
741             push @{$new_clusters->[$current_cluster_center_index]}, $ele;
742             } else {
743             $assignment_changed_flag = 1;
744             push @{$new_clusters->[$best_center_index]}, $ele;
745             }
746             }
747             $current_cluster_center_index++;
748             }
749             next if ((@$new_clusters != @$clusters) && ($iteration_index < 100));
750             # Now make sure that none of the K clusters is an empty cluster:
751             foreach my $newcluster (@$new_clusters) {
752             $cluster_size_zero_condition = 1 if ((!defined $newcluster) or (@$newcluster == 0));
753             }
754             push @$new_clusters, (undef) x ($K - @$new_clusters) if @$new_clusters < $K;
755             # During clustering for a fixed K, should a cluster inadvertently
756             # become empty, steal a member from the largest cluster to hopefully
757             # spawn a new cluster:
758             my $largest_cluster;
759             foreach my $local_cluster (@$new_clusters) {
760             next if !defined $local_cluster;
761             $largest_cluster = $local_cluster if !defined $largest_cluster;
762             if (@$local_cluster > @$largest_cluster) {
763             $largest_cluster = $local_cluster;
764             }
765             }
766             foreach my $local_cluster (@$new_clusters) {
767             if ( (!defined $local_cluster) || (@$local_cluster == 0) ) {
768             push @$local_cluster, pop @$largest_cluster;
769             }
770             }
771             next if (($cluster_size_zero_condition) && ($iteration_index < 100));
772             last if $iteration_index == 100;
773             # Now do a deep copy of new_clusters into clusters
774             $clusters = deep_copy_AoA( $new_clusters );
775             last if $assignment_changed_flag == 0;
776             }
777             $final_cluster_centers = $self->update_cluster_centers( $clusters );
778             return ($clusters, $final_cluster_centers);
779             }
780              
781             sub update_cluster_centers_and_covariances_mahalanobis {
782             my $self = shift;
783             my @clusters = @{ shift @_ };
784             my @new_cluster_centers;
785             my @new_cluster_covariances;
786             # During clustering for a fixed K, should a cluster inadvertently become empty,
787             # steal a member from the largest cluster to hopefully spawn a new cluster:
788             my $largest_cluster;
789             foreach my $cluster (@clusters) {
790             next if !defined $cluster;
791             $largest_cluster = $cluster if !defined $largest_cluster;
792             if (@$cluster > @$largest_cluster) {
793             $largest_cluster = $cluster;
794             }
795             }
796             foreach my $cluster (@clusters) {
797             if ( (!defined $cluster) || (@$cluster == 0) ) {
798             push @$cluster, pop @$largest_cluster;
799             }
800             }
801             foreach my $cluster (@clusters) {
802             die "Cluster became empty --- untenable condition " .
803             "for a given K. Try again. \n" if !defined $cluster;
804             my $cluster_size = @$cluster;
805             die "Cluster size is zero --- untenable.\n" if $cluster_size == 0;
806             my @new_cluster_center = @{$self->add_point_coords( $cluster )};
807             @new_cluster_center = map {my $x = $_/$cluster_size; $x} @new_cluster_center;
808             push @new_cluster_centers, \@new_cluster_center;
809             # for covariance calculation:
810             my ($num_rows,$num_cols) = ($self->{_data_dimensions}, scalar(@$cluster));
811             my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
812             my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
813             # All the record labels are stored in the array $self->{_data_id_tags}. The
814             # actual data for clustering is stored in a hash at $self->{_data} whose keys are
815             # the record labels; the value associated with each key is the array holding the
816             # corresponding numerical multidimensional data.
817             foreach my $j (0..$num_cols-1) {
818             my $tag = $cluster->[$j];
819             my $data = $self->{_data}->{$tag};
820             my @diff_from_mean = vector_subtract($data, \@new_cluster_center);
821             $matrix->set_col($j, \@diff_from_mean);
822             }
823             my $transposed = transpose( $matrix );
824             my $covariance = matrix_multiply( $matrix, $transposed );
825             $covariance *= 1.0 / $num_cols;
826             if ($self->{_debug}) {
827             print "\nDisplaying the Covariance Matrix for cluster:";
828             display_matrix( $covariance );
829             }
830             push @new_cluster_covariances, $covariance;
831             }
832             return [\@new_cluster_centers, \@new_cluster_covariances];
833             }
834              
835             # After each new assignment of the data points to the clusters on the basis of the
836             # current values for the cluster centers, we call the routine shown here for updating
837             # the values of the cluster centers.
838             sub update_cluster_centers {
839             my $self = shift;
840             my @clusters = @{ shift @_ };
841             my @new_cluster_centers;
842             # During clustering for a fixed K, should a cluster inadvertently become empty,
843             # steal a member from the largest cluster to hopefully spawn a new cluster:
844             my $largest_cluster;
845             foreach my $cluster (@clusters) {
846             next if !defined $cluster;
847             $largest_cluster = $cluster if !defined $largest_cluster;
848             if (@$cluster > @$largest_cluster) {
849             $largest_cluster = $cluster;
850             }
851             }
852             foreach my $cluster (@clusters) {
853             if ( (!defined $cluster) || (@$cluster == 0) ) {
854             push @$cluster, pop @$largest_cluster;
855             }
856             }
857             foreach my $cluster (@clusters) {
858             die "Cluster became empty --- untenable condition " .
859             "for a given K. Try again. \n" if !defined $cluster;
860             my $cluster_size = @$cluster;
861             die "Cluster size is zero --- untenable.\n" if $cluster_size == 0;
862             my @new_cluster_center = @{$self->add_point_coords( $cluster )};
863             @new_cluster_center = map {my $x = $_/$cluster_size; $x}
864             @new_cluster_center;
865             push @new_cluster_centers, \@new_cluster_center;
866             }
867             return \@new_cluster_centers;
868             }
869              
870             sub which_cluster_for_new_data_element {
871             my $self = shift;
872             my $ele = shift;
873             die "The dimensionality of the new data element is not correct: $!"
874             unless @$ele == $self->{_data_dimensions};
875             my %distance_to_new_ele_hash;
876             foreach my $cluster_id (sort keys %{$self->{_cluster_centers_hash}}) {
877             $distance_to_new_ele_hash{$cluster_id} = $self->distance2($ele,
878             $self->{_cluster_centers_hash}->{$cluster_id});
879             }
880             my @values = values %distance_to_new_ele_hash;
881             my ($min,$max) = minmax(\@values);
882             my $answer;
883             foreach my $cluster_id (keys %distance_to_new_ele_hash) {
884             $answer = $cluster_id if $distance_to_new_ele_hash{$cluster_id} == $min;
885             }
886             return $answer;
887             }
888              
889             sub which_cluster_for_new_data_element_mahalanobis {
890             my $self = shift;
891             my $ele = shift;
892             die "The dimensionality of the new data element is not correct: $!"
893             unless @$ele == $self->{_data_dimensions};
894             my %distance_to_new_ele_hash;
895             foreach my $cluster_id (sort keys %{$self->{_cluster_centers_hash}}) {
896             $distance_to_new_ele_hash{$cluster_id} =
897             $self->distance_mahalanobis2($ele, $self->{_cluster_centers_hash}->{$cluster_id},
898             $self->{_cluster_covariances_hash}->{$cluster_id});
899             }
900             my @values = values %distance_to_new_ele_hash;
901             my ($min,$max) = minmax(\@values);
902             my $answer;
903             foreach my $cluster_id (keys %distance_to_new_ele_hash) {
904             $answer = $cluster_id if $distance_to_new_ele_hash{$cluster_id} == $min;
905             }
906             return $answer;
907             }
908              
909             # The following function returns the value of QoC for a given partitioning of the
910             # data into K clusters. It calculates two things: the average value for the distance
911             # between a data point and the center of the cluster in which the data point resides,
912             # and the average value for the distances between the cluster centers. We obviously
913             # want to minimize the former and maximize the latter. All of the "from center"
914             # distances within each cluster are stored in the variable
915             # $sum_of_distances_for_one_cluster. When this variable, after it is divided by the
916             # number of data elements in the cluster, is summed over all the clusters, we get a
917             # value that is stored in $avg_dist_for_cluster. The inter-cluster-center distances
918             # are stored in the variable $inter_cluster_center_dist.
919             sub cluster_quality {
920             my $self = shift;
921             my $clusters = shift;
922             my $cluster_centers = shift;
923             my $K = @$cluster_centers; # Number of clusters
924             my $cluster_radius = 0;
925             foreach my $i (0..@$clusters-1) {
926             my $sum_of_distances_for_one_cluster = 0;
927             foreach my $ele (@{$clusters->[$i]}) {
928             $sum_of_distances_for_one_cluster +=
929             $self->distance( $ele, $cluster_centers->[$i] );
930             }
931             $cluster_radius +=
932             $sum_of_distances_for_one_cluster / @{$clusters->[$i]};
933             }
934             my $inter_cluster_center_dist = 0;
935             foreach my $i (0..@$cluster_centers-1) {
936             foreach my $j (0..@$cluster_centers-1) {
937             $inter_cluster_center_dist +=
938             $self->distance2( $cluster_centers->[$i],
939             $cluster_centers->[$j] );
940             }
941             }
942             my $avg_inter_cluster_center_dist = $inter_cluster_center_dist /
943             ( $K * ($K-1) / 2.0 );
944             return $cluster_radius / $avg_inter_cluster_center_dist;
945             }
946              
947             # The following routine is for computing the distance between a data point specified
948             # by its symbolic name in the master datafile and a point (such as the center of a
949             # cluster) expressed as a vector of coordinates:
950             sub distance {
951             my $self = shift;
952             my $ele1_id = shift @_; # symbolic name of data sample
953             my @ele1 = @{$self->{_data}->{$ele1_id}};
954             my @ele2 = @{shift @_};
955             die "wrong data types for distance calculation\n" if @ele1 != @ele2;
956             my $how_many = @ele1;
957             my $squared_sum = 0;
958             foreach my $i (0..$how_many-1) {
959             $squared_sum += ($ele1[$i] - $ele2[$i])**2;
960             }
961             my $dist = sqrt $squared_sum;
962             return $dist;
963             }
964              
965             # The following routine does the same as above but now both arguments are expected to
966             # be arrays of numbers:
967             sub distance2 {
968             my $self = shift;
969             my @ele1 = @{shift @_};
970             my @ele2 = @{shift @_};
971             die "wrong data types for distance calculation\n" if @ele1 != @ele2;
972             my $how_many = @ele1;
973             my $squared_sum = 0;
974             foreach my $i (0..$how_many-1) {
975             $squared_sum += ($ele1[$i] - $ele2[$i])**2;
976             }
977             return sqrt $squared_sum;
978             }
979              
980             # Requires three args: $ele for the symbolic tag of the element, $center for the
981             # coordinates of the center of the cluster, and $covariance for the covariance of
982             # cluster. Our goal is to find the distance of the element ele from the cluster
983             # whose mean and covariance are provided.
984             sub distance_mahalanobis {
985             my $self = shift;
986             my $ele = shift;
987             my $cluster_center = shift;
988             my $cluster_covariance = shift;
989             my $det_of_covar = $cluster_covariance->det();
990             my $ele_data = $self->{_data}->{$ele};
991             my @diff_from_mean = vector_subtract($ele_data, $cluster_center);
992             my $vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
993             $vec->set_col(0, \@diff_from_mean);
994             my $transposed = transpose( $vec );
995             my $covariance_inverse;
996             if ($cluster_covariance->det() > 0.001) {
997             $covariance_inverse = $cluster_covariance->inverse;
998             } else {
999             die "covariance matrix may not have an inverse";
1000             }
1001             my $determinant = $covariance_inverse->det();
1002             my $distance = $transposed * $covariance_inverse * $vec;
1003             my @distance = $distance->as_list;
1004             $distance = $distance[0];
1005             return sqrt $distance;
1006             }
1007             # Same as the previous method except that the first argument can be the actual
1008             # coordinates of the data element. Our goal is to find the Mahalanobis distance
1009             # from a given data element to a cluster whose center and covariance are known. As
1010             # for the previous method, it requires three arguments.
1011             sub distance_mahalanobis2 {
1012             my $self = shift;
1013             my $ele = shift; # is now a ref to the array of coords for a point
1014             my $cluster_center = shift;
1015             my $cluster_covariance = shift;
1016             return undef unless defined $cluster_covariance;
1017             my $det_of_covar = $cluster_covariance->det();
1018             my @diff_from_mean = vector_subtract($ele, $cluster_center);
1019             my $vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
1020             $vec->set_col(0, \@diff_from_mean);
1021             my $transposed = transpose( $vec );
1022             my $covariance_inverse;
1023             if ($cluster_covariance->det() > 0.001) {
1024             $covariance_inverse = $cluster_covariance->inverse;
1025             } else {
1026             die "covariance matrix may not have an inverse";
1027             }
1028             my $determinant = $covariance_inverse->det();
1029             my $distance = $transposed * $covariance_inverse * $vec;
1030             my @distance = $distance->as_list;
1031             $distance = $distance[0];
1032             return sqrt $distance;
1033             }
1034              
1035             sub estimate_cluster_covariance {
1036             my $self = shift;
1037             my $cluster = shift;
1038             my $cluster_size = @$cluster;
1039             my @cluster_center = @{$self->add_point_coords( $cluster )};
1040             @cluster_center = map {my $x = $_/$cluster_size; $x} @cluster_center;
1041             # for covariance calculation:
1042             my ($num_rows,$num_cols) = ($self->{_data_dimensions}, scalar(@$cluster));
1043             my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
1044             my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
1045             # All the record labels are stored in the array $self->{_data_id_tags}. The
1046             # actual data for clustering is stored in a hash at $self->{_data} whose keys are
1047             # the record labels; the value associated with each key is the array holding the
1048             # corresponding numerical multidimensional data.
1049             foreach my $j (0..$num_cols-1) {
1050             my $tag = $cluster->[$j];
1051             my $data = $self->{_data}->{$tag};
1052             my @diff_from_mean = vector_subtract($data, \@cluster_center);
1053             $matrix->set_col($j, \@diff_from_mean);
1054             }
1055             my $transposed = transpose( $matrix );
1056             my $covariance = $matrix * $transposed;
1057             $covariance *= 1.0 / $num_cols;
1058             if ($self->{_debug}) {
1059             print "\nDisplaying the Covariance Matrix for cluster:";
1060             display_matrix( $covariance );
1061             }
1062             return $covariance;
1063             }
1064              
1065             sub write_clusters_to_files {
1066             my $self = shift;
1067             my @clusters = @{$self->{_clusters}};
1068             unlink glob "cluster*.dat";
1069             foreach my $i (0..@clusters-1) {
1070             my $filename = "cluster" . $i . ".dat";
1071             print "\nWriting cluster $i to file $filename\n" if $self->{_terminal_output};
1072             open FILEHANDLE, "| sort > $filename" or die "Unable to open file: $!";
1073             foreach my $ele (@{$clusters[$i]}) {
1074             print FILEHANDLE "$ele ";
1075             }
1076             close FILEHANDLE;
1077             }
1078             }
1079              
1080             sub get_K_best {
1081             my $self = shift;
1082             croak "You need to run the clusterer with K=0 option " .
1083             "before you can call this method" if $self->{_K_best} eq 'unknown';
1084             print "\nThe best value of K: $self->{_K_best}\n" if $self->{_terminal_output};
1085             return $self->{_K_best};
1086             }
1087              
1088             sub show_QoC_values {
1089             my $self = shift;
1090             croak "\nYou need to run the clusterer with K=0 option before you can call this method"
1091             if $self->{_K_best} eq 'unknown';
1092             print "\nShown below are K on the left and the QoC values on the right (the smaller " .
1093             "the QoC, the better it is)\n";
1094             foreach my $key (sort keys %{$self->{_QoC_values}} ) {
1095             print " $key => $self->{_QoC_values}->{$key}\n" if defined $self->{_QoC_values}->{$key};
1096             }
1097             }
1098              
1099             sub DESTROY {
1100             unlink "__temp_" . basename($_[0]->{_datafile});
1101             unlink "__temp_data_" . basename($_[0]->{_datafile});
1102             unlink "__temp_normed_data_" . basename($_[0]->{_datafile});
1103             }
1104              
1105             ################################## Visualization Code ###################################
1106              
1107             # It makes sense to call visualize_clusters() only AFTER you have called kmeans().
1108             #
1109             # The visualize_clusters() implementation automatically figures out whether it
1110             # should do a 2D plot or a 3D plot. If the number of on bits in the mask that is
1111             # supplied as one of the arguments is greater than 2, it does a 3D plot for the
1112             # first three data coordinates. That is, the clusters will be displayed in the 3D
1113             # space formed by the first three data coordinates. On the other hand, if the number
1114             # of on bits in the mask is exactly 2, it does a 2D plot. Should it happen that
1115             # only one on bit is specified for the mask, visualize_clusters() aborts.
1116             #
1117             # The visualization code consists of first accessing each of clusters created by the
1118             # kmeans() subroutine. Note that the clusters contain only the symbolic names for
1119             # the individual records in the source data file. We therefore next reach into the
1120             # $self->{_original_data} hash and get the data coordinates associated with each
1121             # symbolic label in a cluster. The numerical data thus generated is then written
1122             # out to a temp file. When doing so we must remember to insert TWO BLANK LINES
1123             # between the data blocks corresponding to the different clusters. This constraint
1124             # is imposed on us by Gnuplot when plotting data from the same file since we want to
1125             # use different point styles for the data points in different cluster files.
1126             #
1127             # Subsequently, we call upon the Perl interface provided by the Graphics::GnuplotIF
1128             # module to plot the data clusters.
1129             sub visualize_clusters {
1130             my $self = shift;
1131             my $v_mask;
1132             my $pause_time;
1133             if (@_ == 1) {
1134             $v_mask = shift || croak "visualization mask missing";
1135             } elsif (@_ == 2) {
1136             $v_mask = shift || croak "visualization mask missing";
1137             $pause_time = shift;
1138             } else {
1139             croak "visualize_clusters() called with wrong args";
1140             }
1141             my $master_datafile = $self->{_datafile};
1142             my @v_mask = split //, $v_mask;
1143             my $visualization_mask_width = @v_mask;
1144             my $original_data_mask = $self->{_mask};
1145             my @mask = split //, $original_data_mask;
1146             my $data_field_width = scalar grep {$_ eq '1'} @mask;
1147             croak "\n\nABORTED: The width of the visualization mask (including " .
1148             "all its 1s and 0s) must equal the width of the original mask " .
1149             "used for reading the data file (counting only the 1's)"
1150             if $visualization_mask_width != $data_field_width;
1151             my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
1152             my %visualization_data;
1153             while ( my ($record_id, $data) = each %{$self->{_original_data}} ) {
1154             my @fields = @$data;
1155             croak "\nABORTED: Visualization mask size exceeds data record size\n"
1156             if $#v_mask > $#fields;
1157             my @data_fields;
1158             foreach my $i (0..@fields-1) {
1159             if ($v_mask[$i] eq '0') {
1160             next;
1161             } elsif ($v_mask[$i] eq '1') {
1162             push @data_fields, $fields[$i];
1163             } else {
1164             croak "Misformed visualization mask. It can only have 1s and 0s\n";
1165             }
1166             }
1167             $visualization_data{ $record_id } = \@data_fields;
1168             }
1169             my @all_data_ids = @{$self->{_data_id_tags}};
1170             my $K = scalar @{$self->{_clusters}};
1171             my $filename = basename($master_datafile);
1172             my $temp_file = "__temp_" . $filename;
1173             unlink $temp_file if -e $temp_file;
1174             open OUTPUT, ">$temp_file"
1175             or die "Unable to open a temp file in this directory: $!\n";
1176             foreach my $cluster (@{$self->{_clusters}}) {
1177             foreach my $item (@$cluster) {
1178             print OUTPUT "@{$visualization_data{$item}}";
1179             print OUTPUT "\n";
1180             }
1181             print OUTPUT "\n\n";
1182             }
1183             close OUTPUT;
1184             my $plot;
1185             my $hardcopy_plot;
1186             if (!defined $pause_time) {
1187             $plot = Graphics::GnuplotIF->new( persist => 1 );
1188             $hardcopy_plot = Graphics::GnuplotIF->new();
1189             $hardcopy_plot->gnuplot_cmd('set terminal png', "set output \"clustering_results.png\"");
1190             } else {
1191             $plot = Graphics::GnuplotIF->new();
1192             }
1193             $plot->gnuplot_cmd( "set noclip" );
1194             $plot->gnuplot_cmd( "set pointsize 2" );
1195             my $arg_string = "";
1196             if ($visualization_data_field_width > 2) {
1197             foreach my $i (0..$K-1) {
1198             my $j = $i + 1;
1199             $arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster $i\" with points lt $j pt $j, ";
1200             }
1201             } elsif ($visualization_data_field_width == 2) {
1202             foreach my $i (0..$K-1) {
1203             my $j = $i + 1;
1204             $arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster $i\" with points lt $j pt $j, ";
1205             }
1206             } elsif ($visualization_data_field_width == 1 ) {
1207             foreach my $i (0..$K-1) {
1208             my $j = $i + 1;
1209             $arg_string .= "\"$temp_file\" index $i using 1 title \"Cluster $i\" with points lt $j pt $j, ";
1210             }
1211             }
1212             $arg_string = $arg_string =~ /^(.*),[ ]+$/;
1213             $arg_string = $1;
1214             if ($visualization_data_field_width > 2) {
1215             $plot->gnuplot_cmd( "splot $arg_string" );
1216             $hardcopy_plot->gnuplot_cmd( "splot $arg_string" ) unless defined $pause_time;
1217             $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
1218             } elsif ($visualization_data_field_width == 2) {
1219             $plot->gnuplot_cmd( "plot $arg_string" );
1220             $hardcopy_plot->gnuplot_cmd( "plot $arg_string" ) unless defined $pause_time;
1221             $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
1222             } elsif ($visualization_data_field_width == 1) {
1223             croak "No provision for plotting 1-D data\n";
1224             }
1225             }
1226              
1227             # It makes sense to call visualize_data() only AFTER you have called the method
1228             # read_data_from_file().
1229             #
1230             # The visualize_data() is meant for the visualization of the original data in its
1231             # various 2D or 3D subspaces. The method can also be used to visualize the normed
1232             # data in a similar manner. Recall the normed data is the original data after each
1233             # data dimension is normalized by the standard-deviation along that dimension.
1234             #
1235             # Whether you see the original data or the normed data depends on the second
1236             # argument supplied in the method call. It must be either the string 'original' or
1237             # the string 'normed'.
1238             sub visualize_data {
1239             my $self = shift;
1240             my $v_mask = shift || croak "visualization mask missing";
1241             my $datatype = shift; # must be either 'original' or 'normed'
1242             croak "\n\nABORTED: You called visualize_data() for normed data " .
1243             "but without first turning on data normalization in the " .
1244             "in the KMeans constructor"
1245             if ($datatype eq 'normed') && ! $self->{_var_normalize};
1246             my $master_datafile = $self->{_datafile};
1247             my @v_mask = split //, $v_mask;
1248             my $visualization_mask_width = @v_mask;
1249             my $original_data_mask = $self->{_mask};
1250             my @mask = split //, $original_data_mask;
1251             my $data_field_width = scalar grep {$_ eq '1'} @mask;
1252             croak "\n\nABORTED: The width of the visualization mask (including " .
1253             "all its 1s and 0s) must equal the width of the original mask " .
1254             "used for reading the data file (counting only the 1's)"
1255             if $visualization_mask_width != $data_field_width;
1256             my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
1257             my %visualization_data;
1258             my $data_source;
1259             if ($datatype eq 'original') {
1260             $data_source = $self->{_original_data};
1261             } elsif ($datatype eq 'normed') {
1262             $data_source = $self->{_data};
1263             } else {
1264             croak "\n\nABORTED: improper call to visualize_data()";
1265             }
1266             while ( my ($record_id, $data) = each %{$data_source} ) {
1267             my @fields = @$data;
1268             croak "\nABORTED: Visualization mask size exceeds data record size\n"
1269             if $#v_mask > $#fields;
1270             my @data_fields;
1271             foreach my $i (0..@fields-1) {
1272             if ($v_mask[$i] eq '0') {
1273             next;
1274             } elsif ($v_mask[$i] eq '1') {
1275             push @data_fields, $fields[$i];
1276             } else {
1277             croak "Misformed visualization mask. It can only have 1s and 0s\n";
1278             }
1279             }
1280             $visualization_data{ $record_id } = \@data_fields;
1281             }
1282             my $filename = basename($master_datafile);
1283             my $temp_file;
1284             if ($datatype eq 'original') {
1285             $temp_file = "__temp_data_" . $filename;
1286             } elsif ($datatype eq 'normed') {
1287             $temp_file = "__temp_normed_data_" . $filename;
1288             } else {
1289             croak "ABORTED: Improper call to visualize_data()";
1290             }
1291             unlink $temp_file if -e $temp_file;
1292             open OUTPUT, ">$temp_file"
1293             or die "Unable to open a temp file in this directory: $!\n";
1294             foreach my $datapoint (values %visualization_data) {
1295             print OUTPUT "@$datapoint";
1296             print OUTPUT "\n";
1297             }
1298             close OUTPUT;
1299             my $plot = Graphics::GnuplotIF->new( persist => 1 );
1300             $plot->gnuplot_cmd( "set noclip" );
1301             $plot->gnuplot_cmd( "set pointsize 2" );
1302             my $plot_title = $datatype eq 'original' ? '"data"' : '"normed data"';
1303             my $arg_string ;
1304             if ($visualization_data_field_width > 2) {
1305             $arg_string = "\"$temp_file\" using 1:2:3 title $plot_title with points lt -1 pt 1";
1306             } elsif ($visualization_data_field_width == 2) {
1307             $arg_string = "\"$temp_file\" using 1:2 title $plot_title with points lt -1 pt 1";
1308             } elsif ($visualization_data_field_width == 1 ) {
1309             $arg_string = "\"$temp_file\" using 1 notitle with points lt -1 pt 1";
1310             }
1311             if ($visualization_data_field_width > 2) {
1312             $plot->gnuplot_cmd( "splot $arg_string" );
1313             } elsif ($visualization_data_field_width == 2) {
1314             $plot->gnuplot_cmd( "plot $arg_string" );
1315             } elsif ($visualization_data_field_width == 1) {
1316             croak "No provision for plotting 1-D data\n";
1317             }
1318             }
1319              
1320             ########################### Generating Synthetic Data for Clustering ##############################
1321              
1322             # The data generated corresponds to a multivariate distribution. The mean and the
1323             # covariance of each Gaussian in the distribution are specified individually in a
1324             # parameter file. See the example parameter file param.txt in the examples
1325             # directory. Just edit this file for your own needs.
1326             #
1327             # The multivariate random numbers are generated by calling the Math::Random module.
1328             # As you would expect, that module will insist that the covariance matrix you
1329             # specify be symmetric and positive definite.
1330             sub cluster_data_generator {
1331             my $class = shift;
1332             croak "illegal call of a class method" unless $class eq 'Algorithm::KMeans';
1333             my %args = @_;
1334             my $input_parameter_file = $args{input_parameter_file};
1335             my $output_file = $args{output_datafile};
1336             my $N = $args{number_data_points_per_cluster};
1337             my @all_params;
1338             my $param_string;
1339             if (defined $input_parameter_file) {
1340             open INPUT, $input_parameter_file || "unable to open parameter file: $!";
1341             @all_params = ;
1342             @all_params = grep { $_ !~ /^[ ]*#/ } @all_params;
1343             chomp @all_params;
1344             $param_string = join ' ', @all_params;
1345             } else {
1346             # Just for testing. Used in t/test.t
1347             $param_string = "cluster 5 0 0 1 0 0 0 1 0 0 0 1 " .
1348             "cluster 0 5 0 1 0 0 0 1 0 0 0 1 " .
1349             "cluster 0 0 5 1 0 0 0 1 0 0 0 1";
1350             }
1351             my @cluster_strings = split /[ ]*cluster[ ]*/, $param_string;
1352             @cluster_strings = grep $_, @cluster_strings;
1353             my $K = @cluster_strings;
1354             croak "Too many clusters requested" if $K > 12;
1355             my @point_labels = ('a'..'z');
1356             print "Number of Gaussians used for the synthetic data: $K\n";
1357             my @means;
1358             my @covariances;
1359             my $data_dimension;
1360             foreach my $i (0..$K-1) {
1361             my @num_strings = split / /, $cluster_strings[$i];
1362             my @cluster_mean = map {/$_num_regex/;$_} split / /, $num_strings[0];
1363             $data_dimension = @cluster_mean;
1364             push @means, \@cluster_mean;
1365             my @covariance_nums = map {/$_num_regex/;$_} split / /, $num_strings[1];
1366             croak "dimensionality error" if @covariance_nums !=
1367             ($data_dimension ** 2);
1368             my $cluster_covariance;
1369             foreach my $j (0..$data_dimension-1) {
1370             foreach my $k (0..$data_dimension-1) {
1371             $cluster_covariance->[$j]->[$k] =
1372             $covariance_nums[$j*$data_dimension + $k];
1373             }
1374             }
1375             push @covariances, $cluster_covariance;
1376             }
1377             random_seed_from_phrase( 'hellojello' );
1378             my @data_dump;
1379             foreach my $i (0..$K-1) {
1380             my @m = @{shift @means};
1381             my @covar = @{shift @covariances};
1382             my @new_data = Math::Random::random_multivariate_normal( $N, @m, @covar );
1383             my $p = 0;
1384             my $label = $point_labels[$i];
1385             @new_data = map {unshift @$_, $label.$i; $i++; $_} @new_data;
1386             push @data_dump, @new_data;
1387             }
1388             fisher_yates_shuffle( \@data_dump );
1389             open OUTPUT, ">$output_file";
1390             foreach my $ele (@data_dump) {
1391             foreach my $coord ( @$ele ) {
1392             print OUTPUT "$coord ";
1393             }
1394             print OUTPUT "\n";
1395             }
1396             print "Data written out to file $output_file\n";
1397             close OUTPUT;
1398             }
1399              
1400             sub add_point_coords {
1401             my $self = shift;
1402             my @arr_of_ids = @{shift @_}; # array of data element names
1403             my @result;
1404             my $data_dimensionality = $self->{_data_dimensions};
1405             foreach my $i (0..$data_dimensionality-1) {
1406             $result[$i] = 0.0;
1407             }
1408             foreach my $id (@arr_of_ids) {
1409             my $ele = $self->{_data}->{$id};
1410             my $i = 0;
1411             foreach my $component (@$ele) {
1412             $result[$i] += $component;
1413             $i++;
1414             }
1415             }
1416             return \@result;
1417             }
1418              
1419             sub add_point_coords_from_original_data {
1420             my $self = shift;
1421             my @arr_of_ids = @{shift @_}; # array of data element names
1422             my @result;
1423             my $data_dimensionality = $self->{_data_dimensions};
1424             foreach my $i (0..$data_dimensionality-1) {
1425             $result[$i] = 0.0;
1426             }
1427             foreach my $id (@arr_of_ids) {
1428             my $ele = $self->{_original_data}->{$id};
1429             my $i = 0;
1430             foreach my $component (@$ele) {
1431             $result[$i] += $component;
1432             $i++;
1433             }
1434             }
1435             return \@result;
1436             }
1437              
1438             ################################### Support Routines ########################################
1439              
1440             sub get_index_at_value {
1441             my $value = shift;
1442             my @array = @{shift @_};
1443             foreach my $i (0..@array-1) {
1444             return $i if $value == $array[$i];
1445             }
1446             return -1;
1447             }
1448              
1449             # This routine is really not necessary in light of the new `~~' operator in Perl.
1450             # Will use the new operator in the next version.
1451             sub vector_equal {
1452             my $vec1 = shift;
1453             my $vec2 = shift;
1454             die "wrong data types for vector-equal predicate\n" if @$vec1 != @$vec2;
1455             foreach my $i (0..@$vec1-1){
1456             return 0 if $vec1->[$i] != $vec2->[$i];
1457             }
1458             return 1;
1459             }
1460              
1461             # Returns the minimum value and its positional index in an array
1462             sub minimum {
1463             my $arr = shift;
1464             my $min;
1465             my $index;
1466             foreach my $i (0..@{$arr}-1) {
1467             if ( (!defined $min) || ($arr->[$i] < $min) ) {
1468             $index = $i;
1469             $min = $arr->[$i];
1470             }
1471             }
1472             return ($min, $index);
1473             }
1474              
1475             sub minmax {
1476             my $arr = shift;
1477             my $min;
1478             my $max;
1479             foreach my $i (0..@{$arr}-1) {
1480             next if !defined $arr->[$i];
1481             if ( (!defined $min) && (!defined $max) ) {
1482             $min = $arr->[$i];
1483             $max = $arr->[$i];
1484             } elsif ( $arr->[$i] < $min ) {
1485             $min = $arr->[$i];
1486             } elsif ( $arr->[$i] > $max ) {
1487             $max = $arr->[$i];
1488             }
1489             }
1490             return ($min, $max);
1491             }
1492              
1493             # Meant only for constructing a deep copy of an array of arrays:
1494             sub deep_copy_AoA {
1495             my $ref_in = shift;
1496             my $ref_out;
1497             foreach my $i (0..@{$ref_in}-1) {
1498             foreach my $j (0..@{$ref_in->[$i]}-1) {
1499             $ref_out->[$i]->[$j] = $ref_in->[$i]->[$j];
1500             }
1501             }
1502             return $ref_out;
1503             }
1504              
1505             # Meant only for constructing a deep copy of an array of arrays for the case when
1506             # some elements of the top-level array may be undefined:
1507             sub deep_copy_AoA_with_nulls {
1508             my $ref_in = shift;
1509             my $ref_out;
1510             foreach my $i (0..@{$ref_in}-1) {
1511             if ( !defined $ref_in->[$i] ) {
1512             $ref_out->[$i] = undef;
1513             next;
1514             }
1515             foreach my $j (0..@{$ref_in->[$i]}-1) {
1516             $ref_out->[$i]->[$j] = $ref_in->[$i]->[$j];
1517             }
1518             }
1519             return $ref_out;
1520             }
1521              
1522             # Meant only for constructing a deep copy of a hash in which each value is an
1523             # anonymous array of numbers:
1524             sub deep_copy_hash {
1525             my $ref_in = shift;
1526             my $ref_out;
1527             while ( my ($key, $value) = each( %$ref_in ) ) {
1528             $ref_out->{$key} = deep_copy_array( $value );
1529             }
1530             return $ref_out;
1531             }
1532              
1533             # Meant only for an array of numbers:
1534             sub deep_copy_array {
1535             my $ref_in = shift;
1536             my $ref_out;
1537             foreach my $i (0..@{$ref_in}-1) {
1538             $ref_out->[$i] = $ref_in->[$i];
1539             }
1540             return $ref_out;
1541             }
1542              
1543             sub display_cluster_centers {
1544             my $self = shift;
1545             my @clusters = @{shift @_};
1546             my $i = 0;
1547             foreach my $cluster (@clusters) {
1548             my $cluster_size = @$cluster;
1549             my @cluster_center =
1550             @{$self->add_point_coords_from_original_data( $cluster )};
1551             @cluster_center = map {my $x = $_/$cluster_size; $x} @cluster_center;
1552             print "\ncluster $i ($cluster_size records):\n";
1553             print "cluster center $i: " .
1554             "@{[map {my $x = sprintf('%.4f', $_); $x} @cluster_center]}\n";
1555             $i++;
1556             }
1557             }
1558              
1559             # For displaying the individual clusters on a terminal screen. Each cluster is
1560             # displayed through the symbolic names associated with the data points.
1561             sub display_clusters {
1562             my @clusters = @{shift @_};
1563             my $i = 0;
1564             foreach my $cluster (@clusters) {
1565             @$cluster = sort @$cluster;
1566             my $cluster_size = @$cluster;
1567             print "\n\nCluster $i ($cluster_size records):\n";
1568             foreach my $ele (@$cluster) {
1569             print " $ele";
1570             }
1571             $i++
1572             }
1573             print "\n\n";
1574             }
1575              
1576             # from perl docs:
1577             sub fisher_yates_shuffle {
1578             my $arr = shift;
1579             my $i = @$arr;
1580             while (--$i) {
1581             my $j = int rand( $i + 1 );
1582             @$arr[$i, $j] = @$arr[$j, $i];
1583             }
1584             }
1585              
1586             sub variance_normalization {
1587             my %data_hash = %{shift @_};
1588             my @all_data_points = values %data_hash;
1589             my $dimensions = @{$all_data_points[0]};
1590             my @data_projections;
1591             foreach my $data_point (@all_data_points) {
1592             my $i = 0;
1593             foreach my $proj (@$data_point) {
1594             push @{$data_projections[$i++]}, $proj;
1595             }
1596             }
1597             my @variance_vec;
1598             foreach my $vec (@data_projections) {
1599             my ($mean, $variance) = mean_and_variance( $vec );
1600             push @variance_vec, $variance;
1601             }
1602             my %new_data_hash;
1603             while (my ($label, $data) = each(%data_hash) ) {
1604             my @new_data;
1605             foreach my $i (0..@{$data}-1) {
1606             my $new = $data->[$i] / sqrt($variance_vec[$i]);
1607             push @new_data, $data->[$i] / sqrt($variance_vec[$i]);
1608             }
1609             $new_data_hash{$label} = \@new_data;
1610             }
1611             return \%new_data_hash;
1612             }
1613              
1614             sub mean_and_variance {
1615             my @data = @{shift @_};
1616             my ($mean, $variance);
1617             foreach my $i (1..@data) {
1618             if ($i == 1) {
1619             $mean = $data[0];
1620             $variance = 0;
1621             } else {
1622             $mean = ( (($i-1)/$i) * $mean ) + $data[$i-1] / $i;
1623             $variance = ( (($i-1)/$i) * $variance ) + ($data[$i-1]-$mean)**2 / ($i-1);
1624             }
1625             }
1626             return ($mean, $variance);
1627             }
1628              
1629             sub check_for_illegal_params {
1630             my @params = @_;
1631             my @legal_params = qw / datafile
1632             mask
1633             K
1634             Kmin
1635             Kmax
1636             terminal_output
1637             write_clusters_to_files
1638             do_variance_normalization
1639             cluster_seeding
1640             use_mahalanobis_metric
1641             debug
1642             /;
1643             my $found_match_flag;
1644             foreach my $param (@params) {
1645             foreach my $legal (@legal_params) {
1646             $found_match_flag = 0;
1647             if ($param eq $legal) {
1648             $found_match_flag = 1;
1649             last;
1650             }
1651             }
1652             last if $found_match_flag == 0;
1653             }
1654             return $found_match_flag;
1655             }
1656              
1657             sub get_value_index_hash {
1658             my $arr = shift;
1659             my %hash;
1660             foreach my $index (0..@$arr-1) {
1661             $hash{$arr->[$index]} = $index if $arr->[$index] > 0;
1662             }
1663             return \%hash;
1664             }
1665              
1666             sub non_maximum_suppression {
1667             my $arr = shift;
1668             my @output = (0) x @$arr;
1669             my @final_output = (0) x @$arr;
1670             my %hash;
1671             my @array_of_runs = ([$arr->[0]]);
1672             foreach my $index (1..@$arr-1) {
1673             if ($arr->[$index] == $arr->[$index-1]) {
1674             push @{$array_of_runs[-1]}, $arr->[$index];
1675             } else {
1676             push @array_of_runs, [$arr->[$index]];
1677             }
1678             }
1679             my $runstart_index = 0;
1680             foreach my $run_index (1..@array_of_runs-2) {
1681             $runstart_index += @{$array_of_runs[$run_index-1]};
1682             if ($array_of_runs[$run_index]->[0] >
1683             $array_of_runs[$run_index-1]->[0] &&
1684             $array_of_runs[$run_index]->[0] >
1685             $array_of_runs[$run_index+1]->[0]) {
1686             my $run_center = @{$array_of_runs[$run_index]} / 2;
1687             my $assignment_index = $runstart_index + $run_center;
1688             $output[$assignment_index] = $arr->[$assignment_index];
1689             }
1690             }
1691             if ($array_of_runs[-1]->[0] > $array_of_runs[-2]->[0]) {
1692             $runstart_index += @{$array_of_runs[-2]};
1693             my $run_center = @{$array_of_runs[-1]} / 2;
1694             my $assignment_index = $runstart_index + $run_center;
1695             $output[$assignment_index] = $arr->[$assignment_index];
1696             }
1697             if ($array_of_runs[0]->[0] > $array_of_runs[1]->[0]) {
1698             my $run_center = @{$array_of_runs[0]} / 2;
1699             $output[$run_center] = $arr->[$run_center];
1700             }
1701             return \@output;
1702             }
1703              
1704             sub display_matrix {
1705             my $matrix = shift;
1706             my $nrows = $matrix->rows();
1707             my $ncols = $matrix->cols();
1708             print "\n\nDisplaying matrix of size $nrows rows and $ncols columns:\n";
1709             foreach my $i (0..$nrows-1) {
1710             my $row = $matrix->row($i);
1711             my @row_as_list = $row->as_list;
1712             print "@row_as_list\n";
1713             }
1714             print "\n\n";
1715             }
1716              
1717             sub transpose {
1718             my $matrix = shift;
1719             my $num_rows = $matrix->rows();
1720             my $num_cols = $matrix->cols();
1721             my $transpose = Math::GSL::Matrix->new($num_cols, $num_rows);
1722             foreach my $i (0..$num_rows-1) {
1723             my @row = $matrix->row($i)->as_list;
1724             $transpose->set_col($i, \@row );
1725             }
1726             return $transpose;
1727             }
1728              
1729             sub vector_subtract {
1730             my $vec1 = shift;
1731             my $vec2 = shift;
1732             die "wrong data types for vector subtract calculation\n" if @$vec1 != @$vec2;
1733             my @result;
1734             foreach my $i (0..@$vec1-1){
1735             push @result, $vec1->[$i] - $vec2->[$i];
1736             }
1737             return @result;
1738             }
1739              
1740             sub vector_multiply {
1741             my $vec1 = shift;
1742             my $vec2 = shift;
1743             die "vec_multiply called with two vectors of different sizes"
1744             unless @$vec1 == @$vec2;
1745             my $result = 0;
1746             foreach my $i (0..@$vec1-1) {
1747             $result += $vec1->[$i] * $vec2->[$i];
1748             }
1749             return $result;
1750             }
1751              
1752             sub matrix_multiply {
1753             my $matrix1 = shift;
1754             my $matrix2 = shift;
1755             my ($nrows1, $ncols1) = ($matrix1->rows(), $matrix1->cols());
1756             my ($nrows2, $ncols2) = ($matrix2->rows(), $matrix2->cols());
1757             die "matrix multiplication called with non-matching matrix arguments"
1758             unless $nrows1 == $ncols2 && $ncols1 == $nrows2;
1759             if ($nrows1 == 1) {
1760             my @row = $matrix1->row(0)->as_list;
1761             my @col = $matrix2->col(0)->as_list;
1762             my $result;
1763             foreach my $j (0..$ncols1-1) {
1764             $result += $row[$j] * $col[$j];
1765             }
1766             return $result;
1767             }
1768             my $product = Math::GSL::Matrix->new($nrows1, $nrows1);
1769             foreach my $i (0..$nrows1-1) {
1770             my $row = $matrix1->row($i);
1771             my @product_row;
1772             foreach my $j (0..$ncols2-1) {
1773             my $col = $matrix2->col($j);
1774             my $row_times_col = matrix_multiply($row, $col);
1775             push @product_row, $row_times_col;
1776             }
1777             $product->set_row($i, \@product_row);
1778             }
1779             return $product;
1780             }
1781              
1782             1;
1783              
1784             =pod
1785              
1786             =head1 NAME
1787              
1788             Algorithm::KMeans - for clustering multidimensional data
1789              
1790             =head1 SYNOPSIS
1791              
1792             # You now have four different choices for clustering your data with this module:
1793             #
1794             # 1) With Euclidean distances and with random cluster seeding
1795             #
1796             # 2) With Mahalanobis distances and with random cluster seeding
1797             #
1798             # 3) With Euclidean distances and with smart cluster seeding
1799             #
1800             # 4) With Mahalanobis distances and with smart cluster seeding
1801             #
1802             # Despite the qualifier 'smart' in 'smart cluster seeding', it may not always
1803             # produce results that are superior to those obtained with random seeding. (If you
1804             # also factor in the choice regarding variance normalization, you actually have
1805             # eight different choices for data clustering with this module.)
1806             #
1807             # In all cases, you'd obviously begin with
1808              
1809             use Algorithm::KMeans;
1810              
1811             # You'd then name the data file:
1812              
1813             my $datafile = "mydatafile.dat";
1814              
1815             # Next, set the mask to indicate which columns of the datafile to use for
1816             # clustering and which column contains a symbolic ID for each data record. For
1817             # example, if the symbolic name is in the first column, you want the second column
1818             # to be ignored, and you want the next three columns to be used for 3D clustering,
1819             # you'd set the mask to:
1820              
1821             my $mask = "N0111";
1822              
1823             # Now construct an instance of the clusterer. The parameter K controls the number
1824             # of clusters. If you know how many clusters you want (let's say 3), call
1825              
1826             my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
1827             mask => $mask,
1828             K => 3,
1829             cluster_seeding => 'random',
1830             terminal_output => 1,
1831             write_clusters_to_files => 1,
1832             );
1833              
1834             # By default, this constructor call will set you up for clustering based on
1835             # Euclidean distances. If you want the module to use Mahalanobis distances, your
1836             # constructor call will look like:
1837              
1838             my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
1839             mask => $mask,
1840             K => 3,
1841             cluster_seeding => 'random',
1842             use_mahalanobis_metric => 1,
1843             terminal_output => 1,
1844             write_clusters_to_files => 1,
1845             );
1846              
1847             # For both constructor calls shown above, you can use smart seeding of the clusters
1848             # by changing 'random' to 'smart' for the cluster_seeding option. See the
1849             # explanation of smart seeding in the Methods section of this documentation.
1850              
1851             # If your data is such that its variability along the different dimensions of the
1852             # data space is significantly different, you may get better clustering if you first
1853             # normalize your data by setting the constructor parameter
1854             # do_variance_normalization as shown below:
1855              
1856             my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
1857             mask => $mask,
1858             K => 3,
1859             cluster_seeding => 'smart', # or 'random'
1860             terminal_output => 1,
1861             do_variance_normalization => 1,
1862             write_clusters_to_files => 1,
1863             );
1864              
1865             # But bear in mind that such data normalization may actually decrease the
1866             # performance of the clusterer if the variability in the data is more a result of
1867             # the separation between the means than a consequence of intra-cluster variance.
1868              
1869             # Set K to 0 if you want the module to figure out the optimum number of clusters
1870             # from the data. (It is best to run this option with the terminal_output set to 1
1871             # so that you can see the different value of QoC for the different K):
1872              
1873             my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
1874             mask => $mask,
1875             K => 0,
1876             cluster_seeding => 'random', # or 'smart'
1877             terminal_output => 1,
1878             write_clusters_to_files => 1,
1879             );
1880              
1881             # Although not shown above, you can obviously set the 'do_variance_normalization'
1882             # flag here also if you wish.
1883              
1884             # For very large data files, setting K to 0 will result in searching through too
1885             # many values for K. For such cases, you can range limit the values of K to search
1886             # through by
1887              
1888             my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
1889             mask => "N111",
1890             Kmin => 3,
1891             Kmax => 10,
1892             cluster_seeding => 'random', # or 'smart'
1893             terminal_output => 1,
1894             write_clusters_to_files => 1,
1895             );
1896              
1897             # FOR ALL CASES ABOVE, YOU'D NEED TO MAKE THE FOLLOWING CALLS ON THE CLUSTERER
1898             # INSTANCE TO ACTUALLY CLUSTER THE DATA:
1899              
1900             $clusterer->read_data_from_file();
1901             $clusterer->kmeans();
1902              
1903             # If you want to directly access the clusters and the cluster centers in your own
1904             # top-level script, replace the above two statements with:
1905              
1906             $clusterer->read_data_from_file();
1907             my ($clusters_hash, $cluster_centers_hash) = $clusterer->kmeans();
1908              
1909             # You can subsequently access the clusters directly in your own code, as in:
1910              
1911             foreach my $cluster_id (sort keys %{$clusters_hash}) {
1912             print "\n$cluster_id => @{$clusters_hash->{$cluster_id}}\n";
1913             }
1914             foreach my $cluster_id (sort keys %{$cluster_centers_hash}) {
1915             print "\n$cluster_id => @{$cluster_centers_hash->{$cluster_id}}\n";
1916             }
1917              
1918              
1919             # CLUSTER VISUALIZATION:
1920              
1921             # You must first set the mask for cluster visualization. This mask tells the module
1922             # which 2D or 3D subspace of the original data space you wish to visualize the
1923             # clusters in:
1924              
1925             my $visualization_mask = "111";
1926             $clusterer->visualize_clusters($visualization_mask);
1927              
1928              
1929             # SYNTHETIC DATA GENERATION:
1930              
1931             # The module has been provided with a class method for generating multivariate data
1932             # for experimenting with clustering. The data generation is controlled by the
1933             # contents of the parameter file that is supplied as an argument to the data
1934             # generator method. The mean and covariance matrix entries in the parameter file
1935             # must be according to the syntax shown in the param.txt file in the examples
1936             # directory. It is best to edit this file as needed:
1937              
1938             my $parameter_file = "param.txt";
1939             my $out_datafile = "mydatafile.dat";
1940             Algorithm::KMeans->cluster_data_generator(
1941             input_parameter_file => $parameter_file,
1942             output_datafile => $out_datafile,
1943             number_data_points_per_cluster => $N );
1944              
1945             =head1 CHANGES
1946              
1947             Version 2.03 incorporates minor code cleanup. The main implementation of the module
1948             remains unchanged.
1949              
1950             Version 2.02 downshifts the version of Perl that is required for this module. The
1951             module should work with versions 5.10 and higher of Perl. The implementation code
1952             for the module remains unchanged.
1953              
1954             Version 2.01 removes many errors in the documentation. The changes made to the module
1955             in Version 2.0 were not reflected properly in the documentation page for that
1956             version. The implementation code remains unchanged.
1957              
1958             Version 2.0 includes significant additional functionality: (1) You now have the
1959             option to cluster using the Mahalanobis distance metric (the default is the Euclidean
1960             metric); and (2) With the two C methods that have been added to the
1961             module, you can now determine the best cluster for a new data sample after you have
1962             created the clusters with the previously available data. Finding the best cluster
1963             for a new data sample can be done using either the Euclidean metric or the
1964             Mahalanobis metric.
1965              
1966             Version 1.40 includes a C option for seeding the clusters. This option,
1967             supplied through the constructor parameter C, means that the
1968             clusterer will (1) Subject the data to principal components analysis in order to
1969             determine the maximum variance direction; (2) Project the data onto this direction;
1970             (3) Find peaks in a smoothed histogram of the projected points; and (4) Use the
1971             locations of the highest peaks as initial guesses for the cluster centers. If you
1972             don't want to use this option, set C to C. That should work
1973             as in the previous version of the module.
1974              
1975             Version 1.30 includes a bug fix for the case when the datafile contains empty lines,
1976             that is, lines with no data records. Another bug fix in Version 1.30 deals with the
1977             case when you want the module to figure out how many clusters to form (this is the
1978             C option in the constructor call) and the number of data records is close to the
1979             minimum.
1980              
1981             Version 1.21 includes fixes to handle the possibility that, when clustering the data
1982             for a fixed number of clusters, a cluster may become empty during iterative
1983             calculation of cluster assignments of the data elements and the updating of the
1984             cluster centers. The code changes are in the C and
1985             C subroutines.
1986              
1987             Version 1.20 includes an option to normalize the data with respect to its variability
1988             along the different coordinates before clustering is carried out.
1989              
1990             Version 1.1.1 allows for range limiting the values of C to search through. C
1991             stands for the number of clusters to form. This version also declares the module
1992             dependencies in the C file.
1993              
1994             Version 1.1 is a an object-oriented version of the implementation presented in
1995             version 1.0. The current version should lend itself more easily to code extension.
1996             You could, for example, create your own class by subclassing from the class presented
1997             here and, in your subclass, use your own criteria for the similarity distance between
1998             the data points and for the QoC (Quality of Clustering) metric, and, possibly a
1999             different rule to stop the iterations. Version 1.1 also allows you to directly
2000             access the clusters formed and the cluster centers in your calling script.
2001              
2002              
2003             =head1 SPECIAL USAGE NOTE
2004              
2005             If you were directly accessing in your own scripts the clusters produced by the older
2006             versions of this module, you'd need to make changes to your code if you wish to use
2007             Version 2.0 or higher. Instead of returning arrays of clusters and cluster centers,
2008             Versions 2.0 and higher return hashes. This change was made necessary by the logic
2009             required for implementing the two new C methods that were introduced
2010             in Version 2.0. These methods return the best cluster for a new data sample from the
2011             clusters you created using the existing data. One of the C methods is
2012             based on the Euclidean metric for finding the cluster that is closest to the new data
2013             sample, and the other on the Mahalanobis metric. Another point of incompatibility
2014             with the previous versions is that you must now explicitly set the C
2015             parameter in the call to the constructor to either C or C. This
2016             parameter does not have a default associated with it starting with Version 2.0.
2017              
2018              
2019             =head1 DESCRIPTION
2020              
2021             Clustering with K-Means takes place iteratively and involves two steps: 1) assignment
2022             of data samples to clusters on the basis of how far the data samples are from the
2023             cluster centers; and 2) Recalculation of the cluster centers (and cluster covariances
2024             if you are using the Mahalanobis distance metric for clustering).
2025              
2026             Obviously, before the two-step approach can proceed, we need to initialize the the
2027             cluster centers. How this initialization is carried out is important. The module
2028             gives you two very different ways for carrying out this initialization. One option,
2029             called the C option, consists of subjecting the data to principal components
2030             analysis to discover the direction of maximum variance in the data space. The data
2031             points are then projected on to this direction and a histogram constructed from the
2032             projections. Centers of the smoothed histogram are used to seed the clustering
2033             operation. The other option is to choose the cluster centers purely randomly. You
2034             get the first option if you set C to C in the constructor,
2035             and you get the second option if you set it to C.
2036              
2037             How to specify the number of clusters, C, is one of the most vexing issues in any
2038             approach to clustering. In some case, we can set C on the basis of prior
2039             knowledge. But, more often than not, no such prior knowledge is available. When the
2040             programmer does not explicitly specify a value for C, the approach taken in the
2041             current implementation is to try all possible values between 2 and some largest
2042             possible value that makes statistical sense. We then choose that value for C
2043             which yields the best value for the QoC (Quality of Clustering) metric. It is
2044             generally believed that the largest value for C should not exceed C
2045             where C is the number of data samples to be clustered.
2046              
2047             What to use for the QoC metric is obviously a critical issue unto itself. In the
2048             current implementation, the value of QoC is the ratio of the average radius of the
2049             clusters and the average distance between the cluster centers.
2050              
2051             Every iterative algorithm requires a stopping criterion. The criterion implemented
2052             here is that we stop iterations when there is no re-assignment of the data points
2053             during the assignment step.
2054              
2055             Ordinarily, the output produced by a K-Means clusterer will correspond to a local
2056             minimum for the QoC values, as opposed to a global minimum. The current
2057             implementation protects against that when the module constructor is called with the
2058             C option for C by trying different randomly selected initial
2059             cluster centers and then selecting the one that gives the best overall QoC value.
2060              
2061             A K-Means clusterer will generally produce good results if the overlap between the
2062             clusters is minimal and if each cluster exhibits variability that is uniform in all
2063             directions. When the data variability is different along the different directions in
2064             the data space, the results you obtain with a K-Means clusterer may be improved by
2065             first normalizing the data appropriately, as can be done in this module when you set
2066             the C option in the constructor. However, as pointed out
2067             elsewhere in this documentation, such normalization may actually decrease the
2068             performance of the clusterer if the overall data variability along any dimension is
2069             more a result of separation between the means than a consequence of intra-cluster
2070             variability.
2071              
2072              
2073             =head1 METHODS
2074              
2075             The module provides the following methods for clustering, for cluster visualization,
2076             for data visualization, for the generation of data for testing a clustering
2077             algorithm, and for determining the cluster membership of a new data sample:
2078              
2079             =over 4
2080              
2081             =item B
2082              
2083             my $clusterer = Algorithm::KMeans->new(datafile => $datafile,
2084             mask => $mask,
2085             K => $K,
2086             cluster_seeding => 'random', # also try 'smart'
2087             use_mahalanobis_metric => 1, # also try '0'
2088             terminal_output => 1,
2089             write_clusters_to_files => 1,
2090             );
2091              
2092             A call to C constructs a new instance of the C class. When
2093             C<$K> is a non-zero positive integer, the module will construct exactly that many
2094             clusters. However, when C<$K> is 0, the module will find the best number of clusters
2095             to partition the data into. As explained in the Description, setting
2096             C to C causes PCA (principal components analysis) to be used
2097             for discovering the best choices for the initial cluster centers. If you want purely
2098             random decisions to be made for the initial choices for the cluster centers, set
2099             C to C.
2100              
2101             The data file is expected to contain entries in the following format
2102              
2103             c20 0 10.7087017086940 9.63528386251712 10.9512155258108 ...
2104             c7 0 12.8025925026787 10.6126270065785 10.5228482095349 ...
2105             b9 0 7.60118206283120 5.05889245193079 5.82841781759102 ...
2106             ....
2107             ....
2108              
2109             where the first column contains the symbolic ID tag for each data record and the rest
2110             of the columns the numerical information. As to which columns are actually used for
2111             clustering is decided by the string value of the mask. For example, if we wanted to
2112             cluster on the basis of the entries in just the 3rd, the 4th, and the 5th columns
2113             above, the mask value would be C where the character C indicates that the
2114             ID tag is in the first column, the character C<0> that the second column is to be
2115             ignored, and the C<1>'s that follow that the 3rd, the 4th, and the 5th columns are to
2116             be used for clustering.
2117              
2118             If you wish for the clusterer to search through a C<(Kmin,Kmax)> range of values for
2119             C, the constructor should be called in the following fashion:
2120              
2121             my $clusterer = Algorithm::KMeans->new(datafile => $datafile,
2122             mask => $mask,
2123             Kmin => 3,
2124             Kmax => 10,
2125             cluster_seeding => 'smart', # try 'random' also
2126             terminal_output => 1,
2127             );
2128              
2129             where obviously you can choose any reasonable values for C and C. If you
2130             choose a value for C that is statistically too large, the module will let you
2131             know. Again, you may choose C for C, the default value being
2132             C.
2133              
2134             If you believe that the variability of the data is very different along the different
2135             dimensions of the data space, you may get better clustering by first normalizing the
2136             data coordinates by the standard-deviations along those directions. When you set the
2137             constructor option C as shown below, the module uses the
2138             overall data standard-deviation along a direction for the normalization in that
2139             direction. (As mentioned elsewhere in the documentation, such a normalization could
2140             backfire on you if the data variability along a dimension is more a result of the
2141             separation between the means than a consequence of the intra-cluster variability.):
2142              
2143             my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
2144             mask => "N111",
2145             K => 2,
2146             cluster_seeding => 'smart', # try 'random' also
2147             terminal_output => 1,
2148             do_variance_normalization => 1,
2149             );
2150              
2151             =back
2152              
2153             =head2 Constructor Parameters
2154              
2155             =over 8
2156              
2157             =item C:
2158              
2159             This parameter names the data file that contains the multidimensional data records
2160             you want the module to cluster.
2161              
2162             =item C:
2163              
2164             This parameter supplies the mask to be applied to the columns of your data file. See
2165             the explanation in Synopsis for what this mask looks like.
2166              
2167             =item C:
2168              
2169             This parameter supplies the number of clusters you are looking for. If you set this
2170             option to 0, that means that you want the module to search for the best value for
2171             C. (Keep in mind the fact that searching for the best C may take a long time
2172             for large data files.)
2173              
2174             =item C:
2175              
2176             If you supply an integer value for C, the search for the best C will begin
2177             with that value.
2178              
2179             =item C:
2180              
2181             If you supply an integer value for C, the search for the best C will end at
2182             that value.
2183              
2184             =item C:
2185              
2186             This parameter must be set to either C or C. Depending on your data,
2187             you may get superior clustering with the C option. The choice C means
2188             that the clusterer will (1) subject the data to principal components analysis to
2189             determine the maximum variance direction; (2) project the data onto this direction;
2190             (3) find peaks in a smoothed histogram of the projected points; and (4) use the
2191             locations of the highest peaks as seeds for cluster centers. If the C option
2192             produces bizarre results, try C.
2193              
2194             =item C:
2195              
2196             When set to 1, this option causes Mahalanobis distances to be used for clustering.
2197             The default is 0 for this parameter. By default, the module uses the Euclidean
2198             distances for clustering. In general, Mahalanobis distance based clustering will
2199             fail if you seek too many clusters, your data dimensionality is high, and you do not
2200             have a sufficient number of samples in your data file. A necessary requirement for
2201             the module to be able to compute Mahalanobis distances is that the cluster covariance
2202             matrices be non-singular. (Let's say your data dimensionality is C and the module
2203             is considering a cluster that has only C samples in it where C is less than
2204             C. In this case, the covariance matrix will be singular since its rank will not
2205             exceed C. For the covariance matrix to be non-singular, it must be of full rank,
2206             that is, its rank must be C.)
2207              
2208             =item C:
2209              
2210             When set, the module will first normalize the data variance along the different
2211             dimensions of the data space before attempting clustering. Depending on your data,
2212             this option may or may not result in better clustering.
2213              
2214             =item C:
2215              
2216             This boolean parameter, when not supplied in the call to C, defaults to 0.
2217             When set, you will see in your terminal window the different clusters as lists of the
2218             symbolic IDs and their cluster centers. You will also see the QoC (Quality of
2219             Clustering) values for the clusters displayed.
2220              
2221             =item C:
2222              
2223             This parameter is also boolean. When set to 1, the clusters are written out to files
2224             that are named in the following manner:
2225              
2226             cluster0.dat
2227             cluster1.dat
2228             cluster2.dat
2229             ...
2230             ...
2231              
2232             Before the clusters are written to these files, the module destroys all files with
2233             such names in the directory in which you call the module.
2234              
2235              
2236             =back
2237              
2238             =over
2239              
2240             =item B
2241              
2242             $clusterer->read_data_from_file()
2243              
2244             =item B
2245              
2246             $clusterer->kmeans();
2247              
2248             or
2249              
2250             my ($clusters_hash, $cluster_centers_hash) = $clusterer->kmeans();
2251              
2252             The first call above works solely by side-effect. The second call also returns the
2253             clusters and the cluster centers. See the C script in the
2254             C directory for how you can in your own code extract the clusters and the
2255             cluster centers from the variables C<$clusters_hash> and C<$cluster_centers_hash>.
2256              
2257             =item B
2258              
2259             $clusterer->get_K_best();
2260              
2261             This call makes sense only if you supply either the C option to the constructor,
2262             or if you specify values for the C and C options. The C and the
2263             C<(Kmin,Kmax)> options cause the module to determine the best value for C.
2264             Remember, C is the number of clusters the data is partitioned into.
2265              
2266             =item B
2267              
2268             $clusterer->show_QoC_values();
2269              
2270             presents a table with C values in the left column and the corresponding QoC
2271             (Quality-of-Clustering) values in the right column. Note that this call makes sense
2272             only if you either supply the C option to the constructor, or if you specify
2273             values for the C and C options.
2274              
2275             =item B
2276              
2277             $clusterer->visualize_clusters( $visualization_mask )
2278              
2279             The visualization mask here does not have to be identical to the one used for
2280             clustering, but must be a subset of that mask. This is convenient for visualizing
2281             the clusters in two- or three-dimensional subspaces of the original space.
2282              
2283             =item B
2284              
2285             $clusterer->visualize_data($visualization_mask, 'original');
2286              
2287             $clusterer->visualize_data($visualization_mask, 'normed');
2288              
2289             This method requires a second argument and, as shown, it must be either the string
2290             C or the string C, the former for the visualization of the raw data
2291             and the latter for the visualization of the data after its different dimensions are
2292             normalized by the standard-deviations along those directions. If you call the method
2293             with the second argument set to C, but do so without turning on the
2294             C option in the KMeans constructor, it will let you know.
2295              
2296              
2297             =item B
2298              
2299             If you wish to determine the cluster membership of a new data sample after you have
2300             created the clusters with the existing data samples, you would need to call this
2301             method. The C script in the C directory
2302             shows how to use this method.
2303              
2304              
2305             =item B
2306              
2307             This does the same thing as the previous method, except that it determines the
2308             cluster membership using the Mahalanobis distance metric. As for the previous
2309             method, see the C script in the C directory
2310             for how to use this method.
2311              
2312              
2313             =item B
2314              
2315             Algorithm::KMeans->cluster_data_generator(
2316             input_parameter_file => $parameter_file,
2317             output_datafile => $out_datafile,
2318             number_data_points_per_cluster => 20 );
2319              
2320             for generating multivariate data for clustering if you wish to play with synthetic
2321             data for clustering. The input parameter file contains the means and the variances
2322             for the different Gaussians you wish to use for the synthetic data. See the file
2323             C provided in the examples directory. It will be easiest for you to just
2324             edit this file for your data generation needs. In addition to the format of the
2325             parameter file, the main constraint you need to observe in specifying the parameters
2326             is that the dimensionality of the covariance matrix must correspond to the
2327             dimensionality of the mean vectors. The multivariate random numbers are generated by
2328             calling the C module. As you would expect, this module requires that
2329             the covariance matrices you specify in your parameter file be symmetric and positive
2330             definite. Should the covariances in your parameter file not obey this condition, the
2331             C module will let you know.
2332              
2333             =back
2334              
2335             =head1 HOW THE CLUSTERS ARE OUTPUT
2336              
2337             When the option C is set in the call to the constructor, the
2338             clusters are displayed on the terminal screen.
2339              
2340             When the option C is set in the call to the constructor, the
2341             module dumps the clusters in files named
2342              
2343             cluster0.dat
2344             cluster1.dat
2345             cluster2.dat
2346             ...
2347             ...
2348              
2349             in the directory in which you execute the module. The number of such files will
2350             equal the number of clusters formed. All such existing files in the directory are
2351             destroyed before any fresh ones are created. Each cluster file contains the symbolic
2352             ID tags of the data samples in that cluster.
2353              
2354             The module also leaves in your directory a printable `.png' file that is a point plot
2355             of the different clusters. The name of this file is always C.
2356              
2357             Also, as mentioned previously, a call to C in your own code will return the
2358             clusters and the cluster centers.
2359              
2360             =head1 REQUIRED
2361              
2362             This module requires the following three modules:
2363              
2364             Math::Random
2365             Graphics::GnuplotIF
2366             Math::GSL
2367              
2368             With regard to the third item above, what is actually required is the
2369             C module. However, that module is a part of the C
2370             distribution. The acronym GSL stands for the GNU Scientific Library. C is
2371             a Perl interface to the GSL C-based library.
2372              
2373              
2374             =head1 THE C DIRECTORY
2375              
2376             The C directory contains several scripts to help you become familiar with
2377             this module. The following script is an example of how the module can be expected to
2378             be used most of the time. It calls for clustering to be carried out with a fixed
2379             C:
2380              
2381             cluster_and_visualize.pl
2382              
2383             The more time you spend with this script, the more comfortable you will become with
2384             the use of this module. The script file contains a large comment block that mentions
2385             six locations in the script where you have to make decisions about how to use the
2386             module.
2387              
2388             See the following script if you do not know what value to use for C for clustering
2389             your data:
2390              
2391             find_best_K_and_cluster.pl
2392              
2393             This script uses the C option in the constructor that causes the module to
2394             search for the best C for your data. Since this search is virtually unbounded ---
2395             limited only by the number of samples in your data file --- the script may take a
2396             long time to run for a large data file. Hence the next script.
2397              
2398             If your datafile is too large, you may need to range limit the values of C that
2399             are searched through, as in the following script:
2400              
2401             find_best_K_in_range_and_cluster.pl
2402              
2403             If you also want to include data normalization (it may reduce the performance of the
2404             clusterer in some cases), see the following script:
2405              
2406             cluster_after_data_normalization.pl
2407              
2408             When you include the data normalization step and you would like to visualize the data
2409             before and after normalization, see the following script:
2410              
2411             cluster_and_visualize_with_data_visualization.pl*
2412              
2413             After you are done clustering, let's say you want to find the cluster membership of a
2414             new data sample. To see how you can do that, see the script:
2415              
2416             which_cluster_for_new_data.pl
2417              
2418             This script returns two answers for which cluster a new data sample belongs to: one
2419             using the Euclidean metric to calculate the distances between the new data sample and
2420             the cluster centers, and the other using the Mahalanobis metric. If the clusters are
2421             strongly elliptical in shape, you are likely to get better results with the
2422             Mahalanobis metric. (To see that you can get two different answers using the two
2423             different distance metrics, run the C script on the
2424             data in the file C. To make this run, note that you have to comment
2425             out and uncomment the lines at four different locations in the script.)
2426              
2427             The C directory also contains the following support scripts:
2428              
2429             For generating the data for experiments with clustering:
2430              
2431             data_generator.pl
2432              
2433             For cleaning up the examples directory:
2434              
2435             cleanup_directory.pl
2436              
2437             The examples directory also includes a parameter file, C, for generating
2438             synthetic data for clustering. Just edit this file if you would like to generate
2439             your own multivariate data for clustering. The parameter file is for the 3D case,
2440             but you can generate data with any dimensionality through appropriate entries in the
2441             parameter file.
2442              
2443             =head1 EXPORT
2444              
2445             None by design.
2446              
2447             =head1 CAVEATS
2448              
2449             K-Means based clustering usually does not work well when the clusters are strongly
2450             overlapping and when the extent of variability along the different dimensions is
2451             different for the different clusters. The module does give you the ability to
2452             normalize the variability in your data with the constructor option
2453             C. However, as described elsewhere, this may actually
2454             reduce the performance of the clusterer if the data variability along a direction is
2455             more a result of the separation between the means than because of intra-cluster
2456             variability. For better clustering with difficult-to-cluster data, you could try
2457             using the author's C module.
2458              
2459             =head1 BUGS
2460              
2461             Please notify the author if you encounter any bugs. When sending email, please place
2462             the string 'KMeans' in the subject line.
2463              
2464             =head1 INSTALLATION
2465              
2466             Download the archive from CPAN in any directory of your choice. Unpack the archive
2467             with a command that on a Linux machine would look like:
2468              
2469             tar zxvf Algorithm-KMeans-2.03.tar.gz
2470              
2471             This will create an installation directory for you whose name will be
2472             C. Enter this directory and execute the following commands
2473             for a standard install of the module if you have root privileges:
2474              
2475             perl Makefile.PL
2476             make
2477             make test
2478             sudo make install
2479              
2480             If you do not have root privileges, you can carry out a non-standard install the
2481             module in any directory of your choice by:
2482              
2483             perl Makefile.PL prefix=/some/other/directory/
2484             make
2485             make test
2486             make install
2487              
2488             With a non-standard install, you may also have to set your PERL5LIB environment
2489             variable so that this module can find the required other modules. How you do that
2490             would depend on what platform you are working on. In order to install this module in
2491             a Linux machine on which I use tcsh for the shell, I set the PERL5LIB environment
2492             variable by
2493              
2494             setenv PERL5LIB /some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
2495              
2496             If I used bash, I'd need to declare:
2497              
2498             export PERL5LIB=/some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
2499              
2500              
2501             =head1 THANKS
2502              
2503             I thank Slaven for pointing out that I needed to downshift the required version of Perl
2504             for this module. Fortunately, I had access to an old machine still running Perl
2505             5.10.1. The current version, 2.02, is based on my testing the module on that machine.
2506              
2507             I added two C methods in Version 2.0 as a result of an email from
2508             Jerome White who expressed a need for such methods in order to determine the best
2509             cluster for a new data record after you have successfully clustered your existing
2510             data. Thanks Jerome for your feedback!
2511              
2512             It was an email from Nadeem Bulsara that prompted me to create Version 1.40 of this
2513             module. Working with Version 1.30, Nadeem noticed that occasionally the module would
2514             produce variable clustering results on the same dataset. I believe that this
2515             variability was caused (at least partly) by the purely random mode that was used in
2516             Version 1.30 for the seeding of the cluster centers. Version 1.40 now includes a
2517             C mode. With the new mode the clusterer uses a PCA (Principal Components
2518             Analysis) of the data to make good guesses for the cluster centers. However,
2519             depending on how the data is jumbled up, it is possible that the new mode will not
2520             produce uniformly good results in all cases. So you can still use the old mode by
2521             setting C to C in the constructor. Thanks Nadeem for your
2522             feedback!
2523              
2524             Version 1.30 resulted from Martin Kalin reporting problems with a very small data
2525             set. Thanks Martin!
2526              
2527             Version 1.21 came about in response to the problems encountered by Luis Fernando
2528             D'Haro with version 1.20. Although the module would yield the clusters for some of
2529             its runs, more frequently than not the module would abort with an "empty cluster"
2530             message for his data. Luis Fernando has also suggested other improvements (such as
2531             clustering directly from the contents of a hash) that I intend to make in future
2532             versions of this module. Thanks Luis Fernando.
2533              
2534             Chad Aeschliman was kind enough to test out the interface of this module and to give
2535             suggestions for its improvement. His key slogan: "If you cannot figure out how to
2536             use a module in under 10 minutes, it's not going to be used." That should explain
2537             the longish Synopsis included here.
2538              
2539             =head1 AUTHOR
2540              
2541             Avinash Kak, kak@purdue.edu
2542              
2543             If you send email, please place the string "KMeans" in your subject line to get past
2544             my spam filter.
2545              
2546             =head1 COPYRIGHT
2547              
2548             This library is free software; you can redistribute it and/or modify it under the
2549             same terms as Perl itself.
2550              
2551             Copyright 2014 Avinash Kak
2552              
2553             =cut
2554