| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package Algorithm::KMeans; | 
| 2 |  |  |  |  |  |  |  | 
| 3 |  |  |  |  |  |  | #------------------------------------------------------------------------------------ | 
| 4 |  |  |  |  |  |  | # Copyright (c) 2014 Avinash Kak. All rights reserved.  This program is free | 
| 5 |  |  |  |  |  |  | # software.  You may modify and/or distribute it under the same terms as Perl itself. | 
| 6 |  |  |  |  |  |  | # This copyright notice must remain attached to the file. | 
| 7 |  |  |  |  |  |  | # | 
| 8 |  |  |  |  |  |  | # Algorithm::KMeans is a Perl module for clustering multidimensional data. | 
| 9 |  |  |  |  |  |  | # ----------------------------------------------------------------------------------- | 
| 10 |  |  |  |  |  |  |  | 
| 11 | 1 |  |  | 1 |  | 15887 | use 5.10.0; | 
|  | 1 |  |  |  |  | 3 |  | 
|  | 1 |  |  |  |  | 40 |  | 
| 12 | 1 |  |  | 1 |  | 3 | use strict; | 
|  | 1 |  |  |  |  | 1 |  | 
|  | 1 |  |  |  |  | 55 |  | 
| 13 | 1 |  |  | 1 |  | 4 | use warnings; | 
|  | 1 |  |  |  |  | 5 |  | 
|  | 1 |  |  |  |  | 37 |  | 
| 14 | 1 |  |  | 1 |  | 4 | use Carp; | 
|  | 1 |  |  |  |  | 1 |  | 
|  | 1 |  |  |  |  | 74 |  | 
| 15 | 1 |  |  | 1 |  | 5 | use File::Basename; | 
|  | 1 |  |  |  |  | 1 |  | 
|  | 1 |  |  |  |  | 66 |  | 
| 16 | 1 |  |  | 1 |  | 555 | use Math::Random; | 
|  | 1 |  |  |  |  | 4937 |  | 
|  | 1 |  |  |  |  | 93 |  | 
| 17 | 1 |  |  | 1 |  | 536 | use Graphics::GnuplotIF; | 
|  | 1 |  |  |  |  | 8773 |  | 
|  | 1 |  |  |  |  | 53 |  | 
| 18 | 1 |  |  | 1 |  | 966 | use Math::GSL::Matrix; | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 19 |  |  |  |  |  |  |  | 
| 20 |  |  |  |  |  |  |  | 
our $VERSION = '2.03';

# from Perl docs:
# Matches an optionally signed integer or floating-point literal (with an
# optional exponent), e.g. "-3", "+ 2.5", ".7e-2".  Used when reading the
# data file to sanity-check fields selected by the mask as numeric data.
my $_num_regex =  '^[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?$';
| 25 |  |  |  |  |  |  |  | 
# Constructor:
#
# Accepts keyword options and returns a blessed Algorithm::KMeans instance.
# The 'datafile', 'mask', and 'cluster_seeding' options are mandatory; every
# other option falls back to a default.  An unrecognized option name causes
# the constructor to croak (via check_for_illegal_params()).
sub new {
    my ($class, %args) = @_;
    croak "\nYou have used a wrong name for a keyword argument " .
          "--- perhaps a misspelling\n"
        if check_for_illegal_params(keys %args) == 0;
    # Fields beginning with a fixed default describe internal state that is
    # filled in later by read_data_from_file() and kmeans().
    my %instance = (
        _datafile                 =>   $args{datafile} || croak("datafile required"),
        _mask                     =>   $args{mask}     || croak("mask required"),
        _K                        =>   $args{K}        || 0,
        _K_min                    =>   $args{Kmin}     || 'unknown',
        _K_max                    =>   $args{Kmax}     || 'unknown',
        _cluster_seeding          =>   $args{cluster_seeding}
                                           || croak("must choose smart or random for cluster seeding"),
        _var_normalize            =>   $args{do_variance_normalization} || 0,
        _use_mahalanobis_metric   =>   $args{use_mahalanobis_metric}    || 0,
        _clusters_2_files         =>   $args{write_clusters_to_files}   || 0,
        _terminal_output          =>   $args{terminal_output}           || 0,
        _debug                    =>   $args{debug}                     || 0,
        _N                        =>   0,
        _K_best                   =>   'unknown',
        _original_data            =>   {},
        _data                     =>   {},
        _data_id_tags             =>   [],
        _QoC_values               =>   {},
        _clusters                 =>   [],
        _cluster_centers          =>   [],
        _clusters_hash            =>   {},
        _cluster_centers_hash     =>   {},
        _cluster_covariances_hash =>   {},
        _data_dimensions          =>   0,
    );
    return bless \%instance, $class;
}
| 61 |  |  |  |  |  |  |  | 
# Reads and parses the data file supplied to the constructor.  The mask
# string (e.g. "N111") says, per whitespace-separated column: 'N' = record
# label, '1' = use the column as a data coordinate, '0' = ignore it.
# Populates $self->{_original_data}, $self->{_data} (variance-normalized if
# requested), $self->{_data_id_tags} and $self->{_N}.
sub read_data_from_file {
    my $self = shift;
    my $datafile = $self->{_datafile};
    my @mask = split //, $self->{_mask};
    $self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
    print "data dimensionality:  $self->{_data_dimensions} \n" if $self->{_terminal_output};
    # BUGFIX: the <FH> diamond read had been lost from this statement, so no
    # data was ever read.  Also switched from the unsafe two-argument
    # bareword open to the three-argument form with a lexical filehandle.
    open my $INPUT, '<', $datafile or die "unable to open file $datafile: $!\n";
    chomp( my @raw_data = <$INPUT> );
    close $INPUT;
    # Transform strings into number data
    foreach my $record (@raw_data) {
        next unless $record;          # skip blank lines
        next if $record =~ /^#/;      # skip comment lines
        my @data_fields;
        my @fields = split /\s+/, $record;
        die "\nABORTED: Mask size does not correspond to row record size\n" if $#fields != $#mask;
        my $record_id;
        foreach my $i (0..@fields-1) {
            if ($mask[$i] eq '0') {
                next;
            } elsif ($mask[$i] eq 'N') {
                $record_id = $fields[$i];
            } elsif ($mask[$i] eq '1') {
                push @data_fields, $fields[$i];
            } else {
                die "misformed mask for reading the data file\n";
            }
        }
        # NOTE(review): this match only exercises the numeric regex; it does
        # not reject or convert non-numeric fields.  Kept as-is to avoid
        # changing behavior on existing data files.
        my @nums = map {/$_num_regex/;$_} @data_fields;
        $self->{_original_data}->{ $record_id } = \@nums;
    }
    if ($self->{_var_normalize}) {
        $self->{_data} =  variance_normalization( $self->{_original_data} );
    } else {
        $self->{_data} = deep_copy_hash( $self->{_original_data} );
    }
    my @all_data_ids = keys %{$self->{_data}};
    $self->{_data_id_tags} = \@all_data_ids;
    $self->{_N} = scalar @all_data_ids;
    if ( defined($self->{_K}) && ($self->{_K} > 0) ) {
        # Clusters are statistically meaningful only when N > 2*K**2.
        carp "\n\nWARNING: YOUR K VALUE IS TOO LARGE.\n The number of data " .
             "points must satisfy the relation N > 2xK**2 where K is " .
             "the number of clusters requested for the clusters to be " .
             "meaningful $!"
            if ( $self->{_N} < (2 * $self->{_K} ** 2) );
        print "\n\n\n";
    }
    # Fixed seed so repeated runs on the same data are reproducible.
    srand(123456789);
}
| 112 |  |  |  |  |  |  |  | 
# Top-level clustering entry point.  Dispatches on the constructor options:
# K == 0 or a (Kmin, Kmax) range triggers the automatic search over K via
# iterate_through_K(); a fixed positive K runs either the random-restart or
# the smart (PCA-seeded) clustering.  On success, fills the per-cluster
# hashes (clusters, centers, covariances) and returns the first two.
sub kmeans {
    my $self = shift;
    my $K = $self->{_K};
    my $range_was_given = ($self->{_K_min} ne 'unknown') &&
                          ($self->{_K_max} ne 'unknown');
    if ( ($K == 0) || $range_was_given ) {
        eval { $self->iterate_through_K() };
        die "in kmeans(): $@" if ($@);
    } elsif ( $K =~ /\d+/ ) {
        eval {
            if ($self->{_cluster_seeding} eq 'random') {
                $self->cluster_for_fixed_K_multiple_random_tries($K);
            } elsif ($self->{_cluster_seeding} eq 'smart') {
                $self->cluster_for_fixed_K_single_smart_try($K);
            }
        };
        die "in kmeans(): $@" if ($@);
    } else {
        croak "Incorrect call syntax used.  See documentation.\n";
    }
    croak "Clustering failed.  Try again."
        unless (defined $self->{_clusters}) && (defined $self->{_cluster_centers});
    # Re-key the positional results as "cluster0", "cluster1", ... hashes.
    my $last_index = @{$self->{_clusters}} - 1;
    for my $idx (0 .. $last_index) {
        my $label = "cluster$idx";
        $self->{_clusters_hash}->{$label}        = $self->{_clusters}->[$idx];
        $self->{_cluster_centers_hash}->{$label} = $self->{_cluster_centers}->[$idx];
        $self->{_cluster_covariances_hash}->{$label} =
            $self->estimate_cluster_covariance($self->{_clusters}->[$idx]);
    }
    $self->write_clusters_to_files( $self->{_clusters} ) if $self->{_clusters_2_files};
    return ($self->{_clusters_hash}, $self->{_cluster_centers_hash});
}
| 147 |  |  |  |  |  |  |  | 
# This is the subroutine to call if you prefer purely random choices for the
# initial seeding of the cluster centers.  Runs 20 independent random-seeded
# clustering attempts for the given K and keeps the one with the smallest
# (best) QoC value.  A try that dies, or that produces at most one cluster,
# is simply discarded.
#
# Cleanup: removed dead locals that the original declared but never read
# (@all_data_ids, @QoC_values, @array_of_clusters, @array_of_cluster_centers,
# and an outer $new_clusters/$new_cluster_centers pair that was immediately
# shadowed inside the loop).
sub cluster_for_fixed_K_multiple_random_tries {
    my $self = shift;
    my $K = shift;
    my $QoC;               # best (smallest) quality-of-clustering seen so far
    my $clusters;
    my $cluster_centers;
    print "Clustering for K = $K\n" if $self->{_terminal_output};
    foreach my $trial (1..20) {
        print ". ";
        my ($new_clusters, $new_cluster_centers);
        eval {
            ($new_clusters, $new_cluster_centers) = $self->cluster_for_given_K($K);
        };
        next if $@;                      # this try failed; move on
        next if @$new_clusters <= 1;     # degenerate result; move on
        my $newQoC = $self->cluster_quality( $new_clusters, $new_cluster_centers );
        if ( (!defined $QoC) || ($newQoC < $QoC) ) {
            $QoC = $newQoC;
            $clusters = deep_copy_AoA( $new_clusters );
            $cluster_centers = deep_copy_AoA( $new_cluster_centers );
        }
    }
    $self->{_clusters} = $clusters;
    $self->{_cluster_centers} = $cluster_centers;
    $self->{_QoC_values}->{"$K"} = $QoC;
    if ($self->{_terminal_output}) {
        print "\nDisplaying final clusters for best K (= $K) :\n";
        display_clusters( $clusters );
        $self->display_cluster_centers( $clusters );
        print "\nQoC value (the smaller, the better): $QoC\n";
    }
}
| 188 |  |  |  |  |  |  |  | 
# For the smart try, we do initial cluster seeding by subjecting the data to
# principal components analysis in order to discover the direction of maximum
# variance in the data space.  Subsequently, we try to find the K largest peaks
# along this direction.  The coordinates of these peaks serve as the seeds for
# the K clusters.  Unlike the random-seeded variant, only a single clustering
# attempt is made; its results and QoC value are stored on the object.
sub cluster_for_fixed_K_single_smart_try {
    my ($self, $K) = @_;
    print "Clustering for K = $K\n" if $self->{_terminal_output};
    my ($clusters, $cluster_centers);
    eval { ($clusters, $cluster_centers) = $self->cluster_for_given_K($K) };
    die "In cluster_for_fixed_K_single_smart_try:  insufficient data for clustering " .
        "with $self->{_K} clusters --- $@" if $@;
    my $QoC = $self->cluster_quality( $clusters, $cluster_centers );
    $self->{_clusters}           = $clusters;
    $self->{_cluster_centers}    = $cluster_centers;
    $self->{_QoC_values}->{"$K"} = $QoC;
    if ($self->{_terminal_output}) {
        print "\nDisplaying final clusters for best K (= $K) :\n";
        display_clusters( $clusters );
        $self->display_cluster_centers( $clusters );
        print "\nQoC value (the smaller, the better): $QoC\n";
    }
}
| 218 |  |  |  |  |  |  |  | 
# The following subroutine is the top-level routine to call if you want the
# system to figure out on its own what value to use for K, the number of
# clusters.  If you call the KMeans constructor with the K=0 option, the method
# below will try every possible value of K between 2 and the maximum possible
# depending on the number of data points available. For example, if the number
# of data points is 10,000, it will try all possible values of K between 2 and
# 70. For how the maximum value is set for K, see the comments made under
# Description.  Note also how this method makes 20 different tries for each
# value of K as a defense against the problem of the final result
# corresponding to some local minimum in the values of the QoC metric.  Out of
# these 20 tries for each K, it retains the clusters and the cluster centers
# for only that try that yields the smallest value for the QoC metric.  After
# estimating the "best" QoC values for all possible K in this manner, it then
# finds the K for which the QoC is the minimum.  This is taken to be the best
# value for K.  Finally, the output clusters are written out to separate files.
#
# If the KMeans constructor is invoked with the (Kmin, Kmax) options, then,
# instead of iterating through 2 and the maximum permissible value for K, the
# iterations are carried out only between Kmin and Kmax.
sub iterate_through_K {
    my $self = shift;
    my $N = $self->{_N};
    croak "You need more than 8 data samples. The number of data points must satisfy " .
          "the relation N > 2xK**2 where K is the number of clusters.  The smallest " .
          "value for K is 2.\n"  if $N <= 8;
    # Statistical ceiling on K: meaningful clusters require N > 2*K**2.
    my $K_statistical_max = int( sqrt( $N / 2.0 ) );
    my $Kmin = $self->{_K_min} eq 'unknown' ? 2 : $self->{_K_min};
    # Reuse $K_statistical_max instead of recomputing sqrt(N/2).
    my $Kmax = $self->{_K_max} eq 'unknown' ? $K_statistical_max : $self->{_K_max};
    croak  "\n\nYour Kmax value is too high.  Ideally, it should not exceed sqrt(N/2) " .
           "where N is the number of data points" if $Kmax > $K_statistical_max;
    print "Value of Kmax is: $Kmax\n" if $self->{_terminal_output};
    my @QoC_values;
    my @array_of_clusters;
    my @array_of_cluster_centers;
    foreach my $K ($Kmin..$Kmax) {
        my $QoC;
        my $clusters;
        my $cluster_centers;
        print "Clustering for K = $K\n" if $self->{_terminal_output};
        if ($self->{_cluster_seeding} eq 'random') {
            # BUGFIX: was (1..21), i.e. 21 tries — an off-by-one relative to
            # the documented 20 tries and to the sibling routine
            # cluster_for_fixed_K_multiple_random_tries().
            foreach my $trial (1..20) {
                print ". ";
                my ($new_clusters, $new_cluster_centers);
                if ($self->{_use_mahalanobis_metric}) {
                    # Mahalanobis clustering can die on near-singular
                    # covariances; treat such a try as a no-op.
                    eval {
                        ($new_clusters, $new_cluster_centers) = $self->cluster_for_given_K($K);
                    };
                    next if $@;
                } else {
                    ($new_clusters, $new_cluster_centers) = $self->cluster_for_given_K($K);
                }
                my $newQoC = $self->cluster_quality( $new_clusters, $new_cluster_centers );
                if ( (!defined $QoC) || ($newQoC < $QoC) ) {
                    $QoC = $newQoC;
                    $clusters = deep_copy_AoA( $new_clusters );
                    $cluster_centers = deep_copy_AoA( $new_cluster_centers );
                }
            }
            print "\n";
        } elsif ($self->{_cluster_seeding} eq 'smart') {
            eval {
                ($clusters, $cluster_centers) = $self->cluster_for_given_K($K);
            };
            # Smart seeding fails when the data cannot support K peaks; cap
            # the search at the last K that worked.
            if ($@) {
                $Kmax = $K - 1;
                last;
            }
            $QoC = $self->cluster_quality($clusters,$cluster_centers);
        } else {
            die "You must either choose 'smart' for cluster_seeding or 'random'. " .
                "Fix your constructor call."
        }
        push @QoC_values, $QoC;
        push @array_of_clusters, $clusters;
        push @array_of_cluster_centers, $cluster_centers;
    }
    # Pick the K whose best QoC is the overall minimum.
    my ($min, $max) = minmax( \@QoC_values );
    my $K_best_relative_to_Kmin = get_index_at_value($min, \@QoC_values );
    my $K_best = $K_best_relative_to_Kmin + $Kmin;
    if ($self->{_terminal_output}) {
        print "\nDisplaying final clusters for best K (= $K_best) :\n";
        display_clusters( $array_of_clusters[$K_best_relative_to_Kmin] );
        $self->display_cluster_centers($array_of_clusters[$K_best_relative_to_Kmin]);
        print "\nBest clustering achieved for K=$K_best with QoC = $min\n" if defined $min;
        my @printableQoC = grep {$_} @QoC_values;
        print "\nQoC values array (the smaller the value, the better it is) for different " .
              "K starting with K=$Kmin:  @printableQoC\n";
    }
    $self->{_K_best} = $K_best;
    foreach my $i (0..@QoC_values-1) {
        my $k = $i + $Kmin;
        $self->{_QoC_values}->{"$k"} = $QoC_values[$i];
    }
    $self->{_clusters} = $array_of_clusters[$K_best_relative_to_Kmin];
    $self->{_cluster_centers} =
        $array_of_cluster_centers[$K_best_relative_to_Kmin];
}
| 320 |  |  |  |  |  |  |  | 
# This is the function to call if you already know what value you want to use
# for K, the number of expected clusters.  The purpose of this function is to
# do the initialization of the cluster centers and to carry out the initial
# assignment of the data to the clusters with the initial cluster centers.
# The initialization consists of 3 steps: (1) Construct a random sequence of K
# integers between 0 and N-1 where N is the number of data points to be
# clustered; (2) Call get_initial_cluster_centers() to index into the data
# array with the random integers to get a list of K data points that would
# serve as the initial cluster centers; and (3) Call
# assign_data_to_clusters_initial() to assign the rest of the data to each of
# the K clusters on the basis of the proximity to the cluster centers.
# Afterwards, up to 3 refinement passes re-assign the data; refinement stops
# early as soon as a pass yields only non-empty clusters.
sub cluster_for_given_K {
    my $self = shift;
    my $K = shift;
    my $cluster_centers;
    if ($self->{_cluster_seeding} eq 'smart') {
        $cluster_centers = $self->get_initial_cluster_centers_smart($K);
    } elsif ($self->{_cluster_seeding} eq 'random') {
        $cluster_centers = $self->get_initial_cluster_centers($K);
    } else {
        die "You must either choose 'smart' for cluster_seeding or 'random'. " .
            "Fix your constructor call."
    }
    my $clusters;
    if ($self->{_use_mahalanobis_metric}) {
        my $clusters_and_determinants =
            $self->assign_data_to_clusters_initial_mahalanobis($cluster_centers);
        $clusters = $clusters_and_determinants->[0];
        my @determinants = @{$clusters_and_determinants->[1]};
        my ($min,$max) = minmax(\@determinants);
        # A near-zero determinant means a near-singular covariance matrix,
        # which would make the Mahalanobis distance numerically meaningless.
        die "In cluster_for_given_K(): min determinant of covariance matrix for at " .
            "least one cluster is too small" if $min / $max < 0.001;
    } else {
        $clusters = $self->assign_data_to_clusters_initial($cluster_centers);
    }
    foreach my $trial (0..2) {
        # BUGFIX: reset the empty-cluster flag at the start of every pass.
        # The original set it once and never cleared it, so a single bad pass
        # disabled the early exit even when a later pass produced only
        # non-empty clusters.
        my $cluster_nonexistent_flag = 0;
        if ($self->{_use_mahalanobis_metric}) {
            ($clusters, $cluster_centers) = $self->assign_data_to_clusters_mahalanobis($clusters, $K);
        } else {
            ($clusters, $cluster_centers) = $self->assign_data_to_clusters( $clusters, $K );
        }
        foreach my $cluster (@$clusters) {
            $cluster_nonexistent_flag = 1 if ((!defined $cluster) ||  (@$cluster == 0));
        }
        last unless $cluster_nonexistent_flag;
    }
    return ($clusters, $cluster_centers);
}
| 371 |  |  |  |  |  |  |  | 
# This function is used when you set the "cluster_seeding" option to 'random'
# in the constructor.  Returns a set of K random integers.  These serve as
# indices to reach into the data array.  A data element whose index is one of
# the random numbers returned by this routine serves as an initial cluster
# center.  We first make sure that all K random integers are distinct.
# Subsequently, we carry out a quality assessment of the K random integers
# constructed: the ratio of the range spanned by the integers to N, the total
# number of data points.  If this ratio is less than 0.3, we discard the K
# integers and try again.
#
# BUGFIX: the original de-duplication re-drew a colliding index while
# comparing it against only one earlier index at a time, so a re-drawn value
# was never re-checked against indices it had already been compared with and
# duplicates could survive.  The %seen hash below guarantees uniqueness.
# (As before, K must not exceed N or the loop cannot terminate.)
sub initialize_cluster_centers {
    my $self = shift;
    my $K = shift;
    my $data_store_size = $self->{_N};
    my @cluster_center_indices;
    while (1) {
        @cluster_center_indices = ();
        my %seen;
        foreach my $i (0..$K-1) {
            my $candidate = int rand( $data_store_size );
            # Re-draw until the candidate differs from ALL earlier picks.
            $candidate = int rand( $data_store_size ) while $seen{$candidate};
            $seen{$candidate} = 1;
            push @cluster_center_indices, $candidate;
        }
        my ($min,$max) = minmax(\@cluster_center_indices );
        my $quality = ($max - $min) / $data_store_size;
        last if $quality > 0.3;
    }
    return @cluster_center_indices;
}
| 405 |  |  |  |  |  |  |  | 
# This function is used when you set the "cluster_seeding" option to 'random'
# in the constructor.  It reaches into the data hash using the K random
# indices produced by initialize_cluster_centers() — each index selects a
# record tag, and the data record stored under that tag becomes one of the
# initial cluster centers.  Returns a reference to the list of centers.
sub get_initial_cluster_centers {
    my ($self, $K) = @_;
    my @centers = map {
        my $tag = $self->{_data_id_tags}[$_];
        $self->{_data}->{$tag};
    } $self->initialize_cluster_centers($K);
    return \@centers;
}
| 422 |  |  |  |  |  |  |  | 
| 423 |  |  |  |  |  |  | # This method is invoked when you choose the 'smart' option for the "cluster_seeding" | 
| 424 |  |  |  |  |  |  | # option in the constructor.  It subjects the data to a principal components analysis | 
| 425 |  |  |  |  |  |  | # to figure out the direction of maximal variance.  Subsequently, it tries to locate | 
| 426 |  |  |  |  |  |  | # K peaks in a smoothed histogram of the data points projected onto the maximal | 
| 427 |  |  |  |  |  |  | # variance direction. | 
| 428 |  |  |  |  |  |  | sub get_initial_cluster_centers_smart { | 
| 429 |  |  |  |  |  |  | my $self = shift; | 
| 430 |  |  |  |  |  |  | my $K = shift; | 
| 431 |  |  |  |  |  |  | if ($self->{_data_dimensions} == 1) { | 
| 432 |  |  |  |  |  |  | my @one_d_data; | 
| 433 |  |  |  |  |  |  | foreach my $j (0..$self->{_N}-1) { | 
| 434 |  |  |  |  |  |  | my $tag = $self->{_data_id_tags}[$j]; | 
| 435 |  |  |  |  |  |  | push @one_d_data, $self->{_data}->{$tag}->[0]; | 
| 436 |  |  |  |  |  |  | } | 
| 437 |  |  |  |  |  |  | my @peak_points = find_peak_points_in_given_direction(\@one_d_data,$K); | 
| 438 |  |  |  |  |  |  | print "highest points at data values: @peak_points\n" if $self->{_debug}; | 
| 439 |  |  |  |  |  |  | my @cluster_centers; | 
| 440 |  |  |  |  |  |  | foreach my $peakpoint (@peak_points) { | 
| 441 |  |  |  |  |  |  | push @cluster_centers, [$peakpoint]; | 
| 442 |  |  |  |  |  |  | } | 
| 443 |  |  |  |  |  |  | return \@cluster_centers; | 
| 444 |  |  |  |  |  |  | } | 
| 445 |  |  |  |  |  |  | my ($num_rows,$num_cols) = ($self->{_data_dimensions},$self->{_N}); | 
| 446 |  |  |  |  |  |  | my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols); | 
| 447 |  |  |  |  |  |  | my $mean_vec = Math::GSL::Matrix->new($num_rows,1); | 
| 448 |  |  |  |  |  |  | # All the record labels are stored in the array $self->{_data_id_tags}.  The | 
| 449 |  |  |  |  |  |  | # actual data for clustering is stored in a hash at $self->{_data} whose keys are | 
| 450 |  |  |  |  |  |  | # the record labels; the value associated with each key is the array holding the | 
| 451 |  |  |  |  |  |  | # corresponding numerical multidimensional data. | 
| 452 |  |  |  |  |  |  | foreach my $j (0..$num_cols-1) { | 
| 453 |  |  |  |  |  |  | my $tag = $self->{_data_id_tags}[$j]; | 
| 454 |  |  |  |  |  |  | my $data = $self->{_data}->{$tag}; | 
| 455 |  |  |  |  |  |  | $matrix->set_col($j, $data); | 
| 456 |  |  |  |  |  |  | } | 
| 457 |  |  |  |  |  |  | if ($self->{_debug}) { | 
| 458 |  |  |  |  |  |  | print "\nDisplaying the original data as a matrix:"; | 
| 459 |  |  |  |  |  |  | display_matrix( $matrix ); | 
| 460 |  |  |  |  |  |  | } | 
| 461 |  |  |  |  |  |  | foreach my $j (0..$num_cols-1) { | 
| 462 |  |  |  |  |  |  | $mean_vec += $matrix->col($j); | 
| 463 |  |  |  |  |  |  | } | 
| 464 |  |  |  |  |  |  | $mean_vec *=  1.0 / $num_cols; | 
| 465 |  |  |  |  |  |  | if ($self->{_debug}) { | 
| 466 |  |  |  |  |  |  | print "Displaying the mean vector for the data:"; | 
| 467 |  |  |  |  |  |  | display_matrix( $mean_vec ); | 
| 468 |  |  |  |  |  |  | } | 
| 469 |  |  |  |  |  |  | foreach my $j (0..$num_cols-1) { | 
| 470 |  |  |  |  |  |  | my @new_col = ($matrix->col($j) - $mean_vec)->as_list; | 
| 471 |  |  |  |  |  |  | $matrix->set_col($j, \@new_col); | 
| 472 |  |  |  |  |  |  | } | 
| 473 |  |  |  |  |  |  | if ($self->{_debug}) { | 
| 474 |  |  |  |  |  |  | print "Displaying mean subtracted data as a matrix:"; | 
| 475 |  |  |  |  |  |  | display_matrix( $matrix ); | 
| 476 |  |  |  |  |  |  | } | 
| 477 |  |  |  |  |  |  | my $transposed = transpose( $matrix ); | 
| 478 |  |  |  |  |  |  | if ($self->{_debug}) { | 
| 479 |  |  |  |  |  |  | print "Displaying transposed data matrix:"; | 
| 480 |  |  |  |  |  |  | display_matrix( $transposed ); | 
| 481 |  |  |  |  |  |  | } | 
| 482 |  |  |  |  |  |  | my $covariance = matrix_multiply( $matrix, $transposed ); | 
| 483 |  |  |  |  |  |  | $covariance *= 1.0 / $num_cols; | 
| 484 |  |  |  |  |  |  | if ($self->{_debug}) { | 
| 485 |  |  |  |  |  |  | print "\nDisplaying the Covariance Matrix for your data:"; | 
| 486 |  |  |  |  |  |  | display_matrix( $covariance ); | 
| 487 |  |  |  |  |  |  | } | 
| 488 |  |  |  |  |  |  | my ($eigenvalues, $eigenvectors) = $covariance->eigenpair; | 
| 489 |  |  |  |  |  |  | my $num_of_eigens = @$eigenvalues; | 
| 490 |  |  |  |  |  |  | my $largest_eigen_index = 0; | 
| 491 |  |  |  |  |  |  | my $smallest_eigen_index = 0; | 
| 492 |  |  |  |  |  |  | print "Eigenvalue 0:   $eigenvalues->[0]\n" if $self->{_debug}; | 
| 493 |  |  |  |  |  |  | foreach my $i (1..$num_of_eigens-1) { | 
| 494 |  |  |  |  |  |  | $largest_eigen_index = $i if $eigenvalues->[$i] > $eigenvalues->[$largest_eigen_index]; | 
| 495 |  |  |  |  |  |  | $smallest_eigen_index = $i if $eigenvalues->[$i] < $eigenvalues->[$smallest_eigen_index]; | 
| 496 |  |  |  |  |  |  | print "Eigenvalue $i:   $eigenvalues->[$i]\n" if $self->{_debug}; | 
| 497 |  |  |  |  |  |  | } | 
| 498 |  |  |  |  |  |  | print "\nlargest eigen index: $largest_eigen_index\n" if $self->{_debug}; | 
| 499 |  |  |  |  |  |  | print "\nsmallest eigen index: $smallest_eigen_index\n\n" if $self->{_debug}; | 
| 500 |  |  |  |  |  |  | foreach my $i (0..$num_of_eigens-1) { | 
| 501 |  |  |  |  |  |  | my @vec = $eigenvectors->[$i]->as_list; | 
| 502 |  |  |  |  |  |  | print "Eigenvector $i:   @vec\n" if $self->{_debug}; | 
| 503 |  |  |  |  |  |  | } | 
| 504 |  |  |  |  |  |  | my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list; | 
| 505 |  |  |  |  |  |  | print "\nLargest eigenvector:   @largest_eigen_vec\n" if $self->{_debug}; | 
| 506 |  |  |  |  |  |  | my @max_var_direction; | 
| 507 |  |  |  |  |  |  | # Each element of the array @largest_eigen_vec is a Math::Complex object | 
| 508 |  |  |  |  |  |  | foreach my $k (0..@largest_eigen_vec-1) { | 
| 509 |  |  |  |  |  |  | my ($mag, $theta) = $largest_eigen_vec[$k] =~ /\[(\d*\.\d+),(\S+)\]/; | 
| 510 |  |  |  |  |  |  | if ($theta eq '0') { | 
| 511 |  |  |  |  |  |  | $max_var_direction[$k] = $mag; | 
| 512 |  |  |  |  |  |  | } elsif ($theta eq 'pi') { | 
| 513 |  |  |  |  |  |  | $max_var_direction[$k] = -1.0 * $mag; | 
| 514 |  |  |  |  |  |  | } else { | 
| 515 |  |  |  |  |  |  | die "eigendecomposition of covariance matrix produced a complex " . | 
| 516 |  |  |  |  |  |  | "eigenvector --- something is wrong"; | 
| 517 |  |  |  |  |  |  | } | 
| 518 |  |  |  |  |  |  | } | 
| 519 |  |  |  |  |  |  | print "\nMaximum Variance Direction: @max_var_direction\n\n" if $self->{_debug}; | 
| 520 |  |  |  |  |  |  | # We now project all data points on the largest eigenvector. | 
| 521 |  |  |  |  |  |  | # Each projection will yield a single point on the eigenvector. | 
| 522 |  |  |  |  |  |  | my @projections; | 
| 523 |  |  |  |  |  |  | foreach my $j (0..$self->{_N}-1) { | 
| 524 |  |  |  |  |  |  | my $tag = $self->{_data_id_tags}[$j]; | 
| 525 |  |  |  |  |  |  | my $data = $self->{_data}->{$tag}; | 
| 526 |  |  |  |  |  |  | die "Dimensionality of the largest eigenvector does not " | 
| 527 |  |  |  |  |  |  | . "match the dimensionality of the data" | 
| 528 |  |  |  |  |  |  | unless @max_var_direction == $self->{_data_dimensions}; | 
| 529 |  |  |  |  |  |  | my $projection = vector_multiply($data, \@max_var_direction); | 
| 530 |  |  |  |  |  |  | push @projections, $projection; | 
| 531 |  |  |  |  |  |  | } | 
| 532 |  |  |  |  |  |  | print "All projection points: @projections\n" if $self->{_debug}; | 
| 533 |  |  |  |  |  |  | my @peak_points = find_peak_points_in_given_direction(\@projections, $K); | 
| 534 |  |  |  |  |  |  | print "highest points at points along largest eigenvec: @peak_points\n" if $self->{_debug}; | 
| 535 |  |  |  |  |  |  | my @cluster_centers; | 
| 536 |  |  |  |  |  |  | foreach my $peakpoint (@peak_points) { | 
| 537 |  |  |  |  |  |  | my @actual_peak_coords = map {$peakpoint * $_} @max_var_direction; | 
| 538 |  |  |  |  |  |  | push @cluster_centers, \@actual_peak_coords; | 
| 539 |  |  |  |  |  |  | } | 
| 540 |  |  |  |  |  |  | return \@cluster_centers; | 
| 541 |  |  |  |  |  |  | } | 
| 542 |  |  |  |  |  |  |  | 
| 543 |  |  |  |  |  |  | # This method is invoked when you choose the 'smart' option for the "cluster_seeding" | 
| 544 |  |  |  |  |  |  | # option in the constructor.  It is called by the previous method to locate K peaks | 
| 545 |  |  |  |  |  |  | # in a smoothed histogram of the data points projected onto the maximal variance | 
| 546 |  |  |  |  |  |  | # direction. | 
sub find_peak_points_in_given_direction {
    my $dataref  = shift;        # ref to the list of 1-D projections of the data points
    my $how_many = shift;        # K, the number of peaks wanted
    my @data = @$dataref;
    my ($min, $max) = minmax(\@data);
    my $scale = $max - $min;
    # A zero spread means every projection coincides; the normalization below
    # would divide by zero, so fail with an intelligible message instead.
    die "cannot locate peaks: all projection points are identical\n" if $scale == 0;
    my @sorted_data = sort {$a <=> $b} @data;
    # Normalize every projection into [0,1].  (The previous loop bound of
    # $#sorted_data-1 left the largest value unnormalized, so it was binned
    # far outside the histogram and never contributed to any peak.)
    foreach my $index (0..$#sorted_data) {
        $sorted_data[$index] = ($sorted_data[$index] - $min) / $scale;
    }
    # Build a smoothed 1000-cell histogram of the normalized projections: each
    # sample increments every cell within +/- $smoothness of its own cell.
    my $delta = 1.0 / 1000.0;
    my @accumulator = (0) x 1000;
    my $smoothness = 40;
    foreach my $sample (@sorted_data) {
        my $cell_index = int($sample / $delta);
        # A normalized value of exactly 1.0 would land in (nonexistent) cell 1000.
        $cell_index = 999 if $cell_index > 999;
        for my $cell ($cell_index-$smoothness..$cell_index+$smoothness) {
            next if $cell < 0 || $cell > 999;
            $accumulator[$cell]++;
        }
    }
    # Suppress non-maxima, then return the locations of the K tallest peaks,
    # mapped back to the original (unscaled) coordinate along the eigenvector.
    my $peaks_array = non_maximum_suppression( \@accumulator );
    my $peaks_index_hash = get_value_index_hash( $peaks_array );  # peak height => cell index
    my @K_highest_peak_locations;
    my $k = 0;
    foreach my $peak (sort {$b <=> $a} keys %$peaks_index_hash) {
        last if $k++ == $how_many;
        push @K_highest_peak_locations,
             $min + $peaks_index_hash->{$peak} * $scale * $delta;
    }
    return @K_highest_peak_locations;
}
| 585 |  |  |  |  |  |  |  | 
| 586 |  |  |  |  |  |  | # The purpose of this routine is to form initial clusters by assigning the data | 
| 587 |  |  |  |  |  |  | # samples to the initial clusters formed by the previous routine on the basis of the | 
| 588 |  |  |  |  |  |  | # best proximity of the data samples to the different cluster centers. | 
sub assign_data_to_clusters_initial {
    my $self = shift;
    my @cluster_centers = @{ shift @_ };
    my @clusters;
    # Assign every data record (identified by its symbolic tag) to the cluster
    # whose center is nearest in the Euclidean sense.  (The unused local
    # $best_cluster from the earlier version has been removed.)
    foreach my $ele (@{$self->{_data_id_tags}}) {
        my @dist_from_clust_centers;
        foreach my $center (@cluster_centers) {
            push @dist_from_clust_centers, $self->distance($ele, $center);
        }
        my ($min, $best_center_index) = minimum( \@dist_from_clust_centers );
        push @{$clusters[$best_center_index]}, $ele;
    }
    return \@clusters;
}
| 604 |  |  |  |  |  |  |  | 
sub assign_data_to_clusters_initial_mahalanobis {
    my $self = shift;
    my @cluster_centers = @{ shift @_ };
    my @clusters;
    # Assign each data record to its nearest cluster center.  At this initial
    # stage no per-cluster covariances exist yet, so plain Euclidean distance
    # is used; Mahalanobis distances take over in later iterations.
    foreach my $ele (@{$self->{_data_id_tags}}) {
        my @dist_from_clust_centers;
        foreach my $center (@cluster_centers) {
            push @dist_from_clust_centers, $self->distance($ele, $center);
        }
        my ($min, $best_center_index) = minimum( \@dist_from_clust_centers );
        push @{$clusters[$best_center_index]}, $ele if defined $best_center_index;
    }
    # Since a cluster center may not correspond to any particular sample, it is
    # possible for one of the elements of the array @clusters to be undefined
    # under the above strategy.  Say there are five cluster centers; the
    # $best_center_index values may populate @clusters only at indices
    # 0, 1, 2, 4, leaving $clusters[3] undefined.  So we must check that every
    # element of @clusters is defined before using it.
    my @determinants;
    foreach my $cluster (@clusters) {
        die "The clustering program started with bad initialization.  Please start over"
            unless defined $cluster;
        # The determinant of the cluster covariance is recorded so that callers
        # can detect degenerate (volume-zero) clusters.
        my $covariance = $self->estimate_cluster_covariance($cluster);
        push @determinants, $covariance->det();
    }
    return [\@clusters, \@determinants];
}
| 635 |  |  |  |  |  |  |  | 
| 636 |  |  |  |  |  |  | # This is the main routine that along with the update_cluster_centers() routine | 
| 637 |  |  |  |  |  |  | # constitute the two key steps of the K-Means algorithm.  In most cases, the infinite | 
| 638 |  |  |  |  |  |  | # while() loop will terminate automatically when the cluster assignments of the data | 
| 639 |  |  |  |  |  |  | # points remain unchanged. For the sake of safety, we keep track of the number of | 
| 640 |  |  |  |  |  |  | # iterations. If this number reaches 100, we exit the while() loop anyway.  In most | 
| 641 |  |  |  |  |  |  | # cases, this limit will not be reached. | 
sub assign_data_to_clusters {
    my $self = shift;
    my $clusters = shift;        # ref to array of clusters; each cluster is an array of record tags
    my $K = shift;               # the number of clusters we must end up with
    my $final_cluster_centers;
    my $iteration_index = 0;     # safety valve: the loop is force-terminated at 100 iterations
    while (1) {
        my $new_clusters;
        my $assignment_changed_flag = 0;
        my $current_cluster_center_index = 0;
        my $cluster_size_zero_condition = 0;
        my $how_many = @$clusters;
        # Recompute the centers from the current memberships.  A deep copy is
        # passed so that the center update (which may steal members between
        # clusters) cannot disturb the lists we iterate over below.
        my $cluster_centers = $self->update_cluster_centers( deep_copy_AoA_with_nulls( $clusters ) );
        $iteration_index++;
        foreach my $cluster (@$clusters) {
            my $current_cluster_center = $cluster_centers->[$current_cluster_center_index];
            foreach my $ele (@$cluster) {
                # Distance from this element to every cluster center:
                my @dist_from_clust_centers;
                foreach my $center (@$cluster_centers) {
                    push @dist_from_clust_centers,
                        $self->distance($ele, $center);
                }
                my ($min, $best_center_index) = minimum( \@dist_from_clust_centers );
                my $best_cluster_center = $cluster_centers->[$best_center_index];
                if (vector_equal($current_cluster_center, $best_cluster_center)){
                    # The element stays in the cluster it is already in.
                    push @{$new_clusters->[$current_cluster_center_index]}, $ele;
                } else {
                    # The element migrates to the cluster of its nearest center.
                    $assignment_changed_flag = 1;
                    push @{$new_clusters->[$best_center_index]}, $ele;
                }
            }
            $current_cluster_center_index++;
        }
        # If reassignment dropped a whole cluster, try another iteration (up to the cap).
        next if ((@$new_clusters != @$clusters) && ($iteration_index < 100));
        # Now make sure that none of the K clusters is an empty cluster:
        foreach my $newcluster (@$new_clusters) {
            $cluster_size_zero_condition = 1 if ((!defined $newcluster) or  (@$newcluster == 0));
        }
        push @$new_clusters, (undef) x ($K - @$new_clusters) if @$new_clusters < $K;
        # During clustering for a fixed K, should a cluster inadvertently
        # become empty, steal a member from the largest cluster to hopefully
        # spawn a new cluster:
        my $largest_cluster;
        foreach my $local_cluster (@$new_clusters) {
            next if !defined $local_cluster;
            $largest_cluster = $local_cluster if !defined $largest_cluster;
            if (@$local_cluster > @$largest_cluster) {
                $largest_cluster = $local_cluster;
            }
        }
        foreach my $local_cluster (@$new_clusters) {
            if ( (!defined $local_cluster) || (@$local_cluster == 0) ) {
                # Pushing through the loop alias autovivifies an undef slot
                # into a fresh one-element cluster.
                push @$local_cluster, pop @$largest_cluster;
            }
        }
        next if (($cluster_size_zero_condition) && ($iteration_index < 100));
        last if $iteration_index == 100;
        $clusters = deep_copy_AoA( $new_clusters );
        # Convergence: no element changed its cluster during this pass.
        last if $assignment_changed_flag == 0;
    }
    $final_cluster_centers = $self->update_cluster_centers( $clusters );
    return ($clusters, $final_cluster_centers);
}
| 705 |  |  |  |  |  |  |  | 
sub assign_data_to_clusters_mahalanobis {
    my $self = shift;
    my $clusters = shift;        # ref to array of clusters (arrays of record tags)
    my $K = shift;               # the number of clusters we must end up with
    my $final_cluster_centers;
    my $iteration_index = 0;     # safety valve: hard stop at 100 iterations
    while (1) {
        my $new_clusters;
        my $assignment_changed_flag = 0;
        my $current_cluster_center_index = 0;
        my $cluster_size_zero_condition = 0;
        my $how_many = @$clusters;
        # Recompute each cluster's center and covariance from the current
        # memberships (on a deep copy so the update cannot disturb the
        # cluster lists we iterate over below).
        my $cluster_centers_and_covariances =
            $self->update_cluster_centers_and_covariances_mahalanobis(deep_copy_AoA_with_nulls($clusters));
        my $cluster_centers = $cluster_centers_and_covariances->[0];
        my $cluster_covariances = $cluster_centers_and_covariances->[1];
        $iteration_index++;
        foreach my $cluster (@$clusters) {
            my $current_cluster_center = $cluster_centers->[$current_cluster_center_index];
            foreach my $ele (@$cluster) {
                my @mahalanobis_dist_from_clust_centers;
                foreach my $i (0..@$clusters-1) {
                    my $center = $cluster_centers->[$i];
                    my $covariance = $cluster_covariances->[$i];
                    my $maha_distance;
                    eval {
                        $maha_distance = $self->distance_mahalanobis($ele, $center, $covariance);
                    };
                    # BUG FIX: the original code did "next if $@;", which skipped
                    # the push and thereby shifted every later distance one slot
                    # left, so $best_center_index below no longer corresponded to
                    # the right cluster whenever any covariance was singular.
                    # Recording an effectively infinite distance keeps the list
                    # index-aligned with the cluster index while still ruling the
                    # failing center out of contention.
                    $maha_distance = 9**9**9 if $@ or !defined $maha_distance;
                    push @mahalanobis_dist_from_clust_centers, $maha_distance;
                }
                my ($min, $best_center_index) = minimum( \@mahalanobis_dist_from_clust_centers );
                my $best_cluster_center = $cluster_centers->[$best_center_index];
                if (vector_equal($current_cluster_center, $best_cluster_center)){
                    # The element stays where it is.
                    push @{$new_clusters->[$current_cluster_center_index]}, $ele;
                } else {
                    # The element migrates to the cluster of its nearest center.
                    $assignment_changed_flag = 1;
                    push @{$new_clusters->[$best_center_index]}, $ele;
                }
            }
            $current_cluster_center_index++;
        }
        # If reassignment dropped a whole cluster, go around again (until the cap).
        next if ((@$new_clusters != @$clusters) && ($iteration_index < 100));
        # Now make sure that none of the K clusters is an empty cluster:
        foreach my $newcluster (@$new_clusters) {
            $cluster_size_zero_condition = 1 if ((!defined $newcluster) or (@$newcluster == 0));
        }
        push @$new_clusters, (undef) x ($K - @$new_clusters) if @$new_clusters < $K;
        # During clustering for a fixed K, should a cluster inadvertently
        # become empty, steal a member from the largest cluster to hopefully
        # spawn a new cluster:
        my $largest_cluster;
        foreach my $local_cluster (@$new_clusters) {
            next if !defined $local_cluster;
            $largest_cluster = $local_cluster if !defined $largest_cluster;
            if (@$local_cluster > @$largest_cluster) {
                $largest_cluster = $local_cluster;
            }
        }
        foreach my $local_cluster (@$new_clusters) {
            if ( (!defined $local_cluster) || (@$local_cluster == 0) ) {
                push @$local_cluster, pop @$largest_cluster;
            }
        }
        next if (($cluster_size_zero_condition) && ($iteration_index < 100));
        last if $iteration_index == 100;
        # Now do a deep copy of new_clusters into clusters
        $clusters = deep_copy_AoA( $new_clusters );
        # Convergence: no element changed its cluster during this pass.
        last if $assignment_changed_flag == 0;
    }
    $final_cluster_centers = $self->update_cluster_centers( $clusters );
    return ($clusters, $final_cluster_centers);
}
| 780 |  |  |  |  |  |  |  | 
# Recomputes, for every cluster, its center (mean of member coordinates) and
# its covariance matrix (maximum-likelihood estimate), returning
# [\@centers, \@covariances].  Empty clusters are re-seeded by stealing a
# member from the largest cluster.
sub update_cluster_centers_and_covariances_mahalanobis {
    my $self = shift;
    my @clusters = @{ shift @_ };
    my @new_cluster_centers;       # one mean vector (array ref) per cluster
    my @new_cluster_covariances;   # one Math::GSL::Matrix covariance per cluster
    # During clustering for a fixed K, should a cluster inadvertently become empty,
    # steal a member from the largest cluster to hopefully spawn a new cluster:
    my $largest_cluster;
    foreach my $cluster (@clusters) {
        next if !defined $cluster;
        $largest_cluster = $cluster if !defined $largest_cluster;
        if (@$cluster > @$largest_cluster) {
            $largest_cluster = $cluster;
        }
    }
    foreach my $cluster (@clusters) {
        if ( (!defined $cluster) || (@$cluster == 0) ) {
            # Pushing through the loop alias autovivifies an undef slot into a
            # fresh one-element cluster.
            push @$cluster, pop @$largest_cluster;
        }
    }
    foreach my $cluster (@clusters) {
        die "Cluster became empty --- untenable condition " .
            "for a given K.  Try again. \n" if !defined $cluster;
        my $cluster_size = @$cluster;
        die "Cluster size is zero --- untenable.\n" if $cluster_size == 0;
        # New center = mean of the member coordinates.
        my @new_cluster_center = @{$self->add_point_coords( $cluster )};
        @new_cluster_center = map {my $x = $_/$cluster_size; $x} @new_cluster_center;
        push @new_cluster_centers, \@new_cluster_center;
        # for covariance calculation: each column of $matrix is one member's
        # mean-subtracted coordinate vector.
        # NOTE(review): $mean_vec is created below but never used in this routine.
        my ($num_rows,$num_cols) = ($self->{_data_dimensions}, scalar(@$cluster));
        my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
        my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
        # All the record labels are stored in the array $self->{_data_id_tags}.  The
        # actual data for clustering is stored in a hash at $self->{_data} whose keys are
        # the record labels; the value associated with each key is the array holding the
        # corresponding numerical multidimensional data.
        foreach my $j (0..$num_cols-1) {
            my $tag = $cluster->[$j];
            my $data = $self->{_data}->{$tag};
            my @diff_from_mean = vector_subtract($data, \@new_cluster_center);
            $matrix->set_col($j, \@diff_from_mean);
        }
        # Covariance = (1/N) * M * M^T.  Note the divisor is N (the
        # maximum-likelihood estimate), not the unbiased N-1.
        my $transposed = transpose( $matrix );
        my $covariance = matrix_multiply( $matrix, $transposed );
        $covariance *= 1.0 / $num_cols;
        if ($self->{_debug}) {
            print "\nDisplaying the Covariance Matrix for cluster:";
            display_matrix( $covariance );
        }
        push @new_cluster_covariances, $covariance;
    }
    return [\@new_cluster_centers, \@new_cluster_covariances];
}
| 834 |  |  |  |  |  |  |  | 
| 835 |  |  |  |  |  |  | # After each new assignment of the data points to the clusters on the basis of the | 
| 836 |  |  |  |  |  |  | # current values for the cluster centers, we call the routine shown here for updating | 
| 837 |  |  |  |  |  |  | # the values of the cluster centers. | 
# Recomputes each cluster's center as the mean of its members' coordinates and
# returns a ref to the list of new centers.  An empty cluster is re-seeded by
# stealing one member from the most populous cluster.
sub update_cluster_centers {
    my $self = shift;
    my @clusters = @{ shift @_ };
    my @new_cluster_centers;
    # Locate the most populous cluster (first-seen wins on ties), so that an
    # inadvertently empty cluster can steal a member from it and hopefully
    # spawn a new cluster:
    my $largest_cluster;
    for my $cluster (@clusters) {
        next unless defined $cluster;
        if ( !defined($largest_cluster) || @$cluster > @$largest_cluster ) {
            $largest_cluster = $cluster;
        }
    }
    for my $cluster (@clusters) {
        if ( !defined($cluster) || !@$cluster ) {
            # Pushing through the loop alias autovivifies an undef slot.
            push @$cluster, pop @$largest_cluster;
        }
    }
    # Each new center is the per-coordinate mean over the cluster's members.
    for my $cluster (@clusters) {
        die "Cluster became empty --- untenable condition " .
            "for a given K.  Try again. \n" if !defined $cluster;
        my $cluster_size = @$cluster;
        die "Cluster size is zero --- untenable.\n" if $cluster_size == 0;
        my @center = map { $_ / $cluster_size } @{ $self->add_point_coords($cluster) };
        push @new_cluster_centers, \@center;
    }
    return \@new_cluster_centers;
}
| 869 |  |  |  |  |  |  |  | 
# Returns the id of the cluster whose center is nearest (Euclidean) to the
# given data element.  On an exact distance tie the winner depends on hash
# iteration order, exactly as before.
sub which_cluster_for_new_data_element {
    my $self = shift;
    my $ele = shift;
    die "The dimensionality of the new data element is not correct: $!"
        unless @$ele == $self->{_data_dimensions};
    # Distance from the new element to every stored cluster center:
    my %distance_to_new_ele_hash;
    for my $cluster_id (sort keys %{$self->{_cluster_centers_hash}}) {
        my $center = $self->{_cluster_centers_hash}->{$cluster_id};
        $distance_to_new_ele_hash{$cluster_id} = $self->distance2($ele, $center);
    }
    # Smallest recorded distance:
    my $min;
    for my $d (values %distance_to_new_ele_hash) {
        $min = $d if !defined($min) || $d < $min;
    }
    # Report a cluster id achieving that minimum:
    my $answer;
    for my $cluster_id (keys %distance_to_new_ele_hash) {
        $answer = $cluster_id if $distance_to_new_ele_hash{$cluster_id} == $min;
    }
    return $answer;
}
| 888 |  |  |  |  |  |  |  | 
# Returns the id of the cluster nearest to the given data element under the
# Mahalanobis distance, using each cluster's own center and covariance.  On an
# exact distance tie the winner depends on hash iteration order, as before.
sub which_cluster_for_new_data_element_mahalanobis {
    my $self = shift;
    my $ele = shift;
    die "The dimensionality of the new data element is not correct: $!"
        unless @$ele == $self->{_data_dimensions};
    # Mahalanobis distance from the new element to each cluster:
    my %distance_to_new_ele_hash;
    for my $cluster_id (sort keys %{$self->{_cluster_centers_hash}}) {
        $distance_to_new_ele_hash{$cluster_id} = $self->distance_mahalanobis2(
            $ele,
            $self->{_cluster_centers_hash}->{$cluster_id},
            $self->{_cluster_covariances_hash}->{$cluster_id} );
    }
    # Smallest recorded distance:
    my $min;
    for my $d (values %distance_to_new_ele_hash) {
        $min = $d if !defined($min) || $d < $min;
    }
    # Report a cluster id achieving that minimum:
    my $answer;
    for my $cluster_id (keys %distance_to_new_ele_hash) {
        $answer = $cluster_id if $distance_to_new_ele_hash{$cluster_id} == $min;
    }
    return $answer;
}
| 908 |  |  |  |  |  |  |  | 
| 909 |  |  |  |  |  |  | # The following function returns the value of QoC for a given partitioning of the | 
| 910 |  |  |  |  |  |  | # data into K clusters.  It calculates two things: the average value for the distance | 
| 911 |  |  |  |  |  |  | # between a data point and the center of the cluster in which the data point resides, | 
| 912 |  |  |  |  |  |  | # and the average value for the distances between the cluster centers.  We obviously | 
| 913 |  |  |  |  |  |  | # want to minimize the former and maximize the latter.  All of the "from center" | 
| 914 |  |  |  |  |  |  | # distances within each cluster are stored in the variable | 
| 915 |  |  |  |  |  |  | # $sum_of_distances_for_one_cluster.  When this variable, after it is divided by the | 
| 916 |  |  |  |  |  |  | # number of data elements in the cluster, is summed over all the clusters, we get a | 
| 917 |  |  |  |  |  |  | # value that is stored in $avg_dist_for_cluster.  The inter-cluster-center distances | 
| 918 |  |  |  |  |  |  | # are stored in the variable $inter_cluster_center_dist. | 
# Returns the QoC (quality of clustering) for a given partitioning of the data
# into K clusters: the ratio of (sum over clusters of the mean member-to-center
# distance) to (the average distance between distinct cluster centers).  We
# want the numerator small and the denominator large, so smaller QoC is better.
sub cluster_quality {
    my $self = shift;
    my $clusters = shift;
    my $cluster_centers = shift;
    my $K = @$cluster_centers;          # Number of clusters
    # With fewer than two centers there are no inter-center distances and the
    # divisor below would be zero; fail with an intelligible message.
    die "cluster quality is undefined for fewer than two clusters\n" if $K < 2;
    # Sum over clusters of the average distance of a member from its center:
    my $cluster_radius = 0;
    foreach my $i (0..@$clusters-1) {
        my $sum_of_distances_for_one_cluster = 0;
        foreach my $ele (@{$clusters->[$i]}) {
            $sum_of_distances_for_one_cluster +=
                $self->distance( $ele, $cluster_centers->[$i] );
        }
        $cluster_radius +=
            $sum_of_distances_for_one_cluster / @{$clusters->[$i]};
    }
    # Average distance between distinct pairs of cluster centers.  BUG FIX: the
    # previous version summed every ORDERED pair (including i == j), which
    # double-counted each distance even though the divisor below is the number
    # of UNORDERED pairs; each pair is now summed exactly once.
    my $inter_cluster_center_dist = 0;
    foreach my $i (0..@$cluster_centers-2) {
        foreach my $j ($i+1..@$cluster_centers-1) {
            $inter_cluster_center_dist +=
                $self->distance2( $cluster_centers->[$i],
                                  $cluster_centers->[$j] );
        }
    }
    my $avg_inter_cluster_center_dist = $inter_cluster_center_dist /
                                        ( $K * ($K-1) / 2.0 );
    return $cluster_radius / $avg_inter_cluster_center_dist;
}
| 946 |  |  |  |  |  |  |  | 
| 947 |  |  |  |  |  |  | # The following routine is for computing the distance between a data point specified | 
| 948 |  |  |  |  |  |  | # by its symbolic name in the master datafile and a point (such as the center of a | 
| 949 |  |  |  |  |  |  | # cluster) expressed as a vector of coordinates: | 
# Euclidean distance between a data record (named by its symbolic tag in the
# master datafile) and a point given directly as a coordinate vector, e.g.
# the center of a cluster.  Dies when the dimensionalities disagree.
sub distance {
    my ($self, $tag, $point) = @_;
    my $record = $self->{_data}->{$tag};
    my @coords = @$point;
    die "wrong data types for distance calculation\n"
        unless @$record == @coords;
    my $sum_sq = 0;
    foreach my $k (0 .. $#coords) {
        my $delta = $record->[$k] - $coords[$k];
        $sum_sq += $delta * $delta;
    }
    return sqrt $sum_sq;
}
| 964 |  |  |  |  |  |  |  | 
| 965 |  |  |  |  |  |  | # The following routine does the same as above but now both arguments are expected to | 
| 966 |  |  |  |  |  |  | # be arrays of numbers: | 
# Euclidean distance between two points, both supplied as refs to coordinate
# arrays.  Dies when the two points have different dimensionalities.
sub distance2 {
    my ($self, $p, $q) = @_;
    die "wrong data types for distance calculation\n" unless @$p == @$q;
    my $sum_sq = 0;
    $sum_sq += ($p->[$_] - $q->[$_]) ** 2 for 0 .. $#{$p};
    return sqrt $sum_sq;
}
| 979 |  |  |  |  |  |  |  | 
| 980 |  |  |  |  |  |  | # Requires three args: $ele for the symbolic tag of the element, $center for the | 
| 981 |  |  |  |  |  |  | # coordinates of the center of the cluster, and $covariance for the covariance of | 
| 982 |  |  |  |  |  |  | # cluster.  Our goal is to find the distance of the element ele from the cluster | 
| 983 |  |  |  |  |  |  | # whose mean and covariance are provided. | 
# Mahalanobis distance between the data element with symbolic tag $ele and a
# cluster described by its center coordinates and covariance matrix (a
# Math::GSL::Matrix object).  Dies when the covariance matrix is numerically
# singular, since the distance requires its inverse.
sub distance_mahalanobis {
    my $self = shift;
    my $ele = shift;                  # symbolic tag of the data element
    my $cluster_center = shift;       # ref to center coordinate vector
    my $cluster_covariance = shift;   # Math::GSL::Matrix covariance
    my $ele_data = $self->{_data}->{$ele};
    my @diff_from_mean = vector_subtract($ele_data, $cluster_center);
    my $vec = Math::GSL::Matrix->new($self->{_data_dimensions}, 1);
    $vec->set_col(0, \@diff_from_mean);
    my $transposed = transpose( $vec );
    # Compute det() once (the old code evaluated it three times and never
    # used two of the results).  A tiny determinant means the matrix is
    # numerically singular and has no usable inverse.
    die "covariance matrix may not have an inverse"
        unless $cluster_covariance->det() > 0.001;
    my $covariance_inverse = $cluster_covariance->inverse;
    # (x - mu)^T * C^{-1} * (x - mu) is a 1x1 matrix; its sole entry is the
    # squared Mahalanobis distance.
    my $distance = $transposed * $covariance_inverse * $vec;
    my ($squared) = $distance->as_list;
    return sqrt $squared;
}
| 1007 |  |  |  |  |  |  | # Same as the previous method except that the first argument can be the actual | 
| 1008 |  |  |  |  |  |  | # coordinates of the data element.  Our goal is to find the Mahalanobis distance | 
| 1009 |  |  |  |  |  |  | # from a given data element to a cluster whose center and covariance are known.  As | 
| 1010 |  |  |  |  |  |  | # for the previous method, it requires three arguments. | 
sub distance_mahalanobis2 {
    my $self = shift;
    my $ele = shift;              # ref to the array of coords for a point
    my $cluster_center = shift;   # ref to center coordinate vector
    my $cluster_covariance = shift;   # Math::GSL::Matrix covariance
    # Callers probe clusters that may not yet have a covariance estimate;
    # `return undef` kept (rather than bare `return`) for callers that
    # examine the value in list context.
    return undef unless defined $cluster_covariance;
    my @diff_from_mean = vector_subtract($ele, $cluster_center);
    my $vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
    $vec->set_col(0, \@diff_from_mean);
    my $transposed = transpose( $vec );
    # Compute det() once (the old code evaluated it three times and never
    # used two of the results).
    die "covariance matrix may not have an inverse"
        unless $cluster_covariance->det() > 0.001;
    my $covariance_inverse = $cluster_covariance->inverse;
    # (x - mu)^T * C^{-1} * (x - mu): 1x1 matrix holding the squared distance.
    my $distance = $transposed * $covariance_inverse * $vec;
    my ($squared) = $distance->as_list;
    return sqrt $squared;
}
| 1034 |  |  |  |  |  |  |  | 
# Estimates the covariance matrix of one cluster.  The argument is a ref to
# the list of record labels belonging to the cluster; the numerical vectors
# themselves live in the $self->{_data} hash.  Returns a Math::GSL::Matrix
# of size d x d, d being the data dimensionality.
sub estimate_cluster_covariance {
    my ($self, $cluster) = @_;
    my $cluster_size = @$cluster;
    # Cluster mean: coordinate-wise sum of the members divided by the count.
    my @cluster_center =
        map { $_ / $cluster_size } @{ $self->add_point_coords($cluster) };
    # One column per cluster member, one row per data dimension.
    my ($num_rows, $num_cols) = ($self->{_data_dimensions}, scalar @$cluster);
    my $matrix   = Math::GSL::Matrix->new($num_rows, $num_cols);
    my $mean_vec = Math::GSL::Matrix->new($num_rows, 1);
    # Column j of $matrix holds (x_j - mean) for the j-th member; record
    # labels index into $self->{_data}.
    foreach my $col (0 .. $num_cols - 1) {
        my $data = $self->{_data}->{ $cluster->[$col] };
        my @diff_from_mean = vector_subtract($data, \@cluster_center);
        $matrix->set_col($col, \@diff_from_mean);
    }
    # covariance = (1/N) * M * M^T
    my $covariance = $matrix * transpose($matrix);
    $covariance *= 1.0 / $num_cols;
    if ($self->{_debug}) {
        print "\nDisplaying the Covariance Matrix for cluster:";
        display_matrix( $covariance );
    }
    return $covariance;
}
| 1064 |  |  |  |  |  |  |  | 
# Writes one file per cluster, named cluster0.dat, cluster1.dat, ..., each
# containing the space-separated symbolic labels of the records in that
# cluster on a single line.  Leftover cluster*.dat files from a previous run
# are removed first.
sub write_clusters_to_files {
    my $self = shift;
    my @clusters = @{$self->{_clusters}};
    unlink glob "cluster*.dat";
    foreach my $i (0..@clusters-1) {
        my $filename = "cluster" . $i . ".dat";
        print "\nWriting cluster $i to file $filename\n" if $self->{_terminal_output};
        # The old code piped through the shell ("| sort > $filename"), which
        # is unsafe for attacker-influenced filenames and — because no
        # newlines were ever printed — never actually sorted anything.
        # Write the single line directly with a lexical three-arg open.
        open my $fh, '>', $filename or die "Unable to open file: $!";
        foreach my $ele (@{$clusters[$i]}) {
            print $fh "$ele ";
        }
        print $fh "\n";    # `sort` used to supply the terminating newline
        close $fh or die "Unable to close file: $!";
    }
}
| 1079 |  |  |  |  |  |  |  | 
# Returns the best value of K discovered by an earlier clustering run made
# with the K=0 option (automatic determination of K).  Croaks when no such
# run has been performed yet.
sub get_K_best {
    my $self = shift;
    if ($self->{_K_best} eq 'unknown') {
        croak "You need to run the clusterer with K=0 option " .
              "before you can call this method";
    }
    print "\nThe best value of K: $self->{_K_best}\n" if $self->{_terminal_output};
    return $self->{_K_best};
}
| 1087 |  |  |  |  |  |  |  | 
# Prints the QoC (quality of clustering) value recorded for each K tried
# during a K=0 run; smaller QoC values indicate better clusterings.  Croaks
# when the clusterer has not been run with the K=0 option.
sub show_QoC_values {
    my $self = shift;
    croak "\nYou need to run the clusterer with K=0 option before you can call this method"
        if $self->{_K_best} eq 'unknown';
    print "\nShown below are K on the left and the QoC values on the right (the smaller " .
          "the QoC, the better it is)\n";
    my $qoc_for = $self->{_QoC_values};
    foreach my $k (sort keys %$qoc_for) {
        next unless defined $qoc_for->{$k};
        print " $k  =>  $qoc_for->{$k}\n";
    }
}
| 1098 |  |  |  |  |  |  |  | 
# Destructor: removes the temporary files that visualize_clusters() and
# visualize_data() may have written for this object's datafile.
sub DESTROY {
    my $self = shift;
    my $base = basename($self->{_datafile});
    unlink '__temp_' . $_ . $base for '', 'data_', 'normed_data_';
}
| 1104 |  |  |  |  |  |  |  | 
| 1105 |  |  |  |  |  |  | ##################################  Visualization Code ################################### | 
| 1106 |  |  |  |  |  |  |  | 
| 1107 |  |  |  |  |  |  | #  It makes sense to call visualize_clusters() only AFTER you have called kmeans(). | 
| 1108 |  |  |  |  |  |  | # | 
| 1109 |  |  |  |  |  |  | #  The visualize_clusters() implementation automatically figures out whether it | 
| 1110 |  |  |  |  |  |  | #  should do a 2D plot or a 3D plot.  If the number of on bits in the mask that is | 
| 1111 |  |  |  |  |  |  | #  supplied as one of the arguments is greater than 2, it does a 3D plot for the | 
| 1112 |  |  |  |  |  |  | #  first three data coordinates.  That is, the clusters will be displayed in the 3D | 
| 1113 |  |  |  |  |  |  | #  space formed by the first three data coordinates. On the other hand, if the number | 
| 1114 |  |  |  |  |  |  | #  of on bits in the mask is exactly 2, it does a 2D plot.  Should it happen that | 
| 1115 |  |  |  |  |  |  | #  only one on bit is specified for the mask, visualize_clusters() aborts. | 
| 1116 |  |  |  |  |  |  | # | 
| 1117 |  |  |  |  |  |  | #  The visualization code consists of first accessing each of clusters created by the | 
| 1118 |  |  |  |  |  |  | #  kmeans() subroutine.  Note that the clusters contain only the symbolic names for | 
| 1119 |  |  |  |  |  |  | #  the individual records in the source data file.  We therefore next reach into the | 
| 1120 |  |  |  |  |  |  | #  $self->{_original_data} hash and get the data coordinates associated with each | 
| 1121 |  |  |  |  |  |  | #  symbolic label in a cluster.  The numerical data thus generated is then written | 
| 1122 |  |  |  |  |  |  | #  out to a temp file.  When doing so we must remember to insert TWO BLANK LINES | 
| 1123 |  |  |  |  |  |  | #  between the data blocks corresponding to the different clusters.  This constraint | 
| 1124 |  |  |  |  |  |  | #  is imposed on us by Gnuplot when plotting data from the same file since we want to | 
| 1125 |  |  |  |  |  |  | #  use different point styles for the data points in different cluster files. | 
| 1126 |  |  |  |  |  |  | # | 
| 1127 |  |  |  |  |  |  | #  Subsequently, we call upon the Perl interface provided by the Graphics::GnuplotIF | 
| 1128 |  |  |  |  |  |  | #  module to plot the data clusters. | 
sub visualize_clusters {
    my $self = shift;
    my $v_mask;
    my $pause_time;
    # One arg: just the visualization mask.  Two args: mask plus the number
    # of seconds to keep the plot on screen (no interactive persistence).
    if (@_ == 1) {
        $v_mask = shift || croak "visualization mask missing";
    } elsif (@_ == 2) {
        $v_mask = shift || croak "visualization mask missing";
        $pause_time = shift;
    } else {
        croak "visualize_clusters() called with wrong args";
    }
    my $master_datafile = $self->{_datafile};
    my @v_mask = split //, $v_mask;
    my $visualization_mask_width = @v_mask;
    my $original_data_mask = $self->{_mask};
    my @mask = split //, $original_data_mask;
    my $data_field_width = scalar grep {$_ eq '1'} @mask;
    croak "\n\nABORTED: The width of the visualization mask (including " .
          "all its 1s and 0s) must equal the width of the original mask " .
          "used for reading the data file (counting only the 1's)"
          if $visualization_mask_width != $data_field_width;
    my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
    # Project every record onto the coordinates selected by the mask's 1s.
    my %visualization_data;
    while ( my ($record_id, $data) = each %{$self->{_original_data}} ) {
        my @fields = @$data;
        croak "\nABORTED: Visualization mask size exceeds data record size\n"
            if $#v_mask > $#fields;
        my @data_fields;
        foreach my $i (0..@fields-1) {
            if ($v_mask[$i] eq '0') {
                next;
            } elsif ($v_mask[$i] eq '1') {
                push @data_fields, $fields[$i];
            } else {
                croak "Misformed visualization mask. It can only have 1s and 0s\n";
            }
        }
        $visualization_data{ $record_id } = \@data_fields;
    }
    my @all_data_ids = @{$self->{_data_id_tags}};
    my $K = scalar @{$self->{_clusters}};
    my $filename = basename($master_datafile);
    my $temp_file = "__temp_" . $filename;
    unlink $temp_file if -e $temp_file;
    # Gnuplot wants the clusters as blank-line-separated blocks in a single
    # file so each block can get its own point style.  Lexical three-arg
    # open (the old two-arg bareword form was unsafe for filenames that
    # begin with mode characters).
    open my $out, '>', $temp_file
        or die "Unable to open a temp file in this directory: $!\n";
    foreach my $cluster (@{$self->{_clusters}}) {
        foreach my $item (@$cluster) {
            print $out "@{$visualization_data{$item}}";
            print $out "\n";
        }
        print $out "\n\n";   # TWO blank lines start a new gnuplot data block
    }
    close $out;
    my $plot;
    my $hardcopy_plot;
    if (!defined $pause_time) {
        $plot = Graphics::GnuplotIF->new( persist => 1 );
        $hardcopy_plot = Graphics::GnuplotIF->new();
        $hardcopy_plot->gnuplot_cmd('set terminal png', "set output \"clustering_results.png\"");
    } else {
        $plot = Graphics::GnuplotIF->new();
    }
    $plot->gnuplot_cmd( "set noclip" );
    $plot->gnuplot_cmd( "set pointsize 2" );
    # Build one plot clause per cluster: 3D (splot) when more than two
    # coordinates are displayed, 2D (plot) for exactly two.
    my $arg_string = "";
    if ($visualization_data_field_width > 2) {
        foreach my $i (0..$K-1) {
            my $j = $i + 1;
            $arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster $i\" with points lt $j pt $j, ";
        }
    } elsif ($visualization_data_field_width == 2) {
        foreach my $i (0..$K-1) {
            my $j = $i + 1;
            $arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster $i\" with points lt $j pt $j, ";
        }
    } elsif ($visualization_data_field_width == 1 ) {
        foreach my $i (0..$K-1) {
            my $j = $i + 1;
            $arg_string .= "\"$temp_file\" index $i using 1 title \"Cluster $i\" with points lt $j pt $j, ";
        }
    }
    # Strip the trailing ", " left by the loop above.  (The old code
    # assigned the boolean result of a match to $arg_string and then read
    # $1, which would have corrupted the string had the match ever failed.)
    $arg_string =~ s/,[ ]+$//;
    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd( "splot $arg_string" );
        $hardcopy_plot->gnuplot_cmd( "splot $arg_string" ) unless defined $pause_time;
        $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd( "plot $arg_string" );
        $hardcopy_plot->gnuplot_cmd( "plot $arg_string" ) unless defined $pause_time;
        $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
    } elsif ($visualization_data_field_width == 1) {
        croak "No provision for plotting 1-D data\n";
    }
}
| 1226 |  |  |  |  |  |  |  | 
| 1227 |  |  |  |  |  |  | #  It makes sense to call visualize_data() only AFTER you have called the method | 
| 1228 |  |  |  |  |  |  | #  read_data_from_file(). | 
| 1229 |  |  |  |  |  |  | # | 
| 1230 |  |  |  |  |  |  | #  The visualize_data() is meant for the visualization of the original data in its | 
| 1231 |  |  |  |  |  |  | #  various 2D or 3D subspaces.  The method can also be used to visualize the normed | 
| 1232 |  |  |  |  |  |  | #  data in a similar manner.  Recall the normed data is the original data after each | 
| 1233 |  |  |  |  |  |  | #  data dimension is normalized by the standard-deviation along that dimension. | 
| 1234 |  |  |  |  |  |  | # | 
| 1235 |  |  |  |  |  |  | #  Whether you see the original data or the normed data depends on the second | 
| 1236 |  |  |  |  |  |  | #  argument supplied in the method call.  It must be either the string 'original' or | 
| 1237 |  |  |  |  |  |  | #  the string 'normed'. | 
sub visualize_data {
    my $self = shift;
    my $v_mask = shift || croak "visualization mask missing";
    my $datatype = shift;    # must be either 'original' or 'normed'
    croak "\n\nABORTED: You called visualize_data() for normed data " .
          "but without first turning on data normalization in the " .
          "in the KMeans constructor"
          if ($datatype eq 'normed') && ! $self->{_var_normalize};
    my $master_datafile = $self->{_datafile};
    my @v_mask = split //, $v_mask;
    my $visualization_mask_width = @v_mask;
    my $original_data_mask = $self->{_mask};
    my @mask = split //, $original_data_mask;
    my $data_field_width = scalar grep {$_ eq '1'} @mask;
    croak "\n\nABORTED: The width of the visualization mask (including " .
          "all its 1s and 0s) must equal the width of the original mask " .
          "used for reading the data file (counting only the 1's)"
          if $visualization_mask_width != $data_field_width;
    my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
    my %visualization_data;
    # Choose between the raw data and the variance-normalized data.
    my $data_source;
    if ($datatype eq 'original') {
        $data_source  =  $self->{_original_data};
    } elsif ($datatype eq 'normed') {
        $data_source  =  $self->{_data};
    } else {
        croak "\n\nABORTED: improper call to visualize_data()";
    }
    # Project every record onto the coordinates selected by the mask's 1s.
    while ( my ($record_id, $data) = each %{$data_source} ) {
        my @fields = @$data;
        croak "\nABORTED: Visualization mask size exceeds data record size\n"
            if $#v_mask > $#fields;
        my @data_fields;
        foreach my $i (0..@fields-1) {
            if ($v_mask[$i] eq '0') {
                next;
            } elsif ($v_mask[$i] eq '1') {
                push @data_fields, $fields[$i];
            } else {
                croak "Misformed visualization mask. It can only have 1s and 0s\n";
            }
        }
        $visualization_data{ $record_id } = \@data_fields;
    }
    my $filename = basename($master_datafile);
    my $temp_file;
    if ($datatype eq 'original') {
        $temp_file = "__temp_data_" . $filename;
    } elsif ($datatype eq 'normed') {
        $temp_file = "__temp_normed_data_" . $filename;
    } else {
        croak "ABORTED: Improper call to visualize_data()";
    }
    unlink $temp_file if -e $temp_file;
    # Lexical three-arg open (the old two-arg bareword form was unsafe for
    # filenames beginning with mode characters).
    open my $out, '>', $temp_file
        or die "Unable to open a temp file in this directory: $!\n";
    foreach my $datapoint (values %visualization_data) {
        print $out "@$datapoint";
        print $out "\n";
    }
    close $out;
    my $plot = Graphics::GnuplotIF->new( persist => 1 );
    $plot->gnuplot_cmd( "set noclip" );
    $plot->gnuplot_cmd( "set pointsize 2" );
    my $plot_title =  $datatype eq 'original' ? '"data"' : '"normed data"';
    # splot for three or more displayed coordinates, plot for exactly two.
    my $arg_string ;
    if ($visualization_data_field_width > 2) {
        $arg_string = "\"$temp_file\" using 1:2:3 title $plot_title with points lt -1 pt 1";
    } elsif ($visualization_data_field_width == 2) {
        $arg_string = "\"$temp_file\" using 1:2 title $plot_title with points lt -1 pt 1";
    } elsif ($visualization_data_field_width == 1 ) {
        $arg_string = "\"$temp_file\" using 1 notitle with points lt -1 pt 1";
    }
    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd( "splot $arg_string" );
    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd( "plot $arg_string" );
    } elsif ($visualization_data_field_width == 1) {
        croak "No provision for plotting 1-D data\n";
    }
}
| 1319 |  |  |  |  |  |  |  | 
| 1320 |  |  |  |  |  |  | ###########################  Generating Synthetic Data for Clustering  ############################## | 
| 1321 |  |  |  |  |  |  |  | 
| 1322 |  |  |  |  |  |  | #  The data generated corresponds to a multivariate distribution.  The mean and the | 
| 1323 |  |  |  |  |  |  | #  covariance of each Gaussian in the distribution are specified individually in a | 
| 1324 |  |  |  |  |  |  | #  parameter file.  See the example parameter file param.txt in the examples | 
| 1325 |  |  |  |  |  |  | #  directory.  Just edit this file for your own needs. | 
| 1326 |  |  |  |  |  |  | # | 
| 1327 |  |  |  |  |  |  | #  The multivariate random numbers are generated by calling the Math::Random module. | 
| 1328 |  |  |  |  |  |  | #  As you would expect, that module will insist that the covariance matrix you | 
| 1329 |  |  |  |  |  |  | #  specify be symmetric and positive definite. | 
sub cluster_data_generator {
    my $class = shift;
    croak "illegal call of a class method" unless $class eq 'Algorithm::KMeans';
    my %args = @_;
    my $input_parameter_file = $args{input_parameter_file};
    my $output_file = $args{output_datafile};
    my $N = $args{number_data_points_per_cluster};
    my @all_params;
    my $param_string;
    if (defined $input_parameter_file) {
        # Three-arg open with a real error check; the old form
        # `open INPUT, $file || "message"` never died on failure because
        # `||` bound to the filename and there was no `die` at all.
        open my $param_fh, '<', $input_parameter_file
            or die "unable to open parameter file: $!";
        @all_params = <$param_fh>;
        close $param_fh;
        @all_params = grep { $_ !~ /^[ ]*#/ } @all_params;   # drop comment lines
        chomp @all_params;
        $param_string = join ' ', @all_params;
    } else {
        # Just for testing. Used in t/test.t
        $param_string = "cluster 5 0 0  1 0 0 0 1 0 0 0 1 " .
                        "cluster 0 5 0  1 0 0 0 1 0 0 0 1 " .
                        "cluster 0 0 5  1 0 0 0 1 0 0 0 1";
    }
    my @cluster_strings = split /[ ]*cluster[ ]*/, $param_string;
    @cluster_strings = grep  $_, @cluster_strings;
    my $K = @cluster_strings;
    # One point-label letter per cluster, so at most 12 (a conservative cap
    # well under the 26 letters available).
    croak "Too many clusters requested" if $K > 12;
    my @point_labels = ('a'..'z');
    print "Number of Gaussians used for the synthetic data: $K\n";
    my @means;
    my @covariances;
    my $data_dimension;
    foreach my $i (0..$K-1) {
        # Within a cluster spec, the mean and the covariance are separated
        # by TWO spaces; individual numbers by one.
        my @num_strings = split /  /, $cluster_strings[$i];
        my @cluster_mean = map {/$_num_regex/;$_} split / /, $num_strings[0];
        $data_dimension = @cluster_mean;
        push @means, \@cluster_mean;
        my @covariance_nums = map {/$_num_regex/;$_} split / /, $num_strings[1];
        croak "dimensionality error" if @covariance_nums !=
                                        ($data_dimension ** 2);
        # Reshape the flat list of d*d numbers into a d x d array-of-arrays.
        my $cluster_covariance;
        foreach my $j (0..$data_dimension-1) {
            foreach my $k (0..$data_dimension-1) {
                $cluster_covariance->[$j]->[$k] =
                    $covariance_nums[$j*$data_dimension + $k];
            }
        }
        push @covariances, $cluster_covariance;
    }
    random_seed_from_phrase( 'hellojello' );   # reproducible synthetic data
    my @data_dump;
    foreach my $i (0..$K-1) {
        my @m = @{shift @means};
        my @covar = @{shift @covariances};
        my @new_data = Math::Random::random_multivariate_normal( $N, @m, @covar );
        # Tag each point with a unique label such as a0, a1, ... for cluster
        # 'a'.  (The old code incremented the outer loop variable $i here —
        # a read-only alias — instead of the declared counter, which both
        # died at runtime and would have mislabeled the points.)
        my $p = 0;
        my $label = $point_labels[$i];
        @new_data = map {unshift @$_, $label.$p; $p++; $_} @new_data;
        push @data_dump, @new_data;
    }
    # Shuffle so the clusters are interleaved in the output file.
    fisher_yates_shuffle( \@data_dump );
    open my $out, '>', $output_file
        or die "unable to open output file $output_file: $!";
    foreach my $ele (@data_dump) {
        foreach my $coord ( @$ele ) {
            print $out "$coord ";
        }
        print $out "\n";
    }
    print "Data written out to file $output_file\n";
    close $out;
}
| 1399 |  |  |  |  |  |  |  | 
# Sums, coordinate by coordinate, the data vectors (from $self->{_data}) of
# the records whose symbolic names are listed in the argument array-ref.
# Returns a ref to the sum vector; used when computing cluster means.
sub add_point_coords {
    my ($self, $ids) = @_;
    my $dim = $self->{_data_dimensions};
    my @sums = (0.0) x $dim;
    foreach my $id (@$ids) {
        my $coords = $self->{_data}->{$id};
        $sums[$_] += $coords->[$_] for 0 .. $#{$coords};
    }
    return \@sums;
}
| 1418 |  |  |  |  |  |  |  | 
# Same as add_point_coords(), but the coordinates come from the
# un-normalized records in $self->{_original_data}.  Returns a ref to the
# coordinate-wise sum vector.
sub add_point_coords_from_original_data {
    my ($self, $ids) = @_;
    my $dim = $self->{_data_dimensions};
    my @sums = (0.0) x $dim;
    foreach my $id (@$ids) {
        my $coords = $self->{_original_data}->{$id};
        $sums[$_] += $coords->[$_] for 0 .. $#{$coords};
    }
    return \@sums;
}
| 1437 |  |  |  |  |  |  |  | 
| 1438 |  |  |  |  |  |  | ###################################   Support Routines  ######################################## | 
| 1439 |  |  |  |  |  |  |  | 
# Returns the index of the first element of the array (second argument, by
# reference) that is numerically equal to $value, or -1 when there is no match.
sub get_index_at_value {
    my ($value, $arr) = @_;
    for my $idx (0 .. $#{$arr}) {
        return $idx if $arr->[$idx] == $value;
    }
    return -1;
}
| 1448 |  |  |  |  |  |  |  | 
# Note: Perl's smartmatch operator (`~~`) never left experimental status and has
# since been deprecated, so this explicit comparison routine is kept.
# Numeric element-by-element equality of two equally-sized vectors (array refs).
# Returns 1 when every component matches, 0 otherwise; dies on a size mismatch.
sub vector_equal {
    my ($vec1, $vec2) = @_;
    die "wrong data types for vector-equal predicate\n" if @$vec1 != @$vec2;
    for my $k (0 .. $#{$vec1}) {
        return 0 unless $vec1->[$k] == $vec2->[$k];
    }
    return 1;
}
| 1460 |  |  |  |  |  |  |  | 
| 1461 |  |  |  |  |  |  | # Returns the minimum value and its positional index in an array | 
# Returns the minimum value in the referenced array together with its
# positional index, as the two-element list ($min, $index).
sub minimum {
    my $arr = shift;
    my ($min, $index);
    my $pos = 0;
    for my $val (@$arr) {
        # First element always wins; later ones only on a strict improvement,
        # so ties keep the earliest index.
        if (!defined $min or $val < $min) {
            ($min, $index) = ($val, $pos);
        }
        $pos++;
    }
    return ($min, $index);
}
| 1474 |  |  |  |  |  |  |  | 
# Returns ($min, $max) over the defined elements of the referenced array;
# undefined entries are skipped.  Both return values are undef when the
# array contains no defined elements.
sub minmax {
    my $arr = shift;
    my ($min, $max);
    for my $val (@$arr) {
        next unless defined $val;
        if (!defined $min) {
            # First defined element seeds both extremes.
            $min = $val;
            $max = $val;
        } elsif ($val < $min) {
            $min = $val;
        } elsif ($val > $max) {
            $max = $val;
        }
    }
    return ($min, $max);
}
| 1492 |  |  |  |  |  |  |  | 
| 1493 |  |  |  |  |  |  | # Meant only for constructing a deep copy of an array of arrays: | 
# Meant only for constructing a deep copy of an array of arrays:
sub deep_copy_AoA {
    my $ref_in = shift;
    my $ref_out;
    for my $row (0 .. $#{$ref_in}) {
        for my $col (0 .. $#{$ref_in->[$row]}) {
            $ref_out->[$row][$col] = $ref_in->[$row][$col];
        }
    }
    return $ref_out;
}
| 1504 |  |  |  |  |  |  |  | 
| 1505 |  |  |  |  |  |  | # Meant only for constructing a deep copy of an array of arrays for the case when | 
| 1506 |  |  |  |  |  |  | # some elements of the top-level array may be undefined: | 
# Meant only for constructing a deep copy of an array of arrays for the case
# when some elements of the top-level array may be undefined:
sub deep_copy_AoA_with_nulls {
    my $ref_in = shift;
    my $ref_out;
    for my $row (0 .. $#{$ref_in}) {
        unless (defined $ref_in->[$row]) {
            # Preserve undefined slots verbatim in the copy.
            $ref_out->[$row] = undef;
            next;
        }
        for my $col (0 .. $#{$ref_in->[$row]}) {
            $ref_out->[$row][$col] = $ref_in->[$row][$col];
        }
    }
    return $ref_out;
}
| 1521 |  |  |  |  |  |  |  | 
| 1522 |  |  |  |  |  |  | # Meant only for constructing a deep copy of a hash in which each value is an | 
| 1523 |  |  |  |  |  |  | # anonymous array of numbers: | 
# Meant only for constructing a deep copy of a hash in which each value is an
# anonymous array of numbers:
sub deep_copy_hash {
    my $ref_in = shift;
    my $ref_out;
    for my $key (keys %$ref_in) {
        $ref_out->{$key} = deep_copy_array( $ref_in->{$key} );
    }
    return $ref_out;
}
| 1532 |  |  |  |  |  |  |  | 
| 1533 |  |  |  |  |  |  | # Meant only for an array of numbers: | 
# Meant only for an array of numbers: returns a fresh array ref holding a copy
# of every element (undef for an empty input, matching the historic behavior).
sub deep_copy_array {
    my $ref_in = shift;
    my $ref_out;
    push @$ref_out, $_ for @$ref_in;
    return $ref_out;
}
| 1542 |  |  |  |  |  |  |  | 
# Prints, for each cluster in the referenced list, its size and its center.
# The center is the per-coordinate mean of the cluster's members, computed
# from the original (un-normalized) data and shown with 4 decimal places.
sub display_cluster_centers {
    my ($self, $clusters_ref) = @_;
    my $cluster_index = 0;
    for my $cluster (@$clusters_ref) {
        my $cluster_size = scalar @$cluster;
        my $sums = $self->add_point_coords_from_original_data( $cluster );
        my @cluster_center = map { $_ / $cluster_size } @$sums;
        print "\ncluster $cluster_index ($cluster_size records):\n";
        print "cluster center $cluster_index: " .
            "@{[map {my $x = sprintf('%.4f', $_); $x} @cluster_center]}\n";
        $cluster_index++;
    }
}
| 1558 |  |  |  |  |  |  |  | 
| 1559 |  |  |  |  |  |  | # For displaying the individual clusters on a terminal screen.  Each cluster is | 
| 1560 |  |  |  |  |  |  | # displayed through the symbolic names associated with the data points. | 
# For displaying the individual clusters on a terminal screen.  Each cluster is
# displayed through the symbolic names associated with the data points.
# Note the side effect: each cluster's member list is sorted in place.
sub display_clusters {
    my $clusters_ref = shift;
    my $cluster_index = 0;
    for my $cluster (@$clusters_ref) {
        @$cluster = sort @$cluster;
        my $cluster_size = scalar @$cluster;
        print "\n\nCluster $cluster_index ($cluster_size records):\n";
        print "  $_" for @$cluster;
        $cluster_index++;
    }
    print "\n\n";
}
| 1575 |  |  |  |  |  |  |  | 
| 1576 |  |  |  |  |  |  | # from perl docs: | 
# In-place Fisher-Yates shuffle of the referenced array (uniform random
# permutation).  Adapted from the perlfaq recipe.
#
# Fix: the original `my $i = @$arr; while (--$i)` misbehaved on an empty
# array ($i went 0 -> -1, which is true, triggering negative-index swaps on
# an empty array).  The counting loop below simply does nothing for arrays
# of size 0 or 1, which is the correct shuffle of those arrays.
sub fisher_yates_shuffle {
    my $arr = shift;
    for (my $i = @$arr - 1; $i > 0; $i--) {
        my $j = int rand( $i + 1 );
        @$arr[$i, $j] = @$arr[$j, $i];
    }
}
| 1585 |  |  |  |  |  |  |  | 
# Rescales every coordinate of every data point by the standard deviation of
# that coordinate over the whole data set, so all dimensions contribute
# comparably to distance calculations.  Takes a ref to a hash mapping point
# labels to coordinate array refs; returns a ref to a new hash of the same
# shape (the input hash is not modified).
#
# Fixes relative to the original: removed the dead `$new` temporary, and a
# zero-variance (constant) coordinate is now passed through unchanged instead
# of causing a division-by-zero crash.
sub variance_normalization {
    my %data_hash = %{shift @_};
    my @all_data_points = values %data_hash;
    # Regroup the data: element $i of @data_projections collects coordinate
    # $i of every data point.
    my @data_projections;
    foreach my $data_point (@all_data_points) {
        my $i = 0;
        push @{$data_projections[$i++]}, $_ for @$data_point;
    }
    # Per-coordinate standard deviations.
    my @std_dev_vec;
    foreach my $vec (@data_projections) {
        my (undef, $variance) = mean_and_variance( $vec );
        push @std_dev_vec, sqrt $variance;
    }
    my %new_data_hash;
    while (my ($label, $data) = each(%data_hash) ) {
        my @new_data;
        foreach my $i (0..@{$data}-1) {
            # A constant coordinate (std dev 0) cannot be rescaled; keep it.
            push @new_data,
                 $std_dev_vec[$i] ? $data->[$i] / $std_dev_vec[$i]
                                  : $data->[$i];
        }
        $new_data_hash{$label} = \@new_data;
    }
    return \%new_data_hash;
}
| 1613 |  |  |  |  |  |  |  | 
# Computes the mean and the population variance of the numbers in the
# referenced array in a single pass, using an incremental (running) update:
#
#     mean_i = ((i-1)/i) * mean_{i-1} + x_i / i
#     var_i  = ((i-1)/i) * var_{i-1}  + (x_i - mean_i)^2 / (i-1)
#
# Note that the variance update deliberately uses the *already updated* mean
# of the current step, which makes this recurrence exactly equivalent to the
# classical population-variance formula (divisor N, not N-1).  The statement
# order inside the else-branch therefore must not be swapped.
# Returns the two-element list ($mean, $variance); returns (undef, undef)
# for an empty input array.
sub mean_and_variance {
    my @data = @{shift @_};
    my ($mean, $variance);
    foreach my $i (1..@data) {
        if ($i == 1) {
            # A single sample: its mean is itself and its variance is zero.
            $mean = $data[0];
            $variance = 0;
        } else {
            $mean = ( (($i-1)/$i) * $mean ) + $data[$i-1] / $i;
            $variance = ( (($i-1)/$i) * $variance )  + ($data[$i-1]-$mean)**2 / ($i-1);
        }
    }
    return ($mean, $variance);
}
| 1628 |  |  |  |  |  |  |  | 
# Validates the parameter names passed to the constructor.  Returns 1 when
# every supplied name is one of the legal constructor options, 0 as soon as
# an unrecognized name is found.
#
# Fix: the original returned undef (not a defined boolean) when called with
# an empty parameter list, which triggered "uninitialized value" warnings in
# numeric comparisons at call sites; an empty list is now vacuously legal.
sub check_for_illegal_params {
    my @params = @_;
    my @legal_params = qw / datafile
                            mask
                            K
                            Kmin
                            Kmax
                            terminal_output
                            write_clusters_to_files
                            do_variance_normalization
                            cluster_seeding
                            use_mahalanobis_metric
                            debug
                          /;
    # Hash lookup replaces the original O(params x legals) nested scan.
    my %is_legal = map { $_ => 1 } @legal_params;
    foreach my $param (@params) {
        return 0 unless $is_legal{$param};
    }
    return 1;
}
| 1656 |  |  |  |  |  |  |  | 
# Builds a hash mapping each strictly positive array value to its index in
# the referenced array; zero and negative entries are skipped.  When a value
# repeats, the last occurrence's index wins.
sub get_value_index_hash {
    my $arr = shift;
    my %value_to_index;
    my $idx = 0;
    for my $val (@$arr) {
        $value_to_index{$val} = $idx if $val > 0;
        $idx++;
    }
    return \%value_to_index;
}
| 1665 |  |  |  |  |  |  |  | 
# 1-D non-maximum suppression for histogram-style arrays.  Returns a ref to
# an array of the same length that is zero everywhere except at the centers
# of runs that are strictly greater than both neighboring runs (local peaks,
# including plateau peaks), where the original value is kept.
#
# Fixes relative to the original: removed the unused `@final_output` and
# `%hash` lexicals, and a flat input (a single run of equal values) now
# returns all zeros instead of crashing on the out-of-range
# `$array_of_runs[-2]` access.
sub non_maximum_suppression {
    my $arr = shift;
    my @output = (0) x @$arr;
    # Compress the input into runs of equal consecutive values; each run is
    # stored as an array of its (identical) members so its length — needed
    # for locating the run's center — is recoverable.
    my @array_of_runs = ([$arr->[0]]);
    foreach my $index (1..@$arr-1) {
        if ($arr->[$index] == $arr->[$index-1]) {
            push @{$array_of_runs[-1]}, $arr->[$index];
        } else {
            push @array_of_runs, [$arr->[$index]];
        }
    }
    # A flat array has no local maxima.
    return \@output if @array_of_runs < 2;
    # Interior runs: survive when strictly higher than both neighbors.
    # $runstart_index tracks the input index where the current run begins.
    my $runstart_index = 0;
    foreach my $run_index (1..@array_of_runs-2) {
        $runstart_index += @{$array_of_runs[$run_index-1]};
        if ($array_of_runs[$run_index]->[0] > $array_of_runs[$run_index-1]->[0]  &&
            $array_of_runs[$run_index]->[0] > $array_of_runs[$run_index+1]->[0]) {
            # Keep the value at the (truncated) center of the plateau.
            my $run_center = @{$array_of_runs[$run_index]} / 2;
            my $assignment_index = $runstart_index + $run_center;
            $output[$assignment_index] = $arr->[$assignment_index];
        }
    }
    # Boundary runs only have one neighbor to beat.
    if ($array_of_runs[-1]->[0] > $array_of_runs[-2]->[0]) {
        $runstart_index += @{$array_of_runs[-2]};
        my $run_center = @{$array_of_runs[-1]} / 2;
        my $assignment_index = $runstart_index + $run_center;
        $output[$assignment_index] = $arr->[$assignment_index];
    }
    if ($array_of_runs[0]->[0] > $array_of_runs[1]->[0]) {
        my $run_center = @{$array_of_runs[0]} / 2;
        $output[$run_center] = $arr->[$run_center];
    }
    return \@output;
}
| 1703 |  |  |  |  |  |  |  | 
# Prints a Math::GSL::Matrix row by row, preceded by a header announcing its
# dimensions.  Purely for diagnostic output; returns nothing meaningful.
sub display_matrix {
    my $matrix = shift;
    my ($nrows, $ncols) = ($matrix->rows(), $matrix->cols());
    print "\n\nDisplaying matrix of size $nrows rows and $ncols columns:\n";
    for my $row_index (0 .. $nrows - 1) {
        my @row_as_list = $matrix->row($row_index)->as_list;
        print "@row_as_list\n";
    }
    print "\n\n";
}
| 1716 |  |  |  |  |  |  |  | 
# Returns a new Math::GSL::Matrix that is the transpose of the argument:
# row i of the input becomes column i of the result.
sub transpose {
    my $matrix = shift;
    my ($num_rows, $num_cols) = ($matrix->rows(), $matrix->cols());
    my $transpose = Math::GSL::Matrix->new($num_cols, $num_rows);
    for my $row_index (0 .. $num_rows - 1) {
        my @row_entries = $matrix->row($row_index)->as_list;
        $transpose->set_col($row_index, \@row_entries);
    }
    return $transpose;
}
| 1728 |  |  |  |  |  |  |  | 
# Component-wise difference of two equally-sized vectors (array refs).
# Returns the difference as a flat list; dies on a size mismatch.
sub vector_subtract {
    my ($vec1, $vec2) = @_;
    die "wrong data types for vector subtract calculation\n" if @$vec1 != @$vec2;
    return map { $vec1->[$_] - $vec2->[$_] } 0 .. $#{$vec1};
}
| 1739 |  |  |  |  |  |  |  | 
# Dot (inner) product of two equally-sized vectors passed as array refs.
# Dies when the sizes differ.
sub vector_multiply {
    my ($vec1, $vec2) = @_;
    die "vec_multiply called with two vectors of different sizes"
        unless @$vec1 == @$vec2;
    my $dot = 0;
    $dot += $vec1->[$_] * $vec2->[$_] for 0 .. $#{$vec1};
    return $dot;
}
| 1751 |  |  |  |  |  |  |  | 
# Multiplies two Math::GSL::Matrix operands.  Returns a plain scalar when the
# product is 1x1 (a row vector times a column vector, as in the Mahalanobis
# distance computation), and a Math::GSL::Matrix otherwise.
#
# Fixes relative to the original: the conformability check demanded
# nrows1 == ncols2 as well as ncols1 == nrows2, which restricted the routine
# to products that happen to be square; only ncols1 == nrows2 is required.
# The product is now correctly sized nrows1 x ncols2 (the original allocated
# nrows1 x nrows1), and the scalar fast path fires only for a genuine 1x1
# result.  All previously accepted inputs produce the same values as before.
sub matrix_multiply {
    my $matrix1 = shift;
    my $matrix2 = shift;
    my ($nrows1, $ncols1) = ($matrix1->rows(), $matrix1->cols());
    my ($nrows2, $ncols2) = ($matrix2->rows(), $matrix2->cols());
    die "matrix multiplication called with non-matching matrix arguments"
        unless $ncols1 == $nrows2;
    if ($nrows1 == 1 && $ncols2 == 1) {
        # Row vector times column vector: return the dot product as a scalar.
        my @row = $matrix1->row(0)->as_list;
        my @col = $matrix2->col(0)->as_list;
        my $result;
        foreach my $j (0..$ncols1-1) {
            $result += $row[$j] * $col[$j];
        }
        return $result;
    }
    my $product = Math::GSL::Matrix->new($nrows1, $ncols2);
    foreach my $i (0..$nrows1-1) {
        my $row = $matrix1->row($i);
        my @product_row;
        foreach my $j (0..$ncols2-1) {
            my $col = $matrix2->col($j);
            # Recursive call hits the scalar fast path (1 x k times k x 1).
            push @product_row, matrix_multiply($row, $col);
        }
        $product->set_row($i, \@product_row);
    }
    return $product;
}
| 1781 |  |  |  |  |  |  |  | 
| 1782 |  |  |  |  |  |  | 1; | 
| 1783 |  |  |  |  |  |  |  | 
| 1784 |  |  |  |  |  |  | =pod | 
| 1785 |  |  |  |  |  |  |  | 
| 1786 |  |  |  |  |  |  | =head1 NAME | 
| 1787 |  |  |  |  |  |  |  | 
| 1788 |  |  |  |  |  |  | Algorithm::KMeans - for clustering multidimensional data | 
| 1789 |  |  |  |  |  |  |  | 
| 1790 |  |  |  |  |  |  | =head1 SYNOPSIS | 
| 1791 |  |  |  |  |  |  |  | 
| 1792 |  |  |  |  |  |  | # You now have four different choices for clustering your data with this module: | 
| 1793 |  |  |  |  |  |  | # | 
| 1794 |  |  |  |  |  |  | #     1)  With Euclidean distances and with random cluster seeding | 
| 1795 |  |  |  |  |  |  | # | 
| 1796 |  |  |  |  |  |  | #     2)  With Mahalanobis distances and with random cluster seeding | 
| 1797 |  |  |  |  |  |  | # | 
| 1798 |  |  |  |  |  |  | #     3)  With Euclidean distances and with smart cluster seeding | 
| 1799 |  |  |  |  |  |  | # | 
| 1800 |  |  |  |  |  |  | #     4)  With Mahalanobis distances and with smart cluster seeding | 
| 1801 |  |  |  |  |  |  | # | 
| 1802 |  |  |  |  |  |  | # Despite the qualifier 'smart' in 'smart cluster seeding', it may not always | 
| 1803 |  |  |  |  |  |  | # produce results that are superior to those obtained with random seeding.  (If you | 
| 1804 |  |  |  |  |  |  | # also factor in the choice regarding variance normalization, you actually have | 
| 1805 |  |  |  |  |  |  | # eight different choices for data clustering with this module.) | 
| 1806 |  |  |  |  |  |  | # | 
| 1807 |  |  |  |  |  |  | # In all cases, you'd obviously begin with | 
| 1808 |  |  |  |  |  |  |  | 
| 1809 |  |  |  |  |  |  | use Algorithm::KMeans; | 
| 1810 |  |  |  |  |  |  |  | 
| 1811 |  |  |  |  |  |  | # You'd then name the data file: | 
| 1812 |  |  |  |  |  |  |  | 
| 1813 |  |  |  |  |  |  | my $datafile = "mydatafile.dat"; | 
| 1814 |  |  |  |  |  |  |  | 
| 1815 |  |  |  |  |  |  | # Next, set the mask to indicate which columns of the datafile to use for | 
| 1816 |  |  |  |  |  |  | # clustering and which column contains a symbolic ID for each data record. For | 
| 1817 |  |  |  |  |  |  | # example, if the symbolic name is in the first column, you want the second column | 
| 1818 |  |  |  |  |  |  | # to be ignored, and you want the next three columns to be used for 3D clustering, | 
| 1819 |  |  |  |  |  |  | # you'd set the mask to: | 
| 1820 |  |  |  |  |  |  |  | 
| 1821 |  |  |  |  |  |  | my $mask = "N0111"; | 
| 1822 |  |  |  |  |  |  |  | 
| 1823 |  |  |  |  |  |  | # Now construct an instance of the clusterer.  The parameter K controls the number | 
| 1824 |  |  |  |  |  |  | # of clusters.  If you know how many clusters you want (let's say 3), call | 
| 1825 |  |  |  |  |  |  |  | 
| 1826 |  |  |  |  |  |  | my $clusterer = Algorithm::KMeans->new( datafile        => $datafile, | 
| 1827 |  |  |  |  |  |  | mask            => $mask, | 
| 1828 |  |  |  |  |  |  | K               => 3, | 
| 1829 |  |  |  |  |  |  | cluster_seeding => 'random', | 
| 1830 |  |  |  |  |  |  | terminal_output => 1, | 
| 1831 |  |  |  |  |  |  | write_clusters_to_files => 1, | 
| 1832 |  |  |  |  |  |  | ); | 
| 1833 |  |  |  |  |  |  |  | 
| 1834 |  |  |  |  |  |  | # By default, this constructor call will set you up for clustering based on | 
| 1835 |  |  |  |  |  |  | # Euclidean distances.  If you want the module to use Mahalanobis distances, your | 
| 1836 |  |  |  |  |  |  | # constructor call will look like: | 
| 1837 |  |  |  |  |  |  |  | 
| 1838 |  |  |  |  |  |  | my $clusterer = Algorithm::KMeans->new( datafile        => $datafile, | 
| 1839 |  |  |  |  |  |  | mask            => $mask, | 
| 1840 |  |  |  |  |  |  | K               => 3, | 
| 1841 |  |  |  |  |  |  | cluster_seeding => 'random', | 
| 1842 |  |  |  |  |  |  | use_mahalanobis_metric => 1, | 
| 1843 |  |  |  |  |  |  | terminal_output => 1, | 
| 1844 |  |  |  |  |  |  | write_clusters_to_files => 1, | 
| 1845 |  |  |  |  |  |  | ); | 
| 1846 |  |  |  |  |  |  |  | 
| 1847 |  |  |  |  |  |  | # For both constructor calls shown above, you can use smart seeding of the clusters | 
| 1848 |  |  |  |  |  |  | # by changing 'random' to 'smart' for the cluster_seeding option.  See the | 
| 1849 |  |  |  |  |  |  | # explanation of smart seeding in the Methods section of this documentation. | 
| 1850 |  |  |  |  |  |  |  | 
| 1851 |  |  |  |  |  |  | # If your data is such that its variability along the different dimensions of the | 
| 1852 |  |  |  |  |  |  | # data space is significantly different, you may get better clustering if you first | 
| 1853 |  |  |  |  |  |  | # normalize your data by setting the constructor parameter | 
| 1854 |  |  |  |  |  |  | # do_variance_normalization as shown below: | 
| 1855 |  |  |  |  |  |  |  | 
| 1856 |  |  |  |  |  |  | my $clusterer = Algorithm::KMeans->new( datafile => $datafile, | 
| 1857 |  |  |  |  |  |  | mask     => $mask, | 
| 1858 |  |  |  |  |  |  | K        => 3, | 
| 1859 |  |  |  |  |  |  | cluster_seeding => 'smart',    # or 'random' | 
| 1860 |  |  |  |  |  |  | terminal_output => 1, | 
| 1861 |  |  |  |  |  |  | do_variance_normalization => 1, | 
| 1862 |  |  |  |  |  |  | write_clusters_to_files => 1, | 
| 1863 |  |  |  |  |  |  | ); | 
| 1864 |  |  |  |  |  |  |  | 
| 1865 |  |  |  |  |  |  | # But bear in mind that such data normalization may actually decrease the | 
| 1866 |  |  |  |  |  |  | # performance of the clusterer if the variability in the data is more a result of | 
| 1867 |  |  |  |  |  |  | # the separation between the means than a consequence of intra-cluster variance. | 
| 1868 |  |  |  |  |  |  |  | 
| 1869 |  |  |  |  |  |  | # Set K to 0 if you want the module to figure out the optimum number of clusters | 
| 1870 |  |  |  |  |  |  | # from the data. (It is best to run this option with the terminal_output set to 1 | 
| 1871 |  |  |  |  |  |  | # so that you can see the different value of QoC for the different K): | 
| 1872 |  |  |  |  |  |  |  | 
| 1873 |  |  |  |  |  |  | my $clusterer = Algorithm::KMeans->new( datafile => $datafile, | 
| 1874 |  |  |  |  |  |  | mask     => $mask, | 
| 1875 |  |  |  |  |  |  | K        => 0, | 
| 1876 |  |  |  |  |  |  | cluster_seeding => 'random',    # or 'smart' | 
| 1877 |  |  |  |  |  |  | terminal_output => 1, | 
| 1878 |  |  |  |  |  |  | write_clusters_to_files => 1, | 
| 1879 |  |  |  |  |  |  | ); | 
| 1880 |  |  |  |  |  |  |  | 
| 1881 |  |  |  |  |  |  | # Although not shown above, you can obviously set the 'do_variance_normalization' | 
| 1882 |  |  |  |  |  |  | # flag here also if you wish. | 
| 1883 |  |  |  |  |  |  |  | 
| 1884 |  |  |  |  |  |  | # For very large data files, setting K to 0 will result in searching through too | 
| 1885 |  |  |  |  |  |  | # many values for K.  For such cases, you can range limit the values of K to search | 
| 1886 |  |  |  |  |  |  | # through by | 
| 1887 |  |  |  |  |  |  |  | 
| 1888 |  |  |  |  |  |  | my $clusterer = Algorithm::KMeans->new( datafile => $datafile, | 
| 1889 |  |  |  |  |  |  | mask     => "N111", | 
| 1890 |  |  |  |  |  |  | Kmin     => 3, | 
| 1891 |  |  |  |  |  |  | Kmax     => 10, | 
| 1892 |  |  |  |  |  |  | cluster_seeding => 'random',    # or 'smart' | 
| 1893 |  |  |  |  |  |  | terminal_output => 1, | 
| 1894 |  |  |  |  |  |  | write_clusters_to_files => 1, | 
| 1895 |  |  |  |  |  |  | ); | 
| 1896 |  |  |  |  |  |  |  | 
| 1897 |  |  |  |  |  |  | # FOR ALL CASES ABOVE, YOU'D NEED TO MAKE THE FOLLOWING CALLS ON THE CLUSTERER | 
| 1898 |  |  |  |  |  |  | # INSTANCE TO ACTUALLY CLUSTER THE DATA: | 
| 1899 |  |  |  |  |  |  |  | 
| 1900 |  |  |  |  |  |  | $clusterer->read_data_from_file(); | 
| 1901 |  |  |  |  |  |  | $clusterer->kmeans(); | 
| 1902 |  |  |  |  |  |  |  | 
| 1903 |  |  |  |  |  |  | # If you want to directly access the clusters and the cluster centers in your own | 
| 1904 |  |  |  |  |  |  | # top-level script, replace the above two statements with: | 
| 1905 |  |  |  |  |  |  |  | 
| 1906 |  |  |  |  |  |  | $clusterer->read_data_from_file(); | 
| 1907 |  |  |  |  |  |  | my ($clusters_hash, $cluster_centers_hash) = $clusterer->kmeans(); | 
| 1908 |  |  |  |  |  |  |  | 
| 1909 |  |  |  |  |  |  | # You can subsequently access the clusters directly in your own code, as in: | 
| 1910 |  |  |  |  |  |  |  | 
| 1911 |  |  |  |  |  |  | foreach my $cluster_id (sort keys %{$clusters_hash}) { | 
| 1912 |  |  |  |  |  |  | print "\n$cluster_id   =>   @{$clusters_hash->{$cluster_id}}\n"; | 
| 1913 |  |  |  |  |  |  | } | 
| 1914 |  |  |  |  |  |  | foreach my $cluster_id (sort keys %{$cluster_centers_hash}) { | 
| 1915 |  |  |  |  |  |  | print "\n$cluster_id   =>   @{$cluster_centers_hash->{$cluster_id}}\n"; | 
| 1916 |  |  |  |  |  |  | } | 
| 1917 |  |  |  |  |  |  |  | 
| 1918 |  |  |  |  |  |  |  | 
| 1919 |  |  |  |  |  |  | # CLUSTER VISUALIZATION: | 
| 1920 |  |  |  |  |  |  |  | 
| 1921 |  |  |  |  |  |  | # You must first set the mask for cluster visualization. This mask tells the module | 
| 1922 |  |  |  |  |  |  | # which 2D or 3D subspace of the original data space you wish to visualize the | 
| 1923 |  |  |  |  |  |  | # clusters in: | 
| 1924 |  |  |  |  |  |  |  | 
| 1925 |  |  |  |  |  |  | my $visualization_mask = "111"; | 
| 1926 |  |  |  |  |  |  | $clusterer->visualize_clusters($visualization_mask); | 
| 1927 |  |  |  |  |  |  |  | 
| 1928 |  |  |  |  |  |  |  | 
| 1929 |  |  |  |  |  |  | # SYNTHETIC DATA GENERATION: | 
| 1930 |  |  |  |  |  |  |  | 
| 1931 |  |  |  |  |  |  | # The module has been provided with a class method for generating multivariate data | 
| 1932 |  |  |  |  |  |  | # for experimenting with clustering.  The data generation is controlled by the | 
| 1933 |  |  |  |  |  |  | # contents of the parameter file that is supplied as an argument to the data | 
| 1934 |  |  |  |  |  |  | # generator method.  The mean and covariance matrix entries in the parameter file | 
| 1935 |  |  |  |  |  |  | # must be according to the syntax shown in the param.txt file in the examples | 
| 1936 |  |  |  |  |  |  | # directory. It is best to edit this file as needed: | 
| 1937 |  |  |  |  |  |  |  | 
| 1938 |  |  |  |  |  |  | my $parameter_file = "param.txt"; | 
| 1939 |  |  |  |  |  |  | my $out_datafile = "mydatafile.dat"; | 
| 1940 |  |  |  |  |  |  | Algorithm::KMeans->cluster_data_generator( | 
| 1941 |  |  |  |  |  |  | input_parameter_file => $parameter_file, | 
| 1942 |  |  |  |  |  |  | output_datafile => $out_datafile, | 
| 1943 |  |  |  |  |  |  | number_data_points_per_cluster => $N ); | 
| 1944 |  |  |  |  |  |  |  | 
| 1945 |  |  |  |  |  |  | =head1 CHANGES | 
| 1946 |  |  |  |  |  |  |  | 
| 1947 |  |  |  |  |  |  | Version 2.03 incorporates minor code cleanup.  The main implementation of the module | 
| 1948 |  |  |  |  |  |  | remains unchanged. | 
| 1949 |  |  |  |  |  |  |  | 
| 1950 |  |  |  |  |  |  | Version 2.02 downshifts the version of Perl that is required for this module.  The | 
| 1951 |  |  |  |  |  |  | module should work with versions 5.10 and higher of Perl.  The implementation code | 
| 1952 |  |  |  |  |  |  | for the module remains unchanged. | 
| 1953 |  |  |  |  |  |  |  | 
| 1954 |  |  |  |  |  |  | Version 2.01 removes many errors in the documentation. The changes made to the module | 
| 1955 |  |  |  |  |  |  | in Version 2.0 were not reflected properly in the documentation page for that | 
| 1956 |  |  |  |  |  |  | version.  The implementation code remains unchanged. | 
| 1957 |  |  |  |  |  |  |  | 
| 1958 |  |  |  |  |  |  | Version 2.0 includes significant additional functionality: (1) You now have the | 
| 1959 |  |  |  |  |  |  | option to cluster using the Mahalanobis distance metric (the default is the Euclidean | 
metric); and (2) With the two new cluster-assignment methods that have been added to the
module, you can now determine the best cluster for a new data sample after you have
| 1962 |  |  |  |  |  |  | created the clusters with the previously available data.  Finding the best cluster | 
| 1963 |  |  |  |  |  |  | for a new data sample can be done using either the Euclidean metric or the | 
| 1964 |  |  |  |  |  |  | Mahalanobis metric. | 
| 1965 |  |  |  |  |  |  |  | 
| 1966 |  |  |  |  |  |  | Version 1.40 includes a C option for seeding the clusters.  This option, | 
| 1967 |  |  |  |  |  |  | supplied through the constructor parameter C, means that the | 
| 1968 |  |  |  |  |  |  | clusterer will (1) Subject the data to principal components analysis in order to | 
| 1969 |  |  |  |  |  |  | determine the maximum variance direction; (2) Project the data onto this direction; | 
| 1970 |  |  |  |  |  |  | (3) Find peaks in a smoothed histogram of the projected points; and (4) Use the | 
| 1971 |  |  |  |  |  |  | locations of the highest peaks as initial guesses for the cluster centers.  If you | 
| 1972 |  |  |  |  |  |  | don't want to use this option, set C<cluster_seeding> to C<random>. That should work | 
| 1973 |  |  |  |  |  |  | as in the previous version of the module. | 
| 1974 |  |  |  |  |  |  |  | 
| 1975 |  |  |  |  |  |  | Version 1.30 includes a bug fix for the case when the datafile contains empty lines, | 
| 1976 |  |  |  |  |  |  | that is, lines with no data records.  Another bug fix in Version 1.30 deals with the | 
| 1977 |  |  |  |  |  |  | case when you want the module to figure out how many clusters to form (this is the | 
| 1978 |  |  |  |  |  |  | C option in the constructor call) and the number of data records is close to the | 
| 1979 |  |  |  |  |  |  | minimum. | 
| 1980 |  |  |  |  |  |  |  | 
| 1981 |  |  |  |  |  |  | Version 1.21 includes fixes to handle the possibility that, when clustering the data | 
| 1982 |  |  |  |  |  |  | for a fixed number of clusters, a cluster may become empty during iterative | 
| 1983 |  |  |  |  |  |  | calculation of cluster assignments of the data elements and the updating of the | 
| 1984 |  |  |  |  |  |  | cluster centers.  The code changes are in the C and | 
| 1985 |  |  |  |  |  |  | C subroutines. | 
| 1986 |  |  |  |  |  |  |  | 
| 1987 |  |  |  |  |  |  | Version 1.20 includes an option to normalize the data with respect to its variability | 
| 1988 |  |  |  |  |  |  | along the different coordinates before clustering is carried out. | 
| 1989 |  |  |  |  |  |  |  | 
| 1990 |  |  |  |  |  |  | Version 1.1.1 allows for range limiting the values of C<K> to search through.  C<K> | 
| 1991 |  |  |  |  |  |  | stands for the number of clusters to form.  This version also declares the module | 
| 1992 |  |  |  |  |  |  | dependencies in the C<Makefile.PL> file. | 
| 1993 |  |  |  |  |  |  |  | 
| 1994 |  |  |  |  |  |  | Version 1.1 is an object-oriented version of the implementation presented in | 
| 1995 |  |  |  |  |  |  | version 1.0.  The current version should lend itself more easily to code extension. | 
| 1996 |  |  |  |  |  |  | You could, for example, create your own class by subclassing from the class presented | 
| 1997 |  |  |  |  |  |  | here and, in your subclass, use your own criteria for the similarity distance between | 
| 1998 |  |  |  |  |  |  | the data points and for the QoC (Quality of Clustering) metric, and, possibly a | 
| 1999 |  |  |  |  |  |  | different rule to stop the iterations.  Version 1.1 also allows you to directly | 
| 2000 |  |  |  |  |  |  | access the clusters formed and the cluster centers in your calling script. | 
| 2001 |  |  |  |  |  |  |  | 
| 2002 |  |  |  |  |  |  |  | 
| 2003 |  |  |  |  |  |  | =head1 SPECIAL USAGE NOTE | 
| 2004 |  |  |  |  |  |  |  | 
| 2005 |  |  |  |  |  |  | If you were directly accessing in your own scripts the clusters produced by the older | 
| 2006 |  |  |  |  |  |  | versions of this module, you'd need to make changes to your code if you wish to use | 
| 2007 |  |  |  |  |  |  | Version 2.0 or higher.  Instead of returning arrays of clusters and cluster centers, | 
| 2008 |  |  |  |  |  |  | Versions 2.0 and higher return hashes. This change was made necessary by the logic | 
| 2009 |  |  |  |  |  |  | required for implementing the two new C methods that were introduced | 
| 2010 |  |  |  |  |  |  | in Version 2.0.  These methods return the best cluster for a new data sample from the | 
| 2011 |  |  |  |  |  |  | clusters you created using the existing data.  One of the C methods is | 
| 2012 |  |  |  |  |  |  | based on the Euclidean metric for finding the cluster that is closest to the new data | 
| 2013 |  |  |  |  |  |  | sample, and the other on the Mahalanobis metric.  Another point of incompatibility | 
| 2014 |  |  |  |  |  |  | with the previous versions is that you must now explicitly set the C<cluster_seeding> | 
| 2015 |  |  |  |  |  |  | parameter in the call to the constructor to either C<random> or C<smart>.  This | 
| 2016 |  |  |  |  |  |  | parameter does not have a default associated with it starting with Version 2.0. | 
| 2017 |  |  |  |  |  |  |  | 
| 2018 |  |  |  |  |  |  |  | 
| 2019 |  |  |  |  |  |  | =head1 DESCRIPTION | 
| 2020 |  |  |  |  |  |  |  | 
| 2021 |  |  |  |  |  |  | Clustering with K-Means takes place iteratively and involves two steps: 1) assignment | 
| 2022 |  |  |  |  |  |  | of data samples to clusters on the basis of how far the data samples are from the | 
| 2023 |  |  |  |  |  |  | cluster centers; and 2) Recalculation of the cluster centers (and cluster covariances | 
| 2024 |  |  |  |  |  |  | if you are using the Mahalanobis distance metric for clustering). | 
| 2025 |  |  |  |  |  |  |  | 
| 2026 |  |  |  |  |  |  | Obviously, before the two-step approach can proceed, we need to initialize the | 
| 2027 |  |  |  |  |  |  | cluster centers.  How this initialization is carried out is important.  The module | 
| 2028 |  |  |  |  |  |  | gives you two very different ways for carrying out this initialization.  One option, | 
| 2029 |  |  |  |  |  |  | called the C option, consists of subjecting the data to principal components | 
| 2030 |  |  |  |  |  |  | analysis to discover the direction of maximum variance in the data space.  The data | 
| 2031 |  |  |  |  |  |  | points are then projected on to this direction and a histogram constructed from the | 
| 2032 |  |  |  |  |  |  | projections.  Centers of the smoothed histogram are used to seed the clustering | 
| 2033 |  |  |  |  |  |  | operation.  The other option is to choose the cluster centers purely randomly.  You | 
| 2034 |  |  |  |  |  |  | get the first option if you set C<cluster_seeding> to C<smart> in the constructor, | 
| 2035 |  |  |  |  |  |  | and you get the second option if you set it to C<random>. | 
| 2036 |  |  |  |  |  |  |  | 
| 2037 |  |  |  |  |  |  | How to specify the number of clusters, C<K>, is one of the most vexing issues in any | 
| 2038 |  |  |  |  |  |  | approach to clustering.  In some cases, we can set C<K> on the basis of prior | 
| 2039 |  |  |  |  |  |  | knowledge.  But, more often than not, no such prior knowledge is available.  When the | 
| 2040 |  |  |  |  |  |  | programmer does not explicitly specify a value for C<K>, the approach taken in the | 
| 2041 |  |  |  |  |  |  | current implementation is to try all possible values between 2 and some largest | 
| 2042 |  |  |  |  |  |  | possible value that makes statistical sense.  We then choose that value for C<K> | 
| 2043 |  |  |  |  |  |  | which yields the best value for the QoC (Quality of Clustering) metric.  It is | 
| 2044 |  |  |  |  |  |  | generally believed that the largest value for C<K> should not exceed C<sqrt(N/2)> | 
| 2045 |  |  |  |  |  |  | where C<N> is the number of data samples to be clustered. | 
| 2046 |  |  |  |  |  |  |  | 
| 2047 |  |  |  |  |  |  | What to use for the QoC metric is obviously a critical issue unto itself.  In the | 
| 2048 |  |  |  |  |  |  | current implementation, the value of QoC is the ratio of the average radius of the | 
| 2049 |  |  |  |  |  |  | clusters and the average distance between the cluster centers. | 
| 2050 |  |  |  |  |  |  |  | 
| 2051 |  |  |  |  |  |  | Every iterative algorithm requires a stopping criterion.  The criterion implemented | 
| 2052 |  |  |  |  |  |  | here is that we stop iterations when there is no re-assignment of the data points | 
| 2053 |  |  |  |  |  |  | during the assignment step. | 
| 2054 |  |  |  |  |  |  |  | 
| 2055 |  |  |  |  |  |  | Ordinarily, the output produced by a K-Means clusterer will correspond to a local | 
| 2056 |  |  |  |  |  |  | minimum for the QoC values, as opposed to a global minimum.  The current | 
| 2057 |  |  |  |  |  |  | implementation protects against that when the module constructor is called with the | 
| 2058 |  |  |  |  |  |  | C option for C by trying different randomly selected initial | 
| 2059 |  |  |  |  |  |  | cluster centers and then selecting the one that gives the best overall QoC value. | 
| 2060 |  |  |  |  |  |  |  | 
| 2061 |  |  |  |  |  |  | A K-Means clusterer will generally produce good results if the overlap between the | 
| 2062 |  |  |  |  |  |  | clusters is minimal and if each cluster exhibits variability that is uniform in all | 
| 2063 |  |  |  |  |  |  | directions.  When the data variability is different along the different directions in | 
| 2064 |  |  |  |  |  |  | the data space, the results you obtain with a K-Means clusterer may be improved by | 
| 2065 |  |  |  |  |  |  | first normalizing the data appropriately, as can be done in this module when you set | 
| 2066 |  |  |  |  |  |  | the C option in the constructor.  However, as pointed out | 
| 2067 |  |  |  |  |  |  | elsewhere in this documentation, such normalization may actually decrease the | 
| 2068 |  |  |  |  |  |  | performance of the clusterer if the overall data variability along any dimension is | 
| 2069 |  |  |  |  |  |  | more a result of separation between the means than a consequence of intra-cluster | 
| 2070 |  |  |  |  |  |  | variability. | 
| 2071 |  |  |  |  |  |  |  | 
| 2072 |  |  |  |  |  |  |  | 
| 2073 |  |  |  |  |  |  | =head1 METHODS | 
| 2074 |  |  |  |  |  |  |  | 
| 2075 |  |  |  |  |  |  | The module provides the following methods for clustering, for cluster visualization, | 
| 2076 |  |  |  |  |  |  | for data visualization, for the generation of data for testing a clustering | 
| 2077 |  |  |  |  |  |  | algorithm, and for determining the cluster membership of a new data sample: | 
| 2078 |  |  |  |  |  |  |  | 
| 2079 |  |  |  |  |  |  | =over 4 | 
| 2080 |  |  |  |  |  |  |  | 
| 2081 |  |  |  |  |  |  | =item B | 
| 2082 |  |  |  |  |  |  |  | 
| 2083 |  |  |  |  |  |  | my $clusterer = Algorithm::KMeans->new(datafile        => $datafile, | 
| 2084 |  |  |  |  |  |  | mask            => $mask, | 
| 2085 |  |  |  |  |  |  | K               => $K, | 
| 2086 |  |  |  |  |  |  | cluster_seeding => 'random',     # also try 'smart' | 
| 2087 |  |  |  |  |  |  | use_mahalanobis_metric => 1,     # also try '0' | 
| 2088 |  |  |  |  |  |  | terminal_output => 1, | 
| 2089 |  |  |  |  |  |  | write_clusters_to_files => 1, | 
| 2090 |  |  |  |  |  |  | ); | 
| 2091 |  |  |  |  |  |  |  | 
| 2092 |  |  |  |  |  |  | A call to C constructs a new instance of the C class.  When | 
| 2093 |  |  |  |  |  |  | C<$K> is a non-zero positive integer, the module will construct exactly that many | 
| 2094 |  |  |  |  |  |  | clusters.  However, when C<$K> is 0, the module will find the best number of clusters | 
| 2095 |  |  |  |  |  |  | to partition the data into.  As explained in the Description, setting | 
| 2096 |  |  |  |  |  |  | C to C causes PCA (principal components analysis) to be used | 
| 2097 |  |  |  |  |  |  | for discovering the best choices for the initial cluster centers.  If you want purely | 
| 2098 |  |  |  |  |  |  | random decisions to be made for the initial choices for the cluster centers, set | 
| 2099 |  |  |  |  |  |  | C to C. | 
| 2100 |  |  |  |  |  |  |  | 
| 2101 |  |  |  |  |  |  | The data file is expected to contain entries in the following format | 
| 2102 |  |  |  |  |  |  |  | 
| 2103 |  |  |  |  |  |  | c20  0  10.7087017086940  9.63528386251712  10.9512155258108  ... | 
| 2104 |  |  |  |  |  |  | c7   0  12.8025925026787  10.6126270065785  10.5228482095349  ... | 
| 2105 |  |  |  |  |  |  | b9   0  7.60118206283120  5.05889245193079  5.82841781759102  ... | 
| 2106 |  |  |  |  |  |  | .... | 
| 2107 |  |  |  |  |  |  | .... | 
| 2108 |  |  |  |  |  |  |  | 
| 2109 |  |  |  |  |  |  | where the first column contains the symbolic ID tag for each data record and the rest | 
| 2110 |  |  |  |  |  |  | of the columns the numerical information.  As to which columns are actually used for | 
| 2111 |  |  |  |  |  |  | clustering is decided by the string value of the mask.  For example, if we wanted to | 
| 2112 |  |  |  |  |  |  | cluster on the basis of the entries in just the 3rd, the 4th, and the 5th columns | 
| 2113 |  |  |  |  |  |  | above, the mask value would be C where the character C indicates that the | 
| 2114 |  |  |  |  |  |  | ID tag is in the first column, the character C<0> that the second column is to be | 
| 2115 |  |  |  |  |  |  | ignored, and the C<1>'s that follow that the 3rd, the 4th, and the 5th columns are to | 
| 2116 |  |  |  |  |  |  | be used for clustering. | 
| 2117 |  |  |  |  |  |  |  | 
| 2118 |  |  |  |  |  |  | If you wish for the clusterer to search through a C<(Kmin,Kmax)> range of values for | 
| 2119 |  |  |  |  |  |  | C, the constructor should be called in the following fashion: | 
| 2120 |  |  |  |  |  |  |  | 
| 2121 |  |  |  |  |  |  | my $clusterer = Algorithm::KMeans->new(datafile => $datafile, | 
| 2122 |  |  |  |  |  |  | mask     => $mask, | 
| 2123 |  |  |  |  |  |  | Kmin     => 3, | 
| 2124 |  |  |  |  |  |  | Kmax     => 10, | 
| 2125 |  |  |  |  |  |  | cluster_seeding => 'smart',   # try 'random' also | 
| 2126 |  |  |  |  |  |  | terminal_output => 1, | 
| 2127 |  |  |  |  |  |  | ); | 
| 2128 |  |  |  |  |  |  |  | 
| 2129 |  |  |  |  |  |  | where obviously you can choose any reasonable values for C and C.  If you | 
| 2130 |  |  |  |  |  |  | choose a value for C that is statistically too large, the module will let you | 
| 2131 |  |  |  |  |  |  | know. Again, you may choose C for C, the default value being | 
| 2132 |  |  |  |  |  |  | C. | 
| 2133 |  |  |  |  |  |  |  | 
| 2134 |  |  |  |  |  |  | If you believe that the variability of the data is very different along the different | 
| 2135 |  |  |  |  |  |  | dimensions of the data space, you may get better clustering by first normalizing the | 
| 2136 |  |  |  |  |  |  | data coordinates by the standard-deviations along those directions.  When you set the | 
| 2137 |  |  |  |  |  |  | constructor option C as shown below, the module uses the | 
| 2138 |  |  |  |  |  |  | overall data standard-deviation along a direction for the normalization in that | 
| 2139 |  |  |  |  |  |  | direction.  (As mentioned elsewhere in the documentation, such a normalization could | 
| 2140 |  |  |  |  |  |  | backfire on you if the data variability along a dimension is more a result of the | 
| 2141 |  |  |  |  |  |  | separation between the means than a consequence of the intra-cluster variability.): | 
| 2142 |  |  |  |  |  |  |  | 
| 2143 |  |  |  |  |  |  | my $clusterer = Algorithm::KMeans->new( datafile => $datafile, | 
| 2144 |  |  |  |  |  |  | mask     => "N111", | 
| 2145 |  |  |  |  |  |  | K        => 2, | 
| 2146 |  |  |  |  |  |  | cluster_seeding => 'smart',   # try 'random' also | 
| 2147 |  |  |  |  |  |  | terminal_output => 1, | 
| 2148 |  |  |  |  |  |  | do_variance_normalization => 1, | 
| 2149 |  |  |  |  |  |  | ); | 
| 2150 |  |  |  |  |  |  |  | 
| 2151 |  |  |  |  |  |  | =back | 
| 2152 |  |  |  |  |  |  |  | 
| 2153 |  |  |  |  |  |  | =head2 Constructor Parameters | 
| 2154 |  |  |  |  |  |  |  | 
| 2155 |  |  |  |  |  |  | =over 8 | 
| 2156 |  |  |  |  |  |  |  | 
| 2157 |  |  |  |  |  |  | =item C: | 
| 2158 |  |  |  |  |  |  |  | 
| 2159 |  |  |  |  |  |  | This parameter names the data file that contains the multidimensional data records | 
| 2160 |  |  |  |  |  |  | you want the module to cluster. | 
| 2161 |  |  |  |  |  |  |  | 
| 2162 |  |  |  |  |  |  | =item C: | 
| 2163 |  |  |  |  |  |  |  | 
| 2164 |  |  |  |  |  |  | This parameter supplies the mask to be applied to the columns of your data file. See | 
| 2165 |  |  |  |  |  |  | the explanation in Synopsis for what this mask looks like. | 
| 2166 |  |  |  |  |  |  |  | 
| 2167 |  |  |  |  |  |  | =item C: | 
| 2168 |  |  |  |  |  |  |  | 
| 2169 |  |  |  |  |  |  | This parameter supplies the number of clusters you are looking for.  If you set this | 
| 2170 |  |  |  |  |  |  | option to 0, that means that you want the module to search for the best value for | 
| 2171 |  |  |  |  |  |  | C.  (Keep in mind the fact that searching for the best C may take a long time | 
| 2172 |  |  |  |  |  |  | for large data files.) | 
| 2173 |  |  |  |  |  |  |  | 
| 2174 |  |  |  |  |  |  | =item C: | 
| 2175 |  |  |  |  |  |  |  | 
| 2176 |  |  |  |  |  |  | If you supply an integer value for C, the search for the best C will begin | 
| 2177 |  |  |  |  |  |  | with that value. | 
| 2178 |  |  |  |  |  |  |  | 
| 2179 |  |  |  |  |  |  | =item C: | 
| 2180 |  |  |  |  |  |  |  | 
| 2181 |  |  |  |  |  |  | If you supply an integer value for C, the search for the best C will end at | 
| 2182 |  |  |  |  |  |  | that value. | 
| 2183 |  |  |  |  |  |  |  | 
| 2184 |  |  |  |  |  |  | =item C: | 
| 2185 |  |  |  |  |  |  |  | 
| 2186 |  |  |  |  |  |  | This parameter must be set to either C or C.  Depending on your data, | 
| 2187 |  |  |  |  |  |  | you may get superior clustering with the C option.  The choice C means | 
| 2188 |  |  |  |  |  |  | that the clusterer will (1) subject the data to principal components analysis to | 
| 2189 |  |  |  |  |  |  | determine the maximum variance direction; (2) project the data onto this direction; | 
| 2190 |  |  |  |  |  |  | (3) find peaks in a smoothed histogram of the projected points; and (4) use the | 
| 2191 |  |  |  |  |  |  | locations of the highest peaks as seeds for cluster centers.  If the C option | 
| 2192 |  |  |  |  |  |  | produces bizarre results, try C. | 
| 2193 |  |  |  |  |  |  |  | 
| 2194 |  |  |  |  |  |  | =item C: | 
| 2195 |  |  |  |  |  |  |  | 
| 2196 |  |  |  |  |  |  | When set to 1, this option causes Mahalanobis distances to be used for clustering. | 
| 2197 |  |  |  |  |  |  | The default is 0 for this parameter. By default, the module uses the Euclidean | 
| 2198 |  |  |  |  |  |  | distances for clustering.  In general, Mahalanobis distance based clustering will | 
| 2199 |  |  |  |  |  |  | fail if you seek too many clusters, your data dimensionality is high, and you do not | 
| 2200 |  |  |  |  |  |  | have a sufficient number of samples in your data file.  A necessary requirement for | 
| 2201 |  |  |  |  |  |  | the module to be able to compute Mahalanobis distances is that the cluster covariance | 
| 2202 |  |  |  |  |  |  | matrices be non-singular. (Let's say your data dimensionality is C and the module | 
| 2203 |  |  |  |  |  |  | is considering a cluster that has only C samples in it where C is less than | 
| 2204 |  |  |  |  |  |  | C.  In this case, the covariance matrix will be singular since its rank will not | 
| 2205 |  |  |  |  |  |  | exceed C.  For the covariance matrix to be non-singular, it must be of full rank, | 
| 2206 |  |  |  |  |  |  | that is, its rank must be C.) | 
| 2207 |  |  |  |  |  |  |  | 
| 2208 |  |  |  |  |  |  | =item C: | 
| 2209 |  |  |  |  |  |  |  | 
| 2210 |  |  |  |  |  |  | When set, the module will first normalize the data variance along the different | 
| 2211 |  |  |  |  |  |  | dimensions of the data space before attempting clustering.  Depending on your data, | 
| 2212 |  |  |  |  |  |  | this option may or may not result in better clustering. | 
| 2213 |  |  |  |  |  |  |  | 
| 2214 |  |  |  |  |  |  | =item C: | 
| 2215 |  |  |  |  |  |  |  | 
| 2216 |  |  |  |  |  |  | This boolean parameter, when not supplied in the call to C, defaults to 0. | 
| 2217 |  |  |  |  |  |  | When set, you will see in your terminal window the different clusters as lists of the | 
| 2218 |  |  |  |  |  |  | symbolic IDs and their cluster centers. You will also see the QoC (Quality of | 
| 2219 |  |  |  |  |  |  | Clustering) values for the clusters displayed. | 
| 2220 |  |  |  |  |  |  |  | 
| 2221 |  |  |  |  |  |  | =item C: | 
| 2222 |  |  |  |  |  |  |  | 
| 2223 |  |  |  |  |  |  | This parameter is also boolean.  When set to 1, the clusters are written out to files | 
| 2224 |  |  |  |  |  |  | that are named in the following manner: | 
| 2225 |  |  |  |  |  |  |  | 
| 2226 |  |  |  |  |  |  | cluster0.dat | 
| 2227 |  |  |  |  |  |  | cluster1.dat | 
| 2228 |  |  |  |  |  |  | cluster2.dat | 
| 2229 |  |  |  |  |  |  | ... | 
| 2230 |  |  |  |  |  |  | ... | 
| 2231 |  |  |  |  |  |  |  | 
| 2232 |  |  |  |  |  |  | Before the clusters are written to these files, the module destroys all files with | 
| 2233 |  |  |  |  |  |  | such names in the directory in which you call the module. | 
| 2234 |  |  |  |  |  |  |  | 
| 2235 |  |  |  |  |  |  |  | 
| 2236 |  |  |  |  |  |  | =back | 
| 2237 |  |  |  |  |  |  |  | 
| 2238 |  |  |  |  |  |  | =over | 
| 2239 |  |  |  |  |  |  |  | 
| 2240 |  |  |  |  |  |  | =item B | 
| 2241 |  |  |  |  |  |  |  | 
| 2242 |  |  |  |  |  |  | $clusterer->read_data_from_file() | 
| 2243 |  |  |  |  |  |  |  | 
| 2244 |  |  |  |  |  |  | =item B | 
| 2245 |  |  |  |  |  |  |  | 
| 2246 |  |  |  |  |  |  | $clusterer->kmeans(); | 
| 2247 |  |  |  |  |  |  |  | 
| 2248 |  |  |  |  |  |  | or | 
| 2249 |  |  |  |  |  |  |  | 
| 2250 |  |  |  |  |  |  | my ($clusters_hash, $cluster_centers_hash) = $clusterer->kmeans(); | 
| 2251 |  |  |  |  |  |  |  | 
| 2252 |  |  |  |  |  |  | The first call above works solely by side-effect.  The second call also returns the | 
| 2253 |  |  |  |  |  |  | clusters and the cluster centers. See the C script in the | 
| 2254 |  |  |  |  |  |  | C directory for how you can in your own code extract the clusters and the | 
| 2255 |  |  |  |  |  |  | cluster centers from the variables C<$clusters_hash> and C<$cluster_centers_hash>. | 
| 2256 |  |  |  |  |  |  |  | 
| 2257 |  |  |  |  |  |  | =item B | 
| 2258 |  |  |  |  |  |  |  | 
| 2259 |  |  |  |  |  |  | $clusterer->get_K_best(); | 
| 2260 |  |  |  |  |  |  |  | 
| 2261 |  |  |  |  |  |  | This call makes sense only if you supply either the C option to the constructor, | 
| 2262 |  |  |  |  |  |  | or if you specify values for the C and C options. The C and the | 
| 2263 |  |  |  |  |  |  | C<(Kmin,Kmax)> options cause the module to determine the best value for C. | 
| 2264 |  |  |  |  |  |  | Remember, C is the number of clusters the data is partitioned into. | 
| 2265 |  |  |  |  |  |  |  | 
| 2266 |  |  |  |  |  |  | =item B | 
| 2267 |  |  |  |  |  |  |  | 
| 2268 |  |  |  |  |  |  | $clusterer->show_QoC_values(); | 
| 2269 |  |  |  |  |  |  |  | 
| 2270 |  |  |  |  |  |  | presents a table with C values in the left column and the corresponding QoC | 
| 2271 |  |  |  |  |  |  | (Quality-of-Clustering) values in the right column.  Note that this call makes sense | 
| 2272 |  |  |  |  |  |  | only if you either supply the C option to the constructor, or if you specify | 
| 2273 |  |  |  |  |  |  | values for the C and C options. | 
| 2274 |  |  |  |  |  |  |  | 
| 2275 |  |  |  |  |  |  | =item B | 
| 2276 |  |  |  |  |  |  |  | 
| 2277 |  |  |  |  |  |  | $clusterer->visualize_clusters( $visualization_mask ) | 
| 2278 |  |  |  |  |  |  |  | 
| 2279 |  |  |  |  |  |  | The visualization mask here does not have to be identical to the one used for | 
| 2280 |  |  |  |  |  |  | clustering, but must be a subset of that mask.  This is convenient for visualizing | 
| 2281 |  |  |  |  |  |  | the clusters in two- or three-dimensional subspaces of the original space. | 
| 2282 |  |  |  |  |  |  |  | 
| 2283 |  |  |  |  |  |  | =item B | 
| 2284 |  |  |  |  |  |  |  | 
| 2285 |  |  |  |  |  |  | $clusterer->visualize_data($visualization_mask, 'original'); | 
| 2286 |  |  |  |  |  |  |  | 
| 2287 |  |  |  |  |  |  | $clusterer->visualize_data($visualization_mask, 'normed'); | 
| 2288 |  |  |  |  |  |  |  | 
| 2289 |  |  |  |  |  |  | This method requires a second argument and, as shown, it must be either the string | 
| 2290 |  |  |  |  |  |  | C or the string C, the former for the visualization of the raw data | 
| 2291 |  |  |  |  |  |  | and the latter for the visualization of the data after its different dimensions are | 
| 2292 |  |  |  |  |  |  | normalized by the standard-deviations along those directions.  If you call the method | 
| 2293 |  |  |  |  |  |  | with the second argument set to C, but do so without turning on the | 
| 2294 |  |  |  |  |  |  | C option in the KMeans constructor, it will let you know. | 
| 2295 |  |  |  |  |  |  |  | 
| 2296 |  |  |  |  |  |  |  | 
| 2297 |  |  |  |  |  |  | =item  B | 
| 2298 |  |  |  |  |  |  |  | 
| 2299 |  |  |  |  |  |  | If you wish to determine the cluster membership of a new data sample after you have | 
| 2300 |  |  |  |  |  |  | created the clusters with the existing data samples, you would need to call this | 
| 2301 |  |  |  |  |  |  | method. The C script in the C directory | 
| 2302 |  |  |  |  |  |  | shows how to use this method. | 
| 2303 |  |  |  |  |  |  |  | 
| 2304 |  |  |  |  |  |  |  | 
| 2305 |  |  |  |  |  |  | =item  B | 
| 2306 |  |  |  |  |  |  |  | 
| 2307 |  |  |  |  |  |  | This does the same thing as the previous method, except that it determines the | 
| 2308 |  |  |  |  |  |  | cluster membership using the Mahalanobis distance metric.  As for the previous | 
| 2309 |  |  |  |  |  |  | method, see the C script in the C directory | 
| 2310 |  |  |  |  |  |  | for how to use this method. | 
| 2311 |  |  |  |  |  |  |  | 
| 2312 |  |  |  |  |  |  |  | 
| 2313 |  |  |  |  |  |  | =item  B | 
| 2314 |  |  |  |  |  |  |  | 
| 2315 |  |  |  |  |  |  | Algorithm::KMeans->cluster_data_generator( | 
| 2316 |  |  |  |  |  |  | input_parameter_file => $parameter_file, | 
| 2317 |  |  |  |  |  |  | output_datafile => $out_datafile, | 
| 2318 |  |  |  |  |  |  | number_data_points_per_cluster => 20 ); | 
| 2319 |  |  |  |  |  |  |  | 
| 2320 |  |  |  |  |  |  | for generating multivariate data for clustering if you wish to play with synthetic | 
| 2321 |  |  |  |  |  |  | data for clustering.  The input parameter file contains the means and the variances | 
| 2322 |  |  |  |  |  |  | for the different Gaussians you wish to use for the synthetic data.  See the file | 
| 2323 |  |  |  |  |  |  | C provided in the examples directory.  It will be easiest for you to just | 
| 2324 |  |  |  |  |  |  | edit this file for your data generation needs.  In addition to the format of the | 
| 2325 |  |  |  |  |  |  | parameter file, the main constraint you need to observe in specifying the parameters | 
| 2326 |  |  |  |  |  |  | is that the dimensionality of the covariance matrix must correspond to the | 
| 2327 |  |  |  |  |  |  | dimensionality of the mean vectors.  The multivariate random numbers are generated by | 
| 2328 |  |  |  |  |  |  | calling the C module.  As you would expect, this module requires that | 
| 2329 |  |  |  |  |  |  | the covariance matrices you specify in your parameter file be symmetric and positive | 
| 2330 |  |  |  |  |  |  | definite.  Should the covariances in your parameter file not obey this condition, the | 
| 2331 |  |  |  |  |  |  | C module will let you know. | 
| 2332 |  |  |  |  |  |  |  | 
| 2333 |  |  |  |  |  |  | =back | 
| 2334 |  |  |  |  |  |  |  | 
| 2335 |  |  |  |  |  |  | =head1 HOW THE CLUSTERS ARE OUTPUT | 
| 2336 |  |  |  |  |  |  |  | 
| 2337 |  |  |  |  |  |  | When the option C is set in the call to the constructor, the | 
| 2338 |  |  |  |  |  |  | clusters are displayed on the terminal screen. | 
| 2339 |  |  |  |  |  |  |  | 
| 2340 |  |  |  |  |  |  | When the option C is set in the call to the constructor, the | 
| 2341 |  |  |  |  |  |  | module dumps the clusters in files named | 
| 2342 |  |  |  |  |  |  |  | 
| 2343 |  |  |  |  |  |  | cluster0.dat | 
| 2344 |  |  |  |  |  |  | cluster1.dat | 
| 2345 |  |  |  |  |  |  | cluster2.dat | 
| 2346 |  |  |  |  |  |  | ... | 
| 2347 |  |  |  |  |  |  | ... | 
| 2348 |  |  |  |  |  |  |  | 
| 2349 |  |  |  |  |  |  | in the directory in which you execute the module.  The number of such files will | 
| 2350 |  |  |  |  |  |  | equal the number of clusters formed.  All such existing files in the directory are | 
| 2351 |  |  |  |  |  |  | destroyed before any fresh ones are created.  Each cluster file contains the symbolic | 
| 2352 |  |  |  |  |  |  | ID tags of the data samples in that cluster. | 
| 2353 |  |  |  |  |  |  |  | 
| 2354 |  |  |  |  |  |  | The module also leaves in your directory a printable `.png' file that is a point plot | 
| 2355 |  |  |  |  |  |  | of the different clusters. The name of this file is always C. | 
| 2356 |  |  |  |  |  |  |  | 
| 2357 |  |  |  |  |  |  | Also, as mentioned previously, a call to C in your own code will return the | 
| 2358 |  |  |  |  |  |  | clusters and the cluster centers. | 
| 2359 |  |  |  |  |  |  |  | 
| 2360 |  |  |  |  |  |  | =head1 REQUIRED | 
| 2361 |  |  |  |  |  |  |  | 
| 2362 |  |  |  |  |  |  | This module requires the following three modules: | 
| 2363 |  |  |  |  |  |  |  | 
| 2364 |  |  |  |  |  |  | Math::Random | 
| 2365 |  |  |  |  |  |  | Graphics::GnuplotIF | 
| 2366 |  |  |  |  |  |  | Math::GSL | 
| 2367 |  |  |  |  |  |  |  | 
| 2368 |  |  |  |  |  |  | With regard to the third item above, what is actually required is the | 
| 2369 |  |  |  |  |  |  | C module.  However, that module is a part of the C | 
| 2370 |  |  |  |  |  |  | distribution. The acronym GSL stands for the GNU Scientific Library.  C is | 
| 2371 |  |  |  |  |  |  | a Perl interface to the GSL C-based library. | 
| 2372 |  |  |  |  |  |  |  | 
| 2373 |  |  |  |  |  |  |  | 
| 2374 |  |  |  |  |  |  | =head1 THE C DIRECTORY | 
| 2375 |  |  |  |  |  |  |  | 
| 2376 |  |  |  |  |  |  | The C directory contains several scripts to help you become familiar with | 
| 2377 |  |  |  |  |  |  | this module.  The following script is an example of how the module can be expected to | 
| 2378 |  |  |  |  |  |  | be used most of the time. It calls for clustering to be carried out with a fixed | 
| 2379 |  |  |  |  |  |  | C: | 
| 2380 |  |  |  |  |  |  |  | 
| 2381 |  |  |  |  |  |  | cluster_and_visualize.pl | 
| 2382 |  |  |  |  |  |  |  | 
| 2383 |  |  |  |  |  |  | The more time you spend with this script, the more comfortable you will become with | 
| 2384 |  |  |  |  |  |  | the use of this module. The script file contains a large comment block that mentions | 
| 2385 |  |  |  |  |  |  | six locations in the script where you have to make decisions about how to use the | 
| 2386 |  |  |  |  |  |  | module. | 
| 2387 |  |  |  |  |  |  |  | 
| 2388 |  |  |  |  |  |  | See the following script if you do not know what value to use for C for clustering | 
| 2389 |  |  |  |  |  |  | your data: | 
| 2390 |  |  |  |  |  |  |  | 
| 2391 |  |  |  |  |  |  | find_best_K_and_cluster.pl | 
| 2392 |  |  |  |  |  |  |  | 
| 2393 |  |  |  |  |  |  | This script uses the C option in the constructor that causes the module to | 
| 2394 |  |  |  |  |  |  | search for the best C for your data.  Since this search is virtually unbounded --- | 
| 2395 |  |  |  |  |  |  | limited only by the number of samples in your data file --- the script may take a | 
| 2396 |  |  |  |  |  |  | long time to run for a large data file.  Hence the next script. | 
| 2397 |  |  |  |  |  |  |  | 
| 2398 |  |  |  |  |  |  | If your datafile is too large, you may need to range limit the values of C that | 
| 2399 |  |  |  |  |  |  | are searched through, as in the following script: | 
| 2400 |  |  |  |  |  |  |  | 
| 2401 |  |  |  |  |  |  | find_best_K_in_range_and_cluster.pl | 
| 2402 |  |  |  |  |  |  |  | 
| 2403 |  |  |  |  |  |  | If you also want to include data normalization (it may reduce the performance of the | 
| 2404 |  |  |  |  |  |  | clusterer in some cases), see the following script: | 
| 2405 |  |  |  |  |  |  |  | 
| 2406 |  |  |  |  |  |  | cluster_after_data_normalization.pl | 
| 2407 |  |  |  |  |  |  |  | 
| 2408 |  |  |  |  |  |  | When you include the data normalization step and you would like to visualize the data | 
| 2409 |  |  |  |  |  |  | before and after normalization, see the following script: | 
| 2410 |  |  |  |  |  |  |  | 
| 2411 |  |  |  |  |  |  | cluster_and_visualize_with_data_visualization.pl* | 
| 2412 |  |  |  |  |  |  |  | 
| 2413 |  |  |  |  |  |  | After you are done clustering, let's say you want to find the cluster membership of a | 
| 2414 |  |  |  |  |  |  | new data sample. To see how you can do that, see the script: | 
| 2415 |  |  |  |  |  |  |  | 
| 2416 |  |  |  |  |  |  | which_cluster_for_new_data.pl | 
| 2417 |  |  |  |  |  |  |  | 
| 2418 |  |  |  |  |  |  | This script returns two answers for which cluster a new data sample belongs to: one | 
| 2419 |  |  |  |  |  |  | using the Euclidean metric to calculate the distances between the new data sample and | 
| 2420 |  |  |  |  |  |  | the cluster centers, and the other using the Mahalanobis metric.  If the clusters are | 
| 2421 |  |  |  |  |  |  | strongly elliptical in shape, you are likely to get better results with the | 
| 2422 |  |  |  |  |  |  | Mahalanobis metric.  (To see that you can get two different answers using the two | 
| 2423 |  |  |  |  |  |  | different distance metrics, run the C<which_cluster_for_new_data.pl> script on the | 
| 2424 |  |  |  |  |  |  | data in the file C<mydatafile1.dat>.  To make this run, note that you have to comment | 
| 2425 |  |  |  |  |  |  | out and uncomment the lines at four different locations in the script.) | 
| 2426 |  |  |  |  |  |  |  | 
| 2427 |  |  |  |  |  |  | The C<examples> directory also contains the following support scripts: | 
| 2428 |  |  |  |  |  |  |  | 
| 2429 |  |  |  |  |  |  | For generating the data for experiments with clustering: | 
| 2430 |  |  |  |  |  |  |  | 
| 2431 |  |  |  |  |  |  | data_generator.pl | 
| 2432 |  |  |  |  |  |  |  | 
| 2433 |  |  |  |  |  |  | For cleaning up the examples directory: | 
| 2434 |  |  |  |  |  |  |  | 
| 2435 |  |  |  |  |  |  | cleanup_directory.pl | 
| 2436 |  |  |  |  |  |  |  | 
| 2437 |  |  |  |  |  |  | The examples directory also includes a parameter file, C<param.txt>, for generating | 
| 2438 |  |  |  |  |  |  | synthetic data for clustering.  Just edit this file if you would like to generate | 
| 2439 |  |  |  |  |  |  | your own multivariate data for clustering.  The parameter file is for the 3D case, | 
| 2440 |  |  |  |  |  |  | but you can generate data with any dimensionality through appropriate entries in the | 
| 2441 |  |  |  |  |  |  | parameter file. | 
| 2442 |  |  |  |  |  |  |  | 
| 2443 |  |  |  |  |  |  | =head1 EXPORT | 
| 2444 |  |  |  |  |  |  |  | 
| 2445 |  |  |  |  |  |  | None by design. | 
| 2446 |  |  |  |  |  |  |  | 
| 2447 |  |  |  |  |  |  | =head1 CAVEATS | 
| 2448 |  |  |  |  |  |  |  | 
| 2449 |  |  |  |  |  |  | K-Means based clustering usually does not work well when the clusters are strongly | 
| 2450 |  |  |  |  |  |  | overlapping and when the extent of variability along the different dimensions is | 
| 2451 |  |  |  |  |  |  | different for the different clusters.  The module does give you the ability to | 
| 2452 |  |  |  |  |  |  | normalize the variability in your data with the constructor option | 
| 2453 |  |  |  |  |  |  | C<do_variance_normalization>.  However, as described elsewhere, this may actually | 
| 2454 |  |  |  |  |  |  | reduce the performance of the clusterer if the data variability along a direction is | 
| 2455 |  |  |  |  |  |  | more a result of the separation between the means than because of intra-cluster | 
| 2456 |  |  |  |  |  |  | variability.  For better clustering with difficult-to-cluster data, you could try | 
| 2457 |  |  |  |  |  |  | using the author's C<Algorithm::ExpectationMaximization> module. | 
| 2458 |  |  |  |  |  |  |  | 
| 2459 |  |  |  |  |  |  | =head1 BUGS | 
| 2460 |  |  |  |  |  |  |  | 
| 2461 |  |  |  |  |  |  | Please notify the author if you encounter any bugs.  When sending email, please place | 
| 2462 |  |  |  |  |  |  | the string 'KMeans' in the subject line. | 
| 2463 |  |  |  |  |  |  |  | 
| 2464 |  |  |  |  |  |  | =head1 INSTALLATION | 
| 2465 |  |  |  |  |  |  |  | 
| 2466 |  |  |  |  |  |  | Download the archive from CPAN in any directory of your choice.  Unpack the archive | 
| 2467 |  |  |  |  |  |  | with a command that on a Linux machine would look like: | 
| 2468 |  |  |  |  |  |  |  | 
| 2469 |  |  |  |  |  |  | tar zxvf Algorithm-KMeans-2.03.tar.gz | 
| 2470 |  |  |  |  |  |  |  | 
| 2471 |  |  |  |  |  |  | This will create an installation directory for you whose name will be | 
| 2472 |  |  |  |  |  |  | C<Algorithm-KMeans-2.03>.  Enter this directory and execute the following commands | 
| 2473 |  |  |  |  |  |  | for a standard install of the module if you have root privileges: | 
| 2474 |  |  |  |  |  |  |  | 
| 2475 |  |  |  |  |  |  | perl Makefile.PL | 
| 2476 |  |  |  |  |  |  | make | 
| 2477 |  |  |  |  |  |  | make test | 
| 2478 |  |  |  |  |  |  | sudo make install | 
| 2479 |  |  |  |  |  |  |  | 
| 2480 |  |  |  |  |  |  | If you do not have root privileges, you can carry out a non-standard install the | 
| 2481 |  |  |  |  |  |  | module in any directory of your choice by: | 
| 2482 |  |  |  |  |  |  |  | 
| 2483 |  |  |  |  |  |  | perl Makefile.PL prefix=/some/other/directory/ | 
| 2484 |  |  |  |  |  |  | make | 
| 2485 |  |  |  |  |  |  | make test | 
| 2486 |  |  |  |  |  |  | make install | 
| 2487 |  |  |  |  |  |  |  | 
| 2488 |  |  |  |  |  |  | With a non-standard install, you may also have to set your PERL5LIB environment | 
| 2489 |  |  |  |  |  |  | variable so that this module can find the required other modules. How you do that | 
| 2490 |  |  |  |  |  |  | would depend on what platform you are working on.  In order to install this module in | 
| 2491 |  |  |  |  |  |  | a Linux machine on which I use tcsh for the shell, I set the PERL5LIB environment | 
| 2492 |  |  |  |  |  |  | variable by | 
| 2493 |  |  |  |  |  |  |  | 
| 2494 |  |  |  |  |  |  | setenv PERL5LIB /some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/ | 
| 2495 |  |  |  |  |  |  |  | 
| 2496 |  |  |  |  |  |  | If I used bash, I'd need to declare: | 
| 2497 |  |  |  |  |  |  |  | 
| 2498 |  |  |  |  |  |  | export PERL5LIB=/some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/ | 
| 2499 |  |  |  |  |  |  |  | 
| 2500 |  |  |  |  |  |  |  | 
| 2501 |  |  |  |  |  |  | =head1 THANKS | 
| 2502 |  |  |  |  |  |  |  | 
| 2503 |  |  |  |  |  |  | I thank Slaven for pointing out that I needed to downshift the required version of Perl | 
| 2504 |  |  |  |  |  |  | for this module.  Fortunately, I had access to an old machine still running Perl | 
| 2505 |  |  |  |  |  |  | 5.10.1.  The current version, 2.02, is based on my testing the module on that machine. | 
| 2506 |  |  |  |  |  |  |  | 
| 2507 |  |  |  |  |  |  | I added two C<which_cluster_for_new_data()> methods in Version 2.0 as a result of an email from | 
| 2508 |  |  |  |  |  |  | Jerome White who expressed a need for such methods in order to determine the best | 
| 2509 |  |  |  |  |  |  | cluster for a new data record after you have successfully clustered your existing | 
| 2510 |  |  |  |  |  |  | data.  Thanks Jerome for your feedback! | 
| 2511 |  |  |  |  |  |  |  | 
| 2512 |  |  |  |  |  |  | It was an email from Nadeem Bulsara that prompted me to create Version 1.40 of this | 
| 2513 |  |  |  |  |  |  | module.  Working with Version 1.30, Nadeem noticed that occasionally the module would | 
| 2514 |  |  |  |  |  |  | produce variable clustering results on the same dataset.  I believe that this | 
| 2515 |  |  |  |  |  |  | variability was caused (at least partly) by the purely random mode that was used in | 
| 2516 |  |  |  |  |  |  | Version 1.30 for the seeding of the cluster centers.  Version 1.40 now includes a | 
| 2517 |  |  |  |  |  |  | C<smart> mode. With the new mode the clusterer uses a PCA (Principal Components | 
| 2518 |  |  |  |  |  |  | Analysis) of the data to make good guesses for the cluster centers.  However, | 
| 2519 |  |  |  |  |  |  | depending on how the data is jumbled up, it is possible that the new mode will not | 
| 2520 |  |  |  |  |  |  | produce uniformly good results in all cases.  So you can still use the old mode by | 
| 2521 |  |  |  |  |  |  | setting C<cluster_seeding> to C<random> in the constructor.  Thanks Nadeem for your | 
| 2522 |  |  |  |  |  |  | feedback! | 
| 2523 |  |  |  |  |  |  |  | 
| 2524 |  |  |  |  |  |  | Version 1.30 resulted from Martin Kalin reporting problems with a very small data | 
| 2525 |  |  |  |  |  |  | set. Thanks Martin! | 
| 2526 |  |  |  |  |  |  |  | 
| 2527 |  |  |  |  |  |  | Version 1.21 came about in response to the problems encountered by Luis Fernando | 
| 2528 |  |  |  |  |  |  | D'Haro with version 1.20.  Although the module would yield the clusters for some of | 
| 2529 |  |  |  |  |  |  | its runs, more frequently than not the module would abort with an "empty cluster" | 
| 2530 |  |  |  |  |  |  | message for his data. Luis Fernando has also suggested other improvements (such as | 
| 2531 |  |  |  |  |  |  | clustering directly from the contents of a hash) that I intend to make in future | 
| 2532 |  |  |  |  |  |  | versions of this module.  Thanks Luis Fernando. | 
| 2533 |  |  |  |  |  |  |  | 
| 2534 |  |  |  |  |  |  | Chad Aeschliman was kind enough to test out the interface of this module and to give | 
| 2535 |  |  |  |  |  |  | suggestions for its improvement.  His key slogan: "If you cannot figure out how to | 
| 2536 |  |  |  |  |  |  | use a module in under 10 minutes, it's not going to be used."  That should explain | 
| 2537 |  |  |  |  |  |  | the longish Synopsis included here. | 
| 2538 |  |  |  |  |  |  |  | 
| 2539 |  |  |  |  |  |  | =head1 AUTHOR | 
| 2540 |  |  |  |  |  |  |  | 
| 2541 |  |  |  |  |  |  | Avinash Kak, kak@purdue.edu | 
| 2542 |  |  |  |  |  |  |  | 
| 2543 |  |  |  |  |  |  | If you send email, please place the string "KMeans" in your subject line to get past | 
| 2544 |  |  |  |  |  |  | my spam filter. | 
| 2545 |  |  |  |  |  |  |  | 
| 2546 |  |  |  |  |  |  | =head1 COPYRIGHT | 
| 2547 |  |  |  |  |  |  |  | 
| 2548 |  |  |  |  |  |  | This library is free software; you can redistribute it and/or modify it under the | 
| 2549 |  |  |  |  |  |  | same terms as Perl itself. | 
| 2550 |  |  |  |  |  |  |  | 
| 2551 |  |  |  |  |  |  | Copyright 2014 Avinash Kak | 
| 2552 |  |  |  |  |  |  |  | 
| 2553 |  |  |  |  |  |  | =cut | 
| 2554 |  |  |  |  |  |  |  |