line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Algorithm::KMeans; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
#------------------------------------------------------------------------------------ |
4
|
|
|
|
|
|
|
# Copyright (c) 2014 Avinash Kak. All rights reserved. This program is free |
5
|
|
|
|
|
|
|
# software. You may modify and/or distribute it under the same terms as Perl itself. |
6
|
|
|
|
|
|
|
# This copyright notice must remain attached to the file. |
7
|
|
|
|
|
|
|
# |
8
|
|
|
|
|
|
|
# Algorithm::KMeans is a Perl module for clustering multidimensional data. |
9
|
|
|
|
|
|
|
# ----------------------------------------------------------------------------------- |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
#use 5.10.0; |
12
|
1
|
|
|
1
|
|
20289
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
44
|
|
13
|
1
|
|
|
1
|
|
6
|
use warnings; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
43
|
|
14
|
1
|
|
|
1
|
|
8
|
use Carp; |
|
1
|
|
|
|
|
6
|
|
|
1
|
|
|
|
|
109
|
|
15
|
1
|
|
|
1
|
|
7
|
use File::Basename; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
107
|
|
16
|
1
|
|
|
1
|
|
754
|
use Math::Random; |
|
1
|
|
|
|
|
13847
|
|
|
1
|
|
|
|
|
138
|
|
17
|
1
|
|
|
1
|
|
817
|
use Graphics::GnuplotIF; |
|
1
|
|
|
|
|
14462
|
|
|
1
|
|
|
|
|
78
|
|
18
|
1
|
|
|
1
|
|
1334
|
use Math::GSL::Matrix; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
our $VERSION = '2.05'; |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
# from Perl docs: |
24
|
|
|
|
|
|
|
my $_num_regex = '^[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?$'; |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
# Constructor: |
27
|
|
|
|
|
|
|
sub new { |
28
|
|
|
|
|
|
|
my ($class, %args) = @_; |
29
|
|
|
|
|
|
|
my @params = keys %args; |
30
|
|
|
|
|
|
|
croak "\nYou have used a wrong name for a keyword argument " . |
31
|
|
|
|
|
|
|
"--- perhaps a misspelling\n" |
32
|
|
|
|
|
|
|
if check_for_illegal_params(@params) == 0; |
33
|
|
|
|
|
|
|
bless { |
34
|
|
|
|
|
|
|
_datafile => $args{datafile} || croak("datafile required"), |
35
|
|
|
|
|
|
|
_mask => $args{mask} || croak("mask required"), |
36
|
|
|
|
|
|
|
_K => $args{K} || 0, |
37
|
|
|
|
|
|
|
_K_min => $args{Kmin} || 'unknown', |
38
|
|
|
|
|
|
|
_K_max => $args{Kmax} || 'unknown', |
39
|
|
|
|
|
|
|
_cluster_seeding => $args{cluster_seeding} || croak("must choose smart or random ". |
40
|
|
|
|
|
|
|
"for cluster seeding"), |
41
|
|
|
|
|
|
|
_var_normalize => $args{do_variance_normalization} || 0, |
42
|
|
|
|
|
|
|
_use_mahalanobis_metric => $args{use_mahalanobis_metric} || 0, |
43
|
|
|
|
|
|
|
_clusters_2_files => $args{write_clusters_to_files} || 0, |
44
|
|
|
|
|
|
|
_terminal_output => $args{terminal_output} || 0, |
45
|
|
|
|
|
|
|
_debug => $args{debug} || 0, |
46
|
|
|
|
|
|
|
_N => 0, |
47
|
|
|
|
|
|
|
_K_best => 'unknown', |
48
|
|
|
|
|
|
|
_original_data => {}, |
49
|
|
|
|
|
|
|
_data => {}, |
50
|
|
|
|
|
|
|
_data_id_tags => [], |
51
|
|
|
|
|
|
|
_QoC_values => {}, |
52
|
|
|
|
|
|
|
_clusters => [], |
53
|
|
|
|
|
|
|
_cluster_centers => [], |
54
|
|
|
|
|
|
|
_clusters_hash => {}, |
55
|
|
|
|
|
|
|
_cluster_centers_hash => {}, |
56
|
|
|
|
|
|
|
_cluster_covariances_hash => {}, |
57
|
|
|
|
|
|
|
_data_dimensions => 0, |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
}, $class; |
60
|
|
|
|
|
|
|
} |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
sub read_data_from_file { |
63
|
|
|
|
|
|
|
my $self = shift; |
64
|
|
|
|
|
|
|
my $filename = $self->{_datafile}; |
65
|
|
|
|
|
|
|
$self->read_data_from_file_csv() if $filename =~ /.csv$/; |
66
|
|
|
|
|
|
|
$self->read_data_from_file_dat() if $filename =~ /.dat$/; |
67
|
|
|
|
|
|
|
} |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
sub read_data_from_file_csv { |
70
|
|
|
|
|
|
|
my $self = shift; |
71
|
|
|
|
|
|
|
my $numregex = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?'; |
72
|
|
|
|
|
|
|
my $filename = $self->{_datafile} || die "you did not specify a file with the data to be clustered"; |
73
|
|
|
|
|
|
|
my $mask = $self->{_mask}; |
74
|
|
|
|
|
|
|
my @mask = split //, $mask; |
75
|
|
|
|
|
|
|
$self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask; |
76
|
|
|
|
|
|
|
print "data dimensionality: $self->{_data_dimensions} \n"if $self->{_terminal_output}; |
77
|
|
|
|
|
|
|
open FILEIN, $filename or die "Unable to open $filename: $!"; |
78
|
|
|
|
|
|
|
die("Aborted. get_training_data_csv() is only for CSV files") unless $filename =~ /\.csv$/; |
79
|
|
|
|
|
|
|
local $/ = undef; |
80
|
|
|
|
|
|
|
my @all_data = split /\s+/, ; |
81
|
|
|
|
|
|
|
my %data_hash = (); |
82
|
|
|
|
|
|
|
my @data_tags = (); |
83
|
|
|
|
|
|
|
foreach my $record (@all_data) { |
84
|
|
|
|
|
|
|
my @splits = split /,/, $record; |
85
|
|
|
|
|
|
|
die "\nYour mask size (including `N' and 1's and 0's) does not match\n" . |
86
|
|
|
|
|
|
|
"the size of at least one of the data records in the file: $!" |
87
|
|
|
|
|
|
|
unless scalar(@mask) == scalar(@splits); |
88
|
|
|
|
|
|
|
my $record_name = shift @splits; |
89
|
|
|
|
|
|
|
$data_hash{$record_name} = \@splits; |
90
|
|
|
|
|
|
|
push @data_tags, $record_name; |
91
|
|
|
|
|
|
|
} |
92
|
|
|
|
|
|
|
$self->{_original_data} = \%data_hash; |
93
|
|
|
|
|
|
|
$self->{_data_id_tags} = \@data_tags; |
94
|
|
|
|
|
|
|
$self->{_N} = scalar @data_tags; |
95
|
|
|
|
|
|
|
if ($self->{_var_normalize}) { |
96
|
|
|
|
|
|
|
$self->{_data} = variance_normalization( $self->{_original_data} ); |
97
|
|
|
|
|
|
|
} else { |
98
|
|
|
|
|
|
|
$self->{_data} = deep_copy_hash( $self->{_original_data} ); |
99
|
|
|
|
|
|
|
} |
100
|
|
|
|
|
|
|
# Need to make the following call to set the global mean and covariance: |
101
|
|
|
|
|
|
|
# my $covariance = $self->estimate_mean_and_covariance(\@data_tags); |
102
|
|
|
|
|
|
|
# Need to make the following call to set the global eigenvec eigenval sets: |
103
|
|
|
|
|
|
|
# $self->eigen_analysis_of_covariance($covariance); |
104
|
|
|
|
|
|
|
if ( defined($self->{_K}) && ($self->{_K} > 0) ) { |
105
|
|
|
|
|
|
|
carp "\n\nWARNING: YOUR K VALUE IS TOO LARGE.\n The number of data " . |
106
|
|
|
|
|
|
|
"points must satisfy the relation N > 2xK**2 where K is " . |
107
|
|
|
|
|
|
|
"the number of clusters requested for the clusters to be " . |
108
|
|
|
|
|
|
|
"meaningful $!" |
109
|
|
|
|
|
|
|
if ( $self->{_N} < (2 * $self->{_K} ** 2) ); |
110
|
|
|
|
|
|
|
print "\n\n\n"; |
111
|
|
|
|
|
|
|
} |
112
|
|
|
|
|
|
|
} |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
sub read_data_from_file_dat { |
115
|
|
|
|
|
|
|
my $self = shift; |
116
|
|
|
|
|
|
|
my $datafile = $self->{_datafile}; |
117
|
|
|
|
|
|
|
my $mask = $self->{_mask}; |
118
|
|
|
|
|
|
|
my @mask = split //, $mask; |
119
|
|
|
|
|
|
|
$self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask; |
120
|
|
|
|
|
|
|
print "data dimensionality: $self->{_data_dimensions} \n"if $self->{_terminal_output}; |
121
|
|
|
|
|
|
|
open INPUT, $datafile or die "unable to open file $datafile: $!\n"; |
122
|
|
|
|
|
|
|
chomp( my @raw_data = ); |
123
|
|
|
|
|
|
|
close INPUT; |
124
|
|
|
|
|
|
|
# Transform strings into number data |
125
|
|
|
|
|
|
|
foreach my $record (@raw_data) { |
126
|
|
|
|
|
|
|
next unless $record; |
127
|
|
|
|
|
|
|
next if $record =~ /^#/; |
128
|
|
|
|
|
|
|
my @data_fields; |
129
|
|
|
|
|
|
|
my @fields = split /\s+/, $record; |
130
|
|
|
|
|
|
|
die "\nABORTED: Mask size does not correspond to row record size\n" if $#fields != $#mask; |
131
|
|
|
|
|
|
|
my $record_id; |
132
|
|
|
|
|
|
|
foreach my $i (0..@fields-1) { |
133
|
|
|
|
|
|
|
if ($mask[$i] eq '0') { |
134
|
|
|
|
|
|
|
next; |
135
|
|
|
|
|
|
|
} elsif ($mask[$i] eq 'N') { |
136
|
|
|
|
|
|
|
$record_id = $fields[$i]; |
137
|
|
|
|
|
|
|
} elsif ($mask[$i] eq '1') { |
138
|
|
|
|
|
|
|
push @data_fields, $fields[$i]; |
139
|
|
|
|
|
|
|
} else { |
140
|
|
|
|
|
|
|
die "misformed mask for reading the data file\n"; |
141
|
|
|
|
|
|
|
} |
142
|
|
|
|
|
|
|
} |
143
|
|
|
|
|
|
|
my @nums = map {/$_num_regex/;$_} @data_fields; |
144
|
|
|
|
|
|
|
$self->{_original_data}->{ $record_id } = \@nums; |
145
|
|
|
|
|
|
|
} |
146
|
|
|
|
|
|
|
if ($self->{_var_normalize}) { |
147
|
|
|
|
|
|
|
$self->{_data} = variance_normalization( $self->{_original_data} ); |
148
|
|
|
|
|
|
|
} else { |
149
|
|
|
|
|
|
|
$self->{_data} = deep_copy_hash( $self->{_original_data} ); |
150
|
|
|
|
|
|
|
} |
151
|
|
|
|
|
|
|
my @all_data_ids = keys %{$self->{_data}}; |
152
|
|
|
|
|
|
|
$self->{_data_id_tags} = \@all_data_ids; |
153
|
|
|
|
|
|
|
$self->{_N} = scalar @all_data_ids; |
154
|
|
|
|
|
|
|
if ( defined($self->{_K}) && ($self->{_K} > 0) ) { |
155
|
|
|
|
|
|
|
carp "\n\nWARNING: YOUR K VALUE IS TOO LARGE.\n The number of data " . |
156
|
|
|
|
|
|
|
"points must satisfy the relation N > 2xK**2 where K is " . |
157
|
|
|
|
|
|
|
"the number of clusters requested for the clusters to be " . |
158
|
|
|
|
|
|
|
"meaningful $!" |
159
|
|
|
|
|
|
|
if ( $self->{_N} < (2 * $self->{_K} ** 2) ); |
160
|
|
|
|
|
|
|
print "\n\n\n"; |
161
|
|
|
|
|
|
|
} |
162
|
|
|
|
|
|
|
} |
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
sub kmeans { |
165
|
|
|
|
|
|
|
my $self = shift; |
166
|
|
|
|
|
|
|
my $K = $self->{_K}; |
167
|
|
|
|
|
|
|
if ( ($K == 0) || |
168
|
|
|
|
|
|
|
( ($self->{_K_min} ne 'unknown') && |
169
|
|
|
|
|
|
|
($self->{_K_max} ne 'unknown') ) ) { |
170
|
|
|
|
|
|
|
eval { |
171
|
|
|
|
|
|
|
$self->iterate_through_K(); |
172
|
|
|
|
|
|
|
}; |
173
|
|
|
|
|
|
|
die "in kmeans(): $@" if ($@); |
174
|
|
|
|
|
|
|
} elsif ( $K =~ /\d+/) { |
175
|
|
|
|
|
|
|
eval { |
176
|
|
|
|
|
|
|
$self->cluster_for_fixed_K_multiple_random_tries($K) |
177
|
|
|
|
|
|
|
if $self->{_cluster_seeding} eq 'random'; |
178
|
|
|
|
|
|
|
$self->cluster_for_fixed_K_single_smart_try($K) |
179
|
|
|
|
|
|
|
if $self->{_cluster_seeding} eq 'smart'; |
180
|
|
|
|
|
|
|
}; |
181
|
|
|
|
|
|
|
die "in kmeans(): $@" if ($@); |
182
|
|
|
|
|
|
|
} else { |
183
|
|
|
|
|
|
|
croak "Incorrect call syntax used. See documentation.\n"; |
184
|
|
|
|
|
|
|
} |
185
|
|
|
|
|
|
|
if ((defined $self->{_clusters}) && (defined $self->{_cluster_centers})){ |
186
|
|
|
|
|
|
|
foreach my $i (0..@{$self->{_clusters}}-1) { |
187
|
|
|
|
|
|
|
$self->{_clusters_hash}->{"cluster$i"} = $self->{_clusters}->[$i]; |
188
|
|
|
|
|
|
|
$self->{_cluster_centers_hash}->{"cluster$i"} = $self->{_cluster_centers}->[$i]; |
189
|
|
|
|
|
|
|
$self->{_cluster_covariances_hash}->{"cluster$i"} = |
190
|
|
|
|
|
|
|
$self->estimate_cluster_covariance($self->{_clusters}->[$i]); |
191
|
|
|
|
|
|
|
} |
192
|
|
|
|
|
|
|
$self->write_clusters_to_files( $self->{_clusters} ) if $self->{_clusters_2_files}; |
193
|
|
|
|
|
|
|
return ($self->{_clusters_hash}, $self->{_cluster_centers_hash}); |
194
|
|
|
|
|
|
|
} else { |
195
|
|
|
|
|
|
|
croak "Clustering failed. Try again."; |
196
|
|
|
|
|
|
|
} |
197
|
|
|
|
|
|
|
} |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
# This is the subroutine to call if you prefer purely random choices for the initial |
200
|
|
|
|
|
|
|
# seeding of the cluster centers. |
201
|
|
|
|
|
|
|
sub cluster_for_fixed_K_multiple_random_tries { |
202
|
|
|
|
|
|
|
my $self = shift; |
203
|
|
|
|
|
|
|
my $K = shift; |
204
|
|
|
|
|
|
|
my @all_data_ids = @{$self->{_data_id_tags}}; |
205
|
|
|
|
|
|
|
my $QoC; |
206
|
|
|
|
|
|
|
my @QoC_values; |
207
|
|
|
|
|
|
|
my @array_of_clusters; |
208
|
|
|
|
|
|
|
my @array_of_cluster_centers; |
209
|
|
|
|
|
|
|
my $clusters; |
210
|
|
|
|
|
|
|
my $new_clusters; |
211
|
|
|
|
|
|
|
my $cluster_centers; |
212
|
|
|
|
|
|
|
my $new_cluster_centers; |
213
|
|
|
|
|
|
|
print "Clustering for K = $K\n" if $self->{_terminal_output}; |
214
|
|
|
|
|
|
|
foreach my $trial (1..20) { |
215
|
|
|
|
|
|
|
print ". "; |
216
|
|
|
|
|
|
|
my ($new_clusters, $new_cluster_centers); |
217
|
|
|
|
|
|
|
eval { |
218
|
|
|
|
|
|
|
($new_clusters, $new_cluster_centers) = $self->cluster_for_given_K($K); |
219
|
|
|
|
|
|
|
}; |
220
|
|
|
|
|
|
|
next if $@; |
221
|
|
|
|
|
|
|
next if @$new_clusters <= 1; |
222
|
|
|
|
|
|
|
my $newQoC = $self->cluster_quality( $new_clusters, $new_cluster_centers ); |
223
|
|
|
|
|
|
|
if ( (!defined $QoC) || ($newQoC < $QoC) ) { |
224
|
|
|
|
|
|
|
$QoC = $newQoC; |
225
|
|
|
|
|
|
|
$clusters = deep_copy_AoA( $new_clusters ); |
226
|
|
|
|
|
|
|
$cluster_centers = deep_copy_AoA( $new_cluster_centers ); |
227
|
|
|
|
|
|
|
} |
228
|
|
|
|
|
|
|
} |
229
|
|
|
|
|
|
|
die "\n\nThe constructor options you have chosen do not work with the data. Try\n" . |
230
|
|
|
|
|
|
|
"turning off the Mahalanobis option if you are using it.\n" |
231
|
|
|
|
|
|
|
unless defined $clusters; |
232
|
|
|
|
|
|
|
$self->{_clusters} = $clusters; |
233
|
|
|
|
|
|
|
$self->{_cluster_centers} = $cluster_centers; |
234
|
|
|
|
|
|
|
$self->{_QoC_values}->{"$K"} = $QoC; |
235
|
|
|
|
|
|
|
if ($self->{_terminal_output}) { |
236
|
|
|
|
|
|
|
print "\nDisplaying final clusters for best K (= $K) :\n"; |
237
|
|
|
|
|
|
|
display_clusters( $clusters ); |
238
|
|
|
|
|
|
|
$self->display_cluster_centers( $clusters ); |
239
|
|
|
|
|
|
|
print "\nQoC value (the smaller, the better): $QoC\n"; |
240
|
|
|
|
|
|
|
} |
241
|
|
|
|
|
|
|
} |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
# For the smart try, we do initial cluster seeding by subjecting the data to |
244
|
|
|
|
|
|
|
# principal components analysis in order to discover the direction of maximum |
245
|
|
|
|
|
|
|
# variance in the data space. Subsequently, we try to find the K largest peaks along |
246
|
|
|
|
|
|
|
# this direction. The coordinates of these peaks serve as the seeds for the K |
247
|
|
|
|
|
|
|
# clusters. |
248
|
|
|
|
|
|
|
sub cluster_for_fixed_K_single_smart_try { |
249
|
|
|
|
|
|
|
my $self = shift; |
250
|
|
|
|
|
|
|
my $K = shift; |
251
|
|
|
|
|
|
|
my @all_data_ids = @{$self->{_data_id_tags}}; |
252
|
|
|
|
|
|
|
print "Clustering for K = $K\n" if $self->{_terminal_output}; |
253
|
|
|
|
|
|
|
my ($clusters, $cluster_centers); |
254
|
|
|
|
|
|
|
eval { |
255
|
|
|
|
|
|
|
($clusters, $cluster_centers) = $self->cluster_for_given_K($K); |
256
|
|
|
|
|
|
|
}; |
257
|
|
|
|
|
|
|
if ($@) { |
258
|
|
|
|
|
|
|
die "In cluster_for_fixed_K_single_smart_try: insufficient data for clustering " . |
259
|
|
|
|
|
|
|
"with $self->{_K} clusters --- $@"; |
260
|
|
|
|
|
|
|
} |
261
|
|
|
|
|
|
|
my $QoC = $self->cluster_quality( $clusters, $cluster_centers ); |
262
|
|
|
|
|
|
|
$self->{_clusters} = $clusters; |
263
|
|
|
|
|
|
|
$self->{_cluster_centers} = $cluster_centers; |
264
|
|
|
|
|
|
|
$self->{_QoC_values}->{"$K"} = $QoC; |
265
|
|
|
|
|
|
|
if ($self->{_terminal_output}) { |
266
|
|
|
|
|
|
|
print "\nDisplaying final clusters for best K (= $K) :\n"; |
267
|
|
|
|
|
|
|
display_clusters( $clusters ); |
268
|
|
|
|
|
|
|
$self->display_cluster_centers( $clusters ); |
269
|
|
|
|
|
|
|
print "\nQoC value (the smaller, the better): $QoC\n"; |
270
|
|
|
|
|
|
|
} |
271
|
|
|
|
|
|
|
} |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
# The following subroutine is the top-level routine to call if you want the system to |
274
|
|
|
|
|
|
|
# figure out on its own what value to use for K, the number of clusters. If you call |
275
|
|
|
|
|
|
|
# the KMeans constructor with the K=0 option, the method below will try every |
276
|
|
|
|
|
|
|
# possible value of K between 2 and the maximum possible depending on the number of |
277
|
|
|
|
|
|
|
# data points available. For example, if the number of data points is 10,000, it will |
278
|
|
|
|
|
|
|
# try all possible values of K between 2 and 70. For how the maximum value is set for |
279
|
|
|
|
|
|
|
# K, see the comments made under Description. Note also how this method makes 20 |
280
|
|
|
|
|
|
|
# different tries for each value of K as a defense against the problem of the final |
281
|
|
|
|
|
|
|
# result corresponding to some local minimum in the values of the QoC metric. Out of |
282
|
|
|
|
|
|
|
# these 20 tries for each K, it retains the clusters and the cluster centers for only |
283
|
|
|
|
|
|
|
# that try that yields the smallest value for the QoC metric. After estimating the |
284
|
|
|
|
|
|
|
# "best" QoC values for all possible K in this manner, it then finds the K for which |
285
|
|
|
|
|
|
|
# the QoC is the minimum. This is taken to be the best value for K. Finally, the |
286
|
|
|
|
|
|
|
# output clusters are written out to separate files. |
287
|
|
|
|
|
|
|
# |
288
|
|
|
|
|
|
|
# If the KMeans constructor is invoked with the (Kmin, Kmax) options, then, instead |
289
|
|
|
|
|
|
|
# of iterating through 2 and the maximum permissible value for K, the iterations are |
290
|
|
|
|
|
|
|
# carried out only between Kmin and Kmax. |
291
|
|
|
|
|
|
|
sub iterate_through_K { |
292
|
|
|
|
|
|
|
my $self = shift; |
293
|
|
|
|
|
|
|
my @all_data_ids = @{$self->{_data_id_tags}}; |
294
|
|
|
|
|
|
|
my $N = $self->{_N}; |
295
|
|
|
|
|
|
|
croak "You need more than 8 data samples. The number of data points must satisfy " . |
296
|
|
|
|
|
|
|
"the relation N > 2xK**2 where K is the number of clusters. The smallest " . |
297
|
|
|
|
|
|
|
"value for K is 2.\n" if $N <= 8; |
298
|
|
|
|
|
|
|
my $K_statistical_max = int( sqrt( $N / 2.0 ) ); |
299
|
|
|
|
|
|
|
my $Kmin = $self->{_K_min} eq 'unknown' |
300
|
|
|
|
|
|
|
? 2 |
301
|
|
|
|
|
|
|
: $self->{_K_min}; |
302
|
|
|
|
|
|
|
my $Kmax = $self->{_K_max} eq 'unknown' |
303
|
|
|
|
|
|
|
? int( sqrt( $N / 2.0 ) ) |
304
|
|
|
|
|
|
|
: $self->{_K_max}; |
305
|
|
|
|
|
|
|
croak "\n\nYour Kmax value is too high. Ideally, it should not exceed sqrt(N/2) " . |
306
|
|
|
|
|
|
|
"where N is the number of data points" if $Kmax > $K_statistical_max; |
307
|
|
|
|
|
|
|
print "Value of Kmax is: $Kmax\n" if $self->{_terminal_output}; |
308
|
|
|
|
|
|
|
my @QoC_values; |
309
|
|
|
|
|
|
|
my @array_of_clusters; |
310
|
|
|
|
|
|
|
my @array_of_cluster_centers; |
311
|
|
|
|
|
|
|
foreach my $K ($Kmin..$Kmax) { |
312
|
|
|
|
|
|
|
my $QoC; |
313
|
|
|
|
|
|
|
my $clusters; |
314
|
|
|
|
|
|
|
my $cluster_centers; |
315
|
|
|
|
|
|
|
print "Clustering for K = $K\n" if $self->{_terminal_output}; |
316
|
|
|
|
|
|
|
if ($self->{_cluster_seeding} eq 'random') { |
317
|
|
|
|
|
|
|
foreach my $trial (1..21) { |
318
|
|
|
|
|
|
|
print ". "; |
319
|
|
|
|
|
|
|
my ($new_clusters, $new_cluster_centers); |
320
|
|
|
|
|
|
|
if ($self->{_use_mahalanobis_metric}) { |
321
|
|
|
|
|
|
|
eval { |
322
|
|
|
|
|
|
|
($new_clusters, $new_cluster_centers) = $self->cluster_for_given_K($K); |
323
|
|
|
|
|
|
|
}; |
324
|
|
|
|
|
|
|
next if $@; |
325
|
|
|
|
|
|
|
} else { |
326
|
|
|
|
|
|
|
($new_clusters, $new_cluster_centers) = $self->cluster_for_given_K($K); |
327
|
|
|
|
|
|
|
} |
328
|
|
|
|
|
|
|
my $newQoC = $self->cluster_quality( $new_clusters, $new_cluster_centers ); |
329
|
|
|
|
|
|
|
if ( (!defined $QoC) || ($newQoC < $QoC) ) { |
330
|
|
|
|
|
|
|
$QoC = $newQoC; |
331
|
|
|
|
|
|
|
$clusters = deep_copy_AoA( $new_clusters ); |
332
|
|
|
|
|
|
|
$cluster_centers = deep_copy_AoA( $new_cluster_centers ); |
333
|
|
|
|
|
|
|
} |
334
|
|
|
|
|
|
|
} |
335
|
|
|
|
|
|
|
print "\n"; |
336
|
|
|
|
|
|
|
} elsif ($self->{_cluster_seeding} eq 'smart') { |
337
|
|
|
|
|
|
|
eval { |
338
|
|
|
|
|
|
|
($clusters, $cluster_centers) = $self->cluster_for_given_K($K); |
339
|
|
|
|
|
|
|
}; |
340
|
|
|
|
|
|
|
if ($@) { |
341
|
|
|
|
|
|
|
$Kmax = $K - 1; |
342
|
|
|
|
|
|
|
last; |
343
|
|
|
|
|
|
|
} |
344
|
|
|
|
|
|
|
$QoC = $self->cluster_quality($clusters,$cluster_centers); |
345
|
|
|
|
|
|
|
} else { |
346
|
|
|
|
|
|
|
die "You must either choose 'smart' for cluster_seeding or 'random'. " . |
347
|
|
|
|
|
|
|
"Fix your constructor call." |
348
|
|
|
|
|
|
|
} |
349
|
|
|
|
|
|
|
push @QoC_values, $QoC; |
350
|
|
|
|
|
|
|
push @array_of_clusters, $clusters; |
351
|
|
|
|
|
|
|
push @array_of_cluster_centers, $cluster_centers; |
352
|
|
|
|
|
|
|
} |
353
|
|
|
|
|
|
|
my ($min, $max) = minmax( \@QoC_values ); |
354
|
|
|
|
|
|
|
my $K_best_relative_to_Kmin = get_index_at_value($min, \@QoC_values ); |
355
|
|
|
|
|
|
|
my $K_best = $K_best_relative_to_Kmin + $Kmin; |
356
|
|
|
|
|
|
|
if ($self->{_terminal_output}) { |
357
|
|
|
|
|
|
|
print "\nDisplaying final clusters for best K (= $K_best) :\n"; |
358
|
|
|
|
|
|
|
display_clusters( $array_of_clusters[$K_best_relative_to_Kmin] ); |
359
|
|
|
|
|
|
|
$self->display_cluster_centers($array_of_clusters[$K_best_relative_to_Kmin]); |
360
|
|
|
|
|
|
|
print "\nBest clustering achieved for K=$K_best with QoC = $min\n" if defined $min; |
361
|
|
|
|
|
|
|
my @printableQoC = grep {$_} @QoC_values; |
362
|
|
|
|
|
|
|
print "\nQoC values array (the smaller the value, the better it is) for different " . |
363
|
|
|
|
|
|
|
"K starting with K=$Kmin: @printableQoC\n"; |
364
|
|
|
|
|
|
|
} |
365
|
|
|
|
|
|
|
$self->{_K_best} = $K_best; |
366
|
|
|
|
|
|
|
foreach my $i (0..@QoC_values-1) { |
367
|
|
|
|
|
|
|
my $k = $i + $Kmin; |
368
|
|
|
|
|
|
|
$self->{_QoC_values}->{"$k"} = $QoC_values[$i]; |
369
|
|
|
|
|
|
|
} |
370
|
|
|
|
|
|
|
$self->{_clusters} = $array_of_clusters[$K_best_relative_to_Kmin]; |
371
|
|
|
|
|
|
|
$self->{_cluster_centers} = |
372
|
|
|
|
|
|
|
$array_of_cluster_centers[$K_best_relative_to_Kmin]; |
373
|
|
|
|
|
|
|
} |
374
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
# This is the function to call if you already know what value you want to use for K, |
376
|
|
|
|
|
|
|
# the number of expected clusters. The purpose of this function is to do the |
377
|
|
|
|
|
|
|
# initialization of the cluster centers and to carry out the initial assignment of |
378
|
|
|
|
|
|
|
# the data to the clusters with the initial cluster centers. The initialization |
379
|
|
|
|
|
|
|
# consists of 3 steps: Construct a random sequence of K integers between 0 and N-1 |
380
|
|
|
|
|
|
|
# where N is the number of data points to be clustered; 2) Call |
381
|
|
|
|
|
|
|
# get_initial_cluster_centers() to index into the data array with the random integers |
382
|
|
|
|
|
|
|
# to get a list of K data points that would serve as the initial cluster centers; and |
383
|
|
|
|
|
|
|
# (3) Call assign_data_to_clusters_initial() to assign the rest of the data to each |
384
|
|
|
|
|
|
|
# of the K clusters on the basis of the proximity to the cluster centers. |
385
|
|
|
|
|
|
|
sub cluster_for_given_K { |
386
|
|
|
|
|
|
|
my $self = shift; |
387
|
|
|
|
|
|
|
my $K = shift; |
388
|
|
|
|
|
|
|
my @all_data_ids = @{$self->{_data_id_tags}}; |
389
|
|
|
|
|
|
|
my $cluster_centers; |
390
|
|
|
|
|
|
|
if ($self->{_cluster_seeding} eq 'smart') { |
391
|
|
|
|
|
|
|
$cluster_centers = $self->get_initial_cluster_centers_smart($K); |
392
|
|
|
|
|
|
|
} elsif ($self->{_cluster_seeding} eq 'random') { |
393
|
|
|
|
|
|
|
$cluster_centers = $self->get_initial_cluster_centers($K); |
394
|
|
|
|
|
|
|
} else { |
395
|
|
|
|
|
|
|
die "You must either choose 'smart' for cluster_seeding or 'random'. " . |
396
|
|
|
|
|
|
|
"Fix your constructor call." |
397
|
|
|
|
|
|
|
} |
398
|
|
|
|
|
|
|
my $clusters; |
399
|
|
|
|
|
|
|
if ($self->{_use_mahalanobis_metric}) { |
400
|
|
|
|
|
|
|
my $clusters_and_determinants = |
401
|
|
|
|
|
|
|
$self->assign_data_to_clusters_initial_mahalanobis($cluster_centers); |
402
|
|
|
|
|
|
|
$clusters = $clusters_and_determinants->[0]; |
403
|
|
|
|
|
|
|
my @determinants = @{$clusters_and_determinants->[1]}; |
404
|
|
|
|
|
|
|
my ($min,$max) = minmax(\@determinants); |
405
|
|
|
|
|
|
|
die "In cluster_for_given_K(): min determinant of covariance matrix for at " . |
406
|
|
|
|
|
|
|
"least one cluster is too small" if $min / $max < 0.001; |
407
|
|
|
|
|
|
|
} else { |
408
|
|
|
|
|
|
|
$clusters = $self->assign_data_to_clusters_initial($cluster_centers); |
409
|
|
|
|
|
|
|
} |
410
|
|
|
|
|
|
|
my $cluster_nonexistent_flag = 0; |
411
|
|
|
|
|
|
|
foreach my $trial (0..2) { |
412
|
|
|
|
|
|
|
if ($self->{_use_mahalanobis_metric}) { |
413
|
|
|
|
|
|
|
($clusters, $cluster_centers) = $self->assign_data_to_clusters_mahalanobis($clusters, $K); |
414
|
|
|
|
|
|
|
} else { |
415
|
|
|
|
|
|
|
($clusters, $cluster_centers) = $self->assign_data_to_clusters( $clusters, $K ); |
416
|
|
|
|
|
|
|
} |
417
|
|
|
|
|
|
|
my $num_of_clusters_returned = @$clusters; |
418
|
|
|
|
|
|
|
foreach my $cluster (@$clusters) { |
419
|
|
|
|
|
|
|
$cluster_nonexistent_flag = 1 if ((!defined $cluster) || (@$cluster == 0)); |
420
|
|
|
|
|
|
|
} |
421
|
|
|
|
|
|
|
last unless $cluster_nonexistent_flag; |
422
|
|
|
|
|
|
|
} |
423
|
|
|
|
|
|
|
return ($clusters, $cluster_centers); |
424
|
|
|
|
|
|
|
} |
425
|
|
|
|
|
|
|
|
426
|
|
|
|
|
|
|
# This function is used when you set the "cluster_seeding" option to 'random' in the |
427
|
|
|
|
|
|
|
# constructor. Returns a set of K random integers. These serve as indices to reach |
428
|
|
|
|
|
|
|
# into the data array. A data element whose index is one of the random numbers |
429
|
|
|
|
|
|
|
# returned by this routine serves as an initial cluster center. Note the quality |
430
|
|
|
|
|
|
|
# check it runs on the list of K random integers constructed. We first make sure |
431
|
|
|
|
|
|
|
# that all K random integers are different. Subsequently, we carry out a quality |
432
|
|
|
|
|
|
|
# assessment of the K random integers constructed. This quality measure consists of |
433
|
|
|
|
|
|
|
# the ratio of the values spanned by the random integers to the value of N, the total |
434
|
|
|
|
|
|
|
# number of data points to be clustered. Currently, if this ratio is less than 0.3, |
435
|
|
|
|
|
|
|
# we discard the K integers and try again. |
436
|
|
|
|
|
|
|
sub initialize_cluster_centers { |
437
|
|
|
|
|
|
|
my $self = shift; |
438
|
|
|
|
|
|
|
my $K = shift; |
439
|
|
|
|
|
|
|
my $data_store_size = $self->{_N}; |
440
|
|
|
|
|
|
|
my @cluster_center_indices; |
441
|
|
|
|
|
|
|
while (1) { |
442
|
|
|
|
|
|
|
foreach my $i (0..$K-1) { |
443
|
|
|
|
|
|
|
$cluster_center_indices[$i] = int rand( $data_store_size ); |
444
|
|
|
|
|
|
|
next if $i == 0; |
445
|
|
|
|
|
|
|
foreach my $j (0..$i-1) { |
446
|
|
|
|
|
|
|
while ( $cluster_center_indices[$j] == |
447
|
|
|
|
|
|
|
$cluster_center_indices[$i] ) { |
448
|
|
|
|
|
|
|
my $old = $cluster_center_indices[$i]; |
449
|
|
|
|
|
|
|
$cluster_center_indices[$i] = int rand($data_store_size); |
450
|
|
|
|
|
|
|
} |
451
|
|
|
|
|
|
|
} |
452
|
|
|
|
|
|
|
} |
453
|
|
|
|
|
|
|
my ($min,$max) = minmax(\@cluster_center_indices ); |
454
|
|
|
|
|
|
|
my $quality = ($max - $min) / $data_store_size; |
455
|
|
|
|
|
|
|
last if $quality > 0.3; |
456
|
|
|
|
|
|
|
} |
457
|
|
|
|
|
|
|
return @cluster_center_indices; |
458
|
|
|
|
|
|
|
} |
459
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
# This function is used when you set the "cluster_seeding" option to 'random' in the |
461
|
|
|
|
|
|
|
# constructor. This routine merely reaches into the data array with the random |
462
|
|
|
|
|
|
|
# integers, as constructed by the previous routine, serving as indices and fetching |
463
|
|
|
|
|
|
|
# values corresponding to those indices. The fetched data samples serve as the |
464
|
|
|
|
|
|
|
# initial cluster centers. |
465
|
|
|
|
|
|
|
sub get_initial_cluster_centers { |
466
|
|
|
|
|
|
|
my $self = shift; |
467
|
|
|
|
|
|
|
my $K = shift; |
468
|
|
|
|
|
|
|
my @cluster_center_indices = $self->initialize_cluster_centers($K); |
469
|
|
|
|
|
|
|
my @result; |
470
|
|
|
|
|
|
|
foreach my $i (@cluster_center_indices) { |
471
|
|
|
|
|
|
|
my $tag = $self->{_data_id_tags}[$i]; |
472
|
|
|
|
|
|
|
push @result, $self->{_data}->{$tag}; |
473
|
|
|
|
|
|
|
} |
474
|
|
|
|
|
|
|
return \@result; |
475
|
|
|
|
|
|
|
} |
476
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
# This method is invoked when you choose the 'smart' option for the "cluster_seeding" |
478
|
|
|
|
|
|
|
# option in the constructor. It subjects the data to a principal components analysis |
479
|
|
|
|
|
|
|
# to figure out the direction of maximal variance. Subsequently, it tries to locate |
480
|
|
|
|
|
|
|
# K peaks in a smoothed histogram of the data points projected onto the maximal |
481
|
|
|
|
|
|
|
# variance direction. |
482
|
|
|
|
|
|
|
sub get_initial_cluster_centers_smart { |
483
|
|
|
|
|
|
|
my $self = shift; |
484
|
|
|
|
|
|
|
my $K = shift; |
485
|
|
|
|
|
|
|
if ($self->{_data_dimensions} == 1) { |
486
|
|
|
|
|
|
|
my @one_d_data; |
487
|
|
|
|
|
|
|
foreach my $j (0..$self->{_N}-1) { |
488
|
|
|
|
|
|
|
my $tag = $self->{_data_id_tags}[$j]; |
489
|
|
|
|
|
|
|
push @one_d_data, $self->{_data}->{$tag}->[0]; |
490
|
|
|
|
|
|
|
} |
491
|
|
|
|
|
|
|
my @peak_points = find_peak_points_in_given_direction(\@one_d_data,$K); |
492
|
|
|
|
|
|
|
print "highest points at data values: @peak_points\n" if $self->{_debug}; |
493
|
|
|
|
|
|
|
my @cluster_centers; |
494
|
|
|
|
|
|
|
foreach my $peakpoint (@peak_points) { |
495
|
|
|
|
|
|
|
push @cluster_centers, [$peakpoint]; |
496
|
|
|
|
|
|
|
} |
497
|
|
|
|
|
|
|
return \@cluster_centers; |
498
|
|
|
|
|
|
|
} |
499
|
|
|
|
|
|
|
my ($num_rows,$num_cols) = ($self->{_data_dimensions},$self->{_N}); |
500
|
|
|
|
|
|
|
my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols); |
501
|
|
|
|
|
|
|
my $mean_vec = Math::GSL::Matrix->new($num_rows,1); |
502
|
|
|
|
|
|
|
# All the record labels are stored in the array $self->{_data_id_tags}. The |
503
|
|
|
|
|
|
|
# actual data for clustering is stored in a hash at $self->{_data} whose keys are |
504
|
|
|
|
|
|
|
# the record labels; the value associated with each key is the array holding the |
505
|
|
|
|
|
|
|
# corresponding numerical multidimensional data. |
506
|
|
|
|
|
|
|
foreach my $j (0..$num_cols-1) { |
507
|
|
|
|
|
|
|
my $tag = $self->{_data_id_tags}[$j]; |
508
|
|
|
|
|
|
|
my $data = $self->{_data}->{$tag}; |
509
|
|
|
|
|
|
|
$matrix->set_col($j, $data); |
510
|
|
|
|
|
|
|
} |
511
|
|
|
|
|
|
|
if ($self->{_debug}) { |
512
|
|
|
|
|
|
|
print "\nDisplaying the original data as a matrix:"; |
513
|
|
|
|
|
|
|
display_matrix( $matrix ); |
514
|
|
|
|
|
|
|
} |
515
|
|
|
|
|
|
|
foreach my $j (0..$num_cols-1) { |
516
|
|
|
|
|
|
|
$mean_vec += $matrix->col($j); |
517
|
|
|
|
|
|
|
} |
518
|
|
|
|
|
|
|
$mean_vec *= 1.0 / $num_cols; |
519
|
|
|
|
|
|
|
if ($self->{_debug}) { |
520
|
|
|
|
|
|
|
print "Displaying the mean vector for the data:"; |
521
|
|
|
|
|
|
|
display_matrix( $mean_vec ); |
522
|
|
|
|
|
|
|
} |
523
|
|
|
|
|
|
|
foreach my $j (0..$num_cols-1) { |
524
|
|
|
|
|
|
|
my @new_col = ($matrix->col($j) - $mean_vec)->as_list; |
525
|
|
|
|
|
|
|
$matrix->set_col($j, \@new_col); |
526
|
|
|
|
|
|
|
} |
527
|
|
|
|
|
|
|
if ($self->{_debug}) { |
528
|
|
|
|
|
|
|
print "Displaying mean subtracted data as a matrix:"; |
529
|
|
|
|
|
|
|
display_matrix( $matrix ); |
530
|
|
|
|
|
|
|
} |
531
|
|
|
|
|
|
|
my $transposed = transpose( $matrix ); |
532
|
|
|
|
|
|
|
if ($self->{_debug}) { |
533
|
|
|
|
|
|
|
print "Displaying transposed data matrix:"; |
534
|
|
|
|
|
|
|
display_matrix( $transposed ); |
535
|
|
|
|
|
|
|
} |
536
|
|
|
|
|
|
|
my $covariance = matrix_multiply( $matrix, $transposed ); |
537
|
|
|
|
|
|
|
$covariance *= 1.0 / $num_cols; |
538
|
|
|
|
|
|
|
if ($self->{_debug}) { |
539
|
|
|
|
|
|
|
print "\nDisplaying the Covariance Matrix for your data:"; |
540
|
|
|
|
|
|
|
display_matrix( $covariance ); |
541
|
|
|
|
|
|
|
} |
542
|
|
|
|
|
|
|
my ($eigenvalues, $eigenvectors) = $covariance->eigenpair; |
543
|
|
|
|
|
|
|
my $num_of_eigens = @$eigenvalues; |
544
|
|
|
|
|
|
|
my $largest_eigen_index = 0; |
545
|
|
|
|
|
|
|
my $smallest_eigen_index = 0; |
546
|
|
|
|
|
|
|
print "Eigenvalue 0: $eigenvalues->[0]\n" if $self->{_debug}; |
547
|
|
|
|
|
|
|
foreach my $i (1..$num_of_eigens-1) { |
548
|
|
|
|
|
|
|
$largest_eigen_index = $i if $eigenvalues->[$i] > $eigenvalues->[$largest_eigen_index]; |
549
|
|
|
|
|
|
|
$smallest_eigen_index = $i if $eigenvalues->[$i] < $eigenvalues->[$smallest_eigen_index]; |
550
|
|
|
|
|
|
|
print "Eigenvalue $i: $eigenvalues->[$i]\n" if $self->{_debug}; |
551
|
|
|
|
|
|
|
} |
552
|
|
|
|
|
|
|
print "\nlargest eigen index: $largest_eigen_index\n" if $self->{_debug}; |
553
|
|
|
|
|
|
|
print "\nsmallest eigen index: $smallest_eigen_index\n\n" if $self->{_debug}; |
554
|
|
|
|
|
|
|
foreach my $i (0..$num_of_eigens-1) { |
555
|
|
|
|
|
|
|
my @vec = $eigenvectors->[$i]->as_list; |
556
|
|
|
|
|
|
|
print "Eigenvector $i: @vec\n" if $self->{_debug}; |
557
|
|
|
|
|
|
|
} |
558
|
|
|
|
|
|
|
my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list; |
559
|
|
|
|
|
|
|
print "\nLargest eigenvector: @largest_eigen_vec\n" if $self->{_debug}; |
560
|
|
|
|
|
|
|
my @max_var_direction; |
561
|
|
|
|
|
|
|
# Each element of the array @largest_eigen_vec is a Math::Complex object |
562
|
|
|
|
|
|
|
foreach my $k (0..@largest_eigen_vec-1) { |
563
|
|
|
|
|
|
|
my ($mag, $theta) = $largest_eigen_vec[$k] =~ /\[(\d*\.\d+),(\S+)\]/; |
564
|
|
|
|
|
|
|
if ($theta eq '0') { |
565
|
|
|
|
|
|
|
$max_var_direction[$k] = $mag; |
566
|
|
|
|
|
|
|
} elsif ($theta eq 'pi') { |
567
|
|
|
|
|
|
|
$max_var_direction[$k] = -1.0 * $mag; |
568
|
|
|
|
|
|
|
} else { |
569
|
|
|
|
|
|
|
die "eigendecomposition of covariance matrix produced a complex " . |
570
|
|
|
|
|
|
|
"eigenvector --- something is wrong"; |
571
|
|
|
|
|
|
|
} |
572
|
|
|
|
|
|
|
} |
573
|
|
|
|
|
|
|
print "\nMaximum Variance Direction: @max_var_direction\n\n" if $self->{_debug}; |
574
|
|
|
|
|
|
|
# We now project all data points on the largest eigenvector. |
575
|
|
|
|
|
|
|
# Each projection will yield a single point on the eigenvector. |
576
|
|
|
|
|
|
|
my @projections; |
577
|
|
|
|
|
|
|
foreach my $j (0..$self->{_N}-1) { |
578
|
|
|
|
|
|
|
my $tag = $self->{_data_id_tags}[$j]; |
579
|
|
|
|
|
|
|
my $data = $self->{_data}->{$tag}; |
580
|
|
|
|
|
|
|
die "Dimensionality of the largest eigenvector does not " |
581
|
|
|
|
|
|
|
. "match the dimensionality of the data" |
582
|
|
|
|
|
|
|
unless @max_var_direction == $self->{_data_dimensions}; |
583
|
|
|
|
|
|
|
my $projection = vector_multiply($data, \@max_var_direction); |
584
|
|
|
|
|
|
|
push @projections, $projection; |
585
|
|
|
|
|
|
|
} |
586
|
|
|
|
|
|
|
print "All projection points: @projections\n" if $self->{_debug}; |
587
|
|
|
|
|
|
|
my @peak_points = find_peak_points_in_given_direction(\@projections, $K); |
588
|
|
|
|
|
|
|
print "highest points at points along largest eigenvec: @peak_points\n" if $self->{_debug}; |
589
|
|
|
|
|
|
|
my @cluster_centers; |
590
|
|
|
|
|
|
|
foreach my $peakpoint (@peak_points) { |
591
|
|
|
|
|
|
|
my @actual_peak_coords = map {$peakpoint * $_} @max_var_direction; |
592
|
|
|
|
|
|
|
push @cluster_centers, \@actual_peak_coords; |
593
|
|
|
|
|
|
|
} |
594
|
|
|
|
|
|
|
return \@cluster_centers; |
595
|
|
|
|
|
|
|
} |
596
|
|
|
|
|
|
|
|
597
|
|
|
|
|
|
|
# This method is invoked when you choose the 'smart' option for the "cluster_seeding" |
598
|
|
|
|
|
|
|
# option in the constructor. It is called by the previous method to locate K peaks |
599
|
|
|
|
|
|
|
# in a smoothed histogram of the data points projected onto the maximal variance |
600
|
|
|
|
|
|
|
# direction. |
601
|
|
|
|
|
|
|
sub find_peak_points_in_given_direction { |
602
|
|
|
|
|
|
|
my $dataref = shift; |
603
|
|
|
|
|
|
|
my $how_many = shift; |
604
|
|
|
|
|
|
|
my @data = @$dataref; |
605
|
|
|
|
|
|
|
my ($min, $max) = minmax(\@data); |
606
|
|
|
|
|
|
|
my $num_points = @data; |
607
|
|
|
|
|
|
|
my @sorted_data = sort {$a <=> $b} @data; |
608
|
|
|
|
|
|
|
my $scale = $max - $min; |
609
|
|
|
|
|
|
|
foreach my $index (0..$#sorted_data-1) { |
610
|
|
|
|
|
|
|
$sorted_data[$index] = ($sorted_data[$index] - $min) / $scale; |
611
|
|
|
|
|
|
|
} |
612
|
|
|
|
|
|
|
my $avg_diff = 0; |
613
|
|
|
|
|
|
|
foreach my $index (0..$#sorted_data-1) { |
614
|
|
|
|
|
|
|
my $diff = $sorted_data[$index+1] - $sorted_data[$index]; |
615
|
|
|
|
|
|
|
$avg_diff += ($diff - $avg_diff) / ($index + 1); |
616
|
|
|
|
|
|
|
} |
617
|
|
|
|
|
|
|
my $delta = 1.0 / 1000.0; |
618
|
|
|
|
|
|
|
my @accumulator = (0) x 1000; |
619
|
|
|
|
|
|
|
foreach my $index (0..@sorted_data-1) { |
620
|
|
|
|
|
|
|
my $cell_index = int($sorted_data[$index] / $delta); |
621
|
|
|
|
|
|
|
my $smoothness = 40; |
622
|
|
|
|
|
|
|
for my $index ($cell_index-$smoothness..$cell_index+$smoothness) { |
623
|
|
|
|
|
|
|
next if $index < 0 || $index > 999; |
624
|
|
|
|
|
|
|
$accumulator[$index]++; |
625
|
|
|
|
|
|
|
} |
626
|
|
|
|
|
|
|
} |
627
|
|
|
|
|
|
|
my $peaks_array = non_maximum_suppression( \@accumulator ); |
628
|
|
|
|
|
|
|
my $peaks_index_hash = get_value_index_hash( $peaks_array ); |
629
|
|
|
|
|
|
|
my @K_highest_peak_locations; |
630
|
|
|
|
|
|
|
my $k = 0; |
631
|
|
|
|
|
|
|
foreach my $peak (sort {$b <=> $a} keys %$peaks_index_hash) { |
632
|
|
|
|
|
|
|
my $unscaled_peak_point = $min + $peaks_index_hash->{$peak} * $scale * $delta; |
633
|
|
|
|
|
|
|
push @K_highest_peak_locations, $unscaled_peak_point |
634
|
|
|
|
|
|
|
if $k < $how_many; |
635
|
|
|
|
|
|
|
last if ++$k == $how_many; |
636
|
|
|
|
|
|
|
} |
637
|
|
|
|
|
|
|
return @K_highest_peak_locations; |
638
|
|
|
|
|
|
|
} |
639
|
|
|
|
|
|
|
|
640
|
|
|
|
|
|
|
# The purpose of this routine is to form initial clusters by assigning the data |
641
|
|
|
|
|
|
|
# samples to the initial clusters formed by the previous routine on the basis of the |
642
|
|
|
|
|
|
|
# best proximity of the data samples to the different cluster centers. |
643
|
|
|
|
|
|
|
sub assign_data_to_clusters_initial { |
644
|
|
|
|
|
|
|
my $self = shift; |
645
|
|
|
|
|
|
|
my @cluster_centers = @{ shift @_ }; |
646
|
|
|
|
|
|
|
my @clusters; |
647
|
|
|
|
|
|
|
foreach my $ele (@{$self->{_data_id_tags}}) { |
648
|
|
|
|
|
|
|
my $best_cluster; |
649
|
|
|
|
|
|
|
my @dist_from_clust_centers; |
650
|
|
|
|
|
|
|
foreach my $center (@cluster_centers) { |
651
|
|
|
|
|
|
|
push @dist_from_clust_centers, $self->distance($ele, $center); |
652
|
|
|
|
|
|
|
} |
653
|
|
|
|
|
|
|
my ($min, $best_center_index) = minimum( \@dist_from_clust_centers ); |
654
|
|
|
|
|
|
|
push @{$clusters[$best_center_index]}, $ele; |
655
|
|
|
|
|
|
|
} |
656
|
|
|
|
|
|
|
return \@clusters; |
657
|
|
|
|
|
|
|
} |
658
|
|
|
|
|
|
|
|
659
|
|
|
|
|
|
|
sub assign_data_to_clusters_initial_mahalanobis { |
660
|
|
|
|
|
|
|
my $self = shift; |
661
|
|
|
|
|
|
|
my @cluster_centers = @{ shift @_ }; |
662
|
|
|
|
|
|
|
my @clusters; |
663
|
|
|
|
|
|
|
foreach my $ele (@{$self->{_data_id_tags}}) { |
664
|
|
|
|
|
|
|
my $best_cluster; |
665
|
|
|
|
|
|
|
my @dist_from_clust_centers; |
666
|
|
|
|
|
|
|
foreach my $center (@cluster_centers) { |
667
|
|
|
|
|
|
|
push @dist_from_clust_centers, $self->distance($ele, $center); |
668
|
|
|
|
|
|
|
} |
669
|
|
|
|
|
|
|
my ($min, $best_center_index) = minimum( \@dist_from_clust_centers ); |
670
|
|
|
|
|
|
|
push @{$clusters[$best_center_index]}, $ele if defined $best_center_index; |
671
|
|
|
|
|
|
|
} |
672
|
|
|
|
|
|
|
# Since a cluster center may not correspond to any particular sample, it is possible |
673
|
|
|
|
|
|
|
# for one of the elements of the array @clusters to be null using the above |
674
|
|
|
|
|
|
|
# strategy for populating the initial clusters. Let's say there are five cluster |
675
|
|
|
|
|
|
|
# centers in the array @cluster_centers. The $best_center_index may populate the |
676
|
|
|
|
|
|
|
# the elements of the array @clusters for the indices 0, 1, 2, 4, which would leave |
677
|
|
|
|
|
|
|
# $clusters[3] as undefined. So, in what follows, we must first check if all of |
678
|
|
|
|
|
|
|
# the elements of @clusters are defined. |
679
|
|
|
|
|
|
|
my @determinants; |
680
|
|
|
|
|
|
|
foreach my $cluster(@clusters) { |
681
|
|
|
|
|
|
|
die "The clustering program started with bad initialization. Please start over" |
682
|
|
|
|
|
|
|
unless defined $cluster; |
683
|
|
|
|
|
|
|
my $covariance = $self->estimate_cluster_covariance($cluster); |
684
|
|
|
|
|
|
|
my $determinant = $covariance->det(); |
685
|
|
|
|
|
|
|
push @determinants, $determinant; |
686
|
|
|
|
|
|
|
} |
687
|
|
|
|
|
|
|
return [\@clusters, \@determinants]; |
688
|
|
|
|
|
|
|
} |
689
|
|
|
|
|
|
|
|
690
|
|
|
|
|
|
|
# This is the main routine that along with the update_cluster_centers() routine |
691
|
|
|
|
|
|
|
# constitute the two key steps of the K-Means algorithm. In most cases, the infinite |
692
|
|
|
|
|
|
|
# while() loop will terminate automatically when the cluster assignments of the data |
693
|
|
|
|
|
|
|
# points remain unchanged. For the sake of safety, we keep track of the number of |
694
|
|
|
|
|
|
|
# iterations. If this number reaches 100, we exit the while() loop anyway. In most |
695
|
|
|
|
|
|
|
# cases, this limit will not be reached. |
696
|
|
|
|
|
|
|
sub assign_data_to_clusters { |
697
|
|
|
|
|
|
|
my $self = shift; |
698
|
|
|
|
|
|
|
my $clusters = shift; |
699
|
|
|
|
|
|
|
my $K = shift; |
700
|
|
|
|
|
|
|
my $final_cluster_centers; |
701
|
|
|
|
|
|
|
my $iteration_index = 0; |
702
|
|
|
|
|
|
|
while (1) { |
703
|
|
|
|
|
|
|
my $new_clusters; |
704
|
|
|
|
|
|
|
my $assignment_changed_flag = 0; |
705
|
|
|
|
|
|
|
my $current_cluster_center_index = 0; |
706
|
|
|
|
|
|
|
my $cluster_size_zero_condition = 0; |
707
|
|
|
|
|
|
|
my $how_many = @$clusters; |
708
|
|
|
|
|
|
|
my $cluster_centers = $self->update_cluster_centers( deep_copy_AoA_with_nulls( $clusters ) ); |
709
|
|
|
|
|
|
|
$iteration_index++; |
710
|
|
|
|
|
|
|
foreach my $cluster (@$clusters) { |
711
|
|
|
|
|
|
|
my $current_cluster_center = $cluster_centers->[$current_cluster_center_index]; |
712
|
|
|
|
|
|
|
foreach my $ele (@$cluster) { |
713
|
|
|
|
|
|
|
my @dist_from_clust_centers; |
714
|
|
|
|
|
|
|
foreach my $center (@$cluster_centers) { |
715
|
|
|
|
|
|
|
push @dist_from_clust_centers, |
716
|
|
|
|
|
|
|
$self->distance($ele, $center); |
717
|
|
|
|
|
|
|
} |
718
|
|
|
|
|
|
|
my ($min, $best_center_index) = minimum( \@dist_from_clust_centers ); |
719
|
|
|
|
|
|
|
my $best_cluster_center = $cluster_centers->[$best_center_index]; |
720
|
|
|
|
|
|
|
if (vector_equal($current_cluster_center, $best_cluster_center)){ |
721
|
|
|
|
|
|
|
push @{$new_clusters->[$current_cluster_center_index]}, $ele; |
722
|
|
|
|
|
|
|
} else { |
723
|
|
|
|
|
|
|
$assignment_changed_flag = 1; |
724
|
|
|
|
|
|
|
push @{$new_clusters->[$best_center_index]}, $ele; |
725
|
|
|
|
|
|
|
} |
726
|
|
|
|
|
|
|
} |
727
|
|
|
|
|
|
|
$current_cluster_center_index++; |
728
|
|
|
|
|
|
|
} |
729
|
|
|
|
|
|
|
next if ((@$new_clusters != @$clusters) && ($iteration_index < 100)); |
730
|
|
|
|
|
|
|
# Now make sure that none of the K clusters is an empty cluster: |
731
|
|
|
|
|
|
|
foreach my $newcluster (@$new_clusters) { |
732
|
|
|
|
|
|
|
$cluster_size_zero_condition = 1 if ((!defined $newcluster) or (@$newcluster == 0)); |
733
|
|
|
|
|
|
|
} |
734
|
|
|
|
|
|
|
push @$new_clusters, (undef) x ($K - @$new_clusters) if @$new_clusters < $K; |
735
|
|
|
|
|
|
|
# During clustering for a fixed K, should a cluster inadvertently |
736
|
|
|
|
|
|
|
# become empty, steal a member from the largest cluster to hopefully |
737
|
|
|
|
|
|
|
# spawn a new cluster: |
738
|
|
|
|
|
|
|
my $largest_cluster; |
739
|
|
|
|
|
|
|
foreach my $local_cluster (@$new_clusters) { |
740
|
|
|
|
|
|
|
next if !defined $local_cluster; |
741
|
|
|
|
|
|
|
$largest_cluster = $local_cluster if !defined $largest_cluster; |
742
|
|
|
|
|
|
|
if (@$local_cluster > @$largest_cluster) { |
743
|
|
|
|
|
|
|
$largest_cluster = $local_cluster; |
744
|
|
|
|
|
|
|
} |
745
|
|
|
|
|
|
|
} |
746
|
|
|
|
|
|
|
foreach my $local_cluster (@$new_clusters) { |
747
|
|
|
|
|
|
|
if ( (!defined $local_cluster) || (@$local_cluster == 0) ) { |
748
|
|
|
|
|
|
|
push @$local_cluster, pop @$largest_cluster; |
749
|
|
|
|
|
|
|
} |
750
|
|
|
|
|
|
|
} |
751
|
|
|
|
|
|
|
next if (($cluster_size_zero_condition) && ($iteration_index < 100)); |
752
|
|
|
|
|
|
|
last if $iteration_index == 100; |
753
|
|
|
|
|
|
|
$clusters = deep_copy_AoA( $new_clusters ); |
754
|
|
|
|
|
|
|
last if $assignment_changed_flag == 0; |
755
|
|
|
|
|
|
|
} |
756
|
|
|
|
|
|
|
$final_cluster_centers = $self->update_cluster_centers( $clusters ); |
757
|
|
|
|
|
|
|
return ($clusters, $final_cluster_centers); |
758
|
|
|
|
|
|
|
} |
759
|
|
|
|
|
|
|
|
760
|
|
|
|
|
|
|
sub assign_data_to_clusters_mahalanobis { |
761
|
|
|
|
|
|
|
my $self = shift; |
762
|
|
|
|
|
|
|
my $clusters = shift; |
763
|
|
|
|
|
|
|
my $K = shift; |
764
|
|
|
|
|
|
|
my $final_cluster_centers; |
765
|
|
|
|
|
|
|
my $iteration_index = 0; |
766
|
|
|
|
|
|
|
while (1) { |
767
|
|
|
|
|
|
|
my $new_clusters; |
768
|
|
|
|
|
|
|
my $assignment_changed_flag = 0; |
769
|
|
|
|
|
|
|
my $current_cluster_center_index = 0; |
770
|
|
|
|
|
|
|
my $cluster_size_zero_condition = 0; |
771
|
|
|
|
|
|
|
my $how_many = @$clusters; |
772
|
|
|
|
|
|
|
my $cluster_centers_and_covariances = |
773
|
|
|
|
|
|
|
$self->update_cluster_centers_and_covariances_mahalanobis(deep_copy_AoA_with_nulls($clusters)); |
774
|
|
|
|
|
|
|
my $cluster_centers = $cluster_centers_and_covariances->[0]; |
775
|
|
|
|
|
|
|
my $cluster_covariances = $cluster_centers_and_covariances->[1]; |
776
|
|
|
|
|
|
|
$iteration_index++; |
777
|
|
|
|
|
|
|
foreach my $cluster (@$clusters) { |
778
|
|
|
|
|
|
|
my $current_cluster_center = $cluster_centers->[$current_cluster_center_index]; |
779
|
|
|
|
|
|
|
my $current_cluster_covariance = $cluster_covariances->[$current_cluster_center_index]; |
780
|
|
|
|
|
|
|
foreach my $ele (@$cluster) { |
781
|
|
|
|
|
|
|
my @mahalanobis_dist_from_clust_centers; |
782
|
|
|
|
|
|
|
foreach my $i (0..@$clusters-1) { |
783
|
|
|
|
|
|
|
my $center = $cluster_centers->[$i]; |
784
|
|
|
|
|
|
|
my $covariance = $cluster_covariances->[$i]; |
785
|
|
|
|
|
|
|
my $maha_distance; |
786
|
|
|
|
|
|
|
eval { |
787
|
|
|
|
|
|
|
$maha_distance = $self->distance_mahalanobis($ele, $center, $covariance); |
788
|
|
|
|
|
|
|
}; |
789
|
|
|
|
|
|
|
next if $@; |
790
|
|
|
|
|
|
|
push @mahalanobis_dist_from_clust_centers, $maha_distance; |
791
|
|
|
|
|
|
|
} |
792
|
|
|
|
|
|
|
my ($min, $best_center_index) = minimum( \@mahalanobis_dist_from_clust_centers ); |
793
|
|
|
|
|
|
|
die "The Mahalanobis metric may not be appropriate for the data" |
794
|
|
|
|
|
|
|
unless defined $best_center_index; |
795
|
|
|
|
|
|
|
my $best_cluster_center = $cluster_centers->[$best_center_index]; |
796
|
|
|
|
|
|
|
if (vector_equal($current_cluster_center, $best_cluster_center)){ |
797
|
|
|
|
|
|
|
push @{$new_clusters->[$current_cluster_center_index]}, $ele; |
798
|
|
|
|
|
|
|
} else { |
799
|
|
|
|
|
|
|
$assignment_changed_flag = 1; |
800
|
|
|
|
|
|
|
push @{$new_clusters->[$best_center_index]}, $ele; |
801
|
|
|
|
|
|
|
} |
802
|
|
|
|
|
|
|
} |
803
|
|
|
|
|
|
|
$current_cluster_center_index++; |
804
|
|
|
|
|
|
|
} |
805
|
|
|
|
|
|
|
next if ((@$new_clusters != @$clusters) && ($iteration_index < 100)); |
806
|
|
|
|
|
|
|
# Now make sure that none of the K clusters is an empty cluster: |
807
|
|
|
|
|
|
|
foreach my $newcluster (@$new_clusters) { |
808
|
|
|
|
|
|
|
$cluster_size_zero_condition = 1 if ((!defined $newcluster) or (@$newcluster == 0)); |
809
|
|
|
|
|
|
|
} |
810
|
|
|
|
|
|
|
push @$new_clusters, (undef) x ($K - @$new_clusters) if @$new_clusters < $K; |
811
|
|
|
|
|
|
|
# During clustering for a fixed K, should a cluster inadvertently |
812
|
|
|
|
|
|
|
# become empty, steal a member from the largest cluster to hopefully |
813
|
|
|
|
|
|
|
# spawn a new cluster: |
814
|
|
|
|
|
|
|
my $largest_cluster; |
815
|
|
|
|
|
|
|
foreach my $local_cluster (@$new_clusters) { |
816
|
|
|
|
|
|
|
next if !defined $local_cluster; |
817
|
|
|
|
|
|
|
$largest_cluster = $local_cluster if !defined $largest_cluster; |
818
|
|
|
|
|
|
|
if (@$local_cluster > @$largest_cluster) { |
819
|
|
|
|
|
|
|
$largest_cluster = $local_cluster; |
820
|
|
|
|
|
|
|
} |
821
|
|
|
|
|
|
|
} |
822
|
|
|
|
|
|
|
foreach my $local_cluster (@$new_clusters) { |
823
|
|
|
|
|
|
|
if ( (!defined $local_cluster) || (@$local_cluster == 0) ) { |
824
|
|
|
|
|
|
|
push @$local_cluster, pop @$largest_cluster; |
825
|
|
|
|
|
|
|
} |
826
|
|
|
|
|
|
|
} |
827
|
|
|
|
|
|
|
next if (($cluster_size_zero_condition) && ($iteration_index < 100)); |
828
|
|
|
|
|
|
|
last if $iteration_index == 100; |
829
|
|
|
|
|
|
|
# Now do a deep copy of new_clusters into clusters |
830
|
|
|
|
|
|
|
$clusters = deep_copy_AoA( $new_clusters ); |
831
|
|
|
|
|
|
|
last if $assignment_changed_flag == 0; |
832
|
|
|
|
|
|
|
} |
833
|
|
|
|
|
|
|
$final_cluster_centers = $self->update_cluster_centers( $clusters ); |
834
|
|
|
|
|
|
|
return ($clusters, $final_cluster_centers); |
835
|
|
|
|
|
|
|
} |
836
|
|
|
|
|
|
|
|
837
|
|
|
|
|
|
|
sub update_cluster_centers_and_covariances_mahalanobis { |
838
|
|
|
|
|
|
|
my $self = shift; |
839
|
|
|
|
|
|
|
my @clusters = @{ shift @_ }; |
840
|
|
|
|
|
|
|
my @new_cluster_centers; |
841
|
|
|
|
|
|
|
my @new_cluster_covariances; |
842
|
|
|
|
|
|
|
# During clustering for a fixed K, should a cluster inadvertently become empty, |
843
|
|
|
|
|
|
|
# steal a member from the largest cluster to hopefully spawn a new cluster: |
844
|
|
|
|
|
|
|
my $largest_cluster; |
845
|
|
|
|
|
|
|
foreach my $cluster (@clusters) { |
846
|
|
|
|
|
|
|
next if !defined $cluster; |
847
|
|
|
|
|
|
|
$largest_cluster = $cluster if !defined $largest_cluster; |
848
|
|
|
|
|
|
|
if (@$cluster > @$largest_cluster) { |
849
|
|
|
|
|
|
|
$largest_cluster = $cluster; |
850
|
|
|
|
|
|
|
} |
851
|
|
|
|
|
|
|
} |
852
|
|
|
|
|
|
|
foreach my $cluster (@clusters) { |
853
|
|
|
|
|
|
|
if ( (!defined $cluster) || (@$cluster == 0) ) { |
854
|
|
|
|
|
|
|
push @$cluster, pop @$largest_cluster; |
855
|
|
|
|
|
|
|
} |
856
|
|
|
|
|
|
|
} |
857
|
|
|
|
|
|
|
foreach my $cluster (@clusters) { |
858
|
|
|
|
|
|
|
die "Cluster became empty --- untenable condition " . |
859
|
|
|
|
|
|
|
"for a given K. Try again. \n" if !defined $cluster; |
860
|
|
|
|
|
|
|
my $cluster_size = @$cluster; |
861
|
|
|
|
|
|
|
die "Cluster size is zero --- untenable.\n" if $cluster_size == 0; |
862
|
|
|
|
|
|
|
my @new_cluster_center = @{$self->add_point_coords( $cluster )}; |
863
|
|
|
|
|
|
|
@new_cluster_center = map {my $x = $_/$cluster_size; $x} @new_cluster_center; |
864
|
|
|
|
|
|
|
push @new_cluster_centers, \@new_cluster_center; |
865
|
|
|
|
|
|
|
# for covariance calculation: |
866
|
|
|
|
|
|
|
my ($num_rows,$num_cols) = ($self->{_data_dimensions}, scalar(@$cluster)); |
867
|
|
|
|
|
|
|
my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols); |
868
|
|
|
|
|
|
|
my $mean_vec = Math::GSL::Matrix->new($num_rows,1); |
869
|
|
|
|
|
|
|
# All the record labels are stored in the array $self->{_data_id_tags}. The |
870
|
|
|
|
|
|
|
# actual data for clustering is stored in a hash at $self->{_data} whose keys are |
871
|
|
|
|
|
|
|
# the record labels; the value associated with each key is the array holding the |
872
|
|
|
|
|
|
|
# corresponding numerical multidimensional data. |
873
|
|
|
|
|
|
|
foreach my $j (0..$num_cols-1) { |
874
|
|
|
|
|
|
|
my $tag = $cluster->[$j]; |
875
|
|
|
|
|
|
|
my $data = $self->{_data}->{$tag}; |
876
|
|
|
|
|
|
|
my @diff_from_mean = vector_subtract($data, \@new_cluster_center); |
877
|
|
|
|
|
|
|
$matrix->set_col($j, \@diff_from_mean); |
878
|
|
|
|
|
|
|
} |
879
|
|
|
|
|
|
|
my $transposed = transpose( $matrix ); |
880
|
|
|
|
|
|
|
my $covariance = matrix_multiply( $matrix, $transposed ); |
881
|
|
|
|
|
|
|
$covariance *= 1.0 / $num_cols; |
882
|
|
|
|
|
|
|
if ($self->{_debug}) { |
883
|
|
|
|
|
|
|
print "\nDisplaying the Covariance Matrix for cluster:"; |
884
|
|
|
|
|
|
|
display_matrix( $covariance ); |
885
|
|
|
|
|
|
|
} |
886
|
|
|
|
|
|
|
push @new_cluster_covariances, $covariance; |
887
|
|
|
|
|
|
|
} |
888
|
|
|
|
|
|
|
return [\@new_cluster_centers, \@new_cluster_covariances]; |
889
|
|
|
|
|
|
|
} |
890
|
|
|
|
|
|
|
|
891
|
|
|
|
|
|
|
# After each new assignment of the data points to the clusters on the basis of the |
892
|
|
|
|
|
|
|
# current values for the cluster centers, we call the routine shown here for updating |
893
|
|
|
|
|
|
|
# the values of the cluster centers. |
894
|
|
|
|
|
|
|
sub update_cluster_centers { |
895
|
|
|
|
|
|
|
my $self = shift; |
896
|
|
|
|
|
|
|
my @clusters = @{ shift @_ }; |
897
|
|
|
|
|
|
|
my @new_cluster_centers; |
898
|
|
|
|
|
|
|
# During clustering for a fixed K, should a cluster inadvertently become empty, |
899
|
|
|
|
|
|
|
# steal a member from the largest cluster to hopefully spawn a new cluster: |
900
|
|
|
|
|
|
|
my $largest_cluster; |
901
|
|
|
|
|
|
|
foreach my $cluster (@clusters) { |
902
|
|
|
|
|
|
|
next if !defined $cluster; |
903
|
|
|
|
|
|
|
$largest_cluster = $cluster if !defined $largest_cluster; |
904
|
|
|
|
|
|
|
if (@$cluster > @$largest_cluster) { |
905
|
|
|
|
|
|
|
$largest_cluster = $cluster; |
906
|
|
|
|
|
|
|
} |
907
|
|
|
|
|
|
|
} |
908
|
|
|
|
|
|
|
foreach my $cluster (@clusters) { |
909
|
|
|
|
|
|
|
if ( (!defined $cluster) || (@$cluster == 0) ) { |
910
|
|
|
|
|
|
|
push @$cluster, pop @$largest_cluster; |
911
|
|
|
|
|
|
|
} |
912
|
|
|
|
|
|
|
} |
913
|
|
|
|
|
|
|
foreach my $cluster (@clusters) { |
914
|
|
|
|
|
|
|
die "Cluster became empty --- untenable condition " . |
915
|
|
|
|
|
|
|
"for a given K. Try again. \n" if !defined $cluster; |
916
|
|
|
|
|
|
|
my $cluster_size = @$cluster; |
917
|
|
|
|
|
|
|
die "Cluster size is zero --- untenable.\n" if $cluster_size == 0; |
918
|
|
|
|
|
|
|
my @new_cluster_center = @{$self->add_point_coords( $cluster )}; |
919
|
|
|
|
|
|
|
@new_cluster_center = map {my $x = $_/$cluster_size; $x} |
920
|
|
|
|
|
|
|
@new_cluster_center; |
921
|
|
|
|
|
|
|
push @new_cluster_centers, \@new_cluster_center; |
922
|
|
|
|
|
|
|
} |
923
|
|
|
|
|
|
|
return \@new_cluster_centers; |
924
|
|
|
|
|
|
|
} |
925
|
|
|
|
|
|
|
|
926
|
|
|
|
|
|
|
sub which_cluster_for_new_data_element { |
927
|
|
|
|
|
|
|
my $self = shift; |
928
|
|
|
|
|
|
|
my $ele = shift; |
929
|
|
|
|
|
|
|
die "The dimensionality of the new data element is not correct: $!" |
930
|
|
|
|
|
|
|
unless @$ele == $self->{_data_dimensions}; |
931
|
|
|
|
|
|
|
my %distance_to_new_ele_hash; |
932
|
|
|
|
|
|
|
foreach my $cluster_id (sort keys %{$self->{_cluster_centers_hash}}) { |
933
|
|
|
|
|
|
|
$distance_to_new_ele_hash{$cluster_id} = $self->distance2($ele, |
934
|
|
|
|
|
|
|
$self->{_cluster_centers_hash}->{$cluster_id}); |
935
|
|
|
|
|
|
|
} |
936
|
|
|
|
|
|
|
my @values = values %distance_to_new_ele_hash; |
937
|
|
|
|
|
|
|
my ($min,$max) = minmax(\@values); |
938
|
|
|
|
|
|
|
my $answer; |
939
|
|
|
|
|
|
|
foreach my $cluster_id (keys %distance_to_new_ele_hash) { |
940
|
|
|
|
|
|
|
$answer = $cluster_id if $distance_to_new_ele_hash{$cluster_id} == $min; |
941
|
|
|
|
|
|
|
} |
942
|
|
|
|
|
|
|
return $answer; |
943
|
|
|
|
|
|
|
} |
944
|
|
|
|
|
|
|
|
945
|
|
|
|
|
|
|
sub which_cluster_for_new_data_element_mahalanobis { |
946
|
|
|
|
|
|
|
my $self = shift; |
947
|
|
|
|
|
|
|
my $ele = shift; |
948
|
|
|
|
|
|
|
die "The dimensionality of the new data element is not correct: $!" |
949
|
|
|
|
|
|
|
unless @$ele == $self->{_data_dimensions}; |
950
|
|
|
|
|
|
|
my %distance_to_new_ele_hash; |
951
|
|
|
|
|
|
|
foreach my $cluster_id (sort keys %{$self->{_cluster_centers_hash}}) { |
952
|
|
|
|
|
|
|
$distance_to_new_ele_hash{$cluster_id} = |
953
|
|
|
|
|
|
|
$self->distance_mahalanobis2($ele, $self->{_cluster_centers_hash}->{$cluster_id}, |
954
|
|
|
|
|
|
|
$self->{_cluster_covariances_hash}->{$cluster_id}); |
955
|
|
|
|
|
|
|
} |
956
|
|
|
|
|
|
|
my @values = values %distance_to_new_ele_hash; |
957
|
|
|
|
|
|
|
my ($min,$max) = minmax(\@values); |
958
|
|
|
|
|
|
|
my $answer; |
959
|
|
|
|
|
|
|
foreach my $cluster_id (keys %distance_to_new_ele_hash) { |
960
|
|
|
|
|
|
|
$answer = $cluster_id if $distance_to_new_ele_hash{$cluster_id} == $min; |
961
|
|
|
|
|
|
|
} |
962
|
|
|
|
|
|
|
return $answer; |
963
|
|
|
|
|
|
|
} |
964
|
|
|
|
|
|
|
|
965
|
|
|
|
|
|
|
# The following function returns the value of QoC for a given partitioning of the |
966
|
|
|
|
|
|
|
# data into K clusters. It calculates two things: the average value for the distance |
967
|
|
|
|
|
|
|
# between a data point and the center of the cluster in which the data point resides, |
968
|
|
|
|
|
|
|
# and the average value for the distances between the cluster centers. We obviously |
969
|
|
|
|
|
|
|
# want to minimize the former and maximize the latter. All of the "from center" |
970
|
|
|
|
|
|
|
# distances within each cluster are stored in the variable |
971
|
|
|
|
|
|
|
# $sum_of_distances_for_one_cluster. When this variable, after it is divided by the |
972
|
|
|
|
|
|
|
# number of data elements in the cluster, is summed over all the clusters, we get a |
973
|
|
|
|
|
|
|
# value that is stored in $avg_dist_for_cluster. The inter-cluster-center distances |
974
|
|
|
|
|
|
|
# are stored in the variable $inter_cluster_center_dist. |
975
|
|
|
|
|
|
|
sub cluster_quality { |
976
|
|
|
|
|
|
|
my $self = shift; |
977
|
|
|
|
|
|
|
my $clusters = shift; |
978
|
|
|
|
|
|
|
my $cluster_centers = shift; |
979
|
|
|
|
|
|
|
my $K = @$cluster_centers; # Number of clusters |
980
|
|
|
|
|
|
|
my $cluster_radius = 0; |
981
|
|
|
|
|
|
|
foreach my $i (0..@$clusters-1) { |
982
|
|
|
|
|
|
|
my $sum_of_distances_for_one_cluster = 0; |
983
|
|
|
|
|
|
|
foreach my $ele (@{$clusters->[$i]}) { |
984
|
|
|
|
|
|
|
$sum_of_distances_for_one_cluster += |
985
|
|
|
|
|
|
|
$self->distance( $ele, $cluster_centers->[$i] ); |
986
|
|
|
|
|
|
|
} |
987
|
|
|
|
|
|
|
$cluster_radius += |
988
|
|
|
|
|
|
|
$sum_of_distances_for_one_cluster / @{$clusters->[$i]}; |
989
|
|
|
|
|
|
|
} |
990
|
|
|
|
|
|
|
my $inter_cluster_center_dist = 0; |
991
|
|
|
|
|
|
|
foreach my $i (0..@$cluster_centers-1) { |
992
|
|
|
|
|
|
|
foreach my $j (0..@$cluster_centers-1) { |
993
|
|
|
|
|
|
|
$inter_cluster_center_dist += |
994
|
|
|
|
|
|
|
$self->distance2( $cluster_centers->[$i], |
995
|
|
|
|
|
|
|
$cluster_centers->[$j] ); |
996
|
|
|
|
|
|
|
} |
997
|
|
|
|
|
|
|
} |
998
|
|
|
|
|
|
|
my $avg_inter_cluster_center_dist = $inter_cluster_center_dist / |
999
|
|
|
|
|
|
|
( $K * ($K-1) / 2.0 ); |
1000
|
|
|
|
|
|
|
return $cluster_radius / $avg_inter_cluster_center_dist; |
1001
|
|
|
|
|
|
|
} |
1002
|
|
|
|
|
|
|
|
1003
|
|
|
|
|
|
|
# The following routine is for computing the distance between a data point specified |
1004
|
|
|
|
|
|
|
# by its symbolic name in the master datafile and a point (such as the center of a |
1005
|
|
|
|
|
|
|
# cluster) expressed as a vector of coordinates: |
1006
|
|
|
|
|
|
|
sub distance { |
1007
|
|
|
|
|
|
|
my $self = shift; |
1008
|
|
|
|
|
|
|
my $ele1_id = shift @_; # symbolic name of data sample |
1009
|
|
|
|
|
|
|
my @ele1 = @{$self->{_data}->{$ele1_id}}; |
1010
|
|
|
|
|
|
|
my @ele2 = @{shift @_}; |
1011
|
|
|
|
|
|
|
die "wrong data types for distance calculation\n" if @ele1 != @ele2; |
1012
|
|
|
|
|
|
|
my $how_many = @ele1; |
1013
|
|
|
|
|
|
|
my $squared_sum = 0; |
1014
|
|
|
|
|
|
|
foreach my $i (0..$how_many-1) { |
1015
|
|
|
|
|
|
|
$squared_sum += ($ele1[$i] - $ele2[$i])**2; |
1016
|
|
|
|
|
|
|
} |
1017
|
|
|
|
|
|
|
my $dist = sqrt $squared_sum; |
1018
|
|
|
|
|
|
|
return $dist; |
1019
|
|
|
|
|
|
|
} |
1020
|
|
|
|
|
|
|
|
1021
|
|
|
|
|
|
|
# The following routine does the same as above but now both arguments are expected to |
1022
|
|
|
|
|
|
|
# be arrays of numbers: |
1023
|
|
|
|
|
|
|
sub distance2 { |
1024
|
|
|
|
|
|
|
my $self = shift; |
1025
|
|
|
|
|
|
|
my @ele1 = @{shift @_}; |
1026
|
|
|
|
|
|
|
my @ele2 = @{shift @_}; |
1027
|
|
|
|
|
|
|
die "wrong data types for distance calculation\n" if @ele1 != @ele2; |
1028
|
|
|
|
|
|
|
my $how_many = @ele1; |
1029
|
|
|
|
|
|
|
my $squared_sum = 0; |
1030
|
|
|
|
|
|
|
foreach my $i (0..$how_many-1) { |
1031
|
|
|
|
|
|
|
$squared_sum += ($ele1[$i] - $ele2[$i])**2; |
1032
|
|
|
|
|
|
|
} |
1033
|
|
|
|
|
|
|
return sqrt $squared_sum; |
1034
|
|
|
|
|
|
|
} |
1035
|
|
|
|
|
|
|
|
1036
|
|
|
|
|
|
|
# Requires three args: $ele for the symbolic tag of the element, $center for the |
1037
|
|
|
|
|
|
|
# coordinates of the center of the cluster, and $covariance for the covariance of |
1038
|
|
|
|
|
|
|
# cluster. Our goal is to find the distance of the element ele from the cluster |
1039
|
|
|
|
|
|
|
# whose mean and covariance are provided. |
1040
|
|
|
|
|
|
|
sub distance_mahalanobis { |
1041
|
|
|
|
|
|
|
my $self = shift; |
1042
|
|
|
|
|
|
|
my $ele = shift; |
1043
|
|
|
|
|
|
|
my $cluster_center = shift; |
1044
|
|
|
|
|
|
|
my $cluster_covariance = shift; |
1045
|
|
|
|
|
|
|
my $det_of_covar = $cluster_covariance->det(); |
1046
|
|
|
|
|
|
|
my $ele_data = $self->{_data}->{$ele}; |
1047
|
|
|
|
|
|
|
my @diff_from_mean = vector_subtract($ele_data, $cluster_center); |
1048
|
|
|
|
|
|
|
my $vec = Math::GSL::Matrix->new($self->{_data_dimensions},1); |
1049
|
|
|
|
|
|
|
$vec->set_col(0, \@diff_from_mean); |
1050
|
|
|
|
|
|
|
my $transposed = transpose( $vec ); |
1051
|
|
|
|
|
|
|
my $covariance_inverse; |
1052
|
|
|
|
|
|
|
if ($cluster_covariance->det() > 0.001) { |
1053
|
|
|
|
|
|
|
$covariance_inverse = $cluster_covariance->inverse; |
1054
|
|
|
|
|
|
|
} else { |
1055
|
|
|
|
|
|
|
die "covariance matrix may not have an inverse"; |
1056
|
|
|
|
|
|
|
} |
1057
|
|
|
|
|
|
|
my $determinant = $covariance_inverse->det(); |
1058
|
|
|
|
|
|
|
my $distance = $transposed * $covariance_inverse * $vec; |
1059
|
|
|
|
|
|
|
my @distance = $distance->as_list; |
1060
|
|
|
|
|
|
|
$distance = $distance[0]; |
1061
|
|
|
|
|
|
|
return sqrt $distance; |
1062
|
|
|
|
|
|
|
} |
1063
|
|
|
|
|
|
|
# Same as the previous method except that the first argument can be the actual |
1064
|
|
|
|
|
|
|
# coordinates of the data element. Our goal is to find the Mahalanobis distance |
1065
|
|
|
|
|
|
|
# from a given data element to a cluster whose center and covariance are known. As |
1066
|
|
|
|
|
|
|
# for the previous method, it requires three arguments. |
1067
|
|
|
|
|
|
|
sub distance_mahalanobis2 { |
1068
|
|
|
|
|
|
|
my $self = shift; |
1069
|
|
|
|
|
|
|
my $ele = shift; # is now a ref to the array of coords for a point |
1070
|
|
|
|
|
|
|
my $cluster_center = shift; |
1071
|
|
|
|
|
|
|
my $cluster_covariance = shift; |
1072
|
|
|
|
|
|
|
return undef unless defined $cluster_covariance; |
1073
|
|
|
|
|
|
|
my $det_of_covar = $cluster_covariance->det(); |
1074
|
|
|
|
|
|
|
my @diff_from_mean = vector_subtract($ele, $cluster_center); |
1075
|
|
|
|
|
|
|
my $vec = Math::GSL::Matrix->new($self->{_data_dimensions},1); |
1076
|
|
|
|
|
|
|
$vec->set_col(0, \@diff_from_mean); |
1077
|
|
|
|
|
|
|
my $transposed = transpose( $vec ); |
1078
|
|
|
|
|
|
|
my $covariance_inverse; |
1079
|
|
|
|
|
|
|
if ($cluster_covariance->det() > 0.001) { |
1080
|
|
|
|
|
|
|
$covariance_inverse = $cluster_covariance->inverse; |
1081
|
|
|
|
|
|
|
} else { |
1082
|
|
|
|
|
|
|
die "covariance matrix may not have an inverse"; |
1083
|
|
|
|
|
|
|
} |
1084
|
|
|
|
|
|
|
my $determinant = $covariance_inverse->det(); |
1085
|
|
|
|
|
|
|
my $distance = $transposed * $covariance_inverse * $vec; |
1086
|
|
|
|
|
|
|
my @distance = $distance->as_list; |
1087
|
|
|
|
|
|
|
$distance = $distance[0]; |
1088
|
|
|
|
|
|
|
return sqrt $distance; |
1089
|
|
|
|
|
|
|
} |
1090
|
|
|
|
|
|
|
|
1091
|
|
|
|
|
|
|
sub estimate_cluster_covariance { |
1092
|
|
|
|
|
|
|
my $self = shift; |
1093
|
|
|
|
|
|
|
my $cluster = shift; |
1094
|
|
|
|
|
|
|
my $cluster_size = @$cluster; |
1095
|
|
|
|
|
|
|
my @cluster_center = @{$self->add_point_coords( $cluster )}; |
1096
|
|
|
|
|
|
|
@cluster_center = map {my $x = $_/$cluster_size; $x} @cluster_center; |
1097
|
|
|
|
|
|
|
# for covariance calculation: |
1098
|
|
|
|
|
|
|
my ($num_rows,$num_cols) = ($self->{_data_dimensions}, scalar(@$cluster)); |
1099
|
|
|
|
|
|
|
my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols); |
1100
|
|
|
|
|
|
|
my $mean_vec = Math::GSL::Matrix->new($num_rows,1); |
1101
|
|
|
|
|
|
|
# All the record labels are stored in the array $self->{_data_id_tags}. The |
1102
|
|
|
|
|
|
|
# actual data for clustering is stored in a hash at $self->{_data} whose keys are |
1103
|
|
|
|
|
|
|
# the record labels; the value associated with each key is the array holding the |
1104
|
|
|
|
|
|
|
# corresponding numerical multidimensional data. |
1105
|
|
|
|
|
|
|
foreach my $j (0..$num_cols-1) { |
1106
|
|
|
|
|
|
|
my $tag = $cluster->[$j]; |
1107
|
|
|
|
|
|
|
my $data = $self->{_data}->{$tag}; |
1108
|
|
|
|
|
|
|
my @diff_from_mean = vector_subtract($data, \@cluster_center); |
1109
|
|
|
|
|
|
|
$matrix->set_col($j, \@diff_from_mean); |
1110
|
|
|
|
|
|
|
} |
1111
|
|
|
|
|
|
|
my $transposed = transpose( $matrix ); |
1112
|
|
|
|
|
|
|
my $covariance = $matrix * $transposed; |
1113
|
|
|
|
|
|
|
$covariance *= 1.0 / $num_cols; |
1114
|
|
|
|
|
|
|
if ($self->{_debug}) { |
1115
|
|
|
|
|
|
|
print "\nDisplaying the Covariance Matrix for cluster:"; |
1116
|
|
|
|
|
|
|
display_matrix( $covariance ); |
1117
|
|
|
|
|
|
|
} |
1118
|
|
|
|
|
|
|
return $covariance; |
1119
|
|
|
|
|
|
|
} |
1120
|
|
|
|
|
|
|
|
1121
|
|
|
|
|
|
|
sub write_clusters_to_files { |
1122
|
|
|
|
|
|
|
my $self = shift; |
1123
|
|
|
|
|
|
|
my @clusters = @{$self->{_clusters}}; |
1124
|
|
|
|
|
|
|
unlink glob "cluster*.dat"; |
1125
|
|
|
|
|
|
|
foreach my $i (0..@clusters-1) { |
1126
|
|
|
|
|
|
|
my $filename = "cluster" . $i . ".txt"; |
1127
|
|
|
|
|
|
|
print "\nWriting cluster $i to file $filename\n" if $self->{_terminal_output}; |
1128
|
|
|
|
|
|
|
open FILEHANDLE, "| sort > $filename" or die "Unable to open file: $!"; |
1129
|
|
|
|
|
|
|
foreach my $ele (@{$clusters[$i]}) { |
1130
|
|
|
|
|
|
|
print FILEHANDLE "$ele "; |
1131
|
|
|
|
|
|
|
} |
1132
|
|
|
|
|
|
|
close FILEHANDLE; |
1133
|
|
|
|
|
|
|
} |
1134
|
|
|
|
|
|
|
} |
1135
|
|
|
|
|
|
|
|
1136
|
|
|
|
|
|
|
sub get_K_best { |
1137
|
|
|
|
|
|
|
my $self = shift; |
1138
|
|
|
|
|
|
|
croak "You need to run the clusterer with K=0 option " . |
1139
|
|
|
|
|
|
|
"before you can call this method" if $self->{_K_best} eq 'unknown'; |
1140
|
|
|
|
|
|
|
print "\nThe best value of K: $self->{_K_best}\n" if $self->{_terminal_output}; |
1141
|
|
|
|
|
|
|
return $self->{_K_best}; |
1142
|
|
|
|
|
|
|
} |
1143
|
|
|
|
|
|
|
|
1144
|
|
|
|
|
|
|
sub show_QoC_values { |
1145
|
|
|
|
|
|
|
my $self = shift; |
1146
|
|
|
|
|
|
|
croak "\nYou need to run the clusterer with K=0 option before you can call this method" |
1147
|
|
|
|
|
|
|
if $self->{_K_best} eq 'unknown'; |
1148
|
|
|
|
|
|
|
print "\nShown below are K on the left and the QoC values on the right (the smaller " . |
1149
|
|
|
|
|
|
|
"the QoC, the better it is)\n"; |
1150
|
|
|
|
|
|
|
foreach my $key (sort keys %{$self->{_QoC_values}} ) { |
1151
|
|
|
|
|
|
|
print " $key => $self->{_QoC_values}->{$key}\n" if defined $self->{_QoC_values}->{$key}; |
1152
|
|
|
|
|
|
|
} |
1153
|
|
|
|
|
|
|
} |
1154
|
|
|
|
|
|
|
|
1155
|
|
|
|
|
|
|
sub DESTROY { |
1156
|
|
|
|
|
|
|
unlink "__temp_" . basename($_[0]->{_datafile}); |
1157
|
|
|
|
|
|
|
unlink "__temp_data_" . basename($_[0]->{_datafile}); |
1158
|
|
|
|
|
|
|
unlink "__temp_normed_data_" . basename($_[0]->{_datafile}); |
1159
|
|
|
|
|
|
|
} |
1160
|
|
|
|
|
|
|
|
1161
|
|
|
|
|
|
|
################################## Visualization Code ################################### |
1162
|
|
|
|
|
|
|
|
1163
|
|
|
|
|
|
|
# It makes sense to call visualize_clusters() only AFTER you have called kmeans(). |
1164
|
|
|
|
|
|
|
# |
1165
|
|
|
|
|
|
|
# The visualize_clusters() implementation automatically figures out whether it |
1166
|
|
|
|
|
|
|
# should do a 2D plot or a 3D plot. If the number of on bits in the mask that is |
1167
|
|
|
|
|
|
|
# supplied as one of the arguments is greater than 2, it does a 3D plot for the |
1168
|
|
|
|
|
|
|
# first three data coordinates. That is, the clusters will be displayed in the 3D |
1169
|
|
|
|
|
|
|
# space formed by the first three data coordinates. On the other hand, if the number |
1170
|
|
|
|
|
|
|
# of on bits in the mask is exactly 2, it does a 2D plot. Should it happen that |
1171
|
|
|
|
|
|
|
# only one on bit is specified for the mask, visualize_clusters() aborts. |
1172
|
|
|
|
|
|
|
# |
1173
|
|
|
|
|
|
|
# The visualization code consists of first accessing each of clusters created by the |
1174
|
|
|
|
|
|
|
# kmeans() subroutine. Note that the clusters contain only the symbolic names for |
1175
|
|
|
|
|
|
|
# the individual records in the source data file. We therefore next reach into the |
1176
|
|
|
|
|
|
|
# $self->{_original_data} hash and get the data coordinates associated with each |
1177
|
|
|
|
|
|
|
# symbolic label in a cluster. The numerical data thus generated is then written |
1178
|
|
|
|
|
|
|
# out to a temp file. When doing so we must remember to insert TWO BLANK LINES |
1179
|
|
|
|
|
|
|
# between the data blocks corresponding to the different clusters. This constraint |
1180
|
|
|
|
|
|
|
# is imposed on us by Gnuplot when plotting data from the same file since we want to |
1181
|
|
|
|
|
|
|
# use different point styles for the data points in different cluster files. |
1182
|
|
|
|
|
|
|
# |
1183
|
|
|
|
|
|
|
# Subsequently, we call upon the Perl interface provided by the Graphics::GnuplotIF |
1184
|
|
|
|
|
|
|
# module to plot the data clusters. |
1185
|
|
|
|
|
|
|
sub visualize_clusters { |
1186
|
|
|
|
|
|
|
my $self = shift; |
1187
|
|
|
|
|
|
|
my $v_mask; |
1188
|
|
|
|
|
|
|
my $pause_time; |
1189
|
|
|
|
|
|
|
if (@_ == 1) { |
1190
|
|
|
|
|
|
|
$v_mask = shift || croak "visualization mask missing"; |
1191
|
|
|
|
|
|
|
} elsif (@_ == 2) { |
1192
|
|
|
|
|
|
|
$v_mask = shift || croak "visualization mask missing"; |
1193
|
|
|
|
|
|
|
$pause_time = shift; |
1194
|
|
|
|
|
|
|
} else { |
1195
|
|
|
|
|
|
|
croak "visualize_clusters() called with wrong args"; |
1196
|
|
|
|
|
|
|
} |
1197
|
|
|
|
|
|
|
my $master_datafile = $self->{_datafile}; |
1198
|
|
|
|
|
|
|
my @v_mask = split //, $v_mask; |
1199
|
|
|
|
|
|
|
my $visualization_mask_width = @v_mask; |
1200
|
|
|
|
|
|
|
my $original_data_mask = $self->{_mask}; |
1201
|
|
|
|
|
|
|
my @mask = split //, $original_data_mask; |
1202
|
|
|
|
|
|
|
my $data_field_width = scalar grep {$_ eq '1'} @mask; |
1203
|
|
|
|
|
|
|
croak "\n\nABORTED: The width of the visualization mask (including " . |
1204
|
|
|
|
|
|
|
"all its 1s and 0s) must equal the width of the original mask " . |
1205
|
|
|
|
|
|
|
"used for reading the data file (counting only the 1's)" |
1206
|
|
|
|
|
|
|
if $visualization_mask_width != $data_field_width; |
1207
|
|
|
|
|
|
|
my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask; |
1208
|
|
|
|
|
|
|
my %visualization_data; |
1209
|
|
|
|
|
|
|
while ( my ($record_id, $data) = each %{$self->{_original_data}} ) { |
1210
|
|
|
|
|
|
|
my @fields = @$data; |
1211
|
|
|
|
|
|
|
croak "\nABORTED: Visualization mask size exceeds data record size\n" |
1212
|
|
|
|
|
|
|
if $#v_mask > $#fields; |
1213
|
|
|
|
|
|
|
my @data_fields; |
1214
|
|
|
|
|
|
|
foreach my $i (0..@fields-1) { |
1215
|
|
|
|
|
|
|
if ($v_mask[$i] eq '0') { |
1216
|
|
|
|
|
|
|
next; |
1217
|
|
|
|
|
|
|
} elsif ($v_mask[$i] eq '1') { |
1218
|
|
|
|
|
|
|
push @data_fields, $fields[$i]; |
1219
|
|
|
|
|
|
|
} else { |
1220
|
|
|
|
|
|
|
croak "Misformed visualization mask. It can only have 1s and 0s\n"; |
1221
|
|
|
|
|
|
|
} |
1222
|
|
|
|
|
|
|
} |
1223
|
|
|
|
|
|
|
$visualization_data{ $record_id } = \@data_fields; |
1224
|
|
|
|
|
|
|
} |
1225
|
|
|
|
|
|
|
my @all_data_ids = @{$self->{_data_id_tags}}; |
1226
|
|
|
|
|
|
|
my $K = scalar @{$self->{_clusters}}; |
1227
|
|
|
|
|
|
|
my $filename = basename($master_datafile); |
1228
|
|
|
|
|
|
|
my $temp_file = "__temp_" . $filename; |
1229
|
|
|
|
|
|
|
unlink $temp_file if -e $temp_file; |
1230
|
|
|
|
|
|
|
open OUTPUT, ">$temp_file" |
1231
|
|
|
|
|
|
|
or die "Unable to open a temp file in this directory: $!\n"; |
1232
|
|
|
|
|
|
|
foreach my $cluster (@{$self->{_clusters}}) { |
1233
|
|
|
|
|
|
|
foreach my $item (@$cluster) { |
1234
|
|
|
|
|
|
|
print OUTPUT "@{$visualization_data{$item}}"; |
1235
|
|
|
|
|
|
|
print OUTPUT "\n"; |
1236
|
|
|
|
|
|
|
} |
1237
|
|
|
|
|
|
|
print OUTPUT "\n\n"; |
1238
|
|
|
|
|
|
|
} |
1239
|
|
|
|
|
|
|
close OUTPUT; |
1240
|
|
|
|
|
|
|
my $plot; |
1241
|
|
|
|
|
|
|
my $hardcopy_plot; |
1242
|
|
|
|
|
|
|
if (!defined $pause_time) { |
1243
|
|
|
|
|
|
|
$plot = Graphics::GnuplotIF->new( persist => 1 ); |
1244
|
|
|
|
|
|
|
$hardcopy_plot = Graphics::GnuplotIF->new(); |
1245
|
|
|
|
|
|
|
$hardcopy_plot->gnuplot_cmd('set terminal png', "set output \"clustering_results.png\""); |
1246
|
|
|
|
|
|
|
} else { |
1247
|
|
|
|
|
|
|
$plot = Graphics::GnuplotIF->new(); |
1248
|
|
|
|
|
|
|
} |
1249
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "set noclip" ); |
1250
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "set pointsize 2" ); |
1251
|
|
|
|
|
|
|
my $arg_string = ""; |
1252
|
|
|
|
|
|
|
if ($visualization_data_field_width > 2) { |
1253
|
|
|
|
|
|
|
foreach my $i (0..$K-1) { |
1254
|
|
|
|
|
|
|
my $j = $i + 1; |
1255
|
|
|
|
|
|
|
$arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster $i\" with points lt $j pt $j, "; |
1256
|
|
|
|
|
|
|
} |
1257
|
|
|
|
|
|
|
} elsif ($visualization_data_field_width == 2) { |
1258
|
|
|
|
|
|
|
foreach my $i (0..$K-1) { |
1259
|
|
|
|
|
|
|
my $j = $i + 1; |
1260
|
|
|
|
|
|
|
$arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster $i\" with points lt $j pt $j, "; |
1261
|
|
|
|
|
|
|
} |
1262
|
|
|
|
|
|
|
} elsif ($visualization_data_field_width == 1 ) { |
1263
|
|
|
|
|
|
|
foreach my $i (0..$K-1) { |
1264
|
|
|
|
|
|
|
my $j = $i + 1; |
1265
|
|
|
|
|
|
|
$arg_string .= "\"$temp_file\" index $i using 1 title \"Cluster $i\" with points lt $j pt $j, "; |
1266
|
|
|
|
|
|
|
} |
1267
|
|
|
|
|
|
|
} |
1268
|
|
|
|
|
|
|
$arg_string = $arg_string =~ /^(.*),[ ]+$/; |
1269
|
|
|
|
|
|
|
$arg_string = $1; |
1270
|
|
|
|
|
|
|
if ($visualization_data_field_width > 2) { |
1271
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "splot $arg_string" ); |
1272
|
|
|
|
|
|
|
$hardcopy_plot->gnuplot_cmd( "splot $arg_string" ) unless defined $pause_time; |
1273
|
|
|
|
|
|
|
$plot->gnuplot_pause( $pause_time ) if defined $pause_time; |
1274
|
|
|
|
|
|
|
} elsif ($visualization_data_field_width == 2) { |
1275
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "plot $arg_string" ); |
1276
|
|
|
|
|
|
|
$hardcopy_plot->gnuplot_cmd( "plot $arg_string" ) unless defined $pause_time; |
1277
|
|
|
|
|
|
|
$plot->gnuplot_pause( $pause_time ) if defined $pause_time; |
1278
|
|
|
|
|
|
|
} elsif ($visualization_data_field_width == 1) { |
1279
|
|
|
|
|
|
|
croak "No provision for plotting 1-D data\n"; |
1280
|
|
|
|
|
|
|
} |
1281
|
|
|
|
|
|
|
} |
1282
|
|
|
|
|
|
|
|
1283
|
|
|
|
|
|
|
# It makes sense to call visualize_data() only AFTER you have called the method |
1284
|
|
|
|
|
|
|
# read_data_from_file(). |
1285
|
|
|
|
|
|
|
# |
1286
|
|
|
|
|
|
|
# The visualize_data() is meant for the visualization of the original data in its |
1287
|
|
|
|
|
|
|
# various 2D or 3D subspaces. The method can also be used to visualize the normed |
1288
|
|
|
|
|
|
|
# data in a similar manner. Recall the normed data is the original data after each |
1289
|
|
|
|
|
|
|
# data dimension is normalized by the standard-deviation along that dimension. |
1290
|
|
|
|
|
|
|
# |
1291
|
|
|
|
|
|
|
# Whether you see the original data or the normed data depends on the second |
1292
|
|
|
|
|
|
|
# argument supplied in the method call. It must be either the string 'original' or |
1293
|
|
|
|
|
|
|
# the string 'normed'. |
1294
|
|
|
|
|
|
|
sub visualize_data { |
1295
|
|
|
|
|
|
|
my $self = shift; |
1296
|
|
|
|
|
|
|
my $v_mask = shift || croak "visualization mask missing"; |
1297
|
|
|
|
|
|
|
my $datatype = shift; # must be either 'original' or 'normed' |
1298
|
|
|
|
|
|
|
croak "\n\nABORTED: You called visualize_data() for normed data " . |
1299
|
|
|
|
|
|
|
"but without first turning on data normalization in the " . |
1300
|
|
|
|
|
|
|
"in the KMeans constructor" |
1301
|
|
|
|
|
|
|
if ($datatype eq 'normed') && ! $self->{_var_normalize}; |
1302
|
|
|
|
|
|
|
my $master_datafile = $self->{_datafile}; |
1303
|
|
|
|
|
|
|
my @v_mask = split //, $v_mask; |
1304
|
|
|
|
|
|
|
my $visualization_mask_width = @v_mask; |
1305
|
|
|
|
|
|
|
my $original_data_mask = $self->{_mask}; |
1306
|
|
|
|
|
|
|
my @mask = split //, $original_data_mask; |
1307
|
|
|
|
|
|
|
my $data_field_width = scalar grep {$_ eq '1'} @mask; |
1308
|
|
|
|
|
|
|
croak "\n\nABORTED: The width of the visualization mask (including " . |
1309
|
|
|
|
|
|
|
"all its 1s and 0s) must equal the width of the original mask " . |
1310
|
|
|
|
|
|
|
"used for reading the data file (counting only the 1's)" |
1311
|
|
|
|
|
|
|
if $visualization_mask_width != $data_field_width; |
1312
|
|
|
|
|
|
|
my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask; |
1313
|
|
|
|
|
|
|
my %visualization_data; |
1314
|
|
|
|
|
|
|
my $data_source; |
1315
|
|
|
|
|
|
|
if ($datatype eq 'original') { |
1316
|
|
|
|
|
|
|
$data_source = $self->{_original_data}; |
1317
|
|
|
|
|
|
|
} elsif ($datatype eq 'normed') { |
1318
|
|
|
|
|
|
|
$data_source = $self->{_data}; |
1319
|
|
|
|
|
|
|
} else { |
1320
|
|
|
|
|
|
|
croak "\n\nABORTED: improper call to visualize_data()"; |
1321
|
|
|
|
|
|
|
} |
1322
|
|
|
|
|
|
|
while ( my ($record_id, $data) = each %{$data_source} ) { |
1323
|
|
|
|
|
|
|
my @fields = @$data; |
1324
|
|
|
|
|
|
|
croak "\nABORTED: Visualization mask size exceeds data record size\n" |
1325
|
|
|
|
|
|
|
if $#v_mask > $#fields; |
1326
|
|
|
|
|
|
|
my @data_fields; |
1327
|
|
|
|
|
|
|
foreach my $i (0..@fields-1) { |
1328
|
|
|
|
|
|
|
if ($v_mask[$i] eq '0') { |
1329
|
|
|
|
|
|
|
next; |
1330
|
|
|
|
|
|
|
} elsif ($v_mask[$i] eq '1') { |
1331
|
|
|
|
|
|
|
push @data_fields, $fields[$i]; |
1332
|
|
|
|
|
|
|
} else { |
1333
|
|
|
|
|
|
|
croak "Misformed visualization mask. It can only have 1s and 0s\n"; |
1334
|
|
|
|
|
|
|
} |
1335
|
|
|
|
|
|
|
} |
1336
|
|
|
|
|
|
|
$visualization_data{ $record_id } = \@data_fields; |
1337
|
|
|
|
|
|
|
} |
1338
|
|
|
|
|
|
|
my $filename = basename($master_datafile); |
1339
|
|
|
|
|
|
|
my $temp_file; |
1340
|
|
|
|
|
|
|
if ($datatype eq 'original') { |
1341
|
|
|
|
|
|
|
$temp_file = "__temp_data_" . $filename; |
1342
|
|
|
|
|
|
|
} elsif ($datatype eq 'normed') { |
1343
|
|
|
|
|
|
|
$temp_file = "__temp_normed_data_" . $filename; |
1344
|
|
|
|
|
|
|
} else { |
1345
|
|
|
|
|
|
|
croak "ABORTED: Improper call to visualize_data()"; |
1346
|
|
|
|
|
|
|
} |
1347
|
|
|
|
|
|
|
unlink $temp_file if -e $temp_file; |
1348
|
|
|
|
|
|
|
open OUTPUT, ">$temp_file" |
1349
|
|
|
|
|
|
|
or die "Unable to open a temp file in this directory: $!\n"; |
1350
|
|
|
|
|
|
|
foreach my $datapoint (values %visualization_data) { |
1351
|
|
|
|
|
|
|
print OUTPUT "@$datapoint"; |
1352
|
|
|
|
|
|
|
print OUTPUT "\n"; |
1353
|
|
|
|
|
|
|
} |
1354
|
|
|
|
|
|
|
close OUTPUT; |
1355
|
|
|
|
|
|
|
my $plot = Graphics::GnuplotIF->new( persist => 1 ); |
1356
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "set noclip" ); |
1357
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "set pointsize 2" ); |
1358
|
|
|
|
|
|
|
my $plot_title = $datatype eq 'original' ? '"data"' : '"normed data"'; |
1359
|
|
|
|
|
|
|
my $arg_string ; |
1360
|
|
|
|
|
|
|
if ($visualization_data_field_width > 2) { |
1361
|
|
|
|
|
|
|
$arg_string = "\"$temp_file\" using 1:2:3 title $plot_title with points lt -1 pt 1"; |
1362
|
|
|
|
|
|
|
} elsif ($visualization_data_field_width == 2) { |
1363
|
|
|
|
|
|
|
$arg_string = "\"$temp_file\" using 1:2 title $plot_title with points lt -1 pt 1"; |
1364
|
|
|
|
|
|
|
} elsif ($visualization_data_field_width == 1 ) { |
1365
|
|
|
|
|
|
|
$arg_string = "\"$temp_file\" using 1 notitle with points lt -1 pt 1"; |
1366
|
|
|
|
|
|
|
} |
1367
|
|
|
|
|
|
|
if ($visualization_data_field_width > 2) { |
1368
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "splot $arg_string" ); |
1369
|
|
|
|
|
|
|
} elsif ($visualization_data_field_width == 2) { |
1370
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "plot $arg_string" ); |
1371
|
|
|
|
|
|
|
} elsif ($visualization_data_field_width == 1) { |
1372
|
|
|
|
|
|
|
croak "No provision for plotting 1-D data\n"; |
1373
|
|
|
|
|
|
|
} |
1374
|
|
|
|
|
|
|
} |
1375
|
|
|
|
|
|
|
|
1376
|
|
|
|
|
|
|
########################### Generating Synthetic Data for Clustering ############################## |
1377
|
|
|
|
|
|
|
|
1378
|
|
|
|
|
|
|
# The data generated corresponds to a multivariate distribution. The mean and the |
1379
|
|
|
|
|
|
|
# covariance of each Gaussian in the distribution are specified individually in a |
1380
|
|
|
|
|
|
|
# parameter file. See the example parameter file param.txt in the examples |
1381
|
|
|
|
|
|
|
# directory. Just edit this file for your own needs. |
1382
|
|
|
|
|
|
|
# |
1383
|
|
|
|
|
|
|
# The multivariate random numbers are generated by calling the Math::Random module. |
1384
|
|
|
|
|
|
|
# As you would expect, that module will insist that the covariance matrix you |
1385
|
|
|
|
|
|
|
# specify be symmetric and positive definite. |
1386
|
|
|
|
|
|
|
sub cluster_data_generator { |
1387
|
|
|
|
|
|
|
my $class = shift; |
1388
|
|
|
|
|
|
|
croak "illegal call of a class method" unless $class eq 'Algorithm::KMeans'; |
1389
|
|
|
|
|
|
|
my %args = @_; |
1390
|
|
|
|
|
|
|
my $input_parameter_file = $args{input_parameter_file}; |
1391
|
|
|
|
|
|
|
my $output_file = $args{output_datafile}; |
1392
|
|
|
|
|
|
|
my $N = $args{number_data_points_per_cluster}; |
1393
|
|
|
|
|
|
|
my @all_params; |
1394
|
|
|
|
|
|
|
my $param_string; |
1395
|
|
|
|
|
|
|
if (defined $input_parameter_file) { |
1396
|
|
|
|
|
|
|
open INPUT, $input_parameter_file || "unable to open parameter file: $!"; |
1397
|
|
|
|
|
|
|
@all_params = ; |
1398
|
|
|
|
|
|
|
@all_params = grep { $_ !~ /^[ ]*#/ } @all_params; |
1399
|
|
|
|
|
|
|
chomp @all_params; |
1400
|
|
|
|
|
|
|
$param_string = join ' ', @all_params; |
1401
|
|
|
|
|
|
|
} else { |
1402
|
|
|
|
|
|
|
# Just for testing. Used in t/test.t |
1403
|
|
|
|
|
|
|
$param_string = "cluster 5 0 0 1 0 0 0 1 0 0 0 1 " . |
1404
|
|
|
|
|
|
|
"cluster 0 5 0 1 0 0 0 1 0 0 0 1 " . |
1405
|
|
|
|
|
|
|
"cluster 0 0 5 1 0 0 0 1 0 0 0 1"; |
1406
|
|
|
|
|
|
|
} |
1407
|
|
|
|
|
|
|
my @cluster_strings = split /[ ]*cluster[ ]*/, $param_string; |
1408
|
|
|
|
|
|
|
@cluster_strings = grep $_, @cluster_strings; |
1409
|
|
|
|
|
|
|
my $K = @cluster_strings; |
1410
|
|
|
|
|
|
|
croak "Too many clusters requested" if $K > 12; |
1411
|
|
|
|
|
|
|
my @point_labels = ('a'..'z'); |
1412
|
|
|
|
|
|
|
print "Number of Gaussians used for the synthetic data: $K\n"; |
1413
|
|
|
|
|
|
|
my @means; |
1414
|
|
|
|
|
|
|
my @covariances; |
1415
|
|
|
|
|
|
|
my $data_dimension; |
1416
|
|
|
|
|
|
|
foreach my $i (0..$K-1) { |
1417
|
|
|
|
|
|
|
my @num_strings = split / /, $cluster_strings[$i]; |
1418
|
|
|
|
|
|
|
my @cluster_mean = map {/$_num_regex/;$_} split / /, $num_strings[0]; |
1419
|
|
|
|
|
|
|
$data_dimension = @cluster_mean; |
1420
|
|
|
|
|
|
|
push @means, \@cluster_mean; |
1421
|
|
|
|
|
|
|
my @covariance_nums = map {/$_num_regex/;$_} split / /, $num_strings[1]; |
1422
|
|
|
|
|
|
|
croak "dimensionality error" if @covariance_nums != |
1423
|
|
|
|
|
|
|
($data_dimension ** 2); |
1424
|
|
|
|
|
|
|
my $cluster_covariance; |
1425
|
|
|
|
|
|
|
foreach my $j (0..$data_dimension-1) { |
1426
|
|
|
|
|
|
|
foreach my $k (0..$data_dimension-1) { |
1427
|
|
|
|
|
|
|
$cluster_covariance->[$j]->[$k] = |
1428
|
|
|
|
|
|
|
$covariance_nums[$j*$data_dimension + $k]; |
1429
|
|
|
|
|
|
|
} |
1430
|
|
|
|
|
|
|
} |
1431
|
|
|
|
|
|
|
push @covariances, $cluster_covariance; |
1432
|
|
|
|
|
|
|
} |
1433
|
|
|
|
|
|
|
random_seed_from_phrase( 'hellojello' ); |
1434
|
|
|
|
|
|
|
my @data_dump; |
1435
|
|
|
|
|
|
|
foreach my $i (0..$K-1) { |
1436
|
|
|
|
|
|
|
my @m = @{shift @means}; |
1437
|
|
|
|
|
|
|
my @covar = @{shift @covariances}; |
1438
|
|
|
|
|
|
|
my @new_data = Math::Random::random_multivariate_normal( $N, @m, @covar ); |
1439
|
|
|
|
|
|
|
my $p = 0; |
1440
|
|
|
|
|
|
|
my $label = $point_labels[$i]; |
1441
|
|
|
|
|
|
|
@new_data = map {unshift @$_, $label.$i; $i++; $_} @new_data; |
1442
|
|
|
|
|
|
|
push @data_dump, @new_data; |
1443
|
|
|
|
|
|
|
} |
1444
|
|
|
|
|
|
|
fisher_yates_shuffle( \@data_dump ); |
1445
|
|
|
|
|
|
|
open OUTPUT, ">$output_file"; |
1446
|
|
|
|
|
|
|
foreach my $ele (@data_dump) { |
1447
|
|
|
|
|
|
|
foreach my $coord ( @$ele ) { |
1448
|
|
|
|
|
|
|
print OUTPUT "$coord "; |
1449
|
|
|
|
|
|
|
} |
1450
|
|
|
|
|
|
|
print OUTPUT "\n"; |
1451
|
|
|
|
|
|
|
} |
1452
|
|
|
|
|
|
|
print "Data written out to file $output_file\n"; |
1453
|
|
|
|
|
|
|
close OUTPUT; |
1454
|
|
|
|
|
|
|
} |
1455
|
|
|
|
|
|
|
|
1456
|
|
|
|
|
|
|
sub add_point_coords { |
1457
|
|
|
|
|
|
|
my $self = shift; |
1458
|
|
|
|
|
|
|
my @arr_of_ids = @{shift @_}; # array of data element names |
1459
|
|
|
|
|
|
|
my @result; |
1460
|
|
|
|
|
|
|
my $data_dimensionality = $self->{_data_dimensions}; |
1461
|
|
|
|
|
|
|
foreach my $i (0..$data_dimensionality-1) { |
1462
|
|
|
|
|
|
|
$result[$i] = 0.0; |
1463
|
|
|
|
|
|
|
} |
1464
|
|
|
|
|
|
|
foreach my $id (@arr_of_ids) { |
1465
|
|
|
|
|
|
|
my $ele = $self->{_data}->{$id}; |
1466
|
|
|
|
|
|
|
my $i = 0; |
1467
|
|
|
|
|
|
|
foreach my $component (@$ele) { |
1468
|
|
|
|
|
|
|
$result[$i] += $component; |
1469
|
|
|
|
|
|
|
$i++; |
1470
|
|
|
|
|
|
|
} |
1471
|
|
|
|
|
|
|
} |
1472
|
|
|
|
|
|
|
return \@result; |
1473
|
|
|
|
|
|
|
} |
1474
|
|
|
|
|
|
|
|
1475
|
|
|
|
|
|
|
sub add_point_coords_from_original_data { |
1476
|
|
|
|
|
|
|
my $self = shift; |
1477
|
|
|
|
|
|
|
my @arr_of_ids = @{shift @_}; # array of data element names |
1478
|
|
|
|
|
|
|
my @result; |
1479
|
|
|
|
|
|
|
my $data_dimensionality = $self->{_data_dimensions}; |
1480
|
|
|
|
|
|
|
foreach my $i (0..$data_dimensionality-1) { |
1481
|
|
|
|
|
|
|
$result[$i] = 0.0; |
1482
|
|
|
|
|
|
|
} |
1483
|
|
|
|
|
|
|
foreach my $id (@arr_of_ids) { |
1484
|
|
|
|
|
|
|
my $ele = $self->{_original_data}->{$id}; |
1485
|
|
|
|
|
|
|
my $i = 0; |
1486
|
|
|
|
|
|
|
foreach my $component (@$ele) { |
1487
|
|
|
|
|
|
|
$result[$i] += $component; |
1488
|
|
|
|
|
|
|
$i++; |
1489
|
|
|
|
|
|
|
} |
1490
|
|
|
|
|
|
|
} |
1491
|
|
|
|
|
|
|
return \@result; |
1492
|
|
|
|
|
|
|
} |
1493
|
|
|
|
|
|
|
|
1494
|
|
|
|
|
|
|
################################### Support Routines ######################################## |
1495
|
|
|
|
|
|
|
|
1496
|
|
|
|
|
|
|
sub get_index_at_value { |
1497
|
|
|
|
|
|
|
my $value = shift; |
1498
|
|
|
|
|
|
|
my @array = @{shift @_}; |
1499
|
|
|
|
|
|
|
foreach my $i (0..@array-1) { |
1500
|
|
|
|
|
|
|
return $i if $value == $array[$i]; |
1501
|
|
|
|
|
|
|
} |
1502
|
|
|
|
|
|
|
return -1; |
1503
|
|
|
|
|
|
|
} |
1504
|
|
|
|
|
|
|
|
1505
|
|
|
|
|
|
|
# This routine is really not necessary in light of the new `~~' operator in Perl. |
1506
|
|
|
|
|
|
|
# Will use the new operator in the next version. |
1507
|
|
|
|
|
|
|
sub vector_equal { |
1508
|
|
|
|
|
|
|
my $vec1 = shift; |
1509
|
|
|
|
|
|
|
my $vec2 = shift; |
1510
|
|
|
|
|
|
|
die "wrong data types for vector-equal predicate\n" if @$vec1 != @$vec2; |
1511
|
|
|
|
|
|
|
foreach my $i (0..@$vec1-1){ |
1512
|
|
|
|
|
|
|
return 0 if $vec1->[$i] != $vec2->[$i]; |
1513
|
|
|
|
|
|
|
} |
1514
|
|
|
|
|
|
|
return 1; |
1515
|
|
|
|
|
|
|
} |
1516
|
|
|
|
|
|
|
|
1517
|
|
|
|
|
|
|
# Returns the minimum value and its positional index in an array |
1518
|
|
|
|
|
|
|
sub minimum { |
1519
|
|
|
|
|
|
|
my $arr = shift; |
1520
|
|
|
|
|
|
|
my $min; |
1521
|
|
|
|
|
|
|
my $index; |
1522
|
|
|
|
|
|
|
foreach my $i (0..@{$arr}-1) { |
1523
|
|
|
|
|
|
|
if ( (!defined $min) || ($arr->[$i] < $min) ) { |
1524
|
|
|
|
|
|
|
$index = $i; |
1525
|
|
|
|
|
|
|
$min = $arr->[$i]; |
1526
|
|
|
|
|
|
|
} |
1527
|
|
|
|
|
|
|
} |
1528
|
|
|
|
|
|
|
return ($min, $index); |
1529
|
|
|
|
|
|
|
} |
1530
|
|
|
|
|
|
|
|
1531
|
|
|
|
|
|
|
sub minmax { |
1532
|
|
|
|
|
|
|
my $arr = shift; |
1533
|
|
|
|
|
|
|
my $min; |
1534
|
|
|
|
|
|
|
my $max; |
1535
|
|
|
|
|
|
|
foreach my $i (0..@{$arr}-1) { |
1536
|
|
|
|
|
|
|
next if !defined $arr->[$i]; |
1537
|
|
|
|
|
|
|
if ( (!defined $min) && (!defined $max) ) { |
1538
|
|
|
|
|
|
|
$min = $arr->[$i]; |
1539
|
|
|
|
|
|
|
$max = $arr->[$i]; |
1540
|
|
|
|
|
|
|
} elsif ( $arr->[$i] < $min ) { |
1541
|
|
|
|
|
|
|
$min = $arr->[$i]; |
1542
|
|
|
|
|
|
|
} elsif ( $arr->[$i] > $max ) { |
1543
|
|
|
|
|
|
|
$max = $arr->[$i]; |
1544
|
|
|
|
|
|
|
} |
1545
|
|
|
|
|
|
|
} |
1546
|
|
|
|
|
|
|
return ($min, $max); |
1547
|
|
|
|
|
|
|
} |
1548
|
|
|
|
|
|
|
|
1549
|
|
|
|
|
|
|
# Meant only for constructing a deep copy of an array of arrays: |
1550
|
|
|
|
|
|
|
sub deep_copy_AoA { |
1551
|
|
|
|
|
|
|
my $ref_in = shift; |
1552
|
|
|
|
|
|
|
my $ref_out; |
1553
|
|
|
|
|
|
|
foreach my $i (0..@{$ref_in}-1) { |
1554
|
|
|
|
|
|
|
foreach my $j (0..@{$ref_in->[$i]}-1) { |
1555
|
|
|
|
|
|
|
$ref_out->[$i]->[$j] = $ref_in->[$i]->[$j]; |
1556
|
|
|
|
|
|
|
} |
1557
|
|
|
|
|
|
|
} |
1558
|
|
|
|
|
|
|
return $ref_out; |
1559
|
|
|
|
|
|
|
} |
1560
|
|
|
|
|
|
|
|
1561
|
|
|
|
|
|
|
# Meant only for constructing a deep copy of an array of arrays for the case when |
1562
|
|
|
|
|
|
|
# some elements of the top-level array may be undefined: |
1563
|
|
|
|
|
|
|
sub deep_copy_AoA_with_nulls { |
1564
|
|
|
|
|
|
|
my $ref_in = shift; |
1565
|
|
|
|
|
|
|
my $ref_out; |
1566
|
|
|
|
|
|
|
foreach my $i (0..@{$ref_in}-1) { |
1567
|
|
|
|
|
|
|
if ( !defined $ref_in->[$i] ) { |
1568
|
|
|
|
|
|
|
$ref_out->[$i] = undef; |
1569
|
|
|
|
|
|
|
next; |
1570
|
|
|
|
|
|
|
} |
1571
|
|
|
|
|
|
|
foreach my $j (0..@{$ref_in->[$i]}-1) { |
1572
|
|
|
|
|
|
|
$ref_out->[$i]->[$j] = $ref_in->[$i]->[$j]; |
1573
|
|
|
|
|
|
|
} |
1574
|
|
|
|
|
|
|
} |
1575
|
|
|
|
|
|
|
return $ref_out; |
1576
|
|
|
|
|
|
|
} |
1577
|
|
|
|
|
|
|
|
1578
|
|
|
|
|
|
|
# Meant only for constructing a deep copy of a hash in which each value is an |
1579
|
|
|
|
|
|
|
# anonymous array of numbers: |
1580
|
|
|
|
|
|
|
sub deep_copy_hash { |
1581
|
|
|
|
|
|
|
my $ref_in = shift; |
1582
|
|
|
|
|
|
|
my $ref_out; |
1583
|
|
|
|
|
|
|
while ( my ($key, $value) = each( %$ref_in ) ) { |
1584
|
|
|
|
|
|
|
$ref_out->{$key} = deep_copy_array( $value ); |
1585
|
|
|
|
|
|
|
} |
1586
|
|
|
|
|
|
|
return $ref_out; |
1587
|
|
|
|
|
|
|
} |
1588
|
|
|
|
|
|
|
|
1589
|
|
|
|
|
|
|
# Meant only for an array of numbers: |
1590
|
|
|
|
|
|
|
sub deep_copy_array { |
1591
|
|
|
|
|
|
|
my $ref_in = shift; |
1592
|
|
|
|
|
|
|
my $ref_out; |
1593
|
|
|
|
|
|
|
foreach my $i (0..@{$ref_in}-1) { |
1594
|
|
|
|
|
|
|
$ref_out->[$i] = $ref_in->[$i]; |
1595
|
|
|
|
|
|
|
} |
1596
|
|
|
|
|
|
|
return $ref_out; |
1597
|
|
|
|
|
|
|
} |
1598
|
|
|
|
|
|
|
|
1599
|
|
|
|
|
|
|
sub display_cluster_centers { |
1600
|
|
|
|
|
|
|
my $self = shift; |
1601
|
|
|
|
|
|
|
my @clusters = @{shift @_}; |
1602
|
|
|
|
|
|
|
my $i = 0; |
1603
|
|
|
|
|
|
|
foreach my $cluster (@clusters) { |
1604
|
|
|
|
|
|
|
my $cluster_size = @$cluster; |
1605
|
|
|
|
|
|
|
my @cluster_center = |
1606
|
|
|
|
|
|
|
@{$self->add_point_coords_from_original_data( $cluster )}; |
1607
|
|
|
|
|
|
|
@cluster_center = map {my $x = $_/$cluster_size; $x} @cluster_center; |
1608
|
|
|
|
|
|
|
print "\ncluster $i ($cluster_size records):\n"; |
1609
|
|
|
|
|
|
|
print "cluster center $i: " . |
1610
|
|
|
|
|
|
|
"@{[map {my $x = sprintf('%.4f', $_); $x} @cluster_center]}\n"; |
1611
|
|
|
|
|
|
|
$i++; |
1612
|
|
|
|
|
|
|
} |
1613
|
|
|
|
|
|
|
} |
1614
|
|
|
|
|
|
|
|
1615
|
|
|
|
|
|
|
# For displaying the individual clusters on a terminal screen. Each cluster is |
1616
|
|
|
|
|
|
|
# displayed through the symbolic names associated with the data points. |
1617
|
|
|
|
|
|
|
sub display_clusters { |
1618
|
|
|
|
|
|
|
my @clusters = @{shift @_}; |
1619
|
|
|
|
|
|
|
my $i = 0; |
1620
|
|
|
|
|
|
|
foreach my $cluster (@clusters) { |
1621
|
|
|
|
|
|
|
@$cluster = sort @$cluster; |
1622
|
|
|
|
|
|
|
my $cluster_size = @$cluster; |
1623
|
|
|
|
|
|
|
print "\n\nCluster $i ($cluster_size records):\n"; |
1624
|
|
|
|
|
|
|
foreach my $ele (@$cluster) { |
1625
|
|
|
|
|
|
|
print " $ele"; |
1626
|
|
|
|
|
|
|
} |
1627
|
|
|
|
|
|
|
$i++ |
1628
|
|
|
|
|
|
|
} |
1629
|
|
|
|
|
|
|
print "\n\n"; |
1630
|
|
|
|
|
|
|
} |
1631
|
|
|
|
|
|
|
|
1632
|
|
|
|
|
|
|
# from perl docs: |
1633
|
|
|
|
|
|
|
sub fisher_yates_shuffle { |
1634
|
|
|
|
|
|
|
my $arr = shift; |
1635
|
|
|
|
|
|
|
my $i = @$arr; |
1636
|
|
|
|
|
|
|
while (--$i) { |
1637
|
|
|
|
|
|
|
my $j = int rand( $i + 1 ); |
1638
|
|
|
|
|
|
|
@$arr[$i, $j] = @$arr[$j, $i]; |
1639
|
|
|
|
|
|
|
} |
1640
|
|
|
|
|
|
|
} |
1641
|
|
|
|
|
|
|
|
1642
|
|
|
|
|
|
|
sub variance_normalization { |
1643
|
|
|
|
|
|
|
my %data_hash = %{shift @_}; |
1644
|
|
|
|
|
|
|
my @all_data_points = values %data_hash; |
1645
|
|
|
|
|
|
|
my $dimensions = @{$all_data_points[0]}; |
1646
|
|
|
|
|
|
|
my @data_projections; |
1647
|
|
|
|
|
|
|
foreach my $data_point (@all_data_points) { |
1648
|
|
|
|
|
|
|
my $i = 0; |
1649
|
|
|
|
|
|
|
foreach my $proj (@$data_point) { |
1650
|
|
|
|
|
|
|
push @{$data_projections[$i++]}, $proj; |
1651
|
|
|
|
|
|
|
} |
1652
|
|
|
|
|
|
|
} |
1653
|
|
|
|
|
|
|
my @variance_vec; |
1654
|
|
|
|
|
|
|
foreach my $vec (@data_projections) { |
1655
|
|
|
|
|
|
|
my ($mean, $variance) = mean_and_variance( $vec ); |
1656
|
|
|
|
|
|
|
push @variance_vec, $variance; |
1657
|
|
|
|
|
|
|
} |
1658
|
|
|
|
|
|
|
my %new_data_hash; |
1659
|
|
|
|
|
|
|
while (my ($label, $data) = each(%data_hash) ) { |
1660
|
|
|
|
|
|
|
my @new_data; |
1661
|
|
|
|
|
|
|
foreach my $i (0..@{$data}-1) { |
1662
|
|
|
|
|
|
|
my $new = $data->[$i] / sqrt($variance_vec[$i]); |
1663
|
|
|
|
|
|
|
push @new_data, $data->[$i] / sqrt($variance_vec[$i]); |
1664
|
|
|
|
|
|
|
} |
1665
|
|
|
|
|
|
|
$new_data_hash{$label} = \@new_data; |
1666
|
|
|
|
|
|
|
} |
1667
|
|
|
|
|
|
|
return \%new_data_hash; |
1668
|
|
|
|
|
|
|
} |
1669
|
|
|
|
|
|
|
|
1670
|
|
|
|
|
|
|
sub mean_and_variance { |
1671
|
|
|
|
|
|
|
my @data = @{shift @_}; |
1672
|
|
|
|
|
|
|
my ($mean, $variance); |
1673
|
|
|
|
|
|
|
foreach my $i (1..@data) { |
1674
|
|
|
|
|
|
|
if ($i == 1) { |
1675
|
|
|
|
|
|
|
$mean = $data[0]; |
1676
|
|
|
|
|
|
|
$variance = 0; |
1677
|
|
|
|
|
|
|
} else { |
1678
|
|
|
|
|
|
|
$mean = ( (($i-1)/$i) * $mean ) + $data[$i-1] / $i; |
1679
|
|
|
|
|
|
|
$variance = ( (($i-1)/$i) * $variance ) + ($data[$i-1]-$mean)**2 / ($i-1); |
1680
|
|
|
|
|
|
|
} |
1681
|
|
|
|
|
|
|
} |
1682
|
|
|
|
|
|
|
return ($mean, $variance); |
1683
|
|
|
|
|
|
|
} |
1684
|
|
|
|
|
|
|
|
1685
|
|
|
|
|
|
|
sub check_for_illegal_params { |
1686
|
|
|
|
|
|
|
my @params = @_; |
1687
|
|
|
|
|
|
|
my @legal_params = qw / datafile |
1688
|
|
|
|
|
|
|
mask |
1689
|
|
|
|
|
|
|
K |
1690
|
|
|
|
|
|
|
Kmin |
1691
|
|
|
|
|
|
|
Kmax |
1692
|
|
|
|
|
|
|
terminal_output |
1693
|
|
|
|
|
|
|
write_clusters_to_files |
1694
|
|
|
|
|
|
|
do_variance_normalization |
1695
|
|
|
|
|
|
|
cluster_seeding |
1696
|
|
|
|
|
|
|
use_mahalanobis_metric |
1697
|
|
|
|
|
|
|
debug |
1698
|
|
|
|
|
|
|
/; |
1699
|
|
|
|
|
|
|
my $found_match_flag; |
1700
|
|
|
|
|
|
|
foreach my $param (@params) { |
1701
|
|
|
|
|
|
|
foreach my $legal (@legal_params) { |
1702
|
|
|
|
|
|
|
$found_match_flag = 0; |
1703
|
|
|
|
|
|
|
if ($param eq $legal) { |
1704
|
|
|
|
|
|
|
$found_match_flag = 1; |
1705
|
|
|
|
|
|
|
last; |
1706
|
|
|
|
|
|
|
} |
1707
|
|
|
|
|
|
|
} |
1708
|
|
|
|
|
|
|
last if $found_match_flag == 0; |
1709
|
|
|
|
|
|
|
} |
1710
|
|
|
|
|
|
|
return $found_match_flag; |
1711
|
|
|
|
|
|
|
} |
1712
|
|
|
|
|
|
|
|
1713
|
|
|
|
|
|
|
sub get_value_index_hash { |
1714
|
|
|
|
|
|
|
my $arr = shift; |
1715
|
|
|
|
|
|
|
my %hash; |
1716
|
|
|
|
|
|
|
foreach my $index (0..@$arr-1) { |
1717
|
|
|
|
|
|
|
$hash{$arr->[$index]} = $index if $arr->[$index] > 0; |
1718
|
|
|
|
|
|
|
} |
1719
|
|
|
|
|
|
|
return \%hash; |
1720
|
|
|
|
|
|
|
} |
1721
|
|
|
|
|
|
|
|
1722
|
|
|
|
|
|
|
sub non_maximum_suppression { |
1723
|
|
|
|
|
|
|
my $arr = shift; |
1724
|
|
|
|
|
|
|
my @output = (0) x @$arr; |
1725
|
|
|
|
|
|
|
my @final_output = (0) x @$arr; |
1726
|
|
|
|
|
|
|
my %hash; |
1727
|
|
|
|
|
|
|
my @array_of_runs = ([$arr->[0]]); |
1728
|
|
|
|
|
|
|
foreach my $index (1..@$arr-1) { |
1729
|
|
|
|
|
|
|
if ($arr->[$index] == $arr->[$index-1]) { |
1730
|
|
|
|
|
|
|
push @{$array_of_runs[-1]}, $arr->[$index]; |
1731
|
|
|
|
|
|
|
} else { |
1732
|
|
|
|
|
|
|
push @array_of_runs, [$arr->[$index]]; |
1733
|
|
|
|
|
|
|
} |
1734
|
|
|
|
|
|
|
} |
1735
|
|
|
|
|
|
|
my $runstart_index = 0; |
1736
|
|
|
|
|
|
|
foreach my $run_index (1..@array_of_runs-2) { |
1737
|
|
|
|
|
|
|
$runstart_index += @{$array_of_runs[$run_index-1]}; |
1738
|
|
|
|
|
|
|
if ($array_of_runs[$run_index]->[0] > |
1739
|
|
|
|
|
|
|
$array_of_runs[$run_index-1]->[0] && |
1740
|
|
|
|
|
|
|
$array_of_runs[$run_index]->[0] > |
1741
|
|
|
|
|
|
|
$array_of_runs[$run_index+1]->[0]) { |
1742
|
|
|
|
|
|
|
my $run_center = @{$array_of_runs[$run_index]} / 2; |
1743
|
|
|
|
|
|
|
my $assignment_index = $runstart_index + $run_center; |
1744
|
|
|
|
|
|
|
$output[$assignment_index] = $arr->[$assignment_index]; |
1745
|
|
|
|
|
|
|
} |
1746
|
|
|
|
|
|
|
} |
1747
|
|
|
|
|
|
|
if ($array_of_runs[-1]->[0] > $array_of_runs[-2]->[0]) { |
1748
|
|
|
|
|
|
|
$runstart_index += @{$array_of_runs[-2]}; |
1749
|
|
|
|
|
|
|
my $run_center = @{$array_of_runs[-1]} / 2; |
1750
|
|
|
|
|
|
|
my $assignment_index = $runstart_index + $run_center; |
1751
|
|
|
|
|
|
|
$output[$assignment_index] = $arr->[$assignment_index]; |
1752
|
|
|
|
|
|
|
} |
1753
|
|
|
|
|
|
|
if ($array_of_runs[0]->[0] > $array_of_runs[1]->[0]) { |
1754
|
|
|
|
|
|
|
my $run_center = @{$array_of_runs[0]} / 2; |
1755
|
|
|
|
|
|
|
$output[$run_center] = $arr->[$run_center]; |
1756
|
|
|
|
|
|
|
} |
1757
|
|
|
|
|
|
|
return \@output; |
1758
|
|
|
|
|
|
|
} |
1759
|
|
|
|
|
|
|
|
1760
|
|
|
|
|
|
|
sub display_matrix { |
1761
|
|
|
|
|
|
|
my $matrix = shift; |
1762
|
|
|
|
|
|
|
my $nrows = $matrix->rows(); |
1763
|
|
|
|
|
|
|
my $ncols = $matrix->cols(); |
1764
|
|
|
|
|
|
|
print "\n\nDisplaying matrix of size $nrows rows and $ncols columns:\n"; |
1765
|
|
|
|
|
|
|
foreach my $i (0..$nrows-1) { |
1766
|
|
|
|
|
|
|
my $row = $matrix->row($i); |
1767
|
|
|
|
|
|
|
my @row_as_list = $row->as_list; |
1768
|
|
|
|
|
|
|
print "@row_as_list\n"; |
1769
|
|
|
|
|
|
|
} |
1770
|
|
|
|
|
|
|
print "\n\n"; |
1771
|
|
|
|
|
|
|
} |
1772
|
|
|
|
|
|
|
|
1773
|
|
|
|
|
|
|
sub transpose { |
1774
|
|
|
|
|
|
|
my $matrix = shift; |
1775
|
|
|
|
|
|
|
my $num_rows = $matrix->rows(); |
1776
|
|
|
|
|
|
|
my $num_cols = $matrix->cols(); |
1777
|
|
|
|
|
|
|
my $transpose = Math::GSL::Matrix->new($num_cols, $num_rows); |
1778
|
|
|
|
|
|
|
foreach my $i (0..$num_rows-1) { |
1779
|
|
|
|
|
|
|
my @row = $matrix->row($i)->as_list; |
1780
|
|
|
|
|
|
|
$transpose->set_col($i, \@row ); |
1781
|
|
|
|
|
|
|
} |
1782
|
|
|
|
|
|
|
return $transpose; |
1783
|
|
|
|
|
|
|
} |
1784
|
|
|
|
|
|
|
|
1785
|
|
|
|
|
|
|
sub vector_subtract { |
1786
|
|
|
|
|
|
|
my $vec1 = shift; |
1787
|
|
|
|
|
|
|
my $vec2 = shift; |
1788
|
|
|
|
|
|
|
die "wrong data types for vector subtract calculation\n" if @$vec1 != @$vec2; |
1789
|
|
|
|
|
|
|
my @result; |
1790
|
|
|
|
|
|
|
foreach my $i (0..@$vec1-1){ |
1791
|
|
|
|
|
|
|
push @result, $vec1->[$i] - $vec2->[$i]; |
1792
|
|
|
|
|
|
|
} |
1793
|
|
|
|
|
|
|
return @result; |
1794
|
|
|
|
|
|
|
} |
1795
|
|
|
|
|
|
|
|
1796
|
|
|
|
|
|
|
sub vector_multiply { |
1797
|
|
|
|
|
|
|
my $vec1 = shift; |
1798
|
|
|
|
|
|
|
my $vec2 = shift; |
1799
|
|
|
|
|
|
|
die "vec_multiply called with two vectors of different sizes" |
1800
|
|
|
|
|
|
|
unless @$vec1 == @$vec2; |
1801
|
|
|
|
|
|
|
my $result = 0; |
1802
|
|
|
|
|
|
|
foreach my $i (0..@$vec1-1) { |
1803
|
|
|
|
|
|
|
$result += $vec1->[$i] * $vec2->[$i]; |
1804
|
|
|
|
|
|
|
} |
1805
|
|
|
|
|
|
|
return $result; |
1806
|
|
|
|
|
|
|
} |
1807
|
|
|
|
|
|
|
|
1808
|
|
|
|
|
|
|
sub matrix_multiply { |
1809
|
|
|
|
|
|
|
my $matrix1 = shift; |
1810
|
|
|
|
|
|
|
my $matrix2 = shift; |
1811
|
|
|
|
|
|
|
my ($nrows1, $ncols1) = ($matrix1->rows(), $matrix1->cols()); |
1812
|
|
|
|
|
|
|
my ($nrows2, $ncols2) = ($matrix2->rows(), $matrix2->cols()); |
1813
|
|
|
|
|
|
|
die "matrix multiplication called with non-matching matrix arguments" |
1814
|
|
|
|
|
|
|
unless $nrows1 == $ncols2 && $ncols1 == $nrows2; |
1815
|
|
|
|
|
|
|
if ($nrows1 == 1) { |
1816
|
|
|
|
|
|
|
my @row = $matrix1->row(0)->as_list; |
1817
|
|
|
|
|
|
|
my @col = $matrix2->col(0)->as_list; |
1818
|
|
|
|
|
|
|
my $result; |
1819
|
|
|
|
|
|
|
foreach my $j (0..$ncols1-1) { |
1820
|
|
|
|
|
|
|
$result += $row[$j] * $col[$j]; |
1821
|
|
|
|
|
|
|
} |
1822
|
|
|
|
|
|
|
return $result; |
1823
|
|
|
|
|
|
|
} |
1824
|
|
|
|
|
|
|
my $product = Math::GSL::Matrix->new($nrows1, $nrows1); |
1825
|
|
|
|
|
|
|
foreach my $i (0..$nrows1-1) { |
1826
|
|
|
|
|
|
|
my $row = $matrix1->row($i); |
1827
|
|
|
|
|
|
|
my @product_row; |
1828
|
|
|
|
|
|
|
foreach my $j (0..$ncols2-1) { |
1829
|
|
|
|
|
|
|
my $col = $matrix2->col($j); |
1830
|
|
|
|
|
|
|
my $row_times_col = matrix_multiply($row, $col); |
1831
|
|
|
|
|
|
|
push @product_row, $row_times_col; |
1832
|
|
|
|
|
|
|
} |
1833
|
|
|
|
|
|
|
$product->set_row($i, \@product_row); |
1834
|
|
|
|
|
|
|
} |
1835
|
|
|
|
|
|
|
return $product; |
1836
|
|
|
|
|
|
|
} |
1837
|
|
|
|
|
|
|
|
1838
|
|
|
|
|
|
|
1; |
1839
|
|
|
|
|
|
|
|
1840
|
|
|
|
|
|
|
=pod |
1841
|
|
|
|
|
|
|
|
1842
|
|
|
|
|
|
|
=head1 NAME |
1843
|
|
|
|
|
|
|
|
1844
|
|
|
|
|
|
|
Algorithm::KMeans - for clustering multidimensional data |
1845
|
|
|
|
|
|
|
|
1846
|
|
|
|
|
|
|
=head1 SYNOPSIS |
1847
|
|
|
|
|
|
|
|
1848
|
|
|
|
|
|
|
# You now have four different choices for clustering your data with this module: |
1849
|
|
|
|
|
|
|
# |
1850
|
|
|
|
|
|
|
# 1) With Euclidean distances and with random cluster seeding |
1851
|
|
|
|
|
|
|
# |
1852
|
|
|
|
|
|
|
# 2) With Mahalanobis distances and with random cluster seeding |
1853
|
|
|
|
|
|
|
# |
1854
|
|
|
|
|
|
|
# 3) With Euclidean distances and with smart cluster seeding |
1855
|
|
|
|
|
|
|
# |
1856
|
|
|
|
|
|
|
# 4) With Mahalanobis distances and with smart cluster seeding |
1857
|
|
|
|
|
|
|
# |
1858
|
|
|
|
|
|
|
# Despite the qualifier 'smart' in 'smart cluster seeding', it may not always |
1859
|
|
|
|
|
|
|
# produce results that are superior to those obtained with random seeding. (If you |
1860
|
|
|
|
|
|
|
# also factor in the choice regarding variance normalization, you actually have |
1861
|
|
|
|
|
|
|
# eight different choices for data clustering with this module.) |
1862
|
|
|
|
|
|
|
# |
1863
|
|
|
|
|
|
|
# In all cases, you'd obviously begin with |
1864
|
|
|
|
|
|
|
|
1865
|
|
|
|
|
|
|
use Algorithm::KMeans; |
1866
|
|
|
|
|
|
|
|
1867
|
|
|
|
|
|
|
# You'd then name the data file: |
1868
|
|
|
|
|
|
|
|
1869
|
|
|
|
|
|
|
my $datafile = "mydatafile.csv"; |
1870
|
|
|
|
|
|
|
|
1871
|
|
|
|
|
|
|
# Next, set the mask to indicate which columns of the datafile to use for |
1872
|
|
|
|
|
|
|
# clustering and which column contains a symbolic ID for each data record. For |
1873
|
|
|
|
|
|
|
# example, if the symbolic name is in the first column, you want the second column |
1874
|
|
|
|
|
|
|
# to be ignored, and you want the next three columns to be used for 3D clustering, |
1875
|
|
|
|
|
|
|
# you'd set the mask to: |
1876
|
|
|
|
|
|
|
|
1877
|
|
|
|
|
|
|
my $mask = "N0111"; |
1878
|
|
|
|
|
|
|
|
1879
|
|
|
|
|
|
|
# Now construct an instance of the clusterer. The parameter K controls the number |
1880
|
|
|
|
|
|
|
# of clusters. If you know how many clusters you want (let's say 3), call |
1881
|
|
|
|
|
|
|
|
1882
|
|
|
|
|
|
|
my $clusterer = Algorithm::KMeans->new( datafile => $datafile, |
1883
|
|
|
|
|
|
|
mask => $mask, |
1884
|
|
|
|
|
|
|
K => 3, |
1885
|
|
|
|
|
|
|
cluster_seeding => 'random', |
1886
|
|
|
|
|
|
|
terminal_output => 1, |
1887
|
|
|
|
|
|
|
write_clusters_to_files => 1, |
1888
|
|
|
|
|
|
|
); |
1889
|
|
|
|
|
|
|
|
1890
|
|
|
|
|
|
|
# By default, this constructor call will set you up for clustering based on |
1891
|
|
|
|
|
|
|
# Euclidean distances. If you want the module to use Mahalanobis distances, your |
1892
|
|
|
|
|
|
|
# constructor call will look like: |
1893
|
|
|
|
|
|
|
|
1894
|
|
|
|
|
|
|
my $clusterer = Algorithm::KMeans->new( datafile => $datafile, |
1895
|
|
|
|
|
|
|
mask => $mask, |
1896
|
|
|
|
|
|
|
K => 3, |
1897
|
|
|
|
|
|
|
cluster_seeding => 'random', |
1898
|
|
|
|
|
|
|
use_mahalanobis_metric => 1, |
1899
|
|
|
|
|
|
|
terminal_output => 1, |
1900
|
|
|
|
|
|
|
write_clusters_to_files => 1, |
1901
|
|
|
|
|
|
|
); |
1902
|
|
|
|
|
|
|
|
1903
|
|
|
|
|
|
|
# For both constructor calls shown above, you can use smart seeding of the clusters |
1904
|
|
|
|
|
|
|
# by changing 'random' to 'smart' for the cluster_seeding option. See the |
1905
|
|
|
|
|
|
|
# explanation of smart seeding in the Methods section of this documentation. |
1906
|
|
|
|
|
|
|
|
1907
|
|
|
|
|
|
|
# If your data is such that its variability along the different dimensions of the |
1908
|
|
|
|
|
|
|
# data space is significantly different, you may get better clustering if you first |
1909
|
|
|
|
|
|
|
# normalize your data by setting the constructor parameter |
1910
|
|
|
|
|
|
|
# do_variance_normalization as shown below: |
1911
|
|
|
|
|
|
|
|
1912
|
|
|
|
|
|
|
my $clusterer = Algorithm::KMeans->new( datafile => $datafile, |
1913
|
|
|
|
|
|
|
mask => $mask, |
1914
|
|
|
|
|
|
|
K => 3, |
1915
|
|
|
|
|
|
|
cluster_seeding => 'smart', # or 'random' |
1916
|
|
|
|
|
|
|
terminal_output => 1, |
1917
|
|
|
|
|
|
|
do_variance_normalization => 1, |
1918
|
|
|
|
|
|
|
write_clusters_to_files => 1, |
1919
|
|
|
|
|
|
|
); |
1920
|
|
|
|
|
|
|
|
1921
|
|
|
|
|
|
|
# But bear in mind that such data normalization may actually decrease the |
1922
|
|
|
|
|
|
|
# performance of the clusterer if the variability in the data is more a result of |
1923
|
|
|
|
|
|
|
# the separation between the means than a consequence of intra-cluster variance. |
1924
|
|
|
|
|
|
|
|
1925
|
|
|
|
|
|
|
# Set K to 0 if you want the module to figure out the optimum number of clusters |
1926
|
|
|
|
|
|
|
# from the data. (It is best to run this option with the terminal_output set to 1 |
1927
|
|
|
|
|
|
|
# so that you can see the different value of QoC for the different K): |
1928
|
|
|
|
|
|
|
|
1929
|
|
|
|
|
|
|
my $clusterer = Algorithm::KMeans->new( datafile => $datafile, |
1930
|
|
|
|
|
|
|
mask => $mask, |
1931
|
|
|
|
|
|
|
K => 0, |
1932
|
|
|
|
|
|
|
cluster_seeding => 'random', # or 'smart' |
1933
|
|
|
|
|
|
|
terminal_output => 1, |
1934
|
|
|
|
|
|
|
write_clusters_to_files => 1, |
1935
|
|
|
|
|
|
|
); |
1936
|
|
|
|
|
|
|
|
1937
|
|
|
|
|
|
|
# Although not shown above, you can obviously set the 'do_variance_normalization' |
1938
|
|
|
|
|
|
|
# flag here also if you wish. |
1939
|
|
|
|
|
|
|
|
1940
|
|
|
|
|
|
|
# For very large data files, setting K to 0 will result in searching through too |
1941
|
|
|
|
|
|
|
# many values for K. For such cases, you can range limit the values of K to search |
1942
|
|
|
|
|
|
|
# through by |
1943
|
|
|
|
|
|
|
|
1944
|
|
|
|
|
|
|
my $clusterer = Algorithm::KMeans->new( datafile => $datafile, |
1945
|
|
|
|
|
|
|
mask => "N111", |
1946
|
|
|
|
|
|
|
Kmin => 3, |
1947
|
|
|
|
|
|
|
Kmax => 10, |
1948
|
|
|
|
|
|
|
cluster_seeding => 'random', # or 'smart' |
1949
|
|
|
|
|
|
|
terminal_output => 1, |
1950
|
|
|
|
|
|
|
write_clusters_to_files => 1, |
1951
|
|
|
|
|
|
|
); |
1952
|
|
|
|
|
|
|
|
1953
|
|
|
|
|
|
|
# FOR ALL CASES ABOVE, YOU'D NEED TO MAKE THE FOLLOWING CALLS ON THE CLUSTERER |
1954
|
|
|
|
|
|
|
# INSTANCE TO ACTUALLY CLUSTER THE DATA: |
1955
|
|
|
|
|
|
|
|
1956
|
|
|
|
|
|
|
$clusterer->read_data_from_file(); |
1957
|
|
|
|
|
|
|
$clusterer->kmeans(); |
1958
|
|
|
|
|
|
|
|
1959
|
|
|
|
|
|
|
# If you want to directly access the clusters and the cluster centers in your own |
1960
|
|
|
|
|
|
|
# top-level script, replace the above two statements with: |
1961
|
|
|
|
|
|
|
|
1962
|
|
|
|
|
|
|
$clusterer->read_data_from_file(); |
1963
|
|
|
|
|
|
|
my ($clusters_hash, $cluster_centers_hash) = $clusterer->kmeans(); |
1964
|
|
|
|
|
|
|
|
1965
|
|
|
|
|
|
|
# You can subsequently access the clusters directly in your own code, as in: |
1966
|
|
|
|
|
|
|
|
1967
|
|
|
|
|
|
|
foreach my $cluster_id (sort keys %{$clusters_hash}) { |
1968
|
|
|
|
|
|
|
print "\n$cluster_id => @{$clusters_hash->{$cluster_id}}\n"; |
1969
|
|
|
|
|
|
|
} |
1970
|
|
|
|
|
|
|
foreach my $cluster_id (sort keys %{$cluster_centers_hash}) { |
1971
|
|
|
|
|
|
|
print "\n$cluster_id => @{$cluster_centers_hash->{$cluster_id}}\n"; |
1972
|
|
|
|
|
|
|
} |
1973
|
|
|
|
|
|
|
|
1974
|
|
|
|
|
|
|
|
1975
|
|
|
|
|
|
|
# CLUSTER VISUALIZATION: |
1976
|
|
|
|
|
|
|
|
1977
|
|
|
|
|
|
|
# You must first set the mask for cluster visualization. This mask tells the module |
1978
|
|
|
|
|
|
|
# which 2D or 3D subspace of the original data space you wish to visualize the |
1979
|
|
|
|
|
|
|
# clusters in: |
1980
|
|
|
|
|
|
|
|
1981
|
|
|
|
|
|
|
my $visualization_mask = "111"; |
1982
|
|
|
|
|
|
|
$clusterer->visualize_clusters($visualization_mask); |
1983
|
|
|
|
|
|
|
|
1984
|
|
|
|
|
|
|
|
1985
|
|
|
|
|
|
|
# SYNTHETIC DATA GENERATION: |
1986
|
|
|
|
|
|
|
|
1987
|
|
|
|
|
|
|
# The module has been provided with a class method for generating multivariate data |
1988
|
|
|
|
|
|
|
# for experimenting with clustering. The data generation is controlled by the |
1989
|
|
|
|
|
|
|
# contents of the parameter file that is supplied as an argument to the data |
1990
|
|
|
|
|
|
|
# generator method. The mean and covariance matrix entries in the parameter file |
1991
|
|
|
|
|
|
|
# must be according to the syntax shown in the param.txt file in the examples |
1992
|
|
|
|
|
|
|
# directory. It is best to edit this file as needed: |
1993
|
|
|
|
|
|
|
|
1994
|
|
|
|
|
|
|
my $parameter_file = "param.txt"; |
1995
|
|
|
|
|
|
|
my $out_datafile = "mydatafile.dat"; |
1996
|
|
|
|
|
|
|
Algorithm::KMeans->cluster_data_generator( |
1997
|
|
|
|
|
|
|
input_parameter_file => $parameter_file, |
1998
|
|
|
|
|
|
|
output_datafile => $out_datafile, |
1999
|
|
|
|
|
|
|
number_data_points_per_cluster => $N ); |
2000
|
|
|
|
|
|
|
|
2001
|
|
|
|
|
|
|
=head1 CHANGES |
2002
|
|
|
|
|
|
|
|
2003
|
|
|
|
|
|
|
Version 2.05 removes the restriction on the version of Perl that is required. This |
2004
|
|
|
|
|
|
|
is based on Srezic's recommendation. He had no problem building and testing the |
2005
|
|
|
|
|
|
|
previous version with Perl 5.8.9. Version 2.05 also includes a small augmentation of |
2006
|
|
|
|
|
|
|
the code in the method C for guarding against user errors |
2007
|
|
|
|
|
|
|
in the specification of the mask that tells the module which columns of the data file |
2008
|
|
|
|
|
|
|
are to be used for clustering. |
2009
|
|
|
|
|
|
|
|
2010
|
|
|
|
|
|
|
Version 2.04 allows you to use CSV data files for clustering. |
2011
|
|
|
|
|
|
|
|
2012
|
|
|
|
|
|
|
Version 2.03 incorporates minor code cleanup. The main implementation of the module |
2013
|
|
|
|
|
|
|
remains unchanged. |
2014
|
|
|
|
|
|
|
|
2015
|
|
|
|
|
|
|
Version 2.02 downshifts the version of Perl that is required for this module. The |
2016
|
|
|
|
|
|
|
module should work with versions 5.10 and higher of Perl. The implementation code |
2017
|
|
|
|
|
|
|
for the module remains unchanged. |
2018
|
|
|
|
|
|
|
|
2019
|
|
|
|
|
|
|
Version 2.01 removes many errors in the documentation. The changes made to the module |
2020
|
|
|
|
|
|
|
in Version 2.0 were not reflected properly in the documentation page for that |
2021
|
|
|
|
|
|
|
version. The implementation code remains unchanged. |
2022
|
|
|
|
|
|
|
|
2023
|
|
|
|
|
|
|
Version 2.0 includes significant additional functionality: (1) You now have the |
2024
|
|
|
|
|
|
|
option to cluster using the Mahalanobis distance metric (the default is the Euclidean |
2025
|
|
|
|
|
|
|
metric); and (2) With the two C methods that have been added to the |
2026
|
|
|
|
|
|
|
module, you can now determine the best cluster for a new data sample after you have |
2027
|
|
|
|
|
|
|
created the clusters with the previously available data. Finding the best cluster |
2028
|
|
|
|
|
|
|
for a new data sample can be done using either the Euclidean metric or the |
2029
|
|
|
|
|
|
|
Mahalanobis metric. |
2030
|
|
|
|
|
|
|
|
2031
|
|
|
|
|
|
|
Version 1.40 includes a C option for seeding the clusters. This option, |
2032
|
|
|
|
|
|
|
supplied through the constructor parameter C, means that the |
2033
|
|
|
|
|
|
|
clusterer will (1) Subject the data to principal components analysis in order to |
2034
|
|
|
|
|
|
|
determine the maximum variance direction; (2) Project the data onto this direction; |
2035
|
|
|
|
|
|
|
(3) Find peaks in a smoothed histogram of the projected points; and (4) Use the |
2036
|
|
|
|
|
|
|
locations of the highest peaks as initial guesses for the cluster centers. If you |
2037
|
|
|
|
|
|
|
don't want to use this option, set C to C. That should work |
2038
|
|
|
|
|
|
|
as in the previous version of the module. |
2039
|
|
|
|
|
|
|
|
2040
|
|
|
|
|
|
|
Version 1.30 includes a bug fix for the case when the datafile contains empty lines, |
2041
|
|
|
|
|
|
|
that is, lines with no data records. Another bug fix in Version 1.30 deals with the |
2042
|
|
|
|
|
|
|
case when you want the module to figure out how many clusters to form (this is the |
2043
|
|
|
|
|
|
|
C option in the constructor call) and the number of data records is close to the |
2044
|
|
|
|
|
|
|
minimum. |
2045
|
|
|
|
|
|
|
|
2046
|
|
|
|
|
|
|
Version 1.21 includes fixes to handle the possibility that, when clustering the data |
2047
|
|
|
|
|
|
|
for a fixed number of clusters, a cluster may become empty during iterative |
2048
|
|
|
|
|
|
|
calculation of cluster assignments of the data elements and the updating of the |
2049
|
|
|
|
|
|
|
cluster centers. The code changes are in the C and |
2050
|
|
|
|
|
|
|
C subroutines. |
2051
|
|
|
|
|
|
|
|
2052
|
|
|
|
|
|
|
Version 1.20 includes an option to normalize the data with respect to its variability |
2053
|
|
|
|
|
|
|
along the different coordinates before clustering is carried out. |
2054
|
|
|
|
|
|
|
|
2055
|
|
|
|
|
|
|
Version 1.1.1 allows for range limiting the values of C to search through. C |
2056
|
|
|
|
|
|
|
stands for the number of clusters to form. This version also declares the module |
2057
|
|
|
|
|
|
|
dependencies in the C file. |
2058
|
|
|
|
|
|
|
|
2059
|
|
|
|
|
|
|
Version 1.1 is a an object-oriented version of the implementation presented in |
2060
|
|
|
|
|
|
|
version 1.0. The current version should lend itself more easily to code extension. |
2061
|
|
|
|
|
|
|
You could, for example, create your own class by subclassing from the class presented |
2062
|
|
|
|
|
|
|
here and, in your subclass, use your own criteria for the similarity distance between |
2063
|
|
|
|
|
|
|
the data points and for the QoC (Quality of Clustering) metric, and, possibly a |
2064
|
|
|
|
|
|
|
different rule to stop the iterations. Version 1.1 also allows you to directly |
2065
|
|
|
|
|
|
|
access the clusters formed and the cluster centers in your calling script. |
2066
|
|
|
|
|
|
|
|
2067
|
|
|
|
|
|
|
|
2068
|
|
|
|
|
|
|
=head1 SPECIAL USAGE NOTE |
2069
|
|
|
|
|
|
|
|
2070
|
|
|
|
|
|
|
If you were directly accessing in your own scripts the clusters produced by the older |
2071
|
|
|
|
|
|
|
versions of this module, you'd need to make changes to your code if you wish to use |
2072
|
|
|
|
|
|
|
Version 2.0 or higher. Instead of returning arrays of clusters and cluster centers, |
2073
|
|
|
|
|
|
|
Versions 2.0 and higher return hashes. This change was made necessary by the logic |
2074
|
|
|
|
|
|
|
required for implementing the two new C methods that were introduced |
2075
|
|
|
|
|
|
|
in Version 2.0. These methods return the best cluster for a new data sample from the |
2076
|
|
|
|
|
|
|
clusters you created using the existing data. One of the C methods is |
2077
|
|
|
|
|
|
|
based on the Euclidean metric for finding the cluster that is closest to the new data |
2078
|
|
|
|
|
|
|
sample, and the other on the Mahalanobis metric. Another point of incompatibility |
2079
|
|
|
|
|
|
|
with the previous versions is that you must now explicitly set the C |
2080
|
|
|
|
|
|
|
parameter in the call to the constructor to either C or C. This |
2081
|
|
|
|
|
|
|
parameter does not have a default associated with it starting with Version 2.0. |
2082
|
|
|
|
|
|
|
|
2083
|
|
|
|
|
|
|
|
2084
|
|
|
|
|
|
|
=head1 DESCRIPTION |
2085
|
|
|
|
|
|
|
|
2086
|
|
|
|
|
|
|
Clustering with K-Means takes place iteratively and involves two steps: 1) assignment |
2087
|
|
|
|
|
|
|
of data samples to clusters on the basis of how far the data samples are from the |
2088
|
|
|
|
|
|
|
cluster centers; and 2) Recalculation of the cluster centers (and cluster covariances |
2089
|
|
|
|
|
|
|
if you are using the Mahalanobis distance metric for clustering). |
2090
|
|
|
|
|
|
|
|
2091
|
|
|
|
|
|
|
Obviously, before the two-step approach can proceed, we need to initialize the the |
2092
|
|
|
|
|
|
|
cluster centers. How this initialization is carried out is important. The module |
2093
|
|
|
|
|
|
|
gives you two very different ways for carrying out this initialization. One option, |
2094
|
|
|
|
|
|
|
called the C option, consists of subjecting the data to principal components |
2095
|
|
|
|
|
|
|
analysis to discover the direction of maximum variance in the data space. The data |
2096
|
|
|
|
|
|
|
points are then projected on to this direction and a histogram constructed from the |
2097
|
|
|
|
|
|
|
projections. Centers of the smoothed histogram are used to seed the clustering |
2098
|
|
|
|
|
|
|
operation. The other option is to choose the cluster centers purely randomly. You |
2099
|
|
|
|
|
|
|
get the first option if you set C to C in the constructor, |
2100
|
|
|
|
|
|
|
and you get the second option if you set it to C. |
2101
|
|
|
|
|
|
|
|
2102
|
|
|
|
|
|
|
How to specify the number of clusters, C, is one of the most vexing issues in any |
2103
|
|
|
|
|
|
|
approach to clustering. In some case, we can set C on the basis of prior |
2104
|
|
|
|
|
|
|
knowledge. But, more often than not, no such prior knowledge is available. When the |
2105
|
|
|
|
|
|
|
programmer does not explicitly specify a value for C, the approach taken in the |
2106
|
|
|
|
|
|
|
current implementation is to try all possible values between 2 and some largest |
2107
|
|
|
|
|
|
|
possible value that makes statistical sense. We then choose that value for C |
2108
|
|
|
|
|
|
|
which yields the best value for the QoC (Quality of Clustering) metric. It is |
2109
|
|
|
|
|
|
|
generally believed that the largest value for C should not exceed C |
2110
|
|
|
|
|
|
|
where C is the number of data samples to be clustered. |
2111
|
|
|
|
|
|
|
|
2112
|
|
|
|
|
|
|
What to use for the QoC metric is obviously a critical issue unto itself. In the |
2113
|
|
|
|
|
|
|
current implementation, the value of QoC is the ratio of the average radius of the |
2114
|
|
|
|
|
|
|
clusters and the average distance between the cluster centers. |
2115
|
|
|
|
|
|
|
|
2116
|
|
|
|
|
|
|
Every iterative algorithm requires a stopping criterion. The criterion implemented |
2117
|
|
|
|
|
|
|
here is that we stop iterations when there is no re-assignment of the data points |
2118
|
|
|
|
|
|
|
during the assignment step. |
2119
|
|
|
|
|
|
|
|
2120
|
|
|
|
|
|
|
Ordinarily, the output produced by a K-Means clusterer will correspond to a local |
2121
|
|
|
|
|
|
|
minimum for the QoC values, as opposed to a global minimum. The current |
2122
|
|
|
|
|
|
|
implementation protects against that when the module constructor is called with the |
2123
|
|
|
|
|
|
|
C option for C by trying different randomly selected initial |
2124
|
|
|
|
|
|
|
cluster centers and then selecting the one that gives the best overall QoC value. |
2125
|
|
|
|
|
|
|
|
2126
|
|
|
|
|
|
|
A K-Means clusterer will generally produce good results if the overlap between the |
2127
|
|
|
|
|
|
|
clusters is minimal and if each cluster exhibits variability that is uniform in all |
2128
|
|
|
|
|
|
|
directions. When the data variability is different along the different directions in |
2129
|
|
|
|
|
|
|
the data space, the results you obtain with a K-Means clusterer may be improved by |
2130
|
|
|
|
|
|
|
first normalizing the data appropriately, as can be done in this module when you set |
2131
|
|
|
|
|
|
|
the C option in the constructor. However, as pointed out |
2132
|
|
|
|
|
|
|
elsewhere in this documentation, such normalization may actually decrease the |
2133
|
|
|
|
|
|
|
performance of the clusterer if the overall data variability along any dimension is |
2134
|
|
|
|
|
|
|
more a result of separation between the means than a consequence of intra-cluster |
2135
|
|
|
|
|
|
|
variability. |
2136
|
|
|
|
|
|
|
|
2137
|
|
|
|
|
|
|
|
2138
|
|
|
|
|
|
|
=head1 METHODS |
2139
|
|
|
|
|
|
|
|
2140
|
|
|
|
|
|
|
The module provides the following methods for clustering, for cluster visualization, |
2141
|
|
|
|
|
|
|
for data visualization, for the generation of data for testing a clustering |
2142
|
|
|
|
|
|
|
algorithm, and for determining the cluster membership of a new data sample: |
2143
|
|
|
|
|
|
|
|
2144
|
|
|
|
|
|
|
=over 4 |
2145
|
|
|
|
|
|
|
|
2146
|
|
|
|
|
|
|
=item B |
2147
|
|
|
|
|
|
|
|
2148
|
|
|
|
|
|
|
my $clusterer = Algorithm::KMeans->new(datafile => $datafile, |
2149
|
|
|
|
|
|
|
mask => $mask, |
2150
|
|
|
|
|
|
|
K => $K, |
2151
|
|
|
|
|
|
|
cluster_seeding => 'random', # also try 'smart' |
2152
|
|
|
|
|
|
|
use_mahalanobis_metric => 1, # also try '0' |
2153
|
|
|
|
|
|
|
terminal_output => 1, |
2154
|
|
|
|
|
|
|
write_clusters_to_files => 1, |
2155
|
|
|
|
|
|
|
); |
2156
|
|
|
|
|
|
|
|
2157
|
|
|
|
|
|
|
A call to C constructs a new instance of the C class. When |
2158
|
|
|
|
|
|
|
C<$K> is a non-zero positive integer, the module will construct exactly that many |
2159
|
|
|
|
|
|
|
clusters. However, when C<$K> is 0, the module will find the best number of clusters |
2160
|
|
|
|
|
|
|
to partition the data into. As explained in the Description, setting |
2161
|
|
|
|
|
|
|
C to C causes PCA (principal components analysis) to be used |
2162
|
|
|
|
|
|
|
for discovering the best choices for the initial cluster centers. If you want purely |
2163
|
|
|
|
|
|
|
random decisions to be made for the initial choices for the cluster centers, set |
2164
|
|
|
|
|
|
|
C to C. |
2165
|
|
|
|
|
|
|
|
2166
|
|
|
|
|
|
|
The data file is expected to contain entries in the following format |
2167
|
|
|
|
|
|
|
|
2168
|
|
|
|
|
|
|
c20 0 10.7087017086940 9.63528386251712 10.9512155258108 ... |
2169
|
|
|
|
|
|
|
c7 0 12.8025925026787 10.6126270065785 10.5228482095349 ... |
2170
|
|
|
|
|
|
|
b9 0 7.60118206283120 5.05889245193079 5.82841781759102 ... |
2171
|
|
|
|
|
|
|
.... |
2172
|
|
|
|
|
|
|
.... |
2173
|
|
|
|
|
|
|
|
2174
|
|
|
|
|
|
|
where the first column contains the symbolic ID tag for each data record and the rest |
2175
|
|
|
|
|
|
|
of the columns the numerical information. As to which columns are actually used for |
2176
|
|
|
|
|
|
|
clustering is decided by the string value of the mask. For example, if we wanted to |
2177
|
|
|
|
|
|
|
cluster on the basis of the entries in just the 3rd, the 4th, and the 5th columns |
2178
|
|
|
|
|
|
|
above, the mask value would be C where the character C indicates that the |
2179
|
|
|
|
|
|
|
ID tag is in the first column, the character C<0> that the second column is to be |
2180
|
|
|
|
|
|
|
ignored, and the C<1>'s that follow that the 3rd, the 4th, and the 5th columns are to |
2181
|
|
|
|
|
|
|
be used for clustering. |
2182
|
|
|
|
|
|
|
|
2183
|
|
|
|
|
|
|
If you wish for the clusterer to search through a C<(Kmin,Kmax)> range of values for |
2184
|
|
|
|
|
|
|
C, the constructor should be called in the following fashion: |
2185
|
|
|
|
|
|
|
|
2186
|
|
|
|
|
|
|
my $clusterer = Algorithm::KMeans->new(datafile => $datafile, |
2187
|
|
|
|
|
|
|
mask => $mask, |
2188
|
|
|
|
|
|
|
Kmin => 3, |
2189
|
|
|
|
|
|
|
Kmax => 10, |
2190
|
|
|
|
|
|
|
cluster_seeding => 'smart', # try 'random' also |
2191
|
|
|
|
|
|
|
terminal_output => 1, |
2192
|
|
|
|
|
|
|
); |
2193
|
|
|
|
|
|
|
|
2194
|
|
|
|
|
|
|
where obviously you can choose any reasonable values for C and C. If you |
2195
|
|
|
|
|
|
|
choose a value for C that is statistically too large, the module will let you |
2196
|
|
|
|
|
|
|
know. Again, you may choose C for C, the default value being |
2197
|
|
|
|
|
|
|
C. |
2198
|
|
|
|
|
|
|
|
2199
|
|
|
|
|
|
|
If you believe that the variability of the data is very different along the different |
2200
|
|
|
|
|
|
|
dimensions of the data space, you may get better clustering by first normalizing the |
2201
|
|
|
|
|
|
|
data coordinates by the standard-deviations along those directions. When you set the |
2202
|
|
|
|
|
|
|
constructor option C as shown below, the module uses the |
2203
|
|
|
|
|
|
|
overall data standard-deviation along a direction for the normalization in that |
2204
|
|
|
|
|
|
|
direction. (As mentioned elsewhere in the documentation, such a normalization could |
2205
|
|
|
|
|
|
|
backfire on you if the data variability along a dimension is more a result of the |
2206
|
|
|
|
|
|
|
separation between the means than a consequence of the intra-cluster variability.): |
2207
|
|
|
|
|
|
|
|
2208
|
|
|
|
|
|
|
my $clusterer = Algorithm::KMeans->new( datafile => $datafile, |
2209
|
|
|
|
|
|
|
mask => "N111", |
2210
|
|
|
|
|
|
|
K => 2, |
2211
|
|
|
|
|
|
|
cluster_seeding => 'smart', # try 'random' also |
2212
|
|
|
|
|
|
|
terminal_output => 1, |
2213
|
|
|
|
|
|
|
do_variance_normalization => 1, |
2214
|
|
|
|
|
|
|
); |
2215
|
|
|
|
|
|
|
|
2216
|
|
|
|
|
|
|
=back |
2217
|
|
|
|
|
|
|
|
2218
|
|
|
|
|
|
|
=head2 Constructor Parameters |
2219
|
|
|
|
|
|
|
|
2220
|
|
|
|
|
|
|
=over 8 |
2221
|
|
|
|
|
|
|
|
2222
|
|
|
|
|
|
|
=item C: |
2223
|
|
|
|
|
|
|
|
2224
|
|
|
|
|
|
|
This parameter names the data file that contains the multidimensional data records |
2225
|
|
|
|
|
|
|
you want the module to cluster. |
2226
|
|
|
|
|
|
|
|
2227
|
|
|
|
|
|
|
=item C: |
2228
|
|
|
|
|
|
|
|
2229
|
|
|
|
|
|
|
This parameter supplies the mask to be applied to the columns of your data file. See |
2230
|
|
|
|
|
|
|
the explanation in Synopsis for what this mask looks like. |
2231
|
|
|
|
|
|
|
|
2232
|
|
|
|
|
|
|
=item C: |
2233
|
|
|
|
|
|
|
|
2234
|
|
|
|
|
|
|
This parameter supplies the number of clusters you are looking for. If you set this |
2235
|
|
|
|
|
|
|
option to 0, that means that you want the module to search for the best value for |
2236
|
|
|
|
|
|
|
C. (Keep in mind the fact that searching for the best C may take a long time |
2237
|
|
|
|
|
|
|
for large data files.) |
2238
|
|
|
|
|
|
|
|
2239
|
|
|
|
|
|
|
=item C: |
2240
|
|
|
|
|
|
|
|
2241
|
|
|
|
|
|
|
If you supply an integer value for C, the search for the best C will begin |
2242
|
|
|
|
|
|
|
with that value. |
2243
|
|
|
|
|
|
|
|
2244
|
|
|
|
|
|
|
=item C: |
2245
|
|
|
|
|
|
|
|
2246
|
|
|
|
|
|
|
If you supply an integer value for C, the search for the best C will end at |
2247
|
|
|
|
|
|
|
that value. |
2248
|
|
|
|
|
|
|
|
2249
|
|
|
|
|
|
|
=item C: |
2250
|
|
|
|
|
|
|
|
2251
|
|
|
|
|
|
|
This parameter must be set to either C or C. Depending on your data, |
2252
|
|
|
|
|
|
|
you may get superior clustering with the C option. The choice C means |
2253
|
|
|
|
|
|
|
that the clusterer will (1) subject the data to principal components analysis to |
2254
|
|
|
|
|
|
|
determine the maximum variance direction; (2) project the data onto this direction; |
2255
|
|
|
|
|
|
|
(3) find peaks in a smoothed histogram of the projected points; and (4) use the |
2256
|
|
|
|
|
|
|
locations of the highest peaks as seeds for cluster centers. If the C option |
2257
|
|
|
|
|
|
|
produces bizarre results, try C. |
2258
|
|
|
|
|
|
|
|
2259
|
|
|
|
|
|
|
=item C: |
2260
|
|
|
|
|
|
|
|
2261
|
|
|
|
|
|
|
When set to 1, this option causes Mahalanobis distances to be used for clustering. |
2262
|
|
|
|
|
|
|
The default is 0 for this parameter. By default, the module uses the Euclidean |
2263
|
|
|
|
|
|
|
distances for clustering. In general, Mahalanobis distance based clustering will |
2264
|
|
|
|
|
|
|
fail if your data resides on a lower-dimensional hyperplane in the data space, if you |
2265
|
|
|
|
|
|
|
seek too many clusters, and if you do not have a sufficient number of samples in your |
2266
|
|
|
|
|
|
|
data file. A necessary requirement for the module to be able to compute Mahalanobis |
2267
|
|
|
|
|
|
|
distances is that the cluster covariance matrices be non-singular. (Let's say your |
2268
|
|
|
|
|
|
|
data dimensionality is C and the module is considering a cluster that has only |
2269
|
|
|
|
|
|
|
C samples in it where C is less than C. In this case, the covariance matrix |
2270
|
|
|
|
|
|
|
will be singular since its rank will not exceed C. For the covariance matrix to |
2271
|
|
|
|
|
|
|
be non-singular, it must be of full rank, that is, its rank must be C.) |
2272
|
|
|
|
|
|
|
|
2273
|
|
|
|
|
|
|
=item C: |
2274
|
|
|
|
|
|
|
|
2275
|
|
|
|
|
|
|
When set, the module will first normalize the data variance along the different |
2276
|
|
|
|
|
|
|
dimensions of the data space before attempting clustering. Depending on your data, |
2277
|
|
|
|
|
|
|
this option may or may not result in better clustering. |
2278
|
|
|
|
|
|
|
|
2279
|
|
|
|
|
|
|
=item C: |
2280
|
|
|
|
|
|
|
|
2281
|
|
|
|
|
|
|
This boolean parameter, when not supplied in the call to C, defaults to 0. |
2282
|
|
|
|
|
|
|
When set, you will see in your terminal window the different clusters as lists of the |
2283
|
|
|
|
|
|
|
symbolic IDs and their cluster centers. You will also see the QoC (Quality of |
2284
|
|
|
|
|
|
|
Clustering) values for the clusters displayed. |
2285
|
|
|
|
|
|
|
|
2286
|
|
|
|
|
|
|
=item C: |
2287
|
|
|
|
|
|
|
|
2288
|
|
|
|
|
|
|
This parameter is also boolean. When set to 1, the clusters are written out to files |
2289
|
|
|
|
|
|
|
that are named in the following manner: |
2290
|
|
|
|
|
|
|
|
2291
|
|
|
|
|
|
|
cluster0.txt |
2292
|
|
|
|
|
|
|
cluster1.txt |
2293
|
|
|
|
|
|
|
cluster2.txt |
2294
|
|
|
|
|
|
|
... |
2295
|
|
|
|
|
|
|
... |
2296
|
|
|
|
|
|
|
|
2297
|
|
|
|
|
|
|
Before the clusters are written to these files, the module destroys all files with |
2298
|
|
|
|
|
|
|
such names in the directory in which you call the module. |
2299
|
|
|
|
|
|
|
|
2300
|
|
|
|
|
|
|
|
2301
|
|
|
|
|
|
|
=back |
2302
|
|
|
|
|
|
|
|
2303
|
|
|
|
|
|
|
=over |
2304
|
|
|
|
|
|
|
|
2305
|
|
|
|
|
|
|
=item B |
2306
|
|
|
|
|
|
|
|
2307
|
|
|
|
|
|
|
$clusterer->read_data_from_file() |
2308
|
|
|
|
|
|
|
|
2309
|
|
|
|
|
|
|
=item B |
2310
|
|
|
|
|
|
|
|
2311
|
|
|
|
|
|
|
$clusterer->kmeans(); |
2312
|
|
|
|
|
|
|
|
2313
|
|
|
|
|
|
|
or |
2314
|
|
|
|
|
|
|
|
2315
|
|
|
|
|
|
|
my ($clusters_hash, $cluster_centers_hash) = $clusterer->kmeans(); |
2316
|
|
|
|
|
|
|
|
2317
|
|
|
|
|
|
|
The first call above works solely by side-effect. The second call also returns the |
2318
|
|
|
|
|
|
|
clusters and the cluster centers. See the C script in the |
2319
|
|
|
|
|
|
|
C directory for how you can in your own code extract the clusters and the |
2320
|
|
|
|
|
|
|
cluster centers from the variables C<$clusters_hash> and C<$cluster_centers_hash>. |
2321
|
|
|
|
|
|
|
|
2322
|
|
|
|
|
|
|
=item B |
2323
|
|
|
|
|
|
|
|
2324
|
|
|
|
|
|
|
$clusterer->get_K_best(); |
2325
|
|
|
|
|
|
|
|
2326
|
|
|
|
|
|
|
This call makes sense only if you supply either the C option to the constructor, |
2327
|
|
|
|
|
|
|
or if you specify values for the C and C options. The C and the |
2328
|
|
|
|
|
|
|
C<(Kmin,Kmax)> options cause the module to determine the best value for C. |
2329
|
|
|
|
|
|
|
Remember, C is the number of clusters the data is partitioned into. |
2330
|
|
|
|
|
|
|
|
2331
|
|
|
|
|
|
|
=item B |
2332
|
|
|
|
|
|
|
|
2333
|
|
|
|
|
|
|
$clusterer->show_QoC_values(); |
2334
|
|
|
|
|
|
|
|
2335
|
|
|
|
|
|
|
presents a table with C values in the left column and the corresponding QoC |
2336
|
|
|
|
|
|
|
(Quality-of-Clustering) values in the right column. Note that this call makes sense |
2337
|
|
|
|
|
|
|
only if you either supply the C option to the constructor, or if you specify |
2338
|
|
|
|
|
|
|
values for the C and C options. |
2339
|
|
|
|
|
|
|
|
2340
|
|
|
|
|
|
|
=item B |
2341
|
|
|
|
|
|
|
|
2342
|
|
|
|
|
|
|
$clusterer->visualize_clusters( $visualization_mask ) |
2343
|
|
|
|
|
|
|
|
2344
|
|
|
|
|
|
|
The visualization mask here does not have to be identical to the one used for |
2345
|
|
|
|
|
|
|
clustering, but must be a subset of that mask. This is convenient for visualizing |
2346
|
|
|
|
|
|
|
the clusters in two- or three-dimensional subspaces of the original space. |
2347
|
|
|
|
|
|
|
|
2348
|
|
|
|
|
|
|
=item B |
2349
|
|
|
|
|
|
|
|
2350
|
|
|
|
|
|
|
$clusterer->visualize_data($visualization_mask, 'original'); |
2351
|
|
|
|
|
|
|
|
2352
|
|
|
|
|
|
|
$clusterer->visualize_data($visualization_mask, 'normed'); |
2353
|
|
|
|
|
|
|
|
2354
|
|
|
|
|
|
|
This method requires a second argument and, as shown, it must be either the string |
2355
|
|
|
|
|
|
|
C or the string C, the former for the visualization of the raw data |
2356
|
|
|
|
|
|
|
and the latter for the visualization of the data after its different dimensions are |
2357
|
|
|
|
|
|
|
normalized by the standard-deviations along those directions. If you call the method |
2358
|
|
|
|
|
|
|
with the second argument set to C, but do so without turning on the |
2359
|
|
|
|
|
|
|
C option in the KMeans constructor, it will let you know. |
2360
|
|
|
|
|
|
|
|
2361
|
|
|
|
|
|
|
|
2362
|
|
|
|
|
|
|
=item B |
2363
|
|
|
|
|
|
|
|
2364
|
|
|
|
|
|
|
If you wish to determine the cluster membership of a new data sample after you have |
2365
|
|
|
|
|
|
|
created the clusters with the existing data samples, you would need to call this |
2366
|
|
|
|
|
|
|
method. The C script in the C directory |
2367
|
|
|
|
|
|
|
shows how to use this method. |
2368
|
|
|
|
|
|
|
|
2369
|
|
|
|
|
|
|
|
2370
|
|
|
|
|
|
|
=item B |
2371
|
|
|
|
|
|
|
|
2372
|
|
|
|
|
|
|
This does the same thing as the previous method, except that it determines the |
2373
|
|
|
|
|
|
|
cluster membership using the Mahalanobis distance metric. As for the previous |
2374
|
|
|
|
|
|
|
method, see the C script in the C directory |
2375
|
|
|
|
|
|
|
for how to use this method. |
2376
|
|
|
|
|
|
|
|
2377
|
|
|
|
|
|
|
|
2378
|
|
|
|
|
|
|
=item B |
2379
|
|
|
|
|
|
|
|
2380
|
|
|
|
|
|
|
Algorithm::KMeans->cluster_data_generator( |
2381
|
|
|
|
|
|
|
input_parameter_file => $parameter_file, |
2382
|
|
|
|
|
|
|
output_datafile => $out_datafile, |
2383
|
|
|
|
|
|
|
number_data_points_per_cluster => 20 ); |
2384
|
|
|
|
|
|
|
|
2385
|
|
|
|
|
|
|
for generating multivariate data for clustering if you wish to play with synthetic |
2386
|
|
|
|
|
|
|
data for clustering. The input parameter file contains the means and the variances |
2387
|
|
|
|
|
|
|
for the different Gaussians you wish to use for the synthetic data. See the file |
2388
|
|
|
|
|
|
|
C provided in the examples directory. It will be easiest for you to just |
2389
|
|
|
|
|
|
|
edit this file for your data generation needs. In addition to the format of the |
2390
|
|
|
|
|
|
|
parameter file, the main constraint you need to observe in specifying the parameters |
2391
|
|
|
|
|
|
|
is that the dimensionality of the covariance matrix must correspond to the |
2392
|
|
|
|
|
|
|
dimensionality of the mean vectors. The multivariate random numbers are generated by |
2393
|
|
|
|
|
|
|
calling the C module. As you would expect, this module requires that |
2394
|
|
|
|
|
|
|
the covariance matrices you specify in your parameter file be symmetric and positive |
2395
|
|
|
|
|
|
|
definite. Should the covariances in your parameter file not obey this condition, the |
2396
|
|
|
|
|
|
|
C module will let you know. |
2397
|
|
|
|
|
|
|
|
2398
|
|
|
|
|
|
|
=back |
2399
|
|
|
|
|
|
|
|
2400
|
|
|
|
|
|
|
=head1 HOW THE CLUSTERS ARE OUTPUT |
2401
|
|
|
|
|
|
|
|
2402
|
|
|
|
|
|
|
When the option C is set in the call to the constructor, the |
2403
|
|
|
|
|
|
|
clusters are displayed on the terminal screen. |
2404
|
|
|
|
|
|
|
|
2405
|
|
|
|
|
|
|
When the option C is set in the call to the constructor, the |
2406
|
|
|
|
|
|
|
module dumps the clusters in files named |
2407
|
|
|
|
|
|
|
|
2408
|
|
|
|
|
|
|
cluster0.txt |
2409
|
|
|
|
|
|
|
cluster1.txt |
2410
|
|
|
|
|
|
|
cluster2.txt |
2411
|
|
|
|
|
|
|
... |
2412
|
|
|
|
|
|
|
... |
2413
|
|
|
|
|
|
|
|
2414
|
|
|
|
|
|
|
in the directory in which you execute the module. The number of such files will |
2415
|
|
|
|
|
|
|
equal the number of clusters formed. All such existing files in the directory are |
2416
|
|
|
|
|
|
|
destroyed before any fresh ones are created. Each cluster file contains the symbolic |
2417
|
|
|
|
|
|
|
ID tags of the data samples in that cluster. |
2418
|
|
|
|
|
|
|
|
2419
|
|
|
|
|
|
|
The module also leaves in your directory a printable `.png' file that is a point plot |
2420
|
|
|
|
|
|
|
of the different clusters. The name of this file is always C. |
2421
|
|
|
|
|
|
|
|
2422
|
|
|
|
|
|
|
Also, as mentioned previously, a call to C in your own code will return the |
2423
|
|
|
|
|
|
|
clusters and the cluster centers. |
2424
|
|
|
|
|
|
|
|
2425
|
|
|
|
|
|
|
=head1 REQUIRED |
2426
|
|
|
|
|
|
|
|
2427
|
|
|
|
|
|
|
This module requires the following three modules: |
2428
|
|
|
|
|
|
|
|
2429
|
|
|
|
|
|
|
Math::Random |
2430
|
|
|
|
|
|
|
Graphics::GnuplotIF |
2431
|
|
|
|
|
|
|
Math::GSL |
2432
|
|
|
|
|
|
|
|
2433
|
|
|
|
|
|
|
With regard to the third item above, what is actually required is the |
2434
|
|
|
|
|
|
|
C module. However, that module is a part of the C |
2435
|
|
|
|
|
|
|
distribution. The acronym GSL stands for the GNU Scientific Library. C is |
2436
|
|
|
|
|
|
|
a Perl interface to the GSL C-based library. |
2437
|
|
|
|
|
|
|
|
2438
|
|
|
|
|
|
|
|
2439
|
|
|
|
|
|
|
=head1 THE C DIRECTORY |
2440
|
|
|
|
|
|
|
|
2441
|
|
|
|
|
|
|
The C directory contains several scripts to help you become familiar with |
2442
|
|
|
|
|
|
|
this module. The following script is an example of how the module can be expected to |
2443
|
|
|
|
|
|
|
be used most of the time. It calls for clustering to be carried out with a fixed |
2444
|
|
|
|
|
|
|
C: |
2445
|
|
|
|
|
|
|
|
2446
|
|
|
|
|
|
|
cluster_and_visualize.pl |
2447
|
|
|
|
|
|
|
|
2448
|
|
|
|
|
|
|
The more time you spend with this script, the more comfortable you will become with |
2449
|
|
|
|
|
|
|
the use of this module. The script file contains a large comment block that mentions |
2450
|
|
|
|
|
|
|
six locations in the script where you have to make decisions about how to use the |
2451
|
|
|
|
|
|
|
module. |
2452
|
|
|
|
|
|
|
|
2453
|
|
|
|
|
|
|
See the following script if you do not know what value to use for C for clustering |
2454
|
|
|
|
|
|
|
your data: |
2455
|
|
|
|
|
|
|
|
2456
|
|
|
|
|
|
|
find_best_K_and_cluster.pl |
2457
|
|
|
|
|
|
|
|
2458
|
|
|
|
|
|
|
This script uses the C option in the constructor that causes the module to |
2459
|
|
|
|
|
|
|
search for the best C for your data. Since this search is virtually unbounded --- |
2460
|
|
|
|
|
|
|
limited only by the number of samples in your data file --- the script may take a |
2461
|
|
|
|
|
|
|
long time to run for a large data file. Hence the next script. |
2462
|
|
|
|
|
|
|
|
2463
|
|
|
|
|
|
|
If your datafile is too large, you may need to range limit the values of C that |
2464
|
|
|
|
|
|
|
are searched through, as in the following script: |
2465
|
|
|
|
|
|
|
|
2466
|
|
|
|
|
|
|
find_best_K_in_range_and_cluster.pl |
2467
|
|
|
|
|
|
|
|
2468
|
|
|
|
|
|
|
If you also want to include data normalization (it may reduce the performance of the |
2469
|
|
|
|
|
|
|
clusterer in some cases), see the following script: |
2470
|
|
|
|
|
|
|
|
2471
|
|
|
|
|
|
|
cluster_after_data_normalization.pl |
2472
|
|
|
|
|
|
|
|
2473
|
|
|
|
|
|
|
When you include the data normalization step and you would like to visualize the data |
2474
|
|
|
|
|
|
|
before and after normalization, see the following script: |
2475
|
|
|
|
|
|
|
|
2476
|
|
|
|
|
|
|
cluster_and_visualize_with_data_visualization.pl* |
2477
|
|
|
|
|
|
|
|
2478
|
|
|
|
|
|
|
After you are done clustering, let's say you want to find the cluster membership of a |
2479
|
|
|
|
|
|
|
new data sample. To see how you can do that, see the script: |
2480
|
|
|
|
|
|
|
|
2481
|
|
|
|
|
|
|
which_cluster_for_new_data.pl |
2482
|
|
|
|
|
|
|
|
2483
|
|
|
|
|
|
|
This script returns two answers for which cluster a new data sample belongs to: one |
2484
|
|
|
|
|
|
|
using the Euclidean metric to calculate the distances between the new data sample and |
2485
|
|
|
|
|
|
|
the cluster centers, and the other using the Mahalanobis metric. If the clusters are |
2486
|
|
|
|
|
|
|
strongly elliptical in shape, you are likely to get better results with the |
2487
|
|
|
|
|
|
|
Mahalanobis metric. (To see that you can get two different answers using the two |
2488
|
|
|
|
|
|
|
different distance metrics, run the C script on the |
2489
|
|
|
|
|
|
|
data in the file C. To make this run, note that you have to comment |
2490
|
|
|
|
|
|
|
out and uncomment the lines at four different locations in the script.) |
2491
|
|
|
|
|
|
|
|
2492
|
|
|
|
|
|
|
The C directory also contains the following support scripts: |
2493
|
|
|
|
|
|
|
|
2494
|
|
|
|
|
|
|
For generating the data for experiments with clustering: |
2495
|
|
|
|
|
|
|
|
2496
|
|
|
|
|
|
|
data_generator.pl |
2497
|
|
|
|
|
|
|
|
2498
|
|
|
|
|
|
|
For cleaning up the examples directory: |
2499
|
|
|
|
|
|
|
|
2500
|
|
|
|
|
|
|
cleanup_directory.pl |
2501
|
|
|
|
|
|
|
|
2502
|
|
|
|
|
|
|
The examples directory also includes a parameter file, C, for generating |
2503
|
|
|
|
|
|
|
synthetic data for clustering. Just edit this file if you would like to generate |
2504
|
|
|
|
|
|
|
your own multivariate data for clustering. The parameter file is for the 3D case, |
2505
|
|
|
|
|
|
|
but you can generate data with any dimensionality through appropriate entries in the |
2506
|
|
|
|
|
|
|
parameter file. |
2507
|
|
|
|
|
|
|
|
2508
|
|
|
|
|
|
|
=head1 EXPORT |
2509
|
|
|
|
|
|
|
|
2510
|
|
|
|
|
|
|
None by design. |
2511
|
|
|
|
|
|
|
|
2512
|
|
|
|
|
|
|
=head1 CAVEATS |
2513
|
|
|
|
|
|
|
|
2514
|
|
|
|
|
|
|
K-Means based clustering usually does not work well when the clusters are strongly |
2515
|
|
|
|
|
|
|
overlapping and when the extent of variability along the different dimensions is |
2516
|
|
|
|
|
|
|
different for the different clusters. The module does give you the ability to |
2517
|
|
|
|
|
|
|
normalize the variability in your data with the constructor option |
2518
|
|
|
|
|
|
|
C. However, as described elsewhere, this may actually |
2519
|
|
|
|
|
|
|
reduce the performance of the clusterer if the data variability along a direction is |
2520
|
|
|
|
|
|
|
more a result of the separation between the means than because of intra-cluster |
2521
|
|
|
|
|
|
|
variability. For better clustering with difficult-to-cluster data, you could try |
2522
|
|
|
|
|
|
|
using the author's C module. |
2523
|
|
|
|
|
|
|
|
2524
|
|
|
|
|
|
|
=head1 BUGS |
2525
|
|
|
|
|
|
|
|
2526
|
|
|
|
|
|
|
Please notify the author if you encounter any bugs. When sending email, please place |
2527
|
|
|
|
|
|
|
the string 'KMeans' in the subject line. |
2528
|
|
|
|
|
|
|
|
2529
|
|
|
|
|
|
|
=head1 INSTALLATION |
2530
|
|
|
|
|
|
|
|
2531
|
|
|
|
|
|
|
Download the archive from CPAN in any directory of your choice. Unpack the archive |
2532
|
|
|
|
|
|
|
with a command that on a Linux machine would look like: |
2533
|
|
|
|
|
|
|
|
2534
|
|
|
|
|
|
|
tar zxvf Algorithm-KMeans-2.05.tar.gz |
2535
|
|
|
|
|
|
|
|
2536
|
|
|
|
|
|
|
This will create an installation directory for you whose name will be |
2537
|
|
|
|
|
|
|
C. Enter this directory and execute the following commands |
2538
|
|
|
|
|
|
|
for a standard install of the module if you have root privileges: |
2539
|
|
|
|
|
|
|
|
2540
|
|
|
|
|
|
|
perl Makefile.PL |
2541
|
|
|
|
|
|
|
make |
2542
|
|
|
|
|
|
|
make test |
2543
|
|
|
|
|
|
|
sudo make install |
2544
|
|
|
|
|
|
|
|
2545
|
|
|
|
|
|
|
If you do not have root privileges, you can carry out a non-standard install the |
2546
|
|
|
|
|
|
|
module in any directory of your choice by: |
2547
|
|
|
|
|
|
|
|
2548
|
|
|
|
|
|
|
perl Makefile.PL prefix=/some/other/directory/ |
2549
|
|
|
|
|
|
|
make |
2550
|
|
|
|
|
|
|
make test |
2551
|
|
|
|
|
|
|
make install |
2552
|
|
|
|
|
|
|
|
2553
|
|
|
|
|
|
|
With a non-standard install, you may also have to set your PERL5LIB environment |
2554
|
|
|
|
|
|
|
variable so that this module can find the required other modules. How you do that |
2555
|
|
|
|
|
|
|
would depend on what platform you are working on. In order to install this module in |
2556
|
|
|
|
|
|
|
a Linux machine on which I use tcsh for the shell, I set the PERL5LIB environment |
2557
|
|
|
|
|
|
|
variable by |
2558
|
|
|
|
|
|
|
|
2559
|
|
|
|
|
|
|
setenv PERL5LIB /some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/ |
2560
|
|
|
|
|
|
|
|
2561
|
|
|
|
|
|
|
If I used bash, I'd need to declare: |
2562
|
|
|
|
|
|
|
|
2563
|
|
|
|
|
|
|
export PERL5LIB=/some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/ |
2564
|
|
|
|
|
|
|
|
2565
|
|
|
|
|
|
|
|
2566
|
|
|
|
|
|
|
=head1 THANKS |
2567
|
|
|
|
|
|
|
|
2568
|
|
|
|
|
|
|
I thank Slaven for pointing out that I needed to downshift the required version of Perl |
2569
|
|
|
|
|
|
|
for this module. Fortunately, I had access to an old machine still running Perl |
2570
|
|
|
|
|
|
|
5.10.1. The current version, 2.02, is based on my testing the module on that machine. |
2571
|
|
|
|
|
|
|
|
2572
|
|
|
|
|
|
|
I added two C methods in Version 2.0 as a result of an email from |
2573
|
|
|
|
|
|
|
Jerome White who expressed a need for such methods in order to determine the best |
2574
|
|
|
|
|
|
|
cluster for a new data record after you have successfully clustered your existing |
2575
|
|
|
|
|
|
|
data. Thanks Jerome for your feedback! |
2576
|
|
|
|
|
|
|
|
2577
|
|
|
|
|
|
|
It was an email from Nadeem Bulsara that prompted me to create Version 1.40 of this |
2578
|
|
|
|
|
|
|
module. Working with Version 1.30, Nadeem noticed that occasionally the module would |
2579
|
|
|
|
|
|
|
produce variable clustering results on the same dataset. I believe that this |
2580
|
|
|
|
|
|
|
variability was caused (at least partly) by the purely random mode that was used in |
2581
|
|
|
|
|
|
|
Version 1.30 for the seeding of the cluster centers. Version 1.40 now includes a |
2582
|
|
|
|
|
|
|
C mode. With the new mode the clusterer uses a PCA (Principal Components |
2583
|
|
|
|
|
|
|
Analysis) of the data to make good guesses for the cluster centers. However, |
2584
|
|
|
|
|
|
|
depending on how the data is jumbled up, it is possible that the new mode will not |
2585
|
|
|
|
|
|
|
produce uniformly good results in all cases. So you can still use the old mode by |
2586
|
|
|
|
|
|
|
setting C to C in the constructor. Thanks Nadeem for your |
2587
|
|
|
|
|
|
|
feedback! |
2588
|
|
|
|
|
|
|
|
2589
|
|
|
|
|
|
|
Version 1.30 resulted from Martin Kalin reporting problems with a very small data |
2590
|
|
|
|
|
|
|
set. Thanks Martin! |
2591
|
|
|
|
|
|
|
|
2592
|
|
|
|
|
|
|
Version 1.21 came about in response to the problems encountered by Luis Fernando |
2593
|
|
|
|
|
|
|
D'Haro with version 1.20. Although the module would yield the clusters for some of |
2594
|
|
|
|
|
|
|
its runs, more frequently than not the module would abort with an "empty cluster" |
2595
|
|
|
|
|
|
|
message for his data. Luis Fernando has also suggested other improvements (such as |
2596
|
|
|
|
|
|
|
clustering directly from the contents of a hash) that I intend to make in future |
2597
|
|
|
|
|
|
|
versions of this module. Thanks Luis Fernando. |
2598
|
|
|
|
|
|
|
|
2599
|
|
|
|
|
|
|
Chad Aeschliman was kind enough to test out the interface of this module and to give |
2600
|
|
|
|
|
|
|
suggestions for its improvement. His key slogan: "If you cannot figure out how to |
2601
|
|
|
|
|
|
|
use a module in under 10 minutes, it's not going to be used." That should explain |
2602
|
|
|
|
|
|
|
the longish Synopsis included here. |
2603
|
|
|
|
|
|
|
|
2604
|
|
|
|
|
|
|
=head1 AUTHOR |
2605
|
|
|
|
|
|
|
|
2606
|
|
|
|
|
|
|
Avinash Kak, kak@purdue.edu |
2607
|
|
|
|
|
|
|
|
2608
|
|
|
|
|
|
|
If you send email, please place the string "KMeans" in your subject line to get past |
2609
|
|
|
|
|
|
|
my spam filter. |
2610
|
|
|
|
|
|
|
|
2611
|
|
|
|
|
|
|
=head1 COPYRIGHT |
2612
|
|
|
|
|
|
|
|
2613
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or modify it under the |
2614
|
|
|
|
|
|
|
same terms as Perl itself. |
2615
|
|
|
|
|
|
|
|
2616
|
|
|
|
|
|
|
Copyright 2014 Avinash Kak |
2617
|
|
|
|
|
|
|
|
2618
|
|
|
|
|
|
|
=cut |
2619
|
|
|
|
|
|
|
|