line | stmt | bran | cond | sub | pod | time | code |
1
|
|
|
|
|
|
|
package Algorithm::ExpectationMaximization; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
#--------------------------------------------------------------------------- |
4
|
|
|
|
|
|
|
# Copyright (c) 2014 Avinash Kak. All rights reserved. This program is free |
5
|
|
|
|
|
|
|
# software. You may modify and/or distribute it under the same terms as Perl itself. |
6
|
|
|
|
|
|
|
# This copyright notice must remain attached to the file. |
7
|
|
|
|
|
|
|
# |
8
|
|
|
|
|
|
|
# Algorithm::ExpectationMaximization is a pure Perl implementation for |
9
|
|
|
|
|
|
|
# Expectation-Maximization based clustering of multi-dimensional data that can be |
10
|
|
|
|
|
|
|
# modeled as a Gaussian mixture. |
11
|
|
|
|
|
|
|
# --------------------------------------------------------------------------- |
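#
# A minimal usage sketch (illustrative only, not part of the module's own code
# path; the file name 'mydata.dat' and the mask 'N111' are hypothetical
# placeholders).  It strings together the constructor options and the methods
# defined further down in this file:
#
#     my $clusterer = Algorithm::ExpectationMaximization->new(
#                         datafile          => 'mydata.dat',   # hypothetical data file
#                         mask              => 'N111',         # 'N' = record tag, '1' = use field
#                         K                 => 3,
#                         max_em_iterations => 300,
#                         seeding           => 'random',
#                         terminal_output   => 1,
#                     );
#     $clusterer->read_data_from_file();
#     $clusterer->seed_the_clusters();
#     $clusterer->EM();
#     $clusterer->run_bayes_classifier();
#     my $clusters = $clusterer->return_disjoint_clusters();
#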
12
|
|
|
|
|
|
|
|
13
|
1
|
|
|
1
|
|
24009
|
use 5.10.0; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
64
|
|
14
|
1
|
|
|
1
|
|
6
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
157
|
|
15
|
1
|
|
|
1
|
|
7
|
use warnings; |
|
1
|
|
|
|
|
87
|
|
|
1
|
|
|
|
|
59
|
|
16
|
1
|
|
|
1
|
|
7
|
use Carp; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
119
|
|
17
|
1
|
|
|
1
|
|
7
|
use File::Basename; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
106
|
|
18
|
1
|
|
|
1
|
|
911
|
use Math::Random; |
|
1
|
|
|
|
|
9382
|
|
|
1
|
|
|
|
|
145
|
|
19
|
1
|
|
|
1
|
|
808
|
use Graphics::GnuplotIF; |
|
1
|
|
|
|
|
15118
|
|
|
1
|
|
|
|
|
74
|
|
20
|
1
|
|
|
1
|
|
1351
|
use Math::GSL::Matrix; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
use Scalar::Util 'blessed'; |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
our $VERSION = '1.22'; |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
# from perl docs: |
26
|
|
|
|
|
|
|
my $_num_regex = '^[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?$'; |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
# Constructor: |
29
|
|
|
|
|
|
|
sub new { |
30
|
|
|
|
|
|
|
my ($class, %args) = @_; |
31
|
|
|
|
|
|
|
my @params = keys %args; |
32
|
|
|
|
|
|
|
croak "\nYou have used a wrong name for a keyword argument " . |
33
|
|
|
|
|
|
|
"--- perhaps a misspelling\n" |
34
|
|
|
|
|
|
|
if check_for_illegal_params(@params) == 0; |
35
|
|
|
|
|
|
|
bless { |
36
|
|
|
|
|
|
|
_datafile => $args{datafile} || croak("datafile required"), |
37
|
|
|
|
|
|
|
_mask => $args{mask} || croak("mask required"), |
38
|
|
|
|
|
|
|
_K => $args{K} || croak("number of clusters required"), |
39
|
|
|
|
|
|
|
_terminal_output => $args{terminal_output} || 0, |
40
|
|
|
|
|
|
|
_seeding => $args{seeding} || 'random', |
41
|
|
|
|
|
|
|
_seed_tags => $args{seed_tags} || [], |
42
|
|
|
|
|
|
|
_max_em_iterations=> $args{max_em_iterations} || 100, |
43
|
|
|
|
|
|
|
_class_priors => $args{class_priors} || [], |
44
|
|
|
|
|
|
|
_debug => $args{debug} || 0, |
45
|
|
|
|
|
|
|
_N => 0, |
46
|
|
|
|
|
|
|
_data => {}, |
47
|
|
|
|
|
|
|
_data_id_tags => [], |
48
|
|
|
|
|
|
|
_clusters => [], |
49
|
|
|
|
|
|
|
_cluster_centers => [], |
50
|
|
|
|
|
|
|
_data_dimensions => 0, |
51
|
|
|
|
|
|
|
_cluster_normalizers => [], |
52
|
|
|
|
|
|
|
_cluster_means => [], |
53
|
|
|
|
|
|
|
_cluster_covariances => [], |
54
|
|
|
|
|
|
|
_class_labels_for_data => {}, |
55
|
|
|
|
|
|
|
_class_probs_at_each_data_point => {}, |
56
|
|
|
|
|
|
|
_expected_class_probs => {}, |
57
|
|
|
|
|
|
|
_old_priors => [], |
58
|
|
|
|
|
|
|
_old_old_priors => [], |
59
|
|
|
|
|
|
|
_fisher_quality_vs_iteration => [], |
60
|
|
|
|
|
|
|
_mdl_quality_vs_iteration => [], |
61
|
|
|
|
|
|
|
}, $class; |
62
|
|
|
|
|
|
|
} |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
sub read_data_from_file { |
66
|
|
|
|
|
|
|
my $self = shift; |
67
|
|
|
|
|
|
|
my $filename = $self->{_datafile}; |
68
|
|
|
|
|
|
|
$self->read_data_from_file_csv() if $filename =~ /\.csv$/; |
69
|
|
|
|
|
|
|
$self->read_data_from_file_dat() if $filename =~ /\.dat$/; |
70
|
|
|
|
|
|
|
} |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
sub read_data_from_file_csv { |
73
|
|
|
|
|
|
|
my $self = shift; |
74
|
|
|
|
|
|
|
my $numregex = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?'; |
75
|
|
|
|
|
|
|
my $filename = $self->{_datafile} || die "you did not specify a file with the data to be clustered"; |
76
|
|
|
|
|
|
|
my $mask = $self->{_mask}; |
77
|
|
|
|
|
|
|
my @mask = split //, $mask; |
78
|
|
|
|
|
|
|
$self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask; |
79
|
|
|
|
|
|
|
print "data dimensionality: $self->{_data_dimensions} \n"if $self->{_terminal_output}; |
80
|
|
|
|
|
|
|
open FILEIN, $filename or die "Unable to open $filename: $!"; |
81
|
|
|
|
|
|
|
die("Aborted. get_training_data_csv() is only for CSV files") unless $filename =~ /\.csv$/; |
82
|
|
|
|
|
|
|
local $/ = undef; |
83
|
|
|
|
|
|
|
my @all_data = split /\s+/, <FILEIN>; |
84
|
|
|
|
|
|
|
my %data_hash = (); |
85
|
|
|
|
|
|
|
my @data_tags = (); |
86
|
|
|
|
|
|
|
foreach my $record (@all_data) { |
87
|
|
|
|
|
|
|
my @splits = split /,/, $record; |
88
|
|
|
|
|
|
|
die "\nYour mask size (including `N' and 1's and 0's) does not match\n" . |
89
|
|
|
|
|
|
|
"the size of at least one of the data records in the file.\n" |
90
|
|
|
|
|
|
|
unless scalar(@mask) == scalar(@splits); |
91
|
|
|
|
|
|
|
my $record_name = shift @splits; |
92
|
|
|
|
|
|
|
$data_hash{$record_name} = \@splits; |
93
|
|
|
|
|
|
|
push @data_tags, $record_name; |
94
|
|
|
|
|
|
|
} |
95
|
|
|
|
|
|
|
$self->{_data} = \%data_hash; |
96
|
|
|
|
|
|
|
$self->{_data_id_tags} = \@data_tags; |
97
|
|
|
|
|
|
|
$self->{_N} = scalar @data_tags; |
98
|
|
|
|
|
|
|
# Need to make the following call to set the global mean and covariance: |
99
|
|
|
|
|
|
|
# my $covariance = $self->estimate_mean_and_covariance(\@data_tags); |
100
|
|
|
|
|
|
|
# Need to make the following call to set the global eigenvec eigenval sets: |
101
|
|
|
|
|
|
|
# $self->eigen_analysis_of_covariance($covariance); |
102
|
|
|
|
|
|
|
if ( defined($self->{_K}) && ($self->{_K} > 0) ) { |
103
|
|
|
|
|
|
|
carp "\n\nWARNING: YOUR K VALUE IS TOO LARGE.\n The number of data " . |
104
|
|
|
|
|
|
|
"points must satisfy the relation N > 2xK**2 where K is " . |
105
|
|
|
|
|
|
|
"the number of clusters requested for the clusters to be " . |
106
|
|
|
|
|
|
|
"meaningful $!" |
107
|
|
|
|
|
|
|
if ( $self->{_N} < (2 * $self->{_K} ** 2) ); |
108
|
|
|
|
|
|
|
print "\n\n\n"; |
109
|
|
|
|
|
|
|
} |
110
|
|
|
|
|
|
|
} |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
sub read_data_from_file_dat { |
113
|
|
|
|
|
|
|
my $self = shift; |
114
|
|
|
|
|
|
|
my $datafile = $self->{_datafile}; |
115
|
|
|
|
|
|
|
my $mask = $self->{_mask}; |
116
|
|
|
|
|
|
|
my @mask = split //, $mask; |
117
|
|
|
|
|
|
|
$self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask; |
118
|
|
|
|
|
|
|
print "data dimensionality: $self->{_data_dimensions} \n" |
119
|
|
|
|
|
|
|
if $self->{_terminal_output}; |
120
|
|
|
|
|
|
|
open INPUT, $datafile |
121
|
|
|
|
|
|
|
or die "unable to open file $datafile: $!"; |
122
|
|
|
|
|
|
|
chomp( my @raw_data = <INPUT> ); |
123
|
|
|
|
|
|
|
close INPUT; |
124
|
|
|
|
|
|
|
# Transform strings into number data |
125
|
|
|
|
|
|
|
foreach my $record (@raw_data) { |
126
|
|
|
|
|
|
|
next unless $record; |
127
|
|
|
|
|
|
|
next if $record =~ /^#/; |
128
|
|
|
|
|
|
|
my @data_fields; |
129
|
|
|
|
|
|
|
my @fields = split /\s+/, $record; |
130
|
|
|
|
|
|
|
die "\nABORTED: Mask size does not correspond to row record size" |
131
|
|
|
|
|
|
|
if $#fields != $#mask; |
132
|
|
|
|
|
|
|
my $record_id; |
133
|
|
|
|
|
|
|
foreach my $i (0..@fields-1) { |
134
|
|
|
|
|
|
|
if ($mask[$i] eq '0') { |
135
|
|
|
|
|
|
|
next; |
136
|
|
|
|
|
|
|
} elsif ($mask[$i] eq 'N') { |
137
|
|
|
|
|
|
|
$record_id = $fields[$i]; |
138
|
|
|
|
|
|
|
} elsif ($mask[$i] eq '1') { |
139
|
|
|
|
|
|
|
push @data_fields, $fields[$i]; |
140
|
|
|
|
|
|
|
} else { |
141
|
|
|
|
|
|
|
die "misformed mask for reading the data file"; |
142
|
|
|
|
|
|
|
} |
143
|
|
|
|
|
|
|
} |
144
|
|
|
|
|
|
|
my @nums = map {/$_num_regex/;$_} @data_fields; |
145
|
|
|
|
|
|
|
$self->{_data}->{ $record_id } = \@nums; |
146
|
|
|
|
|
|
|
} |
147
|
|
|
|
|
|
|
my @all_data_ids = keys %{$self->{_data}}; |
148
|
|
|
|
|
|
|
$self->{_data_id_tags} = \@all_data_ids; |
149
|
|
|
|
|
|
|
$self->{_N} = scalar @all_data_ids; |
150
|
|
|
|
|
|
|
if ( defined($self->{_K}) && ($self->{_K} > 0) ) { |
151
|
|
|
|
|
|
|
carp "\n\nWARNING: YOUR K VALUE IS TOO LARGE.\n The number of data " . |
152
|
|
|
|
|
|
|
"points must satisfy the relation N > 2xK**2 where K is " . |
153
|
|
|
|
|
|
|
"the number of clusters requested for the clusters to be " . |
154
|
|
|
|
|
|
|
"meaningful $!" |
155
|
|
|
|
|
|
|
if ( $self->{_N} < (2 * $self->{_K} ** 2) ); |
156
|
|
|
|
|
|
|
} |
157
|
|
|
|
|
|
|
} |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
# This is the heart of the module --- in the sense that this method implements the EM |
161
|
|
|
|
|
|
|
# algorithm for the estimating the parameters of a Gaussian mixture model for the |
162
|
|
|
|
|
|
|
# data. In the implementation shown below, we declare convergence for the EM |
163
|
|
|
|
|
|
|
# algorithm when the change in the class priors over three iterations falls below a |
164
|
|
|
|
|
|
|
# threshold. The current value of this threshold, as can be seen in the function |
165
|
|
|
|
|
|
|
# compare_array_floats(), is 0.00001. |
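#
# A sketch of the kind of element-wise test that compare_array_floats() is
# assumed to perform on two successive arrays of class priors (that helper is
# defined elsewhere in the module; the helper name below is hypothetical, and
# the 0.00001 threshold is the one mentioned above):
#
#     sub _priors_close_enough {                      # hypothetical name
#         my ($old_priors, $new_priors) = @_;
#         foreach my $i (0 .. @$old_priors - 1) {
#             return 0 if abs($old_priors->[$i] - $new_priors->[$i]) > 0.00001;
#         }
#         return 1;        # every prior changed by less than the threshold
#     }
#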
166
|
|
|
|
|
|
|
sub EM { |
167
|
|
|
|
|
|
|
my $self = shift; |
168
|
|
|
|
|
|
|
$self->initialize_class_priors(); |
169
|
|
|
|
|
|
|
for (my $em_iteration=0; $em_iteration < $self->{_max_em_iterations}; |
170
|
|
|
|
|
|
|
$em_iteration++) { |
171
|
|
|
|
|
|
|
if ($em_iteration == 0) { |
172
|
|
|
|
|
|
|
print "\nSeeding the EM algorithm with:\n"; |
173
|
|
|
|
|
|
|
$self->display_seeding_stats(); |
174
|
|
|
|
|
|
|
print "\nFinished displaying the seeding information\n"; |
175
|
|
|
|
|
|
|
print "\nWill print out a dot for each iteration of EM:\n\n"; |
176
|
|
|
|
|
|
|
} |
177
|
|
|
|
|
|
|
my $progress_indicator = $em_iteration % 5 == 0 ? $em_iteration : "."; |
178
|
|
|
|
|
|
|
print $progress_indicator; |
179
|
|
|
|
|
|
|
foreach my $data_id (@{$self->{_data_id_tags}}) { |
180
|
|
|
|
|
|
|
$self->{_class_probs_at_each_data_point}->{$data_id} = []; |
181
|
|
|
|
|
|
|
$self->{_expected_class_probs}->{$data_id} = []; |
182
|
|
|
|
|
|
|
} |
183
|
|
|
|
|
|
|
# Calculate prob(x | C_i) --- this is the prob of data point x as |
184
|
|
|
|
|
|
|
# a member of class C_i. You must do this for all K classes: |
185
|
|
|
|
|
|
|
foreach my $cluster_index(0..$self->{_K}-1) { |
186
|
|
|
|
|
|
|
$self->find_prob_at_each_datapoint_for_given_mean_and_covar( |
187
|
|
|
|
|
|
|
$self->{_cluster_means}->[$cluster_index], |
188
|
|
|
|
|
|
|
$self->{_cluster_covariances}->[$cluster_index] ); |
189
|
|
|
|
|
|
|
} |
190
|
|
|
|
|
|
|
$self->{_cluster_normalizers} = []; |
191
|
|
|
|
|
|
|
if ($self->{_debug}) { |
192
|
|
|
|
|
|
|
print "\n\nDisplaying prob of a data point vis-a-vis each class:\n\n"; |
193
|
|
|
|
|
|
|
foreach my $data_id (sort keys %{$self->{_data}}) { |
194
|
|
|
|
|
|
|
my $class_probs_at_a_point = |
195
|
|
|
|
|
|
|
$self->{_class_probs_at_each_data_point}->{$data_id}; |
196
|
|
|
|
|
|
|
print "Class probs for $data_id: @$class_probs_at_a_point\n" |
197
|
|
|
|
|
|
|
} |
198
|
|
|
|
|
|
|
} |
199
|
|
|
|
|
|
|
# Calculate prob(C_i | x) which is the posterior prob of class |
200
|
|
|
|
|
|
|
# considered as a r.v. to be C_i at a given point x. For a given |
201
|
|
|
|
|
|
|
# x, the sum of such probabilities over all C_i must add up to 1: |
202
|
|
|
|
|
|
|
$self->find_expected_classes_at_each_datapoint(); |
203
|
|
|
|
|
|
|
if ($self->{_debug}) { |
204
|
|
|
|
|
|
|
print "\n\nDisplaying expected class probs at each data point:\n\n"; |
205
|
|
|
|
|
|
|
foreach my $data_id (sort keys %{$self->{_expected_class_probs}}) { |
206
|
|
|
|
|
|
|
my $expected_classes_at_a_point = |
207
|
|
|
|
|
|
|
$self->{_expected_class_probs}->{$data_id}; |
208
|
|
|
|
|
|
|
print "Expected classes $data_id: @$expected_classes_at_a_point\n"; |
209
|
|
|
|
|
|
|
} |
210
|
|
|
|
|
|
|
} |
211
|
|
|
|
|
|
|
# 1. UPDATE MEANS: |
212
|
|
|
|
|
|
|
my @new_means; |
213
|
|
|
|
|
|
|
foreach my $cluster_index(0..$self->{_K}-1) { |
214
|
|
|
|
|
|
|
$new_means[$cluster_index] = |
215
|
|
|
|
|
|
|
Math::GSL::Matrix->new($self->{_data_dimensions},1); |
216
|
|
|
|
|
|
|
$new_means[$cluster_index]->zero(); |
217
|
|
|
|
|
|
|
foreach my $data_id (keys %{$self->{_data}}) { |
218
|
|
|
|
|
|
|
my $data_record = $self->{_data}->{$data_id}; |
219
|
|
|
|
|
|
|
my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1); |
220
|
|
|
|
|
|
|
$data_vec->set_col(0,$data_record); |
221
|
|
|
|
|
|
|
$new_means[$cluster_index] += |
222
|
|
|
|
|
|
|
$self->{_expected_class_probs}->{$data_id}->[$cluster_index] * |
223
|
|
|
|
|
|
|
$data_vec->copy(); |
224
|
|
|
|
|
|
|
$self->{_cluster_normalizers}->[$cluster_index] += |
225
|
|
|
|
|
|
|
$self->{_expected_class_probs}->{$data_id}->[$cluster_index]; |
226
|
|
|
|
|
|
|
} |
227
|
|
|
|
|
|
|
$new_means[$cluster_index] *= 1.0 / |
228
|
|
|
|
|
|
|
$self->{_cluster_normalizers}->[$cluster_index]; |
229
|
|
|
|
|
|
|
} |
230
|
|
|
|
|
|
|
if ($self->{_debug}) { |
231
|
|
|
|
|
|
|
foreach my $meanvec (@new_means) { |
232
|
|
|
|
|
|
|
display_matrix("At EM Iteration $em_iteration, new mean vector is", |
233
|
|
|
|
|
|
|
$meanvec); |
234
|
|
|
|
|
|
|
} |
235
|
|
|
|
|
|
|
} |
236
|
|
|
|
|
|
|
$self->{_cluster_means} = \@new_means; |
237
|
|
|
|
|
|
|
# 2. UPDATE COVARIANCES: |
238
|
|
|
|
|
|
|
my @new_covariances; |
239
|
|
|
|
|
|
|
foreach my $cluster_index(0..$self->{_K}-1) { |
240
|
|
|
|
|
|
|
$new_covariances[$cluster_index] = |
241
|
|
|
|
|
|
|
Math::GSL::Matrix->new($self->{_data_dimensions}, |
242
|
|
|
|
|
|
|
$self->{_data_dimensions}); |
243
|
|
|
|
|
|
|
$new_covariances[$cluster_index]->zero(); |
244
|
|
|
|
|
|
|
my $normalizer = 0; |
245
|
|
|
|
|
|
|
foreach my $data_id (keys %{$self->{_data}}) { |
246
|
|
|
|
|
|
|
my $data_record = $self->{_data}->{$data_id}; |
247
|
|
|
|
|
|
|
my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1); |
248
|
|
|
|
|
|
|
$data_vec->set_col(0,$data_record); |
249
|
|
|
|
|
|
|
my $mean_subtracted_data = |
250
|
|
|
|
|
|
|
$data_vec - $self->{_cluster_means}->[$cluster_index]; |
251
|
|
|
|
|
|
|
my $outer_product = outer_product($mean_subtracted_data, |
252
|
|
|
|
|
|
|
$mean_subtracted_data); |
253
|
|
|
|
|
|
|
$new_covariances[$cluster_index] += |
254
|
|
|
|
|
|
|
$self->{_expected_class_probs}->{$data_id}->[$cluster_index] * |
255
|
|
|
|
|
|
|
$outer_product; |
256
|
|
|
|
|
|
|
} |
257
|
|
|
|
|
|
|
$new_covariances[$cluster_index] *= |
258
|
|
|
|
|
|
|
1.0 / |
259
|
|
|
|
|
|
|
$self->{_cluster_normalizers}->[$cluster_index]; |
260
|
|
|
|
|
|
|
} |
261
|
|
|
|
|
|
|
$self->{_cluster_covariances} = \@new_covariances; |
262
|
|
|
|
|
|
|
# 3. UPDATE PRIORS: |
263
|
|
|
|
|
|
|
$self->{_old_old_priors} = deep_copy_array( $self->{_old_priors} ) |
264
|
|
|
|
|
|
|
if @{$self->{_old_priors}} > 0; |
265
|
|
|
|
|
|
|
$self->{_old_priors} = deep_copy_array( $self->{_class_priors} ); |
266
|
|
|
|
|
|
|
foreach my $cluster_index(0..$self->{_K}-1) { |
267
|
|
|
|
|
|
|
$self->{_class_priors}->[$cluster_index] = |
268
|
|
|
|
|
|
|
$self->{_cluster_normalizers}->[$cluster_index] / $self->{_N}; |
269
|
|
|
|
|
|
|
} |
270
|
|
|
|
|
|
|
my @priors = @{$self->{_class_priors}}; |
271
|
|
|
|
|
|
|
print "\nUpdated priors: @priors\n\n\n" if $self->{_debug}; |
272
|
|
|
|
|
|
|
push @{$self->{_fisher_quality_vs_iteration}}, |
273
|
|
|
|
|
|
|
$self->clustering_quality_fisher(); |
274
|
|
|
|
|
|
|
push @{$self->{_mdl_quality_vs_iteration}}, $self->clustering_quality_mdl(); |
275
|
|
|
|
|
|
|
if ( ($em_iteration > 5 && $self->reached_convergence()) |
276
|
|
|
|
|
|
|
|| ($em_iteration == $self->{_max_em_iterations} - 1) ) { |
277
|
|
|
|
|
|
|
my @old_old_priors = @{$self->{_old_old_priors}}; |
278
|
|
|
|
|
|
|
my @old_priors = @{$self->{_old_priors}}; |
279
|
|
|
|
|
|
|
print "\n\nPrevious to previous priors: @old_old_priors\n"; |
280
|
|
|
|
|
|
|
print "Previous priors: @old_priors\n"; |
281
|
|
|
|
|
|
|
print "Current class priors: @{$self->{_class_priors}}\n"; |
282
|
|
|
|
|
|
|
print "\n\nCONVERGENCE ACHIEVED AT ITERATION $em_iteration\n\n" |
283
|
|
|
|
|
|
|
if $em_iteration < $self->{_max_em_iterations} - 1; |
284
|
|
|
|
|
|
|
last; |
285
|
|
|
|
|
|
|
} |
286
|
|
|
|
|
|
|
} |
287
|
|
|
|
|
|
|
print "\n\n\n"; |
288
|
|
|
|
|
|
|
} |
289
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
sub reached_convergence { |
291
|
|
|
|
|
|
|
my $self = shift; |
292
|
|
|
|
|
|
|
return 1 if compare_array_floats($self->{_old_old_priors}, |
293
|
|
|
|
|
|
|
$self->{_old_priors}) |
294
|
|
|
|
|
|
|
&& |
295
|
|
|
|
|
|
|
compare_array_floats($self->{_old_priors}, |
296
|
|
|
|
|
|
|
$self->{_class_priors}); |
297
|
|
|
|
|
|
|
return 0; |
298
|
|
|
|
|
|
|
} |
299
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
# Classify the data into disjoint clusters using the Naive Bayes' classification: |
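#
# The decision rule applied to each data point (see classify_a_data_point_bayes()
# further down) is the usual MAP rule:
#
#     label(x) = argmax_i [ log p(x | C_i) + log P(C_i) ]
#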
301
|
|
|
|
|
|
|
sub run_bayes_classifier { |
302
|
|
|
|
|
|
|
my $self = shift; |
303
|
|
|
|
|
|
|
$self->classify_all_data_tuples_bayes($self->{_cluster_means}, |
304
|
|
|
|
|
|
|
$self->{_cluster_covariances}); |
305
|
|
|
|
|
|
|
} |
306
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
# Should NOT be called before run_bayes_classifier is run |
308
|
|
|
|
|
|
|
sub return_disjoint_clusters { |
309
|
|
|
|
|
|
|
my $self = shift; |
310
|
|
|
|
|
|
|
return $self->{_clusters}; |
311
|
|
|
|
|
|
|
} |
312
|
|
|
|
|
|
|
|
313
|
|
|
|
|
|
|
sub return_clusters_with_posterior_probs_above_threshold { |
314
|
|
|
|
|
|
|
my $self = shift; |
315
|
|
|
|
|
|
|
my $theta = shift; |
316
|
|
|
|
|
|
|
my @class_distributions; |
317
|
|
|
|
|
|
|
foreach my $cluster_index (0..$self->{_K}-1) { |
318
|
|
|
|
|
|
|
push @class_distributions, []; |
319
|
|
|
|
|
|
|
} |
320
|
|
|
|
|
|
|
foreach my $data_tag (@{$self->{_data_id_tags}}) { |
321
|
|
|
|
|
|
|
foreach my $cluster_index (0..$self->{_K}-1) { |
322
|
|
|
|
|
|
|
push @{$class_distributions[$cluster_index]}, $data_tag |
323
|
|
|
|
|
|
|
if $self->{_expected_class_probs}->{$data_tag}->[$cluster_index] |
324
|
|
|
|
|
|
|
> $theta; |
325
|
|
|
|
|
|
|
} |
326
|
|
|
|
|
|
|
} |
327
|
|
|
|
|
|
|
return \@class_distributions; |
328
|
|
|
|
|
|
|
} |
329
|
|
|
|
|
|
|
|
330
|
|
|
|
|
|
|
sub return_individual_class_distributions_above_given_threshold { |
331
|
|
|
|
|
|
|
my $self = shift; |
332
|
|
|
|
|
|
|
my $theta = shift; |
333
|
|
|
|
|
|
|
my @probability_distributions; |
334
|
|
|
|
|
|
|
foreach my $cluster_index (0..$self->{_K}-1) { |
335
|
|
|
|
|
|
|
push @probability_distributions, []; |
336
|
|
|
|
|
|
|
} |
337
|
|
|
|
|
|
|
foreach my $cluster_index (0..$self->{_K}-1) { |
338
|
|
|
|
|
|
|
my $mean_vec = $self->{_cluster_means}->[$cluster_index]; |
339
|
|
|
|
|
|
|
my $covar = $self->{_cluster_covariances}->[$cluster_index]; |
340
|
|
|
|
|
|
|
foreach my $data_id (keys %{$self->{_data}}) { |
341
|
|
|
|
|
|
|
my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1); |
342
|
|
|
|
|
|
|
$data_vec->set_col( 0, $self->{_data}->{$data_id}); |
343
|
|
|
|
|
|
|
my $datavec_minus_mean = $data_vec - $mean_vec; |
344
|
|
|
|
|
|
|
display_matrix( "datavec minus mean is ", $datavec_minus_mean ) |
345
|
|
|
|
|
|
|
if $self->{_debug}; |
346
|
|
|
|
|
|
|
my $exponent = undef; |
347
|
|
|
|
|
|
|
if ($self->{_data_dimensions} > 1) { |
348
|
|
|
|
|
|
|
$exponent = -0.5 * vector_matrix_multiply( |
349
|
|
|
|
|
|
|
transpose($datavec_minus_mean), |
350
|
|
|
|
|
|
|
matrix_vector_multiply($covar->inverse(), $datavec_minus_mean ) ); |
351
|
|
|
|
|
|
|
} else { |
352
|
|
|
|
|
|
|
my @var_inverse = $covar->inverse()->as_list; |
353
|
|
|
|
|
|
|
my $var_inverse_val = $var_inverse[0]; |
354
|
|
|
|
|
|
|
my @data_minus_mean = $datavec_minus_mean->as_list; |
355
|
|
|
|
|
|
|
my $data_minus_mean_val = $data_minus_mean[0]; |
356
|
|
|
|
|
|
|
$exponent = -0.5 * ($data_minus_mean_val ** 2) * $var_inverse_val; |
357
|
|
|
|
|
|
|
} |
358
|
|
|
|
|
|
|
print "\nThe value of the exponent is: $exponent\n\n" if $self->{_debug}; |
359
|
|
|
|
|
|
|
my $coefficient = 1.0 / |
360
|
|
|
|
|
|
|
( (2 * $Math::GSL::Const::M_PI)**$self->{_data_dimensions} * sqrt($covar->det()) ); |
361
|
|
|
|
|
|
|
my $prob = $coefficient * exp($exponent); |
362
|
|
|
|
|
|
|
push @{$probability_distributions[$cluster_index]}, $data_id |
363
|
|
|
|
|
|
|
if $prob > $theta; |
364
|
|
|
|
|
|
|
} |
365
|
|
|
|
|
|
|
} |
366
|
|
|
|
|
|
|
return \@probability_distributions; |
367
|
|
|
|
|
|
|
} |
368
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
sub return_estimated_priors { |
370
|
|
|
|
|
|
|
my $self = shift; |
371
|
|
|
|
|
|
|
return $self->{_class_priors}; |
372
|
|
|
|
|
|
|
} |
373
|
|
|
|
|
|
|
|
374
|
|
|
|
|
|
|
# Calculates the MDL (Minimum Description Length) clustering criterion according to |
375
|
|
|
|
|
|
|
# Rissanen. (J. Rissanen: "Modeling by Shortest Data Description," Automatica, 1978, |
376
|
|
|
|
|
|
|
# and "A Universal Prior for Integers and Estimation by Minimum Description Length," |
377
|
|
|
|
|
|
|
# Annals of Statistics, 1983.) The MDL criterion is a difference of a log-likelihood |
378
|
|
|
|
|
|
|
# term for all of the observed data and a model-complexity penalty term. In general, |
379
|
|
|
|
|
|
|
# both the log-likelihood and the model-complexity terms increase as the number of |
380
|
|
|
|
|
|
|
# clusters increases. The form of the MDL criterion used in the implementation below |
381
|
|
|
|
|
|
|
# uses for the penalty term the Bayesian Information Criterion (BIC) of G. Schwartz, |
382
|
|
|
|
|
|
|
# "Estimating the Dimensions of a Model," The Annals of Statistics, 1978. In |
383
|
|
|
|
|
|
|
# general, the smaller the value of the MDL quality measure calculated below, the |
384
|
|
|
|
|
|
|
# better the clustering of the data. |
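#
# As a concrete illustration of the parameter count used below (an arithmetic
# sketch, not additional module code): with K clusters in a d-dimensional
# space, each Gaussian needs d mean values and d(d+1)/2 covariance entries,
# and the mixture needs K-1 free priors, so
#
#     L = K * ( d + d(d+1)/2 + 1 ) - 1
#
# For example, K = 3 and d = 2 give L = 3*(2 + 3 + 1) - 1 = 17, and the
# penalty added to the negative log-likelihood is 0.5 * L * log(N).
#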
385
|
|
|
|
|
|
|
sub clustering_quality_mdl { |
386
|
|
|
|
|
|
|
my $self = shift; |
387
|
|
|
|
|
|
|
# Calculate the inverses of all of the covariance matrices in order to avoid |
388
|
|
|
|
|
|
|
# having to calculate them repeatedly inside the inner 'foreach' loop in the |
389
|
|
|
|
|
|
|
# main part of this method. Here we go: |
390
|
|
|
|
|
|
|
my @covar_inverses; |
391
|
|
|
|
|
|
|
foreach my $cluster_index (0..$self->{_K}-1) { |
392
|
|
|
|
|
|
|
my $covar = $self->{_cluster_covariances}->[$cluster_index]; |
393
|
|
|
|
|
|
|
push @covar_inverses, $covar->inverse(); |
394
|
|
|
|
|
|
|
} |
395
|
|
|
|
|
|
|
# For the clustering quality, first calculate the log-likelihood of all the |
396
|
|
|
|
|
|
|
# observed data: |
397
|
|
|
|
|
|
|
my $log_likelihood = 0; |
398
|
|
|
|
|
|
|
foreach my $tag (@{$self->{_data_id_tags}}) { |
399
|
|
|
|
|
|
|
my $likelihood_for_each_tag = 0; |
400
|
|
|
|
|
|
|
foreach my $cluster_index (0..$self->{_K}-1) { |
401
|
|
|
|
|
|
|
my $mean_vec = $self->{_cluster_means}->[$cluster_index]; |
402
|
|
|
|
|
|
|
my $covar = $self->{_cluster_covariances}->[$cluster_index]; |
403
|
|
|
|
|
|
|
my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1); |
404
|
|
|
|
|
|
|
$data_vec->set_col( 0, $self->{_data}->{$tag}); |
405
|
|
|
|
|
|
|
my $datavec_minus_mean = $data_vec - $mean_vec; |
406
|
|
|
|
|
|
|
my $exponent = undef; |
407
|
|
|
|
|
|
|
if ($self->{_data_dimensions} > 1) { |
408
|
|
|
|
|
|
|
$exponent = -0.5 * vector_matrix_multiply( |
409
|
|
|
|
|
|
|
transpose($datavec_minus_mean), |
410
|
|
|
|
|
|
|
matrix_vector_multiply($covar_inverses[$cluster_index], |
411
|
|
|
|
|
|
|
$datavec_minus_mean ) ); |
412
|
|
|
|
|
|
|
} else { |
413
|
|
|
|
|
|
|
my @var_inverse = $covar_inverses[$cluster_index]->as_list; |
414
|
|
|
|
|
|
|
my $var_inverse_val = $var_inverse[0]; |
415
|
|
|
|
|
|
|
my @data_minus_mean = $datavec_minus_mean->as_list; |
416
|
|
|
|
|
|
|
my $data_minus_mean_val = $data_minus_mean[0]; |
417
|
|
|
|
|
|
|
$exponent = -0.5 * ($data_minus_mean_val ** 2) * $var_inverse_val; |
418
|
|
|
|
|
|
|
} |
419
|
|
|
|
|
|
|
next if $covar->det() < 0; |
420
|
|
|
|
|
|
|
my $coefficient = 1.0 / |
421
|
|
|
|
|
|
|
( (2 * $Math::GSL::Const::M_PI)**$self->{_data_dimensions} |
422
|
|
|
|
|
|
|
* sqrt($covar->det()) ); |
423
|
|
|
|
|
|
|
my $prob = $coefficient * exp($exponent); |
424
|
|
|
|
|
|
|
$likelihood_for_each_tag += |
425
|
|
|
|
|
|
|
$prob * $self->{_class_priors}->[$cluster_index]; |
426
|
|
|
|
|
|
|
} |
427
|
|
|
|
|
|
|
$log_likelihood += log( $likelihood_for_each_tag ); |
428
|
|
|
|
|
|
|
} |
429
|
|
|
|
|
|
|
# Now calculate the model complexity penalty. $L is the total number of |
430
|
|
|
|
|
|
|
# parameters it takes to specify a mixture of K Gaussians. If d is the |
431
|
|
|
|
|
|
|
# dimensionality of the data space, the covariance matrix of each Gaussian takes |
432
|
|
|
|
|
|
|
# (d**2 -d)/2 + d = d(d+1)/2 parameters since this matrix must be symmetric. And |
433
|
|
|
|
|
|
|
# then you need d mean value parameters, and one prior probability parameter |
434
|
|
|
|
|
|
|
# for the Gaussian. So $L = K[1 + d + d(d+1)/2] - 1 where the final '1' that |
435
|
|
|
|
|
|
|
# is subtracted is to account for the normalization on the class priors. |
436
|
|
|
|
|
|
|
my $L = (0.5 * $self->{_K} * |
437
|
|
|
|
|
|
|
($self->{_data_dimensions}**2 + 3*$self->{_data_dimensions} + 2) ) - 1; |
438
|
|
|
|
|
|
|
my $model_complexity_penalty = 0.5 * $L * log( $self->{_N} ); |
439
|
|
|
|
|
|
|
my $mdl_criterion = -1 * $log_likelihood + $model_complexity_penalty; |
440
|
|
|
|
|
|
|
return $mdl_criterion; |
441
|
|
|
|
|
|
|
} |
442
|
|
|
|
|
|
|
|
443
|
|
|
|
|
|
|
# For our second measure of clustering quality, we use `trace( SW^-1 . SB)' where SW |
444
|
|
|
|
|
|
|
# is the within-class scatter matrix, more commonly denoted S_w, and SB the |
445
|
|
|
|
|
|
|
# between-class scatter matrix, more commonly denoted S_b (the underscore means |
446
|
|
|
|
|
|
|
# subscript). This measure can be thought of as the normalized average distance |
447
|
|
|
|
|
|
|
# between the clusters, the normalization being provided by average cluster |
448
|
|
|
|
|
|
|
# covariance SW^-1. Therefore, the larger the value of this quality measure, the |
449
|
|
|
|
|
|
|
# better the separation between the clusters. Since this measure has its roots in |
450
|
|
|
|
|
|
|
# the Fisher linear discriminant function, we incorporate the word 'fisher' in the |
451
|
|
|
|
|
|
|
# name of the quality measure. Note that this measure is good only when the clusters |
452
|
|
|
|
|
|
|
# are disjoint. When the clusters exhibit significant overlap, the numbers produced |
453
|
|
|
|
|
|
|
# by this quality measure tend to be generally meaningless. As an extreme case, |
454
|
|
|
|
|
|
|
# let's say your data was produced by a set of Gaussians, all with the same mean |
455
|
|
|
|
|
|
|
# vector, but each with a distinct covariance. For this extreme case, this measure |
456
|
|
|
|
|
|
|
# will produce a value close to zero --- depending on the accuracy with which the |
457
|
|
|
|
|
|
|
# means are estimated --- even when your clusterer is doing a good job of identifying |
458
|
|
|
|
|
|
|
# the individual clusters. |
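#
# In symbols, with priors p_i, cluster means m_i, cluster covariances C_i, and
# global mean m = sum_i p_i * m_i, the two scatter matrices computed in the
# method below are
#
#     S_w = sum_i p_i * C_i                          (within-class scatter)
#     S_b = sum_i p_i * (m_i - m)(m_i - m)^T         (between-class scatter)
#
# and the quality index returned is trace( S_w^{-1} * S_b ).
#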
459
|
|
|
|
|
|
|
sub clustering_quality_fisher { |
460
|
|
|
|
|
|
|
my $self = shift; |
461
|
|
|
|
|
|
|
my @cluster_quality_indices; |
462
|
|
|
|
|
|
|
my $fisher_trace = 0; |
463
|
|
|
|
|
|
|
my $S_w = |
464
|
|
|
|
|
|
|
Math::GSL::Matrix->new($self->{_data_dimensions}, $self->{_data_dimensions}); |
465
|
|
|
|
|
|
|
$S_w->zero; |
466
|
|
|
|
|
|
|
my $S_b = |
467
|
|
|
|
|
|
|
Math::GSL::Matrix->new($self->{_data_dimensions}, $self->{_data_dimensions}); |
468
|
|
|
|
|
|
|
$S_b->zero; |
469
|
|
|
|
|
|
|
my $global_mean = Math::GSL::Matrix->new($self->{_data_dimensions},1); |
470
|
|
|
|
|
|
|
$global_mean->zero; |
471
|
|
|
|
|
|
|
foreach my $cluster_index(0..$self->{_K}-1) { |
472
|
|
|
|
|
|
|
$global_mean += $self->{_class_priors}->[$cluster_index] * |
473
|
|
|
|
|
|
|
$self->{_cluster_means}->[$cluster_index]; |
474
|
|
|
|
|
|
|
} |
475
|
|
|
|
|
|
|
foreach my $cluster_index(0..$self->{_K}-1) { |
476
|
|
|
|
|
|
|
$S_w += $self->{_cluster_covariances}->[$cluster_index] * |
477
|
|
|
|
|
|
|
$self->{_class_priors}->[$cluster_index]; |
478
|
|
|
|
|
|
|
my $class_mean_minus_global_mean = $self->{_cluster_means}->[$cluster_index] |
479
|
|
|
|
|
|
|
- $global_mean; |
480
|
|
|
|
|
|
|
my $outer_product = outer_product( $class_mean_minus_global_mean, |
481
|
|
|
|
|
|
|
$class_mean_minus_global_mean ); |
482
|
|
|
|
|
|
|
$S_b += $self->{_class_priors}->[$cluster_index] * $outer_product; |
483
|
|
|
|
|
|
|
} |
484
|
|
|
|
|
|
|
my $fisher = matrix_multiply($S_w->inverse, $S_b); |
485
|
|
|
|
|
|
|
return $fisher unless defined blessed($fisher); |
486
|
|
|
|
|
|
|
return matrix_trace($fisher); |
487
|
|
|
|
|
|
|
} |
488
|
|
|
|
|
|
|
|
489
|
|
|
|
|
|
|
sub display_seeding_stats { |
490
|
|
|
|
|
|
|
my $self = shift; |
491
|
|
|
|
|
|
|
foreach my $cluster_index(0..$self->{_K}-1) { |
492
|
|
|
|
|
|
|
print "\nSeeding for cluster $cluster_index:\n"; |
493
|
|
|
|
|
|
|
my $mean = $self->{_cluster_means}->[$cluster_index]; |
494
|
|
|
|
|
|
|
display_matrix("The mean is: ", $mean); |
495
|
|
|
|
|
|
|
my $covariance = $self->{_cluster_covariances}->[$cluster_index]; |
496
|
|
|
|
|
|
|
display_matrix("The covariance is: ", $covariance); |
497
|
|
|
|
|
|
|
} |
498
|
|
|
|
|
|
|
} |
499
|
|
|
|
|
|
|
|
500
|
|
|
|
|
|
|
sub display_fisher_quality_vs_iterations { |
501
|
|
|
|
|
|
|
my $self = shift; |
502
|
|
|
|
|
|
|
print "\n\nFisher Quality vs. Iterations: " . |
503
|
|
|
|
|
|
|
"@{$self->{_fisher_quality_vs_iteration}}\n\n"; |
504
|
|
|
|
|
|
|
} |
505
|
|
|
|
|
|
|
|
506
|
|
|
|
|
|
|
sub display_mdl_quality_vs_iterations { |
507
|
|
|
|
|
|
|
my $self = shift; |
508
|
|
|
|
|
|
|
print "\n\nMDL Quality vs. Iterations: @{$self->{_mdl_quality_vs_iteration}}\n\n"; |
509
|
|
|
|
|
|
|
} |
510
|
|
|
|
|
|
|
|
511
|
|
|
|
|
|
|
sub find_prob_at_each_datapoint_for_given_mean_and_covar { |
512
|
|
|
|
|
|
|
my $self = shift; |
513
|
|
|
|
|
|
|
my $mean_vec_ref = shift; |
514
|
|
|
|
|
|
|
my $covar_ref = shift; |
515
|
|
|
|
|
|
|
foreach my $data_id (keys %{$self->{_data}}) { |
516
|
|
|
|
|
|
|
my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1); |
517
|
|
|
|
|
|
|
$data_vec->set_col( 0, $self->{_data}->{$data_id}); |
518
|
|
|
|
|
|
|
if ($self->{_debug}) { |
519
|
|
|
|
|
|
|
display_matrix("data vec in find prob function", $data_vec); |
520
|
|
|
|
|
|
|
display_matrix("mean vec in find prob function", $mean_vec_ref); |
521
|
|
|
|
|
|
|
display_matrix("covariance in find prob function", $covar_ref); |
522
|
|
|
|
|
|
|
} |
523
|
|
|
|
|
|
|
my $datavec_minus_mean = $data_vec - $mean_vec_ref; |
524
|
|
|
|
|
|
|
display_matrix( "datavec minus mean is ", $datavec_minus_mean ) if $self->{_debug}; |
525
|
|
|
|
|
|
|
my $exponent = undef; |
526
|
|
|
|
|
|
|
if ($self->{_data_dimensions} > 1) { |
527
|
|
|
|
|
|
|
$exponent = -0.5 * vector_matrix_multiply( transpose($datavec_minus_mean), |
528
|
|
|
|
|
|
|
matrix_vector_multiply( $covar_ref->inverse(), $datavec_minus_mean ) ); |
529
|
|
|
|
|
|
|
} elsif (defined blessed($covar_ref)) { |
530
|
|
|
|
|
|
|
my @data_minus_mean = $datavec_minus_mean->as_list; |
531
|
|
|
|
|
|
|
my $data_minus_mean_val = $data_minus_mean[0]; |
532
|
|
|
|
|
|
|
my @covar_as_matrix = $covar_ref->as_list; |
533
|
|
|
|
|
|
|
my $covar_val = $covar_as_matrix[0]; |
534
|
|
|
|
|
|
|
$exponent = -0.5 * ($data_minus_mean_val ** 2) / $covar_val; |
535
|
|
|
|
|
|
|
} else { |
536
|
|
|
|
|
|
|
my @data_minus_mean = $datavec_minus_mean->as_list; |
537
|
|
|
|
|
|
|
my $data_minus_mean_val = $data_minus_mean[0]; |
538
|
|
|
|
|
|
|
$exponent = -0.5 * ($data_minus_mean_val ** 2) / $covar_ref; |
539
|
|
|
|
|
|
|
} |
540
|
|
|
|
|
|
|
print "\nThe value of the exponent is: $exponent\n\n" if $self->{_debug}; |
541
|
|
|
|
|
|
|
my $coefficient = undef; |
542
|
|
|
|
|
|
|
if ($self->{_data_dimensions} > 1) { |
543
|
|
|
|
|
|
|
$coefficient = 1.0 / sqrt( ((2 * $Math::GSL::Const::M_PI) ** $self->{_data_dimensions}) * |
544
|
|
|
|
|
|
|
$covar_ref->det()) ; |
545
|
|
|
|
|
|
|
} elsif (!defined blessed($covar_ref)) { |
546
|
|
|
|
|
|
|
$coefficient = 1.0 / sqrt(2 * $covar_ref * $Math::GSL::Const::M_PI); |
547
|
|
|
|
|
|
|
} else { |
548
|
|
|
|
|
|
|
my @covar_as_matrix = $covar_ref->as_list; |
549
|
|
|
|
|
|
|
my $covar_val = $covar_as_matrix[0]; |
550
|
|
|
|
|
|
|
$coefficient = 1.0 / sqrt(2 * $covar_val * $Math::GSL::Const::M_PI); |
551
|
|
|
|
|
|
|
} |
552
|
|
|
|
|
|
|
my $prob = $coefficient * exp($exponent); |
553
|
|
|
|
|
|
|
push @{$self->{_class_probs_at_each_data_point}->{$data_id}}, $prob; |
554
|
|
|
|
|
|
|
} |
555
|
|
|
|
|
|
|
} |
556
|
|
|
|
|
|
|
|
557
|
|
|
|
|
|
|
sub find_expected_classes_at_each_datapoint { |
558
|
|
|
|
|
|
|
my $self = shift; |
559
|
|
|
|
|
|
|
my @priors = @{$self->{_class_priors}}; |
560
|
|
|
|
|
|
|
foreach my $data_id (sort keys %{$self->{_class_probs_at_each_data_point}}) { |
561
|
|
|
|
|
|
|
my $numerator = |
562
|
|
|
|
|
|
|
vector_2_vector_multiply( |
563
|
|
|
|
|
|
|
$self->{_class_probs_at_each_data_point}->{$data_id}, |
564
|
|
|
|
|
|
|
$self->{_class_priors} ); |
565
|
|
|
|
|
|
|
my $sum = 0; |
566
|
|
|
|
|
|
|
foreach my $part (@$numerator) { |
567
|
|
|
|
|
|
|
$sum += $part; |
568
|
|
|
|
|
|
|
} |
569
|
|
|
|
|
|
|
$self->{_expected_class_probs}->{$data_id} = [map $_/$sum, @{$numerator}]; |
570
|
|
|
|
|
|
|
} |
571
|
|
|
|
|
|
|
} |
572
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
sub initialize_class_priors { |
574
|
|
|
|
|
|
|
my $self = shift; |
575
|
|
|
|
|
|
|
if (@{$self->{_class_priors}} == 0) { |
576
|
|
|
|
|
|
|
my $prior = 1.0 / $self->{_K}; |
577
|
|
|
|
|
|
|
foreach my $class_index (0..$self->{_K}-1) { |
578
|
|
|
|
|
|
|
push @{$self->{_class_priors}}, $prior; |
579
|
|
|
|
|
|
|
} |
580
|
|
|
|
|
|
|
} |
581
|
|
|
|
|
|
|
die "Mismatch between number of values for class priors " . |
582
|
|
|
|
|
|
|
"and the number of clusters expected" |
583
|
|
|
|
|
|
|
unless @{$self->{_class_priors}} == $self->{_K}; |
584
|
|
|
|
|
|
|
my $sum = 0; |
585
|
|
|
|
|
|
|
foreach my $prior (@{$self->{_class_priors}}) { |
586
|
|
|
|
|
|
|
$sum += $prior; |
587
|
|
|
|
|
|
|
} |
588
|
|
|
|
|
|
|
die "Your priors in the constructor call do not add up to 1" |
589
|
|
|
|
|
|
|
unless abs($sum - 1) < 0.001; |
590
|
|
|
|
|
|
|
print "\nInitially assumed class priors are: @{$self->{_class_priors}}\n"; |
591
|
|
|
|
|
|
|
} |
592
|
|
|
|
|
|
|
|
593
|
|
|
|
|
|
|
sub estimate_class_priors { |
594
|
|
|
|
|
|
|
my $self = shift; |
595
|
|
|
|
|
|
|
foreach my $datatag (keys %{$self->{_data}}) { |
596
|
|
|
|
|
|
|
my $class_label = $self->{_class_labels}->{$datatag}; |
597
|
|
|
|
|
|
|
$self->{_class_priors}[$class_label]++; |
598
|
|
|
|
|
|
|
} |
599
|
|
|
|
|
|
|
foreach my $prior (@{$self->{_class_priors}}) { |
600
|
|
|
|
|
|
|
$prior /= $self->{_total_number_data_tuples}; |
601
|
|
|
|
|
|
|
} |
602
|
|
|
|
|
|
|
foreach my $prior (@{$self->{_class_priors}}) { |
603
|
|
|
|
|
|
|
print "class priors: @{$self->{_class_priors}}\n"; |
604
|
|
|
|
|
|
|
} |
605
|
|
|
|
|
|
|
} |
606
|
|
|
|
|
|
|
|
607
|
|
|
|
|
|
|
sub classify_all_data_tuples_bayes { |
608
|
|
|
|
|
|
|
my $self = shift; |
609
|
|
|
|
|
|
|
my $mean_vecs_ref = shift; |
610
|
|
|
|
|
|
|
my $covariances_ref = shift; |
611
|
|
|
|
|
|
|
my @new_clusters; |
612
|
|
|
|
|
|
|
foreach my $index (0..$self->{_K}-1) { |
613
|
|
|
|
|
|
|
push @new_clusters, []; |
614
|
|
|
|
|
|
|
} |
615
|
|
|
|
|
|
|
foreach my $data_id (@{$self->{_data_id_tags}}) { |
616
|
|
|
|
|
|
|
my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1); |
617
|
|
|
|
|
|
|
$data_vec->set_col( 0, deep_copy_array($self->{_data}->{$data_id})); |
618
|
|
|
|
|
|
|
my $cluster_index_for_tuple = |
619
|
|
|
|
|
|
|
$self->classify_a_data_point_bayes($data_vec, |
620
|
|
|
|
|
|
|
$mean_vecs_ref, $covariances_ref); |
621
|
|
|
|
|
|
|
$self->{_class_labels}->{$data_id} = $cluster_index_for_tuple; |
622
|
|
|
|
|
|
|
push @{$new_clusters[$cluster_index_for_tuple]}, $data_id; |
623
|
|
|
|
|
|
|
} |
624
|
|
|
|
|
|
|
$self->{_clusters} = \@new_clusters; |
625
|
|
|
|
|
|
|
} |
626
|
|
|
|
|
|
|
|
627
|
|
|
|
|
|
|
sub classify_a_data_point_bayes { |
628
|
|
|
|
|
|
|
my $self = shift; |
629
|
|
|
|
|
|
|
my $data_vec = shift; |
630
|
|
|
|
|
|
|
my $mean_vecs_ref = shift; |
631
|
|
|
|
|
|
|
my $covariances_ref = shift; |
632
|
|
|
|
|
|
|
my @cluster_mean_vecs = @$mean_vecs_ref; |
633
|
|
|
|
|
|
|
my @cluster_covariances = @$covariances_ref; |
634
|
|
|
|
|
|
|
my @log_likelihoods; |
635
|
|
|
|
|
|
|
foreach my $cluster_index (0..@cluster_mean_vecs-1) { |
636
|
|
|
|
|
|
|
my $mean = $cluster_mean_vecs[$cluster_index]; |
637
|
|
|
|
|
|
|
my $covariance = $cluster_covariances[$cluster_index]; |
638
|
|
|
|
|
|
|
my $datavec_minus_mean = $data_vec - $mean; |
639
|
|
|
|
|
|
|
my $log_likely = undef; |
640
|
|
|
|
|
|
|
if ($self->{_data_dimensions} > 1) { |
641
|
|
|
|
|
|
|
$log_likely = -0.5 * vector_matrix_multiply( |
642
|
|
|
|
|
|
|
transpose($datavec_minus_mean), |
643
|
|
|
|
|
|
|
matrix_vector_multiply( $covariance->inverse(), |
644
|
|
|
|
|
|
|
$datavec_minus_mean ) ); |
645
|
|
|
|
|
|
|
} else { |
646
|
|
|
|
|
|
|
|
647
|
|
|
|
|
|
|
my @data_minus_mean = $datavec_minus_mean->as_list; |
648
|
|
|
|
|
|
|
my $data_minus_mean_val = $data_minus_mean[0]; |
649
|
|
|
|
|
|
|
my @covar_as_matrix = $covariance->as_list; |
650
|
|
|
|
|
|
|
my $covar_val = $covar_as_matrix[0]; |
651
|
|
|
|
|
|
|
$log_likely = -0.5 * ($data_minus_mean_val ** 2) / $covar_val; |
652
|
|
|
|
|
|
|
|
653
|
|
|
|
|
|
|
} |
654
|
|
|
|
|
|
|
my $posterior_log_likely = $log_likely + |
655
|
|
|
|
|
|
|
log( $self->{_class_priors}[$cluster_index] ); |
656
|
|
|
|
|
|
|
push @log_likelihoods, $posterior_log_likely; |
657
|
|
|
|
|
|
|
} |
658
|
|
|
|
|
|
|
my ($minlikely, $maxlikely) = minmax(\@log_likelihoods); |
659
|
|
|
|
|
|
|
my $cluster_index_for_data_point = |
660
|
|
|
|
|
|
|
get_index_at_value( $maxlikely, \@log_likelihoods ); |
661
|
|
|
|
|
|
|
return $cluster_index_for_data_point; |
662
|
|
|
|
|
|
|
} |
663
|
|
|
|
|
|
|
|
664
|
|
|
|
|
|
|
sub find_cluster_means_and_covariances { |
665
|
|
|
|
|
|
|
my $clusters = shift; |
666
|
|
|
|
|
|
|
my $data_dimensions = find_data_dimensionality($clusters); |
667
|
|
|
|
|
|
|
my (@cluster_mean_vecs, @cluster_covariances); |
668
|
|
|
|
|
|
|
foreach my $cluster_index (0..@$clusters-1) { |
669
|
|
|
|
|
|
|
my ($num_rows,$num_cols) = |
670
|
|
|
|
|
|
|
($data_dimensions,scalar(@{$clusters->[$cluster_index]})); |
671
|
|
|
|
|
|
|
print "\nFor cluster $cluster_index: rows: $num_rows and cols: $num_cols\n"; |
672
|
|
|
|
|
|
|
my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols); |
673
|
|
|
|
|
|
|
my $mean_vec = Math::GSL::Matrix->new($num_rows,1); |
674
|
|
|
|
|
|
|
my $col_index = 0; |
675
|
|
|
|
|
|
|
foreach my $ele (@{$clusters->[$cluster_index]}) { |
676
|
|
|
|
|
|
|
$matrix->set_col($col_index++, $ele); |
677
|
|
|
|
|
|
|
} |
678
|
|
|
|
|
|
|
# display_matrix( "Displaying cluster matrix", $matrix ); |
679
|
|
|
|
|
|
|
foreach my $j (0..$num_cols-1) { |
680
|
|
|
|
|
|
|
$mean_vec += $matrix->col($j); |
681
|
|
|
|
|
|
|
} |
682
|
|
|
|
|
|
|
$mean_vec *= 1.0 / $num_cols; |
683
|
|
|
|
|
|
|
push @cluster_mean_vecs, $mean_vec; |
684
|
|
|
|
|
|
|
display_matrix( "Displaying the mean vector", $mean_vec ); |
685
|
|
|
|
|
|
|
foreach my $j (0..$num_cols-1) { |
686
|
|
|
|
|
|
|
my @new_col = ($matrix->col($j) - $mean_vec)->as_list; |
687
|
|
|
|
|
|
|
$matrix->set_col($j, \@new_col); |
688
|
|
|
|
|
|
|
} |
689
|
|
|
|
|
|
|
# display_matrix("Displaying mean subtracted data as a matrix", $matrix ); |
690
|
|
|
|
|
|
|
my $transposed = transpose( $matrix ); |
691
|
|
|
|
|
|
|
# display_matrix("Displaying transposed matrix",$transposed); |
692
|
|
|
|
|
|
|
my $covariance = matrix_multiply( $matrix, $transposed ); |
693
|
|
|
|
|
|
|
$covariance *= 1.0 / $num_cols; |
694
|
|
|
|
|
|
|
push @cluster_covariances, $covariance; |
695
|
|
|
|
|
|
|
display_matrix("Displaying the cluster covariance", $covariance ); |
696
|
|
|
|
|
|
|
} |
697
|
|
|
|
|
|
|
return (\@cluster_mean_vecs, \@cluster_covariances); |
698
|
|
|
|
|
|
|
} |
699
|
|
|
|
|
|
|
|
700
|
|
|
|
|
|
|
sub find_data_dimensionality { |
701
|
|
|
|
|
|
|
my $clusters = shift; |
702
|
|
|
|
|
|
|
my @first_cluster = @{$clusters->[0]}; |
703
|
|
|
|
|
|
|
my @first_data_element = @{$first_cluster[0]}; |
704
|
|
|
|
|
|
|
return scalar(@first_data_element); |
705
|
|
|
|
|
|
|
} |
706
|
|
|
|
|
|
|
|
707
|
|
|
|
|
|
|
sub find_seed_centered_covariances { |
708
|
|
|
|
|
|
|
my $self = shift; |
709
|
|
|
|
|
|
|
my $seed_tags = shift; |
710
|
|
|
|
|
|
|
my (@seed_mean_vecs, @seed_based_covariances); |
711
|
|
|
|
|
|
|
foreach my $seed_tag (@$seed_tags) { |
712
|
|
|
|
|
|
|
my ($num_rows,$num_cols) = ($self->{_data_dimensions}, $self->{_N}); |
713
|
|
|
|
|
|
|
my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols); |
714
|
|
|
|
|
|
|
my $mean_vec = Math::GSL::Matrix->new($num_rows,1); |
715
|
|
|
|
|
|
|
$mean_vec->set_col(0, $self->{_data}->{$seed_tag}); |
716
|
|
|
|
|
|
|
push @seed_mean_vecs, $mean_vec; |
717
|
|
|
|
|
|
|
display_matrix( "Displaying the seed mean vector", $mean_vec ); |
718
|
|
|
|
|
|
|
my $col_index = 0; |
719
|
|
|
|
|
|
|
foreach my $tag (@{$self->{_data_id_tags}}) { |
720
|
|
|
|
|
|
|
$matrix->set_col($col_index++, $self->{_data}->{$tag}); |
721
|
|
|
|
|
|
|
} |
722
|
|
|
|
|
|
|
foreach my $j (0..$num_cols-1) { |
723
|
|
|
|
|
|
|
my @new_col = ($matrix->col($j) - $mean_vec)->as_list; |
724
|
|
|
|
|
|
|
$matrix->set_col($j, \@new_col); |
725
|
|
|
|
|
|
|
} |
726
|
|
|
|
|
|
|
my $transposed = transpose( $matrix ); |
727
|
|
|
|
|
|
|
my $covariance = matrix_multiply( $matrix, $transposed ); |
728
|
|
|
|
|
|
|
$covariance *= 1.0 / $num_cols; |
729
|
|
|
|
|
|
|
push @seed_based_covariances, $covariance; |
730
|
|
|
|
|
|
|
display_matrix("Displaying the seed covariance", $covariance ) |
731
|
|
|
|
|
|
|
if $self->{_debug}; |
732
|
|
|
|
|
|
|
} |
733
|
|
|
|
|
|
|
return (\@seed_mean_vecs, \@seed_based_covariances); |
734
|
|
|
|
|
|
|
} |
735
|
|
|
|
|
|
|
|
736
|
|
|
|
|
|
|
# The most popular seeding mode for EM is random. We include two other seeding modes |
737
|
|
|
|
|
|
|
# --- kmeans and manual --- since they do produce good results for specialized cases. |
738
|
|
|
|
|
|
|
# For example, when the clusters in your data are non-overlapping and not too |
739
|
|
|
|
|
|
|
# anisotropic, the kmeans based seeding should work at least as well as the random |
740
|
|
|
|
|
|
|
# seeding. In such cases --- AND ONLY IN SUCH CASES --- the kmeans based seeding has |
741
|
|
|
|
|
|
|
# the advantage of avoiding the getting stuck in a local-maximum problem of the EM |
742
|
|
|
|
|
|
|
# algorithm. |
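#
# The seeding mode is selected through the constructor, as in the following
# sketch (the tags 'a10', 'b17', 'c22' are hypothetical record labels that
# would have to exist in your data file):
#
#     seeding   => 'random',                 # default; K records picked at random
#
#     seeding   => 'kmeans',                 # kmeans-derived means and covariances
#
#     seeding   => 'manual',                 # you supply the seed records yourself
#     seed_tags => ['a10', 'b17', 'c22'],
#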
743
|
|
|
|
|
|
|
sub seed_the_clusters { |
744
|
|
|
|
|
|
|
my $self = shift; |
745
|
|
|
|
|
|
|
if ($self->{_seeding} eq 'random') { |
746
|
|
|
|
|
|
|
my @covariances; |
747
|
|
|
|
|
|
|
my @means; |
748
|
|
|
|
|
|
|
my @all_tags = @{$self->{_data_id_tags}}; |
749
|
|
|
|
|
|
|
my @seed_tags; |
750
|
|
|
|
|
|
|
foreach my $i (0..$self->{_K}-1) { |
751
|
|
|
|
|
|
|
push @seed_tags, $all_tags[int rand( $self->{_N} )]; |
752
|
|
|
|
|
|
|
} |
753
|
|
|
|
|
|
|
print "Random Seeding: Randomly selected seeding tags are @seed_tags\n\n"; |
754
|
|
|
|
|
|
|
my ($seed_means, $seed_covars) = |
755
|
|
|
|
|
|
|
$self->find_seed_centered_covariances(\@seed_tags); |
756
|
|
|
|
|
|
|
$self->{_cluster_means} = $seed_means; |
757
|
|
|
|
|
|
|
$self->{_cluster_covariances} = $seed_covars; |
758
|
|
|
|
|
|
|
} elsif ($self->{_seeding} eq 'kmeans') { |
759
|
|
|
|
|
|
|
$self->kmeans(); |
760
|
|
|
|
|
|
|
my $clusters = $self->{_clusters}; |
761
|
|
|
|
|
|
|
my @dataclusters; |
762
|
|
|
|
|
|
|
foreach my $index (0..@$clusters-1) { |
763
|
|
|
|
|
|
|
push @dataclusters, []; |
764
|
|
|
|
|
|
|
} |
765
|
|
|
|
|
|
|
foreach my $cluster_index (0..$self->{_K}-1) { |
766
|
|
|
|
|
|
|
foreach my $tag (@{$clusters->[$cluster_index]}) { |
767
|
|
|
|
|
|
|
my $data = $self->{_data}->{$tag}; |
768
|
|
|
|
|
|
|
push @{$dataclusters[$cluster_index]}, deep_copy_array($data); |
769
|
|
|
|
|
|
|
} |
770
|
|
|
|
|
|
|
} |
771
|
|
|
|
|
|
|
($self->{_cluster_means}, $self->{_cluster_covariances}) = |
772
|
|
|
|
|
|
|
find_cluster_means_and_covariances(\@dataclusters); |
773
|
|
|
|
|
|
|
} elsif ($self->{_seeding} eq 'manual') { |
774
|
|
|
|
|
|
|
die "You have not supplied the seeding tags for the option \"manual\"" |
775
|
|
|
|
|
|
|
unless @{$self->{_seed_tags}} > 0; |
776
|
|
|
|
|
|
|
print "Manual Seeding: Seed tags are @{$self->{_seed_tags}}\n\n"; |
777
|
|
|
|
|
|
|
foreach my $tag (@{$self->{_seed_tags}}) { |
778
|
|
|
|
|
|
|
die "invalid tag used for manual seeding" |
779
|
|
|
|
|
|
|
unless exists $self->{_data}->{$tag}; |
780
|
|
|
|
|
|
|
} |
781
|
|
|
|
|
|
|
my ($seed_means, $seed_covars) = |
782
|
|
|
|
|
|
|
$self->find_seed_centered_covariances($self->{_seed_tags}); |
783
|
|
|
|
|
|
|
$self->{_cluster_means} = $seed_means; |
784
|
|
|
|
|
|
|
$self->{_cluster_covariances} = $seed_covars; |
785
|
|
|
|
|
|
|
} else { |
786
|
|
|
|
|
|
|
die "Incorrect call syntax used. See documentation."; |
787
|
|
|
|
|
|
|
} |
788
|
|
|
|
|
|
|
} |
789
|
|
|
|
|
|
|
|
790
|
|
|
|
|
|
|
# This is the top-level method for kmeans based initialization of the EM |
791
|
|
|
|
|
|
|
# algorithm. The means and the covariances returned by kmeans are used to seed the EM |
792
|
|
|
|
|
|
|
# algorithm. |
793
|
|
|
|
|
|
|
sub kmeans { |
794
|
|
|
|
|
|
|
my $self = shift; |
795
|
|
|
|
|
|
|
my $K = $self->{_K}; |
796
|
|
|
|
|
|
|
$self->cluster_for_fixed_K_single_smart_try($K); |
797
|
|
|
|
|
|
|
if ((defined $self->{_clusters}) && (defined $self->{_cluster_centers})){ |
798
|
|
|
|
|
|
|
return ($self->{_clusters}, $self->{_cluster_centers}); |
799
|
|
|
|
|
|
|
} else { |
800
|
|
|
|
|
|
|
die "kmeans clustering failed."; |
801
|
|
|
|
|
|
|
} |
802
|
|
|
|
|
|
|
} |
803
|
|
|
|
|
|
|
|
804
|
|
|
|
|
|
|
# Used by the kmeans algorithm for the initialization of the EM iterations. We do |
805
|
|
|
|
|
|
|
# initial kmeans cluster seeding by subjecting the data to principal components |
806
|
|
|
|
|
|
|
# analysis in order to discover the direction of maximum variance in the data space. |
807
|
|
|
|
|
|
|
# Subsequently, we try to find the K largest peaks along this direction. The |
808
|
|
|
|
|
|
|
# coordinates of these peaks serve as the seeds for the K clusters. |
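#
# The projection step referred to above reduces each d-dimensional record to a
# single number: the dot product of the record with the (unit-length)
# eigenvector along the maximum-variance direction.  A small numerical
# illustration (hypothetical values, not taken from any data file):
#
#     direction (0.8, 0.6),  record (2.0, 1.0)   =>   2.0*0.8 + 1.0*0.6 = 2.2
#
# The K largest peaks in the histogram of such projections then give the
# initial cluster centers along that direction.
#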
809
|
|
|
|
|
|
|
sub cluster_for_fixed_K_single_smart_try { |
810
|
|
|
|
|
|
|
my $self = shift; |
811
|
|
|
|
|
|
|
my $K = shift; |
812
|
|
|
|
|
|
|
print "Clustering for K = $K\n" if $self->{_terminal_output}; |
813
|
|
|
|
|
|
|
my ($clusters, $cluster_centers) = |
814
|
|
|
|
|
|
|
$self->cluster_for_given_K($K); |
815
|
|
|
|
|
|
|
$self->{_clusters} = $clusters; |
816
|
|
|
|
|
|
|
$self->{_cluster_centers} = $cluster_centers; |
817
|
|
|
|
|
|
|
} |
818
|
|
|
|
|
|
|
|
819
|
|
|
|
|
|
|
# Used by the kmeans part of the code for the initialization of the EM algorithm: |
820
|
|
|
|
|
|
|
sub cluster_for_given_K { |
821
|
|
|
|
|
|
|
my $self = shift; |
822
|
|
|
|
|
|
|
my $K = shift; |
823
|
|
|
|
|
|
|
my $cluster_centers = $self->get_initial_cluster_centers($K); |
824
|
|
|
|
|
|
|
my $clusters = $self->assign_data_to_clusters_initial($cluster_centers); |
825
|
|
|
|
|
|
|
my $cluster_nonexistant_flag = 0; |
826
|
|
|
|
|
|
|
foreach my $trial (0..2) { |
827
|
|
|
|
|
|
|
($clusters, $cluster_centers) = |
828
|
|
|
|
|
|
|
$self->assign_data_to_clusters( $clusters, $K ); |
829
|
|
|
|
|
|
|
my $num_of_clusters_returned = @$clusters; |
830
|
|
|
|
|
|
|
foreach my $cluster (@$clusters) { |
831
|
|
|
|
|
|
|
$cluster_nonexistant_flag = 1 if ((!defined $cluster) |
832
|
|
|
|
|
|
|
|| (@$cluster == 0)); |
833
|
|
|
|
|
|
|
} |
834
|
|
|
|
|
|
|
last unless $cluster_nonexistant_flag; |
835
|
|
|
|
|
|
|
} |
836
|
|
|
|
|
|
|
return ($clusters, $cluster_centers); |
837
|
|
|
|
|
|
|
} |
838
|
|
|
|
|
|
|
|
839
|
|
|
|
|
|
|
# Used by the kmeans part of the code for the initialization of the EM algorithm: |
840
|
|
|
|
|
|
|
sub get_initial_cluster_centers { |
841
|
|
|
|
|
|
|
my $self = shift; |
842
|
|
|
|
|
|
|
my $K = shift; |
843
|
|
|
|
|
|
|
if ($self->{_data_dimensions} == 1) { |
844
|
|
|
|
|
|
|
my @one_d_data; |
845
|
|
|
|
|
|
|
foreach my $j (0..$self->{_N}-1) { |
846
|
|
|
|
|
|
|
my $tag = $self->{_data_id_tags}[$j]; |
847
|
|
|
|
|
|
|
push @one_d_data, $self->{_data}->{$tag}->[0]; |
848
|
|
|
|
|
|
|
} |
849
|
|
|
|
|
|
|
my @peak_points = |
850
|
|
|
|
|
|
|
find_peak_points_in_given_direction(\@one_d_data,$K); |
851
|
|
|
|
|
|
|
print "highest points at data values: @peak_points\n" |
852
|
|
|
|
|
|
|
if $self->{_debug}; |
853
|
|
|
|
|
|
|
my @cluster_centers; |
854
|
|
|
|
|
|
|
foreach my $peakpoint (@peak_points) { |
855
|
|
|
|
|
|
|
push @cluster_centers, [$peakpoint]; |
856
|
|
|
|
|
|
|
} |
857
|
|
|
|
|
|
|
return \@cluster_centers; |
858
|
|
|
|
|
|
|
} |
859
|
|
|
|
|
|
|
my ($num_rows,$num_cols) = ($self->{_data_dimensions},$self->{_N}); |
860
|
|
|
|
|
|
|
my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols); |
861
|
|
|
|
|
|
|
my $mean_vec = Math::GSL::Matrix->new($num_rows,1); |
862
|
|
|
|
|
|
|
# All the record labels are stored in the array $self->{_data_id_tags}. |
863
|
|
|
|
|
|
|
# The actual data for clustering is stored in a hash at $self->{_data} |
864
|
|
|
|
|
|
|
# whose keys are the record labels; the value associated with each |
865
|
|
|
|
|
|
|
# key is the array holding the corresponding numerical multidimensional |
866
|
|
|
|
|
|
|
# data. |
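#
# For instance (hypothetical records), after read_data_from_file() the two
# structures mentioned above might look like:
#
#     $self->{_data_id_tags}  =  [ 'a1', 'a2', 'b1' ];
#     $self->{_data}          =  { a1 => [1.2, 0.7],
#                                  a2 => [1.4, 0.9],
#                                  b1 => [5.1, 4.8] };
#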
867
|
|
|
|
|
|
|
foreach my $j (0..$num_cols-1) { |
868
|
|
|
|
|
|
|
my $tag = $self->{_data_id_tags}[$j]; |
869
|
|
|
|
|
|
|
my $data = $self->{_data}->{$tag}; |
870
|
|
|
|
|
|
|
$matrix->set_col($j, $data); |
871
|
|
|
|
|
|
|
} |
872
|
|
|
|
|
|
|
if ($self->{_debug}) { |
873
|
|
|
|
|
|
|
print "\nDisplaying the original data as a matrix:"; |
874
|
|
|
|
|
|
|
display_matrix( $matrix ); |
875
|
|
|
|
|
|
|
} |
876
|
|
|
|
|
|
|
foreach my $j (0..$num_cols-1) { |
877
|
|
|
|
|
|
|
$mean_vec += $matrix->col($j); |
878
|
|
|
|
|
|
|
} |
879
|
|
|
|
|
|
|
$mean_vec *= 1.0 / $num_cols; |
880
|
|
|
|
|
|
|
if ($self->{_debug}) { |
881
|
|
|
|
|
|
|
print "Displaying the mean vector for the data:"; |
882
|
|
|
|
|
|
|
display_matrix( $mean_vec ); |
883
|
|
|
|
|
|
|
} |
884
|
|
|
|
|
|
|
foreach my $j (0..$num_cols-1) { |
885
|
|
|
|
|
|
|
my @new_col = ($matrix->col($j) - $mean_vec)->as_list; |
886
|
|
|
|
|
|
|
$matrix->set_col($j, \@new_col); |
887
|
|
|
|
|
|
|
} |
888
|
|
|
|
|
|
|
if ($self->{_debug}) { |
889
|
|
|
|
|
|
|
print "Displaying mean subtracted data as a matrix:"; |
890
|
|
|
|
|
|
|
display_matrix( $matrix ); |
891
|
|
|
|
|
|
|
} |
892
|
|
|
|
|
|
|
my $transposed = transpose( $matrix ); |
893
|
|
|
|
|
|
|
if ($self->{_debug}) { |
894
|
|
|
|
|
|
|
print "Displaying transposed data matrix:"; |
895
|
|
|
|
|
|
|
display_matrix( $transposed ); |
896
|
|
|
|
|
|
|
} |
897
|
|
|
|
|
|
|
my $covariance = matrix_multiply( $matrix, $transposed ); |
898
|
|
|
|
|
|
|
$covariance *= 1.0 / $num_cols; |
899
|
|
|
|
|
|
|
if ($self->{_debug}) { |
900
|
|
|
|
|
|
|
print "\nDisplaying the Covariance Matrix for your data:"; |
901
|
|
|
|
|
|
|
display_matrix( $covariance ); |
902
|
|
|
|
|
|
|
} |
903
|
|
|
|
|
|
|
my ($eigenvalues, $eigenvectors) = $covariance->eigenpair; |
904
|
|
|
|
|
|
|
my $num_of_eigens = @$eigenvalues; |
905
|
|
|
|
|
|
|
my $largest_eigen_index = 0; |
906
|
|
|
|
|
|
|
my $smallest_eigen_index = 0; |
907
|
|
|
|
|
|
|
print "Eigenvalue 0: $eigenvalues->[0]\n" if $self->{_debug}; |
908
|
|
|
|
|
|
|
foreach my $i (1..$num_of_eigens-1) { |
909
|
|
|
|
|
|
|
$largest_eigen_index = $i if $eigenvalues->[$i] > |
910
|
|
|
|
|
|
|
$eigenvalues->[$largest_eigen_index]; |
911
|
|
|
|
|
|
|
$smallest_eigen_index = $i if $eigenvalues->[$i] < |
912
|
|
|
|
|
|
|
$eigenvalues->[$smallest_eigen_index]; |
913
|
|
|
|
|
|
|
print "Eigenvalue $i: $eigenvalues->[$i]\n" if $self->{_debug}; |
914
|
|
|
|
|
|
|
} |
915
|
|
|
|
|
|
|
print "\nlargest eigen index: $largest_eigen_index\n" if $self->{_debug}; |
916
|
|
|
|
|
|
|
print "\nsmallest eigen index: $smallest_eigen_index\n\n" |
917
|
|
|
|
|
|
|
if $self->{_debug}; |
918
|
|
|
|
|
|
|
foreach my $i (0..$num_of_eigens-1) { |
919
|
|
|
|
|
|
|
my @vec = $eigenvectors->[$i]->as_list; |
920
|
|
|
|
|
|
|
print "Eigenvector $i: @vec\n" if $self->{_debug}; |
921
|
|
|
|
|
|
|
} |
922
|
|
|
|
|
|
|
my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list; |
923
|
|
|
|
|
|
|
print "\nLargest eigenvector: @largest_eigen_vec\n" if $self->{_debug}; |
924
|
|
|
|
|
|
|
my @max_var_direction; |
925
|
|
|
|
|
|
|
# Each element of the array @largest_eigen_vec is a Math::Complex object |
926
|
|
|
|
|
|
|
foreach my $k (0..@largest_eigen_vec-1) { |
927
|
|
|
|
|
|
|
my ($mag, $theta) = $largest_eigen_vec[$k] =~ /\[(\d*\.\d+),(\S+)\]/; |
928
|
|
|
|
|
|
|
if ($theta eq '0') { |
929
|
|
|
|
|
|
|
$max_var_direction[$k] = $mag; |
930
|
|
|
|
|
|
|
} elsif ($theta eq 'pi') { |
931
|
|
|
|
|
|
|
$max_var_direction[$k] = -1.0 * $mag; |
932
|
|
|
|
|
|
|
} else { |
933
|
|
|
|
|
|
|
die "eigendecomposition of covariance matrix produced a complex eigenvector --- something is wrong"; |
934
|
|
|
|
|
|
|
} |
935
|
|
|
|
|
|
|
} |
936
|
|
|
|
|
|
|
# "Maximum variance direction: @max_var_direction |
937
|
|
|
|
|
|
|
print "\nMaximum Variance Direction: @max_var_direction\n\n" |
938
|
|
|
|
|
|
|
if $self->{_debug}; |
939
|
|
|
|
|
|
|
# We now project all data points on the largest eigenvector. |
940
|
|
|
|
|
|
|
# Each projection will yield a single point on the eigenvector. |
941
|
|
|
|
|
|
|
my @projections; |
942
|
|
|
|
|
|
|
foreach my $j (0..$self->{_N}-1) { |
943
|
|
|
|
|
|
|
my $tag = $self->{_data_id_tags}[$j]; |
944
|
|
|
|
|
|
|
my $data = $self->{_data}->{$tag}; |
945
|
|
|
|
|
|
|
die "Dimensionality of the largest eigenvector does not " |
946
|
|
|
|
|
|
|
. "match the dimensionality of the data" |
947
|
|
|
|
|
|
|
unless @max_var_direction == $self->{_data_dimensions}; |
948
|
|
|
|
|
|
|
my $projection = vector_multiply($data, \@max_var_direction); |
949
|
|
|
|
|
|
|
push @projections, $projection; |
950
|
|
|
|
|
|
|
} |
951
|
|
|
|
|
|
|
print "All projection points: @projections\n" if $self->{_debug}; |
952
|
|
|
|
|
|
|
my @peak_points = find_peak_points_in_given_direction(\@projections, $K); |
953
|
|
|
|
|
|
|
print "highest points at points along largest eigenvec: @peak_points\n" |
954
|
|
|
|
|
|
|
if $self->{_debug}; |
955
|
|
|
|
|
|
|
my @cluster_centers; |
956
|
|
|
|
|
|
|
foreach my $peakpoint (@peak_points) { |
957
|
|
|
|
|
|
|
my @actual_peak_coords = map {$peakpoint * $_} @max_var_direction; |
958
|
|
|
|
|
|
|
push @cluster_centers, \@actual_peak_coords; |
959
|
|
|
|
|
|
|
} |
960
|
|
|
|
|
|
|
return \@cluster_centers; |
961
|
|
|
|
|
|
|
} |
962
|
|
|
|
|
|
|
|
963
|
|
|
|
|
|
|
# Used by the kmeans part of the code: This method is called by the previous method |
964
|
|
|
|
|
|
|
# to locate K peaks in a smoothed histogram of the data points projected onto the |
965
|
|
|
|
|
|
|
# maximal variance direction. |
966
|
|
|
|
|
|
|
sub find_peak_points_in_given_direction { |
967
|
|
|
|
|
|
|
my $dataref = shift; |
968
|
|
|
|
|
|
|
my $how_many = shift; |
969
|
|
|
|
|
|
|
my @data = @$dataref; |
970
|
|
|
|
|
|
|
my ($min, $max) = minmax(\@data); |
971
|
|
|
|
|
|
|
my $num_points = @data; |
972
|
|
|
|
|
|
|
my @sorted_data = sort {$a <=> $b} @data; |
973
|
|
|
|
|
|
|
#print "\n\nSorted data: @sorted_data\n"; |
974
|
|
|
|
|
|
|
my $scale = $max - $min; |
975
|
|
|
|
|
|
|
foreach my $index (0..$#sorted_data) { |
976
|
|
|
|
|
|
|
$sorted_data[$index] = ($sorted_data[$index] - $min) / $scale; |
977
|
|
|
|
|
|
|
} |
978
|
|
|
|
|
|
|
my $avg_diff = 0; |
979
|
|
|
|
|
|
|
foreach my $index (0..$#sorted_data-1) { |
980
|
|
|
|
|
|
|
my $diff = $sorted_data[$index+1] - $sorted_data[$index]; |
981
|
|
|
|
|
|
|
$avg_diff += ($diff - $avg_diff) / ($index + 1); |
982
|
|
|
|
|
|
|
} |
983
|
|
|
|
|
|
|
my $delta = 1.0 / 1000.0; |
984
|
|
|
|
|
|
|
# It would be nice to set the delta adaptively, but I must |
985
|
|
|
|
|
|
|
# change the number of cells in the next foreach loop accordingly |
986
|
|
|
|
|
|
|
# my $delta = $avg_diff / 20; |
987
|
|
|
|
|
|
|
my @accumulator = (0) x 1000; |
988
|
|
|
|
|
|
|
foreach my $index (0..@sorted_data-1) { |
989
|
|
|
|
|
|
|
my $cell_index = int($sorted_data[$index] / $delta); |
990
|
|
|
|
|
|
|
my $smoothness = 40; |
991
|
|
|
|
|
|
|
for my $index ($cell_index-$smoothness..$cell_index+$smoothness) { |
992
|
|
|
|
|
|
|
next if $index < 0 || $index > 999; |
993
|
|
|
|
|
|
|
$accumulator[$index]++; |
994
|
|
|
|
|
|
|
} |
995
|
|
|
|
|
|
|
} |
996
|
|
|
|
|
|
|
my $peaks_array = non_maximum_supression( \@accumulator ); |
997
|
|
|
|
|
|
|
my $peaks_index_hash = get_value_index_hash( $peaks_array ); |
998
|
|
|
|
|
|
|
my @K_highest_peak_locations; |
999
|
|
|
|
|
|
|
my $k = 0; |
1000
|
|
|
|
|
|
|
foreach my $peak (sort {$b <=> $a} keys %$peaks_index_hash) { |
1001
|
|
|
|
|
|
|
my $unscaled_peak_point = |
1002
|
|
|
|
|
|
|
$min + $peaks_index_hash->{$peak} * $scale * $delta; |
1003
|
|
|
|
|
|
|
push @K_highest_peak_locations, $unscaled_peak_point |
1004
|
|
|
|
|
|
|
if $k < $how_many; |
1005
|
|
|
|
|
|
|
last if ++$k == $how_many; |
1006
|
|
|
|
|
|
|
} |
1007
|
|
|
|
|
|
|
return @K_highest_peak_locations; |
1008
|
|
|
|
|
|
|
} |
1009
|
|
|
|
|
|
|
|
1010
|
|
|
|
|
|
|
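#  To make the histogram-smoothing idea above concrete, here is a small stand-alone
#  sketch --- not used by the module --- that bins a list of numbers assumed to
#  already lie in the interval [0,1] into a coarse 100-cell histogram, smooths it
#  with a +/- 4 cell box window (the same trick as the +/- $smoothness window above,
#  just on a smaller scale), and returns the index of the tallest cell:
sub _demo_smoothed_histogram_peak {
    my @values = @_;                              # numbers assumed to be in [0,1]
    my @accumulator = (0) x 100;
    foreach my $value (@values) {
        my $cell = int($value / 0.01);
        foreach my $i ($cell - 4 .. $cell + 4) {
            next if $i < 0 || $i > 99;            # clip the smoothing window
            $accumulator[$i]++;
        }
    }
    my $peak_cell = 0;
    foreach my $i (1..99) {
        $peak_cell = $i if $accumulator[$i] > $accumulator[$peak_cell];
    }
    return $peak_cell;                            # cell index of the tallest bump
}
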
# Used by the kmeans part of the code: The purpose of this routine is to form initial
# clusters by assigning the data samples to the initial clusters formed by the
# previous routine on the basis of the best proximity of the data samples to the
# different cluster centers.
sub assign_data_to_clusters_initial {
    my $self = shift;
    my @cluster_centers = @{ shift @_ };
    my @clusters;
    foreach my $ele (@{$self->{_data_id_tags}}) {
        my $best_cluster;
        my @dist_from_clust_centers;
        foreach my $center (@cluster_centers) {
            push @dist_from_clust_centers, $self->distance($ele, $center);
        }
        my ($min, $best_center_index) = minimum( \@dist_from_clust_centers );
        push @{$clusters[$best_center_index]}, $ele;
    }
    return \@clusters;
}

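#  The assignment rule used above is simply "give each sample to the nearest center."
#  A minimal stand-alone sketch of that rule --- not used by the module --- for a
#  point and centers given as refs to coordinate arrays:
sub _demo_index_of_nearest_center {
    my ($point, @centers) = @_;
    my ($best_index, $best_dist);
    foreach my $i (0..$#centers) {
        my $squared = 0;
        foreach my $d (0..@$point-1) {
            $squared += ($point->[$d] - $centers[$i][$d])**2;
        }
        if (!defined $best_dist or $squared < $best_dist) {
            ($best_index, $best_dist) = ($i, $squared);
        }
    }
    return $best_index;
}
#  e.g.  _demo_index_of_nearest_center([0,0], [10,10], [1,1])  returns 1.
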
# Used by the kmeans part of the code: This is the main routine that, along with the
# update_cluster_centers() routine, constitutes the two key steps of the K-Means
# algorithm.  In most cases, the infinite while() loop will terminate automatically
# when the cluster assignments of the data points remain unchanged.  For the sake of
# safety, we keep track of the number of iterations.  If this number reaches 100, we
# exit the while() loop anyway.  In most cases, this limit will not be reached.
sub assign_data_to_clusters {
    my $self = shift;
    my $clusters = shift;
    my $K = shift;
    my $final_cluster_centers;
    my $iteration_index = 0;
    while (1) {
        my $new_clusters;
        my $assignment_changed_flag = 0;
        my $current_cluster_center_index = 0;
        my $cluster_size_zero_condition = 0;
        my $how_many = @$clusters;
        my $cluster_centers = $self->update_cluster_centers(
                                  deep_copy_AoA_with_nulls( $clusters ) );
        $iteration_index++;
        foreach my $cluster (@$clusters) {
            my $current_cluster_center =
                $cluster_centers->[$current_cluster_center_index];
            foreach my $ele (@$cluster) {
                my @dist_from_clust_centers;
                foreach my $center (@$cluster_centers) {
                    push @dist_from_clust_centers,
                        $self->distance($ele, $center);
                }
                my ($min, $best_center_index) =
                    minimum( \@dist_from_clust_centers );
                my $best_cluster_center =
                    $cluster_centers->[$best_center_index];
                if (vector_equal($current_cluster_center,
                                 $best_cluster_center)) {
                    push @{$new_clusters->[$current_cluster_center_index]},
                        $ele;
                } else {
                    $assignment_changed_flag = 1;
                    push @{$new_clusters->[$best_center_index]}, $ele;
                }
            }
            $current_cluster_center_index++;
        }
        # Now make sure that we still have K clusters since K is fixed:
        next if ((@$new_clusters != @$clusters) && ($iteration_index < 100));
        # Now make sure that none of the K clusters is an empty cluster:
        foreach my $newcluster (@$new_clusters) {
            $cluster_size_zero_condition = 1 if ((!defined $newcluster)
                                                 or (@$newcluster == 0));
        }
        push @$new_clusters, (undef) x ($K - @$new_clusters)
            if @$new_clusters < $K;
        my $largest_cluster;
        foreach my $local_cluster (@$new_clusters) {
            next if !defined $local_cluster;
            $largest_cluster = $local_cluster if !defined $largest_cluster;
            if (@$local_cluster > @$largest_cluster) {
                $largest_cluster = $local_cluster;
            }
        }
        foreach my $local_cluster (@$new_clusters) {
            if ( (!defined $local_cluster) || (@$local_cluster == 0) ) {
                push @$local_cluster, pop @$largest_cluster;
            }
        }
        next if (($cluster_size_zero_condition) && ($iteration_index < 100));
        last if $iteration_index == 100;
        # Now do a deep copy of new_clusters into clusters
        $clusters = deep_copy_AoA( $new_clusters );
        last if $assignment_changed_flag == 0;
    }
    $final_cluster_centers = $self->update_cluster_centers( $clusters );
    return ($clusters, $final_cluster_centers);
}

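#  A compact stand-alone sketch of the same two-step loop --- reassign, then update
#  the centers, and repeat until nothing moves --- written for 1-D data so the
#  bookkeeping does not obscure the idea.  It is NOT used by this module; the
#  production version above additionally guards against empty clusters and caps the
#  number of iterations at 100.
sub _demo_one_dimensional_kmeans {
    my ($K, @data) = @_;
    my @centers = @data[0..$K-1];                    # crude seeding, good enough for a sketch
    my @assignment = (-1) x @data;
    while (1) {
        # Step 1: give every value to its nearest center
        my $changed = 0;
        foreach my $i (0..$#data) {
            my $best = 0;
            foreach my $k (1..$K-1) {
                $best = $k if abs($data[$i] - $centers[$k])
                            < abs($data[$i] - $centers[$best]);
            }
            $changed = 1 if $best != $assignment[$i];
            $assignment[$i] = $best;
        }
        last unless $changed;
        # Step 2: move every center to the mean of the values assigned to it
        foreach my $k (0..$K-1) {
            my @members = grep { $assignment[$_] == $k } 0..$#data;
            next unless @members;                    # leave a momentarily empty cluster alone
            my $sum = 0;
            $sum += $data[$_] for @members;
            $centers[$k] = $sum / @members;
        }
    }
    return (\@centers, \@assignment);
}
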
# Used by the kmeans part of the code: After each new assignment of the data points
# to the clusters on the basis of the current values for the cluster centers, we call
# the routine shown here for updating the values of the cluster centers.
sub update_cluster_centers {
    my $self = shift;
    my @clusters = @{ shift @_ };
    my @new_cluster_centers;
    my $largest_cluster;
    foreach my $cluster (@clusters) {
        next if !defined $cluster;
        $largest_cluster = $cluster if !defined $largest_cluster;
        if (@$cluster > @$largest_cluster) {
            $largest_cluster = $cluster;
        }
    }
    foreach my $cluster (@clusters) {
        if ( (!defined $cluster) || (@$cluster == 0) ) {
            push @$cluster, pop @$largest_cluster;
        }
    }
    foreach my $cluster (@clusters) {
        die "Cluster became empty --- untenable condition " .
            "for a given K. Try again. " if !defined $cluster;
        my $cluster_size = @$cluster;
        die "Cluster size is zero --- untenable." if $cluster_size == 0;
        my @new_cluster_center = @{$self->add_point_coords( $cluster )};
        @new_cluster_center = map {my $x = $_/$cluster_size; $x}
                                  @new_cluster_center;
        push @new_cluster_centers, \@new_cluster_center;
    }
    return \@new_cluster_centers;
}

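#  The center update above is nothing more than a component-wise mean of the points
#  belonging to a cluster.  A stand-alone sketch --- not used by the module --- for a
#  cluster given as a ref to a list of coordinate-array refs:
sub _demo_centroid_of_points {
    my ($points) = @_;                     # e.g. [[1,2],[3,4],[5,6]]
    die "cannot take the centroid of an empty cluster" unless @$points;
    my $dim = @{$points->[0]};
    my @center = (0) x $dim;
    foreach my $point (@$points) {
        $center[$_] += $point->[$_] for 0..$dim-1;
    }
    @center = map { $_ / @$points } @center;
    return \@center;                       # [[1,2],[3,4],[5,6]]  =>  [3,4]
}
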
# The following routine is for computing the distance between a data point specified
# by its symbolic name in the master datafile and a point (such as the center of a
# cluster) expressed as a vector of coordinates:
sub distance {
    my $self = shift;
    my $ele1_id = shift @_;            # symbolic name of data sample
    my @ele1 = @{$self->{_data}->{$ele1_id}};
    my @ele2 = @{shift @_};
    die "wrong data types for distance calculation" if @ele1 != @ele2;
    my $how_many = @ele1;
    my $squared_sum = 0;
    foreach my $i (0..$how_many-1) {
        $squared_sum += ($ele1[$i] - $ele2[$i])**2;
    }
    my $dist = sqrt $squared_sum;
    return $dist;
}

# The following routine does the same as above but now both
# arguments are expected to be arrays of numbers:
sub distance2 {
    my $self = shift;
    my @ele1 = @{shift @_};
    my @ele2 = @{shift @_};
    die "wrong data types for distance calculation" if @ele1 != @ele2;
    my $how_many = @ele1;
    my $squared_sum = 0;
    foreach my $i (0..$how_many-1) {
        $squared_sum += ($ele1[$i] - $ele2[$i])**2;
    }
    return sqrt $squared_sum;
}

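#  A quick worked example of the Euclidean metric computed by distance() and
#  distance2(): for the points (0,0,0) and (1,2,2) the squared differences are 1, 4
#  and 4, so the distance is sqrt(9) = 3.  The tiny sub below --- purely illustrative
#  and not used by the module --- reproduces that calculation with the same loop
#  structure as distance2(), just without the object plumbing:
sub _demo_euclidean_distance {
    my ($p, $q) = @_;                          # two refs to equal-length arrays
    die "dimensionality mismatch" unless @$p == @$q;
    my $squared_sum = 0;
    $squared_sum += ($p->[$_] - $q->[$_])**2 for 0..@$p-1;
    return sqrt $squared_sum;                  # _demo_euclidean_distance([0,0,0],[1,2,2]) == 3
}
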
sub write_naive_bayes_clusters_to_files {
    my $self = shift;
    my @clusters = @{$self->{_clusters}};
    unlink glob "naive_bayes_cluster*.txt";
    foreach my $i (1..@clusters) {
        my $filename = "naive_bayes_cluster" . $i . ".txt";
        print "Writing cluster $i to file $filename\n"
            if $self->{_terminal_output};
        open FILEHANDLE, "| sort > $filename" or die "Unable to open file: $!";
        foreach my $ele (@{$clusters[$i-1]}) {
            print FILEHANDLE "$ele\n";
        }
        close FILEHANDLE;
    }
}

sub write_posterior_prob_clusters_above_threshold_to_files {
    my $self = shift;
    my $theta = shift;
    my @class_distributions;
    foreach my $cluster_index (0..$self->{_K}-1) {
        push @class_distributions, [];
    }
    foreach my $data_tag (@{$self->{_data_id_tags}}) {
        foreach my $cluster_index (0..$self->{_K}-1) {
            push @{$class_distributions[$cluster_index]}, $data_tag
                if $self->{_expected_class_probs}->{$data_tag}->[$cluster_index]
                   > $theta;
        }
    }
    unlink glob "posterior_prob_cluster*.txt";
    foreach my $i (1..@class_distributions) {
        my $filename = "posterior_prob_cluster" . $i . ".txt";
        print "Writing posterior prob cluster $i to file $filename\n"
            if $self->{_terminal_output};
        open FILEHANDLE, "| sort > $filename" or die "Unable to open file: $!";
        foreach my $ele (@{$class_distributions[$i-1]}) {
            print FILEHANDLE "$ele\n";
        }
        close FILEHANDLE;
    }
}

sub DESTROY {
    unlink "__temp_" . basename($_[0]->{_datafile});
    unlink "__temp_data_" . basename($_[0]->{_datafile});
    unlink "__temp2_" . basename($_[0]->{_datafile});
    unlink glob "__temp1dhist*";
    unlink glob "__contour*";
}

############################# Visualization Code ###############################

# The visualize_clusters() implementation displays as a plot in your terminal window
# the clusters constructed by the EM algorithm.  It can show either 2D plots or
# 3D plots that you can rotate interactively for better visualization.  For
# multidimensional data, as to which 2D or 3D dimensions are used for visualization
# is controlled by the mask you must supply as an argument to the method.  Should it
# happen that only one on bit is specified for the mask, visualize_clusters()
# aborts.
#
# The visualization code consists of first accessing each of the clusters created by
# the EM() subroutine.  Note that the clusters contain only the symbolic names for
# the individual records in the source data file.  We therefore next reach into the
# $self->{_data} hash and get the data coordinates associated with each symbolic
# label in a cluster.  The numerical data thus generated is then written out to a
# temp file.  When doing so we must remember to insert TWO BLANK LINES between the
# data blocks corresponding to the different clusters.  This constraint is imposed
# on us by Gnuplot when plotting data from the same file since we want to use
# different point styles for the data points in different cluster files.
# Subsequently, we call upon the Perl interface provided by the Graphics::GnuplotIF
# module to plot the data clusters.
sub visualize_clusters {
    my $self = shift;
    my $v_mask;
    my $pause_time;
    if (@_ == 1) {
        $v_mask = shift || die "visualization mask missing";
    } elsif (@_ == 2) {
        $v_mask = shift || die "visualization mask missing";
        $pause_time = shift;
    } else {
        die "visualize_clusters() called with wrong args";
    }
    my $master_datafile = $self->{_datafile};
    my @v_mask = split //, $v_mask;
    my $visualization_mask_width = @v_mask;
    my $original_data_mask = $self->{_mask};
    my @mask = split //, $original_data_mask;
    my $data_field_width = scalar grep {$_ eq '1'} @mask;
    die "\n\nABORTED: The width of the visualization mask (including " .
        "all its 1s and 0s) must equal the width of the original mask " .
        "used for reading the data file (counting only the 1's)"
        if $visualization_mask_width != $data_field_width;
    my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
    # The following section is for the superimposed one-Mahalanobis-distance-unit
    # ellipses that are shown only for 2D plots:
    if ($visualization_data_field_width == 2) {
        foreach my $cluster_index (0..$self->{_K}-1) {
            my $contour_filename = "__contour_" . $cluster_index . ".dat";
            my $mean = $self->{_cluster_means}->[$cluster_index];
            my $covariance = $self->{_cluster_covariances}->[$cluster_index];
            my ($mux,$muy) = $mean->as_list();
            my ($varx,$sigmaxy) = $covariance->row(0)->as_list();
            my ($sigmayx,$vary) = $covariance->row(1)->as_list();
            die "Your covariance matrix does not look right"
                unless $sigmaxy == $sigmayx;
            my ($sigmax,$sigmay) = (sqrt($varx),sqrt($vary));
            my $argstring = <<"END";
set contour
mux = $mux
muy = $muy
sigmax = $sigmax
sigmay = $sigmay
sigmaxy = $sigmaxy
determinant = (sigmax**2)*(sigmay**2) - sigmaxy**2
exponent(x,y) = -0.5 * (1.0 / determinant) * ( ((x-mux)**2)*sigmay**2 + ((y-muy)**2)*sigmax**2 - 2*sigmaxy*(x-mux)*(y-muy) )
f(x,y) = exp( exponent(x,y) ) - 0.2
xmax = mux + 2 * sigmax
xmin = mux - 2 * sigmax
ymax = muy + 2 * sigmay
ymin = muy - 2 * sigmay
set xrange [ xmin : xmax ]
set yrange [ ymin : ymax ]
set isosamples 200
unset surface
set cntrparam levels discrete 0
set table \"$contour_filename\"
splot f(x,y)
unset table
END
            my $plot = Graphics::GnuplotIF->new();
            $plot->gnuplot_cmd( $argstring );
        }
    }
    my %visualization_data;
    while ( my ($record_id, $data) = each %{$self->{_data}} ) {
        my @fields = @$data;
        die "\nABORTED: Visualization mask size exceeds data record size"
            if $#v_mask > $#fields;
        my @data_fields;
        foreach my $i (0..@fields-1) {
            if ($v_mask[$i] eq '0') {
                next;
            } elsif ($v_mask[$i] eq '1') {
                push @data_fields, $fields[$i];
            } else {
                die "Misformed visualization mask. It can only have 1s and 0s";
            }
        }
        $visualization_data{ $record_id } = \@data_fields;
    }
    my $K = scalar @{$self->{_clusters}};
    my $filename = basename($master_datafile);
    my $temp_file = "__temp_" . $filename;
    unlink $temp_file if -e $temp_file;
    open OUTPUT, ">$temp_file"
        or die "Unable to open a temp file in this directory: $!";
    foreach my $cluster (@{$self->{_clusters}}) {
        foreach my $item (@$cluster) {
            print OUTPUT "@{$visualization_data{$item}}";
            print OUTPUT "\n";
        }
        print OUTPUT "\n\n";
    }
    close OUTPUT;
    my $plot;
    if (!defined $pause_time) {
        $plot = Graphics::GnuplotIF->new( persist => 1 );
    } else {
        $plot = Graphics::GnuplotIF->new();
    }
    my $arg_string = "";
    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd("set noclip");
        $plot->gnuplot_cmd("set pointsize 2");
        foreach my $i (0..$K-1) {
            my $j = $i + 1;
            $arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster (naive Bayes) $i\" with points lt $j pt $j, ";
        }
    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd("set noclip");
        $plot->gnuplot_cmd("set pointsize 2");
        foreach my $i (0..$K-1) {
            my $j = $i + 1;
            $arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster (naive Bayes) $i\" with points lt $j pt $j, ";
            my $ellipse_filename = "__contour_" . $i . ".dat";
            $arg_string .= "\"$ellipse_filename\" with line lt $j title \"\", ";
        }
    } elsif ($visualization_data_field_width == 1) {
        open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
        my @all_data = <INPUT>;
        close INPUT;
        @all_data = map {chomp $_; $_ =~ /\d/ ? $_ : "SEPERATOR" } @all_data;
        my $all_joined_data = join ':', @all_data;
        my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data;
        my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals);
        foreach my $i (0..@separated-1) {
            $separated[$i] =~ s/SEPERATOR//g;
            my @cluster_for_hist = split /:/, $separated[$i];
            @cluster_for_hist = grep $_, @cluster_for_hist;
            my ($minval,$maxval) = minmax(\@cluster_for_hist);
            push @all_minvals, $minval;
            push @all_maxvals, $maxval;
            push @all_clusters_for_hist, \@cluster_for_hist;
        }
        push @all_minmaxvals, @all_minvals;
        push @all_minmaxvals, @all_maxvals;
        my ($abs_minval,$abs_maxval) = minmax(\@all_minmaxvals);
        my $delta = ($abs_maxval - $abs_minval) / 100.0;
        $plot->gnuplot_cmd("set boxwidth 3");
        $plot->gnuplot_cmd("set style fill solid border -1");
        $plot->gnuplot_cmd("set ytics out nomirror");
        $plot->gnuplot_cmd("set style data histograms");
        $plot->gnuplot_cmd("set style histogram clustered");
        $plot->gnuplot_cmd("set title 'Clusters shown through histograms'");
        $plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
        foreach my $cindex (0..@all_clusters_for_hist-1) {
            my $filename = basename($master_datafile);
            my $temp_file = "__temp1dhist_" . "$cindex" . "_" . $filename;
            unlink $temp_file if -e $temp_file;
            open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
            print OUTPUT "Xstep histval\n";
            my @histogram = (0) x 100;
            foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) {
                $histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++;
            }
            foreach my $i (0..@histogram-1) {
                print OUTPUT "$i $histogram[$i]\n";
            }
            $arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, ";
            close OUTPUT;
        }
    }
    $arg_string = $arg_string =~ /^(.*),[ ]+$/;
    $arg_string = $1;
    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd( "splot $arg_string" );
        $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd( "plot $arg_string" );
        $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
    } elsif ($visualization_data_field_width == 1) {
        $plot->gnuplot_cmd( "plot $arg_string" );
        $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
    }
}

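#  The essential Gnuplot trick used by visualize_clusters() is the one described in
#  the comment block above: every cluster is written into the SAME temporary file,
#  with two blank lines between clusters, so that each cluster can then be addressed
#  in the plot command through Gnuplot's "index" keyword and given its own point
#  style.  The stand-alone sketch below --- not used by the module, written for
#  hypothetical 2D clusters passed in as array refs, and writing to a hypothetical
#  file name of its own --- shows just that mechanism:
sub _demo_plot_2d_clusters_with_gnuplot {
    my @clusters = @_;                 # each element: ref to a list of [x,y] pairs
    my $temp_file = "__demo_clusters.dat";
    open my $out, ">", $temp_file or die "Cannot open $temp_file: $!";
    foreach my $cluster (@clusters) {
        print $out "@$_\n" for @$cluster;
        print $out "\n\n";             # two blank lines start a new Gnuplot data block
    }
    close $out;
    my $plot = Graphics::GnuplotIF->new( persist => 1 );
    # Joining with ", " avoids the trailing-comma cleanup regex used in the module:
    my $arg_string = join ", ",
        map { "\"$temp_file\" index $_ title \"Cluster $_\" with points pt " . ($_ + 1) }
        0..$#clusters;
    $plot->gnuplot_cmd( "plot $arg_string" );
}
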
# This subroutine is the same as above except that it makes PNG plots (for hardcopy
# printing) of the clusters.
sub plot_hardcopy_clusters {
    my $self = shift;
    my $v_mask;
    my $pause_time;
    if (@_ == 1) {
        $v_mask = shift || die "visualization mask missing";
    } elsif (@_ == 2) {
        $v_mask = shift || die "visualization mask missing";
        $pause_time = shift;
    } else {
        die "plot_hardcopy_clusters() called with wrong args";
    }
    my $master_datafile = $self->{_datafile};
    my @v_mask = split //, $v_mask;
    my $visualization_mask_width = @v_mask;
    my $original_data_mask = $self->{_mask};
    my @mask = split //, $original_data_mask;
    my $data_field_width = scalar grep {$_ eq '1'} @mask;
    die "\n\nABORTED: The width of the visualization mask (including " .
        "all its 1s and 0s) must equal the width of the original mask " .
        "used for reading the data file (counting only the 1's)"
        if $visualization_mask_width != $data_field_width;
    my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
    if ($visualization_data_field_width == 2) {
        foreach my $cluster_index (0..$self->{_K}-1) {
            my $contour_filename = "__contour_" . $cluster_index . ".dat";
            my $mean = $self->{_cluster_means}->[$cluster_index];
            my $covariance = $self->{_cluster_covariances}->[$cluster_index];
            my ($mux,$muy) = $mean->as_list();
            my ($varx,$sigmaxy) = $covariance->row(0)->as_list();
            my ($sigmayx,$vary) = $covariance->row(1)->as_list();
            die "Your covariance matrix does not look right"
                unless $sigmaxy == $sigmayx;
            my ($sigmax,$sigmay) = (sqrt($varx),sqrt($vary));
            my $argstring = <<"END";
set contour
mux = $mux
muy = $muy
sigmax = $sigmax
sigmay = $sigmay
sigmaxy = $sigmaxy
determinant = (sigmax**2)*(sigmay**2) - sigmaxy**2
exponent(x,y) = -0.5 * (1.0 / determinant) * ( ((x-mux)**2)*sigmay**2 + ((y-muy)**2)*sigmax**2 - 2*sigmaxy*(x-mux)*(y-muy) )
f(x,y) = exp( exponent(x,y) ) - 0.2
xmax = mux + 2 * sigmax
xmin = mux - 2 * sigmax
ymax = muy + 2 * sigmay
ymin = muy - 2 * sigmay
set xrange [ xmin : xmax ]
set yrange [ ymin : ymax ]
set isosamples 200
unset surface
set cntrparam levels discrete 0
set table \"$contour_filename\"
splot f(x,y)
unset table
END
            my $plot = Graphics::GnuplotIF->new();
            $plot->gnuplot_cmd( $argstring );
        }
    }
    my %visualization_data;
    while ( my ($record_id, $data) = each %{$self->{_data}} ) {
        my @fields = @$data;
        die "\nABORTED: Visualization mask size exceeds data record size"
            if $#v_mask > $#fields;
        my @data_fields;
        foreach my $i (0..@fields-1) {
            if ($v_mask[$i] eq '0') {
                next;
            } elsif ($v_mask[$i] eq '1') {
                push @data_fields, $fields[$i];
            } else {
                die "Misformed visualization mask. It can only have 1s and 0s";
            }
        }
        $visualization_data{ $record_id } = \@data_fields;
    }
    my $K = scalar @{$self->{_clusters}};
    my $filename = basename($master_datafile);
    my $temp_file = "__temp_" . $filename;
    unlink $temp_file if -e $temp_file;
    open OUTPUT, ">$temp_file"
        or die "Unable to open a temp file in this directory: $!";
    foreach my $cluster (@{$self->{_clusters}}) {
        foreach my $item (@$cluster) {
            print OUTPUT "@{$visualization_data{$item}}";
            print OUTPUT "\n";
        }
        print OUTPUT "\n\n";
    }
    close OUTPUT;
    my $plot;
    if (!defined $pause_time) {
        $plot = Graphics::GnuplotIF->new( persist => 1 );
    } else {
        $plot = Graphics::GnuplotIF->new();
    }
    my $arg_string = "";
    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd( "set noclip" );
        $plot->gnuplot_cmd( "set pointsize 2" );
        foreach my $i (0..$K-1) {
            my $j = $i + 1;
            $arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster (naive Bayes) $i\" with points lt $j pt $j, ";
        }
    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd( "set noclip" );
        $plot->gnuplot_cmd( "set pointsize 2" );
        foreach my $i (0..$K-1) {
            my $j = $i + 1;
            $arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster (naive Bayes) $i\" with points lt $j pt $j, ";
            my $ellipse_filename = "__contour_" . $i . ".dat";
            $arg_string .= "\"$ellipse_filename\" with line lt $j title \"\", ";
        }
    } elsif ($visualization_data_field_width == 1) {
        open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
        my @all_data = <INPUT>;
        close INPUT;
        @all_data = map {chomp $_; $_ =~ /\d/ ? $_ : "SEPERATOR" } @all_data;
        my $all_joined_data = join ':', @all_data;
        my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data;
        my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals);
        foreach my $i (0..@separated-1) {
            $separated[$i] =~ s/SEPERATOR//g;
            my @cluster_for_hist = split /:/, $separated[$i];
            @cluster_for_hist = grep $_, @cluster_for_hist;
            my ($minval,$maxval) = minmax(\@cluster_for_hist);
            push @all_minvals, $minval;
            push @all_maxvals, $maxval;
            push @all_clusters_for_hist, \@cluster_for_hist;
        }
        push @all_minmaxvals, @all_minvals;
        push @all_minmaxvals, @all_maxvals;
        my ($abs_minval,$abs_maxval) = minmax(\@all_minmaxvals);
        my $delta = ($abs_maxval - $abs_minval) / 100.0;
        $plot->gnuplot_cmd("set boxwidth 3");
        $plot->gnuplot_cmd("set style fill solid border -1");
        $plot->gnuplot_cmd("set ytics out nomirror");
        $plot->gnuplot_cmd("set style data histograms");
        $plot->gnuplot_cmd("set style histogram clustered");
        $plot->gnuplot_cmd("set title 'Clusters shown through histograms'");
        $plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
        foreach my $cindex (0..@all_clusters_for_hist-1) {
            my $filename = basename($master_datafile);
            my $temp_file = "__temp1dhist_" . "$cindex" . "_" . $filename;
            unlink $temp_file if -e $temp_file;
            open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
            print OUTPUT "Xstep histval\n";
            my @histogram = (0) x 100;
            foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) {
                $histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++;
            }
            foreach my $i (0..@histogram-1) {
                print OUTPUT "$i $histogram[$i]\n";
            }
            # $arg_string .= "\"$temp_file\" using 1:2 ti col smooth frequency with boxes lc $cindex, ";
            $arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, ";
            close OUTPUT;
        }
    }
    $arg_string = $arg_string =~ /^(.*),[ ]+$/;
    $arg_string = $1;
    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd( 'set terminal png color',
                            'set output "cluster_plot.png"');
        $plot->gnuplot_cmd( "splot $arg_string" );
    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd('set terminal png',
                           'set output "cluster_plot.png"');
        $plot->gnuplot_cmd( "plot $arg_string" );
    } elsif ($visualization_data_field_width == 1) {
        $plot->gnuplot_cmd('set terminal png',
                           'set output "cluster_plot.png"');
        $plot->gnuplot_cmd( "plot $arg_string" );
    }
}

# This method is for the visualization of the posterior class distributions.  In
# other words, this method allows us to see the soft clustering produced by the EM
# algorithm.  While much of the gnuplot logic here is the same as in the
# visualize_clusters() method, there are significant differences in how the data is
# pooled for the purpose of display.
sub visualize_distributions {
    my $self = shift;
    my $v_mask;
    my $pause_time;
    if (@_ == 1) {
        $v_mask = shift || die "visualization mask missing";
    } elsif (@_ == 2) {
        $v_mask = shift || die "visualization mask missing";
        $pause_time = shift;
    } else {
        die "visualize_distributions() called with wrong args";
    }
    my @v_mask = split //, $v_mask;
    my $visualization_mask_width = @v_mask;
    my $original_data_mask = $self->{_mask};
    my @mask = split //, $original_data_mask;
    my $data_field_width = scalar grep {$_ eq '1'} @mask;
    die "\n\nABORTED: The width of the visualization mask (including " .
        "all its 1s and 0s) must equal the width of the original mask " .
        "used for reading the data file (counting only the 1's)"
        if $visualization_mask_width != $data_field_width;
    my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
    if ($visualization_data_field_width == 2) {
        foreach my $cluster_index (0..$self->{_K}-1) {
            my $contour_filename = "__contour2_" . $cluster_index . ".dat";
            my $mean = $self->{_cluster_means}->[$cluster_index];
            my $covariance = $self->{_cluster_covariances}->[$cluster_index];
            my ($mux,$muy) = $mean->as_list();
            my ($varx,$sigmaxy) = $covariance->row(0)->as_list();
            my ($sigmayx,$vary) = $covariance->row(1)->as_list();
            die "Your covariance matrix does not look right"
                unless $sigmaxy == $sigmayx;
            my ($sigmax,$sigmay) = (sqrt($varx),sqrt($vary));
            my $argstring = <<"END";
set contour
mux = $mux
muy = $muy
sigmax = $sigmax
sigmay = $sigmay
sigmaxy = $sigmaxy
determinant = (sigmax**2)*(sigmay**2) - sigmaxy**2
exponent(x,y) = -0.5 * (1.0 / determinant) * ( ((x-mux)**2)*sigmay**2 + ((y-muy)**2)*sigmax**2 - 2*sigmaxy*(x-mux)*(y-muy) )
f(x,y) = exp( exponent(x,y) ) - 0.2
xmax = mux + 2 * sigmax
xmin = mux - 2 * sigmax
ymax = muy + 2 * sigmay
ymin = muy - 2 * sigmay
set xrange [ xmin : xmax ]
set yrange [ ymin : ymax ]
set isosamples 200
unset surface
set cntrparam levels discrete 0
set table \"$contour_filename\"
splot f(x,y)
unset table
END
            my $plot = Graphics::GnuplotIF->new();
            $plot->gnuplot_cmd( $argstring );
        }
    }
    my %visualization_data;
    while ( my ($record_id, $data) = each %{$self->{_data}} ) {
        my @fields = @$data;
        die "\nABORTED: Visualization mask size exceeds data record size"
            if $#v_mask > $#fields;
        my @data_fields;
        foreach my $i (0..@fields-1) {
            if ($v_mask[$i] eq '0') {
                next;
            } elsif ($v_mask[$i] eq '1') {
                push @data_fields, $fields[$i];
            } else {
                die "Misformed visualization mask. It can only have 1s and 0s";
            }
        }
        $visualization_data{ $record_id } = \@data_fields;
    }
    my $filename = basename($self->{_datafile});
    my $temp_file = "__temp2_" . $filename;
    unlink $temp_file if -e $temp_file;
    open OUTPUT, ">$temp_file"
        or die "Unable to open a temp file in this directory: $!";
    my @class_distributions;
    foreach my $cluster_index (0..$self->{_K}-1) {
        push @class_distributions, [];
    }
    foreach my $data_tag (@{$self->{_data_id_tags}}) {
        foreach my $cluster_index (0..$self->{_K}-1) {
            push @{$class_distributions[$cluster_index]}, $self->{_data}->{$data_tag}
                if $self->{_expected_class_probs}->{$data_tag}->[$cluster_index] > 0.2;
        }
    }
    foreach my $distribution (@class_distributions) {
        foreach my $item (@$distribution) {
            print OUTPUT "@$item";
            print OUTPUT "\n";
        }
        print OUTPUT "\n\n";
    }
    close OUTPUT;
    my $plot;
    if (!defined $pause_time) {
        $plot = Graphics::GnuplotIF->new( persist => 1 );
    } else {
        $plot = Graphics::GnuplotIF->new();
    }
    my $arg_string = "";
    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd( "set noclip" );
        $plot->gnuplot_cmd( "set pointsize 2" );
        foreach my $i (0..$self->{_K}-1) {
            my $j = $i + 1;
            $arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster $i (based on posterior probs)\" with points lt $j pt $j, ";
        }
    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd( "set noclip" );
        $plot->gnuplot_cmd( "set pointsize 2" );
        foreach my $i (0..$self->{_K}-1) {
            my $j = $i + 1;
            $arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster $i (based on posterior probs)\" with points lt $j pt $j, ";
            my $ellipse_filename = "__contour2_" . $i . ".dat";
            $arg_string .= "\"$ellipse_filename\" with line lt $j title \"\", ";
        }
    } elsif ($visualization_data_field_width == 1) {
        open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
        my @all_data = <INPUT>;
        close INPUT;
        @all_data = map {chomp $_; $_ =~ /\d/ ? $_ : "SEPERATOR" } @all_data;
        my $all_joined_data = join ':', @all_data;
        my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data;
        my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals);
        foreach my $i (0..@separated-1) {
            $separated[$i] =~ s/SEPERATOR//g;
            my @cluster_for_hist = split /:/, $separated[$i];
            @cluster_for_hist = grep $_, @cluster_for_hist;
            my ($minval,$maxval) = minmax(\@cluster_for_hist);
            push @all_minvals, $minval;
            push @all_maxvals, $maxval;
            push @all_clusters_for_hist, \@cluster_for_hist;
        }
        push @all_minmaxvals, @all_minvals;
        push @all_minmaxvals, @all_maxvals;
        my ($abs_minval,$abs_maxval) = minmax(\@all_minmaxvals);
        my $delta = ($abs_maxval - $abs_minval) / 100.0;
        $plot->gnuplot_cmd("set boxwidth 3");
        $plot->gnuplot_cmd("set style fill solid border -1");
        $plot->gnuplot_cmd("set ytics out nomirror");
        $plot->gnuplot_cmd("set style data histograms");
        $plot->gnuplot_cmd("set style histogram clustered");
        $plot->gnuplot_cmd("set title 'Individual distributions shown through histograms'");
        $plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
        foreach my $cindex (0..@all_clusters_for_hist-1) {
            my $localfilename = basename($filename);
            my $temp_file = "__temp1dhist_" . "$cindex" . "_" . $localfilename;
            unlink $temp_file if -e $temp_file;
            open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
            print OUTPUT "Xstep histval\n";

            my @histogram = (0) x 100;
            foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) {
                $histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++;
            }
            foreach my $i (0..@histogram-1) {
                print OUTPUT "$i $histogram[$i]\n";
            }
            # $arg_string .= "\"$temp_file\" using 1:2 ti col smooth frequency with boxes lc $cindex, ";
            $arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, ";
            close OUTPUT;
        }
    }
    $arg_string = $arg_string =~ /^(.*),[ ]+$/;
    $arg_string = $1;
    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd( "splot $arg_string" );
        $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd( "plot $arg_string" );
        $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
    } elsif ($visualization_data_field_width == 1) {
        $plot->gnuplot_cmd( "plot $arg_string" );
        $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
    }
}

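#  The pooling rule used by visualize_distributions() places a data point in the
#  display set of EVERY cluster whose posterior probability at that point exceeds
#  0.2, which is what makes the soft nature of EM clustering visible: one point may
#  show up in two displayed distributions at once.  A stand-alone sketch of that rule
#  --- not used by the module, operating on a hypothetical hash that maps data tags
#  to arrays of posterior probabilities --- looks like this:
sub _demo_pool_tags_by_posterior {
    my ($posteriors, $threshold) = @_;     # e.g. { a => [0.9, 0.1], b => [0.5, 0.5] }
    my @pools;
    foreach my $tag (sort keys %$posteriors) {
        my $probs = $posteriors->{$tag};
        foreach my $cluster_index (0..@$probs-1) {
            push @{$pools[$cluster_index]}, $tag
                if $probs->[$cluster_index] > $threshold;
        }
    }
    return \@pools;    # with threshold 0.2, tag "b" above lands in both pools
}
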
# This method is basically the same as the previous method, except that it is
# intended for making PNG files from the distributions.
sub plot_hardcopy_distributions {
    my $self = shift;
    my $v_mask;
    my $pause_time;
    if (@_ == 1) {
        $v_mask = shift || die "visualization mask missing";
    } elsif (@_ == 2) {
        $v_mask = shift || die "visualization mask missing";
        $pause_time = shift;
    } else {
        die "plot_hardcopy_distributions() called with wrong args";
    }
    my @v_mask = split //, $v_mask;
    my $visualization_mask_width = @v_mask;
    my $original_data_mask = $self->{_mask};
    my @mask = split //, $original_data_mask;
    my $data_field_width = scalar grep {$_ eq '1'} @mask;
    die "\n\nABORTED: The width of the visualization mask (including " .
        "all its 1s and 0s) must equal the width of the original mask " .
        "used for reading the data file (counting only the 1's)"
        if $visualization_mask_width != $data_field_width;
    my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
    if ($visualization_data_field_width == 2) {
        foreach my $cluster_index (0..$self->{_K}-1) {
            my $contour_filename = "__contour2_" . $cluster_index . ".dat";
            my $mean = $self->{_cluster_means}->[$cluster_index];
            my $covariance = $self->{_cluster_covariances}->[$cluster_index];
            my ($mux,$muy) = $mean->as_list();
            my ($varx,$sigmaxy) = $covariance->row(0)->as_list();
            my ($sigmayx,$vary) = $covariance->row(1)->as_list();
            die "Your covariance matrix does not look right"
                unless $sigmaxy == $sigmayx;
            my ($sigmax,$sigmay) = (sqrt($varx),sqrt($vary));
            my $argstring = <<"END";
set contour
mux = $mux
muy = $muy
sigmax = $sigmax
sigmay = $sigmay
sigmaxy = $sigmaxy
determinant = (sigmax**2)*(sigmay**2) - sigmaxy**2
exponent(x,y) = -0.5 * (1.0 / determinant) * ( ((x-mux)**2)*sigmay**2 + ((y-muy)**2)*sigmax**2 - 2*sigmaxy*(x-mux)*(y-muy) )
f(x,y) = exp( exponent(x,y) ) - 0.2
xmax = mux + 2 * sigmax
xmin = mux - 2 * sigmax
ymax = muy + 2 * sigmay
ymin = muy - 2 * sigmay
set xrange [ xmin : xmax ]
set yrange [ ymin : ymax ]
set isosamples 200
unset surface
set cntrparam levels discrete 0
set table \"$contour_filename\"
splot f(x,y)
unset table
END
            my $plot = Graphics::GnuplotIF->new();
            $plot->gnuplot_cmd( $argstring );
        }
    }
    my %visualization_data;
    while ( my ($record_id, $data) = each %{$self->{_data}} ) {
        my @fields = @$data;
        die "\nABORTED: Visualization mask size exceeds data record size"
            if $#v_mask > $#fields;
        my @data_fields;
        foreach my $i (0..@fields-1) {
            if ($v_mask[$i] eq '0') {
                next;
            } elsif ($v_mask[$i] eq '1') {
                push @data_fields, $fields[$i];
            } else {
                die "Misformed visualization mask. It can only have 1s and 0s";
            }
        }
        $visualization_data{ $record_id } = \@data_fields;
    }
    my $filename = basename($self->{_datafile});
    my $temp_file = "__temp2_" . $filename;
    unlink $temp_file if -e $temp_file;
    open OUTPUT, ">$temp_file"
        or die "Unable to open a temp file in this directory: $!";
    my @class_distributions;
    foreach my $cluster_index (0..$self->{_K}-1) {
        push @class_distributions, [];
    }
    foreach my $data_tag (@{$self->{_data_id_tags}}) {
        foreach my $cluster_index (0..$self->{_K}-1) {
            push @{$class_distributions[$cluster_index]}, $self->{_data}->{$data_tag}
                if $self->{_expected_class_probs}->{$data_tag}->[$cluster_index] > 0.2;
        }
    }
    foreach my $distribution (@class_distributions) {
        foreach my $item (@$distribution) {
            print OUTPUT "@$item";
            print OUTPUT "\n";
        }
        print OUTPUT "\n\n";
    }
    close OUTPUT;
    my $plot;
    if (!defined $pause_time) {
        $plot = Graphics::GnuplotIF->new( persist => 1 );
    } else {
        $plot = Graphics::GnuplotIF->new();
    }
    my $arg_string = "";
    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd( "set noclip" );
        $plot->gnuplot_cmd( "set pointsize 2" );
        foreach my $i (0..$self->{_K}-1) {
            my $j = $i + 1;
            $arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster $i (based on posterior probs)\" with points lt $j pt $j, ";
        }
    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd( "set noclip" );
        $plot->gnuplot_cmd( "set pointsize 2" );
        foreach my $i (0..$self->{_K}-1) {
            my $j = $i + 1;
            $arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster $i (based on posterior probs)\" with points lt $j pt $j, ";
            my $ellipse_filename = "__contour2_" . $i . ".dat";
            $arg_string .= "\"$ellipse_filename\" with line lt $j title \"\", ";
        }
    } elsif ($visualization_data_field_width == 1) {
        open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
        my @all_data = <INPUT>;
        close INPUT;
        @all_data = map {chomp $_; $_ =~ /\d/ ? $_ : "SEPERATOR" } @all_data;
        my $all_joined_data = join ':', @all_data;
        my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data;
        my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals);
        foreach my $i (0..@separated-1) {
            $separated[$i] =~ s/SEPERATOR//g;
            my @cluster_for_hist = split /:/, $separated[$i];
            @cluster_for_hist = grep $_, @cluster_for_hist;
            my ($minval,$maxval) = minmax(\@cluster_for_hist);
            push @all_minvals, $minval;
            push @all_maxvals, $maxval;
            push @all_clusters_for_hist, \@cluster_for_hist;
        }
        push @all_minmaxvals, @all_minvals;
        push @all_minmaxvals, @all_maxvals;
        my ($abs_minval,$abs_maxval) = minmax(\@all_minmaxvals);
        my $delta = ($abs_maxval - $abs_minval) / 100.0;
        $plot->gnuplot_cmd("set boxwidth 3");
        $plot->gnuplot_cmd("set style fill solid border -1");
        $plot->gnuplot_cmd("set ytics out nomirror");
        $plot->gnuplot_cmd("set style data histograms");
        $plot->gnuplot_cmd("set style histogram clustered");
        $plot->gnuplot_cmd("set title 'Individual distributions shown through histograms'");
        $plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
        foreach my $cindex (0..@all_clusters_for_hist-1) {
            my $localfilename = basename($filename);
            my $temp_file = "__temp1dhist_" . "$cindex" . "_" . $localfilename;
            unlink $temp_file if -e $temp_file;
            open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
            print OUTPUT "Xstep histval\n";

            my @histogram = (0) x 100;
            foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) {
                $histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++;
            }
            foreach my $i (0..@histogram-1) {
                print OUTPUT "$i $histogram[$i]\n";
            }
            $arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, ";
            close OUTPUT;
        }
    }
    $arg_string = $arg_string =~ /^(.*),[ ]+$/;
    $arg_string = $1;

    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd( 'set terminal png',
                            'set output "posterior_prob_plot.png"');
        $plot->gnuplot_cmd( "splot $arg_string" );
    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd( 'set terminal png',
                            'set output "posterior_prob_plot.png"');
        $plot->gnuplot_cmd( "plot $arg_string" );
    } elsif ($visualization_data_field_width == 1) {
        $plot->gnuplot_cmd( 'set terminal png',
                            'set output "posterior_prob_plot.png"');
        $plot->gnuplot_cmd( "plot $arg_string" );
    }
}

# The method shown below should be called only AFTER you have called the method |
1980
|
|
|
|
|
|
|
# read_data_from_file(). The visualize_data() is meant for the visualization of the |
1981
|
|
|
|
|
|
|
# original data in its various 2D or 3D subspaces. |
1982
|
|
|
|
|
|
|
sub visualize_data { |
1983
|
|
|
|
|
|
|
my $self = shift; |
1984
|
|
|
|
|
|
|
my $v_mask = shift || die "visualization mask missing"; |
1985
|
|
|
|
|
|
|
|
1986
|
|
|
|
|
|
|
my $master_datafile = $self->{_datafile}; |
1987
|
|
|
|
|
|
|
|
1988
|
|
|
|
|
|
|
my @v_mask = split //, $v_mask; |
1989
|
|
|
|
|
|
|
my $visualization_mask_width = @v_mask; |
1990
|
|
|
|
|
|
|
my $original_data_mask = $self->{_mask}; |
1991
|
|
|
|
|
|
|
my @mask = split //, $original_data_mask; |
1992
|
|
|
|
|
|
|
my $data_field_width = scalar grep {$_ eq '1'} @mask; |
1993
|
|
|
|
|
|
|
die "\n\nABORTED: The width of the visualization mask (including " . |
1994
|
|
|
|
|
|
|
"all its 1s and 0s) must equal the width of the original mask " . |
1995
|
|
|
|
|
|
|
"used for reading the data file (counting only the 1's)" |
1996
|
|
|
|
|
|
|
if $visualization_mask_width != $data_field_width; |
1997
|
|
|
|
|
|
|
my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask; |
1998
|
|
|
|
|
|
|
my %visualization_data; |
1999
|
|
|
|
|
|
|
my $data_source = $self->{_data}; |
2000
|
|
|
|
|
|
|
while ( my ($record_id, $data) = each %{$data_source} ) { |
2001
|
|
|
|
|
|
|
my @fields = @$data; |
2002
|
|
|
|
|
|
|
die "\nABORTED: Visualization mask size exceeds data record size" |
2003
|
|
|
|
|
|
|
if $#v_mask > $#fields; |
2004
|
|
|
|
|
|
|
my @data_fields; |
2005
|
|
|
|
|
|
|
foreach my $i (0..@fields-1) { |
2006
|
|
|
|
|
|
|
if ($v_mask[$i] eq '0') { |
2007
|
|
|
|
|
|
|
next; |
2008
|
|
|
|
|
|
|
} elsif ($v_mask[$i] eq '1') { |
2009
|
|
|
|
|
|
|
push @data_fields, $fields[$i]; |
2010
|
|
|
|
|
|
|
} else { |
2011
|
|
|
|
|
|
|
die "Misformed visualization mask. It can only have 1s and 0s"; |
2012
|
|
|
|
|
|
|
} |
2013
|
|
|
|
|
|
|
} |
2014
|
|
|
|
|
|
|
$visualization_data{ $record_id } = \@data_fields; |
2015
|
|
|
|
|
|
|
} |
2016
|
|
|
|
|
|
|
my $filename = basename($master_datafile); |
2017
|
|
|
|
|
|
|
my $temp_file; |
2018
|
|
|
|
|
|
|
$temp_file = "__temp_data_" . $filename; |
2019
|
|
|
|
|
|
|
unlink $temp_file if -e $temp_file; |
2020
|
|
|
|
|
|
|
open OUTPUT, ">$temp_file" |
2021
|
|
|
|
|
|
|
or die "Unable to open a temp file in this directory: $!"; |
2022
|
|
|
|
|
|
|
foreach my $datapoint (values %visualization_data) { |
2023
|
|
|
|
|
|
|
print OUTPUT "@$datapoint"; |
2024
|
|
|
|
|
|
|
print OUTPUT "\n"; |
2025
|
|
|
|
|
|
|
} |
2026
|
|
|
|
|
|
|
close OUTPUT; |
2027
|
|
|
|
|
|
|
my $plot = Graphics::GnuplotIF->new( persist => 1 ); |
2028
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "set noclip" ); |
2029
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "set pointsize 2" ); |
2030
|
|
|
|
|
|
|
my $plot_title = '"original data provided for EM"'; |
2031
|
|
|
|
|
|
|
my $arg_string ; |
2032
|
|
|
|
|
|
|
if ($visualization_data_field_width > 2) { |
2033
|
|
|
|
|
|
|
$arg_string = "\"$temp_file\" using 1:2:3 title $plot_title with points lt -1 pt 1"; |
2034
|
|
|
|
|
|
|
} elsif ($visualization_data_field_width == 2) { |
2035
|
|
|
|
|
|
|
$arg_string = "\"$temp_file\" using 1:2 title $plot_title with points lt -1 pt 1"; |
2036
|
|
|
|
|
|
|
} elsif ($visualization_data_field_width == 1 ) { |
2037
|
|
|
|
|
|
|
open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!"; |
2038
|
|
|
|
|
|
|
my @all_data = <INPUT>; |
2039
|
|
|
|
|
|
|
close INPUT; |
2040
|
|
|
|
|
|
|
@all_data = map {chomp $_; $_} @all_data; |
2041
|
|
|
|
|
|
|
@all_data = grep $_, @all_data; |
2042
|
|
|
|
|
|
|
my ($minval,$maxval) = minmax(\@all_data); |
2043
|
|
|
|
|
|
|
my $delta = ($maxval - $minval) / 100.0; |
2044
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set boxwidth 3"); |
2045
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set style fill solid border -1"); |
2046
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set ytics out nomirror"); |
2047
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set style data histograms"); |
2048
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set style histogram clustered"); |
2049
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set title 'Overall distribution of 1D data'"); |
2050
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror"); |
2051
|
|
|
|
|
|
|
my $localfilename = basename($filename); |
2052
|
|
|
|
|
|
|
my $temp_file = "__temp1dhist_" . $localfilename; |
2053
|
|
|
|
|
|
|
unlink $temp_file if -e $temp_file; |
2054
|
|
|
|
|
|
|
open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!"; |
2055
|
|
|
|
|
|
|
print OUTPUT "Xstep histval\n"; |
2056
|
|
|
|
|
|
|
my @histogram = (0) x 100; |
2057
|
|
|
|
|
|
|
foreach my $i (0..@all_data-1) { |
2058
|
|
|
|
|
|
|
$histogram[int( ($all_data[$i] - $minval) / $delta )]++; |
2059
|
|
|
|
|
|
|
} |
2060
|
|
|
|
|
|
|
foreach my $i (0..@histogram-1) { |
2061
|
|
|
|
|
|
|
print OUTPUT "$i $histogram[$i]\n"; |
2062
|
|
|
|
|
|
|
} |
2063
|
|
|
|
|
|
|
$arg_string = "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc rgb 'green'"; |
2064
|
|
|
|
|
|
|
close OUTPUT; |
2065
|
|
|
|
|
|
|
} |
2066
|
|
|
|
|
|
|
if ($visualization_data_field_width > 2) { |
2067
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "splot $arg_string" ); |
2068
|
|
|
|
|
|
|
} elsif ($visualization_data_field_width == 2) { |
2069
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "plot $arg_string" ); |
2070
|
|
|
|
|
|
|
} elsif ($visualization_data_field_width == 1) { |
2071
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "plot $arg_string" ); |
2072
|
|
|
|
|
|
|
} |
2073
|
|
|
|
|
|
|
} |
2074
|
|
|
|
|
|
|
|
2075
|
|
|
|
|
|
|
# This method is the same as the one shown above, except that it is meant for |
2076
|
|
|
|
|
|
|
# creating PNG files of the displays. |
2077
|
|
|
|
|
|
|
sub plot_hardcopy_data { |
2078
|
|
|
|
|
|
|
my $self = shift; |
2079
|
|
|
|
|
|
|
my $v_mask = shift || die "visualization mask missing"; |
2080
|
|
|
|
|
|
|
my $master_datafile = $self->{_datafile}; |
2081
|
|
|
|
|
|
|
my @v_mask = split //, $v_mask; |
2082
|
|
|
|
|
|
|
my $visualization_mask_width = @v_mask; |
2083
|
|
|
|
|
|
|
my $original_data_mask = $self->{_mask}; |
2084
|
|
|
|
|
|
|
my @mask = split //, $original_data_mask; |
2085
|
|
|
|
|
|
|
my $data_field_width = scalar grep {$_ eq '1'} @mask; |
2086
|
|
|
|
|
|
|
die "\n\nABORTED: The width of the visualization mask (including " . |
2087
|
|
|
|
|
|
|
"all its 1s and 0s) must equal the width of the original mask " . |
2088
|
|
|
|
|
|
|
"used for reading the data file (counting only the 1's)" |
2089
|
|
|
|
|
|
|
if $visualization_mask_width != $data_field_width; |
2090
|
|
|
|
|
|
|
my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask; |
2091
|
|
|
|
|
|
|
my %visualization_data; |
2092
|
|
|
|
|
|
|
my $data_source = $self->{_data}; |
2093
|
|
|
|
|
|
|
while ( my ($record_id, $data) = each %{$data_source} ) { |
2094
|
|
|
|
|
|
|
my @fields = @$data; |
2095
|
|
|
|
|
|
|
die "\nABORTED: Visualization mask size exceeds data record size" |
2096
|
|
|
|
|
|
|
if $#v_mask > $#fields; |
2097
|
|
|
|
|
|
|
my @data_fields; |
2098
|
|
|
|
|
|
|
foreach my $i (0..@fields-1) { |
2099
|
|
|
|
|
|
|
if ($v_mask[$i] eq '0') { |
2100
|
|
|
|
|
|
|
next; |
2101
|
|
|
|
|
|
|
} elsif ($v_mask[$i] eq '1') { |
2102
|
|
|
|
|
|
|
push @data_fields, $fields[$i]; |
2103
|
|
|
|
|
|
|
} else { |
2104
|
|
|
|
|
|
|
die "Misformed visualization mask. It can only have 1s and 0s"; |
2105
|
|
|
|
|
|
|
} |
2106
|
|
|
|
|
|
|
} |
2107
|
|
|
|
|
|
|
$visualization_data{ $record_id } = \@data_fields; |
2108
|
|
|
|
|
|
|
} |
2109
|
|
|
|
|
|
|
my $filename = basename($master_datafile); |
2110
|
|
|
|
|
|
|
my $temp_file; |
2111
|
|
|
|
|
|
|
$temp_file = "__temp_data_" . $filename; |
2112
|
|
|
|
|
|
|
unlink $temp_file if -e $temp_file; |
2113
|
|
|
|
|
|
|
open OUTPUT, ">$temp_file" |
2114
|
|
|
|
|
|
|
or die "Unable to open a temp file in this directory: $!"; |
2115
|
|
|
|
|
|
|
foreach my $datapoint (values %visualization_data) { |
2116
|
|
|
|
|
|
|
print OUTPUT "@$datapoint"; |
2117
|
|
|
|
|
|
|
print OUTPUT "\n"; |
2118
|
|
|
|
|
|
|
} |
2119
|
|
|
|
|
|
|
close OUTPUT; |
2120
|
|
|
|
|
|
|
|
2121
|
|
|
|
|
|
|
my $plot = Graphics::GnuplotIF->new( persist => 1 ); |
2122
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "set noclip" ); |
2123
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "set pointsize 2" ); |
2124
|
|
|
|
|
|
|
my $plot_title = '"original data provided for EM"'; |
2125
|
|
|
|
|
|
|
my $arg_string ; |
2126
|
|
|
|
|
|
|
if ($visualization_data_field_width > 2) { |
2127
|
|
|
|
|
|
|
$arg_string = "\"$temp_file\" using 1:2:3 title $plot_title with points lt -1 pt 1"; |
2128
|
|
|
|
|
|
|
} elsif ($visualization_data_field_width == 2) { |
2129
|
|
|
|
|
|
|
$arg_string = "\"$temp_file\" using 1:2 title $plot_title with points lt -1 pt 1"; |
2130
|
|
|
|
|
|
|
} elsif ($visualization_data_field_width == 1 ) { |
2131
|
|
|
|
|
|
|
open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!"; |
2132
|
|
|
|
|
|
|
my @all_data = <INPUT>; |
2133
|
|
|
|
|
|
|
close INPUT; |
2134
|
|
|
|
|
|
|
@all_data = map {chomp $_; $_} @all_data; |
2135
|
|
|
|
|
|
|
@all_data = grep $_, @all_data; |
2136
|
|
|
|
|
|
|
my ($minval,$maxval) = minmax(\@all_data); |
2137
|
|
|
|
|
|
|
my $delta = ($maxval - $minval) / 100.0; |
2138
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set boxwidth 3"); |
2139
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set style fill solid border -1"); |
2140
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set ytics out nomirror"); |
2141
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set style data histograms"); |
2142
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set style histogram clustered"); |
2143
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set title 'Overall distribution of 1D data'"); |
2144
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror"); |
2145
|
|
|
|
|
|
|
my $localfilename = basename($filename); |
2146
|
|
|
|
|
|
|
my $temp_file = "__temp1dhist_" . $localfilename; |
2147
|
|
|
|
|
|
|
unlink $temp_file if -e $temp_file; |
2148
|
|
|
|
|
|
|
open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!"; |
2149
|
|
|
|
|
|
|
print OUTPUT "Xstep histval\n"; |
2150
|
|
|
|
|
|
|
my @histogram = (0) x 100; |
2151
|
|
|
|
|
|
|
foreach my $i (0..@all_data-1) { |
2152
|
|
|
|
|
|
|
$histogram[int( ($all_data[$i] - $minval) / $delta )]++; |
2153
|
|
|
|
|
|
|
} |
2154
|
|
|
|
|
|
|
foreach my $i (0..@histogram-1) { |
2155
|
|
|
|
|
|
|
print OUTPUT "$i $histogram[$i]\n"; |
2156
|
|
|
|
|
|
|
} |
2157
|
|
|
|
|
|
|
$arg_string = "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc rgb 'green'"; |
2158
|
|
|
|
|
|
|
close OUTPUT; |
2159
|
|
|
|
|
|
|
} |
2160
|
|
|
|
|
|
|
if ($visualization_data_field_width > 2) { |
2161
|
|
|
|
|
|
|
$plot->gnuplot_cmd( 'set terminal png', |
2162
|
|
|
|
|
|
|
'set output "data_scatter_plot.png"'); |
2163
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "splot $arg_string" ); |
2164
|
|
|
|
|
|
|
} elsif ($visualization_data_field_width == 2) { |
2165
|
|
|
|
|
|
|
$plot->gnuplot_cmd( 'set terminal png', |
2166
|
|
|
|
|
|
|
'set output "data_scatter_plot.png"'); |
2167
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "plot $arg_string" ); |
2168
|
|
|
|
|
|
|
} elsif ($visualization_data_field_width == 1) { |
2169
|
|
|
|
|
|
|
$plot->gnuplot_cmd( 'set terminal png', |
2170
|
|
|
|
|
|
|
'set output "data_scatter_plot.png"'); |
2171
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "plot $arg_string" ); |
2172
|
|
|
|
|
|
|
} |
2173
|
|
|
|
|
|
|
} |
2174
|
|
|
|
|
|
|
|
2175
|
|
|
|
|
|
|
|
2176
|
|
|
|
|
|
|
################### Generating Synthetic Data for Clustering ################### |
2177
|
|
|
|
|
|
|
|
2178
|
|
|
|
|
|
|
# The data generated corresponds to a multivariate distribution. The mean and the |
2179
|
|
|
|
|
|
|
# covariance of each Gaussian in the distribution are specified individually in a |
2180
|
|
|
|
|
|
|
# parameter file. The parameter file must also state the prior probabilities to be |
2181
|
|
|
|
|
|
|
# associated with each Gaussian. See the example parameter file param1.txt in the |
2182
|
|
|
|
|
|
|
# examples directory. Just edit this file for your own needs. |
2183
|
|
|
|
|
|
|
# |
2184
|
|
|
|
|
|
|
# The multivariate random numbers are generated by calling the Math::Random module. |
2185
|
|
|
|
|
|
|
# As you would expect, that module will insist that the covariance matrix you |
2186
|
|
|
|
|
|
|
# specify be symmetric and positive definite. |
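# |
# As an illustration only (the authoritative syntax is in param1.txt in the |
# examples directory), a parameter file for two 2D Gaussians could look like |
# the fallback string used for testing further below: |
# |
#     priors 0.6 0.4 |
#     cluster 0 0   1 0 0 1 |
#     cluster 5 5   1 0 0 1 |
# |
# i.e., the keyword "priors" followed by the mixing weights that must sum to |
# 1, then one "cluster" entry per Gaussian giving its mean followed by its |
# covariance matrix flattened in row-major order. |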
2187
|
|
|
|
|
|
|
sub cluster_data_generator { |
2188
|
|
|
|
|
|
|
my $class = shift; |
2189
|
|
|
|
|
|
|
die "illegal call of a class method" |
2190
|
|
|
|
|
|
|
unless $class eq 'Algorithm::ExpectationMaximization'; |
2191
|
|
|
|
|
|
|
my %args = @_; |
2192
|
|
|
|
|
|
|
my $input_parameter_file = $args{input_parameter_file}; |
2193
|
|
|
|
|
|
|
my $output_file = $args{output_datafile}; |
2194
|
|
|
|
|
|
|
my $N = $args{total_number_of_data_points}; |
2195
|
|
|
|
|
|
|
my @all_params; |
2196
|
|
|
|
|
|
|
my $param_string; |
2197
|
|
|
|
|
|
|
if (defined $input_parameter_file) { |
2198
|
|
|
|
|
|
|
open INPUT, $input_parameter_file or die "unable to open parameter file: $!"; |
2199
|
|
|
|
|
|
|
@all_params = <INPUT>; |
2200
|
|
|
|
|
|
|
@all_params = grep { $_ !~ /^[ ]*#/ } @all_params; |
2201
|
|
|
|
|
|
|
chomp @all_params; |
2202
|
|
|
|
|
|
|
$param_string = join ' ', @all_params; |
2203
|
|
|
|
|
|
|
} else { |
2204
|
|
|
|
|
|
|
# Just for testing. Used in t/test.t |
2205
|
|
|
|
|
|
|
$param_string = "priors 0.34 0.33 0.33 " . |
2206
|
|
|
|
|
|
|
"cluster 5 0 0 1 0 0 0 1 0 0 0 1 " . |
2207
|
|
|
|
|
|
|
"cluster 0 5 0 1 0 0 0 1 0 0 0 1 " . |
2208
|
|
|
|
|
|
|
"cluster 0 0 5 1 0 0 0 1 0 0 0 1"; |
2209
|
|
|
|
|
|
|
} |
2210
|
|
|
|
|
|
|
my ($priors_string) = $param_string =~ /priors(.+?)cluster/; |
2211
|
|
|
|
|
|
|
croak "You did not specify the prior probabilities in the parameter file" |
2212
|
|
|
|
|
|
|
unless $priors_string; |
2213
|
|
|
|
|
|
|
my @priors = split /\s+/, $priors_string; |
2214
|
|
|
|
|
|
|
@priors = grep {/$_num_regex/; $_} @priors; |
2215
|
|
|
|
|
|
|
my $sum = 0; |
2216
|
|
|
|
|
|
|
foreach my $prior (@priors) { |
2217
|
|
|
|
|
|
|
$sum += $prior; |
2218
|
|
|
|
|
|
|
} |
2219
|
|
|
|
|
|
|
croak "Your priors in the parameter file do not add up to 1" unless $sum == 1; |
2220
|
|
|
|
|
|
|
my ($rest_of_string) = $param_string =~ /priors\s*$priors_string(.*)$/; |
2221
|
|
|
|
|
|
|
my @cluster_strings = split /[ ]*cluster[ ]*/, $rest_of_string; |
2222
|
|
|
|
|
|
|
@cluster_strings = grep $_, @cluster_strings; |
2223
|
|
|
|
|
|
|
|
2224
|
|
|
|
|
|
|
my $K = @cluster_strings; |
2225
|
|
|
|
|
|
|
croak "Too many clusters requested" if $K > 12; |
2226
|
|
|
|
|
|
|
croak "Mismatch between the number of values for priors and the number " . |
2227
|
|
|
|
|
|
|
"of clusters" unless $K == @priors; |
2228
|
|
|
|
|
|
|
my @point_labels = ('a'..'z'); |
2229
|
|
|
|
|
|
|
print "Prior probabilities recorded from param file: @priors\n"; |
2230
|
|
|
|
|
|
|
print "Number of Gaussians used for the synthetic data: $K\n"; |
2231
|
|
|
|
|
|
|
my @means; |
2232
|
|
|
|
|
|
|
my @covariances; |
2233
|
|
|
|
|
|
|
my $data_dimension; |
2234
|
|
|
|
|
|
|
foreach my $i (0..$K-1) { |
2235
|
|
|
|
|
|
|
my @num_strings = split / /, $cluster_strings[$i]; |
2236
|
|
|
|
|
|
|
my @cluster_mean = map {/$_num_regex/;$_} split / /, $num_strings[0]; |
2237
|
|
|
|
|
|
|
$data_dimension = @cluster_mean; |
2238
|
|
|
|
|
|
|
push @means, \@cluster_mean; |
2239
|
|
|
|
|
|
|
my @covariance_nums = map {/$_num_regex/;$_} split / /, $num_strings[1]; |
2240
|
|
|
|
|
|
|
croak "dimensionality error" if @covariance_nums != |
2241
|
|
|
|
|
|
|
($data_dimension ** 2); |
2242
|
|
|
|
|
|
|
my $cluster_covariance; |
2243
|
|
|
|
|
|
|
foreach my $j (0..$data_dimension-1) { |
2244
|
|
|
|
|
|
|
foreach my $k (0..$data_dimension-1) { |
2245
|
|
|
|
|
|
|
$cluster_covariance->[$j]->[$k] = |
2246
|
|
|
|
|
|
|
$covariance_nums[$j*$data_dimension + $k]; |
2247
|
|
|
|
|
|
|
} |
2248
|
|
|
|
|
|
|
} |
2249
|
|
|
|
|
|
|
push @covariances, $cluster_covariance; |
2250
|
|
|
|
|
|
|
} |
2251
|
|
|
|
|
|
|
random_seed_from_phrase( 'hellomellojello' ); |
2252
|
|
|
|
|
|
|
my @data_dump; |
2253
|
|
|
|
|
|
|
foreach my $i (0..$K-1) { |
2254
|
|
|
|
|
|
|
my @m = @{shift @means}; |
2255
|
|
|
|
|
|
|
my @covar = @{shift @covariances}; |
2256
|
|
|
|
|
|
|
my @new_data = Math::Random::random_multivariate_normal( |
2257
|
|
|
|
|
|
|
int($N * $priors[$i]), @m, @covar ); |
2258
|
|
|
|
|
|
|
my $p = 0; |
2259
|
|
|
|
|
|
|
my $label = $point_labels[$i]; |
2260
|
|
|
|
|
|
|
@new_data = map {unshift @$_, $label.$p; $p++; $_} @new_data; |
2261
|
|
|
|
|
|
|
push @data_dump, @new_data; |
2262
|
|
|
|
|
|
|
} |
2263
|
|
|
|
|
|
|
fisher_yates_shuffle( \@data_dump ); |
2264
|
|
|
|
|
|
|
open OUTPUT, ">$output_file"; |
2265
|
|
|
|
|
|
|
print OUTPUT "\#Data generated from the parameter file: $input_parameter_file\n" |
2266
|
|
|
|
|
|
|
if $input_parameter_file; |
2267
|
|
|
|
|
|
|
print OUTPUT "\#Total number of data points in this file: $N\n"; |
2268
|
|
|
|
|
|
|
print OUTPUT "\#Prior class probabilities for this data: @priors\n"; |
2269
|
|
|
|
|
|
|
foreach my $ele (@data_dump) { |
2270
|
|
|
|
|
|
|
foreach my $coord ( @$ele ) { |
2271
|
|
|
|
|
|
|
print OUTPUT "$coord "; |
2272
|
|
|
|
|
|
|
} |
2273
|
|
|
|
|
|
|
print OUTPUT "\n"; |
2274
|
|
|
|
|
|
|
} |
2275
|
|
|
|
|
|
|
print "Data written out to file $output_file\n"; |
2276
|
|
|
|
|
|
|
close OUTPUT; |
2277
|
|
|
|
|
|
|
} |
2278
|
|
|
|
|
|
|
|
2279
|
|
|
|
|
|
|
sub add_point_coords { |
2280
|
|
|
|
|
|
|
my $self = shift; |
2281
|
|
|
|
|
|
|
my @arr_of_ids = @{shift @_}; # array of data element names |
2282
|
|
|
|
|
|
|
my @result; |
2283
|
|
|
|
|
|
|
my $data_dimensionality = $self->{_data_dimensions}; |
2284
|
|
|
|
|
|
|
foreach my $i (0..$data_dimensionality-1) { |
2285
|
|
|
|
|
|
|
$result[$i] = 0.0; |
2286
|
|
|
|
|
|
|
} |
2287
|
|
|
|
|
|
|
foreach my $id (@arr_of_ids) { |
2288
|
|
|
|
|
|
|
my $ele = $self->{_data}->{$id}; |
2289
|
|
|
|
|
|
|
my $i = 0; |
2290
|
|
|
|
|
|
|
foreach my $component (@$ele) { |
2291
|
|
|
|
|
|
|
$result[$i] += $component; |
2292
|
|
|
|
|
|
|
$i++; |
2293
|
|
|
|
|
|
|
} |
2294
|
|
|
|
|
|
|
} |
2295
|
|
|
|
|
|
|
return \@result; |
2296
|
|
|
|
|
|
|
} |
2297
|
|
|
|
|
|
|
|
2298
|
|
|
|
|
|
|
###################### Support Routines ######################## |
2299
|
|
|
|
|
|
|
|
2300
|
|
|
|
|
|
|
# compute the outer product of two column vectors |
2301
|
|
|
|
|
|
|
sub outer_product { |
2302
|
|
|
|
|
|
|
my $vec1 = shift; |
2303
|
|
|
|
|
|
|
my $vec2 = shift; |
2304
|
|
|
|
|
|
|
my ($nrows1, $ncols1) = ($vec1->rows(), $vec1->cols()); |
2305
|
|
|
|
|
|
|
my ($nrows2, $ncols2) = ($vec2->rows(), $vec2->cols()); |
2306
|
|
|
|
|
|
|
die "Outer product operation called with non-matching vectors" |
2307
|
|
|
|
|
|
|
unless $ncols1 == 1 && $ncols2 == 1 && $nrows1 == $nrows2; |
2308
|
|
|
|
|
|
|
my @vec_arr1 = $vec1->as_list(); |
2309
|
|
|
|
|
|
|
my @vec_arr2 = $vec2->as_list(); |
2310
|
|
|
|
|
|
|
my $outer_product = Math::GSL::Matrix->new($nrows1, $nrows2); |
2311
|
|
|
|
|
|
|
foreach my $index (0..$nrows1-1) { |
2312
|
|
|
|
|
|
|
my @new_row = map $vec_arr1[$index] * $_, @vec_arr2; |
2313
|
|
|
|
|
|
|
$outer_product->set_row($index, \@new_row); |
2314
|
|
|
|
|
|
|
} |
2315
|
|
|
|
|
|
|
return $outer_product; |
2316
|
|
|
|
|
|
|
} |
2317
|
|
|
|
|
|
|
|
2318
|
|
|
|
|
|
|
sub get_index_at_value { |
2319
|
|
|
|
|
|
|
my $value = shift; |
2320
|
|
|
|
|
|
|
my @array = @{shift @_}; |
2321
|
|
|
|
|
|
|
foreach my $i (0..@array-1) { |
2322
|
|
|
|
|
|
|
return $i if $value == $array[$i]; |
2323
|
|
|
|
|
|
|
} |
2324
|
|
|
|
|
|
|
} |
2325
|
|
|
|
|
|
|
|
2326
|
|
|
|
|
|
|
# This routine is really not necessary in light of the new `~~' operator in Perl. |
2327
|
|
|
|
|
|
|
# Will use the new operator in the next version. |
2328
|
|
|
|
|
|
|
sub vector_equal { |
2329
|
|
|
|
|
|
|
my $vec1 = shift; |
2330
|
|
|
|
|
|
|
my $vec2 = shift; |
2331
|
|
|
|
|
|
|
die "wrong data types for distance calculation" if @$vec1 != @$vec2; |
2332
|
|
|
|
|
|
|
foreach my $i (0..@$vec1-1){ |
2333
|
|
|
|
|
|
|
return 0 if $vec1->[$i] != $vec2->[$i]; |
2334
|
|
|
|
|
|
|
} |
2335
|
|
|
|
|
|
|
return 1; |
2336
|
|
|
|
|
|
|
} |
2337
|
|
|
|
|
|
|
|
2338
|
|
|
|
|
|
|
sub compare_array_floats { |
2339
|
|
|
|
|
|
|
my $vec1 = shift; |
2340
|
|
|
|
|
|
|
my $vec2 = shift; |
2341
|
|
|
|
|
|
|
foreach my $i (0..@$vec1-1){ |
2342
|
|
|
|
|
|
|
return 0 if abs($vec1->[$i] - $vec2->[$i]) > 0.00001; |
2343
|
|
|
|
|
|
|
} |
2344
|
|
|
|
|
|
|
return 1; |
2345
|
|
|
|
|
|
|
} |
2346
|
|
|
|
|
|
|
|
2347
|
|
|
|
|
|
|
# Returns the minimum value and its positional index in an array |
2348
|
|
|
|
|
|
|
sub minimum { |
2349
|
|
|
|
|
|
|
my $arr = shift; |
2350
|
|
|
|
|
|
|
my $min; |
2351
|
|
|
|
|
|
|
my $index; |
2352
|
|
|
|
|
|
|
foreach my $i (0..@{$arr}-1) { |
2353
|
|
|
|
|
|
|
if ( (!defined $min) || ($arr->[$i] < $min) ) { |
2354
|
|
|
|
|
|
|
$index = $i; |
2355
|
|
|
|
|
|
|
$min = $arr->[$i]; |
2356
|
|
|
|
|
|
|
} |
2357
|
|
|
|
|
|
|
} |
2358
|
|
|
|
|
|
|
return ($min, $index); |
2359
|
|
|
|
|
|
|
} |
2360
|
|
|
|
|
|
|
|
2361
|
|
|
|
|
|
|
sub minmax { |
2362
|
|
|
|
|
|
|
my $arr = shift; |
2363
|
|
|
|
|
|
|
my $min; |
2364
|
|
|
|
|
|
|
my $max; |
2365
|
|
|
|
|
|
|
foreach my $i (0..@{$arr}-1) { |
2366
|
|
|
|
|
|
|
if ( (!defined $min) && (!defined $max) ) { |
2367
|
|
|
|
|
|
|
$min = $arr->[$i]; |
2368
|
|
|
|
|
|
|
$max = $arr->[$i]; |
2369
|
|
|
|
|
|
|
} elsif ( $arr->[$i] < $min ) { |
2370
|
|
|
|
|
|
|
$min = $arr->[$i]; |
2371
|
|
|
|
|
|
|
} elsif ( $arr->[$i] > $max ) { |
2372
|
|
|
|
|
|
|
$max = $arr->[$i]; |
2373
|
|
|
|
|
|
|
} |
2374
|
|
|
|
|
|
|
} |
2375
|
|
|
|
|
|
|
return ($min, $max); |
2376
|
|
|
|
|
|
|
} |
2377
|
|
|
|
|
|
|
|
2378
|
|
|
|
|
|
|
# Meant only for constructing a deep copy of an array of |
2379
|
|
|
|
|
|
|
# arrays: |
2380
|
|
|
|
|
|
|
sub deep_copy_AoA { |
2381
|
|
|
|
|
|
|
my $ref_in = shift; |
2382
|
|
|
|
|
|
|
my $ref_out; |
2383
|
|
|
|
|
|
|
foreach my $i (0..@{$ref_in}-1) { |
2384
|
|
|
|
|
|
|
foreach my $j (0..@{$ref_in->[$i]}-1) { |
2385
|
|
|
|
|
|
|
$ref_out->[$i]->[$j] = $ref_in->[$i]->[$j]; |
2386
|
|
|
|
|
|
|
} |
2387
|
|
|
|
|
|
|
} |
2388
|
|
|
|
|
|
|
return $ref_out; |
2389
|
|
|
|
|
|
|
} |
2390
|
|
|
|
|
|
|
|
2391
|
|
|
|
|
|
|
# Meant only for constructing a deep copy of an array of arrays for the case when |
2392
|
|
|
|
|
|
|
# some elements of the top-level array may be undefined: |
2393
|
|
|
|
|
|
|
sub deep_copy_AoA_with_nulls { |
2394
|
|
|
|
|
|
|
my $ref_in = shift; |
2395
|
|
|
|
|
|
|
my $ref_out; |
2396
|
|
|
|
|
|
|
foreach my $i (0..@{$ref_in}-1) { |
2397
|
|
|
|
|
|
|
if ( !defined $ref_in->[$i] ) { |
2398
|
|
|
|
|
|
|
$ref_out->[$i] = undef; |
2399
|
|
|
|
|
|
|
next; |
2400
|
|
|
|
|
|
|
} |
2401
|
|
|
|
|
|
|
foreach my $j (0..@{$ref_in->[$i]}-1) { |
2402
|
|
|
|
|
|
|
$ref_out->[$i]->[$j] = $ref_in->[$i]->[$j]; |
2403
|
|
|
|
|
|
|
} |
2404
|
|
|
|
|
|
|
} |
2405
|
|
|
|
|
|
|
return $ref_out; |
2406
|
|
|
|
|
|
|
} |
2407
|
|
|
|
|
|
|
|
2408
|
|
|
|
|
|
|
# Meant only for constructing a deep copy of a hash in which each value is an |
2409
|
|
|
|
|
|
|
# anonymous array of numbers: |
2410
|
|
|
|
|
|
|
sub deep_copy_hash { |
2411
|
|
|
|
|
|
|
my $ref_in = shift; |
2412
|
|
|
|
|
|
|
my $ref_out; |
2413
|
|
|
|
|
|
|
while ( my ($key, $value) = each( %$ref_in ) ) { |
2414
|
|
|
|
|
|
|
$ref_out->{$key} = deep_copy_array( $value ); |
2415
|
|
|
|
|
|
|
} |
2416
|
|
|
|
|
|
|
return $ref_out; |
2417
|
|
|
|
|
|
|
} |
2418
|
|
|
|
|
|
|
|
2419
|
|
|
|
|
|
|
# Meant only for an array of numbers: |
2420
|
|
|
|
|
|
|
sub deep_copy_array { |
2421
|
|
|
|
|
|
|
my $ref_in = shift; |
2422
|
|
|
|
|
|
|
my $ref_out; |
2423
|
|
|
|
|
|
|
foreach my $i (0..@{$ref_in}-1) { |
2424
|
|
|
|
|
|
|
$ref_out->[$i] = $ref_in->[$i]; |
2425
|
|
|
|
|
|
|
} |
2426
|
|
|
|
|
|
|
return $ref_out; |
2427
|
|
|
|
|
|
|
} |
2428
|
|
|
|
|
|
|
|
2429
|
|
|
|
|
|
|
# from perl docs: |
2430
|
|
|
|
|
|
|
sub fisher_yates_shuffle { |
2431
|
|
|
|
|
|
|
my $arr = shift; |
2432
|
|
|
|
|
|
|
my $i = @$arr; |
2433
|
|
|
|
|
|
|
while (--$i) { |
2434
|
|
|
|
|
|
|
my $j = int rand( $i + 1 ); |
2435
|
|
|
|
|
|
|
@$arr[$i, $j] = @$arr[$j, $i]; |
2436
|
|
|
|
|
|
|
} |
2437
|
|
|
|
|
|
|
} |
2438
|
|
|
|
|
|
|
|
2439
|
|
|
|
|
|
|
sub mean_and_variance { |
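    # Running single-pass estimates: after the i-th value has been folded in, |
    # $mean holds the sample mean of the first i values and $variance an |
    # incrementally updated estimate of their variance. |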
2440
|
|
|
|
|
|
|
my @data = @{shift @_}; |
2441
|
|
|
|
|
|
|
my ($mean, $variance); |
2442
|
|
|
|
|
|
|
foreach my $i (1..@data) { |
2443
|
|
|
|
|
|
|
if ($i == 1) { |
2444
|
|
|
|
|
|
|
$mean = $data[0]; |
2445
|
|
|
|
|
|
|
$variance = 0; |
2446
|
|
|
|
|
|
|
} else { |
2447
|
|
|
|
|
|
|
# data[$i-1] because of zero-based indexing of vector |
2448
|
|
|
|
|
|
|
$mean = ( (($i-1)/$i) * $mean ) + $data[$i-1] / $i; |
2449
|
|
|
|
|
|
|
$variance = ( (($i-1)/$i) * $variance ) |
2450
|
|
|
|
|
|
|
+ ($data[$i-1]-$mean)**2 / ($i-1); |
2451
|
|
|
|
|
|
|
} |
2452
|
|
|
|
|
|
|
} |
2453
|
|
|
|
|
|
|
return ($mean, $variance); |
2454
|
|
|
|
|
|
|
} |
2455
|
|
|
|
|
|
|
|
2456
|
|
|
|
|
|
|
sub check_for_illegal_params { |
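    # Returns 1 if every keyword handed to the constructor appears in the list |
    # of legal parameters below; returns 0 as soon as an unrecognized keyword |
    # is encountered. |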
2457
|
|
|
|
|
|
|
my @params = @_; |
2458
|
|
|
|
|
|
|
my @legal_params = qw / datafile |
2459
|
|
|
|
|
|
|
mask |
2460
|
|
|
|
|
|
|
K |
2461
|
|
|
|
|
|
|
terminal_output |
2462
|
|
|
|
|
|
|
max_em_iterations |
2463
|
|
|
|
|
|
|
seeding |
2464
|
|
|
|
|
|
|
class_priors |
2465
|
|
|
|
|
|
|
seed_tags |
2466
|
|
|
|
|
|
|
debug |
2467
|
|
|
|
|
|
|
/; |
2468
|
|
|
|
|
|
|
my $found_match_flag; |
2469
|
|
|
|
|
|
|
foreach my $param (@params) { |
2470
|
|
|
|
|
|
|
foreach my $legal (@legal_params) { |
2471
|
|
|
|
|
|
|
$found_match_flag = 0; |
2472
|
|
|
|
|
|
|
if ($param eq $legal) { |
2473
|
|
|
|
|
|
|
$found_match_flag = 1; |
2474
|
|
|
|
|
|
|
last; |
2475
|
|
|
|
|
|
|
} |
2476
|
|
|
|
|
|
|
} |
2477
|
|
|
|
|
|
|
last if $found_match_flag == 0; |
2478
|
|
|
|
|
|
|
} |
2479
|
|
|
|
|
|
|
return $found_match_flag; |
2480
|
|
|
|
|
|
|
} |
2481
|
|
|
|
|
|
|
|
2482
|
|
|
|
|
|
|
sub get_value_index_hash { |
2483
|
|
|
|
|
|
|
my $arr = shift; |
2484
|
|
|
|
|
|
|
my %hash; |
2485
|
|
|
|
|
|
|
foreach my $index (0..@$arr-1) { |
2486
|
|
|
|
|
|
|
$hash{$arr->[$index]} = $index if $arr->[$index] > 0; |
2487
|
|
|
|
|
|
|
} |
2488
|
|
|
|
|
|
|
return \%hash; |
2489
|
|
|
|
|
|
|
} |
2490
|
|
|
|
|
|
|
|
2491
|
|
|
|
|
|
|
sub non_maximum_supression { |
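    # Non-maximum suppression for a histogram-like array: consecutive equal |
    # values are grouped into runs, and only the center position of a run that |
    # is taller than its neighboring run(s) keeps its value; every other |
    # position is set to zero. |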
2492
|
|
|
|
|
|
|
my $arr = shift; |
2493
|
|
|
|
|
|
|
my @output = (0) x @$arr; |
2494
|
|
|
|
|
|
|
my @final_output = (0) x @$arr; |
2495
|
|
|
|
|
|
|
my %hash; |
2496
|
|
|
|
|
|
|
my @array_of_runs = ([$arr->[0]]); |
2497
|
|
|
|
|
|
|
foreach my $index (1..@$arr-1) { |
2498
|
|
|
|
|
|
|
if ($arr->[$index] == $arr->[$index-1]) { |
2499
|
|
|
|
|
|
|
push @{$array_of_runs[-1]}, $arr->[$index]; |
2500
|
|
|
|
|
|
|
} else { |
2501
|
|
|
|
|
|
|
push @array_of_runs, [$arr->[$index]]; |
2502
|
|
|
|
|
|
|
} |
2503
|
|
|
|
|
|
|
} |
2504
|
|
|
|
|
|
|
my $runstart_index = 0; |
2505
|
|
|
|
|
|
|
foreach my $run_index (1..@array_of_runs-2) { |
2506
|
|
|
|
|
|
|
$runstart_index += @{$array_of_runs[$run_index-1]}; |
2507
|
|
|
|
|
|
|
if ($array_of_runs[$run_index]->[0] > |
2508
|
|
|
|
|
|
|
$array_of_runs[$run_index-1]->[0] && |
2509
|
|
|
|
|
|
|
$array_of_runs[$run_index]->[0] > |
2510
|
|
|
|
|
|
|
$array_of_runs[$run_index+1]->[0]) { |
2511
|
|
|
|
|
|
|
my $run_center = @{$array_of_runs[$run_index]} / 2; |
2512
|
|
|
|
|
|
|
my $assignment_index = $runstart_index + $run_center; |
2513
|
|
|
|
|
|
|
$output[$assignment_index] = $arr->[$assignment_index]; |
2514
|
|
|
|
|
|
|
} |
2515
|
|
|
|
|
|
|
} |
2516
|
|
|
|
|
|
|
if ($array_of_runs[-1]->[0] > $array_of_runs[-2]->[0]) { |
2517
|
|
|
|
|
|
|
$runstart_index += @{$array_of_runs[-2]}; |
2518
|
|
|
|
|
|
|
my $run_center = @{$array_of_runs[-1]} / 2; |
2519
|
|
|
|
|
|
|
my $assignment_index = $runstart_index + $run_center; |
2520
|
|
|
|
|
|
|
$output[$assignment_index] = $arr->[$assignment_index]; |
2521
|
|
|
|
|
|
|
} |
2522
|
|
|
|
|
|
|
if ($array_of_runs[0]->[0] > $array_of_runs[1]->[0]) { |
2523
|
|
|
|
|
|
|
my $run_center = @{$array_of_runs[0]} / 2; |
2524
|
|
|
|
|
|
|
$output[$run_center] = $arr->[$run_center]; |
2525
|
|
|
|
|
|
|
} |
2526
|
|
|
|
|
|
|
return \@output; |
2527
|
|
|
|
|
|
|
} |
2528
|
|
|
|
|
|
|
|
2529
|
|
|
|
|
|
|
sub display_matrix { |
2530
|
|
|
|
|
|
|
my $message = shift; |
2531
|
|
|
|
|
|
|
my $matrix = shift; |
2532
|
|
|
|
|
|
|
if (!defined blessed($matrix)) { |
2533
|
|
|
|
|
|
|
print "display_matrix called on a scalar value: $matrix\n"; |
2534
|
|
|
|
|
|
|
return; |
2535
|
|
|
|
|
|
|
} |
2536
|
|
|
|
|
|
|
my $nrows = $matrix->rows(); |
2537
|
|
|
|
|
|
|
my $ncols = $matrix->cols(); |
2538
|
|
|
|
|
|
|
print "$message ($nrows rows and $ncols columns)\n"; |
2539
|
|
|
|
|
|
|
foreach my $i (0..$nrows-1) { |
2540
|
|
|
|
|
|
|
my $row = $matrix->row($i); |
2541
|
|
|
|
|
|
|
my @row_as_list = $row->as_list; |
2542
|
|
|
|
|
|
|
print "@row_as_list\n"; |
2543
|
|
|
|
|
|
|
} |
2544
|
|
|
|
|
|
|
print "\n"; |
2545
|
|
|
|
|
|
|
} |
2546
|
|
|
|
|
|
|
|
2547
|
|
|
|
|
|
|
sub transpose { |
2548
|
|
|
|
|
|
|
my $matrix = shift; |
2549
|
|
|
|
|
|
|
my $num_rows = $matrix->rows(); |
2550
|
|
|
|
|
|
|
my $num_cols = $matrix->cols(); |
2551
|
|
|
|
|
|
|
my $transpose = Math::GSL::Matrix->new($num_cols, $num_rows); |
2552
|
|
|
|
|
|
|
foreach my $i (0..$num_rows-1) { |
2553
|
|
|
|
|
|
|
my @row = $matrix->row($i)->as_list; |
2554
|
|
|
|
|
|
|
$transpose->set_col($i, \@row ); |
2555
|
|
|
|
|
|
|
} |
2556
|
|
|
|
|
|
|
return $transpose; |
2557
|
|
|
|
|
|
|
} |
2558
|
|
|
|
|
|
|
|
2559
|
|
|
|
|
|
|
sub vector_multiply { |
2560
|
|
|
|
|
|
|
my $vec1 = shift; |
2561
|
|
|
|
|
|
|
my $vec2 = shift; |
2562
|
|
|
|
|
|
|
die "vec_multiply called with two vectors of different sizes" |
2563
|
|
|
|
|
|
|
unless @$vec1 == @$vec2; |
2564
|
|
|
|
|
|
|
my $result = 0; |
2565
|
|
|
|
|
|
|
foreach my $i (0..@$vec1-1) { |
2566
|
|
|
|
|
|
|
$result += $vec1->[$i] * $vec2->[$i]; |
2567
|
|
|
|
|
|
|
} |
2568
|
|
|
|
|
|
|
return $result; |
2569
|
|
|
|
|
|
|
} |
2570
|
|
|
|
|
|
|
|
2571
|
|
|
|
|
|
|
sub vector_2_vector_multiply { |
2572
|
|
|
|
|
|
|
my $vec1 = shift; |
2573
|
|
|
|
|
|
|
my $vec2 = shift; |
2574
|
|
|
|
|
|
|
die "vec_multiply called with two vectors of different sizes" |
2575
|
|
|
|
|
|
|
unless @$vec1 == @$vec2; |
2576
|
|
|
|
|
|
|
my @result_vec; |
2577
|
|
|
|
|
|
|
foreach my $i (0..@$vec1-1) { |
2578
|
|
|
|
|
|
|
$result_vec[$i] = $vec1->[$i] * $vec2->[$i]; |
2579
|
|
|
|
|
|
|
} |
2580
|
|
|
|
|
|
|
return \@result_vec; |
2581
|
|
|
|
|
|
|
} |
2582
|
|
|
|
|
|
|
|
2583
|
|
|
|
|
|
|
sub matrix_multiply { |
2584
|
|
|
|
|
|
|
my $matrix1 = shift; |
2585
|
|
|
|
|
|
|
my $matrix2 = shift; |
2586
|
|
|
|
|
|
|
my ($nrows1, $ncols1) = ($matrix1->rows(), $matrix1->cols()); |
2587
|
|
|
|
|
|
|
my ($nrows2, $ncols2) = ($matrix2->rows(), $matrix2->cols()); |
2588
|
|
|
|
|
|
|
die "matrix multiplication called with non-matching matrix arguments" |
2589
|
|
|
|
|
|
|
# unless $ncols1 == $nrows2; |
2590
|
|
|
|
|
|
|
unless $nrows1 == $ncols2 && $ncols1 == $nrows2; |
2591
|
|
|
|
|
|
|
if ($nrows1 == 1) { |
2592
|
|
|
|
|
|
|
my @row = $matrix1->row(0)->as_list; |
2593
|
|
|
|
|
|
|
my @col = $matrix2->col(0)->as_list; |
2594
|
|
|
|
|
|
|
my $result; |
2595
|
|
|
|
|
|
|
foreach my $j (0..$ncols1-1) { |
2596
|
|
|
|
|
|
|
$result += $row[$j] * $col[$j]; |
2597
|
|
|
|
|
|
|
} |
2598
|
|
|
|
|
|
|
return $result; |
2599
|
|
|
|
|
|
|
} |
2600
|
|
|
|
|
|
|
my $product = Math::GSL::Matrix->new($nrows1, $nrows1); |
2601
|
|
|
|
|
|
|
foreach my $i (0..$nrows1-1) { |
2602
|
|
|
|
|
|
|
my $row = $matrix1->row($i); |
2603
|
|
|
|
|
|
|
my @product_row; |
2604
|
|
|
|
|
|
|
foreach my $j (0..$ncols2-1) { |
2605
|
|
|
|
|
|
|
my $col = $matrix2->col($j); |
2606
|
|
|
|
|
|
|
my $row_times_col = matrix_multiply($row, $col); |
2607
|
|
|
|
|
|
|
push @product_row, $row_times_col; |
2608
|
|
|
|
|
|
|
} |
2609
|
|
|
|
|
|
|
$product->set_row($i, \@product_row); |
2610
|
|
|
|
|
|
|
} |
2611
|
|
|
|
|
|
|
return $product; |
2612
|
|
|
|
|
|
|
} |
2613
|
|
|
|
|
|
|
|
2614
|
|
|
|
|
|
|
sub vector_matrix_multiply { |
2615
|
|
|
|
|
|
|
my $matrix1 = shift; |
2616
|
|
|
|
|
|
|
my $matrix2 = shift; |
2617
|
|
|
|
|
|
|
my ($nrows1, $ncols1) = ($matrix1->rows, $matrix1->cols); |
2618
|
|
|
|
|
|
|
my ($nrows2, $ncols2) = ($matrix2->rows, $matrix2->cols); |
2619
|
|
|
|
|
|
|
die "matrix multiplication called with non-matching matrix arguments" |
2620
|
|
|
|
|
|
|
unless $nrows1 == 1 && $ncols1 == $nrows2; |
2621
|
|
|
|
|
|
|
if ($ncols2 == 1) { |
2622
|
|
|
|
|
|
|
my @row = $matrix1->row(0)->as_list; |
2623
|
|
|
|
|
|
|
my @col = $matrix2->col(0)->as_list; |
2624
|
|
|
|
|
|
|
my $result; |
2625
|
|
|
|
|
|
|
foreach my $j (0..$ncols1-1) { |
2626
|
|
|
|
|
|
|
$result += $row[$j] * $col[$j]; |
2627
|
|
|
|
|
|
|
} |
2628
|
|
|
|
|
|
|
return $result; |
2629
|
|
|
|
|
|
|
} |
2630
|
|
|
|
|
|
|
my $product = Math::GSL::Matrix->new(1, $ncols2); |
2631
|
|
|
|
|
|
|
my $row = $matrix1->row(0); |
2632
|
|
|
|
|
|
|
my @product_row; |
2633
|
|
|
|
|
|
|
foreach my $j (0..$ncols2-1) { |
2634
|
|
|
|
|
|
|
my $col = $matrix2->col($j); |
2635
|
|
|
|
|
|
|
my $row_times_col = vector_matrix_multiply($row, $col); |
2636
|
|
|
|
|
|
|
push @product_row, $row_times_col; |
2637
|
|
|
|
|
|
|
} |
2638
|
|
|
|
|
|
|
$product->set_row(0, \@product_row); |
2639
|
|
|
|
|
|
|
return $product; |
2640
|
|
|
|
|
|
|
} |
2641
|
|
|
|
|
|
|
|
2642
|
|
|
|
|
|
|
sub matrix_vector_multiply { |
2643
|
|
|
|
|
|
|
my $matrix1 = shift; |
2644
|
|
|
|
|
|
|
my $matrix2 = shift; |
2645
|
|
|
|
|
|
|
my ($nrows1, $ncols1) = ($matrix1->rows(), $matrix1->cols()); |
2646
|
|
|
|
|
|
|
my ($nrows2, $ncols2) = ($matrix2->rows(), $matrix2->cols()); |
2647
|
|
|
|
|
|
|
die "matrix multiplication called with non-matching matrix arguments" |
2648
|
|
|
|
|
|
|
unless $ncols1 == $nrows2 && $ncols2 == 1; |
2649
|
|
|
|
|
|
|
if ($nrows1 == 1) { |
2650
|
|
|
|
|
|
|
my @row = $matrix1->row(0)->as_list; |
2651
|
|
|
|
|
|
|
my @col = $matrix2->col(0)->as_list; |
2652
|
|
|
|
|
|
|
my $result; |
2653
|
|
|
|
|
|
|
foreach my $j (0..$ncols1-1) { |
2654
|
|
|
|
|
|
|
$result += $row[$j] * $col[$j]; |
2655
|
|
|
|
|
|
|
} |
2656
|
|
|
|
|
|
|
return $result; |
2657
|
|
|
|
|
|
|
} |
2658
|
|
|
|
|
|
|
my $product = Math::GSL::Matrix->new($nrows1, 1); |
2659
|
|
|
|
|
|
|
my $col = $matrix2->col(0); |
2660
|
|
|
|
|
|
|
my @product_col; |
2661
|
|
|
|
|
|
|
foreach my $i (0..$nrows1-1) { |
2662
|
|
|
|
|
|
|
my $row = $matrix1->row($i); |
2663
|
|
|
|
|
|
|
my $row_times_col = matrix_vector_multiply($row, $col); |
2664
|
|
|
|
|
|
|
push @product_col, $row_times_col; |
2665
|
|
|
|
|
|
|
} |
2666
|
|
|
|
|
|
|
$product->set_col(0, \@product_col); |
2667
|
|
|
|
|
|
|
return $product; |
2668
|
|
|
|
|
|
|
} |
2669
|
|
|
|
|
|
|
|
2670
|
|
|
|
|
|
|
sub matrix_trace { |
2671
|
|
|
|
|
|
|
my $matrix = shift; |
2672
|
|
|
|
|
|
|
my ($nrows, $ncols) = ($matrix->rows(), $matrix->cols()); |
2673
|
|
|
|
|
|
|
die "trace can only be calculated for a square matrix" |
2674
|
|
|
|
|
|
|
unless $ncols == $nrows; |
2675
|
|
|
|
|
|
|
my @elements = $matrix->as_list; |
2676
|
|
|
|
|
|
|
my $trace = 0; |
2677
|
|
|
|
|
|
|
foreach my $i (0..$nrows-1) { |
2678
|
|
|
|
|
|
|
$trace += $elements[$i + $i * $ncols]; |
2679
|
|
|
|
|
|
|
} |
2680
|
|
|
|
|
|
|
return $trace; |
2681
|
|
|
|
|
|
|
} |
2682
|
|
|
|
|
|
|
|
2683
|
|
|
|
|
|
|
1; |
2684
|
|
|
|
|
|
|
|
2685
|
|
|
|
|
|
|
=pod |
2686
|
|
|
|
|
|
|
|
2687
|
|
|
|
|
|
|
=head1 NAME |
2688
|
|
|
|
|
|
|
|
2689
|
|
|
|
|
|
|
Algorithm::ExpectationMaximization -- A Perl module for clustering numerical |
2690
|
|
|
|
|
|
|
multi-dimensional data with the Expectation-Maximization algorithm. |
2691
|
|
|
|
|
|
|
|
2692
|
|
|
|
|
|
|
=head1 SYNOPSIS |
2693
|
|
|
|
|
|
|
|
2694
|
|
|
|
|
|
|
use Algorithm::ExpectationMaximization; |
2695
|
|
|
|
|
|
|
|
2696
|
|
|
|
|
|
|
# First name the data file: |
2697
|
|
|
|
|
|
|
|
2698
|
|
|
|
|
|
|
my $datafile = "mydatafile.csv"; |
2699
|
|
|
|
|
|
|
|
2700
|
|
|
|
|
|
|
# Next, set the mask to indicate which columns of the datafile to use for |
2701
|
|
|
|
|
|
|
# clustering and which column contains a symbolic ID for each data record. For |
2702
|
|
|
|
|
|
|
# example, if the symbolic name is in the first column, you want the second column |
2703
|
|
|
|
|
|
|
# to be ignored, and you want the next three columns to be used for 3D clustering: |
2704
|
|
|
|
|
|
|
|
2705
|
|
|
|
|
|
|
my $mask = "N0111"; |
2706
|
|
|
|
|
|
|
|
2707
|
|
|
|
|
|
|
# Now construct an instance of the clusterer. The parameter `K' controls the |
2708
|
|
|
|
|
|
|
# number of clusters. Here is an example call to the constructor for instance |
2709
|
|
|
|
|
|
|
# creation: |
2710
|
|
|
|
|
|
|
|
2711
|
|
|
|
|
|
|
my $clusterer = Algorithm::ExpectationMaximization->new( |
2712
|
|
|
|
|
|
|
datafile => $datafile, |
2713
|
|
|
|
|
|
|
mask => $mask, |
2714
|
|
|
|
|
|
|
K => 3, |
2715
|
|
|
|
|
|
|
max_em_iterations => 300, |
2716
|
|
|
|
|
|
|
seeding => 'random', |
2717
|
|
|
|
|
|
|
terminal_output => 1, |
2718
|
|
|
|
|
|
|
); |
2719
|
|
|
|
|
|
|
|
2720
|
|
|
|
|
|
|
# Note the choice for `seeding'. The choice `random' means that the clusterer will |
2721
|
|
|
|
|
|
|
# randomly select `K' data points to serve as initial cluster centers. Other |
2722
|
|
|
|
|
|
|
# possible choices for the constructor parameter `seeding' are `kmeans' and |
2723
|
|
|
|
|
|
|
# `manual'. With the `kmeans' option for `seeding', the output of a K-means |
2724
|
|
|
|
|
|
|
# clusterer is used for the cluster seeds and the initial cluster covariances. If |
2725
|
|
|
|
|
|
|
# you use the `manual' option for seeding, you must also specify the data elements |
2726
|
|
|
|
|
|
|
# to use for seeding the clusters. |
2727
|
|
|
|
|
|
|
|
2728
|
|
|
|
|
|
|
# Here is an example of a call to the constructor when we choose the `manual' |
2729
|
|
|
|
|
|
|
# option for seeding the clusters and for specifying the data elements for |
2730
|
|
|
|
|
|
|
# seeding. The data elements are specified by their tag names. In this case, |
2731
|
|
|
|
|
|
|
# these names are `a26', `b53', and `c49': |
2732
|
|
|
|
|
|
|
|
2733
|
|
|
|
|
|
|
my $clusterer = Algorithm::ExpectationMaximization->new( |
2734
|
|
|
|
|
|
|
datafile => $datafile, |
2735
|
|
|
|
|
|
|
mask => $mask, |
2736
|
|
|
|
|
|
|
class_priors => [0.6, 0.2, 0.2], |
2737
|
|
|
|
|
|
|
K => 3, |
2738
|
|
|
|
|
|
|
max_em_iterations => 300, |
2739
|
|
|
|
|
|
|
seeding => 'manual', |
2740
|
|
|
|
|
|
|
seed_tags => ['a26', 'b53', 'c49'], |
2741
|
|
|
|
|
|
|
terminal_output => 1, |
2742
|
|
|
|
|
|
|
); |
2743
|
|
|
|
|
|
|
|
2744
|
|
|
|
|
|
|
# This example call to the constructor also illustrates how you can inject class |
2745
|
|
|
|
|
|
|
# priors into the clustering process. The class priors are the prior probabilities |
2746
|
|
|
|
|
|
|
# of the class distributions in your dataset. As explained later, injecting class |
2747
|
|
|
|
|
|
|
# priors in the manner shown above makes statistical sense only for the case of |
2748
|
|
|
|
|
|
|
# manual seeding. When you do inject class priors, the order in which the priors |
2749
|
|
|
|
|
|
|
# are expressed must correspond to the manually specified seeds for the clusters. |
2750
|
|
|
|
|
|
|
|
2751
|
|
|
|
|
|
|
# After the invocation of the constructor, the following calls are mandatory |
2752
|
|
|
|
|
|
|
# for reasons that should be obvious from the names of the methods: |
2753
|
|
|
|
|
|
|
|
2754
|
|
|
|
|
|
|
$clusterer->read_data_from_file(); |
2755
|
|
|
|
|
|
|
srand(time); |
2756
|
|
|
|
|
|
|
$clusterer->seed_the_clusters(); |
2757
|
|
|
|
|
|
|
$clusterer->EM(); |
2758
|
|
|
|
|
|
|
$clusterer->run_bayes_classifier(); |
2759
|
|
|
|
|
|
|
my $clusters = $clusterer->return_disjoint_clusters(); |
2760
|
|
|
|
|
|
|
|
2761
|
|
|
|
|
|
|
# where the call to `EM()' is the invocation of the expectation-maximization |
2762
|
|
|
|
|
|
|
# algorithm. The call to `srand(time)' is to seed the pseudo random number |
2763
|
|
|
|
|
|
|
# generator afresh for each run of the cluster seeding procedure. If you want to |
2764
|
|
|
|
|
|
|
# see repeatable results from one run to another of the algorithm with random |
2765
|
|
|
|
|
|
|
# seeding, you would obviously not invoke `srand(time)'. |
2766
|
|
|
|
|
|
|
|
2767
|
|
|
|
|
|
|
# The call `run_bayes_classifier()' shown above carries out a disjoint clustering |
2768
|
|
|
|
|
|
|
# of all the data points using the naive Bayes' classifier. And the call |
2769
|
|
|
|
|
|
|
# `return_disjoint_clusters()' returns the clusters thus formed to you. Once you |
2770
|
|
|
|
|
|
|
# have obtained access to the clusters in this manner, you can display them in |
2771
|
|
|
|
|
|
|
# your terminal window by |
2772
|
|
|
|
|
|
|
|
2773
|
|
|
|
|
|
|
foreach my $index (0..@$clusters-1) { |
2774
|
|
|
|
|
|
|
print "Cluster $index (Naive Bayes): @{$clusters->[$index]}\n\n" |
2775
|
|
|
|
|
|
|
} |
2776
|
|
|
|
|
|
|
|
2777
|
|
|
|
|
|
|
# If you would like to also see the clusters purely on the basis of the posterior |
2778
|
|
|
|
|
|
|
# class probabilities exceeding a threshold, call |
2779
|
|
|
|
|
|
|
|
2780
|
|
|
|
|
|
|
my $theta1 = 0.2; |
2781
|
|
|
|
|
|
|
my $posterior_prob_clusters = |
2782
|
|
|
|
|
|
|
$clusterer->return_clusters_with_posterior_probs_above_threshold($theta1); |
2783
|
|
|
|
|
|
|
|
2784
|
|
|
|
|
|
|
# where you can obviously set the threshold $theta1 to any value you wish. Note |
2785
|
|
|
|
|
|
|
# that now you may end up with clusters that overlap. You can display them in |
2786
|
|
|
|
|
|
|
# your terminal window in the same manner as shown above for the naive Bayes' |
2787
|
|
|
|
|
|
|
# clusters. |
2788
|
|
|
|
|
|
|
|
2789
|
|
|
|
|
|
|
# You can write the naive Bayes' clusters out to files, one cluster per file, by |
2790
|
|
|
|
|
|
|
# calling |
2791
|
|
|
|
|
|
|
|
2792
|
|
|
|
|
|
|
$clusterer->write_naive_bayes_clusters_to_files(); |
2793
|
|
|
|
|
|
|
|
2794
|
|
|
|
|
|
|
# The clusters are placed in files with names like |
2795
|
|
|
|
|
|
|
|
2796
|
|
|
|
|
|
|
naive_bayes_cluster1.txt |
2797
|
|
|
|
|
|
|
naive_bayes_cluster2.txt |
2798
|
|
|
|
|
|
|
... |
2799
|
|
|
|
|
|
|
|
2800
|
|
|
|
|
|
|
# In the same manner, you can write out the posterior probability based possibly |
2801
|
|
|
|
|
|
|
# overlapping clusters to files by calling: |
2802
|
|
|
|
|
|
|
|
2803
|
|
|
|
|
|
|
$clusterer->write_posterior_prob_clusters_above_threshold_to_files($theta1); |
2804
|
|
|
|
|
|
|
|
2805
|
|
|
|
|
|
|
# where the threshold $theta1 sets the probability threshold for deciding which |
2806
|
|
|
|
|
|
|
# data elements to place in a cluster. These clusters are placed in files with |
2807
|
|
|
|
|
|
|
# names like |
2808
|
|
|
|
|
|
|
|
2809
|
|
|
|
|
|
|
posterior_prob_cluster1.txt |
2810
|
|
|
|
|
|
|
posterior_prob_cluster2.txt |
2811
|
|
|
|
|
|
|
... |
2812
|
|
|
|
|
|
|
|
2813
|
|
|
|
|
|
|
# CLUSTER VISUALIZATION: |
2814
|
|
|
|
|
|
|
|
2815
|
|
|
|
|
|
|
# You must first set the mask for cluster visualization. This mask tells the |
2816
|
|
|
|
|
|
|
# module which 2D or 3D subspace of the original data space you wish to visualize |
2817
|
|
|
|
|
|
|
# the clusters in: |
2818
|
|
|
|
|
|
|
|
2819
|
|
|
|
|
|
|
my $visualization_mask = "111"; |
2820
|
|
|
|
|
|
|
$clusterer->visualize_clusters($visualization_mask); |
2821
|
|
|
|
|
|
|
$clusterer->visualize_distributions($visualization_mask); |
2822
|
|
|
|
|
|
|
$clusterer->plot_hardcopy_clusters($visualization_mask); |
2823
|
|
|
|
|
|
|
$clusterer->plot_hardcopy_distributions($visualization_mask); |
2824
|
|
|
|
|
|
|
|
2825
|
|
|
|
|
|
|
# where the last two invocations are for writing out the PNG plots of the |
2826
|
|
|
|
|
|
|
# visualization displays to disk files. The PNG image of the posterior |
2827
|
|
|
|
|
|
|
# probability distributions is written out to a file named posterior_prob_plot.png |
2828
|
|
|
|
|
|
|
# and the PNG image of the disjoint clusters to a file called cluster_plot.png. |
2829
|
|
|
|
|
|
|
|
2830
|
|
|
|
|
|
|
# SYNTHETIC DATA GENERATION: |
2831
|
|
|
|
|
|
|
|
2832
|
|
|
|
|
|
|
# The module has been provided with a class method for generating multivariate |
2833
|
|
|
|
|
|
|
# data for experimenting with the EM algorithm. The data generation is controlled |
2834
|
|
|
|
|
|
|
# by the contents of a parameter file that is supplied as an argument to the data |
2835
|
|
|
|
|
|
|
# generator method. The priors, the means, and the covariance matrices in the |
2836
|
|
|
|
|
|
|
# parameter file must be according to the syntax shown in the `param1.txt' file in |
2837
|
|
|
|
|
|
|
# the `examples' directory. It is best to edit a copy of this file for your |
2838
|
|
|
|
|
|
|
# synthetic data generation needs. |
2839
|
|
|
|
|
|
|
|
2840
|
|
|
|
|
|
|
my $parameter_file = "param1.txt"; |
2841
|
|
|
|
|
|
|
my $out_datafile = "mydatafile1.dat"; |
2842
|
|
|
|
|
|
|
Algorithm::ExpectationMaximization->cluster_data_generator( |
2843
|
|
|
|
|
|
|
input_parameter_file => $parameter_file, |
2844
|
|
|
|
|
|
|
output_datafile => $out_datafile, |
2845
|
|
|
|
|
|
|
total_number_of_data_points => $N ); |
2846
|
|
|
|
|
|
|
|
2847
|
|
|
|
|
|
|
# where the value of $N is the total number of data points you would like to see |
2848
|
|
|
|
|
|
|
# generated for all of the Gaussians. How this total number is divided up amongst |
2849
|
|
|
|
|
|
|
# the Gaussians is decided by the prior probabilities for the Gaussian components |
2850
|
|
|
|
|
|
|
# as declared in the input parameter file. The synthetic data may be visualized in a |
2851
|
|
|
|
|
|
|
# terminal window and the visualization written out as a PNG image to a diskfile |
2852
|
|
|
|
|
|
|
# by |
2853
|
|
|
|
|
|
|
|
2854
|
|
|
|
|
|
|
my $data_visualization_mask = "11"; |
2855
|
|
|
|
|
|
|
$clusterer->visualize_data($data_visualization_mask); |
2856
|
|
|
|
|
|
|
$clusterer->plot_hardcopy_data($data_visualization_mask); |
2857
|
|
|
|
|
|
|
|
2858
|
|
|
|
|
|
|
|
2859
|
|
|
|
|
|
|
=head1 CHANGES |
2860
|
|
|
|
|
|
|
|
2861
|
|
|
|
|
|
|
Version 1.22 should work with data in CSV files. |
2862
|
|
|
|
|
|
|
|
2863
|
|
|
|
|
|
|
Version 1.21 incorporates minor code clean up. Overall, the module implementation |
2864
|
|
|
|
|
|
|
remains unchanged. |
2865
|
|
|
|
|
|
|
|
2866
|
|
|
|
|
|
|
Version 1.2 allows the module to also be used for 1-D data. The visualization code |
2867
|
|
|
|
|
|
|
for 1-D shows the clusters through their histograms. |
2868
|
|
|
|
|
|
|
|
2869
|
|
|
|
|
|
|
Version 1.1 incorporates much cleanup of the documentation associated with the |
2870
|
|
|
|
|
|
|
module. Both the top-level module documentation, especially the Description part, |
2871
|
|
|
|
|
|
|
and the comments embedded in the code were revised for better utilization of the |
2872
|
|
|
|
|
|
|
module. The basic implementation code remains unchanged. |
2873
|
|
|
|
|
|
|
|
2874
|
|
|
|
|
|
|
|
2875
|
|
|
|
|
|
|
=head1 DESCRIPTION |
2876
|
|
|
|
|
|
|
|
2877
|
|
|
|
|
|
|
B<Algorithm::ExpectationMaximization> is a I<perl5> module for the |
2878
|
|
|
|
|
|
|
Expectation-Maximization (EM) method of clustering numerical data that lends itself |
2879
|
|
|
|
|
|
|
to modeling as a Gaussian mixture. Since the module is entirely in Perl (in the |
2880
|
|
|
|
|
|
|
sense that it is not a Perl wrapper around a C library that actually does the |
2881
|
|
|
|
|
|
|
clustering), the code in the module can easily be modified to experiment with several |
2882
|
|
|
|
|
|
|
aspects of EM. |
2883
|
|
|
|
|
|
|
|
2884
|
|
|
|
|
|
|
Gaussian Mixture Modeling (GMM) is based on the assumption that the data consists of |
2885
|
|
|
|
|
|
|
C<K> Gaussian components, each characterized by its own mean vector and its own |
2886
|
|
|
|
|
|
|
covariance matrix. Obviously, given observed data for clustering, we do not know |
2887
|
|
|
|
|
|
|
which of the C<K> Gaussian components was responsible for any of the data elements. |
2888
|
|
|
|
|
|
|
GMM also associates a prior probability with each Gaussian component. In general, |
2889
|
|
|
|
|
|
|
these priors will also be unknown. So the problem of clustering consists of |
2890
|
|
|
|
|
|
|
estimating the posterior class probability at each data element and also estimating |
2891
|
|
|
|
|
|
|
the class priors. Once these posterior class probabilities and the priors are |
2892
|
|
|
|
|
|
|
estimated with EM, we can use the naive Bayes' classifier to partition the data into |
2893
|
|
|
|
|
|
|
disjoint clusters. Or, for "soft" clustering, we can find all the data elements that |
2894
|
|
|
|
|
|
|
belong to a Gaussian component on the basis of the posterior class probabilities at |
2895
|
|
|
|
|
|
|
the data elements exceeding a prescribed threshold. |
2896
|
|
|
|
|
|
|
|
2897
|
|
|
|
|
|
|
If you do not mind the fact that it is possible for the EM algorithm to occasionally |
2898
|
|
|
|
|
|
|
get stuck in a local maximum and to, therefore, produce a wrong answer even when you |
2899
|
|
|
|
|
|
|
know the data to be perfectly multimodal Gaussian, EM is probably the most magical |
2900
|
|
|
|
|
|
|
approach to clustering multidimensional data. Consider the case of clustering |
2901
|
|
|
|
|
|
|
three-dimensional data. Each Gaussian cluster in 3D space is characterized by the |
2902
|
|
|
|
|
|
|
following 10 variables: the 6 unique elements of the C<3x3> covariance matrix (which |
2903
|
|
|
|
|
|
|
must be symmetric positive-definite), the 3 unique elements of the mean, and the |
2904
|
|
|
|
|
|
|
prior associated with the Gaussian. Now let's say you expect to see six Gaussians in |
2905
|
|
|
|
|
|
|
your data. What that means is that you would want the values for 59 variables |
2906
|
|
|
|
|
|
|
(remember the unit-summation constraint on the class priors which reduces the overall |
2907
|
|
|
|
|
|
|
number of variables by one) to be estimated by the algorithm that seeks to discover |
2908
|
|
|
|
|
|
|
the clusters in your data. What's amazing is that, despite the large number of |
2909
|
|
|
|
|
|
|
variables that must be optimized simultaneously, the EM algorithm will very likely |
2910
|
|
|
|
|
|
|
give you a good approximation to the right answer. |
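
The count in that example works out as follows:

    per 3D Gaussian:                    6 (covariance) + 3 (mean) + 1 (prior) = 10
    six Gaussians:                      6 x 10 = 60
    unit-sum constraint on the priors:  60 - 1 = 59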
2911
|
|
|
|
|
|
|
|
2912
|
|
|
|
|
|
|
At its core, EM depends on the notion of unobserved data and the averaging of the |
2913
|
|
|
|
|
|
|
log-likelihood of the data actually observed over all admissible probabilities for |
2914
|
|
|
|
|
|
|
the unobserved data. But what is unobserved data? While in some cases where EM is |
2915
|
|
|
|
|
|
|
used, the unobserved data is literally the missing data, in others, it is something |
2916
|
|
|
|
|
|
|
that cannot be seen directly but that nonetheless is relevant to the data actually |
2917
|
|
|
|
|
|
|
observed. For the case of clustering multidimensional numerical data that can be |
2918
|
|
|
|
|
|
|
modeled as a Gaussian mixture, it turns out that the best way to think of the |
2919
|
|
|
|
|
|
|
unobserved data is in terms of a sequence of random variables, one for each observed |
2920
|
|
|
|
|
|
|
data point, whose values dictate the selection of the Gaussian for that data point. |
2921
|
|
|
|
|
|
|
This point is explained in great detail in my on-line tutorial at |
2922
|
|
|
|
|
|
|
L. |
2923
|
|
|
|
|
|
|
|
2924
|
|
|
|
|
|
|
The EM algorithm in our context reduces to an iterative invocation of the following |
2925
|
|
|
|
|
|
|
steps: (1) Given the current guess for the means and the covariances of the different |
2926
|
|
|
|
|
|
|
Gaussians in our mixture model, use Bayes' Rule to update the posterior class |
2927
|
|
|
|
|
|
|
probabilities at each of the data points; (2) Using the updated posterior class |
2928
|
|
|
|
|
|
|
probabilities, first update the class priors; (3) Using the updated class priors, |
2929
|
|
|
|
|
|
|
update the class means and the class covariances; and go back to Step (1). Ideally, |
2930
|
|
|
|
|
|
|
the iterations should terminate when the expected log-likelihood of the observed data |
2931
|
|
|
|
|
|
|
has reached a maximum and does not change with any further iterations. The stopping |
2932
|
|
|
|
|
|
|
rule used in this module is the detection of no change over three consecutive |
2933
|
|
|
|
|
|
|
iterations in the values calculated for the priors. |
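
In code form, the loop just described amounts to something like the following
sketch (the subroutine names here are purely illustrative and are not the
module's internal method names):

    foreach my $iteration (1 .. $max_em_iterations) {
        # E-step: Bayes' Rule at every data point,
        #   posterior(k | x)  proportional to  prior(k) * N(x; mean_k, cov_k)
        update_posterior_class_probs();
        # M-step: re-estimate the priors first, then the means and the
        # covariances, from the posteriors just computed
        update_class_priors();
        update_class_means_and_covariances();
        last if priors_unchanged_over_three_iterations();
    }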
2934
|
|
|
|
|
|
|
|
2935
|
|
|
|
|
|
|
This module provides three different choices for seeding the clusters: (1) random, |
2936
|
|
|
|
|
|
|
(2) kmeans, and (3) manual. When random seeding is chosen, the algorithm randomly |
2937
|
|
|
|
|
|
|
selects C<K> data elements as cluster seeds. That is, the data vectors associated |
2938
|
|
|
|
|
|
|
with these seeds are treated as initial guesses for the means of the Gaussian |
2939
|
|
|
|
|
|
|
distributions. The covariances are then set to the values calculated from the entire |
2940
|
|
|
|
|
|
|
dataset with respect to the means corresponding to the seeds. With kmeans seeding, on |
2941
|
|
|
|
|
|
|
the other hand, the means and the covariances are set to whatever values are returned |
2942
|
|
|
|
|
|
|
by the kmeans algorithm. And, when seeding is set to manual, you are allowed to |
2943
|
|
|
|
|
|
|
choose C<K> data elements --- by specifying their tag names --- for the seeds. The |
2944
|
|
|
|
|
|
|
rest of the EM initialization for the manual mode is the same as for the random mode. |
2945
|
|
|
|
|
|
|
The algorithm allows for the initial priors to be specified for the manual mode of |
2946
|
|
|
|
|
|
|
seeding. |
2947
|
|
|
|
|
|
|
|
2948
|
|
|
|
|
|
|
Much of the code for the kmeans-based seeding of EM was drawn from the |
2949
|
|
|
|
|
|
|
C<Algorithm::KMeans> module by me. The code from that module used here corresponds to |
2950
|
|
|
|
|
|
|
the case when the C<cluster_seeding> option in the C<Algorithm::KMeans> module is set |
2951
|
|
|
|
|
|
|
to C<smart>. The C<smart> option for KMeans consists of subjecting the data to a |
2952
|
|
|
|
|
|
|
principal components analysis (PCA) to discover the direction of maximum variance in |
2953
|
|
|
|
|
|
|
the data space. The data points are then projected on to this direction and a |
2954
|
|
|
|
|
|
|
histogram constructed from the projections. Centers of the C<K> largest peaks in |
2955
|
|
|
|
|
|
|
this smoothed histogram are used to seed the KMeans based clusterer. As you'd |
2956
|
|
|
|
|
|
|
expect, the output of the KMeans clusterer is then used to seed the EM algorithm. |
2957
|
|
|
|
|
|
|
|
2958
|
|
|
|
|
|
|
This module uses two different criteria to measure the quality of the clustering |
2959
|
|
|
|
|
|
|
achieved. The first is the Minimum Description Length (MDL) proposed originally by |
2960
|
|
|
|
|
|
|
Rissanen (J. Rissanen: "Modeling by Shortest Data Description," Automatica, 1978, and |
2961
|
|
|
|
|
|
|
"A Universal Prior for Integers and Estimation by Minimum Description Length," Annals |
2962
|
|
|
|
|
|
|
of Statistics, 1983.) The MDL criterion is a difference of a log-likelihood term for |
2963
|
|
|
|
|
|
|
all of the observed data and a model-complexity penalty term. In general, both the |
2964
|
|
|
|
|
|
|
log-likelihood and the model-complexity terms increase as the number of clusters |
2965
|
|
|
|
|
|
|
increases. The form of the MDL criterion in this module uses for the penalty term |
2966
|
|
|
|
|
|
|
the Bayesian Information Criterion (BIC) of G. Schwarz, "Estimating the Dimension |
2967
|
|
|
|
|
|
|
of a Model," The Annals of Statistics, 1978. In general, the smaller the value of |
2968
|
|
|
|
|
|
|
MDL quality measure, the better the clustering of the data. |
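
Schematically, and with the caveat that the exact parameter-count bookkeeping
used by the module is not reproduced here, the quantity being minimized has
the BIC-penalized form

    MDL  =  - log-likelihood( all observed data | estimated mixture )
            +  (d / 2) * log( N )

where N is the number of data points and d the number of free parameters in
the Gaussian mixture (the means, the covariances, and the priors).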
2969
|
|
|
|
|
|
|
|
2970
|
|
|
|
|
|
|
For our second measure of clustering quality, we use `trace( SW^-1 . SB)' where SW is |
2971
|
|
|
|
|
|
|
the within-class scatter matrix, more commonly denoted S_w, and SB the between-class |
2972
|
|
|
|
|
|
|
scatter matrix, more commonly denoted S_b (the underscore means subscript). This |
2973
|
|
|
|
|
|
|
measure can be thought of as the normalized average distance between the clusters, |
2974
|
|
|
|
|
|
|
the normalization being provided by average cluster covariance SW^-1. Therefore, the |
2975
|
|
|
|
|
|
|
larger the value of this quality measure, the better the separation between the |
2976
|
|
|
|
|
|
|
clusters. Since this measure has its roots in the Fisher linear discriminant |
2977
|
|
|
|
|
|
|
function, we incorporate the word C<fisher> in the name of the quality measure. |
2978
|
|
|
|
|
|
|
When the |
2979
|
|
|
|
|
|
|
clusters exhibit significant overlap, the numbers produced by this quality measure |
2980
|
|
|
|
|
|
|
tend to be generally meaningless. |
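
For reference, the scatter matrices that enter this measure are the standard
ones (standard textbook notation, not a verbatim transcription of the
module's internals):

    SW  =  sum_k  prior_k * Covariance_k
    SB  =  sum_k  prior_k * (mean_k - global_mean) * (mean_k - global_mean)'
    quality  =  trace( SW^-1 . SB )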

=head1 METHODS

The module provides the following methods for EM based clustering, for cluster
visualization, for data visualization, and for the generation of data for testing a
clustering algorithm:

=over

=item B<new()>

A call to C<new()> constructs a new instance of the
C<Algorithm::ExpectationMaximization> class. A typical form of this call when you
want to use the random option for seeding the algorithm is:

    my $clusterer = Algorithm::ExpectationMaximization->new(
                        datafile            => $datafile,
                        mask                => $mask,
                        K                   => 3,
                        max_em_iterations   => 300,
                        seeding             => 'random',
                        terminal_output     => 1,
                    );

where C<K> is the expected number of clusters and C<max_em_iterations> the maximum
number of EM iterations that you want to allow until convergence is achieved.
Depending on your dataset and on the choice of the initial seeds, the actual number
of iterations used could be as few as 10 or as many as the full 300. The output
produced by the algorithm shows the actual number of iterations used to arrive at
convergence.

The data file supplied through the C<datafile> option is expected to contain entries
in the following format

    c20  0  10.7087017086940  9.63528386251712  10.9512155258108 ...
    c7   0  12.8025925026787  10.6126270065785  10.5228482095349 ...
    b9   0  7.60118206283120  5.05889245193079  5.82841781759102 ...
    ....
    ....

where the first column contains the symbolic ID tag for each data record and the rest
of the columns the numerical information. Which columns are actually used for
clustering is decided by the string value of the mask. For example, if we wanted to
cluster on the basis of the entries in just the 3rd, the 4th, and the 5th columns
above, the mask value would be C<N0111>, where the character C<N> indicates that the
ID tag is in the first column, the character C<0> that the second column is to be
ignored, and the C<1>'s that follow that the 3rd, the 4th, and the 5th columns are to
be used for clustering.
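
To make the mask semantics concrete, here is how the characters of the mask C<N0111>
used above would line up with the columns of the first sample record (the mask string
itself is only an illustration; use whatever pattern matches your own columns):

    column:   1      2    3                 4                 5
    record:   c20    0    10.7087017086940  9.63528386251712  10.9512155258108
    mask:     N      0    1                 1                 1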

If instead of random seeding you wish to use the kmeans based seeding, just replace
the value C<random> supplied for the C<seeding> option by C<kmeans>. You can also do
manual seeding by designating a specified set of data elements to serve as cluster
seeds. The call to the constructor in this case looks like

    my $clusterer = Algorithm::ExpectationMaximization->new(
                        datafile            => $datafile,
                        mask                => $mask,
                        K                   => 3,
                        max_em_iterations   => 300,
                        seeding             => 'manual',
                        seed_tags           => ['a26', 'b53', 'c49'],
                        terminal_output     => 1,
                    );

where the option C<seed_tags> is set to an anonymous array of symbolic names
associated with the data elements.

If you know the class priors, you can supply them through an additional option to the
constructor that looks like

    class_priors    => [0.6, 0.2, 0.2],

for the case of C<K> equal to 3. B<This would be a useful thing to do only for the
case of manual seeding.> If you go for manual seeding, the order in which the priors
are expressed should correspond to the order of the manually chosen tags supplied
through the C<seed_tags> option.
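
Putting the manual seeding options and the class priors together, a constructor call
might look like the following sketch, in which the seed tags and the prior values are
merely placeholders for values appropriate to your own data:

    my $clusterer = Algorithm::ExpectationMaximization->new(
                        datafile            => $datafile,
                        mask                => $mask,
                        K                   => 3,
                        max_em_iterations   => 300,
                        seeding             => 'manual',
                        seed_tags           => ['a26', 'b53', 'c49'],
                        class_priors        => [0.6, 0.2, 0.2],
                        terminal_output     => 1,
                    );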

Note that the parameter C<terminal_output> is boolean; when not supplied in the call
to C<new()> it defaults to 0. When set, this parameter displays useful information in
the window of the terminal screen in which you invoke the algorithm.

=item B<read_data_from_file()>

    $clusterer->read_data_from_file()

This is a required call after the constructor is invoked. As you would expect, this
call reads in the data for clustering.

=item B<seed_the_clusters()>

    $clusterer->seed_the_clusters();

This is also a required call. It processes the option you supplied for C<seeding> in
the constructor call to choose the data elements for seeding the C<K> clusters.

=item B<EM()>

    $clusterer->EM();

This is the workhorse of the module, as you would expect. The means, the covariances,
and the priors estimated by this method are stored in instance variables that are
subsequently accessed by other methods for the purpose of displaying the clusters,
the probability distributions, etc.

=item B<run_bayes_classifier()>

    $clusterer->run_bayes_classifier();

Using the posterior probability distributions estimated by the C<EM()> method, this
method partitions the data into the C<K> disjoint clusters using the naive Bayes'
classifier.

=item B<return_disjoint_clusters()>

    my $clusters = $clusterer->return_disjoint_clusters();

This allows you to access the clusters obtained with the application of the naive
Bayes' classifier in your own scripts. If, say, you wanted to see the data records
placed in each cluster, you could subsequently invoke the following loop in your own
script:

    foreach my $index (0..@$clusters-1) {
        print "Cluster $index (Naive Bayes): @{$clusters->[$index]}\n\n";
    }

where C<$clusters> holds the array reference returned by the call to
C<return_disjoint_clusters()>.

=item B<write_naive_bayes_clusters_to_files()>

    $clusterer->write_naive_bayes_clusters_to_files();

This method writes the clusters obtained by applying the naive Bayes' classifier to
disk files, one cluster per file. What is written out to each file consists of the
symbolic names of the data records that belong to the cluster corresponding to that
file. The clusters are placed in files with names like

    naive_bayes_cluster1.txt
    naive_bayes_cluster2.txt
    ...

=item B<return_clusters_with_posterior_probs_above_threshold($theta1)>

    my $theta1 = 0.2;
    my $posterior_prob_clusters =
        $clusterer->return_clusters_with_posterior_probs_above_threshold($theta1);

This method returns a reference to an array of C<K> anonymous arrays, each consisting
of the symbolic names for the data records where the posterior class probability
exceeds the threshold as specified by C<$theta1>. Subsequently, you can access each
of these posterior-probability based clusters through a loop construct such as

    foreach my $index (0..@$posterior_prob_clusters-1) {
        print "Cluster $index (based on posterior probs exceeding $theta1): " .
              "@{$posterior_prob_clusters->[$index]}\n\n";
    }

=item B<write_posterior_prob_clusters_above_threshold_to_files($theta1)>

    $clusterer->write_posterior_prob_clusters_above_threshold_to_files($theta1);

This call writes out the posterior-probability based soft clusters to disk files. As
in the previous method, the threshold C<$theta1> sets the probability threshold for
deciding which data elements belong to a cluster. These clusters are placed in files
with names like

    posterior_prob_cluster1.txt
    posterior_prob_cluster2.txt
    ...

=item B<return_individual_class_distributions_above_given_threshold($theta2)>

    my $theta2 = 0.00001;
    my $class_distributions =
        $clusterer->return_individual_class_distributions_above_given_threshold($theta2);

This is the method to call if you wish to see the individual Gaussians in your own
script. The method returns a reference to an array of anonymous arrays, with each
anonymous array representing data membership in one Gaussian. Only those data points
are included in each Gaussian where the probability exceeds the threshold C<$theta2>.
Note that the larger the covariance and the higher the data dimensionality, the
smaller this threshold must be for you to see any of the data points in a Gaussian.
After you have accessed the Gaussian mixture in this manner, you can display the data
membership in each Gaussian through the following sort of loop:

    foreach my $index (0..@$class_distributions-1) {
        print "Gaussian Distribution $index (only shows data elements " .
              "whose probabilities exceed the threshold $theta2): " .
              "@{$class_distributions->[$index]}\n\n";
    }

=item B<visualize_clusters($visualization_mask)>

    my $visualization_mask = "11";
    $clusterer->visualize_clusters($visualization_mask);

The visualization mask here does not have to be identical to the one used for
clustering, but must be a subset of that mask. This is convenient for visualizing the
clusters in two- or three-dimensional subspaces of the original space. The subset is
specified by placing C<0>'s in the positions corresponding to the dimensions you do
NOT want to see through visualization. Depending on the mask used, this method
creates a 2D or a 3D scatter plot of the clusters obtained through the naive Bayes'
classification rule.
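
As an illustration of this, for data that was clustered on four numerical columns,
the following two calls would let you look at the same clusters in two different
subspaces (this is only a sketch of the idea just described):

    # 2D scatter plot restricted to the first two data dimensions:
    $clusterer->visualize_clusters("1100");

    # 3D scatter plot restricted to the first three data dimensions:
    $clusterer->visualize_clusters("1110");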

=item B<visualize_distributions($visualization_mask)>

    $clusterer->visualize_distributions($visualization_mask);

This is the method to call if you want to visualize the soft clustering corresponding
to the posterior class probabilities exceeding the threshold specified in the call to
C<return_clusters_with_posterior_probs_above_threshold($theta1)>. Again, depending on
the visualization mask used, you will see either a 2D plot or a 3D scatter plot.

=item B<plot_hardcopy_clusters($visualization_mask)>

    $clusterer->plot_hardcopy_clusters($visualization_mask);

This method creates a PNG file from the C<Gnuplot> created display of the naive
Bayes' clusters obtained from the data. The plotting functionality of C<Gnuplot> is
accessed through the Perl wrappers provided by the C<Graphics::GnuplotIF> module.

=item B<plot_hardcopy_distributions($visualization_mask)>

    $clusterer->plot_hardcopy_distributions($visualization_mask);

This method creates a PNG file from the C<Gnuplot> created display of the clusters
that correspond to the posterior class probabilities exceeding a specified threshold.
The plotting functionality of C<Gnuplot> is accessed through the Perl wrappers
provided by the C<Graphics::GnuplotIF> module.

=item B<display_fisher_quality_vs_iterations()>

    $clusterer->display_fisher_quality_vs_iterations();

This method measures the quality of clustering by calculating the normalized average
squared distance between the cluster centers, the normalization being provided by the
average cluster covariance. See the Description for further details. In general,
this measure is NOT useful for overlapping clusters.

=item B<display_mdl_quality_vs_iterations()>

    $clusterer->display_mdl_quality_vs_iterations();

At the end of each iteration, this method measures the quality of clustering by
calculating its MDL (Minimum Description Length). As stated earlier in the
Description, the MDL measure is a difference of a log-likelihood term for all of the
observed data and a model-complexity penalty term. The smaller the value returned by
this method, the better the clustering.

=item B<return_estimated_priors()>

    my $estimated_priors = $clusterer->return_estimated_priors();
    print "Estimated class priors: @$estimated_priors\n";

This method can be used to access the final values of the class priors as estimated
by the EM algorithm.

=item B<cluster_data_generator()>

    Algorithm::ExpectationMaximization->cluster_data_generator(
                        input_parameter_file         => $parameter_file,
                        output_datafile              => $out_datafile,
                        total_number_of_data_points  => 300
    );

Invoke this class method as shown above for generating multivariate data for
clustering if you wish to play with synthetic data for experimenting with the EM
algorithm. The input parameter file must specify the priors to be used for the
Gaussians, their means, and their covariance matrices. The format of the information
contained in the parameter file must be as shown in the parameter files (such as
C<param1.txt>) provided in the C<examples> directory. It will be easiest for you to
just edit a copy of one of those files for your data generation needs. In addition to
the format of the parameter file, the main constraint you need to observe in
specifying the parameters is that the dimensionality of the covariance matrices must
correspond to the dimensionality of the mean vectors. The multivariate random numbers
are generated by calling the C<Math::Random> module. As you would expect, this module
requires that the covariance matrices you specify in your parameter file be symmetric
and positive definite. Should the covariances in your parameter file not obey this
condition, the C<Math::Random> module will let you know.
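
As a concrete sketch of how this class method might be invoked (the file names below
are only placeholders), you could generate a synthetic dataset from one of the
supplied parameter files and then feed the output file to the constructor through its
C<datafile> option, with a mask appropriate to the dimensionality of the generated
records:

    Algorithm::ExpectationMaximization->cluster_data_generator(
                        input_parameter_file         => 'param1.txt',
                        output_datafile              => 'my_synthetic_data.dat',
                        total_number_of_data_points  => 300,
    );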

=item B<visualize_data($data_visualization_mask)>

    $clusterer->visualize_data($data_visualization_mask);

This is the method to call if you want to visualize the data you plan to cluster with
the EM algorithm. You would need to specify the mask argument in a manner similar to
the visualization of the clusters, as explained earlier.

=item B<plot_hardcopy_data($data_visualization_mask)>

    $clusterer->plot_hardcopy_data($data_visualization_mask);

This method creates a PNG file that can be used to print out a hardcopy of the data
in different 2D and 3D subspaces of the data space. The visualization mask is used to
select the subspace for the PNG image.

=back
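
Putting the required calls together, a minimal end-to-end script might look like the
following sketch; the data file name and the mask are placeholders that you would
replace with values appropriate to your own data:

    use strict;
    use warnings;
    use Algorithm::ExpectationMaximization;

    my $clusterer = Algorithm::ExpectationMaximization->new(
                        datafile           => 'mydata.dat',
                        mask               => 'N0111',
                        K                  => 3,
                        max_em_iterations  => 300,
                        seeding            => 'random',
                        terminal_output    => 1,
                    );
    $clusterer->read_data_from_file();
    $clusterer->seed_the_clusters();
    $clusterer->EM();
    $clusterer->run_bayes_classifier();

    # hard clusters obtained through the naive Bayes' classifier:
    my $clusters = $clusterer->return_disjoint_clusters();
    foreach my $index (0..@$clusters-1) {
        print "Cluster $index (Naive Bayes): @{$clusters->[$index]}\n\n";
    }

    # soft clusters based on the posterior class probabilities:
    my $theta1 = 0.2;
    my $posterior_prob_clusters =
        $clusterer->return_clusters_with_posterior_probs_above_threshold($theta1);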

=head1 HOW THE CLUSTERS ARE OUTPUT

This module produces two different types of clusters: the "hard" clusters and the
"soft" clusters. The hard clusters correspond to the naive Bayes' classification of
the data points on the basis of the Gaussian distributions and the class priors
estimated by the EM algorithm. Such clusters partition the data into disjoint
subsets. On the other hand, the soft clusters correspond to the posterior class
probabilities calculated by the EM algorithm. A data element belongs to a cluster if
its posterior probability for that Gaussian exceeds a threshold.

After the EM algorithm has finished running, the hard clusters are created by
invoking the method C<run_bayes_classifier()> on an instance of the module and then
made user-accessible by calling C<return_disjoint_clusters()>. These clusters may
then be displayed in a terminal window by dereferencing each element of the array
whose reference is returned by C<return_disjoint_clusters()>. The hard clusters can
be written out to disk files by invoking C<write_naive_bayes_clusters_to_files()>.
This method writes out the clusters to files, one cluster per file. What is written
out to each file consists of the symbolic names of the data records that belong to
the cluster corresponding to that file. The clusters are placed in files with names
like

    naive_bayes_cluster1.txt
    naive_bayes_cluster2.txt
    ...

The soft clusters, on the other hand, are created by calling
C<return_clusters_with_posterior_probs_above_threshold($theta1)> on an instance of
the module, where the argument C<$theta1> is the threshold for deciding whether a
data element belongs in a soft cluster. The posterior class probability at a data
element must exceed the threshold for the element to belong to the corresponding
cluster. The soft clusters can be written out to disk files by calling
C<write_posterior_prob_clusters_above_threshold_to_files($theta1)>. As with the hard
clusters, each cluster is placed in a separate file. The filenames for such clusters
look like:

    posterior_prob_cluster1.txt
    posterior_prob_cluster2.txt
    ...

=head1 WHAT IF THE NUMBER OF CLUSTERS IS UNKNOWN?

The module constructor requires that you supply a value for the parameter C<K>, which
is the number of clusters you expect to see in the data. But what if you do not have
a good value for C<K>? Note that it is possible to search for the best C<K> by using
the two clustering quality criteria included in the module. However, I have
intentionally not yet incorporated that feature in the module because it slows down
the execution of the code --- especially when the dimensionality of the data
increases. Nothing, however, prevents you from writing a script --- along the lines
of the "canned_example" scripts in the C<examples> directory --- that would use the
two clustering quality metrics for finding the best choice for C<K> for a given
dataset. Obviously, you will then have to incorporate the call to the constructor in
a loop and check the value of the quality measures for each value of C<K>, as in the
sketch that follows.
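
A bare-bones version of such a search, using only the calls documented above, might
look like the following sketch; here the MDL values displayed for the different
choices of C<K> would have to be compared by hand (or by capturing the terminal
output), since the documented interface displays these values rather than returning
them:

    foreach my $K (2..6) {
        my $clusterer = Algorithm::ExpectationMaximization->new(
                            datafile           => $datafile,
                            mask               => $mask,
                            K                  => $K,
                            max_em_iterations  => 300,
                            seeding            => 'random',
                            terminal_output    => 1,
                        );
        $clusterer->read_data_from_file();
        $clusterer->seed_the_clusters();
        $clusterer->EM();
        print "\n=== Clustering quality for K = $K ===\n";
        $clusterer->display_mdl_quality_vs_iterations();
    }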

=head1 SOME RESULTS OBTAINED WITH THIS MODULE

If you would like to see some results that have been obtained with this module, check
out Section 7 of the author's tutorial report on the Expectation-Maximization
algorithm.

=head1 THE C<examples> DIRECTORY

Becoming familiar with this directory should be your best strategy to become
comfortable with this module (and its future versions). You are urged to start by
executing the following six example scripts:

=over 16

=item I<canned_example1.pl>

This example applies the EM algorithm to the data contained in the datafile named in
the script. The mixture data in the file corresponds to three overlapping Gaussian
components in a star-shaped pattern. The EM based clustering for this data is shown
in two accompanying PNG image files, the former displaying the hard clusters obtained
by using the naive Bayes' classifier and the latter showing the soft clusters
obtained on the basis of the posterior class probabilities at the data points.

=item I<canned_example2.pl>

The mixture data used in this example corresponds to two well-separated, relatively
isotropic Gaussians. The EM based clustering for this data is shown in two
accompanying PNG image files, the former displaying the hard clusters obtained by
using the naive Bayes' classifier and the latter showing the soft clusters obtained
by using the posterior class probabilities at the data points.

=item I<canned_example3.pl>

Like the first example, this example again involves three Gaussians, but now their
means are not co-located. Additionally, we now seed the clusters manually by
specifying three selected data points as the initial guesses for the cluster means.
The EM based clustering for this data is shown in two accompanying PNG image files,
the former displaying the hard clusters obtained by using the naive Bayes' classifier
and the latter showing the soft clusters obtained on the basis of the posterior class
probabilities at the data points.

=item I<canned_example4.pl>

Whereas the three previous examples demonstrated EM based clustering of 2D data, we
now present an example of clustering in 3D. The mixture data used in this example
corresponds to three well-separated but highly anisotropic Gaussians. The EM derived
clustering for this data is shown in two accompanying PNG image files, the former
displaying the hard clusters obtained by using the naive Bayes' classifier and the
latter showing the soft clusters obtained on the basis of the posterior class
probabilities at the data points.

You may also wish to run this example on the data in the CSV file included in the
C<examples> directory.

=item I<canned_example5.pl>

We again demonstrate clustering in 3D, but now we have one Gaussian cluster that
"cuts" through the other two Gaussian clusters. The three Gaussians in this case are
highly overlapping and highly anisotropic. The EM derived clustering for this data is
shown in two accompanying PNG image files, the former displaying the hard clusters
obtained by using the naive Bayes' classifier and the latter showing the soft
clusters obtained through the posterior class probabilities at the data points.

=item I<canned_example6.pl>

This example, added in Version 1.2, demonstrates the use of this module for 1-D data.
In order to visualize the clusters for the 1-D case, we show them through their
respective histograms. The data used in this example consists of two overlapping
Gaussians. The EM derived clustering for this data is shown in two accompanying PNG
image files, the former displaying the hard clusters obtained by using the naive
Bayes' classifier and the latter showing the soft clusters obtained through the
posterior class probabilities at the data points.

=back

Going through the six examples listed above will make you familiar with how to make
the calls to the clustering and the visualization methods. The C<examples> directory
also includes several parameter files with names like

    param1.txt
    param2.txt
    param3.txt
    ...

These were used to generate the synthetic data for which the results are shown in the
C<examples> directory. Just make a copy of one of these files and edit it if you
would like to generate your own multivariate data for clustering. Note that you can
generate data with any dimensionality through appropriate entries in the parameter
file.

=head1 CAVEATS

When you run the scripts in the C<examples> directory, your results will NOT always
look like what I have shown in the PNG image files in the directory. As mentioned
earlier in the Description, the EM algorithm starting from randomly chosen initial
guesses for the cluster means can get stuck in a local maximum.

That raises an interesting question of how one judges the correctness of clustering
results when dealing with real experimental data. For real data, the best approach
is to try the EM algorithm multiple times with all of the seeding options included in
this module. It would be safe to say that, at least in low dimensional spaces and
with sufficient data, a majority of your runs should yield "correct" results.

Also bear in mind that a pure Perl implementation is not meant for the clustering of
very large data files. It is really designed more for researching issues related to
EM based approaches to clustering.

=head1 REQUIRED

This module requires the following three modules:

    Math::Random
    Graphics::GnuplotIF
    Math::GSL::Matrix

the first for generating the multivariate random numbers, the second for the
visualization of the clusters, and the last for access to the Perl wrappers for the
GNU Scientific Library. The C<Matrix> module of this library is used for various
algebraic operations on the covariance matrices of the Gaussians.

=head1 EXPORT

None by design.

=head1 BUGS

Please notify the author if you encounter any bugs. When sending email, please place
the string 'Algorithm EM' in the subject line.

=head1 INSTALLATION

Download the archive from CPAN in any directory of your choice. Unpack the archive
with a command that on a Linux machine would look like:

    tar zxvf Algorithm-ExpectationMaximization-1.22.tar.gz

This will create an installation directory for you whose name will be
C<Algorithm-ExpectationMaximization-1.22>. Enter this directory and execute the
following commands for a standard install of the module if you have root privileges:

    perl Makefile.PL
    make
    make test
    sudo make install

If you do not have root privileges, you can carry out a non-standard install of the
module in any directory of your choice by:

    perl Makefile.PL prefix=/some/other/directory/
    make
    make test
    make install

With a non-standard install, you may also have to set your PERL5LIB environment
variable so that this module can find the required other modules. How you do that
would depend on what platform you are working on. In order to install this module in
a Linux machine on which I use tcsh for the shell, I set the PERL5LIB environment
variable by

    setenv PERL5LIB /some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/

If I used bash, I'd need to declare:

    export PERL5LIB=/some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/

=head1 ACKNOWLEDGMENTS

Version 1.2 is a result of the feedback received from Paul May of University of
Birmingham. Thanks, Paul!

=head1 AUTHOR

Avinash Kak, kak@purdue.edu

If you send email, please place the string "EM Algorithm" in your subject line to get
past my spam filter.

=head1 COPYRIGHT

This library is free software; you can redistribute it and/or modify it under the
same terms as Perl itself.

Copyright 2014 Avinash Kak

=cut