package Algorithm::ExpectationMaximization;

#---------------------------------------------------------------------------
# Copyright (c) 2014 Avinash Kak. All rights reserved. This program is free
# software. You may modify and/or distribute it under the same terms as Perl itself.
# This copyright notice must remain attached to the file.
#
# Algorithm::ExpectationMaximization is a pure Perl implementation for
# Expectation-Maximization based clustering of multi-dimensional data that can be
# modeled as a Gaussian mixture.
# ---------------------------------------------------------------------------

use 5.10.0;
use strict;
use warnings;
use Carp;
use File::Basename;
use Math::Random;
use Graphics::GnuplotIF;
use Math::GSL::Matrix;
use Scalar::Util 'blessed';

our $VERSION = '1.22';

# from perl docs:
my $_num_regex = '^[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?$';

# Constructor:
sub new {
    my ($class, %args) = @_;
    my @params = keys %args;
    croak "\nYou have used a wrong name for a keyword argument " .
          "--- perhaps a misspelling\n"
          if check_for_illegal_params(@params) == 0;
    bless {
        _datafile                       => $args{datafile} || croak("datafile required"),
        _mask                           => $args{mask} || croak("mask required"),
        _K                              => $args{K} || croak("number of clusters required"),
        _terminal_output                => $args{terminal_output} || 0,
        _seeding                        => $args{seeding} || 'random',
        _seed_tags                      => $args{seed_tags} || [],
        _max_em_iterations              => $args{max_em_iterations} || 100,
        _class_priors                   => $args{class_priors} || [],
        _debug                          => $args{debug} || 0,
        _N                              => 0,
        _data                           => {},
        _data_id_tags                   => [],
        _clusters                       => [],
        _cluster_centers                => [],
        _data_dimensions                => 0,
        _cluster_normalizers            => [],
        _cluster_means                  => [],
        _cluster_covariances            => [],
        _class_labels_for_data          => {},
        _class_probs_at_each_data_point => {},
        _expected_class_probs           => {},
        _old_priors                     => [],
        _old_old_priors                 => [],
        _fisher_quality_vs_iteration    => [],
        _mdl_quality_vs_iterations      => [],
    }, $class;
}
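# Illustrative sketch (not part of this module): a typical constructor call using
# the keyword arguments accepted above. The file name, mask, and values shown are
# hypothetical; the mask convention is explained in read_data_from_file_dat() below.
#
#   my $clusterer = Algorithm::ExpectationMaximization->new(
#                       datafile          => 'mydatafile.dat',   # hypothetical file
#                       mask              => 'N111',             # one label field + 3 numeric fields
#                       K                 => 3,
#                       seeding           => 'random',
#                       max_em_iterations => 100,
#                       terminal_output   => 1,
#   );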
|
sub read_data_from_file {
    my $self = shift;
    my $filename = $self->{_datafile};
    $self->read_data_from_file_csv() if $filename =~ /.csv$/;
    $self->read_data_from_file_dat() if $filename =~ /.dat$/;
}

sub read_data_from_file_csv {
    my $self = shift;
    my $numregex = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
    my $filename = $self->{_datafile} || die "you did not specify a file with the data to be clustered";
    my $mask = $self->{_mask};
    my @mask = split //, $mask;
    $self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
    print "data dimensionality: $self->{_data_dimensions} \n" if $self->{_terminal_output};
    open FILEIN, $filename or die "Unable to open $filename: $!";
    die("Aborted. get_training_data_csv() is only for CSV files") unless $filename =~ /\.csv$/;
    local $/ = undef;
    my @all_data = split /\s+/, <FILEIN>;
    my %data_hash = ();
    my @data_tags = ();
    foreach my $record (@all_data) {
        my @splits = split /,/, $record;
        die "\nYour mask size (including `N' and 1's and 0's) does not match\n" .
            "the size of at least one of the data records in the file.\n"
            unless scalar(@mask) == scalar(@splits);
        my $record_name = shift @splits;
        $data_hash{$record_name} = \@splits;
        push @data_tags, $record_name;
    }
    $self->{_data} = \%data_hash;
    $self->{_data_id_tags} = \@data_tags;
    $self->{_N} = scalar @data_tags;
    # Need to make the following call to set the global mean and covariance:
    # my $covariance = $self->estimate_mean_and_covariance(\@data_tags);
    # Need to make the following call to set the global eigenvec eigenval sets:
    # $self->eigen_analysis_of_covariance($covariance);
    if ( defined($self->{_K}) && ($self->{_K} > 0) ) {
        carp "\n\nWARNING: YOUR K VALUE IS TOO LARGE.\n The number of data " .
             "points must satisfy the relation N > 2xK**2 where K is " .
             "the number of clusters requested for the clusters to be " .
             "meaningful $!"
             if ( $self->{_N} < (2 * $self->{_K} ** 2) );
        print "\n\n\n";
    }
}

sub read_data_from_file_dat {
    my $self = shift;
    my $datafile = $self->{_datafile};
    my $mask = $self->{_mask};
    my @mask = split //, $mask;
    $self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
    print "data dimensionality: $self->{_data_dimensions} \n"
        if $self->{_terminal_output};
    open INPUT, $datafile
        or die "unable to open file $datafile: $!";
    chomp( my @raw_data = <INPUT> );
    close INPUT;
    # Transform strings into number data
    foreach my $record (@raw_data) {
        next unless $record;
        next if $record =~ /^#/;
        my @data_fields;
        my @fields = split /\s+/, $record;
        die "\nABORTED: Mask size does not correspond to row record size"
            if $#fields != $#mask;
        my $record_id;
        foreach my $i (0..@fields-1) {
            if ($mask[$i] eq '0') {
                next;
            } elsif ($mask[$i] eq 'N') {
                $record_id = $fields[$i];
            } elsif ($mask[$i] eq '1') {
                push @data_fields, $fields[$i];
            } else {
                die "misformed mask for reading the data file";
            }
        }
        my @nums = map {/$_num_regex/;$_} @data_fields;
        $self->{_data}->{ $record_id } = \@nums;
    }
    my @all_data_ids = keys %{$self->{_data}};
    $self->{_data_id_tags} = \@all_data_ids;
    $self->{_N} = scalar @all_data_ids;
    if ( defined($self->{_K}) && ($self->{_K} > 0) ) {
        carp "\n\nWARNING: YOUR K VALUE IS TOO LARGE.\n The number of data " .
             "points must satisfy the relation N > 2xK**2 where K is " .
             "the number of clusters requested for the clusters to be " .
             "meaningful $!"
             if ( $self->{_N} < (2 * $self->{_K} ** 2) );
    }
}
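# Illustrative sketch (not part of this module): with a hypothetical record such as
#
#   a27   2.341   6.823   0.518
#
# and mask 'N111', the first whitespace-separated field ('a27') becomes the record
# id and the remaining three fields become the numeric data stored for that record;
# a mask character of '0' would drop the corresponding column. Lines that begin
# with '#' are skipped as comments.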
|
# This is the heart of the module --- in the sense that this method implements the EM
# algorithm for estimating the parameters of a Gaussian mixture model for the
# data. In the implementation shown below, we declare convergence for the EM
# algorithm when the change in the class priors over three iterations falls below a
# threshold. The current value of this threshold, as can be seen in the function
# compare_array_floats(), is 0.00001.
sub EM {
    my $self = shift;
    $self->initialize_class_priors();
    for (my $em_iteration=0; $em_iteration < $self->{_max_em_iterations};
                             $em_iteration++) {
        if ($em_iteration == 0) {
            print "\nSeeding the EM algorithm with:\n";
            $self->display_seeding_stats();
            print "\nFinished displaying the seeding information\n";
            print "\nWill print out a dot for each iteration of EM:\n\n";
        }
        my $progress_indicator = $em_iteration % 5 == 0 ? $em_iteration : ".";
        print $progress_indicator;
        foreach my $data_id (@{$self->{_data_id_tags}}) {
            $self->{_class_probs_at_each_data_point}->{$data_id} = [];
            $self->{_expected_class_probs}->{$data_id} = [];
        }
        # Calculate prob(x | C_i) --- this is the prob of data point x as
        # a member of class C_i. You must do this for all K classes:
        foreach my $cluster_index(0..$self->{_K}-1) {
            $self->find_prob_at_each_datapoint_for_given_mean_and_covar(
                        $self->{_cluster_means}->[$cluster_index],
                        $self->{_cluster_covariances}->[$cluster_index] );
        }
        $self->{_cluster_normalizers} = [];
        if ($self->{_debug}) {
            print "\n\nDisplaying prob of a data point vis-a-vis each class:\n\n";
            foreach my $data_id (sort keys %{$self->{_data}}) {
                my $class_probs_at_a_point =
                    $self->{_class_probs_at_each_data_point}->{$data_id};
                print "Class probs for $data_id: @$class_probs_at_a_point\n"
            }
        }
        # Calculate prob(C_i | x) which is the posterior prob of class
        # considered as a r.v. to be C_i at a given point x. For a given
        # x, the sum of such probabilities over all C_i must add up to 1:
        $self->find_expected_classes_at_each_datapoint();
        if ($self->{_debug}) {
            print "\n\nDisplaying expected class probs at each data point:\n\n";
            foreach my $data_id (sort keys %{$self->{_expected_class_probs}}) {
                my $expected_classes_at_a_point =
                    $self->{_expected_class_probs}->{$data_id};
                print "Expected classes $data_id: @$expected_classes_at_a_point\n";
            }
        }
        # 1. UPDATE MEANS:
        my @new_means;
        foreach my $cluster_index(0..$self->{_K}-1) {
            $new_means[$cluster_index] =
                Math::GSL::Matrix->new($self->{_data_dimensions},1);
            $new_means[$cluster_index]->zero();
            foreach my $data_id (keys %{$self->{_data}}) {
                my $data_record = $self->{_data}->{$data_id};
                my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
                $data_vec->set_col(0,$data_record);
                $new_means[$cluster_index] +=
                    $self->{_expected_class_probs}->{$data_id}->[$cluster_index] *
                    $data_vec->copy();
                $self->{_cluster_normalizers}->[$cluster_index] +=
                    $self->{_expected_class_probs}->{$data_id}->[$cluster_index];
            }
            $new_means[$cluster_index] *= 1.0 /
                $self->{_cluster_normalizers}->[$cluster_index];
        }
        if ($self->{_debug}) {
            foreach my $meanvec (@new_means) {
                display_matrix("At EM Iteration $em_iteration, new mean vector is",
                               $meanvec);
            }
        }
        $self->{_cluster_means} = \@new_means;
        # 2. UPDATE COVARIANCES:
        my @new_covariances;
        foreach my $cluster_index(0..$self->{_K}-1) {
            $new_covariances[$cluster_index] =
                Math::GSL::Matrix->new($self->{_data_dimensions},
                                       $self->{_data_dimensions});
            $new_covariances[$cluster_index]->zero();
            my $normalizer = 0;
            foreach my $data_id (keys %{$self->{_data}}) {
                my $data_record = $self->{_data}->{$data_id};
                my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
                $data_vec->set_col(0,$data_record);
                my $mean_subtracted_data =
                    $data_vec - $self->{_cluster_means}->[$cluster_index];
                my $outer_product = outer_product($mean_subtracted_data,
                                                  $mean_subtracted_data);
                $new_covariances[$cluster_index] +=
                    $self->{_expected_class_probs}->{$data_id}->[$cluster_index] *
                    $outer_product;
            }
            $new_covariances[$cluster_index] *=
                1.0 /
                $self->{_cluster_normalizers}->[$cluster_index];
        }
        $self->{_cluster_covariances} = \@new_covariances;
        # 3. UPDATE PRIORS:
        $self->{_old_old_priors} = deep_copy_array( $self->{_old_priors} )
            if @{$self->{_old_priors}} > 0;
        $self->{_old_priors} = deep_copy_array( $self->{_class_priors} );
        foreach my $cluster_index(0..$self->{_K}-1) {
            $self->{_class_priors}->[$cluster_index] =
                $self->{_cluster_normalizers}->[$cluster_index] / $self->{_N};
        }
        my @priors = @{$self->{_class_priors}};
        print "\nUpdated priors: @priors\n\n\n" if $self->{_debug};
        push @{$self->{_fisher_quality_vs_iteration}},
             $self->clustering_quality_fisher();
        push @{$self->{_mdl_quality_vs_iteration}}, $self->clustering_quality_mdl();
        if ( ($em_iteration > 5 && $self->reached_convergence())
             || ($em_iteration == $self->{_max_em_iterations} - 1) ) {
            my @old_old_priors = @{$self->{_old_old_priors}};
            my @old_priors = @{$self->{_old_priors}};
            print "\n\nPrevious to previous priors: @old_old_priors\n";
            print "Previous priors: @old_priors\n";
            print "Current class priors: @{$self->{_class_priors}}\n";
            print "\n\nCONVERGENCE ACHIEVED AT ITERATION $em_iteration\n\n"
                if $em_iteration < $self->{_max_em_iterations} - 1;
            last;
        }
    }
    print "\n\n\n";
}
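# Summary of the updates carried out in each EM iteration above (restating the
# code, not adding to it): with w_i(x) denoting the posterior class probability
# prob(C_i | x) computed by find_expected_classes_at_each_datapoint(),
#
#   mean_i        =  sum_x  w_i(x) * x                             /  sum_x w_i(x)
#   covariance_i  =  sum_x  w_i(x) * (x - mean_i)(x - mean_i)^T    /  sum_x w_i(x)
#   prior_i       =  ( sum_x w_i(x) ) / N
#
# The sums run over all N data points; sum_x w_i(x) is what the code stores in
# $self->{_cluster_normalizers}->[$i].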
|
sub reached_convergence {
    my $self = shift;
    return 1 if compare_array_floats($self->{_old_old_priors},
                                     $self->{_old_priors})
                &&
                compare_array_floats($self->{_old_priors},
                                     $self->{_class_priors});
    return 0;
}

# Classify the data into disjoint clusters using the Naive Bayes' classification:
sub run_bayes_classifier {
    my $self = shift;
    $self->classify_all_data_tuples_bayes($self->{_cluster_means},
                                          $self->{_cluster_covariances});
}

# Should NOT be called before run_bayes_classifier is run
sub return_disjoint_clusters {
    my $self = shift;
    return $self->{_clusters};
}

sub return_clusters_with_posterior_probs_above_threshold {
    my $self = shift;
    my $theta = shift;
    my @class_distributions;
    foreach my $cluster_index (0..$self->{_K}-1) {
        push @class_distributions, [];
    }
    foreach my $data_tag (@{$self->{_data_id_tags}}) {
        foreach my $cluster_index (0..$self->{_K}-1) {
            push @{$class_distributions[$cluster_index]}, $data_tag
                if $self->{_expected_class_probs}->{$data_tag}->[$cluster_index]
                   > $theta;
        }
    }
    return \@class_distributions;
}

sub return_individual_class_distributions_above_given_threshold {
    my $self = shift;
    my $theta = shift;
    my @probability_distributions;
    foreach my $cluster_index (0..$self->{_K}-1) {
        push @probability_distributions, [];
    }
    foreach my $cluster_index (0..$self->{_K}-1) {
        my $mean_vec = $self->{_cluster_means}->[$cluster_index];
        my $covar = $self->{_cluster_covariances}->[$cluster_index];
        foreach my $data_id (keys %{$self->{_data}}) {
            my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
            $data_vec->set_col( 0, $self->{_data}->{$data_id});
            my $datavec_minus_mean = $data_vec - $mean_vec;
            display_matrix( "datavec minus mean is ", $datavec_minus_mean )
                if $self->{_debug};
            my $exponent = undef;
            if ($self->{_data_dimensions} > 1) {
                $exponent = -0.5 * vector_matrix_multiply(
                                       transpose($datavec_minus_mean),
                                       matrix_vector_multiply($covar->inverse(), $datavec_minus_mean ) );
            } else {
                my @var_inverse = $covar->inverse()->as_list;
                my $var_inverse_val = $var_inverse[0];
                my @data_minus_mean = $datavec_minus_mean->as_list;
                my $data_minus_mean_val = $data_minus_mean[0];
                $exponent = -0.5 * ($data_minus_mean_val ** 2) * $var_inverse_val;
            }
            print "\nThe value of the exponent is: $exponent\n\n" if $self->{_debug};
            my $coefficient = 1.0 /
                ( (2 * $Math::GSL::Const::M_PI)**$self->{_data_dimensions} * sqrt($covar->det()) );
            my $prob = $coefficient * exp($exponent);
            push @{$probability_distributions[$cluster_index]}, $data_id
                if $prob > $theta;
        }
    }
    return \@probability_distributions;
}

sub return_estimated_priors {
    my $self = shift;
    return $self->{_class_priors};
}

# Calculates the MDL (Minimum Description Length) clustering criterion according to
# Rissanen. (J. Rissanen: "Modeling by Shortest Data Description," Automatica, 1978,
# and "A Universal Prior for Integers and Estimation by Minimum Description Length,"
# Annals of Statistics, 1983.) The MDL criterion is a difference of a log-likelihood
# term for all of the observed data and a model-complexity penalty term. In general,
# both the log-likelihood and the model-complexity terms increase as the number of
# clusters increases. The form of the MDL criterion used in the implementation below
# uses for the penalty term the Bayesian Information Criterion (BIC) of G. Schwarz,
# "Estimating the Dimension of a Model," The Annals of Statistics, 1978. In
# general, the smaller the value of the MDL quality measure calculated below, the
# better the clustering of the data.
sub clustering_quality_mdl {
    my $self = shift;
    # Calculate the inverses of all of the covariance matrices in order to avoid
    # having to calculate them repeatedly inside the inner 'foreach' loop in the
    # main part of this method. Here we go:
    my @covar_inverses;
    foreach my $cluster_index (0..$self->{_K}-1) {
        my $covar = $self->{_cluster_covariances}->[$cluster_index];
        push @covar_inverses, $covar->inverse();
    }
    # For the clustering quality, first calculate the log-likelihood of all the
    # observed data:
    my $log_likelihood = 0;
    foreach my $tag (@{$self->{_data_id_tags}}) {
        my $likelihood_for_each_tag = 0;
        foreach my $cluster_index (0..$self->{_K}-1) {
            my $mean_vec = $self->{_cluster_means}->[$cluster_index];
            my $covar = $self->{_cluster_covariances}->[$cluster_index];
            my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
            $data_vec->set_col( 0, $self->{_data}->{$tag});
            my $datavec_minus_mean = $data_vec - $mean_vec;
            my $exponent = undef;
            if ($self->{_data_dimensions} > 1) {
                $exponent = -0.5 * vector_matrix_multiply(
                                       transpose($datavec_minus_mean),
                                       matrix_vector_multiply($covar_inverses[$cluster_index],
                                                              $datavec_minus_mean ) );
            } else {
                my @var_inverse = $covar_inverses[$cluster_index]->as_list;
                my $var_inverse_val = $var_inverse[0];
                my @data_minus_mean = $datavec_minus_mean->as_list;
                my $data_minus_mean_val = $data_minus_mean[0];
                $exponent = -0.5 * ($data_minus_mean_val ** 2) * $var_inverse_val;
            }
            next if $covar->det() < 0;
            my $coefficient = 1.0 /
                ( (2 * $Math::GSL::Const::M_PI)**$self->{_data_dimensions}
                  * sqrt($covar->det()) );
            my $prob = $coefficient * exp($exponent);
            $likelihood_for_each_tag +=
                $prob * $self->{_class_priors}->[$cluster_index];
        }
        $log_likelihood += log( $likelihood_for_each_tag );
    }
    # Now calculate the model complexity penalty. $L is the total number of
    # parameters it takes to specify a mixture of K Gaussians. If d is the
    # dimensionality of the data space, the covariance matrix of each Gaussian takes
    # (d**2 -d)/2 + d = d(d+1)/2 parameters since this matrix must be symmetric. And
    # then you need d mean value parameters, and one prior probability parameter
    # for the Gaussian. So $L = K[1 + d + d(d+1)/2] - 1 where the final '1' that
    # is subtracted is to account for the normalization on the class priors.
    my $L = (0.5 * $self->{_K} *
             ($self->{_data_dimensions}**2 + 3*$self->{_data_dimensions} + 2) ) - 1;
    my $model_complexity_penalty = 0.5 * $L * log( $self->{_N} );
    my $mdl_criterion = -1 * $log_likelihood + $model_complexity_penalty;
    return $mdl_criterion;
}
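# Restating the criterion computed above: with L = K*(1 + d + d*(d+1)/2) - 1
# independent parameters for a mixture of K Gaussians in d dimensions,
#
#   MDL  =  - log( likelihood of all N data points )  +  0.5 * L * log(N)
#
# Smaller values indicate a better clustering of the data.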
|
# For our second measure of clustering quality, we use `trace( SW^-1 . SB)' where SW
# is the within-class scatter matrix, more commonly denoted S_w, and SB the
# between-class scatter matrix, more commonly denoted S_b (the underscore means
# subscript). This measure can be thought of as the normalized average distance
# between the clusters, the normalization being provided by average cluster
# covariance SW^-1. Therefore, the larger the value of this quality measure, the
# better the separation between the clusters. Since this measure has its roots in
# the Fisher linear discriminant function, we incorporate the word 'fisher' in the
# name of the quality measure. Note that this measure is good only when the clusters
# are disjoint. When the clusters exhibit significant overlap, the numbers produced
# by this quality measure tend to be generally meaningless. As an extreme case,
# let's say your data was produced by a set of Gaussians, all with the same mean
# vector, but each with a distinct covariance. For this extreme case, this measure
# will produce a value close to zero --- depending on the accuracy with which the
# means are estimated --- even when your clusterer is doing a good job of identifying
# the individual clusters.
sub clustering_quality_fisher {
    my $self = shift;
    my @cluster_quality_indices;
    my $fisher_trace = 0;
    my $S_w =
        Math::GSL::Matrix->new($self->{_data_dimensions}, $self->{_data_dimensions});
    $S_w->zero;
    my $S_b =
        Math::GSL::Matrix->new($self->{_data_dimensions}, $self->{_data_dimensions});
    $S_b->zero;
    my $global_mean = Math::GSL::Matrix->new($self->{_data_dimensions},1);
    $global_mean->zero;
    foreach my $cluster_index(0..$self->{_K}-1) {
        $global_mean = $self->{_class_priors}->[$cluster_index] *
                       $self->{_cluster_means}->[$cluster_index];
    }
    foreach my $cluster_index(0..$self->{_K}-1) {
        $S_w += $self->{_cluster_covariances}->[$cluster_index] *
                $self->{_class_priors}->[$cluster_index];
        my $class_mean_minus_global_mean = $self->{_cluster_means}->[$cluster_index]
                                           - $global_mean;
        my $outer_product = outer_product( $class_mean_minus_global_mean,
                                           $class_mean_minus_global_mean );
        $S_b += $self->{_class_priors}->[$cluster_index] * $outer_product;
    }
    my $fisher = matrix_multiply($S_w->inverse, $S_b);
    return $fisher unless defined blessed($fisher);
    return matrix_trace($fisher);
}
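# Restating the quantity computed above:
#
#   S_w      =  sum_i  prior_i * C_i                        (within-class scatter)
#   S_b      =  sum_i  prior_i * (m_i - m)(m_i - m)^T       (between-class scatter)
#   quality  =  trace( S_w^{-1} * S_b )
#
# where C_i and m_i are the covariance and mean of cluster i, and m is the global
# mean formed from the prior-weighted cluster means. Larger values indicate
# better-separated clusters.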
|
sub display_seeding_stats {
    my $self = shift;
    foreach my $cluster_index(0..$self->{_K}-1) {
        print "\nSeeding for cluster $cluster_index:\n";
        my $mean = $self->{_cluster_means}->[$cluster_index];
        display_matrix("The mean is: ", $mean);
        my $covariance = $self->{_cluster_covariances}->[$cluster_index];
        display_matrix("The covariance is: ", $covariance);
    }
}

sub display_fisher_quality_vs_iterations {
    my $self = shift;
    print "\n\nFisher Quality vs. Iterations: " .
          "@{$self->{_fisher_quality_vs_iteration}}\n\n";
}

sub display_mdl_quality_vs_iterations {
    my $self = shift;
    print "\n\nMDL Quality vs. Iterations: @{$self->{_mdl_quality_vs_iteration}}\n\n";
}

sub find_prob_at_each_datapoint_for_given_mean_and_covar {
    my $self = shift;
    my $mean_vec_ref = shift;
    my $covar_ref = shift;
    foreach my $data_id (keys %{$self->{_data}}) {
        my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
        $data_vec->set_col( 0, $self->{_data}->{$data_id});
        if ($self->{_debug}) {
            display_matrix("data vec in find prob function", $data_vec);
            display_matrix("mean vec in find prob function", $mean_vec_ref);
            display_matrix("covariance in find prob function", $covar_ref);
        }
        my $datavec_minus_mean = $data_vec - $mean_vec_ref;
        display_matrix( "datavec minus mean is ", $datavec_minus_mean ) if $self->{_debug};
        my $exponent = undef;
        if ($self->{_data_dimensions} > 1) {
            $exponent = -0.5 * vector_matrix_multiply( transpose($datavec_minus_mean),
                            matrix_vector_multiply( $covar_ref->inverse(), $datavec_minus_mean ) );
        } elsif (defined blessed($covar_ref)) {
            my @data_minus_mean = $datavec_minus_mean->as_list;
            my $data_minus_mean_val = $data_minus_mean[0];
            my @covar_as_matrix = $covar_ref->as_list;
            my $covar_val = $covar_as_matrix[0];
            $exponent = -0.5 * ($data_minus_mean_val ** 2) / $covar_val;
        } else {
            my @data_minus_mean = $datavec_minus_mean->as_list;
            my $data_minus_mean_val = $data_minus_mean[0];
            $exponent = -0.5 * ($data_minus_mean_val ** 2) / $covar_ref;
        }
        print "\nThe value of the exponent is: $exponent\n\n" if $self->{_debug};
        my $coefficient = undef;
        if ($self->{_data_dimensions} > 1) {
            $coefficient = 1.0 / sqrt( ((2 * $Math::GSL::Const::M_PI) ** $self->{_data_dimensions}) *
                                       $covar_ref->det() );
        } elsif (!defined blessed($covar_ref)) {
            $coefficient = 1.0 / sqrt(2 * $covar_ref * $Math::GSL::Const::M_PI);
        } else {
            my @covar_as_matrix = $covar_ref->as_list;
            my $covar_val = $covar_as_matrix[0];
            $coefficient = 1.0 / sqrt(2 * $covar_val * $Math::GSL::Const::M_PI);
        }
        my $prob = $coefficient * exp($exponent);
        push @{$self->{_class_probs_at_each_data_point}->{$data_id}}, $prob;
    }
}
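# Restating the density evaluated above: for data dimensionality d > 1 the
# probability stored for each data point x is the multivariate Gaussian
#
#   p(x | C_i)  =  exp( -0.5 * (x - m)^T Sigma^{-1} (x - m) )
#                  / sqrt( (2*pi)^d * det(Sigma) )
#
# with m and Sigma the mean vector and covariance matrix passed in; the d == 1
# branches reduce this to the scalar Gaussian.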
|
sub find_expected_classes_at_each_datapoint {
    my $self = shift;
    my @priors = @{$self->{_class_priors}};
    foreach my $data_id (sort keys %{$self->{_class_probs_at_each_data_point}}) {
        my $numerator =
            vector_2_vector_multiply(
                $self->{_class_probs_at_each_data_point}->{$data_id},
                $self->{_class_priors} );
        my $sum = 0;
        foreach my $part (@$numerator) {
            $sum += $part;
        }
        $self->{_expected_class_probs}->{$data_id} = [map $_/$sum, @{$numerator}];
    }
}
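# Restating the normalization above: for each data point x,
#
#   prob(C_i | x)  =  prob(x | C_i) * prior_i  /  sum_j prob(x | C_j) * prior_j
#
# so that the posterior class probabilities at every data point sum to 1.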
|
sub initialize_class_priors {
    my $self = shift;
    if (@{$self->{_class_priors}} == 0) {
        my $prior = 1.0 / $self->{_K};
        foreach my $class_index (0..$self->{_K}-1) {
            push @{$self->{_class_priors}}, $prior;
        }
    }
    die "Mismatch between number of values for class priors " .
        "and the number of clusters expected"
        unless @{$self->{_class_priors}} == $self->{_K};
    my $sum = 0;
    foreach my $prior (@{$self->{_class_priors}}) {
        $sum += $prior;
    }
    die "Your priors in the constructor call do not add up to 1"
        unless abs($sum - 1) < 0.001;
    print "\nInitially assumed class priors are: @{$self->{_class_priors}}\n";
}

sub estimate_class_priors {
    my $self = shift;
    foreach my $datatag (keys %{$self->{_data}}) {
        my $class_label = $self->{_class_labels}->{$datatag};
        $self->{_class_priors}[$class_label]++;
    }
    foreach my $prior (@{$self->{_class_priors}}) {
        $prior /= $self->{_total_number_data_tuples};
    }
    foreach my $prior (@{$self->{_class_priors}}) {
        print "class priors: @{$self->{_class_priors}}\n";
    }
}

sub classify_all_data_tuples_bayes {
    my $self = shift;
    my $mean_vecs_ref = shift;
    my $covariances_ref = shift;
    my @new_clusters;
    foreach my $index (0..$self->{_K}-1) {
        push @new_clusters, [];
    }
    foreach my $data_id (@{$self->{_data_id_tags}}) {
        my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
        $data_vec->set_col( 0, deep_copy_array($self->{_data}->{$data_id}));
        my $cluster_index_for_tuple =
            $self->classify_a_data_point_bayes($data_vec,
                                               $mean_vecs_ref, $covariances_ref);
        $self->{_class_labels}->{$data_id} = $cluster_index_for_tuple;
        push @{$new_clusters[$cluster_index_for_tuple]}, $data_id;
    }
    $self->{_clusters} = \@new_clusters;
}

sub classify_a_data_point_bayes {
    my $self = shift;
    my $data_vec = shift;
    my $mean_vecs_ref = shift;
    my $covariances_ref = shift;
    my @cluster_mean_vecs = @$mean_vecs_ref;
    my @cluster_covariances = @$covariances_ref;
    my @log_likelihoods;
    foreach my $cluster_index (0..@cluster_mean_vecs-1) {
        my $mean = $cluster_mean_vecs[$cluster_index];
        my $covariance = $cluster_covariances[$cluster_index];
        my $datavec_minus_mean = $data_vec - $mean;
        my $log_likely = undef;
        if ($self->{_data_dimensions} > 1) {
            $log_likely = -0.5 * vector_matrix_multiply(
                                     transpose($datavec_minus_mean),
                                     matrix_vector_multiply( $covariance->inverse(),
                                                             $datavec_minus_mean ) );
        } else {
            my @data_minus_mean = $datavec_minus_mean->as_list;
            my $data_minus_mean_val = $data_minus_mean[0];
            my @covar_as_matrix = $covariance->as_list;
            my $covar_val = $covar_as_matrix[0];
            $log_likely = -0.5 * ($data_minus_mean_val ** 2) / $covar_val;
        }
        my $posterior_log_likely = $log_likely +
                                   log( $self->{_class_priors}[$cluster_index] );
        push @log_likelihoods, $posterior_log_likely;
    }
    my ($minlikely, $maxlikely) = minmax(\@log_likelihoods);
    my $cluster_index_for_data_point =
        get_index_at_value( $maxlikely, \@log_likelihoods );
    return $cluster_index_for_data_point;
}
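# Restating the decision rule above: each data point is assigned to the cluster
# with the largest value of
#
#   -0.5 * (x - m_i)^T Sigma_i^{-1} (x - m_i)  +  log( prior_i )
#
# i.e. the exponent of the Gaussian for cluster i plus the log of its prior.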
|
sub find_cluster_means_and_covariances {
    my $clusters = shift;
    my $data_dimensions = find_data_dimensionality($clusters);
    my (@cluster_mean_vecs, @cluster_covariances);
    foreach my $cluster_index (0..@$clusters-1) {
        my ($num_rows,$num_cols) =
            ($data_dimensions,scalar(@{$clusters->[$cluster_index]}));
        print "\nFor cluster $cluster_index: rows: $num_rows and cols: $num_cols\n";
        my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
        my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
        my $col_index = 0;
        foreach my $ele (@{$clusters->[$cluster_index]}) {
            $matrix->set_col($col_index++, $ele);
        }
        # display_matrix( "Displaying cluster matrix", $matrix );
        foreach my $j (0..$num_cols-1) {
            $mean_vec += $matrix->col($j);
        }
        $mean_vec *= 1.0 / $num_cols;
        push @cluster_mean_vecs, $mean_vec;
        display_matrix( "Displaying the mean vector", $mean_vec );
        foreach my $j (0..$num_cols-1) {
            my @new_col = ($matrix->col($j) - $mean_vec)->as_list;
            $matrix->set_col($j, \@new_col);
        }
        # display_matrix("Displaying mean subtracted data as a matrix", $matrix );
        my $transposed = transpose( $matrix );
        # display_matrix("Displaying transposed matrix",$transposed);
        my $covariance = matrix_multiply( $matrix, $transposed );
        $covariance *= 1.0 / $num_cols;
        push @cluster_covariances, $covariance;
        display_matrix("Displaying the cluster covariance", $covariance );
    }
    return (\@cluster_mean_vecs, \@cluster_covariances);
}

sub find_data_dimensionality {
    my $clusters = shift;
    my @first_cluster = @{$clusters->[0]};
    my @first_data_element = @{$first_cluster[0]};
    return scalar(@first_data_element);
}

sub find_seed_centered_covariances {
    my $self = shift;
    my $seed_tags = shift;
    my (@seed_mean_vecs, @seed_based_covariances);
    foreach my $seed_tag (@$seed_tags) {
        my ($num_rows,$num_cols) = ($self->{_data_dimensions}, $self->{_N});
        my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
        my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
        $mean_vec->set_col(0, $self->{_data}->{$seed_tag});
        push @seed_mean_vecs, $mean_vec;
        display_matrix( "Displaying the seed mean vector", $mean_vec );
        my $col_index = 0;
        foreach my $tag (@{$self->{_data_id_tags}}) {
            $matrix->set_col($col_index++, $self->{_data}->{$tag});
        }
        foreach my $j (0..$num_cols-1) {
            my @new_col = ($matrix->col($j) - $mean_vec)->as_list;
            $matrix->set_col($j, \@new_col);
        }
        my $transposed = transpose( $matrix );
        my $covariance = matrix_multiply( $matrix, $transposed );
        $covariance *= 1.0 / $num_cols;
        push @seed_based_covariances, $covariance;
        display_matrix("Displaying the seed covariance", $covariance )
            if $self->{_debug};
    }
    return (\@seed_mean_vecs, \@seed_based_covariances);
}

# The most popular seeding mode for EM is random. We include two other seeding modes
# --- kmeans and manual --- since they do produce good results for specialized cases.
# For example, when the clusters in your data are non-overlapping and not too
# anisotropic, the kmeans based seeding should work at least as well as the random
# seeding. In such cases --- AND ONLY IN SUCH CASES --- the kmeans based seeding has
# the advantage of avoiding the EM algorithm's problem of getting stuck in a local
# maximum.
sub seed_the_clusters {
    my $self = shift;
    if ($self->{_seeding} eq 'random') {
        my @covariances;
        my @means;
        my @all_tags = @{$self->{_data_id_tags}};
        my @seed_tags;
        foreach my $i (0..$self->{_K}-1) {
            push @seed_tags, $all_tags[int rand( $self->{_N} )];
        }
        print "Random Seeding: Randomly selected seeding tags are @seed_tags\n\n";
        my ($seed_means, $seed_covars) =
            $self->find_seed_centered_covariances(\@seed_tags);
        $self->{_cluster_means} = $seed_means;
        $self->{_cluster_covariances} = $seed_covars;
    } elsif ($self->{_seeding} eq 'kmeans') {
        $self->kmeans();
        my $clusters = $self->{_clusters};
        my @dataclusters;
        foreach my $index (0..@$clusters-1) {
            push @dataclusters, [];
        }
        foreach my $cluster_index (0..$self->{_K}-1) {
            foreach my $tag (@{$clusters->[$cluster_index]}) {
                my $data = $self->{_data}->{$tag};
                push @{$dataclusters[$cluster_index]}, deep_copy_array($data);
            }
        }
        ($self->{_cluster_means}, $self->{_cluster_covariances}) =
            find_cluster_means_and_covariances(\@dataclusters);
    } elsif ($self->{_seeding} eq 'manual') {
        die "You have not supplied the seeding tags for the option \"manual\""
            unless @{$self->{_seed_tags}} > 0;
        print "Manual Seeding: Seed tags are @{$self->{_seed_tags}}\n\n";
        foreach my $tag (@{$self->{_seed_tags}}) {
            die "invalid tag used for manual seeding"
                unless exists $self->{_data}->{$tag};
        }
        my ($seed_means, $seed_covars) =
            $self->find_seed_centered_covariances($self->{_seed_tags});
        $self->{_cluster_means} = $seed_means;
        $self->{_cluster_covariances} = $seed_covars;
    } else {
        die "Incorrect call syntax used. See documentation.";
    }
}
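# Illustrative sketch (not part of this module): the three seeding modes above are
# selected through constructor arguments, for example
#
#   seeding => 'random'                                      # default
#   seeding => 'kmeans'                                      # kmeans-based seeding
#   seeding => 'manual', seed_tags => ['a10', 'a35', 'b3']   # hypothetical record labels
#
# For manual seeding, every supplied tag must be the label of a record present in
# the data file (see the existence check above).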
|
789
|
|
|
|
|
|
|
|
|
790
|
|
|
|
|
|
|
# This is the top-level method for kmeans based initialization of the EM |
|
791
|
|
|
|
|
|
|
# algorithm. The means and the covariances returned by kmeans are used to seed the EM |
|
792
|
|
|
|
|
|
|
# algorithm. |
|
793
|
|
|
|
|
|
|
sub kmeans { |
|
794
|
|
|
|
|
|
|
my $self = shift; |
|
795
|
|
|
|
|
|
|
my $K = $self->{_K}; |
|
796
|
|
|
|
|
|
|
$self->cluster_for_fixed_K_single_smart_try($K); |
|
797
|
|
|
|
|
|
|
if ((defined $self->{_clusters}) && (defined $self->{_cluster_centers})){ |
|
798
|
|
|
|
|
|
|
return ($self->{_clusters}, $self->{_cluster_centers}); |
|
799
|
|
|
|
|
|
|
} else { |
|
800
|
|
|
|
|
|
|
die "kmeans clustering failed."; |
|
801
|
|
|
|
|
|
|
} |
|
802
|
|
|
|
|
|
|
} |
|
803
|
|
|
|
|
|
|
|
|
804
|
|
|
|
|
|
|
# Used by the kmeans algorithm for the initialization of the EM iterations. We do |
|
805
|
|
|
|
|
|
|
# initial kmeans cluster seeding by subjecting the data to principal components |
|
806
|
|
|
|
|
|
|
# analysis in order to discover the direction of maximum variance in the data space. |
|
807
|
|
|
|
|
|
|
# Subsequently, we try to find the K largest peaks along this direction. The |
|
808
|
|
|
|
|
|
|
# coordinates of these peaks serve as the seeds for the K clusters. |
|
809
|
|
|
|
|
|
|
sub cluster_for_fixed_K_single_smart_try { |
|
810
|
|
|
|
|
|
|
my $self = shift; |
|
811
|
|
|
|
|
|
|
my $K = shift; |
|
812
|
|
|
|
|
|
|
print "Clustering for K = $K\n" if $self->{_terminal_output}; |
|
813
|
|
|
|
|
|
|
my ($clusters, $cluster_centers) = |
|
814
|
|
|
|
|
|
|
$self->cluster_for_given_K($K); |
|
815
|
|
|
|
|
|
|
$self->{_clusters} = $clusters; |
|
816
|
|
|
|
|
|
|
$self->{_cluster_centers} = $cluster_centers; |
|
817
|
|
|
|
|
|
|
} |
|
818
|
|
|
|
|
|
|
|
|
819
|
|
|
|
|
|
|
# Used by the kmeans part of the code for the initialization of the EM algorithm: |
|
820
|
|
|
|
|
|
|
sub cluster_for_given_K { |
|
821
|
|
|
|
|
|
|
my $self = shift; |
|
822
|
|
|
|
|
|
|
my $K = shift; |
|
823
|
|
|
|
|
|
|
my $cluster_centers = $self->get_initial_cluster_centers($K); |
|
824
|
|
|
|
|
|
|
my $clusters = $self->assign_data_to_clusters_initial($cluster_centers); |
|
825
|
|
|
|
|
|
|
my $cluster_nonexistant_flag = 0; |
|
826
|
|
|
|
|
|
|
foreach my $trial (0..2) { |
|
827
|
|
|
|
|
|
|
($clusters, $cluster_centers) = |
|
828
|
|
|
|
|
|
|
$self->assign_data_to_clusters( $clusters, $K ); |
|
829
|
|
|
|
|
|
|
my $num_of_clusters_returned = @$clusters; |
|
830
|
|
|
|
|
|
|
foreach my $cluster (@$clusters) { |
|
831
|
|
|
|
|
|
|
$cluster_nonexistant_flag = 1 if ((!defined $cluster) |
|
832
|
|
|
|
|
|
|
|| (@$cluster == 0)); |
|
833
|
|
|
|
|
|
|
} |
|
834
|
|
|
|
|
|
|
last unless $cluster_nonexistant_flag; |
|
835
|
|
|
|
|
|
|
} |
|
836
|
|
|
|
|
|
|
return ($clusters, $cluster_centers); |
|
837
|
|
|
|
|
|
|
} |
|
838
|
|
|
|
|
|
|
|
|
839
|
|
|
|
|
|
|
# Used by the kmeans part of the code for the initialization of the EM algorithm: |
|
840
|
|
|
|
|
|
|
sub get_initial_cluster_centers { |
|
841
|
|
|
|
|
|
|
my $self = shift; |
|
842
|
|
|
|
|
|
|
my $K = shift; |
|
843
|
|
|
|
|
|
|
if ($self->{_data_dimensions} == 1) { |
|
844
|
|
|
|
|
|
|
my @one_d_data; |
|
845
|
|
|
|
|
|
|
foreach my $j (0..$self->{_N}-1) { |
|
846
|
|
|
|
|
|
|
my $tag = $self->{_data_id_tags}[$j]; |
|
847
|
|
|
|
|
|
|
push @one_d_data, $self->{_data}->{$tag}->[0]; |
|
848
|
|
|
|
|
|
|
} |
|
849
|
|
|
|
|
|
|
my @peak_points = |
|
850
|
|
|
|
|
|
|
find_peak_points_in_given_direction(\@one_d_data,$K); |
|
851
|
|
|
|
|
|
|
print "highest points at data values: @peak_points\n" |
|
852
|
|
|
|
|
|
|
if $self->{_debug}; |
|
853
|
|
|
|
|
|
|
my @cluster_centers; |
|
854
|
|
|
|
|
|
|
foreach my $peakpoint (@peak_points) { |
|
855
|
|
|
|
|
|
|
push @cluster_centers, [$peakpoint]; |
|
856
|
|
|
|
|
|
|
} |
|
857
|
|
|
|
|
|
|
return \@cluster_centers; |
|
858
|
|
|
|
|
|
|
} |
|
859
|
|
|
|
|
|
|
my ($num_rows,$num_cols) = ($self->{_data_dimensions},$self->{_N}); |
|
860
|
|
|
|
|
|
|
my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols); |
|
861
|
|
|
|
|
|
|
my $mean_vec = Math::GSL::Matrix->new($num_rows,1); |
|
862
|
|
|
|
|
|
|
# All the record labels are stored in the array $self->{_data_id_tags}. |
|
863
|
|
|
|
|
|
|
# The actual data for clustering is stored in a hash at $self->{_data} |
|
864
|
|
|
|
|
|
|
# whose keys are the record labels; the value associated with each |
|
865
|
|
|
|
|
|
|
# key is the array holding the corresponding numerical multidimensional |
|
866
|
|
|
|
|
|
|
# data. |
|
867
|
|
|
|
|
|
|
foreach my $j (0..$num_cols-1) { |
|
868
|
|
|
|
|
|
|
my $tag = $self->{_data_id_tags}[$j]; |
|
869
|
|
|
|
|
|
|
my $data = $self->{_data}->{$tag}; |
|
870
|
|
|
|
|
|
|
$matrix->set_col($j, $data); |
|
871
|
|
|
|
|
|
|
} |
|
872
|
|
|
|
|
|
|
if ($self->{_debug}) { |
|
873
|
|
|
|
|
|
|
print "\nDisplaying the original data as a matrix:"; |
|
874
|
|
|
|
|
|
|
display_matrix( $matrix ); |
|
875
|
|
|
|
|
|
|
} |
|
876
|
|
|
|
|
|
|
foreach my $j (0..$num_cols-1) { |
|
877
|
|
|
|
|
|
|
$mean_vec += $matrix->col($j); |
|
878
|
|
|
|
|
|
|
} |
|
879
|
|
|
|
|
|
|
$mean_vec *= 1.0 / $num_cols; |
|
880
|
|
|
|
|
|
|
if ($self->{_debug}) { |
|
881
|
|
|
|
|
|
|
print "Displaying the mean vector for the data:"; |
|
882
|
|
|
|
|
|
|
display_matrix( $mean_vec ); |
|
883
|
|
|
|
|
|
|
} |
|
884
|
|
|
|
|
|
|
foreach my $j (0..$num_cols-1) { |
|
885
|
|
|
|
|
|
|
my @new_col = ($matrix->col($j) - $mean_vec)->as_list; |
|
886
|
|
|
|
|
|
|
$matrix->set_col($j, \@new_col); |
|
887
|
|
|
|
|
|
|
} |
|
888
|
|
|
|
|
|
|
if ($self->{_debug}) { |
|
889
|
|
|
|
|
|
|
print "Displaying mean subtracted data as a matrix:"; |
|
890
|
|
|
|
|
|
|
display_matrix( $matrix ); |
|
891
|
|
|
|
|
|
|
} |
|
892
|
|
|
|
|
|
|
my $transposed = transpose( $matrix ); |
|
893
|
|
|
|
|
|
|
if ($self->{_debug}) { |
|
894
|
|
|
|
|
|
|
print "Displaying transposed data matrix:"; |
|
895
|
|
|
|
|
|
|
display_matrix( $transposed ); |
|
896
|
|
|
|
|
|
|
} |
|
897
|
|
|
|
|
|
|
my $covariance = matrix_multiply( $matrix, $transposed ); |
|
898
|
|
|
|
|
|
|
$covariance *= 1.0 / $num_cols; |
|
899
|
|
|
|
|
|
|
if ($self->{_debug}) { |
|
900
|
|
|
|
|
|
|
print "\nDisplaying the Covariance Matrix for your data:"; |
|
901
|
|
|
|
|
|
|
display_matrix( $covariance ); |
|
902
|
|
|
|
|
|
|
} |
|
903
|
|
|
|
|
|
|
my ($eigenvalues, $eigenvectors) = $covariance->eigenpair; |
|
904
|
|
|
|
|
|
|
my $num_of_eigens = @$eigenvalues; |
|
905
|
|
|
|
|
|
|
my $largest_eigen_index = 0; |
|
906
|
|
|
|
|
|
|
my $smallest_eigen_index = 0; |
|
907
|
|
|
|
|
|
|
print "Eigenvalue 0: $eigenvalues->[0]\n" if $self->{_debug}; |
|
908
|
|
|
|
|
|
|
foreach my $i (1..$num_of_eigens-1) { |
|
909
|
|
|
|
|
|
|
$largest_eigen_index = $i if $eigenvalues->[$i] > |
|
910
|
|
|
|
|
|
|
$eigenvalues->[$largest_eigen_index]; |
|
911
|
|
|
|
|
|
|
$smallest_eigen_index = $i if $eigenvalues->[$i] < |
|
912
|
|
|
|
|
|
|
$eigenvalues->[$smallest_eigen_index]; |
|
913
|
|
|
|
|
|
|
print "Eigenvalue $i: $eigenvalues->[$i]\n" if $self->{_debug}; |
|
914
|
|
|
|
|
|
|
} |
|
915
|
|
|
|
|
|
|
print "\nlargest eigen index: $largest_eigen_index\n" if $self->{_debug}; |
|
916
|
|
|
|
|
|
|
print "\nsmallest eigen index: $smallest_eigen_index\n\n" |
|
917
|
|
|
|
|
|
|
if $self->{_debug}; |
|
918
|
|
|
|
|
|
|
foreach my $i (0..$num_of_eigens-1) { |
|
919
|
|
|
|
|
|
|
my @vec = $eigenvectors->[$i]->as_list; |
|
920
|
|
|
|
|
|
|
print "Eigenvector $i: @vec\n" if $self->{_debug}; |
|
921
|
|
|
|
|
|
|
} |
|
922
|
|
|
|
|
|
|
my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list; |
|
923
|
|
|
|
|
|
|
print "\nLargest eigenvector: @largest_eigen_vec\n" if $self->{_debug}; |
|
924
|
|
|
|
|
|
|
my @max_var_direction; |
|
925
|
|
|
|
|
|
|
# Each element of the array @largest_eigen_vec is a Math::Complex object |
|
926
|
|
|
|
|
|
|
foreach my $k (0..@largest_eigen_vec-1) { |
|
927
|
|
|
|
|
|
|
my ($mag, $theta) = $largest_eigen_vec[$k] =~ /\[(\d*\.\d+),(\S+)\]/; |
|
928
|
|
|
|
|
|
|
if ($theta eq '0') { |
|
929
|
|
|
|
|
|
|
$max_var_direction[$k] = $mag; |
|
930
|
|
|
|
|
|
|
} elsif ($theta eq 'pi') { |
|
931
|
|
|
|
|
|
|
$max_var_direction[$k] = -1.0 * $mag; |
|
932
|
|
|
|
|
|
|
} else { |
|
933
|
|
|
|
|
|
|
die "eigendecomposition of covariance matrix produced a complex eigenvector --- something is wrong"; |
|
934
|
|
|
|
|
|
|
} |
|
935
|
|
|
|
|
|
|
} |
|
936
|
|
|
|
|
|
|
# "Maximum variance direction: @max_var_direction |
|
937
|
|
|
|
|
|
|
print "\nMaximum Variance Direction: @max_var_direction\n\n" |
|
938
|
|
|
|
|
|
|
if $self->{_debug}; |
|
939
|
|
|
|
|
|
|
# We now project all data points on the largest eigenvector. |
|
940
|
|
|
|
|
|
|
# Each projection will yield a single point on the eigenvector. |
|
941
|
|
|
|
|
|
|
my @projections; |
|
942
|
|
|
|
|
|
|
foreach my $j (0..$self->{_N}-1) { |
|
943
|
|
|
|
|
|
|
my $tag = $self->{_data_id_tags}[$j]; |
|
944
|
|
|
|
|
|
|
my $data = $self->{_data}->{$tag}; |
|
945
|
|
|
|
|
|
|
die "Dimensionality of the largest eigenvector does not " |
|
946
|
|
|
|
|
|
|
. "match the dimensionality of the data" |
|
947
|
|
|
|
|
|
|
unless @max_var_direction == $self->{_data_dimensions}; |
|
948
|
|
|
|
|
|
|
my $projection = vector_multiply($data, \@max_var_direction); |
|
949
|
|
|
|
|
|
|
push @projections, $projection; |
|
950
|
|
|
|
|
|
|
} |
|
951
|
|
|
|
|
|
|
print "All projection points: @projections\n" if $self->{_debug}; |
|
952
|
|
|
|
|
|
|
my @peak_points = find_peak_points_in_given_direction(\@projections, $K); |
|
953
|
|
|
|
|
|
|
print "highest points at points along largest eigenvec: @peak_points\n" |
|
954
|
|
|
|
|
|
|
if $self->{_debug}; |
|
955
|
|
|
|
|
|
|
my @cluster_centers; |
|
956
|
|
|
|
|
|
|
foreach my $peakpoint (@peak_points) { |
|
957
|
|
|
|
|
|
|
my @actual_peak_coords = map {$peakpoint * $_} @max_var_direction; |
|
958
|
|
|
|
|
|
|
push @cluster_centers, \@actual_peak_coords; |
|
959
|
|
|
|
|
|
|
} |
|
960
|
|
|
|
|
|
|
return \@cluster_centers; |
|
961
|
|
|
|
|
|
|
} |
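# A minimal sketch of the projection step used above, with plain Perl arrays
# (the vector_multiply() call in the loop is the dot product defined elsewhere
# in this module):
#
#     my @direction  = (0.6, 0.8);          # unit vector along the max-variance direction
#     my @sample     = (3.0, 4.0);
#     my $projection = 0;
#     $projection += $sample[$_] * $direction[$_] for 0..$#sample;    # 5.0
#     # a 1-D peak found at, say, 5.0 is lifted back into the data space as
#     my @center = map { 5.0 * $_ } @direction;                       # (3.0, 4.0)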
|
962
|
|
|
|
|
|
|
|
|
963
|
|
|
|
|
|
|
# Used by the kmeans part of the code: This method is called by the previous method |
|
964
|
|
|
|
|
|
|
# to locate K peaks in a smoothed histogram of the data points projected onto the |
|
965
|
|
|
|
|
|
|
# maximal variance direction. |
|
966
|
|
|
|
|
|
|
sub find_peak_points_in_given_direction { |
|
967
|
|
|
|
|
|
|
my $dataref = shift; |
|
968
|
|
|
|
|
|
|
my $how_many = shift; |
|
969
|
|
|
|
|
|
|
my @data = @$dataref; |
|
970
|
|
|
|
|
|
|
my ($min, $max) = minmax(\@data); |
|
971
|
|
|
|
|
|
|
my $num_points = @data; |
|
972
|
|
|
|
|
|
|
my @sorted_data = sort {$a <=> $b} @data; |
|
973
|
|
|
|
|
|
|
#print "\n\nSorted data: @sorted_data\n"; |
|
974
|
|
|
|
|
|
|
my $scale = $max - $min; |
|
975
|
|
|
|
|
|
|
foreach my $index (0..$#sorted_data) { |
|
976
|
|
|
|
|
|
|
$sorted_data[$index] = ($sorted_data[$index] - $min) / $scale; |
|
977
|
|
|
|
|
|
|
} |
|
978
|
|
|
|
|
|
|
my $avg_diff = 0; |
|
979
|
|
|
|
|
|
|
foreach my $index (0..$#sorted_data-1) { |
|
980
|
|
|
|
|
|
|
my $diff = $sorted_data[$index+1] - $sorted_data[$index]; |
|
981
|
|
|
|
|
|
|
$avg_diff += ($diff - $avg_diff) / ($index + 1); |
|
982
|
|
|
|
|
|
|
} |
|
983
|
|
|
|
|
|
|
my $delta = 1.0 / 1000.0; |
|
984
|
|
|
|
|
|
|
# It would be nice to set the delta adaptively, but I must |
|
985
|
|
|
|
|
|
|
# change the number of cells in the next foreach loop accordingly |
|
986
|
|
|
|
|
|
|
# my $delta = $avg_diff / 20; |
|
987
|
|
|
|
|
|
|
my @accumulator = (0) x 1000; |
|
988
|
|
|
|
|
|
|
foreach my $index (0..@sorted_data-1) { |
|
989
|
|
|
|
|
|
|
my $cell_index = int($sorted_data[$index] / $delta); |
|
990
|
|
|
|
|
|
|
my $smoothness = 40; |
|
991
|
|
|
|
|
|
|
for my $index ($cell_index-$smoothness..$cell_index+$smoothness) { |
|
992
|
|
|
|
|
|
|
next if $index < 0 || $index > 999; |
|
993
|
|
|
|
|
|
|
$accumulator[$index]++; |
|
994
|
|
|
|
|
|
|
} |
|
995
|
|
|
|
|
|
|
} |
|
996
|
|
|
|
|
|
|
my $peaks_array = non_maximum_supression( \@accumulator ); |
|
997
|
|
|
|
|
|
|
my $peaks_index_hash = get_value_index_hash( $peaks_array ); |
|
998
|
|
|
|
|
|
|
my @K_highest_peak_locations; |
|
999
|
|
|
|
|
|
|
my $k = 0; |
|
1000
|
|
|
|
|
|
|
foreach my $peak (sort {$b <=> $a} keys %$peaks_index_hash) { |
|
1001
|
|
|
|
|
|
|
my $unscaled_peak_point = |
|
1002
|
|
|
|
|
|
|
$min + $peaks_index_hash->{$peak} * $scale * $delta; |
|
1003
|
|
|
|
|
|
|
push @K_highest_peak_locations, $unscaled_peak_point |
|
1004
|
|
|
|
|
|
|
if $k < $how_many; |
|
1005
|
|
|
|
|
|
|
last if ++$k == $how_many; |
|
1006
|
|
|
|
|
|
|
} |
|
1007
|
|
|
|
|
|
|
return @K_highest_peak_locations; |
|
1008
|
|
|
|
|
|
|
} |
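# The unscaling at the end of the routine above simply inverts the [0,1]
# normalization: a peak in accumulator cell c corresponds to the original value
# min + c * delta * (max - min).  For example, with data spanning [2.0, 12.0]
# (so the scale is 10) and delta = 0.001, a peak in cell 350 maps back to
# 2.0 + 350 * 0.001 * 10 = 5.5.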
|
1009
|
|
|
|
|
|
|
|
|
1010
|
|
|
|
|
|
|
# Used by the kmeans part of the code: The purpose of this routine is to form initial |
|
1011
|
|
|
|
|
|
|
# clusters by assigning the data samples to the initial clusters formed by the |
|
1012
|
|
|
|
|
|
|
# previous routine on the basis of the best proximity of the data samples to the |
|
1013
|
|
|
|
|
|
|
# different cluster centers. |
|
1014
|
|
|
|
|
|
|
sub assign_data_to_clusters_initial { |
|
1015
|
|
|
|
|
|
|
my $self = shift; |
|
1016
|
|
|
|
|
|
|
my @cluster_centers = @{ shift @_ }; |
|
1017
|
|
|
|
|
|
|
my @clusters; |
|
1018
|
|
|
|
|
|
|
foreach my $ele (@{$self->{_data_id_tags}}) { |
|
1019
|
|
|
|
|
|
|
my $best_cluster; |
|
1020
|
|
|
|
|
|
|
my @dist_from_clust_centers; |
|
1021
|
|
|
|
|
|
|
foreach my $center (@cluster_centers) { |
|
1022
|
|
|
|
|
|
|
push @dist_from_clust_centers, $self->distance($ele, $center); |
|
1023
|
|
|
|
|
|
|
} |
|
1024
|
|
|
|
|
|
|
my ($min, $best_center_index) = minimum( \@dist_from_clust_centers ); |
|
1025
|
|
|
|
|
|
|
push @{$clusters[$best_center_index]}, $ele; |
|
1026
|
|
|
|
|
|
|
} |
|
1027
|
|
|
|
|
|
|
return \@clusters; |
|
1028
|
|
|
|
|
|
|
} |
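# The initial assignment above is a plain nearest-center rule.  For example,
# with cluster centers (0,0) and (10,0), a sample at (3,1) is at distance
# sqrt(10) ~ 3.16 from the first center and sqrt(50) ~ 7.07 from the second,
# so it is pushed onto the first cluster.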
|
1029
|
|
|
|
|
|
|
|
|
1030
|
|
|
|
|
|
|
# Used by the kmeans part of the code: This is the main routine that along with the |
|
1031
|
|
|
|
|
|
|
# update_cluster_centers() routine constitute the two key steps of the K-Means |
|
1032
|
|
|
|
|
|
|
# algorithm. In most cases, the infinite while() loop will terminate automatically |
|
1033
|
|
|
|
|
|
|
# when the cluster assignments of the data points remain unchanged. For the sake of |
|
1034
|
|
|
|
|
|
|
# safety, we keep track of the number of iterations. If this number reaches 100, we |
|
1035
|
|
|
|
|
|
|
# exit the while() loop anyway. In most cases, this limit will not be reached. |
|
1036
|
|
|
|
|
|
|
sub assign_data_to_clusters { |
|
1037
|
|
|
|
|
|
|
my $self = shift; |
|
1038
|
|
|
|
|
|
|
my $clusters = shift; |
|
1039
|
|
|
|
|
|
|
my $K = shift; |
|
1040
|
|
|
|
|
|
|
my $final_cluster_centers; |
|
1041
|
|
|
|
|
|
|
my $iteration_index = 0; |
|
1042
|
|
|
|
|
|
|
while (1) { |
|
1043
|
|
|
|
|
|
|
my $new_clusters; |
|
1044
|
|
|
|
|
|
|
my $assignment_changed_flag = 0; |
|
1045
|
|
|
|
|
|
|
my $current_cluster_center_index = 0; |
|
1046
|
|
|
|
|
|
|
my $cluster_size_zero_condition = 0; |
|
1047
|
|
|
|
|
|
|
my $how_many = @$clusters; |
|
1048
|
|
|
|
|
|
|
my $cluster_centers = $self->update_cluster_centers( |
|
1049
|
|
|
|
|
|
|
deep_copy_AoA_with_nulls( $clusters ) ); |
|
1050
|
|
|
|
|
|
|
$iteration_index++; |
|
1051
|
|
|
|
|
|
|
foreach my $cluster (@$clusters) { |
|
1052
|
|
|
|
|
|
|
my $current_cluster_center = |
|
1053
|
|
|
|
|
|
|
$cluster_centers->[$current_cluster_center_index]; |
|
1054
|
|
|
|
|
|
|
foreach my $ele (@$cluster) { |
|
1055
|
|
|
|
|
|
|
my @dist_from_clust_centers; |
|
1056
|
|
|
|
|
|
|
foreach my $center (@$cluster_centers) { |
|
1057
|
|
|
|
|
|
|
push @dist_from_clust_centers, |
|
1058
|
|
|
|
|
|
|
$self->distance($ele, $center); |
|
1059
|
|
|
|
|
|
|
} |
|
1060
|
|
|
|
|
|
|
my ($min, $best_center_index) = |
|
1061
|
|
|
|
|
|
|
minimum( \@dist_from_clust_centers ); |
|
1062
|
|
|
|
|
|
|
my $best_cluster_center = |
|
1063
|
|
|
|
|
|
|
$cluster_centers->[$best_center_index]; |
|
1064
|
|
|
|
|
|
|
if (vector_equal($current_cluster_center, |
|
1065
|
|
|
|
|
|
|
$best_cluster_center)){ |
|
1066
|
|
|
|
|
|
|
push @{$new_clusters->[$current_cluster_center_index]}, |
|
1067
|
|
|
|
|
|
|
$ele; |
|
1068
|
|
|
|
|
|
|
} else { |
|
1069
|
|
|
|
|
|
|
$assignment_changed_flag = 1; |
|
1070
|
|
|
|
|
|
|
push @{$new_clusters->[$best_center_index]}, $ele; |
|
1071
|
|
|
|
|
|
|
} |
|
1072
|
|
|
|
|
|
|
} |
|
1073
|
|
|
|
|
|
|
$current_cluster_center_index++; |
|
1074
|
|
|
|
|
|
|
} |
|
1075
|
|
|
|
|
|
|
# Now make sure that we still have K clusters since K is fixed: |
|
1076
|
|
|
|
|
|
|
next if ((@$new_clusters != @$clusters) && ($iteration_index < 100)); |
|
1077
|
|
|
|
|
|
|
# Now make sure that none of the K clusters is an empty cluster: |
|
1078
|
|
|
|
|
|
|
foreach my $newcluster (@$new_clusters) { |
|
1079
|
|
|
|
|
|
|
$cluster_size_zero_condition = 1 if ((!defined $newcluster) |
|
1080
|
|
|
|
|
|
|
or (@$newcluster == 0)); |
|
1081
|
|
|
|
|
|
|
} |
|
1082
|
|
|
|
|
|
|
push @$new_clusters, (undef) x ($K - @$new_clusters) |
|
1083
|
|
|
|
|
|
|
if @$new_clusters < $K; |
|
1084
|
|
|
|
|
|
|
my $largest_cluster; |
|
1085
|
|
|
|
|
|
|
foreach my $local_cluster (@$new_clusters) { |
|
1086
|
|
|
|
|
|
|
next if !defined $local_cluster; |
|
1087
|
|
|
|
|
|
|
$largest_cluster = $local_cluster if !defined $largest_cluster; |
|
1088
|
|
|
|
|
|
|
if (@$local_cluster > @$largest_cluster) { |
|
1089
|
|
|
|
|
|
|
$largest_cluster = $local_cluster; |
|
1090
|
|
|
|
|
|
|
} |
|
1091
|
|
|
|
|
|
|
} |
|
1092
|
|
|
|
|
|
|
foreach my $local_cluster (@$new_clusters) { |
|
1093
|
|
|
|
|
|
|
if ( (!defined $local_cluster) || (@$local_cluster == 0) ) { |
|
1094
|
|
|
|
|
|
|
push @$local_cluster, pop @$largest_cluster; |
|
1095
|
|
|
|
|
|
|
} |
|
1096
|
|
|
|
|
|
|
} |
|
1097
|
|
|
|
|
|
|
next if (($cluster_size_zero_condition) && ($iteration_index < 100)); |
|
1098
|
|
|
|
|
|
|
last if $iteration_index == 100; |
|
1099
|
|
|
|
|
|
|
# Now do a deep copy of new_clusters into clusters |
|
1100
|
|
|
|
|
|
|
$clusters = deep_copy_AoA( $new_clusters ); |
|
1101
|
|
|
|
|
|
|
last if $assignment_changed_flag == 0; |
|
1102
|
|
|
|
|
|
|
} |
|
1103
|
|
|
|
|
|
|
$final_cluster_centers = $self->update_cluster_centers( $clusters ); |
|
1104
|
|
|
|
|
|
|
return ($clusters, $final_cluster_centers); |
|
1105
|
|
|
|
|
|
|
} |
|
1106
|
|
|
|
|
|
|
|
|
1107
|
|
|
|
|
|
|
# Used by the kmeans part of the code: After each new assignment of the data points |
|
1108
|
|
|
|
|
|
|
# to the clusters on the basis of the current values for the cluster centers, we call |
|
1109
|
|
|
|
|
|
|
# the routine shown here for updating the values of the cluster centers. |
|
1110
|
|
|
|
|
|
|
sub update_cluster_centers { |
|
1111
|
|
|
|
|
|
|
my $self = shift; |
|
1112
|
|
|
|
|
|
|
my @clusters = @{ shift @_ }; |
|
1113
|
|
|
|
|
|
|
my @new_cluster_centers; |
|
1114
|
|
|
|
|
|
|
my $largest_cluster; |
|
1115
|
|
|
|
|
|
|
foreach my $cluster (@clusters) { |
|
1116
|
|
|
|
|
|
|
next if !defined $cluster; |
|
1117
|
|
|
|
|
|
|
$largest_cluster = $cluster if !defined $largest_cluster; |
|
1118
|
|
|
|
|
|
|
if (@$cluster > @$largest_cluster) { |
|
1119
|
|
|
|
|
|
|
$largest_cluster = $cluster; |
|
1120
|
|
|
|
|
|
|
} |
|
1121
|
|
|
|
|
|
|
} |
|
1122
|
|
|
|
|
|
|
foreach my $cluster (@clusters) { |
|
1123
|
|
|
|
|
|
|
if ( (!defined $cluster) || (@$cluster == 0) ) { |
|
1124
|
|
|
|
|
|
|
push @$cluster, pop @$largest_cluster; |
|
1125
|
|
|
|
|
|
|
} |
|
1126
|
|
|
|
|
|
|
} |
|
1127
|
|
|
|
|
|
|
foreach my $cluster (@clusters) { |
|
1128
|
|
|
|
|
|
|
die "Cluster became empty --- untenable condition " . |
|
1129
|
|
|
|
|
|
|
"for a given K. Try again. " if !defined $cluster; |
|
1130
|
|
|
|
|
|
|
my $cluster_size = @$cluster; |
|
1131
|
|
|
|
|
|
|
die "Cluster size is zero --- untenable." if $cluster_size == 0; |
|
1132
|
|
|
|
|
|
|
my @new_cluster_center = @{$self->add_point_coords( $cluster )}; |
|
1133
|
|
|
|
|
|
|
@new_cluster_center = map {my $x = $_/$cluster_size; $x} |
|
1134
|
|
|
|
|
|
|
@new_cluster_center; |
|
1135
|
|
|
|
|
|
|
push @new_cluster_centers, \@new_cluster_center; |
|
1136
|
|
|
|
|
|
|
} |
|
1137
|
|
|
|
|
|
|
return \@new_cluster_centers; |
|
1138
|
|
|
|
|
|
|
} |
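# The center update above is a coordinate-wise mean.  A minimal sketch with raw
# coordinate arrays (in the module itself, add_point_coords() first sums the
# coordinates of the tagged samples in a cluster):
#
#     my @cluster = ( [1, 2], [3, 4], [5, 6] );
#     my @sum = (0, 0);
#     for my $p (@cluster) { $sum[$_] += $p->[$_] for 0..$#$p }
#     my @center = map { $_ / scalar @cluster } @sum;     # (3, 4)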
|
1139
|
|
|
|
|
|
|
|
|
1140
|
|
|
|
|
|
|
# The following routine is for computing the distance between a data point specified |
|
1141
|
|
|
|
|
|
|
# by its symbolic name in the master datafile and a point (such as the center of a |
|
1142
|
|
|
|
|
|
|
# cluster) expressed as a vector of coordinates: |
|
1143
|
|
|
|
|
|
|
sub distance { |
|
1144
|
|
|
|
|
|
|
my $self = shift; |
|
1145
|
|
|
|
|
|
|
my $ele1_id = shift @_; # symbolic name of data sample |
|
1146
|
|
|
|
|
|
|
my @ele1 = @{$self->{_data}->{$ele1_id}}; |
|
1147
|
|
|
|
|
|
|
my @ele2 = @{shift @_}; |
|
1148
|
|
|
|
|
|
|
die "wrong data types for distance calculation" if @ele1 != @ele2; |
|
1149
|
|
|
|
|
|
|
my $how_many = @ele1; |
|
1150
|
|
|
|
|
|
|
my $squared_sum = 0; |
|
1151
|
|
|
|
|
|
|
foreach my $i (0..$how_many-1) { |
|
1152
|
|
|
|
|
|
|
$squared_sum += ($ele1[$i] - $ele2[$i])**2; |
|
1153
|
|
|
|
|
|
|
} |
|
1154
|
|
|
|
|
|
|
my $dist = sqrt $squared_sum; |
|
1155
|
|
|
|
|
|
|
return $dist; |
|
1156
|
|
|
|
|
|
|
} |
|
1157
|
|
|
|
|
|
|
|
|
1158
|
|
|
|
|
|
|
# The following routine does the same as above but now both |
|
1159
|
|
|
|
|
|
|
# arguments are expected to be arrays of numbers: |
|
1160
|
|
|
|
|
|
|
sub distance2 { |
|
1161
|
|
|
|
|
|
|
my $self = shift; |
|
1162
|
|
|
|
|
|
|
my @ele1 = @{shift @_}; |
|
1163
|
|
|
|
|
|
|
my @ele2 = @{shift @_}; |
|
1164
|
|
|
|
|
|
|
die "wrong data types for distance calculation" if @ele1 != @ele2; |
|
1165
|
|
|
|
|
|
|
my $how_many = @ele1; |
|
1166
|
|
|
|
|
|
|
my $squared_sum = 0; |
|
1167
|
|
|
|
|
|
|
foreach my $i (0..$how_many-1) { |
|
1168
|
|
|
|
|
|
|
$squared_sum += ($ele1[$i] - $ele2[$i])**2; |
|
1169
|
|
|
|
|
|
|
} |
|
1170
|
|
|
|
|
|
|
return sqrt $squared_sum; |
|
1171
|
|
|
|
|
|
|
} |
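# Both distance routines above compute the ordinary Euclidean distance
# sqrt( sum_i (a_i - b_i)**2 ).  For instance, for the points (1,2) and (4,6):
#
#     my $d = $self->distance2( [1, 2], [4, 6] );    # sqrt(9 + 16) = 5
#
# ($self being any instance of this class; the calculation does not touch
# object state.)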
|
1172
|
|
|
|
|
|
|
|
|
1173
|
|
|
|
|
|
|
sub write_naive_bayes_clusters_to_files { |
|
1174
|
|
|
|
|
|
|
my $self = shift; |
|
1175
|
|
|
|
|
|
|
my @clusters = @{$self->{_clusters}}; |
|
1176
|
|
|
|
|
|
|
unlink glob "naive_bayes_cluster*.txt"; |
|
1177
|
|
|
|
|
|
|
foreach my $i (1..@clusters) { |
|
1178
|
|
|
|
|
|
|
my $filename = "naive_bayes_cluster" . $i . ".txt"; |
|
1179
|
|
|
|
|
|
|
print "Writing cluster $i to file $filename\n" |
|
1180
|
|
|
|
|
|
|
if $self->{_terminal_output}; |
|
1181
|
|
|
|
|
|
|
open FILEHANDLE, "| sort > $filename" or die "Unable to open file: $!"; |
|
1182
|
|
|
|
|
|
|
foreach my $ele (@{$clusters[$i-1]}) { |
|
1183
|
|
|
|
|
|
|
print FILEHANDLE "$ele\n"; |
|
1184
|
|
|
|
|
|
|
} |
|
1185
|
|
|
|
|
|
|
close FILEHANDLE; |
|
1186
|
|
|
|
|
|
|
} |
|
1187
|
|
|
|
|
|
|
} |
|
1188
|
|
|
|
|
|
|
|
|
1189
|
|
|
|
|
|
|
sub write_posterior_prob_clusters_above_threshold_to_files { |
|
1190
|
|
|
|
|
|
|
my $self = shift; |
|
1191
|
|
|
|
|
|
|
my $theta = shift; |
|
1192
|
|
|
|
|
|
|
my @class_distributions; |
|
1193
|
|
|
|
|
|
|
foreach my $cluster_index (0..$self->{_K}-1) { |
|
1194
|
|
|
|
|
|
|
push @class_distributions, []; |
|
1195
|
|
|
|
|
|
|
} |
|
1196
|
|
|
|
|
|
|
foreach my $data_tag (@{$self->{_data_id_tags}}) { |
|
1197
|
|
|
|
|
|
|
foreach my $cluster_index (0..$self->{_K}-1) { |
|
1198
|
|
|
|
|
|
|
push @{$class_distributions[$cluster_index]}, $data_tag |
|
1199
|
|
|
|
|
|
|
if $self->{_expected_class_probs}->{$data_tag}->[$cluster_index] |
|
1200
|
|
|
|
|
|
|
> $theta; |
|
1201
|
|
|
|
|
|
|
} |
|
1202
|
|
|
|
|
|
|
} |
|
1203
|
|
|
|
|
|
|
unlink glob "posterior_prob_cluster*.txt"; |
|
1204
|
|
|
|
|
|
|
foreach my $i (1..@class_distributions) { |
|
1205
|
|
|
|
|
|
|
my $filename = "posterior_prob_cluster" . $i . ".txt"; |
|
1206
|
|
|
|
|
|
|
print "Writing posterior prob cluster $i to file $filename\n" |
|
1207
|
|
|
|
|
|
|
if $self->{_terminal_output}; |
|
1208
|
|
|
|
|
|
|
open FILEHANDLE, "| sort > $filename" or die "Unable to open file: $!"; |
|
1209
|
|
|
|
|
|
|
foreach my $ele (@{$class_distributions[$i-1]}) { |
|
1210
|
|
|
|
|
|
|
print FILEHANDLE "$ele\n"; |
|
1211
|
|
|
|
|
|
|
} |
|
1212
|
|
|
|
|
|
|
close FILEHANDLE; |
|
1213
|
|
|
|
|
|
|
} |
|
1214
|
|
|
|
|
|
|
} |
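# A usage sketch for the two cluster-dump methods above (illustrative only;
# it assumes an $em object on which EM clustering has already been run).  Note
# that a data tag may land in more than one posterior-probability file, since
# inclusion there only requires its posterior class probability for a cluster
# to exceed the chosen threshold:
#
#     $em->write_naive_bayes_clusters_to_files();                        # hard labels
#     $em->write_posterior_prob_clusters_above_threshold_to_files(0.2);  # soft labels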
|
1215
|
|
|
|
|
|
|
|
|
1216
|
|
|
|
|
|
|
sub DESTROY { |
|
1217
|
|
|
|
|
|
|
unlink "__temp_" . basename($_[0]->{_datafile}); |
|
1218
|
|
|
|
|
|
|
unlink "__temp_data_" . basename($_[0]->{_datafile}); |
|
1219
|
|
|
|
|
|
|
unlink "__temp2_" . basename($_[0]->{_datafile}); |
|
1220
|
|
|
|
|
|
|
unlink glob "__temp1dhist*"; |
|
1221
|
|
|
|
|
|
|
unlink glob "__contour*"; |
|
1222
|
|
|
|
|
|
|
} |
|
1223
|
|
|
|
|
|
|
|
|
1224
|
|
|
|
|
|
|
############################# Visualization Code ############################### |
|
1225
|
|
|
|
|
|
|
|
|
1226
|
|
|
|
|
|
|
# The visualize_clusters() implementation displays as a plot in your terminal window |
|
1227
|
|
|
|
|
|
|
# the clusters constructed by the EM algorithm. It can show either 2D plots or |
|
1228
|
|
|
|
|
|
|
# 3D plots that you can rotate interactively for better visualization. For |
|
1229
|
|
|
|
|
|
|
# multidimensional data, as to which 2D or 3D dimensions are used for visualization |
|
1230
|
|
|
|
|
|
|
# is controlled by the mask you must supply as an argument to the method. Should it |
|
1231
|
|
|
|
|
|
|
# happen that only one on bit is specified for the mask, visualize_clusters() |
|
1232
|
|
|
|
|
|
|
# aborts. |
|
1233
|
|
|
|
|
|
|
# |
|
1234
|
|
|
|
|
|
|
# The visualization code consists of first accessing each of clusters created by the |
|
1235
|
|
|
|
|
|
|
# EM() subroutine. Note that the clusters contain only the symbolic names for the |
|
1236
|
|
|
|
|
|
|
# individual records in the source data file. We therefore next reach into the |
|
1237
|
|
|
|
|
|
|
# $self->{_data} hash and get the data coordinates associated with each symbolic |
|
1238
|
|
|
|
|
|
|
# label in a cluster. The numerical data thus generated is then written out to a |
|
1239
|
|
|
|
|
|
|
# temp file. When doing so we must remember to insert TWO BLANK LINES between the |
|
1240
|
|
|
|
|
|
|
# data blocks corresponding to the different clusters. This constraint is imposed |
|
1241
|
|
|
|
|
|
|
# on us by Gnuplot when plotting data from the same file since we want to use |
|
1242
|
|
|
|
|
|
|
# different point styles for the data points in different cluster files. |
|
1243
|
|
|
|
|
|
|
# Subsequently, we call upon the Perl interface provided by the Graphics::GnuplotIF |
|
1244
|
|
|
|
|
|
|
# module to plot the data clusters. |
|
1245
|
|
|
|
|
|
|
sub visualize_clusters { |
|
1246
|
|
|
|
|
|
|
my $self = shift; |
|
1247
|
|
|
|
|
|
|
my $v_mask; |
|
1248
|
|
|
|
|
|
|
my $pause_time; |
|
1249
|
|
|
|
|
|
|
if (@_ == 1) { |
|
1250
|
|
|
|
|
|
|
$v_mask = shift || die "visualization mask missing"; |
|
1251
|
|
|
|
|
|
|
} elsif (@_ == 2) { |
|
1252
|
|
|
|
|
|
|
$v_mask = shift || die "visualization mask missing"; |
|
1253
|
|
|
|
|
|
|
$pause_time = shift; |
|
1254
|
|
|
|
|
|
|
} else { |
|
1255
|
|
|
|
|
|
|
die "visualize_clusters() called with wrong args"; |
|
1256
|
|
|
|
|
|
|
} |
|
1257
|
|
|
|
|
|
|
my $master_datafile = $self->{_datafile}; |
|
1258
|
|
|
|
|
|
|
my @v_mask = split //, $v_mask; |
|
1259
|
|
|
|
|
|
|
my $visualization_mask_width = @v_mask; |
|
1260
|
|
|
|
|
|
|
my $original_data_mask = $self->{_mask}; |
|
1261
|
|
|
|
|
|
|
my @mask = split //, $original_data_mask; |
|
1262
|
|
|
|
|
|
|
my $data_field_width = scalar grep {$_ eq '1'} @mask; |
|
1263
|
|
|
|
|
|
|
die "\n\nABORTED: The width of the visualization mask (including " . |
|
1264
|
|
|
|
|
|
|
"all its 1s and 0s) must equal the width of the original mask " . |
|
1265
|
|
|
|
|
|
|
"used for reading the data file (counting only the 1's)" |
|
1266
|
|
|
|
|
|
|
if $visualization_mask_width != $data_field_width; |
|
1267
|
|
|
|
|
|
|
my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask; |
|
1268
|
|
|
|
|
|
|
# The following section is for the superimposed one-Mahalanobis-distance-unit |
|
1269
|
|
|
|
|
|
|
# ellipses that are shown only for 2D plots: |
|
1270
|
|
|
|
|
|
|
if ($visualization_data_field_width == 2) { |
|
1271
|
|
|
|
|
|
|
foreach my $cluster_index (0..$self->{_K}-1) { |
|
1272
|
|
|
|
|
|
|
my $contour_filename = "__contour_" . $cluster_index . ".dat"; |
|
1273
|
|
|
|
|
|
|
my $mean = $self->{_cluster_means}->[$cluster_index]; |
|
1274
|
|
|
|
|
|
|
my $covariance = $self->{_cluster_covariances}->[$cluster_index]; |
|
1275
|
|
|
|
|
|
|
my ($mux,$muy) = $mean->as_list(); |
|
1276
|
|
|
|
|
|
|
my ($varx,$sigmaxy) = $covariance->row(0)->as_list(); |
|
1277
|
|
|
|
|
|
|
my ($sigmayx,$vary) = $covariance->row(1)->as_list(); |
|
1278
|
|
|
|
|
|
|
die "Your covariance matrix does not look right" |
|
1279
|
|
|
|
|
|
|
unless $sigmaxy == $sigmayx; |
|
1280
|
|
|
|
|
|
|
my ($sigmax,$sigmay) = (sqrt($varx),sqrt($vary)); |
|
1281
|
|
|
|
|
|
|
my $argstring = <<"END"; |
|
1282
|
|
|
|
|
|
|
set contour |
|
1283
|
|
|
|
|
|
|
mux = $mux |
|
1284
|
|
|
|
|
|
|
muy = $muy |
|
1285
|
|
|
|
|
|
|
sigmax = $sigmax |
|
1286
|
|
|
|
|
|
|
sigmay = $sigmay |
|
1287
|
|
|
|
|
|
|
sigmaxy = $sigmaxy |
|
1288
|
|
|
|
|
|
|
determinant = (sigmax**2)*(sigmay**2) - sigmaxy**2 |
|
1289
|
|
|
|
|
|
|
exponent(x,y) = -0.5 * (1.0 / determinant) * ( ((x-mux)**2)*sigmay**2 + ((y-muy)**2)*sigmax**2 - 2*sigmaxy*(x-mux)*(y-muy) ) |
|
1290
|
|
|
|
|
|
|
f(x,y) = exp( exponent(x,y) ) - 0.2 |
|
1291
|
|
|
|
|
|
|
xmax = mux + 2 * sigmax |
|
1292
|
|
|
|
|
|
|
xmin = mux - 2 * sigmax |
|
1293
|
|
|
|
|
|
|
ymax = muy + 2 * sigmay |
|
1294
|
|
|
|
|
|
|
ymin = muy - 2 * sigmay |
|
1295
|
|
|
|
|
|
|
set xrange [ xmin : xmax ] |
|
1296
|
|
|
|
|
|
|
set yrange [ ymin : ymax ] |
|
1297
|
|
|
|
|
|
|
set isosamples 200 |
|
1298
|
|
|
|
|
|
|
unset surface |
|
1299
|
|
|
|
|
|
|
set cntrparam levels discrete 0 |
|
1300
|
|
|
|
|
|
|
set table \"$contour_filename\" |
|
1301
|
|
|
|
|
|
|
splot f(x,y) |
|
1302
|
|
|
|
|
|
|
unset table |
|
1303
|
|
|
|
|
|
|
END |
|
1304
|
|
|
|
|
|
|
my $plot = Graphics::GnuplotIF->new(); |
|
1305
|
|
|
|
|
|
|
$plot->gnuplot_cmd( $argstring ); |
|
1306
|
|
|
|
|
|
|
} |
|
1307
|
|
|
|
|
|
|
} |
|
1308
|
|
|
|
|
|
|
my %visualization_data; |
|
1309
|
|
|
|
|
|
|
while ( my ($record_id, $data) = each %{$self->{_data}} ) { |
|
1310
|
|
|
|
|
|
|
my @fields = @$data; |
|
1311
|
|
|
|
|
|
|
die "\nABORTED: Visualization mask size exceeds data record size" |
|
1312
|
|
|
|
|
|
|
if $#v_mask > $#fields; |
|
1313
|
|
|
|
|
|
|
my @data_fields; |
|
1314
|
|
|
|
|
|
|
foreach my $i (0..@fields-1) { |
|
1315
|
|
|
|
|
|
|
if ($v_mask[$i] eq '0') { |
|
1316
|
|
|
|
|
|
|
next; |
|
1317
|
|
|
|
|
|
|
} elsif ($v_mask[$i] eq '1') { |
|
1318
|
|
|
|
|
|
|
push @data_fields, $fields[$i]; |
|
1319
|
|
|
|
|
|
|
} else { |
|
1320
|
|
|
|
|
|
|
die "Misformed visualization mask. It can only have 1s and 0s"; |
|
1321
|
|
|
|
|
|
|
} |
|
1322
|
|
|
|
|
|
|
} |
|
1323
|
|
|
|
|
|
|
$visualization_data{ $record_id } = \@data_fields; |
|
1324
|
|
|
|
|
|
|
} |
|
1325
|
|
|
|
|
|
|
my $K = scalar @{$self->{_clusters}}; |
|
1326
|
|
|
|
|
|
|
my $filename = basename($master_datafile); |
|
1327
|
|
|
|
|
|
|
my $temp_file = "__temp_" . $filename; |
|
1328
|
|
|
|
|
|
|
unlink $temp_file if -e $temp_file; |
|
1329
|
|
|
|
|
|
|
open OUTPUT, ">$temp_file" |
|
1330
|
|
|
|
|
|
|
or die "Unable to open a temp file in this directory: $!"; |
|
1331
|
|
|
|
|
|
|
foreach my $cluster (@{$self->{_clusters}}) { |
|
1332
|
|
|
|
|
|
|
foreach my $item (@$cluster) { |
|
1333
|
|
|
|
|
|
|
print OUTPUT "@{$visualization_data{$item}}"; |
|
1334
|
|
|
|
|
|
|
print OUTPUT "\n"; |
|
1335
|
|
|
|
|
|
|
} |
|
1336
|
|
|
|
|
|
|
print OUTPUT "\n\n"; |
|
1337
|
|
|
|
|
|
|
} |
|
1338
|
|
|
|
|
|
|
close OUTPUT; |
|
1339
|
|
|
|
|
|
|
my $plot; |
|
1340
|
|
|
|
|
|
|
if (!defined $pause_time) { |
|
1341
|
|
|
|
|
|
|
$plot = Graphics::GnuplotIF->new( persist => 1 ); |
|
1342
|
|
|
|
|
|
|
} else { |
|
1343
|
|
|
|
|
|
|
$plot = Graphics::GnuplotIF->new(); |
|
1344
|
|
|
|
|
|
|
} |
|
1345
|
|
|
|
|
|
|
my $arg_string = ""; |
|
1346
|
|
|
|
|
|
|
if ($visualization_data_field_width > 2) { |
|
1347
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set noclip"); |
|
1348
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set pointsize 2"); |
|
1349
|
|
|
|
|
|
|
foreach my $i (0..$K-1) { |
|
1350
|
|
|
|
|
|
|
my $j = $i + 1; |
|
1351
|
|
|
|
|
|
|
$arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster (naive Bayes) $i\" with points lt $j pt $j, "; |
|
1352
|
|
|
|
|
|
|
} |
|
1353
|
|
|
|
|
|
|
} elsif ($visualization_data_field_width == 2) { |
|
1354
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set noclip"); |
|
1355
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set pointsize 2"); |
|
1356
|
|
|
|
|
|
|
foreach my $i (0..$K-1) { |
|
1357
|
|
|
|
|
|
|
my $j = $i + 1; |
|
1358
|
|
|
|
|
|
|
$arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster (naive Bayes) $i\" with points lt $j pt $j, "; |
|
1359
|
|
|
|
|
|
|
my $ellipse_filename = "__contour_" . $i . ".dat"; |
|
1360
|
|
|
|
|
|
|
$arg_string .= "\"$ellipse_filename\" with line lt $j title \"\", "; |
|
1361
|
|
|
|
|
|
|
} |
|
1362
|
|
|
|
|
|
|
} elsif ($visualization_data_field_width == 1 ) { |
|
1363
|
|
|
|
|
|
|
open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!"; |
|
1364
|
|
|
|
|
|
|
my @all_data = <INPUT>; |
|
1365
|
|
|
|
|
|
|
close INPUT; |
|
1366
|
|
|
|
|
|
|
@all_data = map {chomp $_; $_ =~ /\d/ ? $_ : "SEPERATOR" } @all_data; |
|
1367
|
|
|
|
|
|
|
my $all_joined_data = join ':', @all_data; |
|
1368
|
|
|
|
|
|
|
my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data; |
|
1369
|
|
|
|
|
|
|
my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals); |
|
1370
|
|
|
|
|
|
|
foreach my $i (0..@separated-1) { |
|
1371
|
|
|
|
|
|
|
$separated[$i] =~ s/SEPERATOR//g; |
|
1372
|
|
|
|
|
|
|
my @cluster_for_hist = split /:/, $separated[$i]; |
|
1373
|
|
|
|
|
|
|
@cluster_for_hist = grep $_, @cluster_for_hist; |
|
1374
|
|
|
|
|
|
|
my ($minval,$maxval) = minmax(\@cluster_for_hist); |
|
1375
|
|
|
|
|
|
|
push @all_minvals, $minval; |
|
1376
|
|
|
|
|
|
|
push @all_maxvals, $maxval; |
|
1377
|
|
|
|
|
|
|
push @all_clusters_for_hist, \@cluster_for_hist; |
|
1378
|
|
|
|
|
|
|
} |
|
1379
|
|
|
|
|
|
|
push @all_minmaxvals, @all_minvals; |
|
1380
|
|
|
|
|
|
|
push @all_minmaxvals, @all_maxvals; |
|
1381
|
|
|
|
|
|
|
my ($abs_minval,$abs_maxval) = minmax(\@all_minmaxvals); |
|
1382
|
|
|
|
|
|
|
my $delta = ($abs_maxval - $abs_minval) / 100.0; |
|
1383
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set boxwidth 3"); |
|
1384
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set style fill solid border -1"); |
|
1385
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set ytics out nomirror"); |
|
1386
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set style data histograms"); |
|
1387
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set style histogram clustered"); |
|
1388
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set title 'Clusters shown through histograms'"); |
|
1389
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror"); |
|
1390
|
|
|
|
|
|
|
foreach my $cindex (0..@all_clusters_for_hist-1) { |
|
1391
|
|
|
|
|
|
|
my $filename = basename($master_datafile); |
|
1392
|
|
|
|
|
|
|
my $temp_file = "__temp1dhist_" . "$cindex" . "_" . $filename; |
|
1393
|
|
|
|
|
|
|
unlink $temp_file if -e $temp_file; |
|
1394
|
|
|
|
|
|
|
open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!"; |
|
1395
|
|
|
|
|
|
|
print OUTPUT "Xstep histval\n"; |
|
1396
|
|
|
|
|
|
|
my @histogram = (0) x 100; |
|
1397
|
|
|
|
|
|
|
foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) { |
|
1398
|
|
|
|
|
|
|
$histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++; |
|
1399
|
|
|
|
|
|
|
} |
|
1400
|
|
|
|
|
|
|
foreach my $i (0..@histogram-1) { |
|
1401
|
|
|
|
|
|
|
print OUTPUT "$i $histogram[$i]\n"; |
|
1402
|
|
|
|
|
|
|
} |
|
1403
|
|
|
|
|
|
|
$arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, "; |
|
1404
|
|
|
|
|
|
|
close OUTPUT; |
|
1405
|
|
|
|
|
|
|
} |
|
1406
|
|
|
|
|
|
|
} |
|
1407
|
|
|
|
|
|
|
$arg_string = $arg_string =~ /^(.*),[ ]+$/; |
|
1408
|
|
|
|
|
|
|
$arg_string = $1; |
|
1409
|
|
|
|
|
|
|
if ($visualization_data_field_width > 2) { |
|
1410
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "splot $arg_string" ); |
|
1411
|
|
|
|
|
|
|
$plot->gnuplot_pause( $pause_time ) if defined $pause_time; |
|
1412
|
|
|
|
|
|
|
} elsif ($visualization_data_field_width == 2) { |
|
1413
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "plot $arg_string" ); |
|
1414
|
|
|
|
|
|
|
$plot->gnuplot_pause( $pause_time ) if defined $pause_time; |
|
1415
|
|
|
|
|
|
|
} elsif ($visualization_data_field_width == 1) { |
|
1416
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "plot $arg_string" ); |
|
1417
|
|
|
|
|
|
|
$plot->gnuplot_pause( $pause_time ) if defined $pause_time; |
|
1418
|
|
|
|
|
|
|
} |
|
1419
|
|
|
|
|
|
|
} |
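# A usage sketch for the method above (illustrative only): if the data file was
# read in with a mask containing four 1s (four data dimensions), the
# visualization mask must also be four characters wide.  Showing only the first
# two dimensions, with the plot window left up persistently:
#
#     $em->visualize_clusters("1100");
#
# Supplying a second argument (a pause time in seconds) plots without the
# persist option and closes the window after that many seconds instead.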
|
1420
|
|
|
|
|
|
|
|
|
1421
|
|
|
|
|
|
|
# This subroutine is the same as above except that it makes PNG plots (for hardcopy |
|
1422
|
|
|
|
|
|
|
# printing) of the clusters. |
|
1423
|
|
|
|
|
|
|
sub plot_hardcopy_clusters { |
|
1424
|
|
|
|
|
|
|
my $self = shift; |
|
1425
|
|
|
|
|
|
|
my $v_mask; |
|
1426
|
|
|
|
|
|
|
my $pause_time; |
|
1427
|
|
|
|
|
|
|
if (@_ == 1) { |
|
1428
|
|
|
|
|
|
|
$v_mask = shift || die "visualization mask missing"; |
|
1429
|
|
|
|
|
|
|
} elsif (@_ == 2) { |
|
1430
|
|
|
|
|
|
|
$v_mask = shift || die "visualization mask missing"; |
|
1431
|
|
|
|
|
|
|
$pause_time = shift; |
|
1432
|
|
|
|
|
|
|
} else { |
|
1433
|
|
|
|
|
|
|
die "visualize_clusters() called with wrong args"; |
|
1434
|
|
|
|
|
|
|
} |
|
1435
|
|
|
|
|
|
|
my $master_datafile = $self->{_datafile}; |
|
1436
|
|
|
|
|
|
|
my @v_mask = split //, $v_mask; |
|
1437
|
|
|
|
|
|
|
my $visualization_mask_width = @v_mask; |
|
1438
|
|
|
|
|
|
|
my $original_data_mask = $self->{_mask}; |
|
1439
|
|
|
|
|
|
|
my @mask = split //, $original_data_mask; |
|
1440
|
|
|
|
|
|
|
my $data_field_width = scalar grep {$_ eq '1'} @mask; |
|
1441
|
|
|
|
|
|
|
die "\n\nABORTED: The width of the visualization mask (including " . |
|
1442
|
|
|
|
|
|
|
"all its 1s and 0s) must equal the width of the original mask " . |
|
1443
|
|
|
|
|
|
|
"used for reading the data file (counting only the 1's)" |
|
1444
|
|
|
|
|
|
|
if $visualization_mask_width != $data_field_width; |
|
1445
|
|
|
|
|
|
|
my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask; |
|
1446
|
|
|
|
|
|
|
if ($visualization_data_field_width == 2) { |
|
1447
|
|
|
|
|
|
|
foreach my $cluster_index (0..$self->{_K}-1) { |
|
1448
|
|
|
|
|
|
|
my $contour_filename = "__contour_" . $cluster_index . ".dat"; |
|
1449
|
|
|
|
|
|
|
my $mean = $self->{_cluster_means}->[$cluster_index]; |
|
1450
|
|
|
|
|
|
|
my $covariance = $self->{_cluster_covariances}->[$cluster_index]; |
|
1451
|
|
|
|
|
|
|
my ($mux,$muy) = $mean->as_list(); |
|
1452
|
|
|
|
|
|
|
my ($varx,$sigmaxy) = $covariance->row(0)->as_list(); |
|
1453
|
|
|
|
|
|
|
my ($sigmayx,$vary) = $covariance->row(1)->as_list(); |
|
1454
|
|
|
|
|
|
|
die "Your covariance matrix does not look right" |
|
1455
|
|
|
|
|
|
|
unless $sigmaxy == $sigmayx; |
|
1456
|
|
|
|
|
|
|
my ($sigmax,$sigmay) = (sqrt($varx),sqrt($vary)); |
|
1457
|
|
|
|
|
|
|
my $argstring = <<"END"; |
|
1458
|
|
|
|
|
|
|
set contour |
|
1459
|
|
|
|
|
|
|
mux = $mux |
|
1460
|
|
|
|
|
|
|
muy = $muy |
|
1461
|
|
|
|
|
|
|
sigmax = $sigmax |
|
1462
|
|
|
|
|
|
|
sigmay = $sigmay |
|
1463
|
|
|
|
|
|
|
sigmaxy = $sigmaxy |
|
1464
|
|
|
|
|
|
|
determinant = (sigmax**2)*(sigmay**2) - sigmaxy**2 |
|
1465
|
|
|
|
|
|
|
exponent(x,y) = -0.5 * (1.0 / determinant) * ( ((x-mux)**2)*sigmay**2 + ((y-muy)**2)*sigmax**2 - 2*sigmaxy*(x-mux)*(y-muy) ) |
|
1466
|
|
|
|
|
|
|
f(x,y) = exp( exponent(x,y) ) - 0.2 |
|
1467
|
|
|
|
|
|
|
xmax = mux + 2 * sigmax |
|
1468
|
|
|
|
|
|
|
xmin = mux - 2 * sigmax |
|
1469
|
|
|
|
|
|
|
ymax = muy + 2 * sigmay |
|
1470
|
|
|
|
|
|
|
ymin = muy - 2 * sigmay |
|
1471
|
|
|
|
|
|
|
set xrange [ xmin : xmax ] |
|
1472
|
|
|
|
|
|
|
set yrange [ ymin : ymax ] |
|
1473
|
|
|
|
|
|
|
set isosamples 200 |
|
1474
|
|
|
|
|
|
|
unset surface |
|
1475
|
|
|
|
|
|
|
set cntrparam levels discrete 0 |
|
1476
|
|
|
|
|
|
|
set table \"$contour_filename\" |
|
1477
|
|
|
|
|
|
|
splot f(x,y) |
|
1478
|
|
|
|
|
|
|
unset table |
|
1479
|
|
|
|
|
|
|
END |
|
1480
|
|
|
|
|
|
|
my $plot = Graphics::GnuplotIF->new(); |
|
1481
|
|
|
|
|
|
|
$plot->gnuplot_cmd( $argstring ); |
|
1482
|
|
|
|
|
|
|
} |
|
1483
|
|
|
|
|
|
|
} |
|
1484
|
|
|
|
|
|
|
my %visualization_data; |
|
1485
|
|
|
|
|
|
|
while ( my ($record_id, $data) = each %{$self->{_data}} ) { |
|
1486
|
|
|
|
|
|
|
my @fields = @$data; |
|
1487
|
|
|
|
|
|
|
die "\nABORTED: Visualization mask size exceeds data record size" |
|
1488
|
|
|
|
|
|
|
if $#v_mask > $#fields; |
|
1489
|
|
|
|
|
|
|
my @data_fields; |
|
1490
|
|
|
|
|
|
|
foreach my $i (0..@fields-1) { |
|
1491
|
|
|
|
|
|
|
if ($v_mask[$i] eq '0') { |
|
1492
|
|
|
|
|
|
|
next; |
|
1493
|
|
|
|
|
|
|
} elsif ($v_mask[$i] eq '1') { |
|
1494
|
|
|
|
|
|
|
push @data_fields, $fields[$i]; |
|
1495
|
|
|
|
|
|
|
} else { |
|
1496
|
|
|
|
|
|
|
die "Misformed visualization mask. It can only have 1s and 0s"; |
|
1497
|
|
|
|
|
|
|
} |
|
1498
|
|
|
|
|
|
|
} |
|
1499
|
|
|
|
|
|
|
$visualization_data{ $record_id } = \@data_fields; |
|
1500
|
|
|
|
|
|
|
} |
|
1501
|
|
|
|
|
|
|
my $K = scalar @{$self->{_clusters}}; |
|
1502
|
|
|
|
|
|
|
my $filename = basename($master_datafile); |
|
1503
|
|
|
|
|
|
|
my $temp_file = "__temp_" . $filename; |
|
1504
|
|
|
|
|
|
|
unlink $temp_file if -e $temp_file; |
|
1505
|
|
|
|
|
|
|
open OUTPUT, ">$temp_file" |
|
1506
|
|
|
|
|
|
|
or die "Unable to open a temp file in this directory: $!"; |
|
1507
|
|
|
|
|
|
|
foreach my $cluster (@{$self->{_clusters}}) { |
|
1508
|
|
|
|
|
|
|
foreach my $item (@$cluster) { |
|
1509
|
|
|
|
|
|
|
print OUTPUT "@{$visualization_data{$item}}"; |
|
1510
|
|
|
|
|
|
|
print OUTPUT "\n"; |
|
1511
|
|
|
|
|
|
|
} |
|
1512
|
|
|
|
|
|
|
print OUTPUT "\n\n"; |
|
1513
|
|
|
|
|
|
|
} |
|
1514
|
|
|
|
|
|
|
close OUTPUT; |
|
1515
|
|
|
|
|
|
|
my $plot; |
|
1516
|
|
|
|
|
|
|
if (!defined $pause_time) { |
|
1517
|
|
|
|
|
|
|
$plot = Graphics::GnuplotIF->new( persist => 1 ); |
|
1518
|
|
|
|
|
|
|
} else { |
|
1519
|
|
|
|
|
|
|
$plot = Graphics::GnuplotIF->new(); |
|
1520
|
|
|
|
|
|
|
} |
|
1521
|
|
|
|
|
|
|
my $arg_string = ""; |
|
1522
|
|
|
|
|
|
|
if ($visualization_data_field_width > 2) { |
|
1523
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "set noclip" ); |
|
1524
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "set pointsize 2" ); |
|
1525
|
|
|
|
|
|
|
foreach my $i (0..$K-1) { |
|
1526
|
|
|
|
|
|
|
my $j = $i + 1; |
|
1527
|
|
|
|
|
|
|
$arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster (naive Bayes) $i\" with points lt $j pt $j, "; |
|
1528
|
|
|
|
|
|
|
} |
|
1529
|
|
|
|
|
|
|
} elsif ($visualization_data_field_width == 2) { |
|
1530
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "set noclip" ); |
|
1531
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "set pointsize 2" ); |
|
1532
|
|
|
|
|
|
|
foreach my $i (0..$K-1) { |
|
1533
|
|
|
|
|
|
|
my $j = $i + 1; |
|
1534
|
|
|
|
|
|
|
$arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster (naive Bayes) $i\" with points lt $j pt $j, "; |
|
1535
|
|
|
|
|
|
|
my $ellipse_filename = "__contour_" . $i . ".dat"; |
|
1536
|
|
|
|
|
|
|
$arg_string .= "\"$ellipse_filename\" with line lt $j title \"\", "; |
|
1537
|
|
|
|
|
|
|
} |
|
1538
|
|
|
|
|
|
|
} elsif ($visualization_data_field_width == 1 ) { |
|
1539
|
|
|
|
|
|
|
open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!"; |
|
1540
|
|
|
|
|
|
|
my @all_data = <INPUT>; |
|
1541
|
|
|
|
|
|
|
close INPUT; |
|
1542
|
|
|
|
|
|
|
@all_data = map {chomp $_; $_ =~ /\d/ ? $_ : "SEPERATOR" } @all_data; |
|
1543
|
|
|
|
|
|
|
my $all_joined_data = join ':', @all_data; |
|
1544
|
|
|
|
|
|
|
my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data; |
|
1545
|
|
|
|
|
|
|
my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals); |
|
1546
|
|
|
|
|
|
|
foreach my $i (0..@separated-1) { |
|
1547
|
|
|
|
|
|
|
$separated[$i] =~ s/SEPERATOR//g; |
|
1548
|
|
|
|
|
|
|
my @cluster_for_hist = split /:/, $separated[$i]; |
|
1549
|
|
|
|
|
|
|
@cluster_for_hist = grep $_, @cluster_for_hist; |
|
1550
|
|
|
|
|
|
|
my ($minval,$maxval) = minmax(\@cluster_for_hist); |
|
1551
|
|
|
|
|
|
|
push @all_minvals, $minval; |
|
1552
|
|
|
|
|
|
|
push @all_maxvals, $maxval; |
|
1553
|
|
|
|
|
|
|
push @all_clusters_for_hist, \@cluster_for_hist; |
|
1554
|
|
|
|
|
|
|
} |
|
1555
|
|
|
|
|
|
|
push @all_minmaxvals, @all_minvals; |
|
1556
|
|
|
|
|
|
|
push @all_minmaxvals, @all_maxvals; |
|
1557
|
|
|
|
|
|
|
my ($abs_minval,$abs_maxval) = minmax(\@all_minmaxvals); |
|
1558
|
|
|
|
|
|
|
my $delta = ($abs_maxval - $abs_minval) / 100.0; |
|
1559
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set boxwidth 3"); |
|
1560
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set style fill solid border -1"); |
|
1561
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set ytics out nomirror"); |
|
1562
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set style data histograms"); |
|
1563
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set style histogram clustered"); |
|
1564
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set title 'Clusters shown through histograms'"); |
|
1565
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror"); |
|
1566
|
|
|
|
|
|
|
foreach my $cindex (0..@all_clusters_for_hist-1) { |
|
1567
|
|
|
|
|
|
|
my $filename = basename($master_datafile); |
|
1568
|
|
|
|
|
|
|
my $temp_file = "__temp1dhist_" . "$cindex" . "_" . $filename; |
|
1569
|
|
|
|
|
|
|
unlink $temp_file if -e $temp_file; |
|
1570
|
|
|
|
|
|
|
open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!"; |
|
1571
|
|
|
|
|
|
|
print OUTPUT "Xstep histval\n"; |
|
1572
|
|
|
|
|
|
|
my @histogram = (0) x 100; |
|
1573
|
|
|
|
|
|
|
foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) { |
|
1574
|
|
|
|
|
|
|
$histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++; |
|
1575
|
|
|
|
|
|
|
} |
|
1576
|
|
|
|
|
|
|
foreach my $i (0..@histogram-1) { |
|
1577
|
|
|
|
|
|
|
print OUTPUT "$i $histogram[$i]\n"; |
|
1578
|
|
|
|
|
|
|
} |
|
1579
|
|
|
|
|
|
|
# $arg_string .= "\"$temp_file\" using 1:2 ti col smooth frequency with boxes lc $cindex, "; |
|
1580
|
|
|
|
|
|
|
$arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, "; |
|
1581
|
|
|
|
|
|
|
close OUTPUT; |
|
1582
|
|
|
|
|
|
|
} |
|
1583
|
|
|
|
|
|
|
} |
|
1584
|
|
|
|
|
|
|
$arg_string = $arg_string =~ /^(.*),[ ]+$/; |
|
1585
|
|
|
|
|
|
|
$arg_string = $1; |
|
1586
|
|
|
|
|
|
|
if ($visualization_data_field_width > 2) { |
|
1587
|
|
|
|
|
|
|
$plot->gnuplot_cmd( 'set terminal png', |
|
1588
|
|
|
|
|
|
|
'set output "cluster_plot.png"'); |
|
1589
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "splot $arg_string" ); |
|
1590
|
|
|
|
|
|
|
} elsif ($visualization_data_field_width == 2) { |
|
1591
|
|
|
|
|
|
|
$plot->gnuplot_cmd('set terminal png', |
|
1592
|
|
|
|
|
|
|
'set output "cluster_plot.png"'); |
|
1593
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "plot $arg_string" ); |
|
1594
|
|
|
|
|
|
|
} elsif ($visualization_data_field_width == 1) { |
|
1595
|
|
|
|
|
|
|
$plot->gnuplot_cmd('set terminal png', |
|
1596
|
|
|
|
|
|
|
'set output "cluster_plot.png"'); |
|
1597
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "plot $arg_string" ); |
|
1598
|
|
|
|
|
|
|
} |
|
1599
|
|
|
|
|
|
|
} |
|
1600
|
|
|
|
|
|
|
|
|
1601
|
|
|
|
|
|
|
# This method is for the visualization of the posterior class distributions. In |
|
1602
|
|
|
|
|
|
|
# other words, this method allows us to see the soft clustering produced by the EM |
|
1603
|
|
|
|
|
|
|
# algorithm. While much of the gnuplot logic here is the same as in the |
|
1604
|
|
|
|
|
|
|
# visualize_clusters() method, there are significant differences in how the data is |
|
1605
|
|
|
|
|
|
|
# pooled for the purpose of display. |
|
1606
|
|
|
|
|
|
|
sub visualize_distributions { |
|
1607
|
|
|
|
|
|
|
my $self = shift; |
|
1608
|
|
|
|
|
|
|
my $v_mask; |
|
1609
|
|
|
|
|
|
|
my $pause_time; |
|
1610
|
|
|
|
|
|
|
if (@_ == 1) { |
|
1611
|
|
|
|
|
|
|
$v_mask = shift || die "visualization mask missing"; |
|
1612
|
|
|
|
|
|
|
} elsif (@_ == 2) { |
|
1613
|
|
|
|
|
|
|
$v_mask = shift || die "visualization mask missing"; |
|
1614
|
|
|
|
|
|
|
$pause_time = shift; |
|
1615
|
|
|
|
|
|
|
} else { |
|
1616
|
|
|
|
|
|
|
die "visualize_distributions() called with wrong args"; |
|
1617
|
|
|
|
|
|
|
} |
|
1618
|
|
|
|
|
|
|
my @v_mask = split //, $v_mask; |
|
1619
|
|
|
|
|
|
|
my $visualization_mask_width = @v_mask; |
|
1620
|
|
|
|
|
|
|
my $original_data_mask = $self->{_mask}; |
|
1621
|
|
|
|
|
|
|
my @mask = split //, $original_data_mask; |
|
1622
|
|
|
|
|
|
|
my $data_field_width = scalar grep {$_ eq '1'} @mask; |
|
1623
|
|
|
|
|
|
|
die "\n\nABORTED: The width of the visualization mask (including " . |
|
1624
|
|
|
|
|
|
|
"all its 1s and 0s) must equal the width of the original mask " . |
|
1625
|
|
|
|
|
|
|
"used for reading the data file (counting only the 1's)" |
|
1626
|
|
|
|
|
|
|
if $visualization_mask_width != $data_field_width; |
|
1627
|
|
|
|
|
|
|
my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask; |
|
1628
|
|
|
|
|
|
|
if ($visualization_data_field_width == 2) { |
|
1629
|
|
|
|
|
|
|
foreach my $cluster_index (0..$self->{_K}-1) { |
|
1630
|
|
|
|
|
|
|
my $contour_filename = "__contour2_" . $cluster_index . ".dat"; |
|
1631
|
|
|
|
|
|
|
my $mean = $self->{_cluster_means}->[$cluster_index]; |
|
1632
|
|
|
|
|
|
|
my $covariance = $self->{_cluster_covariances}->[$cluster_index]; |
|
1633
|
|
|
|
|
|
|
my ($mux,$muy) = $mean->as_list(); |
|
1634
|
|
|
|
|
|
|
my ($varx,$sigmaxy) = $covariance->row(0)->as_list(); |
|
1635
|
|
|
|
|
|
|
my ($sigmayx,$vary) = $covariance->row(1)->as_list(); |
|
1636
|
|
|
|
|
|
|
die "Your covariance matrix does not look right" |
|
1637
|
|
|
|
|
|
|
unless $sigmaxy == $sigmayx; |
|
1638
|
|
|
|
|
|
|
my ($sigmax,$sigmay) = (sqrt($varx),sqrt($vary)); |
|
1639
|
|
|
|
|
|
|
my $argstring = <<"END"; |
|
1640
|
|
|
|
|
|
|
set contour |
|
1641
|
|
|
|
|
|
|
mux = $mux |
|
1642
|
|
|
|
|
|
|
muy = $muy |
|
1643
|
|
|
|
|
|
|
sigmax = $sigmax |
|
1644
|
|
|
|
|
|
|
sigmay = $sigmay |
|
1645
|
|
|
|
|
|
|
sigmaxy = $sigmaxy |
|
1646
|
|
|
|
|
|
|
determinant = (sigmax**2)*(sigmay**2) - sigmaxy**2 |
|
1647
|
|
|
|
|
|
|
exponent(x,y) = -0.5 * (1.0 / determinant) * ( ((x-mux)**2)*sigmay**2 + ((y-muy)**2)*sigmax**2 - 2*sigmaxy*(x-mux)*(y-muy) ) |
|
1648
|
|
|
|
|
|
|
f(x,y) = exp( exponent(x,y) ) - 0.2 |
|
1649
|
|
|
|
|
|
|
xmax = mux + 2 * sigmax |
|
1650
|
|
|
|
|
|
|
xmin = mux - 2 * sigmax |
|
1651
|
|
|
|
|
|
|
ymax = muy + 2 * sigmay |
|
1652
|
|
|
|
|
|
|
ymin = muy - 2 * sigmay |
|
1653
|
|
|
|
|
|
|
set xrange [ xmin : xmax ] |
|
1654
|
|
|
|
|
|
|
set yrange [ ymin : ymax ] |
|
1655
|
|
|
|
|
|
|
set isosamples 200 |
|
1656
|
|
|
|
|
|
|
unset surface |
|
1657
|
|
|
|
|
|
|
set cntrparam levels discrete 0 |
|
1658
|
|
|
|
|
|
|
set table \"$contour_filename\" |
|
1659
|
|
|
|
|
|
|
splot f(x,y) |
|
1660
|
|
|
|
|
|
|
unset table |
|
1661
|
|
|
|
|
|
|
END |
|
1662
|
|
|
|
|
|
|
my $plot = Graphics::GnuplotIF->new(); |
|
1663
|
|
|
|
|
|
|
$plot->gnuplot_cmd( $argstring ); |
|
1664
|
|
|
|
|
|
|
} |
|
1665
|
|
|
|
|
|
|
} |
|
1666
|
|
|
|
|
|
|
my %visualization_data; |
|
1667
|
|
|
|
|
|
|
while ( my ($record_id, $data) = each %{$self->{_data}} ) { |
|
1668
|
|
|
|
|
|
|
my @fields = @$data; |
|
1669
|
|
|
|
|
|
|
die "\nABORTED: Visualization mask size exceeds data record size" |
|
1670
|
|
|
|
|
|
|
if $#v_mask > $#fields; |
|
1671
|
|
|
|
|
|
|
my @data_fields; |
|
1672
|
|
|
|
|
|
|
foreach my $i (0..@fields-1) { |
|
1673
|
|
|
|
|
|
|
if ($v_mask[$i] eq '0') { |
|
1674
|
|
|
|
|
|
|
next; |
|
1675
|
|
|
|
|
|
|
} elsif ($v_mask[$i] eq '1') { |
|
1676
|
|
|
|
|
|
|
push @data_fields, $fields[$i]; |
|
1677
|
|
|
|
|
|
|
} else { |
|
1678
|
|
|
|
|
|
|
die "Misformed visualization mask. It can only have 1s and 0s"; |
|
1679
|
|
|
|
|
|
|
} |
|
1680
|
|
|
|
|
|
|
} |
|
1681
|
|
|
|
|
|
|
$visualization_data{ $record_id } = \@data_fields; |
|
1682
|
|
|
|
|
|
|
} |
|
1683
|
|
|
|
|
|
|
my $filename = basename($self->{_datafile}); |
|
1684
|
|
|
|
|
|
|
my $temp_file = "__temp2_" . $filename; |
|
1685
|
|
|
|
|
|
|
unlink $temp_file if -e $temp_file; |
|
1686
|
|
|
|
|
|
|
open OUTPUT, ">$temp_file" |
|
1687
|
|
|
|
|
|
|
or die "Unable to open a temp file in this directory: $!"; |
|
1688
|
|
|
|
|
|
|
my @class_distributions; |
|
1689
|
|
|
|
|
|
|
foreach my $cluster_index (0..$self->{_K}-1) { |
|
1690
|
|
|
|
|
|
|
push @class_distributions, []; |
|
1691
|
|
|
|
|
|
|
} |
|
1692
|
|
|
|
|
|
|
foreach my $data_tag (@{$self->{_data_id_tags}}) { |
|
1693
|
|
|
|
|
|
|
foreach my $cluster_index (0..$self->{_K}-1) { |
|
1694
|
|
|
|
|
|
|
push @{$class_distributions[$cluster_index]}, $self->{_data}->{$data_tag} |
|
1695
|
|
|
|
|
|
|
if $self->{_expected_class_probs}->{$data_tag}->[$cluster_index] > 0.2; |
|
1696
|
|
|
|
|
|
|
} |
|
1697
|
|
|
|
|
|
|
} |
|
1698
|
|
|
|
|
|
|
foreach my $distribution (@class_distributions) { |
|
1699
|
|
|
|
|
|
|
foreach my $item (@$distribution) { |
|
1700
|
|
|
|
|
|
|
print OUTPUT "@$item"; |
|
1701
|
|
|
|
|
|
|
print OUTPUT "\n"; |
|
1702
|
|
|
|
|
|
|
} |
|
1703
|
|
|
|
|
|
|
print OUTPUT "\n\n"; |
|
1704
|
|
|
|
|
|
|
} |
|
1705
|
|
|
|
|
|
|
close OUTPUT; |
|
1706
|
|
|
|
|
|
|
my $plot; |
|
1707
|
|
|
|
|
|
|
if (!defined $pause_time) { |
|
1708
|
|
|
|
|
|
|
$plot = Graphics::GnuplotIF->new( persist => 1 ); |
|
1709
|
|
|
|
|
|
|
} else { |
|
1710
|
|
|
|
|
|
|
$plot = Graphics::GnuplotIF->new(); |
|
1711
|
|
|
|
|
|
|
} |
|
1712
|
|
|
|
|
|
|
my $arg_string = ""; |
|
1713
|
|
|
|
|
|
|
if ($visualization_data_field_width > 2) { |
|
1714
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "set noclip" ); |
|
1715
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "set pointsize 2" ); |
|
1716
|
|
|
|
|
|
|
foreach my $i (0..$self->{_K}-1) { |
|
1717
|
|
|
|
|
|
|
my $j = $i + 1; |
|
1718
|
|
|
|
|
|
|
$arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster $i (based on posterior probs)\" with points lt $j pt $j, "; |
|
1719
|
|
|
|
|
|
|
} |
|
1720
|
|
|
|
|
|
|
} elsif ($visualization_data_field_width == 2) { |
|
1721
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "set noclip" ); |
|
1722
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "set pointsize 2" ); |
|
1723
|
|
|
|
|
|
|
foreach my $i (0..$self->{_K}-1) { |
|
1724
|
|
|
|
|
|
|
my $j = $i + 1; |
|
1725
|
|
|
|
|
|
|
$arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster $i (based on posterior probs)\" with points lt $j pt $j, "; |
|
1726
|
|
|
|
|
|
|
my $ellipse_filename = "__contour2_" . $i . ".dat"; |
|
1727
|
|
|
|
|
|
|
$arg_string .= "\"$ellipse_filename\" with line lt $j title \"\", "; |
|
1728
|
|
|
|
|
|
|
} |
|
1729
|
|
|
|
|
|
|
} elsif ($visualization_data_field_width == 1 ) { |
|
1730
|
|
|
|
|
|
|
open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!"; |
|
1731
|
|
|
|
|
|
|
my @all_data = <INPUT>; |
|
1732
|
|
|
|
|
|
|
close INPUT; |
|
1733
|
|
|
|
|
|
|
@all_data = map {chomp $_; $_ =~ /\d/ ? $_ : "SEPERATOR" } @all_data; |
|
1734
|
|
|
|
|
|
|
my $all_joined_data = join ':', @all_data; |
|
1735
|
|
|
|
|
|
|
my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data; |
|
1736
|
|
|
|
|
|
|
my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals); |
|
1737
|
|
|
|
|
|
|
foreach my $i (0..@separated-1) { |
|
1738
|
|
|
|
|
|
|
$separated[$i] =~ s/SEPERATOR//g; |
|
1739
|
|
|
|
|
|
|
my @cluster_for_hist = split /:/, $separated[$i]; |
|
1740
|
|
|
|
|
|
|
@cluster_for_hist = grep $_, @cluster_for_hist; |
|
1741
|
|
|
|
|
|
|
my ($minval,$maxval) = minmax(\@cluster_for_hist); |
|
1742
|
|
|
|
|
|
|
push @all_minvals, $minval; |
|
1743
|
|
|
|
|
|
|
push @all_maxvals, $maxval; |
|
1744
|
|
|
|
|
|
|
push @all_clusters_for_hist, \@cluster_for_hist; |
|
1745
|
|
|
|
|
|
|
} |
|
1746
|
|
|
|
|
|
|
push @all_minmaxvals, @all_minvals; |
|
1747
|
|
|
|
|
|
|
push @all_minmaxvals, @all_maxvals; |
|
1748
|
|
|
|
|
|
|
my ($abs_minval,$abs_maxval) = minmax(\@all_minmaxvals); |
|
1749
|
|
|
|
|
|
|
my $delta = ($abs_maxval - $abs_minval) / 100.0; |
|
1750
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set boxwidth 3"); |
|
1751
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set style fill solid border -1"); |
|
1752
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set ytics out nomirror"); |
|
1753
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set style data histograms"); |
|
1754
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set style histogram clustered"); |
|
1755
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set title 'Individual distributions shown through histograms'"); |
|
1756
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror"); |
|
1757
|
|
|
|
|
|
|
foreach my $cindex (0..@all_clusters_for_hist-1) { |
|
1758
|
|
|
|
|
|
|
my $localfilename = basename($filename); |
|
1759
|
|
|
|
|
|
|
my $temp_file = "__temp1dhist_" . "$cindex" . "_" . $localfilename; |
|
1760
|
|
|
|
|
|
|
unlink $temp_file if -e $temp_file; |
|
1761
|
|
|
|
|
|
|
open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!"; |
|
1762
|
|
|
|
|
|
|
print OUTPUT "Xstep histval\n"; |
|
1763
|
|
|
|
|
|
|
|
|
1764
|
|
|
|
|
|
|
my @histogram = (0) x 100; |
|
1765
|
|
|
|
|
|
|
foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) { |
|
1766
|
|
|
|
|
|
|
$histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++; |
|
1767
|
|
|
|
|
|
|
} |
|
1768
|
|
|
|
|
|
|
foreach my $i (0..@histogram-1) { |
|
1769
|
|
|
|
|
|
|
print OUTPUT "$i $histogram[$i]\n"; |
|
1770
|
|
|
|
|
|
|
} |
|
1771
|
|
|
|
|
|
|
# $arg_string .= "\"$temp_file\" using 1:2 ti col smooth frequency with boxes lc $cindex, "; |
|
1772
|
|
|
|
|
|
|
$arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, "; |
|
1773
|
|
|
|
|
|
|
close OUTPUT; |
|
1774
|
|
|
|
|
|
|
} |
|
1775
|
|
|
|
|
|
|
} |
|
1776
|
|
|
|
|
|
|
$arg_string = $arg_string =~ /^(.*),[ ]+$/; |
|
1777
|
|
|
|
|
|
|
$arg_string = $1; |
|
1778
|
|
|
|
|
|
|
if ($visualization_data_field_width > 2) { |
|
1779
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "splot $arg_string" ); |
|
1780
|
|
|
|
|
|
|
$plot->gnuplot_pause( $pause_time ) if defined $pause_time; |
|
1781
|
|
|
|
|
|
|
} elsif ($visualization_data_field_width == 2) { |
|
1782
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "plot $arg_string" ); |
|
1783
|
|
|
|
|
|
|
$plot->gnuplot_pause( $pause_time ) if defined $pause_time; |
|
1784
|
|
|
|
|
|
|
} elsif ($visualization_data_field_width == 1) { |
|
1785
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "plot $arg_string" ); |
|
1786
|
|
|
|
|
|
|
$plot->gnuplot_pause( $pause_time ) if defined $pause_time; |
|
1787
|
|
|
|
|
|
|
} |
|
1788
|
|
|
|
|
|
|
} |
|
1789
|
|
|
|
|
|
|
|
|
1790
|
|
|
|
|
|
|
# This method is basically the same as the previous method, except that it is |
|
1791
|
|
|
|
|
|
|
# intended for making PNG files from the distributions. |
|
1792
|
|
|
|
|
|
|
sub plot_hardcopy_distributions { |
|
1793
|
|
|
|
|
|
|
my $self = shift; |
|
1794
|
|
|
|
|
|
|
my $v_mask; |
|
1795
|
|
|
|
|
|
|
my $pause_time; |
|
1796
|
|
|
|
|
|
|
if (@_ == 1) { |
|
1797
|
|
|
|
|
|
|
$v_mask = shift || die "visualization mask missing"; |
|
1798
|
|
|
|
|
|
|
} elsif (@_ == 2) { |
|
1799
|
|
|
|
|
|
|
$v_mask = shift || die "visualization mask missing"; |
|
1800
|
|
|
|
|
|
|
$pause_time = shift; |
|
1801
|
|
|
|
|
|
|
} else { |
|
1802
|
|
|
|
|
|
|
die "visualize_distributions() called with wrong args"; |
|
1803
|
|
|
|
|
|
|
} |
|
1804
|
|
|
|
|
|
|
my @v_mask = split //, $v_mask; |
|
1805
|
|
|
|
|
|
|
my $visualization_mask_width = @v_mask; |
|
1806
|
|
|
|
|
|
|
my $original_data_mask = $self->{_mask}; |
|
1807
|
|
|
|
|
|
|
my @mask = split //, $original_data_mask; |
|
1808
|
|
|
|
|
|
|
my $data_field_width = scalar grep {$_ eq '1'} @mask; |
|
1809
|
|
|
|
|
|
|
die "\n\nABORTED: The width of the visualization mask (including " . |
|
1810
|
|
|
|
|
|
|
"all its 1s and 0s) must equal the width of the original mask " . |
|
1811
|
|
|
|
|
|
|
"used for reading the data file (counting only the 1's)" |
|
1812
|
|
|
|
|
|
|
if $visualization_mask_width != $data_field_width; |
|
1813
|
|
|
|
|
|
|
my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask; |
|
1814
|
|
|
|
|
|
|
if ($visualization_data_field_width == 2) { |
|
1815
|
|
|
|
|
|
|
foreach my $cluster_index (0..$self->{_K}-1) { |
|
1816
|
|
|
|
|
|
|
my $contour_filename = "__contour2_" . $cluster_index . ".dat"; |
|
1817
|
|
|
|
|
|
|
my $mean = $self->{_cluster_means}->[$cluster_index]; |
|
1818
|
|
|
|
|
|
|
my $covariance = $self->{_cluster_covariances}->[$cluster_index]; |
|
1819
|
|
|
|
|
|
|
my ($mux,$muy) = $mean->as_list(); |
|
1820
|
|
|
|
|
|
|
my ($varx,$sigmaxy) = $covariance->row(0)->as_list(); |
|
1821
|
|
|
|
|
|
|
my ($sigmayx,$vary) = $covariance->row(1)->as_list(); |
|
1822
|
|
|
|
|
|
|
die "Your covariance matrix does not look right" |
|
1823
|
|
|
|
|
|
|
unless $sigmaxy == $sigmayx; |
|
1824
|
|
|
|
|
|
|
my ($sigmax,$sigmay) = (sqrt($varx),sqrt($vary)); |
|
1825
|
|
|
|
|
|
|
my $argstring = <<"END"; |
|
1826
|
|
|
|
|
|
|
set contour |
|
1827
|
|
|
|
|
|
|
mux = $mux |
|
1828
|
|
|
|
|
|
|
muy = $muy |
|
1829
|
|
|
|
|
|
|
sigmax = $sigmax |
|
1830
|
|
|
|
|
|
|
sigmay = $sigmay |
|
1831
|
|
|
|
|
|
|
sigmaxy = $sigmaxy |
|
1832
|
|
|
|
|
|
|
determinant = (sigmax**2)*(sigmay**2) - sigmaxy**2 |
|
1833
|
|
|
|
|
|
|
exponent(x,y) = -0.5 * (1.0 / determinant) * ( ((x-mux)**2)*sigmay**2 + ((y-muy)**2)*sigmax**2 - 2*sigmaxy*(x-mux)*(y-muy) ) |
|
1834
|
|
|
|
|
|
|
f(x,y) = exp( exponent(x,y) ) - 0.2 |
|
1835
|
|
|
|
|
|
|
xmax = mux + 2 * sigmax |
|
1836
|
|
|
|
|
|
|
xmin = mux - 2 * sigmax |
|
1837
|
|
|
|
|
|
|
ymax = muy + 2 * sigmay |
|
1838
|
|
|
|
|
|
|
ymin = muy - 2 * sigmay |
|
1839
|
|
|
|
|
|
|
set xrange [ xmin : xmax ] |
|
1840
|
|
|
|
|
|
|
set yrange [ ymin : ymax ] |
|
1841
|
|
|
|
|
|
|
set isosamples 200 |
|
1842
|
|
|
|
|
|
|
unset surface |
|
1843
|
|
|
|
|
|
|
set cntrparam levels discrete 0 |
|
1844
|
|
|
|
|
|
|
set table \"$contour_filename\" |
|
1845
|
|
|
|
|
|
|
splot f(x,y) |
|
1846
|
|
|
|
|
|
|
unset table |
|
1847
|
|
|
|
|
|
|
END |
|
1848
|
|
|
|
|
|
|
my $plot = Graphics::GnuplotIF->new(); |
|
1849
|
|
|
|
|
|
|
$plot->gnuplot_cmd( $argstring ); |
|
1850
|
|
|
|
|
|
|
} |
|
1851
|
|
|
|
|
|
|
} |
|
1852
|
|
|
|
|
|
|
my %visualization_data; |
|
1853
|
|
|
|
|
|
|
while ( my ($record_id, $data) = each %{$self->{_data}} ) { |
|
1854
|
|
|
|
|
|
|
my @fields = @$data; |
|
1855
|
|
|
|
|
|
|
die "\nABORTED: Visualization mask size exceeds data record size" |
|
1856
|
|
|
|
|
|
|
if $#v_mask > $#fields; |
|
1857
|
|
|
|
|
|
|
my @data_fields; |
|
1858
|
|
|
|
|
|
|
foreach my $i (0..@fields-1) { |
|
1859
|
|
|
|
|
|
|
if ($v_mask[$i] eq '0') { |
|
1860
|
|
|
|
|
|
|
next; |
|
1861
|
|
|
|
|
|
|
} elsif ($v_mask[$i] eq '1') { |
|
1862
|
|
|
|
|
|
|
push @data_fields, $fields[$i]; |
|
1863
|
|
|
|
|
|
|
} else { |
|
1864
|
|
|
|
|
|
|
die "Misformed visualization mask. It can only have 1s and 0s"; |
|
1865
|
|
|
|
|
|
|
} |
|
1866
|
|
|
|
|
|
|
} |
|
1867
|
|
|
|
|
|
|
$visualization_data{ $record_id } = \@data_fields; |
|
1868
|
|
|
|
|
|
|
} |
|
1869
|
|
|
|
|
|
|
my $filename = basename($self->{_datafile}); |
|
1870
|
|
|
|
|
|
|
my $temp_file = "__temp2_" . $filename; |
|
1871
|
|
|
|
|
|
|
unlink $temp_file if -e $temp_file; |
|
1872
|
|
|
|
|
|
|
open OUTPUT, ">$temp_file" |
|
1873
|
|
|
|
|
|
|
or die "Unable to open a temp file in this directory: $!"; |
|
1874
|
|
|
|
|
|
|
my @class_distributions; |
|
1875
|
|
|
|
|
|
|
foreach my $cluster_index (0..$self->{_K}-1) { |
|
1876
|
|
|
|
|
|
|
push @class_distributions, []; |
|
1877
|
|
|
|
|
|
|
} |
|
1878
|
|
|
|
|
|
|
foreach my $data_tag (@{$self->{_data_id_tags}}) { |
|
1879
|
|
|
|
|
|
|
foreach my $cluster_index (0..$self->{_K}-1) { |
|
1880
|
|
|
|
|
|
|
push @{$class_distributions[$cluster_index]}, $self->{_data}->{$data_tag} |
|
1881
|
|
|
|
|
|
|
if $self->{_expected_class_probs}->{$data_tag}->[$cluster_index] > 0.2; |
|
1882
|
|
|
|
|
|
|
} |
|
1883
|
|
|
|
|
|
|
} |
|
1884
|
|
|
|
|
|
|
foreach my $distribution (@class_distributions) { |
|
1885
|
|
|
|
|
|
|
foreach my $item (@$distribution) { |
|
1886
|
|
|
|
|
|
|
print OUTPUT "@$item"; |
|
1887
|
|
|
|
|
|
|
print OUTPUT "\n"; |
|
1888
|
|
|
|
|
|
|
} |
|
1889
|
|
|
|
|
|
|
print OUTPUT "\n\n"; |
|
1890
|
|
|
|
|
|
|
} |
|
1891
|
|
|
|
|
|
|
close OUTPUT; |
|
1892
|
|
|
|
|
|
|
my $plot; |
|
1893
|
|
|
|
|
|
|
if (!defined $pause_time) { |
|
1894
|
|
|
|
|
|
|
$plot = Graphics::GnuplotIF->new( persist => 1 ); |
|
1895
|
|
|
|
|
|
|
} else { |
|
1896
|
|
|
|
|
|
|
$plot = Graphics::GnuplotIF->new(); |
|
1897
|
|
|
|
|
|
|
} |
|
1898
|
|
|
|
|
|
|
my $arg_string = ""; |
|
1899
|
|
|
|
|
|
|
if ($visualization_data_field_width > 2) { |
|
1900
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "set noclip" ); |
|
1901
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "set pointsize 2" ); |
|
1902
|
|
|
|
|
|
|
foreach my $i (0..$self->{_K}-1) { |
|
1903
|
|
|
|
|
|
|
my $j = $i + 1; |
|
1904
|
|
|
|
|
|
|
$arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster $i (based on posterior probs)\" with points lt $j pt $j, "; |
|
1905
|
|
|
|
|
|
|
} |
|
1906
|
|
|
|
|
|
|
} elsif ($visualization_data_field_width == 2) { |
|
1907
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "set noclip" ); |
|
1908
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "set pointsize 2" ); |
|
1909
|
|
|
|
|
|
|
foreach my $i (0..$self->{_K}-1) { |
|
1910
|
|
|
|
|
|
|
my $j = $i + 1; |
|
1911
|
|
|
|
|
|
|
$arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster $i (based on posterior probs)\" with points lt $j pt $j, "; |
|
1912
|
|
|
|
|
|
|
my $ellipse_filename = "__contour2_" . $i . ".dat"; |
|
1913
|
|
|
|
|
|
|
$arg_string .= "\"$ellipse_filename\" with line lt $j title \"\", "; |
|
1914
|
|
|
|
|
|
|
} |
|
1915
|
|
|
|
|
|
|
} elsif ($visualization_data_field_width == 1 ) { |
|
1916
|
|
|
|
|
|
|
open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!"; |
|
1917
|
|
|
|
|
|
|
my @all_data = <INPUT>; |
|
1918
|
|
|
|
|
|
|
close INPUT; |
|
1919
|
|
|
|
|
|
|
@all_data = map {chomp $_; $_ =~ /\d/ ? $_ : "SEPERATOR" } @all_data; |
|
1920
|
|
|
|
|
|
|
my $all_joined_data = join ':', @all_data; |
|
1921
|
|
|
|
|
|
|
my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data; |
|
1922
|
|
|
|
|
|
|
my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals); |
|
1923
|
|
|
|
|
|
|
foreach my $i (0..@separated-1) { |
|
1924
|
|
|
|
|
|
|
$separated[$i] =~ s/SEPERATOR//g; |
|
1925
|
|
|
|
|
|
|
my @cluster_for_hist = split /:/, $separated[$i]; |
|
1926
|
|
|
|
|
|
|
@cluster_for_hist = grep $_, @cluster_for_hist; |
|
1927
|
|
|
|
|
|
|
my ($minval,$maxval) = minmax(\@cluster_for_hist); |
|
1928
|
|
|
|
|
|
|
push @all_minvals, $minval; |
|
1929
|
|
|
|
|
|
|
push @all_maxvals, $maxval; |
|
1930
|
|
|
|
|
|
|
push @all_clusters_for_hist, \@cluster_for_hist; |
|
1931
|
|
|
|
|
|
|
} |
|
1932
|
|
|
|
|
|
|
push @all_minmaxvals, @all_minvals; |
|
1933
|
|
|
|
|
|
|
push @all_minmaxvals, @all_maxvals; |
|
1934
|
|
|
|
|
|
|
my ($abs_minval,$abs_maxval) = minmax(\@all_minmaxvals); |
|
1935
|
|
|
|
|
|
|
my $delta = ($abs_maxval - $abs_minval) / 100.0; |
|
1936
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set boxwidth 3"); |
|
1937
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set style fill solid border -1"); |
|
1938
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set ytics out nomirror"); |
|
1939
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set style data histograms"); |
|
1940
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set style histogram clustered"); |
|
1941
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set title 'Individual distributions shown through histograms'"); |
|
1942
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror"); |
|
1943
|
|
|
|
|
|
|
foreach my $cindex (0..@all_clusters_for_hist-1) { |
|
1944
|
|
|
|
|
|
|
my $localfilename = basename($filename); |
|
1945
|
|
|
|
|
|
|
my $temp_file = "__temp1dhist_" . "$cindex" . "_" . $localfilename; |
|
1946
|
|
|
|
|
|
|
unlink $temp_file if -e $temp_file; |
|
1947
|
|
|
|
|
|
|
open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!"; |
|
1948
|
|
|
|
|
|
|
print OUTPUT "Xstep histval\n"; |
|
1949
|
|
|
|
|
|
|
|
|
1950
|
|
|
|
|
|
|
my @histogram = (0) x 100; |
|
1951
|
|
|
|
|
|
|
foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) { |
|
1952
|
|
|
|
|
|
|
$histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++; |
|
1953
|
|
|
|
|
|
|
} |
|
1954
|
|
|
|
|
|
|
foreach my $i (0..@histogram-1) { |
|
1955
|
|
|
|
|
|
|
print OUTPUT "$i $histogram[$i]\n"; |
|
1956
|
|
|
|
|
|
|
} |
|
1957
|
|
|
|
|
|
|
$arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, "; |
|
1958
|
|
|
|
|
|
|
close OUTPUT; |
|
1959
|
|
|
|
|
|
|
} |
|
1960
|
|
|
|
|
|
|
} |
|
1961
|
|
|
|
|
|
|
$arg_string = $arg_string =~ /^(.*),[ ]+$/; |
|
1962
|
|
|
|
|
|
|
$arg_string = $1; |
|
1963
|
|
|
|
|
|
|
|
|
1964
|
|
|
|
|
|
|
if ($visualization_data_field_width > 2) { |
|
1965
|
|
|
|
|
|
|
$plot->gnuplot_cmd( 'set terminal png', |
|
1966
|
|
|
|
|
|
|
'set output "posterior_prob_plot.png"'); |
|
1967
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "splot $arg_string" ); |
|
1968
|
|
|
|
|
|
|
} elsif ($visualization_data_field_width == 2) { |
|
1969
|
|
|
|
|
|
|
$plot->gnuplot_cmd( 'set terminal png', |
|
1970
|
|
|
|
|
|
|
'set output "posterior_prob_plot.png"'); |
|
1971
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "plot $arg_string" ); |
|
1972
|
|
|
|
|
|
|
} elsif ($visualization_data_field_width == 1) { |
|
1973
|
|
|
|
|
|
|
$plot->gnuplot_cmd( 'set terminal png', |
|
1974
|
|
|
|
|
|
|
'set output "posterior_prob_plot.png"'); |
|
1975
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "plot $arg_string" ); |
|
1976
|
|
|
|
|
|
|
} |
|
1977
|
|
|
|
|
|
|
} |
|
1978
|
|
|
|
|
|
|
|
|
1979
|
|
|
|
|
|
|
# The method shown below should be called only AFTER you have called the method |
|
1980
|
|
|
|
|
|
|
# read_data_from_file(). The visualize_data() is meant for the visualization of the |
|
1981
|
|
|
|
|
|
|
# original data in its various 2D or 3D subspaces. |
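# A minimal usage sketch (hypothetical mask values; assumes read_data_from_file()
# has already been called and that the original data mask retained three columns):
#
#     $clusterer->visualize_data("111");    # 3D scatter plot of all three retained fields
#     $clusterer->visualize_data("110");    # 2D plot restricted to the first two fields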
|
1982
|
|
|
|
|
|
|
sub visualize_data { |
|
1983
|
|
|
|
|
|
|
my $self = shift; |
|
1984
|
|
|
|
|
|
|
my $v_mask = shift || die "visualization mask missing"; |
|
1985
|
|
|
|
|
|
|
|
|
1986
|
|
|
|
|
|
|
my $master_datafile = $self->{_datafile}; |
|
1987
|
|
|
|
|
|
|
|
|
1988
|
|
|
|
|
|
|
my @v_mask = split //, $v_mask; |
|
1989
|
|
|
|
|
|
|
my $visualization_mask_width = @v_mask; |
|
1990
|
|
|
|
|
|
|
my $original_data_mask = $self->{_mask}; |
|
1991
|
|
|
|
|
|
|
my @mask = split //, $original_data_mask; |
|
1992
|
|
|
|
|
|
|
my $data_field_width = scalar grep {$_ eq '1'} @mask; |
|
1993
|
|
|
|
|
|
|
die "\n\nABORTED: The width of the visualization mask (including " . |
|
1994
|
|
|
|
|
|
|
"all its 1s and 0s) must equal the width of the original mask " . |
|
1995
|
|
|
|
|
|
|
"used for reading the data file (counting only the 1's)" |
|
1996
|
|
|
|
|
|
|
if $visualization_mask_width != $data_field_width; |
|
1997
|
|
|
|
|
|
|
my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask; |
|
1998
|
|
|
|
|
|
|
my %visualization_data; |
|
1999
|
|
|
|
|
|
|
my $data_source = $self->{_data}; |
|
2000
|
|
|
|
|
|
|
while ( my ($record_id, $data) = each %{$data_source} ) { |
|
2001
|
|
|
|
|
|
|
my @fields = @$data; |
|
2002
|
|
|
|
|
|
|
die "\nABORTED: Visualization mask size exceeds data record size" |
|
2003
|
|
|
|
|
|
|
if $#v_mask > $#fields; |
|
2004
|
|
|
|
|
|
|
my @data_fields; |
|
2005
|
|
|
|
|
|
|
foreach my $i (0..@fields-1) { |
|
2006
|
|
|
|
|
|
|
if ($v_mask[$i] eq '0') { |
|
2007
|
|
|
|
|
|
|
next; |
|
2008
|
|
|
|
|
|
|
} elsif ($v_mask[$i] eq '1') { |
|
2009
|
|
|
|
|
|
|
push @data_fields, $fields[$i]; |
|
2010
|
|
|
|
|
|
|
} else { |
|
2011
|
|
|
|
|
|
|
die "Misformed visualization mask. It can only have 1s and 0s"; |
|
2012
|
|
|
|
|
|
|
} |
|
2013
|
|
|
|
|
|
|
} |
|
2014
|
|
|
|
|
|
|
$visualization_data{ $record_id } = \@data_fields; |
|
2015
|
|
|
|
|
|
|
} |
|
2016
|
|
|
|
|
|
|
my $filename = basename($master_datafile); |
|
2017
|
|
|
|
|
|
|
my $temp_file; |
|
2018
|
|
|
|
|
|
|
$temp_file = "__temp_data_" . $filename; |
|
2019
|
|
|
|
|
|
|
unlink $temp_file if -e $temp_file; |
|
2020
|
|
|
|
|
|
|
open OUTPUT, ">$temp_file" |
|
2021
|
|
|
|
|
|
|
or die "Unable to open a temp file in this directory: $!"; |
|
2022
|
|
|
|
|
|
|
foreach my $datapoint (values %visualization_data) { |
|
2023
|
|
|
|
|
|
|
print OUTPUT "@$datapoint"; |
|
2024
|
|
|
|
|
|
|
print OUTPUT "\n"; |
|
2025
|
|
|
|
|
|
|
} |
|
2026
|
|
|
|
|
|
|
close OUTPUT; |
|
2027
|
|
|
|
|
|
|
my $plot = Graphics::GnuplotIF->new( persist => 1 ); |
|
2028
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "set noclip" ); |
|
2029
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "set pointsize 2" ); |
|
2030
|
|
|
|
|
|
|
my $plot_title = '"original data provided for EM"'; |
|
2031
|
|
|
|
|
|
|
my $arg_string ; |
|
2032
|
|
|
|
|
|
|
if ($visualization_data_field_width > 2) { |
|
2033
|
|
|
|
|
|
|
$arg_string = "\"$temp_file\" using 1:2:3 title $plot_title with points lt -1 pt 1"; |
|
2034
|
|
|
|
|
|
|
} elsif ($visualization_data_field_width == 2) { |
|
2035
|
|
|
|
|
|
|
$arg_string = "\"$temp_file\" using 1:2 title $plot_title with points lt -1 pt 1"; |
|
2036
|
|
|
|
|
|
|
} elsif ($visualization_data_field_width == 1 ) { |
|
2037
|
|
|
|
|
|
|
open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!"; |
|
2038
|
|
|
|
|
|
|
my @all_data = <INPUT>;
|
2039
|
|
|
|
|
|
|
close INPUT; |
|
2040
|
|
|
|
|
|
|
@all_data = map {chomp $_; $_} @all_data; |
|
2041
|
|
|
|
|
|
|
@all_data = grep $_, @all_data; |
|
2042
|
|
|
|
|
|
|
my ($minval,$maxval) = minmax(\@all_data); |
|
2043
|
|
|
|
|
|
|
my $delta = ($maxval - $minval) / 100.0; |
|
2044
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set boxwidth 3"); |
|
2045
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set style fill solid border -1"); |
|
2046
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set ytics out nomirror"); |
|
2047
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set style data histograms"); |
|
2048
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set style histogram clustered"); |
|
2049
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set title 'Overall distribution of 1D data'"); |
|
2050
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror"); |
|
2051
|
|
|
|
|
|
|
my $localfilename = basename($filename); |
|
2052
|
|
|
|
|
|
|
my $temp_file = "__temp1dhist_" . $localfilename; |
|
2053
|
|
|
|
|
|
|
unlink $temp_file if -e $temp_file; |
|
2054
|
|
|
|
|
|
|
open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!"; |
|
2055
|
|
|
|
|
|
|
print OUTPUT "Xstep histval\n"; |
|
2056
|
|
|
|
|
|
|
my @histogram = (0) x 100; |
|
2057
|
|
|
|
|
|
|
foreach my $i (0..@all_data-1) { |
|
2058
|
|
|
|
|
|
|
$histogram[int( ($all_data[$i] - $minval) / $delta )]++; |
|
2059
|
|
|
|
|
|
|
} |
|
2060
|
|
|
|
|
|
|
foreach my $i (0..@histogram-1) { |
|
2061
|
|
|
|
|
|
|
print OUTPUT "$i $histogram[$i]\n"; |
|
2062
|
|
|
|
|
|
|
} |
|
2063
|
|
|
|
|
|
|
$arg_string = "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc rgb 'green'"; |
|
2064
|
|
|
|
|
|
|
close OUTPUT; |
|
2065
|
|
|
|
|
|
|
} |
|
2066
|
|
|
|
|
|
|
if ($visualization_data_field_width > 2) { |
|
2067
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "splot $arg_string" ); |
|
2068
|
|
|
|
|
|
|
} elsif ($visualization_data_field_width == 2) { |
|
2069
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "plot $arg_string" ); |
|
2070
|
|
|
|
|
|
|
} elsif ($visualization_data_field_width == 1) { |
|
2071
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "plot $arg_string" ); |
|
2072
|
|
|
|
|
|
|
} |
|
2073
|
|
|
|
|
|
|
} |
|
2074
|
|
|
|
|
|
|
|
|
2075
|
|
|
|
|
|
|
# This method is the same as the one shown above, except that it is meant for |
|
2076
|
|
|
|
|
|
|
# creating PNG files of the displays. |
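# A usage sketch (same assumptions as for visualize_data() above); the PNG hardcopy
# is written to data_scatter_plot.png in the current directory:
#
#     $clusterer->plot_hardcopy_data("111");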
|
2077
|
|
|
|
|
|
|
sub plot_hardcopy_data { |
|
2078
|
|
|
|
|
|
|
my $self = shift; |
|
2079
|
|
|
|
|
|
|
my $v_mask = shift || die "visualization mask missing"; |
|
2080
|
|
|
|
|
|
|
my $master_datafile = $self->{_datafile}; |
|
2081
|
|
|
|
|
|
|
my @v_mask = split //, $v_mask; |
|
2082
|
|
|
|
|
|
|
my $visualization_mask_width = @v_mask; |
|
2083
|
|
|
|
|
|
|
my $original_data_mask = $self->{_mask}; |
|
2084
|
|
|
|
|
|
|
my @mask = split //, $original_data_mask; |
|
2085
|
|
|
|
|
|
|
my $data_field_width = scalar grep {$_ eq '1'} @mask; |
|
2086
|
|
|
|
|
|
|
die "\n\nABORTED: The width of the visualization mask (including " . |
|
2087
|
|
|
|
|
|
|
"all its 1s and 0s) must equal the width of the original mask " . |
|
2088
|
|
|
|
|
|
|
"used for reading the data file (counting only the 1's)" |
|
2089
|
|
|
|
|
|
|
if $visualization_mask_width != $data_field_width; |
|
2090
|
|
|
|
|
|
|
my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask; |
|
2091
|
|
|
|
|
|
|
my %visualization_data; |
|
2092
|
|
|
|
|
|
|
my $data_source = $self->{_data}; |
|
2093
|
|
|
|
|
|
|
while ( my ($record_id, $data) = each %{$data_source} ) { |
|
2094
|
|
|
|
|
|
|
my @fields = @$data; |
|
2095
|
|
|
|
|
|
|
die "\nABORTED: Visualization mask size exceeds data record size" |
|
2096
|
|
|
|
|
|
|
if $#v_mask > $#fields; |
|
2097
|
|
|
|
|
|
|
my @data_fields; |
|
2098
|
|
|
|
|
|
|
foreach my $i (0..@fields-1) { |
|
2099
|
|
|
|
|
|
|
if ($v_mask[$i] eq '0') { |
|
2100
|
|
|
|
|
|
|
next; |
|
2101
|
|
|
|
|
|
|
} elsif ($v_mask[$i] eq '1') { |
|
2102
|
|
|
|
|
|
|
push @data_fields, $fields[$i]; |
|
2103
|
|
|
|
|
|
|
} else { |
|
2104
|
|
|
|
|
|
|
die "Misformed visualization mask. It can only have 1s and 0s"; |
|
2105
|
|
|
|
|
|
|
} |
|
2106
|
|
|
|
|
|
|
} |
|
2107
|
|
|
|
|
|
|
$visualization_data{ $record_id } = \@data_fields; |
|
2108
|
|
|
|
|
|
|
} |
|
2109
|
|
|
|
|
|
|
my $filename = basename($master_datafile); |
|
2110
|
|
|
|
|
|
|
my $temp_file; |
|
2111
|
|
|
|
|
|
|
$temp_file = "__temp_data_" . $filename; |
|
2112
|
|
|
|
|
|
|
unlink $temp_file if -e $temp_file; |
|
2113
|
|
|
|
|
|
|
open OUTPUT, ">$temp_file" |
|
2114
|
|
|
|
|
|
|
or die "Unable to open a temp file in this directory: $!"; |
|
2115
|
|
|
|
|
|
|
foreach my $datapoint (values %visualization_data) { |
|
2116
|
|
|
|
|
|
|
print OUTPUT "@$datapoint"; |
|
2117
|
|
|
|
|
|
|
print OUTPUT "\n"; |
|
2118
|
|
|
|
|
|
|
} |
|
2119
|
|
|
|
|
|
|
close OUTPUT; |
|
2120
|
|
|
|
|
|
|
|
|
2121
|
|
|
|
|
|
|
my $plot = Graphics::GnuplotIF->new( persist => 1 ); |
|
2122
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "set noclip" ); |
|
2123
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "set pointsize 2" ); |
|
2124
|
|
|
|
|
|
|
my $plot_title = '"original data provided for EM"'; |
|
2125
|
|
|
|
|
|
|
my $arg_string ; |
|
2126
|
|
|
|
|
|
|
if ($visualization_data_field_width > 2) { |
|
2127
|
|
|
|
|
|
|
$arg_string = "\"$temp_file\" using 1:2:3 title $plot_title with points lt -1 pt 1"; |
|
2128
|
|
|
|
|
|
|
} elsif ($visualization_data_field_width == 2) { |
|
2129
|
|
|
|
|
|
|
$arg_string = "\"$temp_file\" using 1:2 title $plot_title with points lt -1 pt 1"; |
|
2130
|
|
|
|
|
|
|
} elsif ($visualization_data_field_width == 1 ) { |
|
2131
|
|
|
|
|
|
|
open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!"; |
|
2132
|
|
|
|
|
|
|
my @all_data = <INPUT>;
|
2133
|
|
|
|
|
|
|
close INPUT; |
|
2134
|
|
|
|
|
|
|
@all_data = map {chomp $_; $_} @all_data; |
|
2135
|
|
|
|
|
|
|
@all_data = grep $_, @all_data; |
|
2136
|
|
|
|
|
|
|
my ($minval,$maxval) = minmax(\@all_data); |
|
2137
|
|
|
|
|
|
|
my $delta = ($maxval - $minval) / 100.0; |
|
2138
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set boxwidth 3"); |
|
2139
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set style fill solid border -1"); |
|
2140
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set ytics out nomirror"); |
|
2141
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set style data histograms"); |
|
2142
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set style histogram clustered"); |
|
2143
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set title 'Overall distribution of 1D data'"); |
|
2144
|
|
|
|
|
|
|
$plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror"); |
|
2145
|
|
|
|
|
|
|
my $localfilename = basename($filename); |
|
2146
|
|
|
|
|
|
|
my $temp_file = "__temp1dhist_" . $localfilename; |
|
2147
|
|
|
|
|
|
|
unlink $temp_file if -e $temp_file; |
|
2148
|
|
|
|
|
|
|
open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!"; |
|
2149
|
|
|
|
|
|
|
print OUTPUT "Xstep histval\n"; |
|
2150
|
|
|
|
|
|
|
my @histogram = (0) x 100; |
|
2151
|
|
|
|
|
|
|
foreach my $i (0..@all_data-1) { |
|
2152
|
|
|
|
|
|
|
$histogram[int( ($all_data[$i] - $minval) / $delta )]++; |
|
2153
|
|
|
|
|
|
|
} |
|
2154
|
|
|
|
|
|
|
foreach my $i (0..@histogram-1) { |
|
2155
|
|
|
|
|
|
|
print OUTPUT "$i $histogram[$i]\n"; |
|
2156
|
|
|
|
|
|
|
} |
|
2157
|
|
|
|
|
|
|
$arg_string = "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc rgb 'green'"; |
|
2158
|
|
|
|
|
|
|
close OUTPUT; |
|
2159
|
|
|
|
|
|
|
} |
|
2160
|
|
|
|
|
|
|
if ($visualization_data_field_width > 2) { |
|
2161
|
|
|
|
|
|
|
$plot->gnuplot_cmd( 'set terminal png', |
|
2162
|
|
|
|
|
|
|
'set output "data_scatter_plot.png"'); |
|
2163
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "splot $arg_string" ); |
|
2164
|
|
|
|
|
|
|
} elsif ($visualization_data_field_width == 2) { |
|
2165
|
|
|
|
|
|
|
$plot->gnuplot_cmd( 'set terminal png', |
|
2166
|
|
|
|
|
|
|
'set output "data_scatter_plot.png"'); |
|
2167
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "plot $arg_string" ); |
|
2168
|
|
|
|
|
|
|
} elsif ($visualization_data_field_width == 1) { |
|
2169
|
|
|
|
|
|
|
$plot->gnuplot_cmd( 'set terminal png', |
|
2170
|
|
|
|
|
|
|
'set output "data_scatter_plot.png"'); |
|
2171
|
|
|
|
|
|
|
$plot->gnuplot_cmd( "plot $arg_string" ); |
|
2172
|
|
|
|
|
|
|
} |
|
2173
|
|
|
|
|
|
|
} |
|
2174
|
|
|
|
|
|
|
|
|
2175
|
|
|
|
|
|
|
|
|
2176
|
|
|
|
|
|
|
################### Generating Synthetic Data for Clustering ################### |
|
2177
|
|
|
|
|
|
|
|
|
2178
|
|
|
|
|
|
|
# The data generated corresponds to a multivariate distribution. The mean and the |
|
2179
|
|
|
|
|
|
|
# covariance of each Gaussian in the distribution are specified individually in a |
|
2180
|
|
|
|
|
|
|
# parameter file. The parameter file must also state the prior probabilities to be |
|
2181
|
|
|
|
|
|
|
# associated with each Gaussian. See the example parameter file param1.txt in the |
|
2182
|
|
|
|
|
|
|
# examples directory. Just edit this file for your own needs. |
|
2183
|
|
|
|
|
|
|
# |
|
2184
|
|
|
|
|
|
|
# The multivariate random numbers are generated by calling the Math::Random module. |
|
2185
|
|
|
|
|
|
|
# As you would expect, that module will insist that the covariance matrix you |
|
2186
|
|
|
|
|
|
|
# specify be symmetric and positive definite. |
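# As an illustration of the expected layout (drawn from the built-in default string
# used by t/test.t further below, not from param1.txt itself): a "priors" entry
# listing one prior per Gaussian, followed by one "cluster" entry per Gaussian that
# gives the mean vector and then the covariance matrix in row-major order, e.g.
# "priors 0.34 0.33 0.33  cluster 5 0 0  1 0 0 0 1 0 0 0 1  ..." for 3D data with
# identity covariances.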
|
2187
|
|
|
|
|
|
|
sub cluster_data_generator { |
|
2188
|
|
|
|
|
|
|
my $class = shift; |
|
2189
|
|
|
|
|
|
|
die "illegal call of a class method" |
|
2190
|
|
|
|
|
|
|
unless $class eq 'Algorithm::ExpectationMaximization'; |
|
2191
|
|
|
|
|
|
|
my %args = @_; |
|
2192
|
|
|
|
|
|
|
my $input_parameter_file = $args{input_parameter_file}; |
|
2193
|
|
|
|
|
|
|
my $output_file = $args{output_datafile}; |
|
2194
|
|
|
|
|
|
|
my $N = $args{total_number_of_data_points}; |
|
2195
|
|
|
|
|
|
|
my @all_params; |
|
2196
|
|
|
|
|
|
|
my $param_string; |
|
2197
|
|
|
|
|
|
|
if (defined $input_parameter_file) { |
|
2198
|
|
|
|
|
|
|
open INPUT, $input_parameter_file or die "unable to open parameter file: $!";
|
2199
|
|
|
|
|
|
|
@all_params = <INPUT>;
|
2200
|
|
|
|
|
|
|
@all_params = grep { $_ !~ /^[ ]*#/ } @all_params; |
|
2201
|
|
|
|
|
|
|
chomp @all_params; |
|
2202
|
|
|
|
|
|
|
$param_string = join ' ', @all_params; |
|
2203
|
|
|
|
|
|
|
} else { |
|
2204
|
|
|
|
|
|
|
# Just for testing. Used in t/test.t |
|
2205
|
|
|
|
|
|
|
$param_string = "priors 0.34 0.33 0.33 " . |
|
2206
|
|
|
|
|
|
|
"cluster 5 0 0 1 0 0 0 1 0 0 0 1 " . |
|
2207
|
|
|
|
|
|
|
"cluster 0 5 0 1 0 0 0 1 0 0 0 1 " . |
|
2208
|
|
|
|
|
|
|
"cluster 0 0 5 1 0 0 0 1 0 0 0 1"; |
|
2209
|
|
|
|
|
|
|
} |
|
2210
|
|
|
|
|
|
|
my ($priors_string) = $param_string =~ /priors(.+?)cluster/; |
|
2211
|
|
|
|
|
|
|
croak "You did not specify the prior probabilities in the parameter file" |
|
2212
|
|
|
|
|
|
|
unless $priors_string; |
|
2213
|
|
|
|
|
|
|
my @priors = split /\s+/, $priors_string; |
|
2214
|
|
|
|
|
|
|
@priors = grep {/$_num_regex/; $_} @priors; |
|
2215
|
|
|
|
|
|
|
my $sum = 0; |
|
2216
|
|
|
|
|
|
|
foreach my $prior (@priors) { |
|
2217
|
|
|
|
|
|
|
$sum += $prior; |
|
2218
|
|
|
|
|
|
|
} |
|
2219
|
|
|
|
|
|
|
croak "Your priors in the parameter file do not add up to 1" unless $sum == 1; |
|
2220
|
|
|
|
|
|
|
my ($rest_of_string) = $param_string =~ /priors\s*$priors_string(.*)$/; |
|
2221
|
|
|
|
|
|
|
my @cluster_strings = split /[ ]*cluster[ ]*/, $rest_of_string; |
|
2222
|
|
|
|
|
|
|
@cluster_strings = grep $_, @cluster_strings; |
|
2223
|
|
|
|
|
|
|
|
|
2224
|
|
|
|
|
|
|
my $K = @cluster_strings; |
|
2225
|
|
|
|
|
|
|
croak "Too many clusters requested" if $K > 12; |
|
2226
|
|
|
|
|
|
|
croak "Mismatch between the number of values for priors and the number " . |
|
2227
|
|
|
|
|
|
|
"of clusters" unless $K == @priors; |
|
2228
|
|
|
|
|
|
|
my @point_labels = ('a'..'z'); |
|
2229
|
|
|
|
|
|
|
print "Prior probabilities recorded from param file: @priors\n"; |
|
2230
|
|
|
|
|
|
|
print "Number of Gaussians used for the synthetic data: $K\n"; |
|
2231
|
|
|
|
|
|
|
my @means; |
|
2232
|
|
|
|
|
|
|
my @covariances; |
|
2233
|
|
|
|
|
|
|
my $data_dimension; |
|
2234
|
|
|
|
|
|
|
foreach my $i (0..$K-1) { |
|
2235
|
|
|
|
|
|
|
my @num_strings = split / /, $cluster_strings[$i]; |
|
2236
|
|
|
|
|
|
|
my @cluster_mean = map {/$_num_regex/;$_} split / /, $num_strings[0]; |
|
2237
|
|
|
|
|
|
|
$data_dimension = @cluster_mean; |
|
2238
|
|
|
|
|
|
|
push @means, \@cluster_mean; |
|
2239
|
|
|
|
|
|
|
my @covariance_nums = map {/$_num_regex/;$_} split / /, $num_strings[1]; |
|
2240
|
|
|
|
|
|
|
croak "dimensionality error" if @covariance_nums != |
|
2241
|
|
|
|
|
|
|
($data_dimension ** 2); |
|
2242
|
|
|
|
|
|
|
my $cluster_covariance; |
|
2243
|
|
|
|
|
|
|
foreach my $j (0..$data_dimension-1) { |
|
2244
|
|
|
|
|
|
|
foreach my $k (0..$data_dimension-1) { |
|
2245
|
|
|
|
|
|
|
$cluster_covariance->[$j]->[$k] = |
|
2246
|
|
|
|
|
|
|
$covariance_nums[$j*$data_dimension + $k]; |
|
2247
|
|
|
|
|
|
|
} |
|
2248
|
|
|
|
|
|
|
} |
|
2249
|
|
|
|
|
|
|
push @covariances, $cluster_covariance; |
|
2250
|
|
|
|
|
|
|
} |
|
2251
|
|
|
|
|
|
|
random_seed_from_phrase( 'hellomellojello' ); |
|
2252
|
|
|
|
|
|
|
my @data_dump; |
|
2253
|
|
|
|
|
|
|
foreach my $i (0..$K-1) { |
|
2254
|
|
|
|
|
|
|
my @m = @{shift @means}; |
|
2255
|
|
|
|
|
|
|
my @covar = @{shift @covariances}; |
|
2256
|
|
|
|
|
|
|
my @new_data = Math::Random::random_multivariate_normal( |
|
2257
|
|
|
|
|
|
|
int($N * $priors[$i]), @m, @covar ); |
|
2258
|
|
|
|
|
|
|
my $p = 0; |
|
2259
|
|
|
|
|
|
|
my $label = $point_labels[$i]; |
|
2260
|
|
|
|
|
|
|
@new_data = map {unshift @$_, $label.$i; $i++; $_} @new_data; |
|
2261
|
|
|
|
|
|
|
push @data_dump, @new_data; |
|
2262
|
|
|
|
|
|
|
} |
|
2263
|
|
|
|
|
|
|
fisher_yates_shuffle( \@data_dump ); |
|
2264
|
|
|
|
|
|
|
open OUTPUT, ">$output_file"; |
|
2265
|
|
|
|
|
|
|
print OUTPUT "\#Data generated from the parameter file: $input_parameter_file\n" |
|
2266
|
|
|
|
|
|
|
if $input_parameter_file; |
|
2267
|
|
|
|
|
|
|
print OUTPUT "\#Total number of data points in this file: $N\n"; |
|
2268
|
|
|
|
|
|
|
print OUTPUT "\#Prior class probabilities for this data: @priors\n"; |
|
2269
|
|
|
|
|
|
|
foreach my $ele (@data_dump) { |
|
2270
|
|
|
|
|
|
|
foreach my $coord ( @$ele ) { |
|
2271
|
|
|
|
|
|
|
print OUTPUT "$coord "; |
|
2272
|
|
|
|
|
|
|
} |
|
2273
|
|
|
|
|
|
|
print OUTPUT "\n"; |
|
2274
|
|
|
|
|
|
|
} |
|
2275
|
|
|
|
|
|
|
print "Data written out to file $output_file\n"; |
|
2276
|
|
|
|
|
|
|
close OUTPUT; |
|
2277
|
|
|
|
|
|
|
} |
|
2278
|
|
|
|
|
|
|
|
|
2279
|
|
|
|
|
|
|
sub add_point_coords { |
|
2280
|
|
|
|
|
|
|
my $self = shift; |
|
2281
|
|
|
|
|
|
|
my @arr_of_ids = @{shift @_}; # array of data element names |
|
2282
|
|
|
|
|
|
|
my @result; |
|
2283
|
|
|
|
|
|
|
my $data_dimensionality = $self->{_data_dimensions}; |
|
2284
|
|
|
|
|
|
|
foreach my $i (0..$data_dimensionality-1) { |
|
2285
|
|
|
|
|
|
|
$result[$i] = 0.0; |
|
2286
|
|
|
|
|
|
|
} |
|
2287
|
|
|
|
|
|
|
foreach my $id (@arr_of_ids) { |
|
2288
|
|
|
|
|
|
|
my $ele = $self->{_data}->{$id}; |
|
2289
|
|
|
|
|
|
|
my $i = 0; |
|
2290
|
|
|
|
|
|
|
foreach my $component (@$ele) { |
|
2291
|
|
|
|
|
|
|
$result[$i] += $component; |
|
2292
|
|
|
|
|
|
|
$i++; |
|
2293
|
|
|
|
|
|
|
} |
|
2294
|
|
|
|
|
|
|
} |
|
2295
|
|
|
|
|
|
|
return \@result; |
|
2296
|
|
|
|
|
|
|
} |
|
2297
|
|
|
|
|
|
|
|
|
2298
|
|
|
|
|
|
|
###################### Support Routines ######################## |
|
2299
|
|
|
|
|
|
|
|
|
2300
|
|
|
|
|
|
|
# compute the outer product of two column vectors
|
2301
|
|
|
|
|
|
|
sub outer_product { |
|
2302
|
|
|
|
|
|
|
my $vec1 = shift; |
|
2303
|
|
|
|
|
|
|
my $vec2 = shift; |
|
2304
|
|
|
|
|
|
|
my ($nrows1, $ncols1) = ($vec1->rows(), $vec1->cols()); |
|
2305
|
|
|
|
|
|
|
my ($nrows2, $ncols2) = ($vec2->rows(), $vec2->cols()); |
|
2306
|
|
|
|
|
|
|
die "Outer product operation called with non-matching vectors" |
|
2307
|
|
|
|
|
|
|
unless $ncols1 == 1 && $ncols2 == 1 && $nrows1 == $nrows2; |
|
2308
|
|
|
|
|
|
|
my @vec_arr1 = $vec1->as_list(); |
|
2309
|
|
|
|
|
|
|
my @vec_arr2 = $vec2->as_list(); |
|
2310
|
|
|
|
|
|
|
my $outer_product = Math::GSL::Matrix->new($nrows1, $nrows2); |
|
2311
|
|
|
|
|
|
|
foreach my $index (0..$nrows1-1) { |
|
2312
|
|
|
|
|
|
|
my @new_row = map $vec_arr1[$index] * $_, @vec_arr2; |
|
2313
|
|
|
|
|
|
|
$outer_product->set_row($index, \@new_row); |
|
2314
|
|
|
|
|
|
|
} |
|
2315
|
|
|
|
|
|
|
return $outer_product; |
|
2316
|
|
|
|
|
|
|
} |
|
2317
|
|
|
|
|
|
|
|
|
2318
|
|
|
|
|
|
|
sub get_index_at_value { |
|
2319
|
|
|
|
|
|
|
my $value = shift; |
|
2320
|
|
|
|
|
|
|
my @array = @{shift @_}; |
|
2321
|
|
|
|
|
|
|
foreach my $i (0..@array-1) { |
|
2322
|
|
|
|
|
|
|
return $i if $value == $array[$i]; |
|
2323
|
|
|
|
|
|
|
} |
|
2324
|
|
|
|
|
|
|
} |
|
2325
|
|
|
|
|
|
|
|
|
2326
|
|
|
|
|
|
|
# This routine is really not necessary in light of the new `~~' operator in Perl. |
|
2327
|
|
|
|
|
|
|
# Will use the new operator in the next version. |
|
2328
|
|
|
|
|
|
|
sub vector_equal { |
|
2329
|
|
|
|
|
|
|
my $vec1 = shift; |
|
2330
|
|
|
|
|
|
|
my $vec2 = shift; |
|
2331
|
|
|
|
|
|
|
die "wrong data types for distance calculation" if @$vec1 != @$vec2; |
|
2332
|
|
|
|
|
|
|
foreach my $i (0..@$vec1-1){ |
|
2333
|
|
|
|
|
|
|
return 0 if $vec1->[$i] != $vec2->[$i]; |
|
2334
|
|
|
|
|
|
|
} |
|
2335
|
|
|
|
|
|
|
return 1; |
|
2336
|
|
|
|
|
|
|
} |
|
2337
|
|
|
|
|
|
|
|
|
2338
|
|
|
|
|
|
|
sub compare_array_floats { |
|
2339
|
|
|
|
|
|
|
my $vec1 = shift; |
|
2340
|
|
|
|
|
|
|
my $vec2 = shift; |
|
2341
|
|
|
|
|
|
|
foreach my $i (0..@$vec1-1){ |
|
2342
|
|
|
|
|
|
|
return 0 if abs($vec1->[$i] - $vec2->[$i]) > 0.00001; |
|
2343
|
|
|
|
|
|
|
} |
|
2344
|
|
|
|
|
|
|
return 1; |
|
2345
|
|
|
|
|
|
|
} |
|
2346
|
|
|
|
|
|
|
|
|
2347
|
|
|
|
|
|
|
# Returns the minimum value and its positional index in an array |
|
2348
|
|
|
|
|
|
|
sub minimum { |
|
2349
|
|
|
|
|
|
|
my $arr = shift; |
|
2350
|
|
|
|
|
|
|
my $min; |
|
2351
|
|
|
|
|
|
|
my $index; |
|
2352
|
|
|
|
|
|
|
foreach my $i (0..@{$arr}-1) { |
|
2353
|
|
|
|
|
|
|
if ( (!defined $min) || ($arr->[$i] < $min) ) { |
|
2354
|
|
|
|
|
|
|
$index = $i; |
|
2355
|
|
|
|
|
|
|
$min = $arr->[$i]; |
|
2356
|
|
|
|
|
|
|
} |
|
2357
|
|
|
|
|
|
|
} |
|
2358
|
|
|
|
|
|
|
return ($min, $index); |
|
2359
|
|
|
|
|
|
|
} |
|
2360
|
|
|
|
|
|
|
|
|
2361
|
|
|
|
|
|
|
sub minmax { |
|
2362
|
|
|
|
|
|
|
my $arr = shift; |
|
2363
|
|
|
|
|
|
|
my $min; |
|
2364
|
|
|
|
|
|
|
my $max; |
|
2365
|
|
|
|
|
|
|
foreach my $i (0..@{$arr}-1) { |
|
2366
|
|
|
|
|
|
|
if ( (!defined $min) && (!defined $max) ) { |
|
2367
|
|
|
|
|
|
|
$min = $arr->[$i]; |
|
2368
|
|
|
|
|
|
|
$max = $arr->[$i]; |
|
2369
|
|
|
|
|
|
|
} elsif ( $arr->[$i] < $min ) { |
|
2370
|
|
|
|
|
|
|
$min = $arr->[$i]; |
|
2371
|
|
|
|
|
|
|
} elsif ( $arr->[$i] > $max ) { |
|
2372
|
|
|
|
|
|
|
$max = $arr->[$i]; |
|
2373
|
|
|
|
|
|
|
} |
|
2374
|
|
|
|
|
|
|
} |
|
2375
|
|
|
|
|
|
|
return ($min, $max); |
|
2376
|
|
|
|
|
|
|
} |
|
2377
|
|
|
|
|
|
|
|
|
2378
|
|
|
|
|
|
|
# Meant only for constructing a deep copy of an array of |
|
2379
|
|
|
|
|
|
|
# arrays: |
|
2380
|
|
|
|
|
|
|
sub deep_copy_AoA { |
|
2381
|
|
|
|
|
|
|
my $ref_in = shift; |
|
2382
|
|
|
|
|
|
|
my $ref_out; |
|
2383
|
|
|
|
|
|
|
foreach my $i (0..@{$ref_in}-1) { |
|
2384
|
|
|
|
|
|
|
foreach my $j (0..@{$ref_in->[$i]}-1) { |
|
2385
|
|
|
|
|
|
|
$ref_out->[$i]->[$j] = $ref_in->[$i]->[$j]; |
|
2386
|
|
|
|
|
|
|
} |
|
2387
|
|
|
|
|
|
|
} |
|
2388
|
|
|
|
|
|
|
return $ref_out; |
|
2389
|
|
|
|
|
|
|
} |
|
2390
|
|
|
|
|
|
|
|
|
2391
|
|
|
|
|
|
|
# Meant only for constructing a deep copy of an array of arrays for the case when |
|
2392
|
|
|
|
|
|
|
# some elements of the top-level array may be undefined: |
|
2393
|
|
|
|
|
|
|
sub deep_copy_AoA_with_nulls { |
|
2394
|
|
|
|
|
|
|
my $ref_in = shift; |
|
2395
|
|
|
|
|
|
|
my $ref_out; |
|
2396
|
|
|
|
|
|
|
foreach my $i (0..@{$ref_in}-1) { |
|
2397
|
|
|
|
|
|
|
if ( !defined $ref_in->[$i] ) { |
|
2398
|
|
|
|
|
|
|
$ref_out->[$i] = undef; |
|
2399
|
|
|
|
|
|
|
next; |
|
2400
|
|
|
|
|
|
|
} |
|
2401
|
|
|
|
|
|
|
foreach my $j (0..@{$ref_in->[$i]}-1) { |
|
2402
|
|
|
|
|
|
|
$ref_out->[$i]->[$j] = $ref_in->[$i]->[$j]; |
|
2403
|
|
|
|
|
|
|
} |
|
2404
|
|
|
|
|
|
|
} |
|
2405
|
|
|
|
|
|
|
return $ref_out; |
|
2406
|
|
|
|
|
|
|
} |
|
2407
|
|
|
|
|
|
|
|
|
2408
|
|
|
|
|
|
|
# Meant only for constructing a deep copy of a hash in which each value is an |
|
2409
|
|
|
|
|
|
|
# anonymous array of numbers: |
|
2410
|
|
|
|
|
|
|
sub deep_copy_hash { |
|
2411
|
|
|
|
|
|
|
my $ref_in = shift; |
|
2412
|
|
|
|
|
|
|
my $ref_out; |
|
2413
|
|
|
|
|
|
|
while ( my ($key, $value) = each( %$ref_in ) ) { |
|
2414
|
|
|
|
|
|
|
$ref_out->{$key} = deep_copy_array( $value ); |
|
2415
|
|
|
|
|
|
|
} |
|
2416
|
|
|
|
|
|
|
return $ref_out; |
|
2417
|
|
|
|
|
|
|
} |
|
2418
|
|
|
|
|
|
|
|
|
2419
|
|
|
|
|
|
|
# Meant only for an array of numbers: |
|
2420
|
|
|
|
|
|
|
sub deep_copy_array { |
|
2421
|
|
|
|
|
|
|
my $ref_in = shift; |
|
2422
|
|
|
|
|
|
|
my $ref_out; |
|
2423
|
|
|
|
|
|
|
foreach my $i (0..@{$ref_in}-1) { |
|
2424
|
|
|
|
|
|
|
$ref_out->[$i] = $ref_in->[$i]; |
|
2425
|
|
|
|
|
|
|
} |
|
2426
|
|
|
|
|
|
|
return $ref_out; |
|
2427
|
|
|
|
|
|
|
} |
|
2428
|
|
|
|
|
|
|
|
|
2429
|
|
|
|
|
|
|
# from perl docs: |
|
2430
|
|
|
|
|
|
|
sub fisher_yates_shuffle { |
|
2431
|
|
|
|
|
|
|
my $arr = shift; |
|
2432
|
|
|
|
|
|
|
my $i = @$arr; |
|
2433
|
|
|
|
|
|
|
while (--$i) { |
|
2434
|
|
|
|
|
|
|
my $j = int rand( $i + 1 ); |
|
2435
|
|
|
|
|
|
|
@$arr[$i, $j] = @$arr[$j, $i]; |
|
2436
|
|
|
|
|
|
|
} |
|
2437
|
|
|
|
|
|
|
} |
|
2438
|
|
|
|
|
|
|
|
|
2439
|
|
|
|
|
|
|
sub mean_and_variance { |
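    # Computes a running (online) estimate of the mean and variance of the numbers
    # in the array that is passed in by reference, folding in one element at a time
    # instead of making separate passes for the mean and for the variance.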
|
2440
|
|
|
|
|
|
|
my @data = @{shift @_}; |
|
2441
|
|
|
|
|
|
|
my ($mean, $variance); |
|
2442
|
|
|
|
|
|
|
foreach my $i (1..@data) { |
|
2443
|
|
|
|
|
|
|
if ($i == 1) { |
|
2444
|
|
|
|
|
|
|
$mean = $data[0]; |
|
2445
|
|
|
|
|
|
|
$variance = 0; |
|
2446
|
|
|
|
|
|
|
} else { |
|
2447
|
|
|
|
|
|
|
# data[$i-1] because of zero-based indexing of vector |
|
2448
|
|
|
|
|
|
|
$mean = ( (($i-1)/$i) * $mean ) + $data[$i-1] / $i; |
|
2449
|
|
|
|
|
|
|
$variance = ( (($i-1)/$i) * $variance ) |
|
2450
|
|
|
|
|
|
|
+ ($data[$i-1]-$mean)**2 / ($i-1); |
|
2451
|
|
|
|
|
|
|
} |
|
2452
|
|
|
|
|
|
|
} |
|
2453
|
|
|
|
|
|
|
return ($mean, $variance); |
|
2454
|
|
|
|
|
|
|
} |
|
2455
|
|
|
|
|
|
|
|
|
2456
|
|
|
|
|
|
|
sub check_for_illegal_params { |
|
2457
|
|
|
|
|
|
|
my @params = @_; |
|
2458
|
|
|
|
|
|
|
my @legal_params = qw / datafile |
|
2459
|
|
|
|
|
|
|
mask |
|
2460
|
|
|
|
|
|
|
K |
|
2461
|
|
|
|
|
|
|
terminal_output |
|
2462
|
|
|
|
|
|
|
max_em_iterations |
|
2463
|
|
|
|
|
|
|
seeding |
|
2464
|
|
|
|
|
|
|
class_priors |
|
2465
|
|
|
|
|
|
|
seed_tags |
|
2466
|
|
|
|
|
|
|
debug |
|
2467
|
|
|
|
|
|
|
/; |
|
2468
|
|
|
|
|
|
|
my $found_match_flag; |
|
2469
|
|
|
|
|
|
|
foreach my $param (@params) { |
|
2470
|
|
|
|
|
|
|
foreach my $legal (@legal_params) { |
|
2471
|
|
|
|
|
|
|
$found_match_flag = 0; |
|
2472
|
|
|
|
|
|
|
if ($param eq $legal) { |
|
2473
|
|
|
|
|
|
|
$found_match_flag = 1; |
|
2474
|
|
|
|
|
|
|
last; |
|
2475
|
|
|
|
|
|
|
} |
|
2476
|
|
|
|
|
|
|
} |
|
2477
|
|
|
|
|
|
|
last if $found_match_flag == 0; |
|
2478
|
|
|
|
|
|
|
} |
|
2479
|
|
|
|
|
|
|
return $found_match_flag; |
|
2480
|
|
|
|
|
|
|
} |
|
2481
|
|
|
|
|
|
|
|
|
2482
|
|
|
|
|
|
|
sub get_value_index_hash { |
|
2483
|
|
|
|
|
|
|
my $arr = shift; |
|
2484
|
|
|
|
|
|
|
my %hash; |
|
2485
|
|
|
|
|
|
|
foreach my $index (0..@$arr-1) { |
|
2486
|
|
|
|
|
|
|
$hash{$arr->[$index]} = $index if $arr->[$index] > 0; |
|
2487
|
|
|
|
|
|
|
} |
|
2488
|
|
|
|
|
|
|
return \%hash; |
|
2489
|
|
|
|
|
|
|
} |
|
2490
|
|
|
|
|
|
|
|
|
2491
|
|
|
|
|
|
|
sub non_maximum_supression { |
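    # 1-D non-maximum suppression: consecutive equal values in the input array are
    # grouped into runs, and only the center element of each run that is a local
    # maximum relative to its neighboring runs is retained; every other position in
    # the returned array is set to zero.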
|
2492
|
|
|
|
|
|
|
my $arr = shift; |
|
2493
|
|
|
|
|
|
|
my @output = (0) x @$arr; |
|
2494
|
|
|
|
|
|
|
my @final_output = (0) x @$arr; |
|
2495
|
|
|
|
|
|
|
my %hash; |
|
2496
|
|
|
|
|
|
|
my @array_of_runs = ([$arr->[0]]); |
|
2497
|
|
|
|
|
|
|
foreach my $index (1..@$arr-1) { |
|
2498
|
|
|
|
|
|
|
if ($arr->[$index] == $arr->[$index-1]) { |
|
2499
|
|
|
|
|
|
|
push @{$array_of_runs[-1]}, $arr->[$index]; |
|
2500
|
|
|
|
|
|
|
} else { |
|
2501
|
|
|
|
|
|
|
push @array_of_runs, [$arr->[$index]]; |
|
2502
|
|
|
|
|
|
|
} |
|
2503
|
|
|
|
|
|
|
} |
|
2504
|
|
|
|
|
|
|
my $runstart_index = 0; |
|
2505
|
|
|
|
|
|
|
foreach my $run_index (1..@array_of_runs-2) { |
|
2506
|
|
|
|
|
|
|
$runstart_index += @{$array_of_runs[$run_index-1]}; |
|
2507
|
|
|
|
|
|
|
if ($array_of_runs[$run_index]->[0] > |
|
2508
|
|
|
|
|
|
|
$array_of_runs[$run_index-1]->[0] && |
|
2509
|
|
|
|
|
|
|
$array_of_runs[$run_index]->[0] > |
|
2510
|
|
|
|
|
|
|
$array_of_runs[$run_index+1]->[0]) { |
|
2511
|
|
|
|
|
|
|
my $run_center = @{$array_of_runs[$run_index]} / 2; |
|
2512
|
|
|
|
|
|
|
my $assignment_index = $runstart_index + $run_center; |
|
2513
|
|
|
|
|
|
|
$output[$assignment_index] = $arr->[$assignment_index]; |
|
2514
|
|
|
|
|
|
|
} |
|
2515
|
|
|
|
|
|
|
} |
|
2516
|
|
|
|
|
|
|
if ($array_of_runs[-1]->[0] > $array_of_runs[-2]->[0]) { |
|
2517
|
|
|
|
|
|
|
$runstart_index += @{$array_of_runs[-2]}; |
|
2518
|
|
|
|
|
|
|
my $run_center = @{$array_of_runs[-1]} / 2; |
|
2519
|
|
|
|
|
|
|
my $assignment_index = $runstart_index + $run_center; |
|
2520
|
|
|
|
|
|
|
$output[$assignment_index] = $arr->[$assignment_index]; |
|
2521
|
|
|
|
|
|
|
} |
|
2522
|
|
|
|
|
|
|
if ($array_of_runs[0]->[0] > $array_of_runs[1]->[0]) { |
|
2523
|
|
|
|
|
|
|
my $run_center = @{$array_of_runs[0]} / 2; |
|
2524
|
|
|
|
|
|
|
$output[$run_center] = $arr->[$run_center]; |
|
2525
|
|
|
|
|
|
|
} |
|
2526
|
|
|
|
|
|
|
return \@output; |
|
2527
|
|
|
|
|
|
|
} |
|
2528
|
|
|
|
|
|
|
|
|
2529
|
|
|
|
|
|
|
sub display_matrix { |
|
2530
|
|
|
|
|
|
|
my $message = shift; |
|
2531
|
|
|
|
|
|
|
my $matrix = shift; |
|
2532
|
|
|
|
|
|
|
if (!defined blessed($matrix)) { |
|
2533
|
|
|
|
|
|
|
print "display_matrix called on a scalar value: $matrix\n"; |
|
2534
|
|
|
|
|
|
|
return; |
|
2535
|
|
|
|
|
|
|
} |
|
2536
|
|
|
|
|
|
|
my $nrows = $matrix->rows(); |
|
2537
|
|
|
|
|
|
|
my $ncols = $matrix->cols(); |
|
2538
|
|
|
|
|
|
|
print "$message ($nrows rows and $ncols columns)\n"; |
|
2539
|
|
|
|
|
|
|
foreach my $i (0..$nrows-1) { |
|
2540
|
|
|
|
|
|
|
my $row = $matrix->row($i); |
|
2541
|
|
|
|
|
|
|
my @row_as_list = $row->as_list; |
|
2542
|
|
|
|
|
|
|
print "@row_as_list\n"; |
|
2543
|
|
|
|
|
|
|
} |
|
2544
|
|
|
|
|
|
|
print "\n"; |
|
2545
|
|
|
|
|
|
|
} |
|
2546
|
|
|
|
|
|
|
|
|
2547
|
|
|
|
|
|
|
sub transpose { |
|
2548
|
|
|
|
|
|
|
my $matrix = shift; |
|
2549
|
|
|
|
|
|
|
my $num_rows = $matrix->rows(); |
|
2550
|
|
|
|
|
|
|
my $num_cols = $matrix->cols(); |
|
2551
|
|
|
|
|
|
|
my $transpose = Math::GSL::Matrix->new($num_cols, $num_rows); |
|
2552
|
|
|
|
|
|
|
foreach my $i (0..$num_rows-1) { |
|
2553
|
|
|
|
|
|
|
my @row = $matrix->row($i)->as_list; |
|
2554
|
|
|
|
|
|
|
$transpose->set_col($i, \@row ); |
|
2555
|
|
|
|
|
|
|
} |
|
2556
|
|
|
|
|
|
|
return $transpose; |
|
2557
|
|
|
|
|
|
|
} |
|
2558
|
|
|
|
|
|
|
|
|
2559
|
|
|
|
|
|
|
sub vector_multiply { |
|
2560
|
|
|
|
|
|
|
my $vec1 = shift; |
|
2561
|
|
|
|
|
|
|
my $vec2 = shift; |
|
2562
|
|
|
|
|
|
|
die "vec_multiply called with two vectors of different sizes" |
|
2563
|
|
|
|
|
|
|
unless @$vec1 == @$vec2; |
|
2564
|
|
|
|
|
|
|
my $result = 0; |
|
2565
|
|
|
|
|
|
|
foreach my $i (0..@$vec1-1) { |
|
2566
|
|
|
|
|
|
|
$result += $vec1->[$i] * $vec2->[$i]; |
|
2567
|
|
|
|
|
|
|
} |
|
2568
|
|
|
|
|
|
|
return $result; |
|
2569
|
|
|
|
|
|
|
} |
|
2570
|
|
|
|
|
|
|
|
|
2571
|
|
|
|
|
|
|
sub vector_2_vector_multiply { |
|
2572
|
|
|
|
|
|
|
my $vec1 = shift; |
|
2573
|
|
|
|
|
|
|
my $vec2 = shift; |
|
2574
|
|
|
|
|
|
|
die "vec_multiply called with two vectors of different sizes" |
|
2575
|
|
|
|
|
|
|
unless @$vec1 == @$vec2; |
|
2576
|
|
|
|
|
|
|
my @result_vec; |
|
2577
|
|
|
|
|
|
|
foreach my $i (0..@$vec1-1) { |
|
2578
|
|
|
|
|
|
|
$result_vec[$i] = $vec1->[$i] * $vec2->[$i]; |
|
2579
|
|
|
|
|
|
|
} |
|
2580
|
|
|
|
|
|
|
return \@result_vec; |
|
2581
|
|
|
|
|
|
|
} |
|
2582
|
|
|
|
|
|
|
|
|
2583
|
|
|
|
|
|
|
sub matrix_multiply { |
|
2584
|
|
|
|
|
|
|
my $matrix1 = shift; |
|
2585
|
|
|
|
|
|
|
my $matrix2 = shift; |
|
2586
|
|
|
|
|
|
|
my ($nrows1, $ncols1) = ($matrix1->rows(), $matrix1->cols()); |
|
2587
|
|
|
|
|
|
|
my ($nrows2, $ncols2) = ($matrix2->rows(), $matrix2->cols()); |
|
2588
|
|
|
|
|
|
|
die "matrix multiplication called with non-matching matrix arguments" |
|
2589
|
|
|
|
|
|
|
# unless $ncols1 == $nrows2; |
|
2590
|
|
|
|
|
|
|
unless $nrows1 == $ncols2 && $ncols1 == $nrows2; |
|
2591
|
|
|
|
|
|
|
if ($nrows1 == 1) { |
|
2592
|
|
|
|
|
|
|
my @row = $matrix1->row(0)->as_list; |
|
2593
|
|
|
|
|
|
|
my @col = $matrix2->col(0)->as_list; |
|
2594
|
|
|
|
|
|
|
my $result; |
|
2595
|
|
|
|
|
|
|
foreach my $j (0..$ncols1-1) { |
|
2596
|
|
|
|
|
|
|
$result += $row[$j] * $col[$j]; |
|
2597
|
|
|
|
|
|
|
} |
|
2598
|
|
|
|
|
|
|
return $result; |
|
2599
|
|
|
|
|
|
|
} |
|
2600
|
|
|
|
|
|
|
my $product = Math::GSL::Matrix->new($nrows1, $nrows1); |
|
2601
|
|
|
|
|
|
|
foreach my $i (0..$nrows1-1) { |
|
2602
|
|
|
|
|
|
|
my $row = $matrix1->row($i); |
|
2603
|
|
|
|
|
|
|
my @product_row; |
|
2604
|
|
|
|
|
|
|
foreach my $j (0..$ncols2-1) { |
|
2605
|
|
|
|
|
|
|
my $col = $matrix2->col($j); |
|
2606
|
|
|
|
|
|
|
my $row_times_col = matrix_multiply($row, $col); |
|
2607
|
|
|
|
|
|
|
push @product_row, $row_times_col; |
|
2608
|
|
|
|
|
|
|
} |
|
2609
|
|
|
|
|
|
|
$product->set_row($i, \@product_row); |
|
2610
|
|
|
|
|
|
|
} |
|
2611
|
|
|
|
|
|
|
return $product; |
|
2612
|
|
|
|
|
|
|
} |
|
2613
|
|
|
|
|
|
|
|
|
2614
|
|
|
|
|
|
|
sub vector_matrix_multiply { |
|
2615
|
|
|
|
|
|
|
my $matrix1 = shift; |
|
2616
|
|
|
|
|
|
|
my $matrix2 = shift; |
|
2617
|
|
|
|
|
|
|
my ($nrows1, $ncols1) = ($matrix1->rows, $matrix1->cols); |
|
2618
|
|
|
|
|
|
|
my ($nrows2, $ncols2) = ($matrix2->rows, $matrix2->cols); |
|
2619
|
|
|
|
|
|
|
die "matrix multiplication called with non-matching matrix arguments" |
|
2620
|
|
|
|
|
|
|
unless $nrows1 == 1 && $ncols1 == $nrows2; |
|
2621
|
|
|
|
|
|
|
if ($ncols2 == 1) { |
|
2622
|
|
|
|
|
|
|
my @row = $matrix1->row(0)->as_list; |
|
2623
|
|
|
|
|
|
|
my @col = $matrix2->col(0)->as_list; |
|
2624
|
|
|
|
|
|
|
my $result; |
|
2625
|
|
|
|
|
|
|
foreach my $j (0..$ncols1-1) { |
|
2626
|
|
|
|
|
|
|
$result += $row[$j] * $col[$j]; |
|
2627
|
|
|
|
|
|
|
} |
|
2628
|
|
|
|
|
|
|
return $result; |
|
2629
|
|
|
|
|
|
|
} |
|
2630
|
|
|
|
|
|
|
my $product = Math::GSL::Matrix->new(1, $ncols2); |
|
2631
|
|
|
|
|
|
|
my $row = $matrix1->row(0); |
|
2632
|
|
|
|
|
|
|
my @product_row; |
|
2633
|
|
|
|
|
|
|
foreach my $j (0..$ncols2-1) { |
|
2634
|
|
|
|
|
|
|
my $col = $matrix2->col($j); |
|
2635
|
|
|
|
|
|
|
my $row_times_col = vector_matrix_multiply($row, $col); |
|
2636
|
|
|
|
|
|
|
push @product_row, $row_times_col; |
|
2637
|
|
|
|
|
|
|
} |
|
2638
|
|
|
|
|
|
|
$product->set_row(0, \@product_row); |
|
2639
|
|
|
|
|
|
|
return $product; |
|
2640
|
|
|
|
|
|
|
} |
|
2641
|
|
|
|
|
|
|
|
|
2642
|
|
|
|
|
|
|
sub matrix_vector_multiply { |
|
2643
|
|
|
|
|
|
|
my $matrix1 = shift; |
|
2644
|
|
|
|
|
|
|
my $matrix2 = shift; |
|
2645
|
|
|
|
|
|
|
my ($nrows1, $ncols1) = ($matrix1->rows(), $matrix1->cols()); |
|
2646
|
|
|
|
|
|
|
my ($nrows2, $ncols2) = ($matrix2->rows(), $matrix2->cols()); |
|
2647
|
|
|
|
|
|
|
die "matrix multiplication called with non-matching matrix arguments" |
|
2648
|
|
|
|
|
|
|
unless $ncols1 == $nrows2 && $ncols2 == 1; |
|
2649
|
|
|
|
|
|
|
if ($nrows1 == 1) { |
|
2650
|
|
|
|
|
|
|
my @row = $matrix1->row(0)->as_list; |
|
2651
|
|
|
|
|
|
|
my @col = $matrix2->col(0)->as_list; |
|
2652
|
|
|
|
|
|
|
my $result; |
|
2653
|
|
|
|
|
|
|
foreach my $j (0..$ncols1-1) { |
|
2654
|
|
|
|
|
|
|
$result += $row[$j] * $col[$j]; |
|
2655
|
|
|
|
|
|
|
} |
|
2656
|
|
|
|
|
|
|
return $result; |
|
2657
|
|
|
|
|
|
|
} |
|
2658
|
|
|
|
|
|
|
my $product = Math::GSL::Matrix->new($nrows1, 1); |
|
2659
|
|
|
|
|
|
|
my $col = $matrix2->col(0); |
|
2660
|
|
|
|
|
|
|
my @product_col; |
|
2661
|
|
|
|
|
|
|
foreach my $i (0..$nrows1-1) { |
|
2662
|
|
|
|
|
|
|
my $row = $matrix1->row($i); |
|
2663
|
|
|
|
|
|
|
my $row_times_col = matrix_vector_multiply($row, $col); |
|
2664
|
|
|
|
|
|
|
push @product_col, $row_times_col; |
|
2665
|
|
|
|
|
|
|
} |
|
2666
|
|
|
|
|
|
|
$product->set_col(0, \@product_col); |
|
2667
|
|
|
|
|
|
|
return $product; |
|
2668
|
|
|
|
|
|
|
} |
|
2669
|
|
|
|
|
|
|
|
|
2670
|
|
|
|
|
|
|
sub matrix_trace { |
|
2671
|
|
|
|
|
|
|
my $matrix = shift; |
|
2672
|
|
|
|
|
|
|
my ($nrows, $ncols) = ($matrix->rows(), $matrix->cols()); |
|
2673
|
|
|
|
|
|
|
die "trace can only be calculated for a square matrix" |
|
2674
|
|
|
|
|
|
|
unless $ncols == $nrows; |
|
2675
|
|
|
|
|
|
|
my @elements = $matrix->as_list; |
|
2676
|
|
|
|
|
|
|
my $trace = 0; |
|
2677
|
|
|
|
|
|
|
foreach my $i (0..$nrows-1) { |
|
2678
|
|
|
|
|
|
|
$trace += $elements[$i + $i * $ncols]; |
|
2679
|
|
|
|
|
|
|
} |
|
2680
|
|
|
|
|
|
|
return $trace; |
|
2681
|
|
|
|
|
|
|
} |
|
2682
|
|
|
|
|
|
|
|
|
2683
|
|
|
|
|
|
|
1; |
|
2684
|
|
|
|
|
|
|
|
|
2685
|
|
|
|
|
|
|
=pod |
|
2686
|
|
|
|
|
|
|
|
|
2687
|
|
|
|
|
|
|
=head1 NAME |
|
2688
|
|
|
|
|
|
|
|
|
2689
|
|
|
|
|
|
|
Algorithm::ExpectationMaximization -- A Perl module for clustering numerical |
|
2690
|
|
|
|
|
|
|
multi-dimensional data with the Expectation-Maximization algorithm. |
|
2691
|
|
|
|
|
|
|
|
|
2692
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
2693
|
|
|
|
|
|
|
|
|
2694
|
|
|
|
|
|
|
use Algorithm::ExpectationMaximization; |
|
2695
|
|
|
|
|
|
|
|
|
2696
|
|
|
|
|
|
|
# First name the data file: |
|
2697
|
|
|
|
|
|
|
|
|
2698
|
|
|
|
|
|
|
my $datafile = "mydatafile.csv"; |
|
2699
|
|
|
|
|
|
|
|
|
2700
|
|
|
|
|
|
|
# Next, set the mask to indicate which columns of the datafile to use for |
|
2701
|
|
|
|
|
|
|
# clustering and which column contains a symbolic ID for each data record. For |
|
2702
|
|
|
|
|
|
|
# example, if the symbolic name is in the first column, you want the second column |
|
2703
|
|
|
|
|
|
|
# to be ignored, and you want the next three columns to be used for 3D clustering: |
|
2704
|
|
|
|
|
|
|
|
|
2705
|
|
|
|
|
|
|
my $mask = "N0111"; |
|
2706
|
|
|
|
|
|
|
|
|
2707
|
|
|
|
|
|
|
# Now construct an instance of the clusterer. The parameter `K' controls the |
|
2708
|
|
|
|
|
|
|
# number of clusters. Here is an example call to the constructor for instance |
|
2709
|
|
|
|
|
|
|
# creation: |
|
2710
|
|
|
|
|
|
|
|
|
2711
|
|
|
|
|
|
|
my $clusterer = Algorithm::ExpectationMaximization->new( |
|
2712
|
|
|
|
|
|
|
datafile => $datafile, |
|
2713
|
|
|
|
|
|
|
mask => $mask, |
|
2714
|
|
|
|
|
|
|
K => 3, |
|
2715
|
|
|
|
|
|
|
max_em_iterations => 300, |
|
2716
|
|
|
|
|
|
|
seeding => 'random', |
|
2717
|
|
|
|
|
|
|
terminal_output => 1, |
|
2718
|
|
|
|
|
|
|
); |
|
2719
|
|
|
|
|
|
|
|
|
2720
|
|
|
|
|
|
|
# Note the choice for `seeding'. The choice `random' means that the clusterer will |
|
2721
|
|
|
|
|
|
|
# randomly select `K' data points to serve as initial cluster centers. Other |
|
2722
|
|
|
|
|
|
|
# possible choices for the constructor parameter `seeding' are `kmeans' and |
|
2723
|
|
|
|
|
|
|
# `manual'. With the `kmeans' option for `seeding', the output of a K-means |
|
2724
|
|
|
|
|
|
|
# clusterer is used for the cluster seeds and the initial cluster covariances. If |
|
2725
|
|
|
|
|
|
|
# you use the `manual' option for seeding, you must also specify the data elements |
|
2726
|
|
|
|
|
|
|
# to use for seeding the clusters. |
|
2727
|
|
|
|
|
|
|
|
|
2728
|
|
|
|
|
|
|
# Here is an example of a call to the constructor when we choose the `manual' |
|
2729
|
|
|
|
|
|
|
# option for seeding the clusters and for specifying the data elements for |
|
2730
|
|
|
|
|
|
|
# seeding. The data elements are specified by their tag names. In this case, |
|
2731
|
|
|
|
|
|
|
# these names are `a26', `b53', and `c49': |
|
2732
|
|
|
|
|
|
|
|
|
2733
|
|
|
|
|
|
|
my $clusterer = Algorithm::ExpectationMaximization->new( |
|
2734
|
|
|
|
|
|
|
datafile => $datafile, |
|
2735
|
|
|
|
|
|
|
mask => $mask, |
|
2736
|
|
|
|
|
|
|
class_priors => [0.6, 0.2, 0.2], |
|
2737
|
|
|
|
|
|
|
K => 3, |
|
2738
|
|
|
|
|
|
|
max_em_iterations => 300, |
|
2739
|
|
|
|
|
|
|
seeding => 'manual', |
|
2740
|
|
|
|
|
|
|
seed_tags => ['a26', 'b53', 'c49'], |
|
2741
|
|
|
|
|
|
|
terminal_output => 1, |
|
2742
|
|
|
|
|
|
|
); |
|
2743
|
|
|
|
|
|
|
|
|
2744
|
|
|
|
|
|
|
# This example call to the constructor also illustrates how you can inject class |
|
2745
|
|
|
|
|
|
|
# priors into the clustering process. The class priors are the prior probabilities |
|
2746
|
|
|
|
|
|
|
# of the class distributions in your dataset. As explained later, injecting class |
|
2747
|
|
|
|
|
|
|
# priors in the manner shown above makes statistical sense only for the case of |
|
2748
|
|
|
|
|
|
|
# manual seeding. When you do inject class priors, the order in which the priors |
|
2749
|
|
|
|
|
|
|
# are expressed must correspond to the manually specified seeds for the clusters. |
|
2750
|
|
|
|
|
|
|
|
|
2751
|
|
|
|
|
|
|
# After the invocation of the constructor, the following calls are mandatory |
|
2752
|
|
|
|
|
|
|
# for reasons that should be obvious from the names of the methods: |
|
2753
|
|
|
|
|
|
|
|
|
2754
|
|
|
|
|
|
|
$clusterer->read_data_from_file(); |
|
2755
|
|
|
|
|
|
|
srand(time); |
|
2756
|
|
|
|
|
|
|
$clusterer->seed_the_clusters(); |
|
2757
|
|
|
|
|
|
|
$clusterer->EM(); |
|
2758
|
|
|
|
|
|
|
$clusterer->run_bayes_classifier(); |
|
2759
|
|
|
|
|
|
|
my $clusters = $clusterer->return_disjoint_clusters(); |
|
2760
|
|
|
|
|
|
|
|
|
2761
|
|
|
|
|
|
|
# where the call to `EM()' is the invocation of the expectation-maximization |
|
2762
|
|
|
|
|
|
|
# algorithm. The call to `srand(time)' is to seed the pseudo random number |
|
2763
|
|
|
|
|
|
|
# generator afresh for each run of the cluster seeding procedure. If you want to |
|
2764
|
|
|
|
|
|
|
# see repeatable results from one run to another of the algorithm with random |
|
2765
|
|
|
|
|
|
|
# seeding, you would obviously not invoke `srand(time)'. |
|
2766
|
|
|
|
|
|
|
|
|
2767
|
|
|
|
|
|
|
# The call `run_bayes_classifier()' shown above carries out a disjoint clustering |
|
2768
|
|
|
|
|
|
|
# of all the data points using the naive Bayes' classifier. And the call |
|
2769
|
|
|
|
|
|
|
# `return_disjoint_clusters()' returns the clusters thus formed to you. Once you |
|
2770
|
|
|
|
|
|
|
# have obtained access to the clusters in this manner, you can display them in |
|
2771
|
|
|
|
|
|
|
# your terminal window by |
|
2772
|
|
|
|
|
|
|
|
|
2773
|
|
|
|
|
|
|
foreach my $index (0..@$clusters-1) { |
|
2774
|
|
|
|
|
|
|
print "Cluster $index (Naive Bayes): @{$clusters->[$index]}\n\n" |
|
2775
|
|
|
|
|
|
|
} |
|
2776
|
|
|
|
|
|
|
|
|
2777
|
|
|
|
|
|
|
# If you would like to also see the clusters purely on the basis of the posterior |
|
2778
|
|
|
|
|
|
|
# class probabilities exceeding a threshold, call |
|
2779
|
|
|
|
|
|
|
|
|
2780
|
|
|
|
|
|
|
my $theta1 = 0.2; |
|
2781
|
|
|
|
|
|
|
my $posterior_prob_clusters = |
|
2782
|
|
|
|
|
|
|
$clusterer->return_clusters_with_posterior_probs_above_threshold($theta1); |
|
2783
|
|
|
|
|
|
|
|
|
2784
|
|
|
|
|
|
|
# where you can obviously set the threshold $theta1 to any value you wish. Note |
|
2785
|
|
|
|
|
|
|
# that now you may end up with clusters that overlap. You can display them in |
|
2786
|
|
|
|
|
|
|
# your terminal window in the same manner as shown above for the naive Bayes' |
|
2787
|
|
|
|
|
|
|
# clusters. |
|
2788
|
|
|
|
|
|
|
|
|
2789
|
|
|
|
|
|
|
# You can write the naive Bayes' clusters out to files, one cluster per file, by |
|
2790
|
|
|
|
|
|
|
# calling |
|
2791
|
|
|
|
|
|
|
|
|
2792
|
|
|
|
|
|
|
$clusterer->write_naive_bayes_clusters_to_files(); |
|
2793
|
|
|
|
|
|
|
|
|
2794
|
|
|
|
|
|
|
# The clusters are placed in files with names like |
|
2795
|
|
|
|
|
|
|
|
|
2796
|
|
|
|
|
|
|
naive_bayes_cluster1.txt |
|
2797
|
|
|
|
|
|
|
naive_bayes_cluster2.txt |
|
2798
|
|
|
|
|
|
|
... |
|
2799
|
|
|
|
|
|
|
|
|
2800
|
|
|
|
|
|
|
# In the same manner, you can write out the posterior probability based possibly |
|
2801
|
|
|
|
|
|
|
# overlapping clusters to files by calling: |
|
2802
|
|
|
|
|
|
|
|
|
2803
|
|
|
|
|
|
|
$clusterer->write_posterior_prob_clusters_above_threshold_to_files($theta1); |
|
2804
|
|
|
|
|
|
|
|
|
2805
|
|
|
|
|
|
|
# where the threshold $theta1 sets the probability threshold for deciding which |
|
2806
|
|
|
|
|
|
|
# data elements to place in a cluster. These clusters are placed in files with |
|
2807
|
|
|
|
|
|
|
# names like |
|
2808
|
|
|
|
|
|
|
|
|
2809
|
|
|
|
|
|
|
posterior_prob_cluster1.txt |
|
2810
|
|
|
|
|
|
|
posterior_prob_cluster2.txt |
|
2811
|
|
|
|
|
|
|
... |
|
2812
|
|
|
|
|
|
|
|
|
2813
|
|
|
|
|
|
|
# CLUSTER VISUALIZATION: |
|
2814
|
|
|
|
|
|
|
|
|
2815
|
|
|
|
|
|
|
# You must first set the mask for cluster visualization. This mask tells the |
|
2816
|
|
|
|
|
|
|
# module which 2D or 3D subspace of the original data space you wish to visualize |
|
2817
|
|
|
|
|
|
|
# the clusters in: |
|
2818
|
|
|
|
|
|
|
|
|
2819
|
|
|
|
|
|
|
my $visualization_mask = "111"; |
|
2820
|
|
|
|
|
|
|
$clusterer->visualize_clusters($visualization_mask); |
|
2821
|
|
|
|
|
|
|
$clusterer->visualize_distributions($visualization_mask); |
|
2822
|
|
|
|
|
|
|
$clusterer->plot_hardcopy_clusters($visualization_mask); |
|
2823
|
|
|
|
|
|
|
$clusterer->plot_hardcopy_distributions($visualization_mask); |
|
2824
|
|
|
|
|
|
|
|
|
2825
|
|
|
|
|
|
|
# where the last two invocations are for writing out the PNG plots of the |
|
2826
|
|
|
|
|
|
|
# visualization displays to disk files. The PNG image of the posterior |
|
2827
|
|
|
|
|
|
|
# probability distributions is written out to a file named posterior_prob_plot.png |
|
2828
|
|
|
|
|
|
|
# and the PNG image of the disjoint clusters to a file called cluster_plot.png. |
|
2829
|
|
|
|
|
|
|
|
|
2830
|
|
|
|
|
|
|
# SYNTHETIC DATA GENERATION: |
|
2831
|
|
|
|
|
|
|
|
|
2832
|
|
|
|
|
|
|
# The module has been provided with a class method for generating multivariate |
|
2833
|
|
|
|
|
|
|
# data for experimenting with the EM algorithm. The data generation is controlled |
|
2834
|
|
|
|
|
|
|
# by the contents of a parameter file that is supplied as an argument to the data |
|
2835
|
|
|
|
|
|
|
# generator method. The priors, the means, and the covariance matrices in the |
|
2836
|
|
|
|
|
|
|
# parameter file must be according to the syntax shown in the `param1.txt' file in |
|
2837
|
|
|
|
|
|
|
# the `examples' directory. It is best to edit a copy of this file for your |
|
2838
|
|
|
|
|
|
|
# synthetic data generation needs. |
|
2839
|
|
|
|
|
|
|
|
|
2840
|
|
|
|
|
|
|
my $parameter_file = "param1.txt"; |
|
2841
|
|
|
|
|
|
|
my $out_datafile = "mydatafile1.dat"; |
|
2842
|
|
|
|
|
|
|
Algorithm::ExpectationMaximization->cluster_data_generator( |
|
2843
|
|
|
|
|
|
|
input_parameter_file => $parameter_file, |
|
2844
|
|
|
|
|
|
|
output_datafile => $out_datafile, |
|
2845
|
|
|
|
|
|
|
total_number_of_data_points => $N ); |
|
2846
|
|
|
|
|
|
|
|
|
2847
|
|
|
|
|
|
|
# where the value of $N is the total number of data points you would like to see |
|
2848
|
|
|
|
|
|
|
# generated for all of the Gaussians. How this total number is divided up amongst |
|
2849
|
|
|
|
|
|
|
# the Gaussians is decided by the prior probabilities for the Gaussian components |
|
2850
|
|
|
|
|
|
|
# as declared in input parameter file. The synthetic data may be visualized in a |
|
2851
|
|
|
|
|
|
|
# terminal window and the visualization written out as a PNG image to a diskfile |
|
2852
|
|
|
|
|
|
|
# by |
|
2853
|
|
|
|
|
|
|
|
|
2854
|
|
|
|
|
|
|
my $data_visualization_mask = "11"; |
|
2855
|
|
|
|
|
|
|
$clusterer->visualize_data($data_visualization_mask); |
|
2856
|
|
|
|
|
|
|
$clusterer->plot_hardcopy_data($data_visualization_mask); |
|
2857
|
|
|
|
|
|
|
|
|
2858
|
|
|
|
|
|
|
|
|
2859
|
|
|
|
|
|
|
=head1 CHANGES |
|
2860
|
|
|
|
|
|
|
|
|
2861
|
|
|
|
|
|
|
Version 1.22 should work with data in CSV files. |
|
2862
|
|
|
|
|
|
|
|
|
2863
|
|
|
|
|
|
|
Version 1.21 incorporates minor code clean up. Overall, the module implementation |
|
2864
|
|
|
|
|
|
|
remains unchanged. |
|
2865
|
|
|
|
|
|
|
|
|
2866
|
|
|
|
|
|
|
Version 1.2 allows the module to also be used for 1-D data. The visualization code |
|
2867
|
|
|
|
|
|
|
for 1-D shows the clusters through their histograms. |
|
2868
|
|
|
|
|
|
|
|
|
2869
|
|
|
|
|
|
|
Version 1.1 incorporates much cleanup of the documentation associated with the |
|
2870
|
|
|
|
|
|
|
module. Both the top-level module documentation, especially the Description part, |
|
2871
|
|
|
|
|
|
|
and the comments embedded in the code were revised for better utilization of the |
|
2872
|
|
|
|
|
|
|
module. The basic implementation code remains unchanged. |
|
2873
|
|
|
|
|
|
|
|
|
2874
|
|
|
|
|
|
|
|
|
2875
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
2876
|
|
|
|
|
|
|
|
|
2877
|
|
|
|
|
|
|
B<Algorithm::ExpectationMaximization> is a pure-Perl module for the
|
2878
|
|
|
|
|
|
|
Expectation-Maximization (EM) method of clustering numerical data that lends itself |
|
2879
|
|
|
|
|
|
|
to modeling as a Gaussian mixture. Since the module is entirely in Perl (in the |
|
2880
|
|
|
|
|
|
|
sense that it is not a Perl wrapper around a C library that actually does the |
|
2881
|
|
|
|
|
|
|
clustering), the code in the module can easily be modified to experiment with several |
|
2882
|
|
|
|
|
|
|
aspects of EM. |
|
2883
|
|
|
|
|
|
|
|
|
2884
|
|
|
|
|
|
|
Gaussian Mixture Modeling (GMM) is based on the assumption that the data consists of |
|
2885
|
|
|
|
|
|
|
C<K> Gaussian components, each characterized by its own mean vector and its own
|
2886
|
|
|
|
|
|
|
covariance matrix. Obviously, given observed data for clustering, we do not know |
|
2887
|
|
|
|
|
|
|
which of the C<K> Gaussian components was responsible for any of the data elements.
|
2888
|
|
|
|
|
|
|
GMM also associates a prior probability with each Gaussian component. In general, |
|
2889
|
|
|
|
|
|
|
these priors will also be unknown. So the problem of clustering consists of |
|
2890
|
|
|
|
|
|
|
estimating the posterior class probability at each data element and also estimating |
|
2891
|
|
|
|
|
|
|
the class priors. Once these posterior class probabilities and the priors are |
|
2892
|
|
|
|
|
|
|
estimated with EM, we can use the naive Bayes' classifier to partition the data into |
|
2893
|
|
|
|
|
|
|
disjoint clusters. Or, for "soft" clustering, we can find all the data elements that |
|
2894
|
|
|
|
|
|
|
belong to a Gaussian component on the basis of the posterior class probabilities at |
|
2895
|
|
|
|
|
|
|
the data elements exceeding a prescribed threshold. |
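
In symbols (notation introduced here for exposition, not anything exported by the
module), the mixture density and the posterior class probability being estimated are

    p(x) = \sum_{k=1}^{K} \pi_k \, N(x | \mu_k, \Sigma_k)

    P(k | x) = \frac{ \pi_k \, N(x | \mu_k, \Sigma_k) }
                    { \sum_{j=1}^{K} \pi_j \, N(x | \mu_j, \Sigma_j) }

where the \pi_k are the class priors and N(. | \mu_k, \Sigma_k) denotes the
multivariate Gaussian density with mean \mu_k and covariance \Sigma_k.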
|
2896
|
|
|
|
|
|
|
|
|
2897
|
|
|
|
|
|
|
If you do not mind the fact that it is possible for the EM algorithm to occasionally |
|
2898
|
|
|
|
|
|
|
get stuck in a local maximum and to, therefore, produce a wrong answer even when you |
|
2899
|
|
|
|
|
|
|
know the data to be perfectly multimodal Gaussian, EM is probably the most magical |
|
2900
|
|
|
|
|
|
|
approach to clustering multidimensional data. Consider the case of clustering |
|
2901
|
|
|
|
|
|
|
three-dimensional data. Each Gaussian cluster in 3D space is characterized by the |
|
2902
|
|
|
|
|
|
|
following 10 variables: the 6 unique elements of the C<3x3> covariance matrix (which |
|
2903
|
|
|
|
|
|
|
must be symmetric positive-definite), the 3 unique elements of the mean, and the |
|
2904
|
|
|
|
|
|
|
prior associated with the Gaussian. Now let's say you expect to see six Gaussians in |
|
2905
|
|
|
|
|
|
|
your data. What that means is that you would want the values for 59 variables |
|
2906
|
|
|
|
|
|
|
(remember the unit-summation constraint on the class priors which reduces the overall |
|
2907
|
|
|
|
|
|
|
number of variables by one) to be estimated by the algorithm that seeks to discover |
|
2908
|
|
|
|
|
|
|
the clusters in your data. What's amazing is that, despite the large number of |
|
2909
|
|
|
|
|
|
|
variables that must be optimized simultaneously, the EM algorithm will very likely |
|
2910
|
|
|
|
|
|
|
give you a good approximation to the right answer. |
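
To make the parameter count explicit (plain arithmetic, not anything the module
computes):

    per 3D Gaussian:   6 (covariance) + 3 (mean) + 1 (prior) = 10 parameters
    six Gaussians:     6 x 10 = 60
    unit-sum priors:   60 - 1 = 59 free parameters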
|
2911
|
|
|
|
|
|
|
|
|
2912
|
|
|
|
|
|
|
At its core, EM depends on the notion of unobserved data and the averaging of the
log-likelihood of the data actually observed over all admissible probabilities for
the unobserved data.  But what is unobserved data?  While in some cases where EM is
used, the unobserved data is literally the missing data, in others, it is something
that cannot be seen directly but that nonetheless is relevant to the data actually
observed.  For the case of clustering multidimensional numerical data that can be
modeled as a Gaussian mixture, it turns out that the best way to think of the
unobserved data is in terms of a sequence of random variables, one for each observed
data point, whose values dictate the selection of the Gaussian for that data point.
This point is explained in great detail in my on-line tutorial at
L.

The EM algorithm in our context reduces to an iterative invocation of the following
steps: (1) Given the current guess for the means and the covariances of the different
Gaussians in our mixture model, use Bayes' Rule to update the posterior class
probabilities at each of the data points; (2) Using the updated posterior class
probabilities, first update the class priors; (3) Using the updated class priors,
update the class means and the class covariances; and go back to Step (1).  Ideally,
the iterations should terminate when the expected log-likelihood of the observed data
has reached a maximum and does not change with any further iterations.  The stopping
rule used in this module is the detection of no change over three consecutive
iterations in the values calculated for the priors.

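To make the three steps concrete, here is a minimal sketch of a single EM iteration.
It is not the code used inside the module (which works with full covariance matrices
through C<Math::GSL::Matrix>); it is a stripped-down 1-D illustration of the same
update logic:

    use strict;
    use warnings;
    use List::Util qw(sum);

    sub one_em_iteration {
        my ($data, $priors, $means, $vars) = @_;
        my $K = @$means;
        # Step (1): Bayes' Rule gives the posterior class probability of each
        # Gaussian at each data point
        my @posterior;
        foreach my $i (0 .. $#$data) {
            my @p = map { $priors->[$_] * gaussian($data->[$i], $means->[$_], $vars->[$_]) }
                    0 .. $K - 1;
            my $total = sum @p;
            $posterior[$i] = [ map { $_ / $total } @p ];
        }
        # Steps (2) and (3): update the priors first, then the means and covariances
        foreach my $k (0 .. $K - 1) {
            my $nk = sum map { $posterior[$_][$k] } 0 .. $#$data;
            $priors->[$k] = $nk / @$data;
            $means->[$k]  = sum( map { $posterior[$_][$k] * $data->[$_] } 0 .. $#$data ) / $nk;
            $vars->[$k]   = sum( map { $posterior[$_][$k] * ($data->[$_] - $means->[$k])**2 }
                                 0 .. $#$data ) / $nk;
        }
    }

    sub gaussian {
        my ($x, $mean, $var) = @_;
        return exp( -($x - $mean)**2 / (2 * $var) ) / sqrt( 2 * 3.14159265358979 * $var );
    }
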
This module provides three different choices for seeding the clusters: (1) random,
(2) kmeans, and (3) manual.  When random seeding is chosen, the algorithm randomly
selects C<K> data elements as cluster seeds.  That is, the data vectors associated
with these seeds are treated as initial guesses for the means of the Gaussian
distributions.  The covariances are then set to the values calculated from the entire
dataset with respect to the means corresponding to the seeds.  With kmeans seeding, on
the other hand, the means and the covariances are set to whatever values are returned
by the kmeans algorithm.  And, when seeding is set to manual, you are allowed to
choose C<K> data elements --- by specifying their tag names --- for the seeds.  The
rest of the EM initialization for the manual mode is the same as for the random mode.
The algorithm allows for the initial priors to be specified for the manual mode of
seeding.

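In case it helps to see what random seeding amounts to, the fragment below sketches
the idea outside the module.  The 1-D data and the made-up hash of tagged data values
are used purely for illustration, and the uniform initial priors are an assumption of
this sketch:

    use List::Util qw(shuffle sum);

    my %data = ( a1 => 2.1, a2 => 1.9, b1 => 7.8, b2 => 8.3, c1 => 15.2, c2 => 14.7 );
    my $K    = 3;

    # pick K data tags at random; the corresponding values become the initial means
    my @seed_tags = (shuffle keys %data)[0 .. $K - 1];
    my @means     = map { $data{$_} } @seed_tags;

    # each initial covariance is computed from the entire dataset with respect to
    # the mean corresponding to that seed
    my @variances = map {
        my $m = $_;
        sum( map { ($_ - $m)**2 } values %data ) / keys %data;
    } @means;

    # the initial priors are simply assumed to be uniform in this sketch
    my @priors = (1 / $K) x $K;
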
Much of the code for the kmeans based seeding of EM was drawn from the
C<Algorithm::KMeans> module by me.  The code from that module used here corresponds to
the case when the C<cluster_seeding> option in the C<Algorithm::KMeans> module is set
to C<smart>.  The C<smart> option for KMeans consists of subjecting the data to a
principal components analysis (PCA) to discover the direction of maximum variance in
the data space.  The data points are then projected on to this direction and a
histogram constructed from the projections.  Centers of the C<K> largest peaks in
this smoothed histogram are used to seed the KMeans based clusterer.  As you'd
expect, the output of the KMeans clusterer is then used to seed the EM algorithm.

This module uses two different criteria to measure the quality of the clustering
achieved.  The first is the Minimum Description Length (MDL) proposed originally by
Rissanen (J. Rissanen: "Modeling by Shortest Data Description," Automatica, 1978, and
"A Universal Prior for Integers and Estimation by Minimum Description Length," Annals
of Statistics, 1983).  The MDL criterion is a difference of a log-likelihood term for
all of the observed data and a model-complexity penalty term.  In general, both the
log-likelihood and the model-complexity terms increase as the number of clusters
increases.  The form of the MDL criterion in this module uses for the penalty term
the Bayesian Information Criterion (BIC) of G. Schwarz, "Estimating the Dimension
of a Model," The Annals of Statistics, 1978.  In general, the smaller the value of
the MDL quality measure, the better the clustering of the data.

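To give a feel for what such a criterion looks like, here is a generic BIC-style
description length written as a stand-alone Perl helper.  It is only an illustration
of the general form described above, not a copy of the expression used inside the
module:

    # $log_likelihood : log-likelihood of all of the observed data under the model
    # $num_params     : number of free parameters in the model
    # $N              : number of data points
    # the smaller the returned value, the better the model explains the data
    sub bic_style_description_length {
        my ($log_likelihood, $num_params, $N) = @_;
        return -$log_likelihood + 0.5 * $num_params * log($N);
    }
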
For our second measure of clustering quality, we use C<trace( SW^-1 . SB )> where SW is
the within-class scatter matrix, more commonly denoted S_w, and SB the between-class
scatter matrix, more commonly denoted S_b (the underscore means subscript).  This
measure can be thought of as the normalized average distance between the clusters,
the normalization being provided by the average cluster covariance SW^-1.  Therefore,
the larger the value of this quality measure, the better the separation between the
clusters.  Since this measure has its roots in the Fisher linear discriminant
function, we incorporate the word C<fisher> in the name of the quality measure.  When
the clusters exhibit significant overlap, the numbers produced by this quality measure
tend to be generally meaningless.

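For readers who want to see the arithmetic behind this measure, the stand-alone
fragment below computes the 1-D analog, in which the scatter matrices reduce to
scalars and C<trace( SW^-1 . SB )> reduces to the ratio SB / SW.  It is an
illustration only, with made-up cluster parameters, and is not the module's own
implementation:

    use List::Util qw(sum);

    # each cluster is described by its prior, its mean, and its variance
    my @priors = (0.5, 0.5);
    my @means  = (2.0, 10.0);
    my @vars   = (1.5, 2.0);

    my $overall_mean = sum( map { $priors[$_] * $means[$_] } 0 .. $#priors );
    my $SW = sum( map { $priors[$_] * $vars[$_] } 0 .. $#priors );
    my $SB = sum( map { $priors[$_] * ($means[$_] - $overall_mean)**2 } 0 .. $#priors );

    printf "fisher quality measure: %.3f\n", $SB / $SW;   # larger means better separation
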
=head1 METHODS

The module provides the following methods for EM based
clustering, for cluster visualization, for data
visualization, and for the generation of data for testing a
clustering algorithm:

=over

=item B<new()>

A call to C<new()> constructs a new instance of the
C<Algorithm::ExpectationMaximization> class.  A typical form
of this call when you want to use the random option for seeding
the algorithm is:

    my $clusterer = Algorithm::ExpectationMaximization->new(
                        datafile            => $datafile,
                        mask                => $mask,
                        K                   => 3,
                        max_em_iterations   => 300,
                        seeding             => 'random',
                        terminal_output     => 1,
                    );

where C<K> is the expected number of clusters and
C<max_em_iterations> the maximum number of EM iterations
that you want to allow until convergence is achieved.
Depending on your dataset and on the choice of the initial
seeds, the actual number of iterations used could be as few
as 10 or as many as the maximum of 300.  The output produced by
the algorithm shows the actual number of iterations used to
arrive at convergence.

The data file supplied through the C<datafile> option is
expected to contain entries in the following format

    c20 0 10.7087017086940 9.63528386251712 10.9512155258108 ...
    c7 0 12.8025925026787 10.6126270065785 10.5228482095349 ...
    b9 0 7.60118206283120 5.05889245193079 5.82841781759102 ...
    ....
    ....

where the first column contains the symbolic ID tag for each
data record and the rest of the columns the numerical
information.  Which columns are actually used for
clustering is decided by the string value of the mask.  For
example, if we wanted to cluster on the basis of the entries
in just the 3rd, the 4th, and the 5th columns above, the
mask value would be C<N0111> where the character C<N>
indicates that the ID tag is in the first column, the
character C<0> that the second column is to be ignored, and
the C<1>'s that follow that the 3rd, the 4th, and the 5th
columns are to be used for clustering.

If instead of random seeding, you wish to use the kmeans
based seeding, just replace the value C<random> supplied
for the C<seeding> option by C<kmeans>.  You can also do manual seeding
by designating a specified set of data elements to serve as
cluster seeds.  The call to the constructor in this case
looks like

    my $clusterer = Algorithm::ExpectationMaximization->new(
                        datafile            => $datafile,
                        mask                => $mask,
                        K                   => 3,
                        max_em_iterations   => 300,
                        seeding             => 'manual',
                        seed_tags           => ['a26', 'b53', 'c49'],
                        terminal_output     => 1,
                    );

where the option C<seed_tags> is set to an anonymous array
of symbolic names associated with the data elements.

If you know the class priors, you can supply them through an
additional option to the constructor that looks like

    class_priors    => [0.6, 0.2, 0.2],

for the case of C<K> equal to 3.  B<Specifying the class priors would
be a useful thing to do only for the case of manual
seeding.>  If you go for manual seeding, the order in which
the priors are expressed should correspond to the order of
the manually chosen tags supplied through the C<seed_tags>
option.

Note that the parameter C<terminal_output> is boolean; when
not supplied in the call to C<new()> it defaults to 0.  When
set, this parameter displays useful information in the
window of the terminal screen in which you invoke the
algorithm.

=item B<read_data_from_file()>

    $clusterer->read_data_from_file()

This is a required call after the constructor is invoked.  As
you would expect, this call reads in the data for
clustering.

=item B<seed_the_clusters()>

    $clusterer->seed_the_clusters();

This is also a required call.  It processes the option you
supplied for C<seeding> in the constructor call to choose
the data elements for seeding the C<K> clusters.

=item B<EM()>

    $clusterer->EM();

This is the workhorse of the module, as you would expect.
The means, the covariances, and the priors estimated by this
method are stored in instance variables that are subsequently
accessed by other methods for the purpose of displaying the
clusters, the probability distributions, etc.

=item B<run_bayes_classifier()>

    $clusterer->run_bayes_classifier();

Using the posterior probability distributions estimated by
the C<EM()> method, this method partitions the data into the
C<K> disjoint clusters using the naive Bayes' classifier.

=item B<return_disjoint_clusters()>

    my $clusters = $clusterer->return_disjoint_clusters();

This allows you to access the clusters obtained with the
application of the naive Bayes' classifier in your own
scripts.  If, say, you wanted to see the data records placed
in each cluster, you could subsequently invoke the following
loop in your own script:

    foreach my $index (0..@$clusters-1) {
        print "Cluster $index (Naive Bayes): @{$clusters->[$index]}\n\n";
    }

where C<$clusters> holds the array reference returned by the
call to C<return_disjoint_clusters()>.

=item B<write_naive_bayes_clusters_to_files()>

    $clusterer->write_naive_bayes_clusters_to_files();

This method writes the clusters obtained by applying the
naive Bayes' classifier to disk files, one cluster per
file.  What is written out to each file consists of the
symbolic names of the data records that belong to the
cluster corresponding to that file.  The clusters are placed
in files with names like

    naive_bayes_cluster1.txt
    naive_bayes_cluster2.txt
    ...

=item B<return_clusters_with_posterior_probs_above_threshold($theta1)>

    my $theta1 = 0.2;
    my $posterior_prob_clusters =
        $clusterer->return_clusters_with_posterior_probs_above_threshold($theta1);

This method returns a reference to an array of C<K>
anonymous arrays, each consisting of the symbolic names for
the data records where the posterior class probability
exceeds the threshold as specified by C<$theta1>.
Subsequently, you can access each of these
posterior-probability based clusters through a loop
construct such as

    foreach my $index (0..@$posterior_prob_clusters-1) {
        print "Cluster $index (based on posterior probs exceeding $theta1): " .
              "@{$posterior_prob_clusters->[$index]}\n\n";
    }

=item B<write_posterior_prob_clusters_above_threshold_to_files($theta1)>

    $clusterer->write_posterior_prob_clusters_above_threshold_to_files($theta1);

This call writes out the posterior-probability based soft
clusters to disk files.  As in the previous method, the
threshold C<$theta1> sets the probability threshold for
deciding which data elements belong to a cluster.  These
clusters are placed in files with names like

    posterior_prob_cluster1.txt
    posterior_prob_cluster2.txt
    ...

=item B<return_individual_class_distributions_above_given_threshold($theta2)>

    my $theta2 = 0.00001;
    my $class_distributions =
        $clusterer->return_individual_class_distributions_above_given_threshold($theta2);

This is the method to call if you wish to see the individual
Gaussians in your own script.  The method returns a reference
to an array of anonymous arrays, with each anonymous array
representing data membership in each Gaussian.  Only those
data points are included in each Gaussian where the
probability exceeds the threshold C<$theta2>.  Note that the
larger the covariance and the higher the data
dimensionality, the smaller this threshold must be for you
to see any of the data points in a Gaussian (a short sketch
illustrating why appears at the end of this item).  After you have
accessed the Gaussian mixture in this manner, you can
display the data membership in each Gaussian through the
following sort of a loop:

    foreach my $index (0..@$class_distributions-1) {
        print "Gaussian Distribution $index (only shows data elements " .
              "whose probabilities exceed the threshold $theta2): " .
              "@{$class_distributions->[$index]}\n\n";
    }

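The reason the threshold has to shrink with growing dimensionality and covariance is
that the peak value of a C<d>-dimensional Gaussian density is
C<1 / ( (2*pi)**(d/2) * sqrt(det(Sigma)) )>.  The stand-alone fragment below, which
assumes an isotropic covariance purely for simplicity, lets you get a feel for the
numbers involved:

    use constant PI => 4 * atan2(1, 1);

    # peak density of a d-dimensional Gaussian whose covariance is sigma**2 times
    # the identity matrix, so that det(Sigma) = sigma**(2*d)
    sub gaussian_peak_density {
        my ($d, $sigma) = @_;
        return 1 / ( (2 * PI)**($d / 2) * $sigma**$d );
    }

    printf "%g\n", gaussian_peak_density(2, 1.0);    # roughly 0.16
    printf "%g\n", gaussian_peak_density(10, 2.0);   # roughly 1e-07
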
=item B<visualize_clusters($visualization_mask)>

    my $visualization_mask = "11";
    $clusterer->visualize_clusters($visualization_mask);

The visualization mask here does not have to be identical to
the one used for clustering, but must be a subset of that
mask.  This is convenient for visualizing the clusters in
two- or three-dimensional subspaces of the original space.
The subset is specified by placing C<0>'s in the positions
corresponding to the dimensions you do NOT want to see
through visualization.  Depending on the mask used, this
method creates a 2D or a 3D scatter plot of the clusters
obtained through the naive Bayes' classification rule.

=item B<visualize_distributions($visualization_mask)>

    $clusterer->visualize_distributions($visualization_mask);

This is the method to call if you want to visualize the soft
clustering corresponding to the posterior class
probabilities exceeding the threshold specified in the call
to
C<return_clusters_with_posterior_probs_above_threshold($theta1)>.
Again, depending on the visualization mask used, you will
see either a 2D plot or a 3D scatter plot.

=item B<plot_hardcopy_clusters($visualization_mask)>

    $clusterer->plot_hardcopy_clusters($visualization_mask);

This method creates a PNG file from the C<Gnuplot>-created
display of the naive Bayes' clusters obtained from the data.
The plotting functionality of C<Gnuplot> is accessed through
the Perl wrappers provided by the C<Graphics::GnuplotIF>
module.

=item B<plot_hardcopy_distributions($visualization_mask)>

    $clusterer->plot_hardcopy_distributions($visualization_mask);

This method creates a PNG file from the C<Gnuplot>-created
display of the clusters that correspond to the posterior
class probabilities exceeding a specified threshold.  The
plotting functionality of C<Gnuplot> is accessed through the
Perl wrappers provided by the C<Graphics::GnuplotIF> module.

=item B<display_fisher_quality_vs_iterations()>

    $clusterer->display_fisher_quality_vs_iterations();

This method measures the quality of clustering by
calculating the normalized average squared distance between
the cluster centers, the normalization being provided by the
average cluster covariance.  See the Description for further
details.  In general, this measure is NOT useful for
overlapping clusters.

=item B<display_mdl_quality_vs_iterations()>

    $clusterer->display_mdl_quality_vs_iterations();

At the end of each iteration, this method measures the
quality of clustering by calculating its MDL (Minimum
Description Length).  As stated earlier in Description, the
MDL measure is a difference of a log-likelihood term for all
of the observed data and a model-complexity penalty term.
The smaller the value returned by this method, the better
the clustering.

=item B<return_estimated_priors()>

    my $estimated_priors = $clusterer->return_estimated_priors();
    print "Estimated class priors: @$estimated_priors\n";

This method can be used to access the final values of the
class priors as estimated by the EM algorithm.

=item B<cluster_data_generator()>

    Algorithm::ExpectationMaximization->cluster_data_generator(
                        input_parameter_file => $parameter_file,
                        output_datafile => $out_datafile,
                        total_number_of_data_points => 300
                    );

Make a call like the one shown above
for generating multivariate data for clustering if you wish to play with synthetic
data for experimenting with the EM algorithm.  The input parameter file must specify
the priors to be used for the Gaussians, their means, and their covariance matrices.
The format of the information contained in the parameter file must be as shown in the
parameter files provided in the C<examples> directory.  It will be easiest for you
to just edit a copy of one of these files for your data generation needs.  In addition to the
format of the parameter file, the main constraint you need to observe in specifying
the parameters is that the dimensionality of the covariance matrices must correspond
to the dimensionality of the mean vectors.  The multivariate random numbers are
generated by calling the C<Math::Random> module.  As you would expect, this module
requires that the covariance matrices you specify in your parameter file be symmetric
and positive definite.  Should the covariances in your parameter file not obey this
condition, the C<Math::Random> module will let you know.

=item B<visualize_data($data_visualization_mask)>

    $clusterer->visualize_data($data_visualization_mask);

This is the method to call if you want to visualize the data
you plan to cluster with the EM algorithm.  You'd need to
specify the argument mask in a manner similar to the
visualization of the clusters, as explained earlier.

=item B<plot_hardcopy_data($data_visualization_mask)>

    $clusterer->plot_hardcopy_data($data_visualization_mask);

This method creates a PNG file that can be used to print out
a hardcopy of the data in different 2D and 3D subspaces of
the data space.  The visualization mask is used to select the
subspace for the PNG image.

=back

=head1 HOW THE CLUSTERS ARE OUTPUT

This module produces two different types of clusters: the "hard" clusters and the
"soft" clusters.  The hard clusters correspond to the naive Bayes' classification of
the data points on the basis of the Gaussian distributions and the class priors
estimated by the EM algorithm.  Such clusters partition the data into disjoint
subsets.  On the other hand, the soft clusters correspond to the posterior class
probabilities calculated by the EM algorithm.  A data element belongs to a cluster if
its posterior probability for that Gaussian exceeds a threshold.

After the EM algorithm has finished running, the hard clusters are created by
invoking the method C<run_bayes_classifier()> on an instance of the module and then
made user-accessible by calling C<return_disjoint_clusters()>.  These clusters may
then be displayed in a terminal window by dereferencing each element of the array
whose reference is returned by C<return_disjoint_clusters()>.  The hard clusters can
be written out to disk files by invoking C<write_naive_bayes_clusters_to_files()>.
This method writes out the clusters to files, one cluster per file.  What is written
out to each file consists of the symbolic names of the data records that belong to
the cluster corresponding to that file.  The clusters are placed in files with names
like

    naive_bayes_cluster1.txt
    naive_bayes_cluster2.txt
    ...

The soft clusters on the other hand are created by calling
C<return_clusters_with_posterior_probs_above_threshold($theta1)>
on an instance of the module, where the argument C<$theta1>
is the threshold for deciding whether a data element belongs
in a soft cluster.  The posterior class probability at a
data element must exceed the threshold for the element to
belong to the corresponding cluster.  The soft clusters can
be written out to disk files by calling
C<write_posterior_prob_clusters_above_threshold_to_files($theta1)>.
As with the hard clusters, each cluster is placed in a separate
file.  The filenames for such clusters look like:

    posterior_prob_cluster1.txt
    posterior_prob_cluster2.txt
    ...

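Putting together the calls described above, a complete run that produces both kinds
of output typically has the following shape.  The datafile name, the mask, and the
thresholds shown here are placeholders that you would replace with values appropriate
for your own data:

    use strict;
    use warnings;
    use Algorithm::ExpectationMaximization;

    my $clusterer = Algorithm::ExpectationMaximization->new(
                        datafile          => 'mydatafile.dat',
                        mask              => 'N111',
                        K                 => 3,
                        max_em_iterations => 300,
                        seeding           => 'random',
                        terminal_output   => 1,
                    );

    $clusterer->read_data_from_file();
    $clusterer->seed_the_clusters();
    $clusterer->EM();
    $clusterer->run_bayes_classifier();

    # hard clusters:
    my $clusters = $clusterer->return_disjoint_clusters();
    $clusterer->write_naive_bayes_clusters_to_files();

    # soft clusters:
    my $theta1 = 0.2;
    my $soft_clusters =
        $clusterer->return_clusters_with_posterior_probs_above_threshold($theta1);
    $clusterer->write_posterior_prob_clusters_above_threshold_to_files($theta1);
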
=head1 WHAT IF THE NUMBER OF CLUSTERS IS UNKNOWN?

The module constructor requires that you supply a value for the parameter C<K>, which
is the number of clusters you expect to see in the data.  But what if you do not have
a good value for C<K>?  Note that it is possible to search for the best C<K> by using
the two clustering quality criteria included in the module.  However, I have
intentionally not yet incorporated that feature in the module because it slows down
the execution of the code --- especially when the dimensionality of the data
increases.  However, nothing prevents you from writing a script --- along the lines
of the "canned_example" scripts in the C<examples> directory --- that would use
the two clustering quality metrics for finding the best choice for C<K> for a given
dataset.  Obviously, you will now have to incorporate the call to the constructor in
a loop and check the value of the quality measures for each value of C<K>.

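Such a search might be structured as shown below.  This is only an outline with
placeholder datafile and mask values; it relies on visually comparing the quality
values that the two display methods print out for each candidate C<K>, since the
documented interface displays, rather than returns, those values:

    use strict;
    use warnings;
    use Algorithm::ExpectationMaximization;

    foreach my $K (2 .. 6) {
        print "\n\n=== trying K = $K ===\n";
        my $clusterer = Algorithm::ExpectationMaximization->new(
                            datafile          => 'mydatafile.dat',
                            mask              => 'N111',
                            K                 => $K,
                            max_em_iterations => 300,
                            seeding           => 'random',
                            terminal_output   => 1,
                        );
        $clusterer->read_data_from_file();
        $clusterer->seed_the_clusters();
        $clusterer->EM();
        $clusterer->run_bayes_classifier();
        # compare these displayed values across the different K's:
        $clusterer->display_mdl_quality_vs_iterations();
        $clusterer->display_fisher_quality_vs_iterations();
    }
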
=head1 SOME RESULTS OBTAINED WITH THIS MODULE

If you would like to see some results that have been obtained with this module, check
out Section 7 of the report
L.

=head1 THE C<examples> DIRECTORY

Becoming familiar with this directory should be your best
strategy to become comfortable with this module (and its
future versions).  You are urged to start by executing the
following six example scripts:

=over 16

=item I<canned_example1.pl>

This example applies the EM algorithm to 2D mixture data that
corresponds to three overlapping
Gaussian components in a star-shaped pattern.  The EM based clustering for this data
is shown in two of the PNG images in the directory, one displaying the hard clusters
obtained by using the naive Bayes' classifier and the other showing the soft
clusters obtained on the basis of the posterior class probabilities at the data
points.

=item I<canned_example2.pl>

The mixture data used in this example
corresponds to two well-separated, relatively isotropic Gaussians.  EM based clustering for this
data is again shown in two PNG images, one displaying the hard clusters
obtained by using the naive Bayes' classifier and the other showing the soft
clusters obtained by using the posterior class probabilities at the data points.

=item I<canned_example3.pl>

Like the first example, this example again involves three Gaussians, but now their
means are not co-located.  Additionally, we now seed the clusters manually by
specifying three selected data points as the initial guesses for the cluster means.
The EM based clustering
for this data is shown in two PNG images, one displaying the hard clusters
obtained by using the naive Bayes' classifier and the other showing the soft
clusters obtained on the basis of the posterior class probabilities at the data
points.

=item I<canned_example4.pl>

Whereas the three previous examples demonstrated EM based clustering of 2D data, we
now present an example of clustering in 3D.  The mixture data used in this example
corresponds to three well-separated but highly
anisotropic Gaussians.  The EM derived clustering for this data is shown in two PNG
images, the
former displaying the hard clusters obtained by using the naive Bayes' classifier and
the latter showing the soft clusters obtained on the basis of the posterior class
probabilities at the data points.

You may also wish to run this example on the data in a CSV file included in the
C<examples> directory.

=item I<canned_example5.pl>

We again demonstrate clustering in 3D but now we have one Gaussian cluster that
"cuts" through the other two Gaussian clusters.  The three Gaussians in this case are
highly overlapping and
highly anisotropic.  The EM derived clustering for this data is shown in two PNG
images, the
former displaying the hard clusters obtained by using the naive Bayes' classifier and
the latter showing the soft clusters obtained through the posterior class
probabilities at the data points.

=item I<canned_example6.pl>

This example, added in Version 1.2, demonstrates the use of this module for 1-D data.
In order to visualize the clusters for the 1-D case, we show them through their
respective histograms.  The
data consists of two overlapping Gaussians.  The EM derived clustering for this data
is shown in two PNG images, one displaying the hard clusters
obtained by using the naive Bayes' classifier and the other showing the soft
clusters obtained through the posterior class probabilities at the data points.

=back

Going through the six examples listed above will make you familiar with how to make
the calls to the clustering and the visualization methods.  The C<examples> directory
also includes several parameter files with names like

    param1.txt
    param2.txt
    param3.txt
    ...

These were used to generate the synthetic data for which the results are shown in the
C<examples> directory.  Just make a copy of one of these files and edit it if you
would like to generate your own multivariate data for clustering.  Note that you can
generate data with any dimensionality through appropriate entries in the parameter
file.

=head1 CAVEATS

When you run the scripts in the C<examples> directory, your results will NOT always
look like what I have shown in the PNG image files in the directory.  As mentioned
earlier in Description, the EM algorithm starting from randomly chosen initial
guesses for the cluster means can get stuck in a local maximum.

That raises an interesting question of how one judges the correctness of clustering
results when dealing with real experimental data.  For real data, the best approach
is to try the EM algorithm multiple times with all of the seeding options included in
this module.  It would be safe to say that, at least in low dimensional spaces and
with sufficient data, a majority of your runs should yield "correct" results.

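One simple way to organize such repeated trials is sketched below.  The datafile name
and the mask are placeholders; with C<seeding> set to C<manual> you would also supply
C<seed_tags>, which is why only the other two seeding options are shown here:

    use strict;
    use warnings;
    use Algorithm::ExpectationMaximization;

    foreach my $seeding ('random', 'kmeans') {
        foreach my $trial (1 .. 5) {
            print "\n\n=== seeding: $seeding   trial: $trial ===\n";
            my $clusterer = Algorithm::ExpectationMaximization->new(
                                datafile          => 'mydatafile.dat',
                                mask              => 'N111',
                                K                 => 3,
                                max_em_iterations => 300,
                                seeding           => $seeding,
                                terminal_output   => 1,
                            );
            $clusterer->read_data_from_file();
            $clusterer->seed_the_clusters();
            $clusterer->EM();
            $clusterer->run_bayes_classifier();
            $clusterer->display_mdl_quality_vs_iterations();
        }
    }
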
Also bear in mind that a pure Perl implementation is not meant for the clustering of
very large data files.  It is really designed more for researching issues related to
EM based approaches to clustering.

=head1 REQUIRED

This module requires the following three modules:

    Math::Random
    Graphics::GnuplotIF
    Math::GSL::Matrix

the first for generating the multivariate random numbers, the second for the
visualization of the clusters, and the last for access to the Perl wrappers for the
GNU Scientific Library.  The C<Matrix> module of this library is used for various
algebraic operations on the covariance matrices of the Gaussians.

=head1 EXPORT

None by design.

=head1 BUGS

Please notify the author if you encounter any bugs.  When sending email, please place
the string 'Algorithm EM' in the subject line.

=head1 INSTALLATION

Download the archive from CPAN in any directory of your choice.  Unpack the archive
with a command that on a Linux machine would look like:

    tar zxvf Algorithm-ExpectationMaximization-1.22.tar.gz

This will create an installation directory for you whose name will be
C<Algorithm-ExpectationMaximization-1.22>.  Enter this directory and execute the
following commands for a standard install of the module if you have root privileges:

    perl Makefile.PL
    make
    make test
    sudo make install

If you do not have root privileges, you can carry out a non-standard install of the
module in any directory of your choice by:

    perl Makefile.PL prefix=/some/other/directory/
    make
    make test
    make install

With a non-standard install, you may also have to set your PERL5LIB environment
variable so that this module can find the required other modules.  How you do that
would depend on what platform you are working on.  In order to install this module in
a Linux machine on which I use tcsh for the shell, I set the PERL5LIB environment
variable by

    setenv PERL5LIB /some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/

If I used bash, I'd need to declare:

    export PERL5LIB=/some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/

=head1 ACKNOWLEDGMENTS

Version 1.2 is a result of the feedback received from Paul
May of University of Birmingham.  Thanks, Paul!

=head1 AUTHOR

Avinash Kak, kak@purdue.edu

If you send email, please place the string "EM Algorithm" in your
subject line to get past my spam filter.

=head1 COPYRIGHT

This library is free software; you can redistribute it and/or modify it under the
same terms as Perl itself.

Copyright 2014 Avinash Kak

=cut