File Coverage

blib/lib/Algorithm/ExpectationMaximization.pm

Criterion	Covered	Total	%
statement	22	24	91.6
branch			n/a
condition			n/a
subroutine	8	8	100.0
pod			n/a
total	30	32	93.7

line	stmt	sub	time	code
1				package Algorithm::ExpectationMaximization;
2
3				#---------------------------------------------------------------------------
4				# Copyright (c) 2013 Avinash Kak. All rights reserved. This program is free
5				# software. You may modify and/or distribute it under the same terms as Perl itself.
6				# This copyright notice must remain attached to the file.
7				#
8				# Algorithm::ExpectationMaximization is a pure Perl implementation for
9				# Expectation-Maximization based clustering of multi-dimensional data that can be
10				# modeled as a Gaussian mixture.
11				# ---------------------------------------------------------------------------
12
13	1	1	22045	use 5.10.0;
	1		4
	1		54
14	1	1	6	use strict;
	1		2
	1		33
15	1	1	5	use warnings;
	1		14
	1		43
16	1	1	6	use Carp;
	1		1
	1		117
17	1	1	6	use File::Basename;
	1		2
	1		111
18	1	1	1254	use Math::Random;
	1		8695
	1		130
19	1	1	1257	use Graphics::GnuplotIF;
	1		17022
	1		214
20	1	1	1722	use Math::GSL::Matrix;
	0
	0
21				use Scalar::Util 'blessed';
22
23				our $VERSION = '1.2';
24
25				# from perl docs:
26				my $_num_regex = '^[+-]?\ (\d+(\.\d)?\|\.\d+)([eE][+-]?\d+)?$';
27
28				# Constructor:
29				sub new {
30				my ($class, %args) = @_;
31				my @params = keys %args;
32				croak "\nYou have used a wrong name for a keyword argument " .
33				"--- perhaps a misspelling\n"
34				if check_for_illegal_params(@params) == 0;
35				bless {
36				_datafile => $args{datafile} \|\| croak("datafile required"),
37				_mask => $args{mask} \|\| croak("mask required"),
38				_K => $args{K} \|\| croak("number of clusters required"),
39				_terminal_output => $args{terminal_output} \|\| 0,
40				_seeding => $args{seeding} \|\| 'random',
41				_seed_tags => $args{seed_tags} \|\| [],
42				_max_em_iterations=> $args{max_em_iterations} \|\| 100,
43				_class_priors => $args{class_priors} \|\| [],
44				_debug => $args{debug} \|\| 0,
45				_N => 0,
46				_data => {},
47				_data_id_tags => [],
48				_clusters => [],
49				_cluster_centers => [],
50				_data_dimensions => 0,
51				_cluster_normalizers => [],
52				_cluster_means => [],
53				_cluster_covariances => [],
54				_class_labels_for_data => {},
55				_class_probs_at_each_data_point => {},
56				_expected_class_probs => {},
57				_old_priors => [],
58				_old_old_priors => [],
59				_fisher_quality_vs_iteration => [],
60				_mdl_quality_vs_iterations => [],
61				}, $class;
62				}
63
64				sub read_data_from_file {
65				my $self = shift;
66				my $datafile = $self->{_datafile};
67				my $mask = $self->{_mask};
68				my @mask = split //, $mask;
69				$self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
70				print "data dimensionality: $self->{_data_dimensions} \n"
71				if $self->{_terminal_output};
72				open INPUT, $datafile
73				or die "unable to open file $datafile: $!\n";
74				chomp( my @raw_data = );
75				close INPUT;
76				# Transform strings into number data
77				foreach my $record (@raw_data) {
78				next unless $record;
79				next if $record =~ /^#/;
80				my @data_fields;
81				my @fields = split /\s+/, $record;
82				die "\nABORTED: Mask size does not correspond to row record size\n"
83				if $#fields != $#mask;
84				my $record_id;
85				foreach my $i (0..@fields-1) {
86				if ($mask[$i] eq '0') {
87				next;
88				} elsif ($mask[$i] eq 'N') {
89				$record_id = $fields[$i];
90				} elsif ($mask[$i] eq '1') {
91				push @data_fields, $fields[$i];
92				} else {
93				die "misformed mask for reading the data file\n";
94				}
95				}
96				my @nums = map {/$_num_regex/;$_} @data_fields;
97				$self->{_data}->{ $record_id } = \@nums;
98				}
99				my @all_data_ids = keys %{$self->{_data}};
100				$self->{_data_id_tags} = \@all_data_ids;
101				$self->{_N} = scalar @all_data_ids;
102				if ( defined($self->{_K}) && ($self->{_K} > 0) ) {
103				carp "\n\nWARNING: YOUR K VALUE IS TOO LARGE.\n The number of data " .
104				"points must satisfy the relation N > 2xK**2 where K is " .
105				"the number of clusters requested for the clusters to be " .
106				"meaningful $!"
107				if ( $self->{_N} < (2 * $self->{_K} ** 2) );
108				}
109				srand(123456789);
110				}
111
112
113				# This is the heart of the module --- in the sense that this method implements the EM
114				# algorithm for the estimating the parameters of a Gaussian mixture model for the
115				# data. In the implementation shown below, we declare convergence for the EM
116				# algorithm when the change in the class priors over three iterations falls below a
117				# threshold. The current value of this threshold, as can be seen in the function
118				# compare_array_floats(), is 0.00001.
119				sub EM {
120				my $self = shift;
121				$self->initialize_class_priors();
122				for (my $em_iteration=0; $em_iteration < $self->{_max_em_iterations};
123				$em_iteration++) {
124				if ($em_iteration == 0) {
125				print "\nSeeding the EM algorithm with:\n";
126				$self->display_seeding_stats();
127				print "\nFinished displaying the seeding information\n";
128				print "\nWill print out a dot for each iteration of EM:\n\n";
129				}
130				my $progress_indicator = $em_iteration % 5 == 0 ? $em_iteration : ".";
131				print $progress_indicator;
132				foreach my $data_id (@{$self->{_data_id_tags}}) {
133				$self->{_class_probs_at_each_data_point}->{$data_id} = [];
134				$self->{_expected_class_probs}->{$data_id} = [];
135				}
136				# Calculate prob(x \| C_i) --- this is the prob of data point x as
137				# a member of class C_i. You must do this for all K classes:
138				foreach my $cluster_index(0..$self->{_K}-1) {
139				$self->find_prob_at_each_datapoint_for_given_mean_and_covar(
140				$self->{_cluster_means}->[$cluster_index],
141				$self->{_cluster_covariances}->[$cluster_index] );
142				}
143				$self->{_cluster_normalizers} = [];
144				if ($self->{_debug}) {
145				print "\n\nDisplaying prob of a data point vis-a-vis each class:\n\n";
146				foreach my $data_id (sort keys %{$self->{_data}}) {
147				my $class_probs_at_a_point =
148				$self->{_class_probs_at_each_data_point}->{$data_id};
149				print "Class probs for $data_id: @$class_probs_at_a_point\n"
150				}
151				}
152				# Calculate prob(C_i \| x) which is the posterior prob of class
153				# considered as a r.v. to be C_i at a given point x. For a given
154				# x, the sum of such probabilities over all C_i must add up to 1:
155				$self->find_expected_classes_at_each_datapoint();
156				if ($self->{_debug}) {
157				print "\n\nDisplaying expected class probs at each data point:\n\n";
158				foreach my $data_id (sort keys %{$self->{_expected_class_probs}}) {
159				my $expected_classes_at_a_point =
160				$self->{_expected_class_probs}->{$data_id};
161				print "Expected classes $data_id: @$expected_classes_at_a_point\n";
162				}
163				}
164				# 1. UPDATE MEANS:
165				my @new_means;
166				foreach my $cluster_index(0..$self->{_K}-1) {
167				$new_means[$cluster_index] =
168				Math::GSL::Matrix->new($self->{_data_dimensions},1);
169				$new_means[$cluster_index]->zero();
170				foreach my $data_id (keys %{$self->{_data}}) {
171				my $data_record = $self->{_data}->{$data_id};
172				my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
173				$data_vec->set_col(0,$data_record);
174				$new_means[$cluster_index] +=
175				$self->{_expected_class_probs}->{$data_id}->[$cluster_index] *
176				$data_vec->copy();
177				$self->{_cluster_normalizers}->[$cluster_index] +=
178				$self->{_expected_class_probs}->{$data_id}->[$cluster_index];
179				}
180				$new_means[$cluster_index] *= 1.0 /
181				$self->{_cluster_normalizers}->[$cluster_index];
182				}
183				if ($self->{_debug}) {
184				foreach my $meanvec (@new_means) {
185				display_matrix("At EM Iteration $em_iteration, new mean vector is",
186				$meanvec);
187				}
188				}
189				$self->{_cluster_means} = \@new_means;
190				# 2. UPDATE COVARIANCES:
191				my @new_covariances;
192				foreach my $cluster_index(0..$self->{_K}-1) {
193				$new_covariances[$cluster_index] =
194				Math::GSL::Matrix->new($self->{_data_dimensions},
195				$self->{_data_dimensions});
196				$new_covariances[$cluster_index]->zero();
197				my $normalizer = 0;
198				foreach my $data_id (keys %{$self->{_data}}) {
199				my $data_record = $self->{_data}->{$data_id};
200				my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
201				$data_vec->set_col(0,$data_record);
202				my $mean_subtracted_data =
203				$data_vec - $self->{_cluster_means}->[$cluster_index];
204				my $outer_product = outer_product($mean_subtracted_data,
205				$mean_subtracted_data);
206				$new_covariances[$cluster_index] +=
207				$self->{_expected_class_probs}->{$data_id}->[$cluster_index] *
208				$outer_product;
209				}
210				$new_covariances[$cluster_index] *=
211				1.0 /
212				$self->{_cluster_normalizers}->[$cluster_index];
213				}
214				$self->{_cluster_covariances} = \@new_covariances;
215				# 3. UPDATE PRIORS:
216				$self->{_old_old_priors} = deep_copy_array( $self->{_old_priors} )
217				if @{$self->{_old_priors}} > 0;
218				$self->{_old_priors} = deep_copy_array( $self->{_class_priors} );
219				foreach my $cluster_index(0..$self->{_K}-1) {
220				$self->{_class_priors}->[$cluster_index] =
221				$self->{_cluster_normalizers}->[$cluster_index] / $self->{_N};
222				}
223				my @priors = @{$self->{_class_priors}};
224				print "\nUpdated priors: @priors\n\n\n" if $self->{_debug};
225				push @{$self->{_fisher_quality_vs_iteration}},
226				$self->clustering_quality_fisher();
227				push @{$self->{_mdl_quality_vs_iteration}}, $self->clustering_quality_mdl();
228				if ( ($em_iteration > 5 && $self->reached_convergence())
229				\|\| ($em_iteration == $self->{_max_em_iterations} - 1) ) {
230				my @old_old_priors = @{$self->{_old_old_priors}};
231				my @old_priors = @{$self->{_old_priors}};
232				print "\n\nPrevious to previous priors: @old_old_priors\n";
233				print "Previous priors: @old_priors\n";
234				print "Current class priors: @{$self->{_class_priors}}\n";
235				print "\n\nCONVERGENCE ACHIEVED AT ITERATION $em_iteration\n\n"
236				if $em_iteration < $self->{_max_em_iterations} - 1;
237				last;
238				}
239				}
240				print "\n\n\n";
241				}
242
243				sub reached_convergence {
244				my $self = shift;
245				return 1 if compare_array_floats($self->{_old_old_priors},
246				$self->{_old_priors})
247				&&
248				compare_array_floats($self->{_old_priors},
249				$self->{_class_priors});
250				return 0;
251				}
252
253				# Classify the data into disjoint clusters using the Naive Bayes' classification:
254				sub run_bayes_classifier {
255				my $self = shift;
256				$self->classify_all_data_tuples_bayes($self->{_cluster_means},
257				$self->{_cluster_covariances});
258				}
259
260				# Should NOT be called before run_bayes_classifier is run
261				sub return_disjoint_clusters {
262				my $self = shift;
263				return $self->{_clusters};
264				}
265
266				sub return_clusters_with_posterior_probs_above_threshold {
267				my $self = shift;
268				my $theta = shift;
269				my @class_distributions;
270				foreach my $cluster_index (0..$self->{_K}-1) {
271				push @class_distributions, [];
272				}
273				foreach my $data_tag (@{$self->{_data_id_tags}}) {
274				foreach my $cluster_index (0..$self->{_K}-1) {
275				push @{$class_distributions[$cluster_index]}, $data_tag
276				if $self->{_expected_class_probs}->{$data_tag}->[$cluster_index]
277				> $theta;
278				}
279				}
280				return \@class_distributions;
281				}
282
283				sub return_individual_class_distributions_above_given_threshold {
284				my $self = shift;
285				my $theta = shift;
286				my @probability_distributions;
287				foreach my $cluster_index (0..$self->{_K}-1) {
288				push @probability_distributions, [];
289				}
290				foreach my $cluster_index (0..$self->{_K}-1) {
291				my $mean_vec = $self->{_cluster_means}->[$cluster_index];
292				my $covar = $self->{_cluster_covariances}->[$cluster_index];
293				foreach my $data_id (keys %{$self->{_data}}) {
294				my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
295				$data_vec->set_col( 0, $self->{_data}->{$data_id});
296				my $datavec_minus_mean = $data_vec - $mean_vec;
297				display_matrix( "datavec minus mean is ", $datavec_minus_mean )
298				if $self->{_debug};
299				my $exponent = undef;
300				if ($self->{_data_dimensions} > 1) {
301				$exponent = -0.5 * vector_matrix_multiply(
302				transpose($datavec_minus_mean),
303				matrix_vector_multiply($covar->inverse(), $datavec_minus_mean ) );
304				} else {
305				my @var_inverse = $covar->inverse()->as_list;
306				my $var_inverse_val = $var_inverse[0];
307				my @data_minus_mean = $datavec_minus_mean->as_list;
308				my $data_minus_mean_val = $data_minus_mean[0];
309				$exponent = -0.5 * ($data_minus_mean_val ** 2) * $var_inverse_val;
310				}
311				print "\nThe value of the exponent is: $exponent\n\n" if $self->{_debug};
312				my $coefficient = 1.0 / \
313				( (2 * $Math::GSL::Const::M_PI)*$self->{_data_dimensions} sqrt($covar->det()) );
314				my $prob = $coefficient * exp($exponent);
315				push @{$probability_distributions[$cluster_index]}, $data_id
316				if $prob > $theta;
317				}
318				}
319				return \@probability_distributions;
320				}
321
322				sub return_estimated_priors {
323				my $self = shift;
324				return $self->{_class_priors};
325				}
326
327				# Calculates the MDL (Minimum Description Length) clustering criterion according to
328				# Rissanen. (J. Rissanen: "Modeling by Shortest Data Description," Automatica, 1978,
329				# and "A Universal Prior for Integers and Estimation by Minimum Description Length,"
330				# Annals of Statistics, 1983.) The MDL criterion is a difference of a log-likelihood
331				# term for all of the observed data and a model-complexity penalty term. In general,
332				# both the log-likelihood and the model-complexity terms increase as the number of
333				# clusters increases. The form of the MDL criterion used in the implementation below
334				# uses for the penalty term the Bayesian Information Criterion (BIC) of G. Schwartz,
335				# "Estimating the Dimensions of a Model," The Annals of Statistics, 1978. In
336				# general, the smaller the value of the MDL quality measure calculated below, the
337				# better the clustering of the data.
338				sub clustering_quality_mdl {
339				my $self = shift;
340				# Calculate the inverses of all of the covariance matrices in order to avoid
341				# having to calculate them repeatedly inside the inner 'foreach' loop in the
342				# main part of this method. Here we go:
343				my @covar_inverses;
344				foreach my $cluster_index (0..$self->{_K}-1) {
345				my $covar = $self->{_cluster_covariances}->[$cluster_index];
346				push @covar_inverses, $covar->inverse();
347				}
348				# For the clustering quality, first calculate the log-likelihood of all the
349				# observed data:
350				my $log_likelihood = 0;
351				foreach my $tag (@{$self->{_data_id_tags}}) {
352				my $likelihood_for_each_tag = 0;
353				foreach my $cluster_index (0..$self->{_K}-1) {
354				my $mean_vec = $self->{_cluster_means}->[$cluster_index];
355				my $covar = $self->{_cluster_covariances}->[$cluster_index];
356				my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
357				$data_vec->set_col( 0, $self->{_data}->{$tag});
358				my $datavec_minus_mean = $data_vec - $mean_vec;
359				my $exponent = undef;
360				if ($self->{_data_dimensions} > 1) {
361				$exponent = -0.5 * vector_matrix_multiply(
362				transpose($datavec_minus_mean),
363				matrix_vector_multiply($covar_inverses[$cluster_index],
364				$datavec_minus_mean ) );
365				} else {
366				my @var_inverse = $covar_inverses[$cluster_index]->as_list;
367				my $var_inverse_val = $var_inverse[0];
368				my @data_minus_mean = $datavec_minus_mean->as_list;
369				my $data_minus_mean_val = $data_minus_mean[0];
370				$exponent = -0.5 * ($data_minus_mean_val ** 2) * $var_inverse_val;
371				}
372				next if $covar->det() < 0;
373				my $coefficient = 1.0 /
374				( (2 * $Math::GSL::Const::M_PI)**$self->{_data_dimensions}
375				* sqrt($covar->det()) );
376				my $prob = $coefficient * exp($exponent);
377				$likelihood_for_each_tag +=
378				$prob * $self->{_class_priors}->[$cluster_index];
379				}
380				$log_likelihood += log( $likelihood_for_each_tag );
381				}
382				# Now calculate the model complexity penalty. $L is the total number of
383				# parameters it takes to specify a mixture of K Gaussians. If d is the
384				# dimensionality of the data space, the covariance matrix of each Gaussian takes
385				# (d**2 -d)/2 + d = d(d+1)/2 parameters since this matrix must be symmetric. And
386				# then you need d mean value parameters, and one prior probability parameter
387				# for the Gaussian. So $L = K[1 + d + d(d+1)/2] - 1 where the final '1' that
388				# is subtracted is to account for the normalization on the class priors.
389				my $L = (0.5 * $self->{_K} *
390				($self->{_data_dimensions}*2 + 3$self->{_data_dimensions} + 2) ) - 1;
391				my $model_complexity_penalty = 0.5 * $L * log( $self->{_N} );
392				my $mdl_criterion = -1 * $log_likelihood + $model_complexity_penalty;
393				return $mdl_criterion;
394				}
395
396				# For our second measure of clustering quality, we use `trace( SW^-1 . SB)' where SW
397				# is the within-class scatter matrix, more commonly denoted S_w, and SB the
398				# between-class scatter matrix, more commonly denoted S_b (the underscore means
399				# subscript). This measure can be thought of as the normalized average distance
400				# between the clusters, the normalization being provided by average cluster
401				# covariance SW^-1. Therefore, the larger the value of this quality measure, the
402				# better the separation between the clusters. Since this measure has its roots in
403				# the Fisher linear discriminant function, we incorporate the word 'fisher' in the
404				# name of the quality measure. Note that this measure is good only when the clusters
405				# are disjoint. When the clusters exhibit significant overlap, the numbers produced
406				# by this quality measure tend to be generally meaningless. As an extreme case,
407				# let's say your data was produced by a set of Gaussians, all with the same mean
408				# vector, but each with a distinct covariance. For this extreme case, this measure
409				# will produce a value close to zero --- depending on the accuracy with which the
410				# means are estimated --- even when your clusterer is doing a good job of identifying
411				# the individual clusters.
412				sub clustering_quality_fisher {
413				my $self = shift;
414				my @cluster_quality_indices;
415				my $fisher_trace = 0;
416				my $S_w =
417				Math::GSL::Matrix->new($self->{_data_dimensions}, $self->{_data_dimensions});
418				$S_w->zero;
419				my $S_b =
420				Math::GSL::Matrix->new($self->{_data_dimensions}, $self->{_data_dimensions});
421				$S_b->zero;
422				my $global_mean = Math::GSL::Matrix->new($self->{_data_dimensions},1);
423				$global_mean->zero;
424				foreach my $cluster_index(0..$self->{_K}-1) {
425				$global_mean = $self->{_class_priors}->[$cluster_index] *
426				$self->{_cluster_means}->[$cluster_index];
427				}
428				foreach my $cluster_index(0..$self->{_K}-1) {
429				$S_w += $self->{_cluster_covariances}->[$cluster_index] *
430				$self->{_class_priors}->[$cluster_index];
431				my $class_mean_minus_global_mean = $self->{_cluster_means}->[$cluster_index]
432				- $global_mean;
433				my $outer_product = outer_product( $class_mean_minus_global_mean,
434				$class_mean_minus_global_mean );
435				$S_b += $self->{_class_priors}->[$cluster_index] * $outer_product;
436				}
437				my $fisher = matrix_multiply($S_w->inverse, $S_b);
438				return $fisher unless defined blessed($fisher);
439				return matrix_trace($fisher);
440				}
441
442				sub display_seeding_stats {
443				my $self = shift;
444				foreach my $cluster_index(0..$self->{_K}-1) {
445				print "\nSeeding for cluster $cluster_index:\n";
446				my $mean = $self->{_cluster_means}->[$cluster_index];
447				display_matrix("The mean is: ", $mean);
448				my $covariance = $self->{_cluster_covariances}->[$cluster_index];
449				display_matrix("The covariance is: ", $covariance);
450				}
451				}
452
453				sub display_fisher_quality_vs_iterations {
454				my $self = shift;
455				print "\n\nFisher Quality vs. Iterations: " .
456				"@{$self->{_fisher_quality_vs_iteration}}\n\n";
457				}
458
459				sub display_mdl_quality_vs_iterations {
460				my $self = shift;
461				print "\n\nMDL Quality vs. Iterations: @{$self->{_mdl_quality_vs_iteration}}\n\n";
462				}
463
464				sub find_prob_at_each_datapoint_for_given_mean_and_covar {
465				my $self = shift;
466				my $mean_vec_ref = shift;
467				my $covar_ref = shift;
468				foreach my $data_id (keys %{$self->{_data}}) {
469				my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
470				$data_vec->set_col( 0, $self->{_data}->{$data_id});
471				if ($self->{_debug}) {
472				display_matrix("data vec in find prob function", $data_vec);
473				display_matrix("mean vec in find prob function", $mean_vec_ref);
474				display_matrix("covariance in find prob function", $covar_ref);
475				}
476				my $datavec_minus_mean = $data_vec - $mean_vec_ref;
477				display_matrix( "datavec minus mean is ", $datavec_minus_mean ) if $self->{_debug};
478				my $exponent = undef;
479				if ($self->{_data_dimensions} > 1) {
480				$exponent = -0.5 * vector_matrix_multiply( transpose($datavec_minus_mean),
481				matrix_vector_multiply( $covar_ref->inverse(), $datavec_minus_mean ) );
482				} elsif (defined blessed($covar_ref)) {
483				my @data_minus_mean = $datavec_minus_mean->as_list;
484				my $data_minus_mean_val = $data_minus_mean[0];
485				my @covar_as_matrix = $covar_ref->as_list;
486				my $covar_val = $covar_as_matrix[0];
487				$exponent = -0.5 * ($data_minus_mean_val ** 2) / $covar_val;
488				} else {
489				my @data_minus_mean = $datavec_minus_mean->as_list;
490				my $data_minus_mean_val = $data_minus_mean[0];
491				$exponent = -0.5 * ($data_minus_mean_val ** 2) / $covar_ref;
492				}
493				print "\nThe value of the exponent is: $exponent\n\n" if $self->{_debug};
494				my $coefficient = undef;
495				if ($self->{_data_dimensions} > 1) {
496				$coefficient = 1.0 / sqrt( ((2 * $Math::GSL::Const::M_PI) ** $self->{_data_dimensions}) *
497				$covar_ref->det()) ;
498				} elsif (!defined blessed($covar_ref)) {
499				$coefficient = 1.0 / sqrt(2 * $covar_ref * $Math::GSL::Const::M_PI);
500				} else {
501				my @covar_as_matrix = $covar_ref->as_list;
502				my $covar_val = $covar_as_matrix[0];
503				$coefficient = 1.0 / sqrt(2 * $covar_val * $Math::GSL::Const::M_PI);
504				}
505				my $prob = $coefficient * exp($exponent);
506				push @{$self->{_class_probs_at_each_data_point}->{$data_id}}, $prob;
507				}
508				}
509
510				sub find_expected_classes_at_each_datapoint {
511				my $self = shift;
512				my @priors = @{$self->{_class_priors}};
513				foreach my $data_id (sort keys %{$self->{_class_probs_at_each_data_point}}) {
514				my $numerator =
515				vector_2_vector_multiply(
516				$self->{_class_probs_at_each_data_point}->{$data_id},
517				$self->{_class_priors} );
518				my $sum = 0;
519				foreach my $part (@$numerator) {
520				$sum += $part;
521				}
522				$self->{_expected_class_probs}->{$data_id} = [map $_/$sum, @{$numerator}];
523				}
524				}
525
526				sub initialize_class_priors {
527				my $self = shift;
528				if (@{$self->{_class_priors}} == 0) {
529				my $prior = 1.0 / $self->{_K};
530				foreach my $class_index (0..$self->{_K}-1) {
531				push @{$self->{_class_priors}}, $prior;
532				}
533				}
534				die "Mismatch between number of values for class priors " .
535				"and the number of clusters expected"
536				unless @{$self->{_class_priors}} == $self->{_K};
537				my $sum = 0;
538				foreach my $prior (@{$self->{_class_priors}}) {
539				$sum += $prior;
540				}
541				die "Your priors in the constructor call do not add up to 1"
542				unless abs($sum - 1) < 0.001;
543				print "\nInitially assumed class priors are: @{$self->{_class_priors}}\n";
544				}
545
546				sub estimate_class_priors {
547				my $self = shift;
548				foreach my $datatag (keys %{$self->{_data}}) {
549				my $class_label = $self->{_class_labels}->{$datatag};
550				$self->{_class_priors}[$class_label]++;
551				}
552				foreach my $prior (@{$self->{_class_priors}}) {
553				$prior /= $self->{_total_number_data_tuples};
554				}
555				foreach my $prior (@{$self->{_class_priors}}) {
556				print "class priors: @{$self->{_class_priors}}\n";
557				}
558				}
559
560				sub classify_all_data_tuples_bayes {
561				my $self = shift;
562				my $mean_vecs_ref = shift;
563				my $covariances_ref = shift;
564				my @new_clusters;
565				foreach my $index (0..$self->{_K}-1) {
566				push @new_clusters, [];
567				}
568				foreach my $data_id (@{$self->{_data_id_tags}}) {
569				my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
570				$data_vec->set_col( 0, deep_copy_array($self->{_data}->{$data_id}));
571				my $cluster_index_for_tuple =
572				$self->classify_a_data_point_bayes($data_vec,
573				$mean_vecs_ref, $covariances_ref);
574				$self->{_class_labels}->{$data_id} = $cluster_index_for_tuple;
575				push @{$new_clusters[$cluster_index_for_tuple]}, $data_id;
576				}
577				$self->{_clusters} = \@new_clusters;
578				}
579
580				sub classify_a_data_point_bayes {
581				my $self = shift;
582				my $data_vec = shift;
583				my $mean_vecs_ref = shift;
584				my $covariances_ref = shift;
585				my @cluster_mean_vecs = @$mean_vecs_ref;
586				my @cluster_covariances = @$covariances_ref;
587				my @log_likelihoods;
588				foreach my $cluster_index (0..@cluster_mean_vecs-1) {
589				my $mean = $cluster_mean_vecs[$cluster_index];
590				my $covariance = $cluster_covariances[$cluster_index];
591				my $datavec_minus_mean = $data_vec - $mean;
592				my $log_likely = undef;
593				if ($self->{_data_dimensions} > 1) {
594				$log_likely = -0.5 * vector_matrix_multiply(
595				transpose($datavec_minus_mean),
596				matrix_vector_multiply( $covariance->inverse(),
597				$datavec_minus_mean ) );
598				} else {
599
600				my @data_minus_mean = $datavec_minus_mean->as_list;
601				my $data_minus_mean_val = $data_minus_mean[0];
602				my @covar_as_matrix = $covariance->as_list;
603				my $covar_val = $covar_as_matrix[0];
604				$log_likely = -0.5 * ($data_minus_mean_val ** 2) / $covar_val;
605
606				}
607				my $posterior_log_likely = $log_likely +
608				log( $self->{_class_priors}[$cluster_index] );
609				push @log_likelihoods, $posterior_log_likely;
610				}
611				my ($minlikely, $maxlikely) = minmax(\@log_likelihoods);
612				my $cluster_index_for_data_point =
613				get_index_at_value( $maxlikely, \@log_likelihoods );
614				return $cluster_index_for_data_point;
615				}
616
617				sub find_cluster_means_and_covariances {
618				my $clusters = shift;
619				my $data_dimensions = find_data_dimensionality($clusters);
620				my (@cluster_mean_vecs, @cluster_covariances);
621				foreach my $cluster_index (0..@$clusters-1) {
622				my ($num_rows,$num_cols) =
623				($data_dimensions,scalar(@{$clusters->[$cluster_index]}));
624				print "\nFor cluster $cluster_index: rows: $num_rows and cols: $num_cols\n";
625				my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
626				my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
627				my $col_index = 0;
628				foreach my $ele (@{$clusters->[$cluster_index]}) {
629				$matrix->set_col($col_index++, $ele);
630				}
631				# display_matrix( "Displaying cluster matrix", $matrix );
632				foreach my $j (0..$num_cols-1) {
633				$mean_vec += $matrix->col($j);
634				}
635				$mean_vec *= 1.0 / $num_cols;
636				push @cluster_mean_vecs, $mean_vec;
637				display_matrix( "Displaying the mean vector", $mean_vec );
638				foreach my $j (0..$num_cols-1) {
639				my @new_col = ($matrix->col($j) - $mean_vec)->as_list;
640				$matrix->set_col($j, \@new_col);
641				}
642				# display_matrix("Displaying mean subtracted data as a matrix", $matrix );
643				my $transposed = transpose( $matrix );
644				# display_matrix("Displaying transposed matrix",$transposed);
645				my $covariance = matrix_multiply( $matrix, $transposed );
646				$covariance *= 1.0 / $num_cols;
647				push @cluster_covariances, $covariance;
648				display_matrix("Displaying the cluster covariance", $covariance );
649				}
650				return (\@cluster_mean_vecs, \@cluster_covariances);
651				}
652
653				sub find_data_dimensionality {
654				my $clusters = shift;
655				my @first_cluster = @{$clusters->[0]};
656				my @first_data_element = @{$first_cluster[0]};
657				return scalar(@first_data_element);
658				}
659
660				sub find_seed_centered_covariances {
661				my $self = shift;
662				my $seed_tags = shift;
663				my (@seed_mean_vecs, @seed_based_covariances);
664				foreach my $seed_tag (@$seed_tags) {
665				my ($num_rows,$num_cols) = ($self->{_data_dimensions}, $self->{_N});
666				my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
667				my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
668				$mean_vec->set_col(0, $self->{_data}->{$seed_tag});
669				push @seed_mean_vecs, $mean_vec;
670				display_matrix( "Displaying the seed mean vector", $mean_vec );
671				my $col_index = 0;
672				foreach my $tag (@{$self->{_data_id_tags}}) {
673				$matrix->set_col($col_index++, $self->{_data}->{$tag});
674				}
675				foreach my $j (0..$num_cols-1) {
676				my @new_col = ($matrix->col($j) - $mean_vec)->as_list;
677				$matrix->set_col($j, \@new_col);
678				}
679				my $transposed = transpose( $matrix );
680				my $covariance = matrix_multiply( $matrix, $transposed );
681				$covariance *= 1.0 / $num_cols;
682				push @seed_based_covariances, $covariance;
683				display_matrix("Displaying the seed covariance", $covariance )
684				if $self->{_debug};
685				}
686				return (\@seed_mean_vecs, \@seed_based_covariances);
687				}
688
689				# The most popular seeding mode for EM is random. We include two other seeding modes
690				# --- kmeans and manual --- since they do produce good results for specialized cases.
691				# For example, when the clusters in your data are non-overlapping and not too
692				# anisotropic, the kmeans based seeding should work at least as well as the random
693				# seeding. In such cases --- AND ONLY IN SUCH CASES --- the kmeans based seeding has
694				# the advantage of avoiding the getting stuck in a local-maximum problem of the EM
695				# algorithm.
696
697				sub seed_the_clusters {
698				my $self = shift;
699				if ($self->{_seeding} eq 'random') {
700				my @covariances;
701				my @means;
702				my @all_tags = @{$self->{_data_id_tags}};
703				my @seed_tags;
704				foreach my $i (0..$self->{_K}-1) {
705				push @seed_tags, $all_tags[int rand( $self->{_N} )];
706				}
707				print "Random Seeding: Randomly selected seeding tags are @seed_tags\n\n";
708				my ($seed_means, $seed_covars) =
709				$self->find_seed_centered_covariances(\@seed_tags);
710				$self->{_cluster_means} = $seed_means;
711				$self->{_cluster_covariances} = $seed_covars;
712				} elsif ($self->{_seeding} eq 'kmeans') {
713				$self->kmeans();
714				my $clusters = $self->{_clusters};
715				my @dataclusters;
716				foreach my $index (0..@$clusters-1) {
717				push @dataclusters, [];
718				}
719				foreach my $cluster_index (0..$self->{_K}-1) {
720				foreach my $tag (@{$clusters->[$cluster_index]}) {
721				my $data = $self->{_data}->{$tag};
722				push @{$dataclusters[$cluster_index]}, deep_copy_array($data);
723				}
724				}
725				($self->{_cluster_means}, $self->{_cluster_covariances}) =
726				find_cluster_means_and_covariances(\@dataclusters);
727				} elsif ($self->{_seeding} eq 'manual') {
728				die "You have not supplied the seeding tags for the option \"manual\""
729				unless @{$self->{_seed_tags}} > 0;
730				print "Manual Seeding: Seed tags are @{$self->{_seed_tags}}\n\n";
731				foreach my $tag (@{$self->{_seed_tags}}) {
732				die "invalid tag used for manual seeding"
733				unless exists $self->{_data}->{$tag};
734				}
735				my ($seed_means, $seed_covars) =
736				$self->find_seed_centered_covariances($self->{_seed_tags});
737				$self->{_cluster_means} = $seed_means;
738				$self->{_cluster_covariances} = $seed_covars;
739				} else {
740				die "Incorrect call syntax used. See documentation.";
741				}
742				}
743
744				# This is the top-level method for kmeans based initialization of the EM
745				# algorithm. The means and the covariances returned by kmeans are used to seed the EM
746				# algorithm.
747				sub kmeans {
748				my $self = shift;
749				my $K = $self->{_K};
750				$self->cluster_for_fixed_K_single_smart_try($K);
751				if ((defined $self->{_clusters}) && (defined $self->{_cluster_centers})){
752				return ($self->{_clusters}, $self->{_cluster_centers});
753				} else {
754				die "kmeans clustering failed.";
755				}
756				}
757
758				# Used by the kmeans algorithm for the initialization of the EM iterations. We do
759				# initial kmeans cluster seeding by subjecting the data to principal components
760				# analysis in order to discover the direction of maximum variance in the data space.
761				# Subsequently, we try to find the K largest peaks along this direction. The
762				# coordinates of these peaks serve as the seeds for the K clusters.
763				sub cluster_for_fixed_K_single_smart_try {
764				my $self = shift;
765				my $K = shift;
766				print "Clustering for K = $K\n" if $self->{_terminal_output};
767				my ($clusters, $cluster_centers) =
768				$self->cluster_for_given_K($K);
769				$self->{_clusters} = $clusters;
770				$self->{_cluster_centers} = $cluster_centers;
771				}
772
773				# Used by the kmeans part of the code for the initialization of the EM algorithm:
774				sub cluster_for_given_K {
775				my $self = shift;
776				my $K = shift;
777				my $cluster_centers = $self->get_initial_cluster_centers($K);
778				my $clusters = $self->assign_data_to_clusters_initial($cluster_centers);
779				my $cluster_nonexistant_flag = 0;
780				foreach my $trial (0..2) {
781				($clusters, $cluster_centers) =
782				$self->assign_data_to_clusters( $clusters, $K );
783				my $num_of_clusters_returned = @$clusters;
784				foreach my $cluster (@$clusters) {
785				$cluster_nonexistant_flag = 1 if ((!defined $cluster)
786				\|\| (@$cluster == 0));
787				}
788				last unless $cluster_nonexistant_flag;
789				}
790				return ($clusters, $cluster_centers);
791				}
792
793				# Used by the kmeans part of the code for the initialization of the EM algorithm:
794				sub get_initial_cluster_centers {
795				my $self = shift;
796				my $K = shift;
797				if ($self->{_data_dimensions} == 1) {
798				my @one_d_data;
799				foreach my $j (0..$self->{_N}-1) {
800				my $tag = $self->{_data_id_tags}[$j];
801				push @one_d_data, $self->{_data}->{$tag}->[0];
802				}
803				my @peak_points =
804				find_peak_points_in_given_direction(\@one_d_data,$K);
805				print "highest points at data values: @peak_points\n"
806				if $self->{_debug};
807				my @cluster_centers;
808				foreach my $peakpoint (@peak_points) {
809				push @cluster_centers, [$peakpoint];
810				}
811				return \@cluster_centers;
812				}
813				my ($num_rows,$num_cols) = ($self->{_data_dimensions},$self->{_N});
814				my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
815				my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
816				# All the record labels are stored in the array $self->{_data_id_tags}.
817				# The actual data for clustering is stored in a hash at $self->{_data}
818				# whose keys are the record labels; the value associated with each
819				# key is the array holding the corresponding numerical multidimensional
820				# data.
821				foreach my $j (0..$num_cols-1) {
822				my $tag = $self->{_data_id_tags}[$j];
823				my $data = $self->{_data}->{$tag};
824				$matrix->set_col($j, $data);
825				}
826				if ($self->{_debug}) {
827				print "\nDisplaying the original data as a matrix:";
828				display_matrix( $matrix );
829				}
830				foreach my $j (0..$num_cols-1) {
831				$mean_vec += $matrix->col($j);
832				}
833				$mean_vec *= 1.0 / $num_cols;
834				if ($self->{_debug}) {
835				print "Displaying the mean vector for the data:";
836				display_matrix( $mean_vec );
837				}
838				foreach my $j (0..$num_cols-1) {
839				my @new_col = ($matrix->col($j) - $mean_vec)->as_list;
840				$matrix->set_col($j, \@new_col);
841				}
842				if ($self->{_debug}) {
843				print "Displaying mean subtracted data as a matrix:";
844				display_matrix( $matrix );
845				}
846				my $transposed = transpose( $matrix );
847				if ($self->{_debug}) {
848				print "Displaying transposed data matrix:";
849				display_matrix( $transposed );
850				}
851				my $covariance = matrix_multiply( $matrix, $transposed );
852				$covariance *= 1.0 / $num_cols;
853				if ($self->{_debug}) {
854				print "\nDisplaying the Covariance Matrix for your data:";
855				display_matrix( $covariance );
856				}
857				my ($eigenvalues, $eigenvectors) = $covariance->eigenpair;
858				my $num_of_eigens = @$eigenvalues;
859				my $largest_eigen_index = 0;
860				my $smallest_eigen_index = 0;
861				print "Eigenvalue 0: $eigenvalues->[0]\n" if $self->{_debug};
862				foreach my $i (1..$num_of_eigens-1) {
863				$largest_eigen_index = $i if $eigenvalues->[$i] >
864				$eigenvalues->[$largest_eigen_index];
865				$smallest_eigen_index = $i if $eigenvalues->[$i] <
866				$eigenvalues->[$smallest_eigen_index];
867				print "Eigenvalue $i: $eigenvalues->[$i]\n" if $self->{_debug};
868				}
869				print "\nlargest eigen index: $largest_eigen_index\n" if $self->{_debug};
870				print "\nsmallest eigen index: $smallest_eigen_index\n\n"
871				if $self->{_debug};
872				foreach my $i (0..$num_of_eigens-1) {
873				my @vec = $eigenvectors->[$i]->as_list;
874				print "Eigenvector $i: @vec\n" if $self->{_debug};
875				}
876				my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list;
877				print "\nLargest eigenvector: @largest_eigen_vec\n" if $self->{_debug};
878				my @max_var_direction;
879				# Each element of the array @largest_eigen_vec is a Math::Complex object
880				foreach my $k (0..@largest_eigen_vec-1) {
881				my ($mag, $theta) = $largest_eigen_vec[$k] =~ /\[(\d*\.\d+),(\S+)\]/;
882				if ($theta eq '0') {
883				$max_var_direction[$k] = $mag;
884				} elsif ($theta eq 'pi') {
885				$max_var_direction[$k] = -1.0 * $mag;
886				} else {
887				die "eigendecomposition of covariance matrix produced a complex eigenvector --- something is wrong";
888				}
889				}
890				# "Maximum variance direction: @max_var_direction
891				print "\nMaximum Variance Direction: @max_var_direction\n\n"
892				if $self->{_debug};
893				# We now project all data points on the largest eigenvector.
894				# Each projection will yield a single point on the eigenvector.
895				my @projections;
896				foreach my $j (0..$self->{_N}-1) {
897				my $tag = $self->{_data_id_tags}[$j];
898				my $data = $self->{_data}->{$tag};
899				die "Dimensionality of the largest eigenvector does not "
900				. "match the dimensionality of the data"
901				unless @max_var_direction == $self->{_data_dimensions};
902				my $projection = vector_multiply($data, \@max_var_direction);
903				push @projections, $projection;
904				}
905				print "All projection points: @projections\n" if $self->{_debug};
906				my @peak_points = find_peak_points_in_given_direction(\@projections, $K);
907				print "highest points at points along largest eigenvec: @peak_points\n"
908				if $self->{_debug};
909				my @cluster_centers;
910				foreach my $peakpoint (@peak_points) {
911				my @actual_peak_coords = map {$peakpoint * $_} @max_var_direction;
912				push @cluster_centers, \@actual_peak_coords;
913				}
914				return \@cluster_centers;
915				}
916
917				# Used by the kmeans part of the code: This method is called by the previous method
918				# to locate K peaks in a smoothed histogram of the data points projected onto the
919				# maximal variance direction.
920				sub find_peak_points_in_given_direction {
921				my $dataref = shift;
922				my $how_many = shift;
923				my @data = @$dataref;
924				my ($min, $max) = minmax(\@data);
925				my $num_points = @data;
926				my @sorted_data = sort {$a <=> $b} @data;
927				#print "\n\nSorted data: @sorted_data\n";
928				my $scale = $max - $min;
929				foreach my $index (0..$#sorted_data-1) {
930				$sorted_data[$index] = ($sorted_data[$index] - $min) / $scale;
931				}
932				my $avg_diff = 0;
933				foreach my $index (0..$#sorted_data-1) {
934				my $diff = $sorted_data[$index+1] - $sorted_data[$index];
935				$avg_diff += ($diff - $avg_diff) / ($index + 1);
936				}
937				my $delta = 1.0 / 1000.0;
938				# It would be nice to set the delta adaptively, but I must
939				# change the number of cells in the next foreach loop accordingly
940				# my $delta = $avg_diff / 20;
941				my @accumulator = (0) x 1000;
942				foreach my $index (0..@sorted_data-1) {
943				my $cell_index = int($sorted_data[$index] / $delta);
944				my $smoothness = 40;
945				for my $index ($cell_index-$smoothness..$cell_index+$smoothness) {
946				next if $index < 0 \|\| $index > 999;
947				$accumulator[$index]++;
948				}
949				}
950				my $peaks_array = non_maximum_supression( \@accumulator );
951				my $peaks_index_hash = get_value_index_hash( $peaks_array );
952				my @K_highest_peak_locations;
953				my $k = 0;
954				foreach my $peak (sort {$b <=> $a} keys %$peaks_index_hash) {
955				my $unscaled_peak_point =
956				$min + $peaks_index_hash->{$peak} * $scale * $delta;
957				push @K_highest_peak_locations, $unscaled_peak_point
958				if $k < $how_many;
959				last if ++$k == $how_many;
960				}
961				return @K_highest_peak_locations;
962				}
963
964				# Used by the kmeans part of the code: The purpose of this routine is to form initial
965				# clusters by assigning the data samples to the initial clusters formed by the
966				# previous routine on the basis of the best proximity of the data samples to the
967				# different cluster centers.
968				sub assign_data_to_clusters_initial {
969				my $self = shift;
970				my @cluster_centers = @{ shift @_ };
971				my @clusters;
972				foreach my $ele (@{$self->{_data_id_tags}}) {
973				my $best_cluster;
974				my @dist_from_clust_centers;
975				foreach my $center (@cluster_centers) {
976				push @dist_from_clust_centers, $self->distance($ele, $center);
977				}
978				my ($min, $best_center_index) = minimum( \@dist_from_clust_centers );
979				push @{$clusters[$best_center_index]}, $ele;
980				}
981				return \@clusters;
982				}
983
984				# Used by the kmeans part of the code: This is the main routine that along with the
985				# update_cluster_centers() routine constitute the two key steps of the K-Means
986				# algorithm. In most cases, the infinite while() loop will terminate automatically
987				# when the cluster assignments of the data points remain unchanged. For the sake of
988				# safety, we keep track of the number of iterations. If this number reaches 100, we
989				# exit the while() loop anyway. In most cases, this limit will not be reached.
990				sub assign_data_to_clusters {
991				my $self = shift;
992				my $clusters = shift;
993				my $K = shift;
994				my $final_cluster_centers;
995				my $iteration_index = 0;
996				while (1) {
997				my $new_clusters;
998				my $assignment_changed_flag = 0;
999				my $current_cluster_center_index = 0;
1000				my $cluster_size_zero_condition = 0;
1001				my $how_many = @$clusters;
1002				my $cluster_centers = $self->update_cluster_centers(
1003				deep_copy_AoA_with_nulls( $clusters ) );
1004				$iteration_index++;
1005				foreach my $cluster (@$clusters) {
1006				my $current_cluster_center =
1007				$cluster_centers->[$current_cluster_center_index];
1008				foreach my $ele (@$cluster) {
1009				my @dist_from_clust_centers;
1010				foreach my $center (@$cluster_centers) {
1011				push @dist_from_clust_centers,
1012				$self->distance($ele, $center);
1013				}
1014				my ($min, $best_center_index) =
1015				minimum( \@dist_from_clust_centers );
1016				my $best_cluster_center =
1017				$cluster_centers->[$best_center_index];
1018				if (vector_equal($current_cluster_center,
1019				$best_cluster_center)){
1020				push @{$new_clusters->[$current_cluster_center_index]},
1021				$ele;
1022				} else {
1023				$assignment_changed_flag = 1;
1024				push @{$new_clusters->[$best_center_index]}, $ele;
1025				}
1026				}
1027				$current_cluster_center_index++;
1028				}
1029				# Now make sure that we still have K clusters since K is fixed:
1030				next if ((@$new_clusters != @$clusters) && ($iteration_index < 100));
1031				# Now make sure that none of the K clusters is an empty cluster:
1032				foreach my $newcluster (@$new_clusters) {
1033				$cluster_size_zero_condition = 1 if ((!defined $newcluster)
1034				or (@$newcluster == 0));
1035				}
1036				push @$new_clusters, (undef) x ($K - @$new_clusters)
1037				if @$new_clusters < $K;
1038				my $largest_cluster;
1039				foreach my $local_cluster (@$new_clusters) {
1040				next if !defined $local_cluster;
1041				$largest_cluster = $local_cluster if !defined $largest_cluster;
1042				if (@$local_cluster > @$largest_cluster) {
1043				$largest_cluster = $local_cluster;
1044				}
1045				}
1046				foreach my $local_cluster (@$new_clusters) {
1047				if ( (!defined $local_cluster) \|\| (@$local_cluster == 0) ) {
1048				push @$local_cluster, pop @$largest_cluster;
1049				}
1050				}
1051				next if (($cluster_size_zero_condition) && ($iteration_index < 100));
1052				last if $iteration_index == 100;
1053				# Now do a deep copy of new_clusters into clusters
1054				$clusters = deep_copy_AoA( $new_clusters );
1055				last if $assignment_changed_flag == 0;
1056				}
1057				$final_cluster_centers = $self->update_cluster_centers( $clusters );
1058				return ($clusters, $final_cluster_centers);
1059				}
1060
1061				# Used by the kmeans part of the code: After each new assignment of the data points
1062				# to the clusters on the basis of the current values for the cluster centers, we call
1063				# the routine shown here for updating the values of the cluster centers.
1064				sub update_cluster_centers {
1065				my $self = shift;
1066				my @clusters = @{ shift @_ };
1067				my @new_cluster_centers;
1068				my $largest_cluster;
1069				foreach my $cluster (@clusters) {
1070				next if !defined $cluster;
1071				$largest_cluster = $cluster if !defined $largest_cluster;
1072				if (@$cluster > @$largest_cluster) {
1073				$largest_cluster = $cluster;
1074				}
1075				}
1076				foreach my $cluster (@clusters) {
1077				if ( (!defined $cluster) \|\| (@$cluster == 0) ) {
1078				push @$cluster, pop @$largest_cluster;
1079				}
1080				}
1081				foreach my $cluster (@clusters) {
1082				die "Cluster became empty --- untenable condition " .
1083				"for a given K. Try again. \n" if !defined $cluster;
1084				my $cluster_size = @$cluster;
1085				die "Cluster size is zero --- untenable.\n" if $cluster_size == 0;
1086				my @new_cluster_center = @{$self->add_point_coords( $cluster )};
1087				@new_cluster_center = map {my $x = $_/$cluster_size; $x}
1088				@new_cluster_center;
1089				push @new_cluster_centers, \@new_cluster_center;
1090				}
1091				return \@new_cluster_centers;
1092				}
1093
1094				# The following routine is for computing the distance between a data point specified
1095				# by its symbolic name in the master datafile and a point (such as the center of a
1096				# cluster) expressed as a vector of coordinates:
1097				sub distance {
1098				my $self = shift;
1099				my $ele1_id = shift @_; # symbolic name of data sample
1100				my @ele1 = @{$self->{_data}->{$ele1_id}};
1101				my @ele2 = @{shift @_};
1102				die "wrong data types for distance calculation\n" if @ele1 != @ele2;
1103				my $how_many = @ele1;
1104				my $squared_sum = 0;
1105				foreach my $i (0..$how_many-1) {
1106				$squared_sum += ($ele1[$i] - $ele2[$i])**2;
1107				}
1108				my $dist = sqrt $squared_sum;
1109				return $dist;
1110				}
1111
1112				# The following routine does the same as above but now both
1113				# arguments are expected to be arrays of numbers:
1114				sub distance2 {
1115				my $self = shift;
1116				my @ele1 = @{shift @_};
1117				my @ele2 = @{shift @_};
1118				die "wrong data types for distance calculation\n" if @ele1 != @ele2;
1119				my $how_many = @ele1;
1120				my $squared_sum = 0;
1121				foreach my $i (0..$how_many-1) {
1122				$squared_sum += ($ele1[$i] - $ele2[$i])**2;
1123				}
1124				return sqrt $squared_sum;
1125				}
1126
1127				sub write_naive_bayes_clusters_to_files {
1128				my $self = shift;
1129				my @clusters = @{$self->{_clusters}};
1130				unlink glob "naive_bayes_cluster*.dat";
1131				foreach my $i (1..@clusters) {
1132				my $filename = "naive_bayes_cluster" . $i . ".dat";
1133				print "Writing cluster $i to file $filename\n"
1134				if $self->{_terminal_output};
1135				open FILEHANDLE, "\| sort > $filename" or die "Unable to open file: $!";
1136				foreach my $ele (@{$clusters[$i-1]}) {
1137				print FILEHANDLE "$ele\n";
1138				}
1139				close FILEHANDLE;
1140				}
1141				}
1142
1143				sub write_posterior_prob_clusters_above_threshold_to_files {
1144				my $self = shift;
1145				my $theta = shift;
1146				my @class_distributions;
1147				foreach my $cluster_index (0..$self->{_K}-1) {
1148				push @class_distributions, [];
1149				}
1150				foreach my $data_tag (@{$self->{_data_id_tags}}) {
1151				foreach my $cluster_index (0..$self->{_K}-1) {
1152				push @{$class_distributions[$cluster_index]}, $data_tag
1153				if $self->{_expected_class_probs}->{$data_tag}->[$cluster_index]
1154				> $theta;
1155				}
1156				}
1157				unlink glob "posterior_prob_cluster*.dat";
1158				foreach my $i (1..@class_distributions) {
1159				my $filename = "posterior_prob_cluster" . $i . ".dat";
1160				print "Writing posterior prob cluster $i to file $filename\n"
1161				if $self->{_terminal_output};
1162				open FILEHANDLE, "\| sort > $filename" or die "Unable to open file: $!";
1163				foreach my $ele (@{$class_distributions[$i-1]}) {
1164				print FILEHANDLE "$ele\n";
1165				}
1166				close FILEHANDLE;
1167				}
1168				}
1169
1170				sub DESTROY {
1171				unlink "__temp_" . basename($_[0]->{_datafile});
1172				unlink "__temp_data_" . basename($_[0]->{_datafile});
1173				unlink "__temp2_" . basename($_[0]->{_datafile});
1174				unlink glob "__temp1dhist*";
1175				unlink glob "__contour*";
1176				}
1177
1178				############################# Visualization Code ###############################
1179
1180				# The visualize_clusters() implementation displays as a plot in your terminal window
1181				# the clusters constructed by the EM algorithm. It can show either 2D plots or
1182				# 3D plots that you can rotate interactively for better visualization. For
1183				# multidimensional data, as to which 2D or 3D dimensions are used for visualization
1184				# is controlled by the mask you must supply as an argument to the method. Should it
1185				# happen that only one on bit is specified for the mask, visualize_clusters()
1186				# aborts.
1187				#
1188				# The visualization code consists of first accessing each of clusters created by the
1189				# EM() subroutine. Note that the clusters contain only the symbolic names for the
1190				# individual records in the source data file. We therefore next reach into the
1191				# $self->{_data} hash and get the data coordinates associated with each symbolic
1192				# label in a cluster. The numerical data thus generated is then written out to a
1193				# temp file. When doing so we must remember to insert TWO BLANK LINES between the
1194				# data blocks corresponding to the different clusters. This constraint is imposed
1195				# on us by Gnuplot when plotting data from the same file since we want to use
1196				# different point styles for the data points in different cluster files.
1197				# Subsequently, we call upon the Perl interface provided by the Graphics::GnuplotIF
1198				# module to plot the data clusters.
1199				sub visualize_clusters {
1200				my $self = shift;
1201				my $v_mask;
1202				my $pause_time;
1203				if (@_ == 1) {
1204				$v_mask = shift \|\| die "visualization mask missing";
1205				} elsif (@_ == 2) {
1206				$v_mask = shift \|\| die "visualization mask missing";
1207				$pause_time = shift;
1208				} else {
1209				die "visualize_clusters() called with wrong args";
1210				}
1211				my $master_datafile = $self->{_datafile};
1212				my @v_mask = split //, $v_mask;
1213				my $visualization_mask_width = @v_mask;
1214				my $original_data_mask = $self->{_mask};
1215				my @mask = split //, $original_data_mask;
1216				my $data_field_width = scalar grep {$_ eq '1'} @mask;
1217				die "\n\nABORTED: The width of the visualization mask (including " .
1218				"all its 1s and 0s) must equal the width of the original mask " .
1219				"used for reading the data file (counting only the 1's)"
1220				if $visualization_mask_width != $data_field_width;
1221				my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
1222				# The following section is for the superimposed one-Mahalanobis-distance-unit
1223				# ellipses that are shown only for 2D plots:
1224				if ($visualization_data_field_width == 2) {
1225				foreach my $cluster_index (0..$self->{_K}-1) {
1226				my $contour_filename = "__contour_" . $cluster_index . ".dat";
1227				my $mean = $self->{_cluster_means}->[$cluster_index];
1228				my $covariance = $self->{_cluster_covariances}->[$cluster_index];
1229				my ($mux,$muy) = $mean->as_list();
1230				my ($varx,$sigmaxy) = $covariance->row(0)->as_list();
1231				my ($sigmayx,$vary) = $covariance->row(1)->as_list();
1232				die "Your covariance matrix does not look right"
1233				unless $sigmaxy == $sigmayx;
1234				my ($sigmax,$sigmay) = (sqrt($varx),sqrt($vary));
1235				my $argstring = <<"END";
1236				set contour
1237				mux = $mux
1238				muy = $muy
1239				sigmax = $sigmax
1240				sigmay = $sigmay
1241				sigmaxy = $sigmaxy
1242				determinant = (sigmax*2)(sigmay2) - sigmaxy2
1243				exponent(x,y) = -0.5 * (1.0 / determinant) * ( ((x-mux)*2)sigmay2 + ((y-muy)2)sigmax2 - 2sigmaxy(x-mux)(y-muy) )
1244				f(x,y) = exp( exponent(x,y) ) - 0.2
1245				xmax = mux + 2 * sigmax
1246				xmin = mux - 2 * sigmax
1247				ymax = muy + 2 * sigmay
1248				ymin = muy - 2 * sigmay
1249				set xrange [ xmin : xmax ]
1250				set yrange [ ymin : ymax ]
1251				set isosamples 200
1252				unset surface
1253				set cntrparam levels discrete 0
1254				set table \"$contour_filename\"
1255				splot f(x,y)
1256				unset table
1257				END
1258				my $plot = Graphics::GnuplotIF->new();
1259				$plot->gnuplot_cmd( $argstring );
1260				}
1261				}
1262				my %visualization_data;
1263				while ( my ($record_id, $data) = each %{$self->{_data}} ) {
1264				my @fields = @$data;
1265				die "\nABORTED: Visualization mask size exceeds data record size"
1266				if $#v_mask > $#fields;
1267				my @data_fields;
1268				foreach my $i (0..@fields-1) {
1269				if ($v_mask[$i] eq '0') {
1270				next;
1271				} elsif ($v_mask[$i] eq '1') {
1272				push @data_fields, $fields[$i];
1273				} else {
1274				die "Misformed visualization mask. It can only have 1s and 0s";
1275				}
1276				}
1277				$visualization_data{ $record_id } = \@data_fields;
1278				}
1279				my $K = scalar @{$self->{_clusters}};
1280				my $filename = basename($master_datafile);
1281				my $temp_file = "__temp_" . $filename;
1282				unlink $temp_file if -e $temp_file;
1283				open OUTPUT, ">$temp_file"
1284				or die "Unable to open a temp file in this directory: $!\n";
1285				foreach my $cluster (@{$self->{_clusters}}) {
1286				foreach my $item (@$cluster) {
1287				print OUTPUT "@{$visualization_data{$item}}";
1288				print OUTPUT "\n";
1289				}
1290				print OUTPUT "\n\n";
1291				}
1292				close OUTPUT;
1293				my $plot;
1294				if (!defined $pause_time) {
1295				$plot = Graphics::GnuplotIF->new( persist => 1 );
1296				} else {
1297				$plot = Graphics::GnuplotIF->new();
1298				}
1299				my $arg_string = "";
1300				if ($visualization_data_field_width > 2) {
1301				$plot->gnuplot_cmd("set noclip");
1302				$plot->gnuplot_cmd("set pointsize 2");
1303				foreach my $i (0..$K-1) {
1304				my $j = $i + 1;
1305				$arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster (naive Bayes) $i\" with points lt $j pt $j, ";
1306				}
1307				} elsif ($visualization_data_field_width == 2) {
1308				$plot->gnuplot_cmd("set noclip");
1309				$plot->gnuplot_cmd("set pointsize 2");
1310				foreach my $i (0..$K-1) {
1311				my $j = $i + 1;
1312				$arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster (naive Bayes) $i\" with points lt $j pt $j, ";
1313				my $ellipse_filename = "__contour_" . $i . ".dat";
1314				$arg_string .= "\"$ellipse_filename\" with line lt $j title \"\", ";
1315				}
1316				} elsif ($visualization_data_field_width == 1 ) {
1317				open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!\n";
1318				my @all_data = ;
1319				close INPUT;
1320				@all_data = map {chomp $_; $_ =~ /\d/ ? $_ : "SEPERATOR" } @all_data;
1321				my $all_joined_data = join ':', @all_data;
1322				my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data;
1323				my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals);
1324				foreach my $i (0..@separated-1) {
1325				$separated[$i] =~ s/SEPERATOR//g;
1326				my @cluster_for_hist = split /:/, $separated[$i];
1327				@cluster_for_hist = grep $_, @cluster_for_hist;
1328				my ($minval,$maxval) = minmax(\@cluster_for_hist);
1329				push @all_minvals, $minval;
1330				push @all_maxvals, $maxval;
1331				push @all_clusters_for_hist, \@cluster_for_hist;
1332				}
1333				push @all_minmaxvals, @all_minvals;
1334				push @all_minmaxvals, @all_maxvals;
1335				my ($abs_minval,$abs_maxval) = minmax(\@all_minmaxvals);
1336				my $delta = ($abs_maxval - $abs_minval) / 100.0;
1337				$plot->gnuplot_cmd("set boxwidth 3");
1338				$plot->gnuplot_cmd("set style fill solid border -1");
1339				$plot->gnuplot_cmd("set ytics out nomirror");
1340				$plot->gnuplot_cmd("set style data histograms");
1341				$plot->gnuplot_cmd("set style histogram clustered");
1342				$plot->gnuplot_cmd("set title 'Clusters shown through histograms'");
1343				$plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
1344				foreach my $cindex (0..@all_clusters_for_hist-1) {
1345				my $filename = basename($master_datafile);
1346				my $temp_file = "__temp1dhist_" . "$cindex" . "_" . $filename;
1347				unlink $temp_file if -e $temp_file;
1348				open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!\n";
1349				print OUTPUT "Xstep histval\n";
1350				my @histogram = (0) x 100;
1351				foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) {
1352				$histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++;
1353				}
1354				foreach my $i (0..@histogram-1) {
1355				print OUTPUT "$i $histogram[$i]\n";
1356				}
1357				$arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, ";
1358				close OUTPUT;
1359				}
1360				}
1361				$arg_string = $arg_string =~ /^(.*),[ ]+$/;
1362				$arg_string = $1;
1363				if ($visualization_data_field_width > 2) {
1364				$plot->gnuplot_cmd( "splot $arg_string" );
1365				$plot->gnuplot_pause( $pause_time ) if defined $pause_time;
1366				} elsif ($visualization_data_field_width == 2) {
1367				$plot->gnuplot_cmd( "plot $arg_string" );
1368				$plot->gnuplot_pause( $pause_time ) if defined $pause_time;
1369				} elsif ($visualization_data_field_width == 1) {
1370				$plot->gnuplot_cmd( "plot $arg_string" );
1371				$plot->gnuplot_pause( $pause_time ) if defined $pause_time;
1372				}
1373				}
1374
1375				# This subroutine is the same as above except that it makes PNG plots (for hardcopy
1376				# printing) of the clusters.
1377				sub plot_hardcopy_clusters {
1378				my $self = shift;
1379				my $v_mask;
1380				my $pause_time;
1381				if (@_ == 1) {
1382				$v_mask = shift \|\| die "visualization mask missing";
1383				} elsif (@_ == 2) {
1384				$v_mask = shift \|\| die "visualization mask missing";
1385				$pause_time = shift;
1386				} else {
1387				die "visualize_clusters() called with wrong args";
1388				}
1389				my $master_datafile = $self->{_datafile};
1390				my @v_mask = split //, $v_mask;
1391				my $visualization_mask_width = @v_mask;
1392				my $original_data_mask = $self->{_mask};
1393				my @mask = split //, $original_data_mask;
1394				my $data_field_width = scalar grep {$_ eq '1'} @mask;
1395				die "\n\nABORTED: The width of the visualization mask (including " .
1396				"all its 1s and 0s) must equal the width of the original mask " .
1397				"used for reading the data file (counting only the 1's)"
1398				if $visualization_mask_width != $data_field_width;
1399				my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
1400				if ($visualization_data_field_width == 2) {
1401				foreach my $cluster_index (0..$self->{_K}-1) {
1402				my $contour_filename = "__contour_" . $cluster_index . ".dat";
1403				my $mean = $self->{_cluster_means}->[$cluster_index];
1404				my $covariance = $self->{_cluster_covariances}->[$cluster_index];
1405				my ($mux,$muy) = $mean->as_list();
1406				my ($varx,$sigmaxy) = $covariance->row(0)->as_list();
1407				my ($sigmayx,$vary) = $covariance->row(1)->as_list();
1408				die "Your covariance matrix does not look right"
1409				unless $sigmaxy == $sigmayx;
1410				my ($sigmax,$sigmay) = (sqrt($varx),sqrt($vary));
1411				my $argstring = <<"END";
1412				set contour
1413				mux = $mux
1414				muy = $muy
1415				sigmax = $sigmax
1416				sigmay = $sigmay
1417				sigmaxy = $sigmaxy
1418				determinant = (sigmax*2)(sigmay2) - sigmaxy2
1419				exponent(x,y) = -0.5 * (1.0 / determinant) * ( ((x-mux)*2)sigmay2 + ((y-muy)2)sigmax2 - 2sigmaxy(x-mux)(y-muy) )
1420				f(x,y) = exp( exponent(x,y) ) - 0.2
1421				xmax = mux + 2 * sigmax
1422				xmin = mux - 2 * sigmax
1423				ymax = muy + 2 * sigmay
1424				ymin = muy - 2 * sigmay
1425				set xrange [ xmin : xmax ]
1426				set yrange [ ymin : ymax ]
1427				set isosamples 200
1428				unset surface
1429				set cntrparam levels discrete 0
1430				set table \"$contour_filename\"
1431				splot f(x,y)
1432				unset table
1433				END
1434				my $plot = Graphics::GnuplotIF->new();
1435				$plot->gnuplot_cmd( $argstring );
1436				}
1437				}
1438				my %visualization_data;
1439				while ( my ($record_id, $data) = each %{$self->{_data}} ) {
1440				my @fields = @$data;
1441				die "\nABORTED: Visualization mask size exceeds data record size"
1442				if $#v_mask > $#fields;
1443				my @data_fields;
1444				foreach my $i (0..@fields-1) {
1445				if ($v_mask[$i] eq '0') {
1446				next;
1447				} elsif ($v_mask[$i] eq '1') {
1448				push @data_fields, $fields[$i];
1449				} else {
1450				die "Misformed visualization mask. It can only have 1s and 0s";
1451				}
1452				}
1453				$visualization_data{ $record_id } = \@data_fields;
1454				}
1455				my $K = scalar @{$self->{_clusters}};
1456				my $filename = basename($master_datafile);
1457				my $temp_file = "__temp_" . $filename;
1458				unlink $temp_file if -e $temp_file;
1459				open OUTPUT, ">$temp_file"
1460				or die "Unable to open a temp file in this directory: $!\n";
1461				foreach my $cluster (@{$self->{_clusters}}) {
1462				foreach my $item (@$cluster) {
1463				print OUTPUT "@{$visualization_data{$item}}";
1464				print OUTPUT "\n";
1465				}
1466				print OUTPUT "\n\n";
1467				}
1468				close OUTPUT;
1469				my $plot;
1470				if (!defined $pause_time) {
1471				$plot = Graphics::GnuplotIF->new( persist => 1 );
1472				} else {
1473				$plot = Graphics::GnuplotIF->new();
1474				}
1475				my $arg_string = "";
1476				if ($visualization_data_field_width > 2) {
1477				$plot->gnuplot_cmd( "set noclip" );
1478				$plot->gnuplot_cmd( "set pointsize 2" );
1479				foreach my $i (0..$K-1) {
1480				my $j = $i + 1;
1481				$arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster (naive Bayes) $i\" with points lt $j pt $j, ";
1482				}
1483				} elsif ($visualization_data_field_width == 2) {
1484				$plot->gnuplot_cmd( "set noclip" );
1485				$plot->gnuplot_cmd( "set pointsize 2" );
1486				foreach my $i (0..$K-1) {
1487				my $j = $i + 1;
1488				$arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster (naive Bayes) $i\" with points lt $j pt $j, ";
1489				my $ellipse_filename = "__contour_" . $i . ".dat";
1490				$arg_string .= "\"$ellipse_filename\" with line lt $j title \"\", ";
1491				}
1492				} elsif ($visualization_data_field_width == 1 ) {
1493				open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!\n";
1494				my @all_data = ;
1495				close INPUT;
1496				@all_data = map {chomp $_; $_ =~ /\d/ ? $_ : "SEPERATOR" } @all_data;
1497				my $all_joined_data = join ':', @all_data;
1498				my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data;
1499				my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals);
1500				foreach my $i (0..@separated-1) {
1501				$separated[$i] =~ s/SEPERATOR//g;
1502				my @cluster_for_hist = split /:/, $separated[$i];
1503				@cluster_for_hist = grep $_, @cluster_for_hist;
1504				my ($minval,$maxval) = minmax(\@cluster_for_hist);
1505				push @all_minvals, $minval;
1506				push @all_maxvals, $maxval;
1507				push @all_clusters_for_hist, \@cluster_for_hist;
1508				}
1509				push @all_minmaxvals, @all_minvals;
1510				push @all_minmaxvals, @all_maxvals;
1511				my ($abs_minval,$abs_maxval) = minmax(\@all_minmaxvals);
1512				my $delta = ($abs_maxval - $abs_minval) / 100.0;
1513				$plot->gnuplot_cmd("set boxwidth 3");
1514				$plot->gnuplot_cmd("set style fill solid border -1");
1515				$plot->gnuplot_cmd("set ytics out nomirror");
1516				$plot->gnuplot_cmd("set style data histograms");
1517				$plot->gnuplot_cmd("set style histogram clustered");
1518				$plot->gnuplot_cmd("set title 'Clusters shown through histograms'");
1519				$plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
1520				foreach my $cindex (0..@all_clusters_for_hist-1) {
1521				my $filename = basename($master_datafile);
1522				my $temp_file = "__temp1dhist_" . "$cindex" . "_" . $filename;
1523				unlink $temp_file if -e $temp_file;
1524				open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!\n";
1525				print OUTPUT "Xstep histval\n";
1526				my @histogram = (0) x 100;
1527				foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) {
1528				$histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++;
1529				}
1530				foreach my $i (0..@histogram-1) {
1531				print OUTPUT "$i $histogram[$i]\n";
1532				}
1533				# $arg_string .= "\"$temp_file\" using 1:2 ti col smooth frequency with boxes lc $cindex, ";
1534				$arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, ";
1535				close OUTPUT;
1536				}
1537				}
1538				$arg_string = $arg_string =~ /^(.*),[ ]+$/;
1539				$arg_string = $1;
1540				if ($visualization_data_field_width > 2) {
1541				$plot->gnuplot_cmd( 'set terminal png color',
1542				'set output "cluster_plot.png"');
1543				$plot->gnuplot_cmd( "splot $arg_string" );
1544				} elsif ($visualization_data_field_width == 2) {
1545				$plot->gnuplot_cmd('set terminal png',
1546				'set output "cluster_plot.png"');
1547				$plot->gnuplot_cmd( "plot $arg_string" );
1548				} elsif ($visualization_data_field_width == 1) {
1549				$plot->gnuplot_cmd('set terminal png',
1550				'set output "cluster_plot.png"');
1551				$plot->gnuplot_cmd( "plot $arg_string" );
1552				}
1553				}
1554
1555				# This method is for the visualization of the posterior class distributions. In
1556				# other words, this method allows us to see the soft clustering produced by the EM
1557				# algorithm. While much of the gnuplot logic here is the same as in the
1558				# visualize_clusters() method, there are significant differences in how the data is
1559				# pooled for the purpose of display.
1560				sub visualize_distributions {
1561				my $self = shift;
1562				my $v_mask;
1563				my $pause_time;
1564				if (@_ == 1) {
1565				$v_mask = shift \|\| die "visualization mask missing";
1566				} elsif (@_ == 2) {
1567				$v_mask = shift \|\| die "visualization mask missing";
1568				$pause_time = shift;
1569				} else {
1570				die "visualize_distributions() called with wrong args";
1571				}
1572				my @v_mask = split //, $v_mask;
1573				my $visualization_mask_width = @v_mask;
1574				my $original_data_mask = $self->{_mask};
1575				my @mask = split //, $original_data_mask;
1576				my $data_field_width = scalar grep {$_ eq '1'} @mask;
1577				die "\n\nABORTED: The width of the visualization mask (including " .
1578				"all its 1s and 0s) must equal the width of the original mask " .
1579				"used for reading the data file (counting only the 1's)"
1580				if $visualization_mask_width != $data_field_width;
1581				my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
1582				if ($visualization_data_field_width == 2) {
1583				foreach my $cluster_index (0..$self->{_K}-1) {
1584				my $contour_filename = "__contour2_" . $cluster_index . ".dat";
1585				my $mean = $self->{_cluster_means}->[$cluster_index];
1586				my $covariance = $self->{_cluster_covariances}->[$cluster_index];
1587				my ($mux,$muy) = $mean->as_list();
1588				my ($varx,$sigmaxy) = $covariance->row(0)->as_list();
1589				my ($sigmayx,$vary) = $covariance->row(1)->as_list();
1590				die "Your covariance matrix does not look right"
1591				unless $sigmaxy == $sigmayx;
1592				my ($sigmax,$sigmay) = (sqrt($varx),sqrt($vary));
1593				my $argstring = <<"END";
1594				set contour
1595				mux = $mux
1596				muy = $muy
1597				sigmax = $sigmax
1598				sigmay = $sigmay
1599				sigmaxy = $sigmaxy
1600				determinant = (sigmax*2)(sigmay2) - sigmaxy2
1601				exponent(x,y) = -0.5 * (1.0 / determinant) * ( ((x-mux)*2)sigmay2 + ((y-muy)2)sigmax2 - 2sigmaxy(x-mux)(y-muy) )
1602				f(x,y) = exp( exponent(x,y) ) - 0.2
1603				xmax = mux + 2 * sigmax
1604				xmin = mux - 2 * sigmax
1605				ymax = muy + 2 * sigmay
1606				ymin = muy - 2 * sigmay
1607				set xrange [ xmin : xmax ]
1608				set yrange [ ymin : ymax ]
1609				set isosamples 200
1610				unset surface
1611				set cntrparam levels discrete 0
1612				set table \"$contour_filename\"
1613				splot f(x,y)
1614				unset table
1615				END
1616				my $plot = Graphics::GnuplotIF->new();
1617				$plot->gnuplot_cmd( $argstring );
1618				}
1619				}
1620				my %visualization_data;
1621				while ( my ($record_id, $data) = each %{$self->{_data}} ) {
1622				my @fields = @$data;
1623				die "\nABORTED: Visualization mask size exceeds data record size"
1624				if $#v_mask > $#fields;
1625				my @data_fields;
1626				foreach my $i (0..@fields-1) {
1627				if ($v_mask[$i] eq '0') {
1628				next;
1629				} elsif ($v_mask[$i] eq '1') {
1630				push @data_fields, $fields[$i];
1631				} else {
1632				die "Misformed visualization mask. It can only have 1s and 0s";
1633				}
1634				}
1635				$visualization_data{ $record_id } = \@data_fields;
1636				}
1637				my $filename = basename($self->{_datafile});
1638				my $temp_file = "__temp2_" . $filename;
1639				unlink $temp_file if -e $temp_file;
1640				open OUTPUT, ">$temp_file"
1641				or die "Unable to open a temp file in this directory: $!\n";
1642				my @class_distributions;
1643				foreach my $cluster_index (0..$self->{_K}-1) {
1644				push @class_distributions, [];
1645				}
1646				foreach my $data_tag (@{$self->{_data_id_tags}}) {
1647				foreach my $cluster_index (0..$self->{_K}-1) {
1648				push @{$class_distributions[$cluster_index]}, $self->{_data}->{$data_tag}
1649				if $self->{_expected_class_probs}->{$data_tag}->[$cluster_index] > 0.2;
1650				}
1651				}
1652				foreach my $distribution (@class_distributions) {
1653				foreach my $item (@$distribution) {
1654				print OUTPUT "@$item";
1655				print OUTPUT "\n";
1656				}
1657				print OUTPUT "\n\n";
1658				}
1659				close OUTPUT;
1660				my $plot;
1661				if (!defined $pause_time) {
1662				$plot = Graphics::GnuplotIF->new( persist => 1 );
1663				} else {
1664				$plot = Graphics::GnuplotIF->new();
1665				}
1666				my $arg_string = "";
1667				if ($visualization_data_field_width > 2) {
1668				$plot->gnuplot_cmd( "set noclip" );
1669				$plot->gnuplot_cmd( "set pointsize 2" );
1670				foreach my $i (0..$self->{_K}-1) {
1671				my $j = $i + 1;
1672				$arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster $i (based on posterior probs)\" with points lt $j pt $j, ";
1673				}
1674				} elsif ($visualization_data_field_width == 2) {
1675				$plot->gnuplot_cmd( "set noclip" );
1676				$plot->gnuplot_cmd( "set pointsize 2" );
1677				foreach my $i (0..$self->{_K}-1) {
1678				my $j = $i + 1;
1679				$arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster $i (based on posterior probs)\" with points lt $j pt $j, ";
1680				my $ellipse_filename = "__contour2_" . $i . ".dat";
1681				$arg_string .= "\"$ellipse_filename\" with line lt $j title \"\", ";
1682				}
1683				} elsif ($visualization_data_field_width == 1 ) {
1684				open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!\n";
1685				my @all_data = ;
1686				close INPUT;
1687				@all_data = map {chomp $_; $_ =~ /\d/ ? $_ : "SEPERATOR" } @all_data;
1688				my $all_joined_data = join ':', @all_data;
1689				my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data;
1690				my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals);
1691				foreach my $i (0..@separated-1) {
1692				$separated[$i] =~ s/SEPERATOR//g;
1693				my @cluster_for_hist = split /:/, $separated[$i];
1694				@cluster_for_hist = grep $_, @cluster_for_hist;
1695				my ($minval,$maxval) = minmax(\@cluster_for_hist);
1696				push @all_minvals, $minval;
1697				push @all_maxvals, $maxval;
1698				push @all_clusters_for_hist, \@cluster_for_hist;
1699				}
1700				push @all_minmaxvals, @all_minvals;
1701				push @all_minmaxvals, @all_maxvals;
1702				my ($abs_minval,$abs_maxval) = minmax(\@all_minmaxvals);
1703				my $delta = ($abs_maxval - $abs_minval) / 100.0;
1704				$plot->gnuplot_cmd("set boxwidth 3");
1705				$plot->gnuplot_cmd("set style fill solid border -1");
1706				$plot->gnuplot_cmd("set ytics out nomirror");
1707				$plot->gnuplot_cmd("set style data histograms");
1708				$plot->gnuplot_cmd("set style histogram clustered");
1709				$plot->gnuplot_cmd("set title 'Individual distributions shown through histograms'");
1710				$plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
1711				foreach my $cindex (0..@all_clusters_for_hist-1) {
1712				my $localfilename = basename($filename);
1713				my $temp_file = "__temp1dhist_" . "$cindex" . "_" . $localfilename;
1714				unlink $temp_file if -e $temp_file;
1715				open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!\n";
1716				print OUTPUT "Xstep histval\n";
1717
1718				my @histogram = (0) x 100;
1719				foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) {
1720				$histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++;
1721				}
1722				foreach my $i (0..@histogram-1) {
1723				print OUTPUT "$i $histogram[$i]\n";
1724				}
1725				# $arg_string .= "\"$temp_file\" using 1:2 ti col smooth frequency with boxes lc $cindex, ";
1726				$arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, ";
1727				close OUTPUT;
1728				}
1729				}
1730				$arg_string = $arg_string =~ /^(.*),[ ]+$/;
1731				$arg_string = $1;
1732				if ($visualization_data_field_width > 2) {
1733				$plot->gnuplot_cmd( "splot $arg_string" );
1734				$plot->gnuplot_pause( $pause_time ) if defined $pause_time;
1735				} elsif ($visualization_data_field_width == 2) {
1736				$plot->gnuplot_cmd( "plot $arg_string" );
1737				$plot->gnuplot_pause( $pause_time ) if defined $pause_time;
1738				} elsif ($visualization_data_field_width == 1) {
1739				$plot->gnuplot_cmd( "plot $arg_string" );
1740				$plot->gnuplot_pause( $pause_time ) if defined $pause_time;
1741				}
1742				}
1743
1744				# This method is basically the same as the previous method, except that it is
1745				# intended for making PNG files from the distributions.
1746				sub plot_hardcopy_distributions {
1747				my $self = shift;
1748				my $v_mask;
1749				my $pause_time;
1750				if (@_ == 1) {
1751				$v_mask = shift \|\| die "visualization mask missing";
1752				} elsif (@_ == 2) {
1753				$v_mask = shift \|\| die "visualization mask missing";
1754				$pause_time = shift;
1755				} else {
1756				die "visualize_distributions() called with wrong args";
1757				}
1758				my @v_mask = split //, $v_mask;
1759				my $visualization_mask_width = @v_mask;
1760				my $original_data_mask = $self->{_mask};
1761				my @mask = split //, $original_data_mask;
1762				my $data_field_width = scalar grep {$_ eq '1'} @mask;
1763				die "\n\nABORTED: The width of the visualization mask (including " .
1764				"all its 1s and 0s) must equal the width of the original mask " .
1765				"used for reading the data file (counting only the 1's)"
1766				if $visualization_mask_width != $data_field_width;
1767				my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
1768				if ($visualization_data_field_width == 2) {
1769				foreach my $cluster_index (0..$self->{_K}-1) {
1770				my $contour_filename = "__contour2_" . $cluster_index . ".dat";
1771				my $mean = $self->{_cluster_means}->[$cluster_index];
1772				my $covariance = $self->{_cluster_covariances}->[$cluster_index];
1773				my ($mux,$muy) = $mean->as_list();
1774				my ($varx,$sigmaxy) = $covariance->row(0)->as_list();
1775				my ($sigmayx,$vary) = $covariance->row(1)->as_list();
1776				die "Your covariance matrix does not look right"
1777				unless $sigmaxy == $sigmayx;
1778				my ($sigmax,$sigmay) = (sqrt($varx),sqrt($vary));
1779				my $argstring = <<"END";
1780				set contour
1781				mux = $mux
1782				muy = $muy
1783				sigmax = $sigmax
1784				sigmay = $sigmay
1785				sigmaxy = $sigmaxy
1786				determinant = (sigmax*2)(sigmay2) - sigmaxy2
1787				exponent(x,y) = -0.5 * (1.0 / determinant) * ( ((x-mux)*2)sigmay2 + ((y-muy)2)sigmax2 - 2sigmaxy(x-mux)(y-muy) )
1788				f(x,y) = exp( exponent(x,y) ) - 0.2
1789				xmax = mux + 2 * sigmax
1790				xmin = mux - 2 * sigmax
1791				ymax = muy + 2 * sigmay
1792				ymin = muy - 2 * sigmay
1793				set xrange [ xmin : xmax ]
1794				set yrange [ ymin : ymax ]
1795				set isosamples 200
1796				unset surface
1797				set cntrparam levels discrete 0
1798				set table \"$contour_filename\"
1799				splot f(x,y)
1800				unset table
1801				END
1802				my $plot = Graphics::GnuplotIF->new();
1803				$plot->gnuplot_cmd( $argstring );
1804				}
1805				}
1806				my %visualization_data;
1807				while ( my ($record_id, $data) = each %{$self->{_data}} ) {
1808				my @fields = @$data;
1809				die "\nABORTED: Visualization mask size exceeds data record size"
1810				if $#v_mask > $#fields;
1811				my @data_fields;
1812				foreach my $i (0..@fields-1) {
1813				if ($v_mask[$i] eq '0') {
1814				next;
1815				} elsif ($v_mask[$i] eq '1') {
1816				push @data_fields, $fields[$i];
1817				} else {
1818				die "Misformed visualization mask. It can only have 1s and 0s";
1819				}
1820				}
1821				$visualization_data{ $record_id } = \@data_fields;
1822				}
1823				my $filename = basename($self->{_datafile});
1824				my $temp_file = "__temp2_" . $filename;
1825				unlink $temp_file if -e $temp_file;
1826				open OUTPUT, ">$temp_file"
1827				or die "Unable to open a temp file in this directory: $!\n";
1828				my @class_distributions;
1829				foreach my $cluster_index (0..$self->{_K}-1) {
1830				push @class_distributions, [];
1831				}
1832				foreach my $data_tag (@{$self->{_data_id_tags}}) {
1833				foreach my $cluster_index (0..$self->{_K}-1) {
1834				push @{$class_distributions[$cluster_index]}, $self->{_data}->{$data_tag}
1835				if $self->{_expected_class_probs}->{$data_tag}->[$cluster_index] > 0.2;
1836				}
1837				}
1838				foreach my $distribution (@class_distributions) {
1839				foreach my $item (@$distribution) {
1840				print OUTPUT "@$item";
1841				print OUTPUT "\n";
1842				}
1843				print OUTPUT "\n\n";
1844				}
1845				close OUTPUT;
1846				my $plot;
1847				if (!defined $pause_time) {
1848				$plot = Graphics::GnuplotIF->new( persist => 1 );
1849				} else {
1850				$plot = Graphics::GnuplotIF->new();
1851				}
1852				my $arg_string = "";
1853				if ($visualization_data_field_width > 2) {
1854				$plot->gnuplot_cmd( "set noclip" );
1855				$plot->gnuplot_cmd( "set pointsize 2" );
1856				foreach my $i (0..$self->{_K}-1) {
1857				my $j = $i + 1;
1858				$arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster $i (based on posterior probs)\" with points lt $j pt $j, ";
1859				}
1860				} elsif ($visualization_data_field_width == 2) {
1861				$plot->gnuplot_cmd( "set noclip" );
1862				$plot->gnuplot_cmd( "set pointsize 2" );
1863				foreach my $i (0..$self->{_K}-1) {
1864				my $j = $i + 1;
1865				$arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster $i (based on posterior probs)\" with points lt $j pt $j, ";
1866				my $ellipse_filename = "__contour2_" . $i . ".dat";
1867				$arg_string .= "\"$ellipse_filename\" with line lt $j title \"\", ";
1868				}
1869				} elsif ($visualization_data_field_width == 1 ) {
1870				open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!\n";
1871				my @all_data = ;
1872				close INPUT;
1873				@all_data = map {chomp $_; $_ =~ /\d/ ? $_ : "SEPERATOR" } @all_data;
1874				my $all_joined_data = join ':', @all_data;
1875				my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data;
1876				my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals);
1877				foreach my $i (0..@separated-1) {
1878				$separated[$i] =~ s/SEPERATOR//g;
1879				my @cluster_for_hist = split /:/, $separated[$i];
1880				@cluster_for_hist = grep $_, @cluster_for_hist;
1881				my ($minval,$maxval) = minmax(\@cluster_for_hist);
1882				push @all_minvals, $minval;
1883				push @all_maxvals, $maxval;
1884				push @all_clusters_for_hist, \@cluster_for_hist;
1885				}
1886				push @all_minmaxvals, @all_minvals;
1887				push @all_minmaxvals, @all_maxvals;
1888				my ($abs_minval,$abs_maxval) = minmax(\@all_minmaxvals);
1889				my $delta = ($abs_maxval - $abs_minval) / 100.0;
1890				$plot->gnuplot_cmd("set boxwidth 3");
1891				$plot->gnuplot_cmd("set style fill solid border -1");
1892				$plot->gnuplot_cmd("set ytics out nomirror");
1893				$plot->gnuplot_cmd("set style data histograms");
1894				$plot->gnuplot_cmd("set style histogram clustered");
1895				$plot->gnuplot_cmd("set title 'Individual distributions shown through histograms'");
1896				$plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
1897				foreach my $cindex (0..@all_clusters_for_hist-1) {
1898				my $localfilename = basename($filename);
1899				my $temp_file = "__temp1dhist_" . "$cindex" . "_" . $localfilename;
1900				unlink $temp_file if -e $temp_file;
1901				open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!\n";
1902				print OUTPUT "Xstep histval\n";
1903
1904				my @histogram = (0) x 100;
1905				foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) {
1906				$histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++;
1907				}
1908				foreach my $i (0..@histogram-1) {
1909				print OUTPUT "$i $histogram[$i]\n";
1910				}
1911				$arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, ";
1912				close OUTPUT;
1913				}
1914				}
1915				$arg_string = $arg_string =~ /^(.*),[ ]+$/;
1916				$arg_string = $1;
1917
1918				if ($visualization_data_field_width > 2) {
1919				$plot->gnuplot_cmd( 'set terminal png',
1920				'set output "posterior_prob_plot.png"');
1921				$plot->gnuplot_cmd( "splot $arg_string" );
1922				} elsif ($visualization_data_field_width == 2) {
1923				$plot->gnuplot_cmd( 'set terminal png',
1924				'set output "posterior_prob_plot.png"');
1925				$plot->gnuplot_cmd( "plot $arg_string" );
1926				} elsif ($visualization_data_field_width == 1) {
1927				$plot->gnuplot_cmd( 'set terminal png',
1928				'set output "posterior_prob_plot.png"');
1929				$plot->gnuplot_cmd( "plot $arg_string" );
1930				}
1931				}
1932
1933				# The method shown below should be called only AFTER you have called the method
1934				# read_data_from_file(). The visualize_data() is meant for the visualization of the
1935				# original data in its various 2D or 3D subspaces.
1936				sub visualize_data {
1937				my $self = shift;
1938				my $v_mask = shift \|\| die "visualization mask missing";
1939
1940				my $master_datafile = $self->{_datafile};
1941
1942				my @v_mask = split //, $v_mask;
1943				my $visualization_mask_width = @v_mask;
1944				my $original_data_mask = $self->{_mask};
1945				my @mask = split //, $original_data_mask;
1946				my $data_field_width = scalar grep {$_ eq '1'} @mask;
1947				die "\n\nABORTED: The width of the visualization mask (including " .
1948				"all its 1s and 0s) must equal the width of the original mask " .
1949				"used for reading the data file (counting only the 1's)"
1950				if $visualization_mask_width != $data_field_width;
1951				my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
1952				my %visualization_data;
1953				my $data_source = $self->{_data};
1954				while ( my ($record_id, $data) = each %{$data_source} ) {
1955				my @fields = @$data;
1956				die "\nABORTED: Visualization mask size exceeds data record size"
1957				if $#v_mask > $#fields;
1958				my @data_fields;
1959				foreach my $i (0..@fields-1) {
1960				if ($v_mask[$i] eq '0') {
1961				next;
1962				} elsif ($v_mask[$i] eq '1') {
1963				push @data_fields, $fields[$i];
1964				} else {
1965				die "Misformed visualization mask. It can only have 1s and 0s";
1966				}
1967				}
1968				$visualization_data{ $record_id } = \@data_fields;
1969				}
1970				my $filename = basename($master_datafile);
1971				my $temp_file;
1972				$temp_file = "__temp_data_" . $filename;
1973				unlink $temp_file if -e $temp_file;
1974				open OUTPUT, ">$temp_file"
1975				or die "Unable to open a temp file in this directory: $!\n";
1976				foreach my $datapoint (values %visualization_data) {
1977				print OUTPUT "@$datapoint";
1978				print OUTPUT "\n";
1979				}
1980				close OUTPUT;
1981				my $plot = Graphics::GnuplotIF->new( persist => 1 );
1982				$plot->gnuplot_cmd( "set noclip" );
1983				$plot->gnuplot_cmd( "set pointsize 2" );
1984				my $plot_title = '"original data provided for EM"';
1985				my $arg_string ;
1986				if ($visualization_data_field_width > 2) {
1987				$arg_string = "\"$temp_file\" using 1:2:3 title $plot_title with points lt -1 pt 1";
1988				} elsif ($visualization_data_field_width == 2) {
1989				$arg_string = "\"$temp_file\" using 1:2 title $plot_title with points lt -1 pt 1";
1990				} elsif ($visualization_data_field_width == 1 ) {
1991				open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!\n";
1992				my @all_data = ;
1993				close INPUT;
1994				@all_data = map {chomp $_; $_} @all_data;
1995				@all_data = grep $_, @all_data;
1996				my ($minval,$maxval) = minmax(\@all_data);
1997				my $delta = ($maxval - $minval) / 100.0;
1998				$plot->gnuplot_cmd("set boxwidth 3");
1999				$plot->gnuplot_cmd("set style fill solid border -1");
2000				$plot->gnuplot_cmd("set ytics out nomirror");
2001				$plot->gnuplot_cmd("set style data histograms");
2002				$plot->gnuplot_cmd("set style histogram clustered");
2003				$plot->gnuplot_cmd("set title 'Overall distribution of 1D data'");
2004				$plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
2005				my $localfilename = basename($filename);
2006				my $temp_file = "__temp1dhist_" . $localfilename;
2007				unlink $temp_file if -e $temp_file;
2008				open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!\n";
2009				print OUTPUT "Xstep histval\n";
2010				my @histogram = (0) x 100;
2011				foreach my $i (0..@all_data-1) {
2012				$histogram[int( ($all_data[$i] - $minval) / $delta )]++;
2013				}
2014				foreach my $i (0..@histogram-1) {
2015				print OUTPUT "$i $histogram[$i]\n";
2016				}
2017				$arg_string = "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc rgb 'green'";
2018				close OUTPUT;
2019				}
2020				if ($visualization_data_field_width > 2) {
2021				$plot->gnuplot_cmd( "splot $arg_string" );
2022				} elsif ($visualization_data_field_width == 2) {
2023				$plot->gnuplot_cmd( "plot $arg_string" );
2024				} elsif ($visualization_data_field_width == 1) {
2025				$plot->gnuplot_cmd( "plot $arg_string" );
2026				}
2027				}
2028
2029				# This method is the same as the one shown above, except that it is meant for
2030				# creating PNG files of the displays.
2031				sub plot_hardcopy_data {
2032				my $self = shift;
2033				my $v_mask = shift \|\| die "visualization mask missing";
2034				my $master_datafile = $self->{_datafile};
2035				my @v_mask = split //, $v_mask;
2036				my $visualization_mask_width = @v_mask;
2037				my $original_data_mask = $self->{_mask};
2038				my @mask = split //, $original_data_mask;
2039				my $data_field_width = scalar grep {$_ eq '1'} @mask;
2040				die "\n\nABORTED: The width of the visualization mask (including " .
2041				"all its 1s and 0s) must equal the width of the original mask " .
2042				"used for reading the data file (counting only the 1's)"
2043				if $visualization_mask_width != $data_field_width;
2044				my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
2045				my %visualization_data;
2046				my $data_source = $self->{_data};
2047				while ( my ($record_id, $data) = each %{$data_source} ) {
2048				my @fields = @$data;
2049				die "\nABORTED: Visualization mask size exceeds data record size"
2050				if $#v_mask > $#fields;
2051				my @data_fields;
2052				foreach my $i (0..@fields-1) {
2053				if ($v_mask[$i] eq '0') {
2054				next;
2055				} elsif ($v_mask[$i] eq '1') {
2056				push @data_fields, $fields[$i];
2057				} else {
2058				die "Misformed visualization mask. It can only have 1s and 0s";
2059				}
2060				}
2061				$visualization_data{ $record_id } = \@data_fields;
2062				}
2063				my $filename = basename($master_datafile);
2064				my $temp_file;
2065				$temp_file = "__temp_data_" . $filename;
2066				unlink $temp_file if -e $temp_file;
2067				open OUTPUT, ">$temp_file"
2068				or die "Unable to open a temp file in this directory: $!\n";
2069				foreach my $datapoint (values %visualization_data) {
2070				print OUTPUT "@$datapoint";
2071				print OUTPUT "\n";
2072				}
2073				close OUTPUT;
2074
2075				my $plot = Graphics::GnuplotIF->new( persist => 1 );
2076				$plot->gnuplot_cmd( "set noclip" );
2077				$plot->gnuplot_cmd( "set pointsize 2" );
2078				my $plot_title = '"original data provided for EM"';
2079				my $arg_string ;
2080				if ($visualization_data_field_width > 2) {
2081				$arg_string = "\"$temp_file\" using 1:2:3 title $plot_title with points lt -1 pt 1";
2082				} elsif ($visualization_data_field_width == 2) {
2083				$arg_string = "\"$temp_file\" using 1:2 title $plot_title with points lt -1 pt 1";
2084				} elsif ($visualization_data_field_width == 1 ) {
2085				open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!\n";
2086				my @all_data = ;
2087				close INPUT;
2088				@all_data = map {chomp $_; $_} @all_data;
2089				@all_data = grep $_, @all_data;
2090				my ($minval,$maxval) = minmax(\@all_data);
2091				my $delta = ($maxval - $minval) / 100.0;
2092				$plot->gnuplot_cmd("set boxwidth 3");
2093				$plot->gnuplot_cmd("set style fill solid border -1");
2094				$plot->gnuplot_cmd("set ytics out nomirror");
2095				$plot->gnuplot_cmd("set style data histograms");
2096				$plot->gnuplot_cmd("set style histogram clustered");
2097				$plot->gnuplot_cmd("set title 'Overall distribution of 1D data'");
2098				$plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
2099				my $localfilename = basename($filename);
2100				my $temp_file = "__temp1dhist_" . $localfilename;
2101				unlink $temp_file if -e $temp_file;
2102				open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!\n";
2103				print OUTPUT "Xstep histval\n";
2104				my @histogram = (0) x 100;
2105				foreach my $i (0..@all_data-1) {
2106				$histogram[int( ($all_data[$i] - $minval) / $delta )]++;
2107				}
2108				foreach my $i (0..@histogram-1) {
2109				print OUTPUT "$i $histogram[$i]\n";
2110				}
2111				$arg_string = "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc rgb 'green'";
2112				close OUTPUT;
2113				}
2114				if ($visualization_data_field_width > 2) {
2115				$plot->gnuplot_cmd( 'set terminal png',
2116				'set output "data_scatter_plot.png"');
2117				$plot->gnuplot_cmd( "splot $arg_string" );
2118				} elsif ($visualization_data_field_width == 2) {
2119				$plot->gnuplot_cmd( 'set terminal png',
2120				'set output "data_scatter_plot.png"');
2121				$plot->gnuplot_cmd( "plot $arg_string" );
2122				} elsif ($visualization_data_field_width == 1) {
2123				$plot->gnuplot_cmd( 'set terminal png',
2124				'set output "data_scatter_plot.png"');
2125				$plot->gnuplot_cmd( "plot $arg_string" );
2126				}
2127				}
2128
2129
2130				################### Generating Synthetic Data for Clustering ###################
2131
2132				# The data generated corresponds to a multivariate distribution. The mean and the
2133				# covariance of each Gaussian in the distribution are specified individually in a
2134				# parameter file. The parameter file must also state the prior probabilities to be
2135				# associated with each Gaussian. See the example parameter file param1.txt in the
2136				# examples directory. Just edit this file for your own needs.
2137				#
2138				# The multivariate random numbers are generated by calling the Math::Random module.
2139				# As you would expect, that module will insist that the covariance matrix you
2140				# specify be symmetric and positive definite.
2141				sub cluster_data_generator {
2142				my $class = shift;
2143				die "illegal call of a class method"
2144				unless $class eq 'Algorithm::ExpectationMaximization';
2145				my %args = @_;
2146				my $input_parameter_file = $args{input_parameter_file};
2147				my $output_file = $args{output_datafile};
2148				my $N = $args{total_number_of_data_points};
2149				my @all_params;
2150				my $param_string;
2151				if (defined $input_parameter_file) {
2152				open INPUT, $input_parameter_file \|\| "unable to open parameter file: $!";
2153				@all_params = ;
2154				@all_params = grep { $_ !~ /^[ ]*#/ } @all_params;
2155				chomp @all_params;
2156				$param_string = join ' ', @all_params;
2157				} else {
2158				# Just for testing. Used in t/test.t
2159				$param_string = "priors 0.34 0.33 0.33 " .
2160				"cluster 5 0 0 1 0 0 0 1 0 0 0 1 " .
2161				"cluster 0 5 0 1 0 0 0 1 0 0 0 1 " .
2162				"cluster 0 0 5 1 0 0 0 1 0 0 0 1";
2163				}
2164				my ($priors_string) = $param_string =~ /priors(.+?)cluster/;
2165				croak "You did not specify the prior probabilities in the parameter file"
2166				unless $priors_string;
2167				my @priors = split /\s+/, $priors_string;
2168				@priors = grep {/$_num_regex/; $_} @priors;
2169				my $sum = 0;
2170				foreach my $prior (@priors) {
2171				$sum += $prior;
2172				}
2173				croak "Your priors in the parameter file do not add up to 1" unless $sum == 1;
2174				my ($rest_of_string) = $param_string =~ /priors\s$priors_string(.)$/;
2175				my @cluster_strings = split /[ ]cluster[ ]/, $rest_of_string;
2176				@cluster_strings = grep $_, @cluster_strings;
2177
2178				my $K = @cluster_strings;
2179				croak "Too many clusters requested" if $K > 12;
2180				croak "Mismatch between the number of values for priors and the number " .
2181				"of clusters" unless $K == @priors;
2182				my @point_labels = ('a'..'z');
2183				print "Prior probabilities recorded from param file: @priors\n";
2184				print "Number of Gaussians used for the synthetic data: $K\n";
2185				my @means;
2186				my @covariances;
2187				my $data_dimension;
2188				foreach my $i (0..$K-1) {
2189				my @num_strings = split / /, $cluster_strings[$i];
2190				my @cluster_mean = map {/$_num_regex/;$_} split / /, $num_strings[0];
2191				$data_dimension = @cluster_mean;
2192				push @means, \@cluster_mean;
2193				my @covariance_nums = map {/$_num_regex/;$_} split / /, $num_strings[1];
2194				croak "dimensionality error" if @covariance_nums !=
2195				($data_dimension ** 2);
2196				my $cluster_covariance;
2197				foreach my $j (0..$data_dimension-1) {
2198				foreach my $k (0..$data_dimension-1) {
2199				$cluster_covariance->[$j]->[$k] =
2200				$covariance_nums[$j*$data_dimension + $k];
2201				}
2202				}
2203				push @covariances, $cluster_covariance;
2204				}
2205				random_seed_from_phrase( 'hellomellojello' );
2206				my @data_dump;
2207				foreach my $i (0..$K-1) {
2208				my @m = @{shift @means};
2209				my @covar = @{shift @covariances};
2210				my @new_data = Math::Random::random_multivariate_normal(
2211				int($N * $priors[$i]), @m, @covar );
2212				my $p = 0;
2213				my $label = $point_labels[$i];
2214				@new_data = map {unshift @$_, $label.$i; $i++; $_} @new_data;
2215				push @data_dump, @new_data;
2216				}
2217				fisher_yates_shuffle( \@data_dump );
2218				open OUTPUT, ">$output_file";
2219				print OUTPUT "\#Data generated from the parameter file: $input_parameter_file\n"
2220				if $input_parameter_file;
2221				print OUTPUT "\#Total number of data points in this file: $N\n";
2222				print OUTPUT "\#Prior class probabilities for this data: @priors\n";
2223				foreach my $ele (@data_dump) {
2224				foreach my $coord ( @$ele ) {
2225				print OUTPUT "$coord ";
2226				}
2227				print OUTPUT "\n";
2228				}
2229				print "Data written out to file $output_file\n";
2230				close OUTPUT;
2231				}
2232
2233				sub add_point_coords {
2234				my $self = shift;
2235				my @arr_of_ids = @{shift @_}; # array of data element names
2236				my @result;
2237				my $data_dimensionality = $self->{_data_dimensions};
2238				foreach my $i (0..$data_dimensionality-1) {
2239				$result[$i] = 0.0;
2240				}
2241				foreach my $id (@arr_of_ids) {
2242				my $ele = $self->{_data}->{$id};
2243				my $i = 0;
2244				foreach my $component (@$ele) {
2245				$result[$i] += $component;
2246				$i++;
2247				}
2248				}
2249				return \@result;
2250				}
2251
2252				###################### Support Routines ########################
2253
2254				# computer the outer product of two column vectors
2255				sub outer_product {
2256				my $vec1 = shift;
2257				my $vec2 = shift;
2258				my ($nrows1, $ncols1) = ($vec1->rows(), $vec1->cols());
2259				my ($nrows2, $ncols2) = ($vec2->rows(), $vec2->cols());
2260				die "Outer product operation called with non-matching vectors"
2261				unless $ncols1 == 1 && $ncols2 == 1 && $nrows1 == $nrows2;
2262				my @vec_arr1 = $vec1->as_list();
2263				my @vec_arr2 = $vec2->as_list();
2264				my $outer_product = Math::GSL::Matrix->new($nrows1, $nrows2);
2265				foreach my $index (0..$nrows1-1) {
2266				my @new_row = map $vec_arr1[$index] * $_, @vec_arr2;
2267				$outer_product->set_row($index, \@new_row);
2268				}
2269				return $outer_product;
2270				}
2271
2272				sub get_index_at_value {
2273				my $value = shift;
2274				my @array = @{shift @_};
2275				foreach my $i (0..@array-1) {
2276				return $i if $value == $array[$i];
2277				}
2278				}
2279
2280				# This routine is really not necessary in light of the new `~~' operator in Perl.
2281				# Will use the new operator in the next version.
2282				sub vector_equal {
2283				my $vec1 = shift;
2284				my $vec2 = shift;
2285				die "wrong data types for distance calculation\n" if @$vec1 != @$vec2;
2286				foreach my $i (0..@$vec1-1){
2287				return 0 if $vec1->[$i] != $vec2->[$i];
2288				}
2289				return 1;
2290				}
2291
2292				sub compare_array_floats {
2293				my $vec1 = shift;
2294				my $vec2 = shift;
2295				foreach my $i (0..@$vec1-1){
2296				return 0 if abs($vec1->[$i] - $vec2->[$i]) > 0.00001;
2297				}
2298				return 1;
2299				}
2300
2301				# Returns the minimum value and its positional index in an array
2302				sub minimum {
2303				my $arr = shift;
2304				my $min;
2305				my $index;
2306				foreach my $i (0..@{$arr}-1) {
2307				if ( (!defined $min) \|\| ($arr->[$i] < $min) ) {
2308				$index = $i;
2309				$min = $arr->[$i];
2310				}
2311				}
2312				return ($min, $index);
2313				}
2314
2315				sub minmax {
2316				my $arr = shift;
2317				my $min;
2318				my $max;
2319				foreach my $i (0..@{$arr}-1) {
2320				if ( (!defined $min) && (!defined $max) ) {
2321				$min = $arr->[$i];
2322				$max = $arr->[$i];
2323				} elsif ( $arr->[$i] < $min ) {
2324				$min = $arr->[$i];
2325				} elsif ( $arr->[$i] > $max ) {
2326				$max = $arr->[$i];
2327				}
2328				}
2329				return ($min, $max);
2330				}
2331
2332				# Meant only for constructing a deep copy of an array of
2333				# arrays:
2334				sub deep_copy_AoA {
2335				my $ref_in = shift;
2336				my $ref_out;
2337				foreach my $i (0..@{$ref_in}-1) {
2338				foreach my $j (0..@{$ref_in->[$i]}-1) {
2339				$ref_out->[$i]->[$j] = $ref_in->[$i]->[$j];
2340				}
2341				}
2342				return $ref_out;
2343				}
2344
2345				# Meant only for constructing a deep copy of an array of arrays for the case when
2346				# some elements of the top-level array may be undefined:
2347				sub deep_copy_AoA_with_nulls {
2348				my $ref_in = shift;
2349				my $ref_out;
2350				foreach my $i (0..@{$ref_in}-1) {
2351				if ( !defined $ref_in->[$i] ) {
2352				$ref_out->[$i] = undef;
2353				next;
2354				}
2355				foreach my $j (0..@{$ref_in->[$i]}-1) {
2356				$ref_out->[$i]->[$j] = $ref_in->[$i]->[$j];
2357				}
2358				}
2359				return $ref_out;
2360				}
2361
2362				# Meant only for constructing a deep copy of a hash in which each value is an
2363				# anonymous array of numbers:
2364				sub deep_copy_hash {
2365				my $ref_in = shift;
2366				my $ref_out;
2367				while ( my ($key, $value) = each( %$ref_in ) ) {
2368				$ref_out->{$key} = deep_copy_array( $value );
2369				}
2370				return $ref_out;
2371				}
2372
2373				# Meant only for an array of numbers:
2374				sub deep_copy_array {
2375				my $ref_in = shift;
2376				my $ref_out;
2377				foreach my $i (0..@{$ref_in}-1) {
2378				$ref_out->[$i] = $ref_in->[$i];
2379				}
2380				return $ref_out;
2381				}
2382
2383				# from perl docs:
2384				sub fisher_yates_shuffle {
2385				my $arr = shift;
2386				my $i = @$arr;
2387				while (--$i) {
2388				my $j = int rand( $i + 1 );
2389				@$arr[$i, $j] = @$arr[$j, $i];
2390				}
2391				}
2392
2393				sub mean_and_variance {
2394				my @data = @{shift @_};
2395				my ($mean, $variance);
2396				foreach my $i (1..@data) {
2397				if ($i == 1) {
2398				$mean = $data[0];
2399				$variance = 0;
2400				} else {
2401				# data[$i-1] because of zero-based indexing of vector
2402				$mean = ( (($i-1)/$i) * $mean ) + $data[$i-1] / $i;
2403				$variance = ( (($i-1)/$i) * $variance )
2404				+ ($data[$i-1]-$mean)**2 / ($i-1);
2405				}
2406				}
2407				return ($mean, $variance);
2408				}
2409
2410				sub check_for_illegal_params {
2411				my @params = @_;
2412				my @legal_params = qw / datafile
2413				mask
2414				K
2415				terminal_output
2416				max_em_iterations
2417				seeding
2418				class_priors
2419				seed_tags
2420				debug
2421				/;
2422				my $found_match_flag;
2423				foreach my $param (@params) {
2424				foreach my $legal (@legal_params) {
2425				$found_match_flag = 0;
2426				if ($param eq $legal) {
2427				$found_match_flag = 1;
2428				last;
2429				}
2430				}
2431				last if $found_match_flag == 0;
2432				}
2433				return $found_match_flag;
2434				}
2435
2436				sub get_value_index_hash {
2437				my $arr = shift;
2438				my %hash;
2439				foreach my $index (0..@$arr-1) {
2440				$hash{$arr->[$index]} = $index if $arr->[$index] > 0;
2441				}
2442				return \%hash;
2443				}
2444
2445				sub non_maximum_supression {
2446				my $arr = shift;
2447				my @output = (0) x @$arr;
2448				my @final_output = (0) x @$arr;
2449				my %hash;
2450				my @array_of_runs = ([$arr->[0]]);
2451				foreach my $index (1..@$arr-1) {
2452				if ($arr->[$index] == $arr->[$index-1]) {
2453				push @{$array_of_runs[-1]}, $arr->[$index];
2454				} else {
2455				push @array_of_runs, [$arr->[$index]];
2456				}
2457				}
2458				my $runstart_index = 0;
2459				foreach my $run_index (1..@array_of_runs-2) {
2460				$runstart_index += @{$array_of_runs[$run_index-1]};
2461				if ($array_of_runs[$run_index]->[0] >
2462				$array_of_runs[$run_index-1]->[0] &&
2463				$array_of_runs[$run_index]->[0] >
2464				$array_of_runs[$run_index+1]->[0]) {
2465				my $run_center = @{$array_of_runs[$run_index]} / 2;
2466				my $assignment_index = $runstart_index + $run_center;
2467				$output[$assignment_index] = $arr->[$assignment_index];
2468				}
2469				}
2470				if ($array_of_runs[-1]->[0] > $array_of_runs[-2]->[0]) {
2471				$runstart_index += @{$array_of_runs[-2]};
2472				my $run_center = @{$array_of_runs[-1]} / 2;
2473				my $assignment_index = $runstart_index + $run_center;
2474				$output[$assignment_index] = $arr->[$assignment_index];
2475				}
2476				if ($array_of_runs[0]->[0] > $array_of_runs[1]->[0]) {
2477				my $run_center = @{$array_of_runs[0]} / 2;
2478				$output[$run_center] = $arr->[$run_center];
2479				}
2480				return \@output;
2481				}
2482
2483				sub display_matrix {
2484				my $message = shift;
2485				my $matrix = shift;
2486				if (!defined blessed($matrix)) {
2487				print "display_matrix called on a scalar value: $matrix\n";
2488				return;
2489				}
2490				my $nrows = $matrix->rows();
2491				my $ncols = $matrix->cols();
2492				print "$message ($nrows rows and $ncols columns)\n";
2493				foreach my $i (0..$nrows-1) {
2494				my $row = $matrix->row($i);
2495				my @row_as_list = $row->as_list;
2496				print "@row_as_list\n";
2497				}
2498				print "\n";
2499				}
2500
2501				sub transpose {
2502				my $matrix = shift;
2503				my $num_rows = $matrix->rows();
2504				my $num_cols = $matrix->cols();
2505				my $transpose = Math::GSL::Matrix->new($num_cols, $num_rows);
2506				foreach my $i (0..$num_rows-1) {
2507				my @row = $matrix->row($i)->as_list;
2508				$transpose->set_col($i, \@row );
2509				}
2510				return $transpose;
2511				}
2512
2513				sub vector_multiply {
2514				my $vec1 = shift;
2515				my $vec2 = shift;
2516				die "vec_multiply called with two vectors of different sizes"
2517				unless @$vec1 == @$vec2;
2518				my $result = 0;
2519				foreach my $i (0..@$vec1-1) {
2520				$result += $vec1->[$i] * $vec2->[$i];
2521				}
2522				return $result;
2523				}
2524
2525				sub vector_2_vector_multiply {
2526				my $vec1 = shift;
2527				my $vec2 = shift;
2528				die "vec_multiply called with two vectors of different sizes"
2529				unless @$vec1 == @$vec2;
2530				my @result_vec;
2531				foreach my $i (0..@$vec1-1) {
2532				$result_vec[$i] = $vec1->[$i] * $vec2->[$i];
2533				}
2534				return \@result_vec;
2535				}
2536
2537				sub matrix_multiply {
2538				my $matrix1 = shift;
2539				my $matrix2 = shift;
2540				my ($nrows1, $ncols1) = ($matrix1->rows(), $matrix1->cols());
2541				my ($nrows2, $ncols2) = ($matrix2->rows(), $matrix2->cols());
2542				die "matrix multiplication called with non-matching matrix arguments"
2543				# unless $ncols1 == $nrows2;
2544				unless $nrows1 == $ncols2 && $ncols1 == $nrows2;
2545				if ($nrows1 == 1) {
2546				my @row = $matrix1->row(0)->as_list;
2547				my @col = $matrix2->col(0)->as_list;
2548				my $result;
2549				foreach my $j (0..$ncols1-1) {
2550				$result += $row[$j] * $col[$j];
2551				}
2552				return $result;
2553				}
2554				my $product = Math::GSL::Matrix->new($nrows1, $nrows1);
2555				foreach my $i (0..$nrows1-1) {
2556				my $row = $matrix1->row($i);
2557				my @product_row;
2558				foreach my $j (0..$ncols2-1) {
2559				my $col = $matrix2->col($j);
2560				my $row_times_col = matrix_multiply($row, $col);
2561				push @product_row, $row_times_col;
2562				}
2563				$product->set_row($i, \@product_row);
2564				}
2565				return $product;
2566				}
2567
2568				sub vector_matrix_multiply {
2569				my $matrix1 = shift;
2570				my $matrix2 = shift;
2571				my ($nrows1, $ncols1) = ($matrix1->rows, $matrix1->cols);
2572				my ($nrows2, $ncols2) = ($matrix2->rows, $matrix2->cols);
2573				die "matrix multiplication called with non-matching matrix arguments"
2574				unless $nrows1 == 1 && $ncols1 == $nrows2;
2575				if ($ncols2 == 1) {
2576				my @row = $matrix1->row(0)->as_list;
2577				my @col = $matrix2->col(0)->as_list;
2578				my $result;
2579				foreach my $j (0..$ncols1-1) {
2580				$result += $row[$j] * $col[$j];
2581				}
2582				return $result;
2583				}
2584				my $product = Math::GSL::Matrix->new(1, $ncols2);
2585				my $row = $matrix1->row(0);
2586				my @product_row;
2587				foreach my $j (0..$ncols2-1) {
2588				my $col = $matrix2->col($j);
2589				my $row_times_col = vector_matrix_multiply($row, $col);
2590				push @product_row, $row_times_col;
2591				}
2592				$product->set_row(0, \@product_row);
2593				return $product;
2594				}
2595
2596				sub matrix_vector_multiply {
2597				my $matrix1 = shift;
2598				my $matrix2 = shift;
2599				my ($nrows1, $ncols1) = ($matrix1->rows(), $matrix1->cols());
2600				my ($nrows2, $ncols2) = ($matrix2->rows(), $matrix2->cols());
2601				die "matrix multiplication called with non-matching matrix arguments"
2602				unless $ncols1 == $nrows2 && $ncols2 == 1;
2603				if ($nrows1 == 1) {
2604				my @row = $matrix1->row(0)->as_list;
2605				my @col = $matrix2->col(0)->as_list;
2606				my $result;
2607				foreach my $j (0..$ncols1-1) {
2608				$result += $row[$j] * $col[$j];
2609				}
2610				return $result;
2611				}
2612				my $product = Math::GSL::Matrix->new($nrows1, 1);
2613				my $col = $matrix2->col(0);
2614				my @product_col;
2615				foreach my $i (0..$nrows1-1) {
2616				my $row = $matrix1->row($i);
2617				my $row_times_col = matrix_vector_multiply($row, $col);
2618				push @product_col, $row_times_col;
2619				}
2620				$product->set_col(0, \@product_col);
2621				return $product;
2622				}
2623
2624				sub matrix_trace {
2625				my $matrix = shift;
2626				my ($nrows, $ncols) = ($matrix->rows(), $matrix->cols());
2627				die "trace can only be calculated for a square matrix"
2628				unless $ncols == $nrows;
2629				my @elements = $matrix->as_list;
2630				my $trace = 0;
2631				foreach my $i (0..$nrows-1) {
2632				$trace += $elements[$i + $i * $ncols];
2633				}
2634				return $trace;
2635				}
2636
2637				1;
2638
2639				=pod
2640				=head1 NAME
2641
2642				Algorithm::ExpectationMaximization -- A pure-Perl implementation for clustering
2643				numerical multi-dimensional data with the Expectation-Maximization algorithm.
2644
2645				=head1 SYNOPSIS
2646
2647				use Algorithm::ExpectationMaximization;
2648
2649				# First name the data file:
2650
2651				my $datafile = "mydatafile1.dat";
2652
2653				# Next, set the mask to indicate which columns of the datafile to use for
2654				# clustering and which column contains a symbolic ID for each data record. For
2655				# example, if the symbolic name is in the first column, you want the second column
2656				# to be ignored, and you want the next three columns to be used for 3D clustering:
2657
2658				my $mask = "N0111";
2659
2660				# Now construct an instance of the clusterer. The parameter `K' controls the
2661				# number of clusters. Here is an example call to the constructor for instance
2662				# creation:
2663
2664				my $clusterer = Algorithm::ExpectationMaximization->new(
2665				datafile => $datafile,
2666				mask => $mask,
2667				K => 3,
2668				max_em_iterations => 300,
2669				seeding => 'random',
2670				terminal_output => 1,
2671				debug => 0,
2672				);
2673
2674				# Note the choice for `seeding'. The choice `random' means that the clusterer will
2675				# randomly select `K' data points to serve as initial cluster centers. Other
2676				# possible choices for the constructor parameter `seeding' are `kmeans' and
2677				# `manual'. With the `kmeans' option for `seeding', the output of a K-means
2678				# clusterer is used for the cluster seeds and the initial cluster covariances. If
2679				# you use the `manual' option for seeding, you must also specify the data elements
2680				# to use for seeding the clusters.
2681
2682				# Here is an example of a call to the constructor when we choose the `manual'
2683				# option for seeding the clusters and for specifying the data elements for
2684				# seeding. The data elements are specified by their tag names. In this case,
2685				# these names are `a26', `b53', and `c49':
2686
2687				my $clusterer = Algorithm::ExpectationMaximization->new(
2688				datafile => $datafile,
2689				mask => $mask,
2690				class_priors => [0.6, 0.2, 0.2],
2691				K => 3,
2692				max_em_iterations => 300,
2693				seeding => 'manual',
2694				seed_tags => ['a26', 'b53', 'c49'],
2695				terminal_output => 1,
2696				debug => 0,
2697				);
2698
2699				# This example call to the constructor also illustrates how you can inject class
2700				# priors into the clustering process. The class priors are the prior probabilities
2701				# of the class distributions in your dataset. As explained later, injecting class
2702				# priors in the manner shown above makes statistical sense only for the case of
2703				# manual seeding. When you do inject class priors, the order in which the priors
2704				# are expressed must correspond to the manually specified seeds for the clusters.
2705
2706				# After the invocation of the constructor, the following calls are mandatory
2707				# for reasons that should be obvious from the names of the methods:
2708
2709				$clusterer->read_data_from_file();
2710				srand(time);
2711				$clusterer->seed_the_clusters();
2712				$clusterer->EM();
2713				$clusterer->run_bayes_classifier();
2714				my $clusters = $clusterer->return_disjoint_clusters();
2715
2716				# where the call to `EM()' is the invocation of the expectation-maximization
2717				# algorithm. The call to `srand(time)' is to seed the pseudo random number
2718				# generator afresh for each run of the cluster seeding procedure. If you want to
2719				# see repeatable results from one run to another of the algorithm with random
2720				# seeding, you would obviously not invoke `srand(time)'.
2721
2722				# The call `run_bayes_classifier()' shown above carries out a disjoint clustering
2723				# of all the data points using the naive Bayes' classifier. And the call
2724				# `return_disjoint_clusters()' returns the clusters thus formed to you. Once you
2725				# have obtained access to the clusters in this manner, you can display them in
2726				# your terminal window by
2727
2728				foreach my $index (0..@$clusters-1) {
2729				print "Cluster $index (Naive Bayes): @{$clusters->[$index]}\n\n"
2730				}
2731
2732				# If you would like to also see the clusters purely on the basis of the posterior
2733				# class probabilities exceeding a threshold, call
2734
2735				my $theta1 = 0.2;
2736				my $posterior_prob_clusters =
2737				$clusterer->return_clusters_with_posterior_probs_above_threshold($theta1);
2738
2739				# where you can obviously set the threshold $theta1 to any value you wish. Note
2740				# that now you may end up with clusters that overlap. You can display them in
2741				# your terminal window in the same manner as shown above for the naive Bayes'
2742				# clusters.
2743
2744				# You can write the naive Bayes' clusters out to files, one cluster per file, by
2745				# calling
2746
2747				$clusterer->write_naive_bayes_clusters_to_files();
2748
2749				# The clusters are placed in files with names like
2750
2751				naive_bayes_cluster1.dat
2752				naive_bayes_cluster2.dat
2753				...
2754
2755				# In the same manner, you can write out the posterior probability based possibly
2756				# overlapping clusters to files by calling:
2757
2758				$clusterer->write_posterior_prob_clusters_above_threshold_to_files($theta1);
2759
2760				# where the threshold $theta1 sets the probability threshold for deciding which
2761				# data elements to place in a cluster. These clusters are placed in files with
2762				# names like
2763
2764				posterior_prob_cluster1.dat
2765				posterior_prob_cluster2.dat
2766				...
2767
2768				# CLUSTER VISUALIZATION:
2769
2770				# You must first set the mask for cluster visualization. This mask tells the
2771				# module which 2D or 3D subspace of the original data space you wish to visualize
2772				# the clusters in:
2773
2774				my $visualization_mask = "111";
2775				$clusterer->visualize_clusters($visualization_mask);
2776				$clusterer->visualize_distributions($visualization_mask);
2777				$clusterer->plot_hardcopy_clusters($visualization_mask);
2778				$clusterer->plot_hardcopy_distributions($visualization_mask);
2779
2780				# where the last two invocations are for writing out the PNG plots of the
2781				# visualization displays to disk files. The PNG image of the posterior
2782				# probability distributions is written out to a file named posterior_prob_plot.png
2783				# and the PNG image of the disjoint clusters to a file called cluster_plot.png.
2784
2785				# SYNTHETIC DATA GENERATION:
2786
2787				# The module has been provided with a class method for generating multivariate
2788				# data for experimenting with the EM algorithm. The data generation is controlled
2789				# by the contents of a parameter file that is supplied as an argument to the data
2790				# generator method. The priors, the means, and the covariance matrices in the
2791				# parameter file must be according to the syntax shown in the `param1.txt' file in
2792				# the `examples' directory. It is best to edit a copy of this file for your
2793				# synthetic data generation needs.
2794
2795				my $parameter_file = "param1.txt";
2796				my $out_datafile = "mydatafile1.dat";
2797				Algorithm::ExpectationMaximization->cluster_data_generator(
2798				input_parameter_file => $parameter_file,
2799				output_datafile => $out_datafile,
2800				total_number_of_data_points => $N );
2801
2802				# where the value of $N is the total number of data points you would like to see
2803				# generated for all of the Gaussians. How this total number is divided up amongst
2804				# the Gaussians is decided by the prior probabilities for the Gaussian components
2805				# as declared in input parameter file. The synthetic data may be visualized in a
2806				# terminal window and the visualization written out as a PNG image to a diskfile
2807				# by
2808
2809				my $data_visualization_mask = "11";
2810				$clusterer->visualize_data($data_visualization_mask);
2811				$clusterer->plot_hardcopy_data($data_visualization_mask);
2812
2813
2814				=head1 CHANGES
2815
2816				Version 1.2 allows the module to also be used for 1-D data. The visualization code
2817				for 1-D shows the clusters through their histograms.
2818
2819				Version 1.1 incorporates much cleanup of the documentation associated with the
2820				module. Both the top-level module documentation, especially the Description part,
2821				and the comments embedded in the code were revised for better utilization of the
2822				module. The basic implementation code remains unchanged.
2823
2824
2825				=head1 DESCRIPTION
2826
2827				B is a I module for the
2828				Expectation-Maximization (EM) method of clustering numerical data that lends itself
2829				to modeling as a Gaussian mixture. Since the module is entirely in Perl (in the
2830				sense that it is not a Perl wrapper around a C library that actually does the
2831				clustering), the code in the module can easily be modified to experiment with several
2832				aspects of EM.
2833
2834				Gaussian Mixture Modeling (GMM) is based on the assumption that the data consists of
2835				C Gaussian components, each characterized by its own mean vector and its own
2836				covariance matrix. Obviously, given observed data for clustering, we do not know
2837				which of the C Gaussian components was responsible for any of the data elements.
2838				GMM also associates a prior probability with each Gaussian component. In general,
2839				these priors will also be unknown. So the problem of clustering consists of
2840				estimating the posterior class probability at each data element and also estimating
2841				the class priors. Once these posterior class probabilities and the priors are
2842				estimated with EM, we can use the naive Bayes' classifier to partition the data into
2843				disjoint clusters. Or, for "soft" clustering, we can find all the data elements that
2844				belong to a Gaussian component on the basis of the posterior class probabilities at
2845				the data elements exceeding a prescribed threshold.
2846
2847				If you do not mind the fact that it is possible for the EM algorithm to occasionally
2848				get stuck in a local maximum and to, therefore, produce a wrong answer even when you
2849				know the data to be perfectly multimodal Gaussian, EM is probably the most magical
2850				approach to clustering multidimensional data. Consider the case of clustering
2851				three-dimensional data. Each Gaussian cluster in 3D space is characterized by the
2852				following 10 variables: the 6 unique elements of the C<3x3> covariance matrix (which
2853				must be symmetric positive-definite), the 3 unique elements of the mean, and the
2854				prior associated with the Gaussian. Now let's say you expect to see six Gaussians in
2855				your data. What that means is that you would want the values for 59 variables
2856				(remember the unit-summation constraint on the class priors which reduces the overall
2857				number of variables by one) to be estimated by the algorithm that seeks to discover
2858				the clusters in your data. What's amazing is that despite the large number of
2859				variables that are being simultaneously optimized, the chances are that the EM
2860				algorithm will give you a good approximation to the right answer.
2861
2862				At its core, EM depends on the notion of unobserved data and the averaging of the
2863				log-likelihood of the data actually observed over all admissible probabilities for
2864				the unobserved data. But what is unobserved data? While in some cases where EM is
2865				used, the unobserved data is literally the missing data, in others, it is something
2866				that cannot be seen directly but that nonetheless is relevant to the data actually
2867				observed. For the case of clustering multidimensional numerical data that can be
2868				modeled as a Gaussian mixture, it turns out that the best way to think of the
2869				unobserved data is in terms of a sequence of random variables, one for each observed
2870				data point, whose values dictate the selection of the Gaussian for that data point.
2871				This point is explained in great detail in my on-line tutorial at
2872				L.
2873
2874				The EM algorithm in our context reduces to an iterative invocation of the following
2875				steps: (1) Given the current guess for the means and the covariances of the different
2876				Gaussians in our mixture model, use Bayes' Rule to update the posterior class
2877				probabilities at each of the data points; (2) Using the updated posterior class
2878				probabilities, first update the class priors; (3) Using the updated class priors,
2879				update the class means and the class covariances; and go back to Step (1). Ideally,
2880				the iterations should terminate when the expected log-likelihood of the observed data
2881				has reached a maximum and does not change with any further iterations. The stopping
2882				rule used in this module is the detection of no change over three consecutive
2883				iterations in the values calculated for the priors.
2884
2885				This module provides three different choices for seeding the clusters: (1) random,
2886				(2) kmeans, and (3) manual. When random seeding is chosen, the algorithm randomly
2887				selects C data elements as cluster seeds. That is, the data vectors associated
2888				with these seeds are treated as initial guesses for the means of the Gaussian
2889				distributions. The covariances are then set to the values calculated from the entire
2890				dataset with respect to the means corresponding to the seeds. With kmeans seeding, on
2891				the other hand, the means and the covariances are set to whatever values are returned
2892				by the kmeans algorithm. And, when seeding is set to manual, you are allowed to
2893				choose C data elements --- by specifying their tag names --- for the seeds. The
2894				rest of the EM initialization for the manual mode is the same as for the random mode.
2895				The algorithm allows for the initial priors to be specified for the manual mode of
2896				seeding.
2897
2898				Much of code for the kmeans based seeding of EM was drawn from the
2899				C module by me. The code from that module used here corresponds to
2900				the case when the C option in the C module is set
2901				to C. The C option for KMeans consists of subjecting the data to a
2902				principal components analysis (PCA) to discover the direction of maximum variance in
2903				the data space. The data points are then projected on to this direction and a
2904				histogram constructed from the projections. Centers of the C largest peaks in
2905				this smoothed histogram are used to seed the KMeans based clusterer. As you'd
2906				expect, the output of the KMeans used to seed the EM algorithm.
2907
2908				This module uses two different criteria to measure the quality of the clustering
2909				achieved. The first is the Minimum Description Length (MDL) proposed originally by
2910				Rissanen (J. Rissanen: "Modeling by Shortest Data Description," Automatica, 1978, and
2911				"A Universal Prior for Integers and Estimation by Minimum Description Length," Annals
2912				of Statistics, 1983.) The MDL criterion is a difference of a log-likelihood term for
2913				all of the observed data and a model-complexity penalty term. In general, both the
2914				log-likelihood and the model-complexity terms increase as the number of clusters
2915				increases. The form of the MDL criterion in this module uses for the penalty term
2916				the Bayesian Information Criterion (BIC) of G. Schwartz, "Estimating the Dimensions
2917				of a Model," The Annals of Statistics, 1978. In general, the smaller the value of
2918				MDL quality measure, the better the clustering of the data.
2919
2920				For our second measure of clustering quality, we use `trace( SW^-1 . SB)' where SW is
2921				the within-class scatter matrix, more commonly denoted S_w, and SB the between-class
2922				scatter matrix, more commonly denoted S_b (the underscore means subscript). This
2923				measure can be thought of as the normalized average distance between the clusters,
2924				the normalization being provided by average cluster covariance SW^-1. Therefore, the
2925				larger the value of this quality measure, the better the separation between the
2926				clusters. Since this measure has its roots in the Fisher linear discriminant
2927				function, we incorporate the word C in the name of the quality measure.
2928				I When the
2929				clusters exhibit significant overlap, the numbers produced by this quality measure
2930				tend to be generally meaningless.
2931
2932				=head1 METHODS
2933
2934				The module provides the following methods for EM based
2935				clustering, for cluster visualization, for data
2936				visualization, and for the generation of data for testing a
2937				clustering algorithm:
2938
2939				=over
2940
2941				=item B
2942
2943				A call to C constructs a new instance of the
2944				C class. A typical form
2945				of this call when you want to use random option for seeding
2946				the algorithm is:
2947
2948				my $clusterer = Algorithm::ExpectationMaximization->new(
2949				datafile => $datafile,
2950				mask => $mask,
2951				K => 3,
2952				max_em_iterations => 300,
2953				seeding => 'random',
2954				terminal_output => 1,
2955				debug => 0,
2956				);
2957
2958				where C is the expected number of clusters and
2959				C the maximum number of EM iterations
2960				that you want to allow until convergence is achieved.
2961				Depending on your dataset and on the choice of the initial
2962				seeds, the actual number of iterations used could be as few
2963				as 10 and as many as reaching 300. The output produced by
2964				the algorithm shows the actual number of iterations used to
2965				arrive at convergence.
2966
2967				The data file supplied through the C option is
2968				expected to contain entries in the following format
2969
2970				c20 0 10.7087017086940 9.63528386251712 10.9512155258108 ...
2971				c7 0 12.8025925026787 10.6126270065785 10.5228482095349 ...
2972				b9 0 7.60118206283120 5.05889245193079 5.82841781759102 ...
2973				....
2974				....
2975
2976				where the first column contains the symbolic ID tag for each
2977				data record and the rest of the columns the numerical
2978				information. As to which columns are actually used for
2979				clustering is decided by the string value of the mask. For
2980				example, if we wanted to cluster on the basis of the entries
2981				in just the 3rd, the 4th, and the 5th columns above, the
2982				mask value would be C where the character C
2983				indicates that the ID tag is in the first column, the
2984				character C<0> that the second column is to be ignored, and
2985				the C<1>'s that follow that the 3rd, the 4th, and the 5th
2986				columns are to be used for clustering.
2987
2988				If instead of random seeding, you wish to use the kmeans
2989				based seeding, just replace the option C supplied
2990				for C by C. You can also do manual seeding
2991				by designating a specified set of data elements to serve as
2992				cluster seeds. The call to the constructor in this case
2993				looks like
2994
2995				my $clusterer = Algorithm::ExpectationMaximization->new(
2996				datafile => $datafile,
2997				mask => $mask,
2998				K => 3,
2999				max_em_iterations => 300,
3000				seeding => 'manual',
3001				seed_tags => ['a26', 'b53', 'c49'],
3002				terminal_output => 1,
3003				debug => 0,
3004				);
3005
3006				where the option C is set to an anonymous array
3007				of symbolic names associated with the data elements.
3008
3009				If you know the class priors, you can supply them through an
3010				additional option to the constructor that looks like
3011
3012				class_priors => [0.6, 0.2, 0.2],
3013
3014				for the case of C equal to 3. B
3015				be a useful thing to do only for the case of manual
3016				seeding.> If you go for manual seeding, the order in which
3017				the priors are expressed should correspond to the order of
3018				the manually chosen tags supplied through the C
3019				option.
3020
3021				Note that the parameter C is boolean; when
3022				not supplied in the call to C it defaults to 0. When
3023				set, this parameter displays useful information in the
3024				window of the terminal screen in which you invoke the
3025				algorithm.
3026
3027				=item B
3028
3029				$clusterer->read_data_from_file()
3030
3031				This is a required call after the constructor is invoked. As
3032				you would expect, this call reads in the data for
3033				clustering.
3034
3035				=item B
3036
3037				$clusterer->seed_the_clusters();
3038
3039				This is also a required call. It processes the option you
3040				supplied for C in the constructor call to choose
3041				the data elements for seeding the C clusters.
3042
3043				=item B
3044
3045				$clusterer->EM();
3046
3047				This is the workhorse of the module, as you would expect.
3048				The means, the covariances, and the priors estimated by this
3049				method are stored in instance variables that are subsequently
3050				accessed by other methods for the purpose of displaying the
3051				clusters, the probability distributions, etc.
3052
3053				=item B
3054
3055				$clusterer->run_bayes_classifier();
3056
3057				Using the posterior probability distributions estimated by
3058				the C method, this method partitions the data into the
3059				C disjoint clusters using the naive Bayes' classifier.
3060
3061				=item B
3062
3063				my $clusters = $clusterer->return_disjoint_clusters();
3064
3065				This allows you to access the clusters obtained with the
3066				application of the naive Bayes' classifier in your own
3067				scripts. If, say, you wanted to see the data records placed
3068				in each cluster, you could subsequently invoke the following
3069				loop in your own script:
3070
3071				foreach my $index (0..@$clusters-1) {
3072				print "Cluster $index (Naive Bayes): @{$clusters->[$index]}\n\n"
3073				}
3074
3075				where C<$clusters> holds the array reference returned by the
3076				call to C.
3077
3078				=item B
3079
3080				$clusterer->write_naive_bayes_clusters_to_files();
3081
3082				This method writes the clusters obtained by applying the
3083				naive Bayes' classifier to disk files, one cluster per
3084				file. What is written out to each file consists of the
3085				symbolic names of the data records that belong to the
3086				cluster corresponding to that file. The clusters are placed
3087				in files with names like
3088
3089				naive_bayes_cluster1.dat
3090				naive_bayes_cluster2.dat
3091				...
3092
3093				=item B
3094
3095				my $theta1 = 0.2;
3096				my $posterior_prob_clusters =
3097				$clusterer->return_clusters_with_posterior_probs_above_threshold($theta1);
3098
3099				This method returns a reference to an array of C
3100				anonymous arrays, each consisting of the symbolic names for
3101				the data records where the posterior class probability
3102				exceeds the threshold as specified by C<$theta1>.
3103				Subsequently, you can access each of these
3104				posterior-probability based clusters through a loop
3105				construct such as
3106
3107				foreach my $index (0..@$posterior_prob_clusters-1) {
3108				print "Cluster $index (based on posterior probs exceeding $theta1): " .
3109				"@{$posterior_prob_clusters->[$index]}\n\n"
3110				}
3111
3112				=item B
3113
3114				$clusterer->write_posterior_prob_clusters_above_threshold_to_files($theta1);
3115
3116				This call writes out the posterior-probability based soft
3117				clusters to disk files. As in the previous method, the
3118				threshold C<$theta1> sets the probability threshold for
3119				deciding which data elements belong to a cluster. These
3120				clusters are placed in files with names like
3121
3122				posterior_prob_cluster1.dat
3123				posterior_prob_cluster2.dat
3124				...
3125
3126				=item B
3127
3128				my $theta2 = 0.00001;
3129				my $class_distributions =
3130				$clusterer->return_individual_class_distributions_above_given_threshold($theta2);
3131
3132				This is the method to call if you wish to see the individual
3133				Gaussians in your own script. The method returns a reference
3134				to an array of anonymous arrays, with each anonymous array
3135				representing data membership in each Gaussian. Only those
3136				data points are included in each Gaussian where the
3137				probability exceeds the threshold C<$theta2>. Note that the
3138				larger the covariance and the higher the data
3139				dimensionality, the smaller this threshold must be for you
3140				to see any of the data points in a Gaussian. After you have
3141				accessed the Gaussian mixture in this manner, you can
3142				display the data membership in each Gaussian through the
3143				following sort of a loop:
3144
3145				foreach my $index (0..@$class_distributions-1) {
3146				print "Gaussian Distribution $index (only shows data elements " .
3147				"whose probabilities exceed the threshold $theta2: " .
3148				"@{$class_distributions->[$index]}\n\n"
3149				}
3150
3151				=item B
3152
3153				my $visualization_mask = "11";
3154				$clusterer->visualize_clusters($visualization_mask);
3155
3156				The visualization mask here does not have to be identical to
3157				the one used for clustering, but must be a subset of that
3158				mask. This is convenient for visualizing the clusters in
3159				two- or three-dimensional subspaces of the original space.
3160				The subset is specified by placing `0's in the positions
3161				corresponding to the dimensions you do NOT want to see
3162				through visualization. Depending on the mask used, this
3163				method creates a 2D or a 3D scatter plot of the clusters
3164				obtained through the naive Bayes' classification rule.
3165
3166				=item B
3167
3168				$clusterer->visualize_distributions($visualization_mask);
3169
3170				This is the method to call if you want to visualize the soft
3171				clustering corresponding to the posterior class
3172				probabilities exceeding the threshold specified in the call
3173				to
3174				C.
3175				Again, depending on the visualization mask used, you will
3176				see either a 2D plot or a 3D scatter plot.
3177
3178				=item B
3179
3180				$clusterer->plot_hardcopy_clusters($visualization_mask);
3181
3182				This method create a PNG file from the C created
3183				display of the naive Bayes' clusters obtained from the data.
3184				The plotting functionality of C is accessed through
3185				the Perl wrappers provided by the C
3186				module.
3187
3188				=item B
3189
3190				$clusterer->plot_hardcopy_distributions($visualization_mask);
3191
3192				This method create a PNG file from the C created
3193				display of the clusters that correspond to the posterior
3194				class probabilities exceeding a specified threshold. The
3195				plotting functionality of C is accessed through the
3196				Perl wrappers provided by the C module.
3197
3198				=item B
3199
3200				$clusterer->display_fisher_quality_vs_iterations();
3201
3202				This method measures the quality of clustering by
3203				calculating the normalized average squared distance between
3204				the cluster centers, the normalization being provided by the
3205				average cluster covariance. See the Description for further
3206				details. In general, this measure is NOT useful for
3207				overlapping clusters.
3208
3209				=item B
3210
3211				$clusterer->display_mdl_quality_vs_iterations();
3212
3213				At the end of each iteration, this method measures the
3214				quality of clustering my calculating its MDL (Minimum
3215				Description Length). As stated earlier in Description, the
3216				MDL measure is a difference of a log-likelihood term for all
3217				of the observed data and a model-complexity penalty term.
3218				The smaller the value returned by this method, the better
3219				the clustering.
3220
3221				=item B
3222
3223				my $estimated_priors = $clusterer->return_estimated_priors();
3224				print "Estimated class priors: @$estimated_priors\n";
3225
3226				This method can be used to access the final values of the
3227				class priors as estimated by the EM algorithm.
3228
3229				=item B
3230
3231				Algorithm::ExpectationMaximization->cluster_data_generator(
3232				input_parameter_file => $parameter_file,
3233				output_datafile => $out_datafile,
3234				total_number_of_data_points => 300
3235				);
3236
3237				for generating multivariate data for clustering if you wish to play with synthetic
3238				data for experimenting with the EM algorithm. The input parameter file must specify
3239				the priors to be used for the Gaussians, their means, and their covariance matrices.
3240				The format of the information contained in the parameter file must be as shown in the
3241				file C provided in the C directory. It will be easiest for you
3242				to just edit a copy of this file for your data generation needs. In addition to the
3243				format of the parameter file, the main constraint you need to observe in specifying
3244				the parameters is that the dimensionality of the covariance matrices must correspond
3245				to the dimensionality of the mean vectors. The multivariate random numbers are
3246				generated by calling the C module. As you would expect, this module
3247				requires that the covariance matrices you specify in your parameter file be symmetric
3248				and positive definite. Should the covariances in your parameter file not obey this
3249				condition, the C module will let you know.
3250
3251				=item B
3252
3253				$clusterer->visualize_data($data_visualization_mask);
3254
3255				This is the method to call if you want to visualize the data
3256				you plan to cluster with the EM algorithm. You'd need to
3257				specify argument mask in a manner similar to the
3258				visualization of the clusters, as explained earlier.
3259
3260				=item B
3261
3262				$clusterer->plot_hardcopy_data($data_visualization_mask);
3263
3264				This method creates a PNG file that can be used to print out
3265				a hardcopy of the data in different 2D and 3D subspaces of
3266				the data space. The visualization mask is used to select the
3267				subspace for the PNG image.
3268
3269				=back
3270
3271				=head1 HOW THE CLUSTERS ARE OUTPUT
3272
3273				This module produces two different types of clusters: the "hard" clusters and the
3274				"soft" clusters. The hard clusters correspond to the naive Bayes' classification of
3275				the data points on the basis of the Gaussian distributions and the class priors
3276				estimated by the EM algorithm. Such clusters partition the data into disjoint
3277				subsets. On the other hand, the soft clusters correspond to the posterior class
3278				probabilities calculated by the EM algorithm. A data element belongs to a cluster if
3279				its posterior probability for that Gaussian exceeds a threshold.
3280
3281				After the EM algorithm has finished running, the hard clusters are created by
3282				invoking the method C on an instance of the module and then
3283				made user-accessible by calling C. These clusters may
3284				then be displayed in a terminal window by dereferencing each element of the array
3285				whose reference is returned b C. The hard clusters can
3286				be written out to disk files by invoking C.
3287				This method writes out the clusters to files, one cluster per file. What is written
3288				out to each file consists of the symbolic names of the data records that belong to
3289				the cluster corresponding to that file. The clusters are placed in files with names
3290				like
3291
3292				naive_bayes_cluster1.dat
3293				naive_bayes_cluster2.dat
3294				...
3295
3296				The soft clusters on the other hand are created by calling
3297				C
3298				on an instance of the module, where the argument C<$theta1>
3299				is the threshold for deciding whether a data element belongs
3300				in a soft cluster. The posterior class probability at a
3301				data element must exceed the threshold for the element to
3302				belong to the corresponding cluster. The soft cluster can
3303				be written out to disk files by calling
3304				C.
3305				As with the hard clusters, each cluster is placed in a separate
3306				file. The filenames for such clusters look like:
3307
3308				posterior_prob_cluster1.dat
3309				posterior_prob_cluster2.dat
3310				...
3311
3312				=head1 WHAT IF THE NUMBER OF CLUSTERS IS UNKNOWN?
3313
3314				The module constructor requires that you supply a value for the parameter C, which
3315				is the number of clusters you expect to see in the data. But what if you do not have
3316				a good value for C? Note that it is possible to search for the best C by using
3317				the two clustering quality criteria included in the module. However, I have
3318				intentionally not yet incorporated that feature in the module because it slows down
3319				the execution of the code --- especially when the dimensionality of the data
3320				increases. However, nothing prevents you from writing a script --- along the lines
3321				of the five "canned_example" scripts in the C directory --- that would use
3322				the two clustering quality metrics for finding the best choice for C for a given
3323				dataset. Obviously, you will now have to incorporate the call to the constructor in
3324				a loop and check the value of the quality measures for each value of C.
3325
3326				=head1 SOME RESULTS OBTAINED WITH THIS MODULE
3327
3328				If you would like to see some results that have been obtained with this module, check
3329				out Section 7 of the report
3330				L.
3331
3332				=head1 THE C DIRECTORY
3333
3334				Becoming familiar with this directory should be your best
3335				strategy to become comfortable with this module (and its
3336				future versions). You are urged to start by executing the
3337				following five example scripts:
3338
3339				=over 16
3340
3341				=item I
3342
3343				This example applies the EM algorithm to the data contained in the datafile
3344				C. The mixture data in the file corresponds to three overlapping
3345				Gaussian components in a star-shaped pattern. The EM based clustering for this data
3346				is shown in the files C and
3347				C, the former displaying the hard clusters
3348				obtained by using the naive Bayes' classifier and the latter showing the soft
3349				clusters obtained on the basis of the posterior class probabilities at the data
3350				points.
3351
3352				=item I
3353
3354				The datafile used in this example is C. This mixture data
3355				corresponds to two well-separated relatively isotropic Gaussians. EM based clustering for this
3356				data is shown in the files C and
3357				C, the former displaying the hard clusters
3358				obtained by using the naive Bayes' classifier and the latter showing the soft
3359				clusters obtained by using the posterior class probabilities at the data points.
3360
3361				=item I
3362
3363				Like the first example, this example again involves three Gaussians, but now their
3364				means are not co-located. Additionally, we now seed the clusters manually by
3365				specifying three selected data points as the initial guesses for the cluster means.
3366				The datafile used for this example is C. The EM based clustering
3367				for this data is shown in the files C and
3368				C, the former displaying the hard clusters
3369				obtained by using the naive Bayes' classifier and the latter showing the soft
3370				clusters obtained on the basis of the posterior class probabilities at the data
3371				points.
3372
3373				=item I
3374
3375				Whereas the three previous examples demonstrated EM based clustering of 2D data, we
3376				now present an example of clustering in 3D. The datafile used in this example is
3377				C. This mixture data corresponds to three well-separated but highly
3378				anisotropic Gaussians. The EM derived clustering for this data is shown in the files
3379				C and C, the
3380				former displaying the hard clusters obtained by using the naive Bayes' classifier and
3381				the latter showing the soft clusters obtained on the basis of the posterior class
3382				probabilities at the data points.
3383
3384				=item I
3385
3386				We again demonstrate clustering in 3D but now we have one Gaussian cluster that
3387				"cuts" through the other two Gaussian clusters. The datafile used in this example is
3388				C. The three Gaussians in this case are highly overlapping and
3389				highly anisotropic. The EM derived clustering for this data is shown in the files
3390				C and C, the
3391				former displaying the hard clusters obtained by using the naive Bayes' classifier and
3392				the latter showing the soft clusters obtained through the posterior class
3393				probabilities at the data points.
3394
3395				=item I
3396
3397				This example, added in Version 1.2, demonstrates the use of this module for 1-D data.
3398				In order to visualize the clusters for the 1-D case, we show them through their
3399				respective histograms. The datafile used in this example is C. The
3400				data consists of two overlapping Gaussians. The EM derived clustering for this data
3401				is shown in the files C and
3402				C, the former displaying the hard clusters
3403				obtained by using the naive Bayes' classifier and the latter showing the soft
3404				clusters obtained through the posterior class probabilities at the data points.
3405
3406				=back
3407
3408				Going through the six examples listed above will make you familiar with how to make
3409				the calls to the clustering and the visualization methods. The C directory
3410				also includes several parameter files with names like
3411
3412				param1.txt
3413				param2.txt
3414				param3.txt
3415				...
3416
3417				These were used to generate the synthetic data for which the results are shown in the
3418				C directory. Just make a copy of one of these files and edit it if you
3419				would like to generate your own multivariate data for clustering. Note that you can
3420				generate data with any dimensionality through appropriate entries in the parameter
3421				file.
3422
3423				=head1 CAVEATS
3424
3425				When you run the scripts in the C directory, your results will NOT always
3426				look like what I have shown in the PNG image files in the directory. As mentioned
3427				earlier in Description, the EM algorithm starting from randomly chosen initial
3428				guesses for the cluster means can get stuck in a local maximum.
3429
3430				That raises an interesting question of how one judges the correctness of clustering
3431				results when dealing with real experimental data. For real data, the best approach
3432				is to try the EM algorithm multiple times with all of the seeding options included in
3433				this module. It would be safe to say that, at least in low dimensional spaces and
3434				with sufficient data, a majority of your runs should yield "correct" results.
3435
3436				Also bear in mind that a pure Perl implementation is not meant for the clustering of
3437				very large data files. It is really designed more for researching issues related to
3438				EM based approaches to clustering.
3439
3440
3441				=head1 REQUIRED
3442
3443				This module requires the following three modules:
3444
3445				Math::Random
3446				Graphics::GnuplotIF
3447				Math::GSL::Matrix
3448
3449				the first for generating the multivariate random numbers, the second for the
3450				visualization of the clusters, and the last for access to the Perl wrappers for the
3451				GNU Scientific Library. The C module of this library is used for various
3452				algebraic operations on the covariance matrices of the Gaussians.
3453
3454				=head1 EXPORT
3455
3456				None by design.
3457
3458
3459				=head1 BUGS
3460
3461				Please notify the author if you encounter any bugs. When sending email, please place
3462				the string 'Algorithm EM' in the subject line.
3463
3464				=head1 INSTALLATION
3465
3466				The usual
3467
3468				perl Makefile.PL
3469				make
3470				make test
3471				make install
3472
3473				if you have root access. If not,
3474
3475				perl Makefile.PL prefix=/some/other/directory/
3476				make
3477				make test
3478				make install
3479
3480				=head1 ACKNOWLEDGMENTS
3481
3482				Version 1.2 is a result of the feedback received from Paul
3483				May of University of Birmingham. Thanks, Paul!
3484
3485				=head1 AUTHOR
3486
3487				Avinash Kak, kak@purdue.edu
3488
3489				If you send email, please place the string "EM Algorithm" in your
3490				subject line to get past my spam filter.
3491
3492				=head1 COPYRIGHT
3493
3494				This library is free software; you can redistribute it and/or modify it under the
3495				same terms as Perl itself.
3496
3497				Copyright 2013 Avinash Kak
3498
3499				=cut
3500