File Coverage

blib/lib/Algorithm/ExpectationMaximization.pm

Criterion	Covered	Total	%
statement	22	24	91.6
branch			n/a
condition			n/a
subroutine	8	8	100.0
pod			n/a
total	30	32	93.7

line	stmt	sub	time	code
1				package Algorithm::ExpectationMaximization;
2
3				#---------------------------------------------------------------------------
4				# Copyright (c) 2014 Avinash Kak. All rights reserved. This program is free
5				# software. You may modify and/or distribute it under the same terms as Perl itself.
6				# This copyright notice must remain attached to the file.
7				#
8				# Algorithm::ExpectationMaximization is a pure Perl implementation for
9				# Expectation-Maximization based clustering of multi-dimensional data that can be
10				# modeled as a Gaussian mixture.
11				# ---------------------------------------------------------------------------
12
13	1	1	12560	use 5.10.0;
	1		2
	1		33
14	1	1	4	use strict;
	1		1
	1		21
15	1	1	2	use warnings;
	1		9
	1		27
16	1	1	4	use Carp;
	1		0
	1		69
17	1	1	5	use File::Basename;
	1		1
	1		61
18	1	1	506	use Math::Random;
	1		4765
	1		105
19	1	1	514	use Graphics::GnuplotIF;
	1		8517
	1		44
20	1	1	1004	use Math::GSL::Matrix;
	0
	0
21				use Scalar::Util 'blessed';
22
23				our $VERSION = '1.21';
24
25				# from perl docs:
26				my $_num_regex = '^[+-]?\ (\d+(\.\d)?\|\.\d+)([eE][+-]?\d+)?$';
27
28				# Constructor:
29				sub new {
30				my ($class, %args) = @_;
31				my @params = keys %args;
32				croak "\nYou have used a wrong name for a keyword argument " .
33				"--- perhaps a misspelling\n"
34				if check_for_illegal_params(@params) == 0;
35				bless {
36				_datafile => $args{datafile} \|\| croak("datafile required"),
37				_mask => $args{mask} \|\| croak("mask required"),
38				_K => $args{K} \|\| croak("number of clusters required"),
39				_terminal_output => $args{terminal_output} \|\| 0,
40				_seeding => $args{seeding} \|\| 'random',
41				_seed_tags => $args{seed_tags} \|\| [],
42				_max_em_iterations=> $args{max_em_iterations} \|\| 100,
43				_class_priors => $args{class_priors} \|\| [],
44				_debug => $args{debug} \|\| 0,
45				_N => 0,
46				_data => {},
47				_data_id_tags => [],
48				_clusters => [],
49				_cluster_centers => [],
50				_data_dimensions => 0,
51				_cluster_normalizers => [],
52				_cluster_means => [],
53				_cluster_covariances => [],
54				_class_labels_for_data => {},
55				_class_probs_at_each_data_point => {},
56				_expected_class_probs => {},
57				_old_priors => [],
58				_old_old_priors => [],
59				_fisher_quality_vs_iteration => [],
60				_mdl_quality_vs_iterations => [],
61				}, $class;
62				}
63
64				sub read_data_from_file {
65				my $self = shift;
66				my $datafile = $self->{_datafile};
67				my $mask = $self->{_mask};
68				my @mask = split //, $mask;
69				$self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
70				print "data dimensionality: $self->{_data_dimensions} \n"
71				if $self->{_terminal_output};
72				open INPUT, $datafile
73				or die "unable to open file $datafile: $!";
74				chomp( my @raw_data = );
75				close INPUT;
76				# Transform strings into number data
77				foreach my $record (@raw_data) {
78				next unless $record;
79				next if $record =~ /^#/;
80				my @data_fields;
81				my @fields = split /\s+/, $record;
82				die "\nABORTED: Mask size does not correspond to row record size"
83				if $#fields != $#mask;
84				my $record_id;
85				foreach my $i (0..@fields-1) {
86				if ($mask[$i] eq '0') {
87				next;
88				} elsif ($mask[$i] eq 'N') {
89				$record_id = $fields[$i];
90				} elsif ($mask[$i] eq '1') {
91				push @data_fields, $fields[$i];
92				} else {
93				die "misformed mask for reading the data file";
94				}
95				}
96				my @nums = map {/$_num_regex/;$_} @data_fields;
97				$self->{_data}->{ $record_id } = \@nums;
98				}
99				my @all_data_ids = keys %{$self->{_data}};
100				$self->{_data_id_tags} = \@all_data_ids;
101				$self->{_N} = scalar @all_data_ids;
102				if ( defined($self->{_K}) && ($self->{_K} > 0) ) {
103				carp "\n\nWARNING: YOUR K VALUE IS TOO LARGE.\n The number of data " .
104				"points must satisfy the relation N > 2xK**2 where K is " .
105				"the number of clusters requested for the clusters to be " .
106				"meaningful $!"
107				if ( $self->{_N} < (2 * $self->{_K} ** 2) );
108				}
109				srand(123456789);
110				}
111
112
113				# This is the heart of the module --- in the sense that this method implements the EM
114				# algorithm for the estimating the parameters of a Gaussian mixture model for the
115				# data. In the implementation shown below, we declare convergence for the EM
116				# algorithm when the change in the class priors over three iterations falls below a
117				# threshold. The current value of this threshold, as can be seen in the function
118				# compare_array_floats(), is 0.00001.
119				sub EM {
120				my $self = shift;
121				$self->initialize_class_priors();
122				for (my $em_iteration=0; $em_iteration < $self->{_max_em_iterations};
123				$em_iteration++) {
124				if ($em_iteration == 0) {
125				print "\nSeeding the EM algorithm with:\n";
126				$self->display_seeding_stats();
127				print "\nFinished displaying the seeding information\n";
128				print "\nWill print out a dot for each iteration of EM:\n\n";
129				}
130				my $progress_indicator = $em_iteration % 5 == 0 ? $em_iteration : ".";
131				print $progress_indicator;
132				foreach my $data_id (@{$self->{_data_id_tags}}) {
133				$self->{_class_probs_at_each_data_point}->{$data_id} = [];
134				$self->{_expected_class_probs}->{$data_id} = [];
135				}
136				# Calculate prob(x \| C_i) --- this is the prob of data point x as
137				# a member of class C_i. You must do this for all K classes:
138				foreach my $cluster_index(0..$self->{_K}-1) {
139				$self->find_prob_at_each_datapoint_for_given_mean_and_covar(
140				$self->{_cluster_means}->[$cluster_index],
141				$self->{_cluster_covariances}->[$cluster_index] );
142				}
143				$self->{_cluster_normalizers} = [];
144				if ($self->{_debug}) {
145				print "\n\nDisplaying prob of a data point vis-a-vis each class:\n\n";
146				foreach my $data_id (sort keys %{$self->{_data}}) {
147				my $class_probs_at_a_point =
148				$self->{_class_probs_at_each_data_point}->{$data_id};
149				print "Class probs for $data_id: @$class_probs_at_a_point\n"
150				}
151				}
152				# Calculate prob(C_i \| x) which is the posterior prob of class
153				# considered as a r.v. to be C_i at a given point x. For a given
154				# x, the sum of such probabilities over all C_i must add up to 1:
155				$self->find_expected_classes_at_each_datapoint();
156				if ($self->{_debug}) {
157				print "\n\nDisplaying expected class probs at each data point:\n\n";
158				foreach my $data_id (sort keys %{$self->{_expected_class_probs}}) {
159				my $expected_classes_at_a_point =
160				$self->{_expected_class_probs}->{$data_id};
161				print "Expected classes $data_id: @$expected_classes_at_a_point\n";
162				}
163				}
164				# 1. UPDATE MEANS:
165				my @new_means;
166				foreach my $cluster_index(0..$self->{_K}-1) {
167				$new_means[$cluster_index] =
168				Math::GSL::Matrix->new($self->{_data_dimensions},1);
169				$new_means[$cluster_index]->zero();
170				foreach my $data_id (keys %{$self->{_data}}) {
171				my $data_record = $self->{_data}->{$data_id};
172				my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
173				$data_vec->set_col(0,$data_record);
174				$new_means[$cluster_index] +=
175				$self->{_expected_class_probs}->{$data_id}->[$cluster_index] *
176				$data_vec->copy();
177				$self->{_cluster_normalizers}->[$cluster_index] +=
178				$self->{_expected_class_probs}->{$data_id}->[$cluster_index];
179				}
180				$new_means[$cluster_index] *= 1.0 /
181				$self->{_cluster_normalizers}->[$cluster_index];
182				}
183				if ($self->{_debug}) {
184				foreach my $meanvec (@new_means) {
185				display_matrix("At EM Iteration $em_iteration, new mean vector is",
186				$meanvec);
187				}
188				}
189				$self->{_cluster_means} = \@new_means;
190				# 2. UPDATE COVARIANCES:
191				my @new_covariances;
192				foreach my $cluster_index(0..$self->{_K}-1) {
193				$new_covariances[$cluster_index] =
194				Math::GSL::Matrix->new($self->{_data_dimensions},
195				$self->{_data_dimensions});
196				$new_covariances[$cluster_index]->zero();
197				my $normalizer = 0;
198				foreach my $data_id (keys %{$self->{_data}}) {
199				my $data_record = $self->{_data}->{$data_id};
200				my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
201				$data_vec->set_col(0,$data_record);
202				my $mean_subtracted_data =
203				$data_vec - $self->{_cluster_means}->[$cluster_index];
204				my $outer_product = outer_product($mean_subtracted_data,
205				$mean_subtracted_data);
206				$new_covariances[$cluster_index] +=
207				$self->{_expected_class_probs}->{$data_id}->[$cluster_index] *
208				$outer_product;
209				}
210				$new_covariances[$cluster_index] *=
211				1.0 /
212				$self->{_cluster_normalizers}->[$cluster_index];
213				}
214				$self->{_cluster_covariances} = \@new_covariances;
215				# 3. UPDATE PRIORS:
216				$self->{_old_old_priors} = deep_copy_array( $self->{_old_priors} )
217				if @{$self->{_old_priors}} > 0;
218				$self->{_old_priors} = deep_copy_array( $self->{_class_priors} );
219				foreach my $cluster_index(0..$self->{_K}-1) {
220				$self->{_class_priors}->[$cluster_index] =
221				$self->{_cluster_normalizers}->[$cluster_index] / $self->{_N};
222				}
223				my @priors = @{$self->{_class_priors}};
224				print "\nUpdated priors: @priors\n\n\n" if $self->{_debug};
225				push @{$self->{_fisher_quality_vs_iteration}},
226				$self->clustering_quality_fisher();
227				push @{$self->{_mdl_quality_vs_iteration}}, $self->clustering_quality_mdl();
228				if ( ($em_iteration > 5 && $self->reached_convergence())
229				\|\| ($em_iteration == $self->{_max_em_iterations} - 1) ) {
230				my @old_old_priors = @{$self->{_old_old_priors}};
231				my @old_priors = @{$self->{_old_priors}};
232				print "\n\nPrevious to previous priors: @old_old_priors\n";
233				print "Previous priors: @old_priors\n";
234				print "Current class priors: @{$self->{_class_priors}}\n";
235				print "\n\nCONVERGENCE ACHIEVED AT ITERATION $em_iteration\n\n"
236				if $em_iteration < $self->{_max_em_iterations} - 1;
237				last;
238				}
239				}
240				print "\n\n\n";
241				}
242
243				sub reached_convergence {
244				my $self = shift;
245				return 1 if compare_array_floats($self->{_old_old_priors},
246				$self->{_old_priors})
247				&&
248				compare_array_floats($self->{_old_priors},
249				$self->{_class_priors});
250				return 0;
251				}
252
253				# Classify the data into disjoint clusters using the Naive Bayes' classification:
254				sub run_bayes_classifier {
255				my $self = shift;
256				$self->classify_all_data_tuples_bayes($self->{_cluster_means},
257				$self->{_cluster_covariances});
258				}
259
260				# Should NOT be called before run_bayes_classifier is run
261				sub return_disjoint_clusters {
262				my $self = shift;
263				return $self->{_clusters};
264				}
265
266				sub return_clusters_with_posterior_probs_above_threshold {
267				my $self = shift;
268				my $theta = shift;
269				my @class_distributions;
270				foreach my $cluster_index (0..$self->{_K}-1) {
271				push @class_distributions, [];
272				}
273				foreach my $data_tag (@{$self->{_data_id_tags}}) {
274				foreach my $cluster_index (0..$self->{_K}-1) {
275				push @{$class_distributions[$cluster_index]}, $data_tag
276				if $self->{_expected_class_probs}->{$data_tag}->[$cluster_index]
277				> $theta;
278				}
279				}
280				return \@class_distributions;
281				}
282
283				sub return_individual_class_distributions_above_given_threshold {
284				my $self = shift;
285				my $theta = shift;
286				my @probability_distributions;
287				foreach my $cluster_index (0..$self->{_K}-1) {
288				push @probability_distributions, [];
289				}
290				foreach my $cluster_index (0..$self->{_K}-1) {
291				my $mean_vec = $self->{_cluster_means}->[$cluster_index];
292				my $covar = $self->{_cluster_covariances}->[$cluster_index];
293				foreach my $data_id (keys %{$self->{_data}}) {
294				my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
295				$data_vec->set_col( 0, $self->{_data}->{$data_id});
296				my $datavec_minus_mean = $data_vec - $mean_vec;
297				display_matrix( "datavec minus mean is ", $datavec_minus_mean )
298				if $self->{_debug};
299				my $exponent = undef;
300				if ($self->{_data_dimensions} > 1) {
301				$exponent = -0.5 * vector_matrix_multiply(
302				transpose($datavec_minus_mean),
303				matrix_vector_multiply($covar->inverse(), $datavec_minus_mean ) );
304				} else {
305				my @var_inverse = $covar->inverse()->as_list;
306				my $var_inverse_val = $var_inverse[0];
307				my @data_minus_mean = $datavec_minus_mean->as_list;
308				my $data_minus_mean_val = $data_minus_mean[0];
309				$exponent = -0.5 * ($data_minus_mean_val ** 2) * $var_inverse_val;
310				}
311				print "\nThe value of the exponent is: $exponent\n\n" if $self->{_debug};
312				my $coefficient = 1.0 / \
313				( (2 * $Math::GSL::Const::M_PI)*$self->{_data_dimensions} sqrt($covar->det()) );
314				my $prob = $coefficient * exp($exponent);
315				push @{$probability_distributions[$cluster_index]}, $data_id
316				if $prob > $theta;
317				}
318				}
319				return \@probability_distributions;
320				}
321
322				sub return_estimated_priors {
323				my $self = shift;
324				return $self->{_class_priors};
325				}
326
327				# Calculates the MDL (Minimum Description Length) clustering criterion according to
328				# Rissanen. (J. Rissanen: "Modeling by Shortest Data Description," Automatica, 1978,
329				# and "A Universal Prior for Integers and Estimation by Minimum Description Length,"
330				# Annals of Statistics, 1983.) The MDL criterion is a difference of a log-likelihood
331				# term for all of the observed data and a model-complexity penalty term. In general,
332				# both the log-likelihood and the model-complexity terms increase as the number of
333				# clusters increases. The form of the MDL criterion used in the implementation below
334				# uses for the penalty term the Bayesian Information Criterion (BIC) of G. Schwartz,
335				# "Estimating the Dimensions of a Model," The Annals of Statistics, 1978. In
336				# general, the smaller the value of the MDL quality measure calculated below, the
337				# better the clustering of the data.
338				sub clustering_quality_mdl {
339				my $self = shift;
340				# Calculate the inverses of all of the covariance matrices in order to avoid
341				# having to calculate them repeatedly inside the inner 'foreach' loop in the
342				# main part of this method. Here we go:
343				my @covar_inverses;
344				foreach my $cluster_index (0..$self->{_K}-1) {
345				my $covar = $self->{_cluster_covariances}->[$cluster_index];
346				push @covar_inverses, $covar->inverse();
347				}
348				# For the clustering quality, first calculate the log-likelihood of all the
349				# observed data:
350				my $log_likelihood = 0;
351				foreach my $tag (@{$self->{_data_id_tags}}) {
352				my $likelihood_for_each_tag = 0;
353				foreach my $cluster_index (0..$self->{_K}-1) {
354				my $mean_vec = $self->{_cluster_means}->[$cluster_index];
355				my $covar = $self->{_cluster_covariances}->[$cluster_index];
356				my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
357				$data_vec->set_col( 0, $self->{_data}->{$tag});
358				my $datavec_minus_mean = $data_vec - $mean_vec;
359				my $exponent = undef;
360				if ($self->{_data_dimensions} > 1) {
361				$exponent = -0.5 * vector_matrix_multiply(
362				transpose($datavec_minus_mean),
363				matrix_vector_multiply($covar_inverses[$cluster_index],
364				$datavec_minus_mean ) );
365				} else {
366				my @var_inverse = $covar_inverses[$cluster_index]->as_list;
367				my $var_inverse_val = $var_inverse[0];
368				my @data_minus_mean = $datavec_minus_mean->as_list;
369				my $data_minus_mean_val = $data_minus_mean[0];
370				$exponent = -0.5 * ($data_minus_mean_val ** 2) * $var_inverse_val;
371				}
372				next if $covar->det() < 0;
373				my $coefficient = 1.0 /
374				( (2 * $Math::GSL::Const::M_PI)**$self->{_data_dimensions}
375				* sqrt($covar->det()) );
376				my $prob = $coefficient * exp($exponent);
377				$likelihood_for_each_tag +=
378				$prob * $self->{_class_priors}->[$cluster_index];
379				}
380				$log_likelihood += log( $likelihood_for_each_tag );
381				}
382				# Now calculate the model complexity penalty. $L is the total number of
383				# parameters it takes to specify a mixture of K Gaussians. If d is the
384				# dimensionality of the data space, the covariance matrix of each Gaussian takes
385				# (d**2 -d)/2 + d = d(d+1)/2 parameters since this matrix must be symmetric. And
386				# then you need d mean value parameters, and one prior probability parameter
387				# for the Gaussian. So $L = K[1 + d + d(d+1)/2] - 1 where the final '1' that
388				# is subtracted is to account for the normalization on the class priors.
389				my $L = (0.5 * $self->{_K} *
390				($self->{_data_dimensions}*2 + 3$self->{_data_dimensions} + 2) ) - 1;
391				my $model_complexity_penalty = 0.5 * $L * log( $self->{_N} );
392				my $mdl_criterion = -1 * $log_likelihood + $model_complexity_penalty;
393				return $mdl_criterion;
394				}
395
396				# For our second measure of clustering quality, we use `trace( SW^-1 . SB)' where SW
397				# is the within-class scatter matrix, more commonly denoted S_w, and SB the
398				# between-class scatter matrix, more commonly denoted S_b (the underscore means
399				# subscript). This measure can be thought of as the normalized average distance
400				# between the clusters, the normalization being provided by average cluster
401				# covariance SW^-1. Therefore, the larger the value of this quality measure, the
402				# better the separation between the clusters. Since this measure has its roots in
403				# the Fisher linear discriminant function, we incorporate the word 'fisher' in the
404				# name of the quality measure. Note that this measure is good only when the clusters
405				# are disjoint. When the clusters exhibit significant overlap, the numbers produced
406				# by this quality measure tend to be generally meaningless. As an extreme case,
407				# let's say your data was produced by a set of Gaussians, all with the same mean
408				# vector, but each with a distinct covariance. For this extreme case, this measure
409				# will produce a value close to zero --- depending on the accuracy with which the
410				# means are estimated --- even when your clusterer is doing a good job of identifying
411				# the individual clusters.
412				sub clustering_quality_fisher {
413				my $self = shift;
414				my @cluster_quality_indices;
415				my $fisher_trace = 0;
416				my $S_w =
417				Math::GSL::Matrix->new($self->{_data_dimensions}, $self->{_data_dimensions});
418				$S_w->zero;
419				my $S_b =
420				Math::GSL::Matrix->new($self->{_data_dimensions}, $self->{_data_dimensions});
421				$S_b->zero;
422				my $global_mean = Math::GSL::Matrix->new($self->{_data_dimensions},1);
423				$global_mean->zero;
424				foreach my $cluster_index(0..$self->{_K}-1) {
425				$global_mean = $self->{_class_priors}->[$cluster_index] *
426				$self->{_cluster_means}->[$cluster_index];
427				}
428				foreach my $cluster_index(0..$self->{_K}-1) {
429				$S_w += $self->{_cluster_covariances}->[$cluster_index] *
430				$self->{_class_priors}->[$cluster_index];
431				my $class_mean_minus_global_mean = $self->{_cluster_means}->[$cluster_index]
432				- $global_mean;
433				my $outer_product = outer_product( $class_mean_minus_global_mean,
434				$class_mean_minus_global_mean );
435				$S_b += $self->{_class_priors}->[$cluster_index] * $outer_product;
436				}
437				my $fisher = matrix_multiply($S_w->inverse, $S_b);
438				return $fisher unless defined blessed($fisher);
439				return matrix_trace($fisher);
440				}
441
442				sub display_seeding_stats {
443				my $self = shift;
444				foreach my $cluster_index(0..$self->{_K}-1) {
445				print "\nSeeding for cluster $cluster_index:\n";
446				my $mean = $self->{_cluster_means}->[$cluster_index];
447				display_matrix("The mean is: ", $mean);
448				my $covariance = $self->{_cluster_covariances}->[$cluster_index];
449				display_matrix("The covariance is: ", $covariance);
450				}
451				}
452
453				sub display_fisher_quality_vs_iterations {
454				my $self = shift;
455				print "\n\nFisher Quality vs. Iterations: " .
456				"@{$self->{_fisher_quality_vs_iteration}}\n\n";
457				}
458
459				sub display_mdl_quality_vs_iterations {
460				my $self = shift;
461				print "\n\nMDL Quality vs. Iterations: @{$self->{_mdl_quality_vs_iteration}}\n\n";
462				}
463
464				sub find_prob_at_each_datapoint_for_given_mean_and_covar {
465				my $self = shift;
466				my $mean_vec_ref = shift;
467				my $covar_ref = shift;
468				foreach my $data_id (keys %{$self->{_data}}) {
469				my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
470				$data_vec->set_col( 0, $self->{_data}->{$data_id});
471				if ($self->{_debug}) {
472				display_matrix("data vec in find prob function", $data_vec);
473				display_matrix("mean vec in find prob function", $mean_vec_ref);
474				display_matrix("covariance in find prob function", $covar_ref);
475				}
476				my $datavec_minus_mean = $data_vec - $mean_vec_ref;
477				display_matrix( "datavec minus mean is ", $datavec_minus_mean ) if $self->{_debug};
478				my $exponent = undef;
479				if ($self->{_data_dimensions} > 1) {
480				$exponent = -0.5 * vector_matrix_multiply( transpose($datavec_minus_mean),
481				matrix_vector_multiply( $covar_ref->inverse(), $datavec_minus_mean ) );
482				} elsif (defined blessed($covar_ref)) {
483				my @data_minus_mean = $datavec_minus_mean->as_list;
484				my $data_minus_mean_val = $data_minus_mean[0];
485				my @covar_as_matrix = $covar_ref->as_list;
486				my $covar_val = $covar_as_matrix[0];
487				$exponent = -0.5 * ($data_minus_mean_val ** 2) / $covar_val;
488				} else {
489				my @data_minus_mean = $datavec_minus_mean->as_list;
490				my $data_minus_mean_val = $data_minus_mean[0];
491				$exponent = -0.5 * ($data_minus_mean_val ** 2) / $covar_ref;
492				}
493				print "\nThe value of the exponent is: $exponent\n\n" if $self->{_debug};
494				my $coefficient = undef;
495				if ($self->{_data_dimensions} > 1) {
496				$coefficient = 1.0 / sqrt( ((2 * $Math::GSL::Const::M_PI) ** $self->{_data_dimensions}) *
497				$covar_ref->det()) ;
498				} elsif (!defined blessed($covar_ref)) {
499				$coefficient = 1.0 / sqrt(2 * $covar_ref * $Math::GSL::Const::M_PI);
500				} else {
501				my @covar_as_matrix = $covar_ref->as_list;
502				my $covar_val = $covar_as_matrix[0];
503				$coefficient = 1.0 / sqrt(2 * $covar_val * $Math::GSL::Const::M_PI);
504				}
505				my $prob = $coefficient * exp($exponent);
506				push @{$self->{_class_probs_at_each_data_point}->{$data_id}}, $prob;
507				}
508				}
509
510				sub find_expected_classes_at_each_datapoint {
511				my $self = shift;
512				my @priors = @{$self->{_class_priors}};
513				foreach my $data_id (sort keys %{$self->{_class_probs_at_each_data_point}}) {
514				my $numerator =
515				vector_2_vector_multiply(
516				$self->{_class_probs_at_each_data_point}->{$data_id},
517				$self->{_class_priors} );
518				my $sum = 0;
519				foreach my $part (@$numerator) {
520				$sum += $part;
521				}
522				$self->{_expected_class_probs}->{$data_id} = [map $_/$sum, @{$numerator}];
523				}
524				}
525
526				sub initialize_class_priors {
527				my $self = shift;
528				if (@{$self->{_class_priors}} == 0) {
529				my $prior = 1.0 / $self->{_K};
530				foreach my $class_index (0..$self->{_K}-1) {
531				push @{$self->{_class_priors}}, $prior;
532				}
533				}
534				die "Mismatch between number of values for class priors " .
535				"and the number of clusters expected"
536				unless @{$self->{_class_priors}} == $self->{_K};
537				my $sum = 0;
538				foreach my $prior (@{$self->{_class_priors}}) {
539				$sum += $prior;
540				}
541				die "Your priors in the constructor call do not add up to 1"
542				unless abs($sum - 1) < 0.001;
543				print "\nInitially assumed class priors are: @{$self->{_class_priors}}\n";
544				}
545
546				sub estimate_class_priors {
547				my $self = shift;
548				foreach my $datatag (keys %{$self->{_data}}) {
549				my $class_label = $self->{_class_labels}->{$datatag};
550				$self->{_class_priors}[$class_label]++;
551				}
552				foreach my $prior (@{$self->{_class_priors}}) {
553				$prior /= $self->{_total_number_data_tuples};
554				}
555				foreach my $prior (@{$self->{_class_priors}}) {
556				print "class priors: @{$self->{_class_priors}}\n";
557				}
558				}
559
560				sub classify_all_data_tuples_bayes {
561				my $self = shift;
562				my $mean_vecs_ref = shift;
563				my $covariances_ref = shift;
564				my @new_clusters;
565				foreach my $index (0..$self->{_K}-1) {
566				push @new_clusters, [];
567				}
568				foreach my $data_id (@{$self->{_data_id_tags}}) {
569				my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
570				$data_vec->set_col( 0, deep_copy_array($self->{_data}->{$data_id}));
571				my $cluster_index_for_tuple =
572				$self->classify_a_data_point_bayes($data_vec,
573				$mean_vecs_ref, $covariances_ref);
574				$self->{_class_labels}->{$data_id} = $cluster_index_for_tuple;
575				push @{$new_clusters[$cluster_index_for_tuple]}, $data_id;
576				}
577				$self->{_clusters} = \@new_clusters;
578				}
579
580				sub classify_a_data_point_bayes {
581				my $self = shift;
582				my $data_vec = shift;
583				my $mean_vecs_ref = shift;
584				my $covariances_ref = shift;
585				my @cluster_mean_vecs = @$mean_vecs_ref;
586				my @cluster_covariances = @$covariances_ref;
587				my @log_likelihoods;
588				foreach my $cluster_index (0..@cluster_mean_vecs-1) {
589				my $mean = $cluster_mean_vecs[$cluster_index];
590				my $covariance = $cluster_covariances[$cluster_index];
591				my $datavec_minus_mean = $data_vec - $mean;
592				my $log_likely = undef;
593				if ($self->{_data_dimensions} > 1) {
594				$log_likely = -0.5 * vector_matrix_multiply(
595				transpose($datavec_minus_mean),
596				matrix_vector_multiply( $covariance->inverse(),
597				$datavec_minus_mean ) );
598				} else {
599
600				my @data_minus_mean = $datavec_minus_mean->as_list;
601				my $data_minus_mean_val = $data_minus_mean[0];
602				my @covar_as_matrix = $covariance->as_list;
603				my $covar_val = $covar_as_matrix[0];
604				$log_likely = -0.5 * ($data_minus_mean_val ** 2) / $covar_val;
605
606				}
607				my $posterior_log_likely = $log_likely +
608				log( $self->{_class_priors}[$cluster_index] );
609				push @log_likelihoods, $posterior_log_likely;
610				}
611				my ($minlikely, $maxlikely) = minmax(\@log_likelihoods);
612				my $cluster_index_for_data_point =
613				get_index_at_value( $maxlikely, \@log_likelihoods );
614				return $cluster_index_for_data_point;
615				}
616
617				sub find_cluster_means_and_covariances {
618				my $clusters = shift;
619				my $data_dimensions = find_data_dimensionality($clusters);
620				my (@cluster_mean_vecs, @cluster_covariances);
621				foreach my $cluster_index (0..@$clusters-1) {
622				my ($num_rows,$num_cols) =
623				($data_dimensions,scalar(@{$clusters->[$cluster_index]}));
624				print "\nFor cluster $cluster_index: rows: $num_rows and cols: $num_cols\n";
625				my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
626				my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
627				my $col_index = 0;
628				foreach my $ele (@{$clusters->[$cluster_index]}) {
629				$matrix->set_col($col_index++, $ele);
630				}
631				# display_matrix( "Displaying cluster matrix", $matrix );
632				foreach my $j (0..$num_cols-1) {
633				$mean_vec += $matrix->col($j);
634				}
635				$mean_vec *= 1.0 / $num_cols;
636				push @cluster_mean_vecs, $mean_vec;
637				display_matrix( "Displaying the mean vector", $mean_vec );
638				foreach my $j (0..$num_cols-1) {
639				my @new_col = ($matrix->col($j) - $mean_vec)->as_list;
640				$matrix->set_col($j, \@new_col);
641				}
642				# display_matrix("Displaying mean subtracted data as a matrix", $matrix );
643				my $transposed = transpose( $matrix );
644				# display_matrix("Displaying transposed matrix",$transposed);
645				my $covariance = matrix_multiply( $matrix, $transposed );
646				$covariance *= 1.0 / $num_cols;
647				push @cluster_covariances, $covariance;
648				display_matrix("Displaying the cluster covariance", $covariance );
649				}
650				return (\@cluster_mean_vecs, \@cluster_covariances);
651				}
652
653				sub find_data_dimensionality {
654				my $clusters = shift;
655				my @first_cluster = @{$clusters->[0]};
656				my @first_data_element = @{$first_cluster[0]};
657				return scalar(@first_data_element);
658				}
659
660				sub find_seed_centered_covariances {
661				my $self = shift;
662				my $seed_tags = shift;
663				my (@seed_mean_vecs, @seed_based_covariances);
664				foreach my $seed_tag (@$seed_tags) {
665				my ($num_rows,$num_cols) = ($self->{_data_dimensions}, $self->{_N});
666				my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
667				my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
668				$mean_vec->set_col(0, $self->{_data}->{$seed_tag});
669				push @seed_mean_vecs, $mean_vec;
670				display_matrix( "Displaying the seed mean vector", $mean_vec );
671				my $col_index = 0;
672				foreach my $tag (@{$self->{_data_id_tags}}) {
673				$matrix->set_col($col_index++, $self->{_data}->{$tag});
674				}
675				foreach my $j (0..$num_cols-1) {
676				my @new_col = ($matrix->col($j) - $mean_vec)->as_list;
677				$matrix->set_col($j, \@new_col);
678				}
679				my $transposed = transpose( $matrix );
680				my $covariance = matrix_multiply( $matrix, $transposed );
681				$covariance *= 1.0 / $num_cols;
682				push @seed_based_covariances, $covariance;
683				display_matrix("Displaying the seed covariance", $covariance )
684				if $self->{_debug};
685				}
686				return (\@seed_mean_vecs, \@seed_based_covariances);
687				}
688
689				# The most popular seeding mode for EM is random. We include two other seeding modes
690				# --- kmeans and manual --- since they do produce good results for specialized cases.
691				# For example, when the clusters in your data are non-overlapping and not too
692				# anisotropic, the kmeans based seeding should work at least as well as the random
693				# seeding. In such cases --- AND ONLY IN SUCH CASES --- the kmeans based seeding has
694				# the advantage of avoiding the getting stuck in a local-maximum problem of the EM
695				# algorithm.
696				sub seed_the_clusters {
697				my $self = shift;
698				if ($self->{_seeding} eq 'random') {
699				my @covariances;
700				my @means;
701				my @all_tags = @{$self->{_data_id_tags}};
702				my @seed_tags;
703				foreach my $i (0..$self->{_K}-1) {
704				push @seed_tags, $all_tags[int rand( $self->{_N} )];
705				}
706				print "Random Seeding: Randomly selected seeding tags are @seed_tags\n\n";
707				my ($seed_means, $seed_covars) =
708				$self->find_seed_centered_covariances(\@seed_tags);
709				$self->{_cluster_means} = $seed_means;
710				$self->{_cluster_covariances} = $seed_covars;
711				} elsif ($self->{_seeding} eq 'kmeans') {
712				$self->kmeans();
713				my $clusters = $self->{_clusters};
714				my @dataclusters;
715				foreach my $index (0..@$clusters-1) {
716				push @dataclusters, [];
717				}
718				foreach my $cluster_index (0..$self->{_K}-1) {
719				foreach my $tag (@{$clusters->[$cluster_index]}) {
720				my $data = $self->{_data}->{$tag};
721				push @{$dataclusters[$cluster_index]}, deep_copy_array($data);
722				}
723				}
724				($self->{_cluster_means}, $self->{_cluster_covariances}) =
725				find_cluster_means_and_covariances(\@dataclusters);
726				} elsif ($self->{_seeding} eq 'manual') {
727				die "You have not supplied the seeding tags for the option \"manual\""
728				unless @{$self->{_seed_tags}} > 0;
729				print "Manual Seeding: Seed tags are @{$self->{_seed_tags}}\n\n";
730				foreach my $tag (@{$self->{_seed_tags}}) {
731				die "invalid tag used for manual seeding"
732				unless exists $self->{_data}->{$tag};
733				}
734				my ($seed_means, $seed_covars) =
735				$self->find_seed_centered_covariances($self->{_seed_tags});
736				$self->{_cluster_means} = $seed_means;
737				$self->{_cluster_covariances} = $seed_covars;
738				} else {
739				die "Incorrect call syntax used. See documentation.";
740				}
741				}
742
743				# This is the top-level method for kmeans based initialization of the EM
744				# algorithm. The means and the covariances returned by kmeans are used to seed the EM
745				# algorithm.
746				sub kmeans {
747				my $self = shift;
748				my $K = $self->{_K};
749				$self->cluster_for_fixed_K_single_smart_try($K);
750				if ((defined $self->{_clusters}) && (defined $self->{_cluster_centers})){
751				return ($self->{_clusters}, $self->{_cluster_centers});
752				} else {
753				die "kmeans clustering failed.";
754				}
755				}
756
757				# Used by the kmeans algorithm for the initialization of the EM iterations. We do
758				# initial kmeans cluster seeding by subjecting the data to principal components
759				# analysis in order to discover the direction of maximum variance in the data space.
760				# Subsequently, we try to find the K largest peaks along this direction. The
761				# coordinates of these peaks serve as the seeds for the K clusters.
762				sub cluster_for_fixed_K_single_smart_try {
763				my $self = shift;
764				my $K = shift;
765				print "Clustering for K = $K\n" if $self->{_terminal_output};
766				my ($clusters, $cluster_centers) =
767				$self->cluster_for_given_K($K);
768				$self->{_clusters} = $clusters;
769				$self->{_cluster_centers} = $cluster_centers;
770				}
771
772				# Used by the kmeans part of the code for the initialization of the EM algorithm:
773				sub cluster_for_given_K {
774				my $self = shift;
775				my $K = shift;
776				my $cluster_centers = $self->get_initial_cluster_centers($K);
777				my $clusters = $self->assign_data_to_clusters_initial($cluster_centers);
778				my $cluster_nonexistant_flag = 0;
779				foreach my $trial (0..2) {
780				($clusters, $cluster_centers) =
781				$self->assign_data_to_clusters( $clusters, $K );
782				my $num_of_clusters_returned = @$clusters;
783				foreach my $cluster (@$clusters) {
784				$cluster_nonexistant_flag = 1 if ((!defined $cluster)
785				\|\| (@$cluster == 0));
786				}
787				last unless $cluster_nonexistant_flag;
788				}
789				return ($clusters, $cluster_centers);
790				}
791
792				# Used by the kmeans part of the code for the initialization of the EM algorithm:
793				sub get_initial_cluster_centers {
794				my $self = shift;
795				my $K = shift;
796				if ($self->{_data_dimensions} == 1) {
797				my @one_d_data;
798				foreach my $j (0..$self->{_N}-1) {
799				my $tag = $self->{_data_id_tags}[$j];
800				push @one_d_data, $self->{_data}->{$tag}->[0];
801				}
802				my @peak_points =
803				find_peak_points_in_given_direction(\@one_d_data,$K);
804				print "highest points at data values: @peak_points\n"
805				if $self->{_debug};
806				my @cluster_centers;
807				foreach my $peakpoint (@peak_points) {
808				push @cluster_centers, [$peakpoint];
809				}
810				return \@cluster_centers;
811				}
812				my ($num_rows,$num_cols) = ($self->{_data_dimensions},$self->{_N});
813				my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
814				my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
815				# All the record labels are stored in the array $self->{_data_id_tags}.
816				# The actual data for clustering is stored in a hash at $self->{_data}
817				# whose keys are the record labels; the value associated with each
818				# key is the array holding the corresponding numerical multidimensional
819				# data.
820				foreach my $j (0..$num_cols-1) {
821				my $tag = $self->{_data_id_tags}[$j];
822				my $data = $self->{_data}->{$tag};
823				$matrix->set_col($j, $data);
824				}
825				if ($self->{_debug}) {
826				print "\nDisplaying the original data as a matrix:";
827				display_matrix( $matrix );
828				}
829				foreach my $j (0..$num_cols-1) {
830				$mean_vec += $matrix->col($j);
831				}
832				$mean_vec *= 1.0 / $num_cols;
833				if ($self->{_debug}) {
834				print "Displaying the mean vector for the data:";
835				display_matrix( $mean_vec );
836				}
837				foreach my $j (0..$num_cols-1) {
838				my @new_col = ($matrix->col($j) - $mean_vec)->as_list;
839				$matrix->set_col($j, \@new_col);
840				}
841				if ($self->{_debug}) {
842				print "Displaying mean subtracted data as a matrix:";
843				display_matrix( $matrix );
844				}
845				my $transposed = transpose( $matrix );
846				if ($self->{_debug}) {
847				print "Displaying transposed data matrix:";
848				display_matrix( $transposed );
849				}
850				my $covariance = matrix_multiply( $matrix, $transposed );
851				$covariance *= 1.0 / $num_cols;
852				if ($self->{_debug}) {
853				print "\nDisplaying the Covariance Matrix for your data:";
854				display_matrix( $covariance );
855				}
856				my ($eigenvalues, $eigenvectors) = $covariance->eigenpair;
857				my $num_of_eigens = @$eigenvalues;
858				my $largest_eigen_index = 0;
859				my $smallest_eigen_index = 0;
860				print "Eigenvalue 0: $eigenvalues->[0]\n" if $self->{_debug};
861				foreach my $i (1..$num_of_eigens-1) {
862				$largest_eigen_index = $i if $eigenvalues->[$i] >
863				$eigenvalues->[$largest_eigen_index];
864				$smallest_eigen_index = $i if $eigenvalues->[$i] <
865				$eigenvalues->[$smallest_eigen_index];
866				print "Eigenvalue $i: $eigenvalues->[$i]\n" if $self->{_debug};
867				}
868				print "\nlargest eigen index: $largest_eigen_index\n" if $self->{_debug};
869				print "\nsmallest eigen index: $smallest_eigen_index\n\n"
870				if $self->{_debug};
871				foreach my $i (0..$num_of_eigens-1) {
872				my @vec = $eigenvectors->[$i]->as_list;
873				print "Eigenvector $i: @vec\n" if $self->{_debug};
874				}
875				my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list;
876				print "\nLargest eigenvector: @largest_eigen_vec\n" if $self->{_debug};
877				my @max_var_direction;
878				# Each element of the array @largest_eigen_vec is a Math::Complex object
879				foreach my $k (0..@largest_eigen_vec-1) {
880				my ($mag, $theta) = $largest_eigen_vec[$k] =~ /\[(\d*\.\d+),(\S+)\]/;
881				if ($theta eq '0') {
882				$max_var_direction[$k] = $mag;
883				} elsif ($theta eq 'pi') {
884				$max_var_direction[$k] = -1.0 * $mag;
885				} else {
886				die "eigendecomposition of covariance matrix produced a complex eigenvector --- something is wrong";
887				}
888				}
889				# "Maximum variance direction: @max_var_direction
890				print "\nMaximum Variance Direction: @max_var_direction\n\n"
891				if $self->{_debug};
892				# We now project all data points on the largest eigenvector.
893				# Each projection will yield a single point on the eigenvector.
894				my @projections;
895				foreach my $j (0..$self->{_N}-1) {
896				my $tag = $self->{_data_id_tags}[$j];
897				my $data = $self->{_data}->{$tag};
898				die "Dimensionality of the largest eigenvector does not "
899				. "match the dimensionality of the data"
900				unless @max_var_direction == $self->{_data_dimensions};
901				my $projection = vector_multiply($data, \@max_var_direction);
902				push @projections, $projection;
903				}
904				print "All projection points: @projections\n" if $self->{_debug};
905				my @peak_points = find_peak_points_in_given_direction(\@projections, $K);
906				print "highest points at points along largest eigenvec: @peak_points\n"
907				if $self->{_debug};
908				my @cluster_centers;
909				foreach my $peakpoint (@peak_points) {
910				my @actual_peak_coords = map {$peakpoint * $_} @max_var_direction;
911				push @cluster_centers, \@actual_peak_coords;
912				}
913				return \@cluster_centers;
914				}
915
916				# Used by the kmeans part of the code: This method is called by the previous method
917				# to locate K peaks in a smoothed histogram of the data points projected onto the
918				# maximal variance direction.
919				sub find_peak_points_in_given_direction {
920				my $dataref = shift;
921				my $how_many = shift;
922				my @data = @$dataref;
923				my ($min, $max) = minmax(\@data);
924				my $num_points = @data;
925				my @sorted_data = sort {$a <=> $b} @data;
926				#print "\n\nSorted data: @sorted_data\n";
927				my $scale = $max - $min;
928				foreach my $index (0..$#sorted_data-1) {
929				$sorted_data[$index] = ($sorted_data[$index] - $min) / $scale;
930				}
931				my $avg_diff = 0;
932				foreach my $index (0..$#sorted_data-1) {
933				my $diff = $sorted_data[$index+1] - $sorted_data[$index];
934				$avg_diff += ($diff - $avg_diff) / ($index + 1);
935				}
936				my $delta = 1.0 / 1000.0;
937				# It would be nice to set the delta adaptively, but I must
938				# change the number of cells in the next foreach loop accordingly
939				# my $delta = $avg_diff / 20;
940				my @accumulator = (0) x 1000;
941				foreach my $index (0..@sorted_data-1) {
942				my $cell_index = int($sorted_data[$index] / $delta);
943				my $smoothness = 40;
944				for my $index ($cell_index-$smoothness..$cell_index+$smoothness) {
945				next if $index < 0 \|\| $index > 999;
946				$accumulator[$index]++;
947				}
948				}
949				my $peaks_array = non_maximum_supression( \@accumulator );
950				my $peaks_index_hash = get_value_index_hash( $peaks_array );
951				my @K_highest_peak_locations;
952				my $k = 0;
953				foreach my $peak (sort {$b <=> $a} keys %$peaks_index_hash) {
954				my $unscaled_peak_point =
955				$min + $peaks_index_hash->{$peak} * $scale * $delta;
956				push @K_highest_peak_locations, $unscaled_peak_point
957				if $k < $how_many;
958				last if ++$k == $how_many;
959				}
960				return @K_highest_peak_locations;
961				}
962
963				# Used by the kmeans part of the code: The purpose of this routine is to form initial
964				# clusters by assigning the data samples to the initial clusters formed by the
965				# previous routine on the basis of the best proximity of the data samples to the
966				# different cluster centers.
967				sub assign_data_to_clusters_initial {
968				my $self = shift;
969				my @cluster_centers = @{ shift @_ };
970				my @clusters;
971				foreach my $ele (@{$self->{_data_id_tags}}) {
972				my $best_cluster;
973				my @dist_from_clust_centers;
974				foreach my $center (@cluster_centers) {
975				push @dist_from_clust_centers, $self->distance($ele, $center);
976				}
977				my ($min, $best_center_index) = minimum( \@dist_from_clust_centers );
978				push @{$clusters[$best_center_index]}, $ele;
979				}
980				return \@clusters;
981				}
982
983				# Used by the kmeans part of the code: This is the main routine that along with the
984				# update_cluster_centers() routine constitute the two key steps of the K-Means
985				# algorithm. In most cases, the infinite while() loop will terminate automatically
986				# when the cluster assignments of the data points remain unchanged. For the sake of
987				# safety, we keep track of the number of iterations. If this number reaches 100, we
988				# exit the while() loop anyway. In most cases, this limit will not be reached.
989				sub assign_data_to_clusters {
990				my $self = shift;
991				my $clusters = shift;
992				my $K = shift;
993				my $final_cluster_centers;
994				my $iteration_index = 0;
995				while (1) {
996				my $new_clusters;
997				my $assignment_changed_flag = 0;
998				my $current_cluster_center_index = 0;
999				my $cluster_size_zero_condition = 0;
1000				my $how_many = @$clusters;
1001				my $cluster_centers = $self->update_cluster_centers(
1002				deep_copy_AoA_with_nulls( $clusters ) );
1003				$iteration_index++;
1004				foreach my $cluster (@$clusters) {
1005				my $current_cluster_center =
1006				$cluster_centers->[$current_cluster_center_index];
1007				foreach my $ele (@$cluster) {
1008				my @dist_from_clust_centers;
1009				foreach my $center (@$cluster_centers) {
1010				push @dist_from_clust_centers,
1011				$self->distance($ele, $center);
1012				}
1013				my ($min, $best_center_index) =
1014				minimum( \@dist_from_clust_centers );
1015				my $best_cluster_center =
1016				$cluster_centers->[$best_center_index];
1017				if (vector_equal($current_cluster_center,
1018				$best_cluster_center)){
1019				push @{$new_clusters->[$current_cluster_center_index]},
1020				$ele;
1021				} else {
1022				$assignment_changed_flag = 1;
1023				push @{$new_clusters->[$best_center_index]}, $ele;
1024				}
1025				}
1026				$current_cluster_center_index++;
1027				}
1028				# Now make sure that we still have K clusters since K is fixed:
1029				next if ((@$new_clusters != @$clusters) && ($iteration_index < 100));
1030				# Now make sure that none of the K clusters is an empty cluster:
1031				foreach my $newcluster (@$new_clusters) {
1032				$cluster_size_zero_condition = 1 if ((!defined $newcluster)
1033				or (@$newcluster == 0));
1034				}
1035				push @$new_clusters, (undef) x ($K - @$new_clusters)
1036				if @$new_clusters < $K;
1037				my $largest_cluster;
1038				foreach my $local_cluster (@$new_clusters) {
1039				next if !defined $local_cluster;
1040				$largest_cluster = $local_cluster if !defined $largest_cluster;
1041				if (@$local_cluster > @$largest_cluster) {
1042				$largest_cluster = $local_cluster;
1043				}
1044				}
1045				foreach my $local_cluster (@$new_clusters) {
1046				if ( (!defined $local_cluster) \|\| (@$local_cluster == 0) ) {
1047				push @$local_cluster, pop @$largest_cluster;
1048				}
1049				}
1050				next if (($cluster_size_zero_condition) && ($iteration_index < 100));
1051				last if $iteration_index == 100;
1052				# Now do a deep copy of new_clusters into clusters
1053				$clusters = deep_copy_AoA( $new_clusters );
1054				last if $assignment_changed_flag == 0;
1055				}
1056				$final_cluster_centers = $self->update_cluster_centers( $clusters );
1057				return ($clusters, $final_cluster_centers);
1058				}
1059
1060				# Used by the kmeans part of the code: After each new assignment of the data points
1061				# to the clusters on the basis of the current values for the cluster centers, we call
1062				# the routine shown here for updating the values of the cluster centers.
1063				sub update_cluster_centers {
1064				my $self = shift;
1065				my @clusters = @{ shift @_ };
1066				my @new_cluster_centers;
1067				my $largest_cluster;
1068				foreach my $cluster (@clusters) {
1069				next if !defined $cluster;
1070				$largest_cluster = $cluster if !defined $largest_cluster;
1071				if (@$cluster > @$largest_cluster) {
1072				$largest_cluster = $cluster;
1073				}
1074				}
1075				foreach my $cluster (@clusters) {
1076				if ( (!defined $cluster) \|\| (@$cluster == 0) ) {
1077				push @$cluster, pop @$largest_cluster;
1078				}
1079				}
1080				foreach my $cluster (@clusters) {
1081				die "Cluster became empty --- untenable condition " .
1082				"for a given K. Try again. " if !defined $cluster;
1083				my $cluster_size = @$cluster;
1084				die "Cluster size is zero --- untenable." if $cluster_size == 0;
1085				my @new_cluster_center = @{$self->add_point_coords( $cluster )};
1086				@new_cluster_center = map {my $x = $_/$cluster_size; $x}
1087				@new_cluster_center;
1088				push @new_cluster_centers, \@new_cluster_center;
1089				}
1090				return \@new_cluster_centers;
1091				}
1092
1093				# The following routine is for computing the distance between a data point specified
1094				# by its symbolic name in the master datafile and a point (such as the center of a
1095				# cluster) expressed as a vector of coordinates:
1096				sub distance {
1097				my $self = shift;
1098				my $ele1_id = shift @_; # symbolic name of data sample
1099				my @ele1 = @{$self->{_data}->{$ele1_id}};
1100				my @ele2 = @{shift @_};
1101				die "wrong data types for distance calculation" if @ele1 != @ele2;
1102				my $how_many = @ele1;
1103				my $squared_sum = 0;
1104				foreach my $i (0..$how_many-1) {
1105				$squared_sum += ($ele1[$i] - $ele2[$i])**2;
1106				}
1107				my $dist = sqrt $squared_sum;
1108				return $dist;
1109				}
1110
1111				# The following routine does the same as above but now both
1112				# arguments are expected to be arrays of numbers:
1113				sub distance2 {
1114				my $self = shift;
1115				my @ele1 = @{shift @_};
1116				my @ele2 = @{shift @_};
1117				die "wrong data types for distance calculation" if @ele1 != @ele2;
1118				my $how_many = @ele1;
1119				my $squared_sum = 0;
1120				foreach my $i (0..$how_many-1) {
1121				$squared_sum += ($ele1[$i] - $ele2[$i])**2;
1122				}
1123				return sqrt $squared_sum;
1124				}
1125
1126				sub write_naive_bayes_clusters_to_files {
1127				my $self = shift;
1128				my @clusters = @{$self->{_clusters}};
1129				unlink glob "naive_bayes_cluster*.dat";
1130				foreach my $i (1..@clusters) {
1131				my $filename = "naive_bayes_cluster" . $i . ".dat";
1132				print "Writing cluster $i to file $filename\n"
1133				if $self->{_terminal_output};
1134				open FILEHANDLE, "\| sort > $filename" or die "Unable to open file: $!";
1135				foreach my $ele (@{$clusters[$i-1]}) {
1136				print FILEHANDLE "$ele\n";
1137				}
1138				close FILEHANDLE;
1139				}
1140				}
1141
1142				sub write_posterior_prob_clusters_above_threshold_to_files {
1143				my $self = shift;
1144				my $theta = shift;
1145				my @class_distributions;
1146				foreach my $cluster_index (0..$self->{_K}-1) {
1147				push @class_distributions, [];
1148				}
1149				foreach my $data_tag (@{$self->{_data_id_tags}}) {
1150				foreach my $cluster_index (0..$self->{_K}-1) {
1151				push @{$class_distributions[$cluster_index]}, $data_tag
1152				if $self->{_expected_class_probs}->{$data_tag}->[$cluster_index]
1153				> $theta;
1154				}
1155				}
1156				unlink glob "posterior_prob_cluster*.dat";
1157				foreach my $i (1..@class_distributions) {
1158				my $filename = "posterior_prob_cluster" . $i . ".dat";
1159				print "Writing posterior prob cluster $i to file $filename\n"
1160				if $self->{_terminal_output};
1161				open FILEHANDLE, "\| sort > $filename" or die "Unable to open file: $!";
1162				foreach my $ele (@{$class_distributions[$i-1]}) {
1163				print FILEHANDLE "$ele\n";
1164				}
1165				close FILEHANDLE;
1166				}
1167				}
1168
1169				sub DESTROY {
1170				unlink "__temp_" . basename($_[0]->{_datafile});
1171				unlink "__temp_data_" . basename($_[0]->{_datafile});
1172				unlink "__temp2_" . basename($_[0]->{_datafile});
1173				unlink glob "__temp1dhist*";
1174				unlink glob "__contour*";
1175				}
1176
1177				############################# Visualization Code ###############################
1178
1179				# The visualize_clusters() implementation displays as a plot in your terminal window
1180				# the clusters constructed by the EM algorithm. It can show either 2D plots or
1181				# 3D plots that you can rotate interactively for better visualization. For
1182				# multidimensional data, as to which 2D or 3D dimensions are used for visualization
1183				# is controlled by the mask you must supply as an argument to the method. Should it
1184				# happen that only one on bit is specified for the mask, visualize_clusters()
1185				# aborts.
1186				#
1187				# The visualization code consists of first accessing each of clusters created by the
1188				# EM() subroutine. Note that the clusters contain only the symbolic names for the
1189				# individual records in the source data file. We therefore next reach into the
1190				# $self->{_data} hash and get the data coordinates associated with each symbolic
1191				# label in a cluster. The numerical data thus generated is then written out to a
1192				# temp file. When doing so we must remember to insert TWO BLANK LINES between the
1193				# data blocks corresponding to the different clusters. This constraint is imposed
1194				# on us by Gnuplot when plotting data from the same file since we want to use
1195				# different point styles for the data points in different cluster files.
1196				# Subsequently, we call upon the Perl interface provided by the Graphics::GnuplotIF
1197				# module to plot the data clusters.
1198				sub visualize_clusters {
1199				my $self = shift;
1200				my $v_mask;
1201				my $pause_time;
1202				if (@_ == 1) {
1203				$v_mask = shift \|\| die "visualization mask missing";
1204				} elsif (@_ == 2) {
1205				$v_mask = shift \|\| die "visualization mask missing";
1206				$pause_time = shift;
1207				} else {
1208				die "visualize_clusters() called with wrong args";
1209				}
1210				my $master_datafile = $self->{_datafile};
1211				my @v_mask = split //, $v_mask;
1212				my $visualization_mask_width = @v_mask;
1213				my $original_data_mask = $self->{_mask};
1214				my @mask = split //, $original_data_mask;
1215				my $data_field_width = scalar grep {$_ eq '1'} @mask;
1216				die "\n\nABORTED: The width of the visualization mask (including " .
1217				"all its 1s and 0s) must equal the width of the original mask " .
1218				"used for reading the data file (counting only the 1's)"
1219				if $visualization_mask_width != $data_field_width;
1220				my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
1221				# The following section is for the superimposed one-Mahalanobis-distance-unit
1222				# ellipses that are shown only for 2D plots:
1223				if ($visualization_data_field_width == 2) {
1224				foreach my $cluster_index (0..$self->{_K}-1) {
1225				my $contour_filename = "__contour_" . $cluster_index . ".dat";
1226				my $mean = $self->{_cluster_means}->[$cluster_index];
1227				my $covariance = $self->{_cluster_covariances}->[$cluster_index];
1228				my ($mux,$muy) = $mean->as_list();
1229				my ($varx,$sigmaxy) = $covariance->row(0)->as_list();
1230				my ($sigmayx,$vary) = $covariance->row(1)->as_list();
1231				die "Your covariance matrix does not look right"
1232				unless $sigmaxy == $sigmayx;
1233				my ($sigmax,$sigmay) = (sqrt($varx),sqrt($vary));
1234				my $argstring = <<"END";
1235				set contour
1236				mux = $mux
1237				muy = $muy
1238				sigmax = $sigmax
1239				sigmay = $sigmay
1240				sigmaxy = $sigmaxy
1241				determinant = (sigmax*2)(sigmay2) - sigmaxy2
1242				exponent(x,y) = -0.5 * (1.0 / determinant) * ( ((x-mux)*2)sigmay2 + ((y-muy)2)sigmax2 - 2sigmaxy(x-mux)(y-muy) )
1243				f(x,y) = exp( exponent(x,y) ) - 0.2
1244				xmax = mux + 2 * sigmax
1245				xmin = mux - 2 * sigmax
1246				ymax = muy + 2 * sigmay
1247				ymin = muy - 2 * sigmay
1248				set xrange [ xmin : xmax ]
1249				set yrange [ ymin : ymax ]
1250				set isosamples 200
1251				unset surface
1252				set cntrparam levels discrete 0
1253				set table \"$contour_filename\"
1254				splot f(x,y)
1255				unset table
1256				END
1257				my $plot = Graphics::GnuplotIF->new();
1258				$plot->gnuplot_cmd( $argstring );
1259				}
1260				}
1261				my %visualization_data;
1262				while ( my ($record_id, $data) = each %{$self->{_data}} ) {
1263				my @fields = @$data;
1264				die "\nABORTED: Visualization mask size exceeds data record size"
1265				if $#v_mask > $#fields;
1266				my @data_fields;
1267				foreach my $i (0..@fields-1) {
1268				if ($v_mask[$i] eq '0') {
1269				next;
1270				} elsif ($v_mask[$i] eq '1') {
1271				push @data_fields, $fields[$i];
1272				} else {
1273				die "Misformed visualization mask. It can only have 1s and 0s";
1274				}
1275				}
1276				$visualization_data{ $record_id } = \@data_fields;
1277				}
1278				my $K = scalar @{$self->{_clusters}};
1279				my $filename = basename($master_datafile);
1280				my $temp_file = "__temp_" . $filename;
1281				unlink $temp_file if -e $temp_file;
1282				open OUTPUT, ">$temp_file"
1283				or die "Unable to open a temp file in this directory: $!";
1284				foreach my $cluster (@{$self->{_clusters}}) {
1285				foreach my $item (@$cluster) {
1286				print OUTPUT "@{$visualization_data{$item}}";
1287				print OUTPUT "\n";
1288				}
1289				print OUTPUT "\n\n";
1290				}
1291				close OUTPUT;
1292				my $plot;
1293				if (!defined $pause_time) {
1294				$plot = Graphics::GnuplotIF->new( persist => 1 );
1295				} else {
1296				$plot = Graphics::GnuplotIF->new();
1297				}
1298				my $arg_string = "";
1299				if ($visualization_data_field_width > 2) {
1300				$plot->gnuplot_cmd("set noclip");
1301				$plot->gnuplot_cmd("set pointsize 2");
1302				foreach my $i (0..$K-1) {
1303				my $j = $i + 1;
1304				$arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster (naive Bayes) $i\" with points lt $j pt $j, ";
1305				}
1306				} elsif ($visualization_data_field_width == 2) {
1307				$plot->gnuplot_cmd("set noclip");
1308				$plot->gnuplot_cmd("set pointsize 2");
1309				foreach my $i (0..$K-1) {
1310				my $j = $i + 1;
1311				$arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster (naive Bayes) $i\" with points lt $j pt $j, ";
1312				my $ellipse_filename = "__contour_" . $i . ".dat";
1313				$arg_string .= "\"$ellipse_filename\" with line lt $j title \"\", ";
1314				}
1315				} elsif ($visualization_data_field_width == 1 ) {
1316				open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
1317				my @all_data = ;
1318				close INPUT;
1319				@all_data = map {chomp $_; $_ =~ /\d/ ? $_ : "SEPERATOR" } @all_data;
1320				my $all_joined_data = join ':', @all_data;
1321				my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data;
1322				my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals);
1323				foreach my $i (0..@separated-1) {
1324				$separated[$i] =~ s/SEPERATOR//g;
1325				my @cluster_for_hist = split /:/, $separated[$i];
1326				@cluster_for_hist = grep $_, @cluster_for_hist;
1327				my ($minval,$maxval) = minmax(\@cluster_for_hist);
1328				push @all_minvals, $minval;
1329				push @all_maxvals, $maxval;
1330				push @all_clusters_for_hist, \@cluster_for_hist;
1331				}
1332				push @all_minmaxvals, @all_minvals;
1333				push @all_minmaxvals, @all_maxvals;
1334				my ($abs_minval,$abs_maxval) = minmax(\@all_minmaxvals);
1335				my $delta = ($abs_maxval - $abs_minval) / 100.0;
1336				$plot->gnuplot_cmd("set boxwidth 3");
1337				$plot->gnuplot_cmd("set style fill solid border -1");
1338				$plot->gnuplot_cmd("set ytics out nomirror");
1339				$plot->gnuplot_cmd("set style data histograms");
1340				$plot->gnuplot_cmd("set style histogram clustered");
1341				$plot->gnuplot_cmd("set title 'Clusters shown through histograms'");
1342				$plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
1343				foreach my $cindex (0..@all_clusters_for_hist-1) {
1344				my $filename = basename($master_datafile);
1345				my $temp_file = "__temp1dhist_" . "$cindex" . "_" . $filename;
1346				unlink $temp_file if -e $temp_file;
1347				open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
1348				print OUTPUT "Xstep histval\n";
1349				my @histogram = (0) x 100;
1350				foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) {
1351				$histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++;
1352				}
1353				foreach my $i (0..@histogram-1) {
1354				print OUTPUT "$i $histogram[$i]\n";
1355				}
1356				$arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, ";
1357				close OUTPUT;
1358				}
1359				}
1360				$arg_string = $arg_string =~ /^(.*),[ ]+$/;
1361				$arg_string = $1;
1362				if ($visualization_data_field_width > 2) {
1363				$plot->gnuplot_cmd( "splot $arg_string" );
1364				$plot->gnuplot_pause( $pause_time ) if defined $pause_time;
1365				} elsif ($visualization_data_field_width == 2) {
1366				$plot->gnuplot_cmd( "plot $arg_string" );
1367				$plot->gnuplot_pause( $pause_time ) if defined $pause_time;
1368				} elsif ($visualization_data_field_width == 1) {
1369				$plot->gnuplot_cmd( "plot $arg_string" );
1370				$plot->gnuplot_pause( $pause_time ) if defined $pause_time;
1371				}
1372				}
1373
1374				# This subroutine is the same as above except that it makes PNG plots (for hardcopy
1375				# printing) of the clusters.
1376				sub plot_hardcopy_clusters {
1377				my $self = shift;
1378				my $v_mask;
1379				my $pause_time;
1380				if (@_ == 1) {
1381				$v_mask = shift \|\| die "visualization mask missing";
1382				} elsif (@_ == 2) {
1383				$v_mask = shift \|\| die "visualization mask missing";
1384				$pause_time = shift;
1385				} else {
1386				die "visualize_clusters() called with wrong args";
1387				}
1388				my $master_datafile = $self->{_datafile};
1389				my @v_mask = split //, $v_mask;
1390				my $visualization_mask_width = @v_mask;
1391				my $original_data_mask = $self->{_mask};
1392				my @mask = split //, $original_data_mask;
1393				my $data_field_width = scalar grep {$_ eq '1'} @mask;
1394				die "\n\nABORTED: The width of the visualization mask (including " .
1395				"all its 1s and 0s) must equal the width of the original mask " .
1396				"used for reading the data file (counting only the 1's)"
1397				if $visualization_mask_width != $data_field_width;
1398				my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
1399				if ($visualization_data_field_width == 2) {
1400				foreach my $cluster_index (0..$self->{_K}-1) {
1401				my $contour_filename = "__contour_" . $cluster_index . ".dat";
1402				my $mean = $self->{_cluster_means}->[$cluster_index];
1403				my $covariance = $self->{_cluster_covariances}->[$cluster_index];
1404				my ($mux,$muy) = $mean->as_list();
1405				my ($varx,$sigmaxy) = $covariance->row(0)->as_list();
1406				my ($sigmayx,$vary) = $covariance->row(1)->as_list();
1407				die "Your covariance matrix does not look right"
1408				unless $sigmaxy == $sigmayx;
1409				my ($sigmax,$sigmay) = (sqrt($varx),sqrt($vary));
1410				my $argstring = <<"END";
1411				set contour
1412				mux = $mux
1413				muy = $muy
1414				sigmax = $sigmax
1415				sigmay = $sigmay
1416				sigmaxy = $sigmaxy
1417				determinant = (sigmax*2)(sigmay2) - sigmaxy2
1418				exponent(x,y) = -0.5 * (1.0 / determinant) * ( ((x-mux)*2)sigmay2 + ((y-muy)2)sigmax2 - 2sigmaxy(x-mux)(y-muy) )
1419				f(x,y) = exp( exponent(x,y) ) - 0.2
1420				xmax = mux + 2 * sigmax
1421				xmin = mux - 2 * sigmax
1422				ymax = muy + 2 * sigmay
1423				ymin = muy - 2 * sigmay
1424				set xrange [ xmin : xmax ]
1425				set yrange [ ymin : ymax ]
1426				set isosamples 200
1427				unset surface
1428				set cntrparam levels discrete 0
1429				set table \"$contour_filename\"
1430				splot f(x,y)
1431				unset table
1432				END
1433				my $plot = Graphics::GnuplotIF->new();
1434				$plot->gnuplot_cmd( $argstring );
1435				}
1436				}
1437				my %visualization_data;
1438				while ( my ($record_id, $data) = each %{$self->{_data}} ) {
1439				my @fields = @$data;
1440				die "\nABORTED: Visualization mask size exceeds data record size"
1441				if $#v_mask > $#fields;
1442				my @data_fields;
1443				foreach my $i (0..@fields-1) {
1444				if ($v_mask[$i] eq '0') {
1445				next;
1446				} elsif ($v_mask[$i] eq '1') {
1447				push @data_fields, $fields[$i];
1448				} else {
1449				die "Misformed visualization mask. It can only have 1s and 0s";
1450				}
1451				}
1452				$visualization_data{ $record_id } = \@data_fields;
1453				}
1454				my $K = scalar @{$self->{_clusters}};
1455				my $filename = basename($master_datafile);
1456				my $temp_file = "__temp_" . $filename;
1457				unlink $temp_file if -e $temp_file;
1458				open OUTPUT, ">$temp_file"
1459				or die "Unable to open a temp file in this directory: $!";
1460				foreach my $cluster (@{$self->{_clusters}}) {
1461				foreach my $item (@$cluster) {
1462				print OUTPUT "@{$visualization_data{$item}}";
1463				print OUTPUT "\n";
1464				}
1465				print OUTPUT "\n\n";
1466				}
1467				close OUTPUT;
1468				my $plot;
1469				if (!defined $pause_time) {
1470				$plot = Graphics::GnuplotIF->new( persist => 1 );
1471				} else {
1472				$plot = Graphics::GnuplotIF->new();
1473				}
1474				my $arg_string = "";
1475				if ($visualization_data_field_width > 2) {
1476				$plot->gnuplot_cmd( "set noclip" );
1477				$plot->gnuplot_cmd( "set pointsize 2" );
1478				foreach my $i (0..$K-1) {
1479				my $j = $i + 1;
1480				$arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster (naive Bayes) $i\" with points lt $j pt $j, ";
1481				}
1482				} elsif ($visualization_data_field_width == 2) {
1483				$plot->gnuplot_cmd( "set noclip" );
1484				$plot->gnuplot_cmd( "set pointsize 2" );
1485				foreach my $i (0..$K-1) {
1486				my $j = $i + 1;
1487				$arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster (naive Bayes) $i\" with points lt $j pt $j, ";
1488				my $ellipse_filename = "__contour_" . $i . ".dat";
1489				$arg_string .= "\"$ellipse_filename\" with line lt $j title \"\", ";
1490				}
1491				} elsif ($visualization_data_field_width == 1 ) {
1492				open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
1493				my @all_data = ;
1494				close INPUT;
1495				@all_data = map {chomp $_; $_ =~ /\d/ ? $_ : "SEPERATOR" } @all_data;
1496				my $all_joined_data = join ':', @all_data;
1497				my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data;
1498				my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals);
1499				foreach my $i (0..@separated-1) {
1500				$separated[$i] =~ s/SEPERATOR//g;
1501				my @cluster_for_hist = split /:/, $separated[$i];
1502				@cluster_for_hist = grep $_, @cluster_for_hist;
1503				my ($minval,$maxval) = minmax(\@cluster_for_hist);
1504				push @all_minvals, $minval;
1505				push @all_maxvals, $maxval;
1506				push @all_clusters_for_hist, \@cluster_for_hist;
1507				}
1508				push @all_minmaxvals, @all_minvals;
1509				push @all_minmaxvals, @all_maxvals;
1510				my ($abs_minval,$abs_maxval) = minmax(\@all_minmaxvals);
1511				my $delta = ($abs_maxval - $abs_minval) / 100.0;
1512				$plot->gnuplot_cmd("set boxwidth 3");
1513				$plot->gnuplot_cmd("set style fill solid border -1");
1514				$plot->gnuplot_cmd("set ytics out nomirror");
1515				$plot->gnuplot_cmd("set style data histograms");
1516				$plot->gnuplot_cmd("set style histogram clustered");
1517				$plot->gnuplot_cmd("set title 'Clusters shown through histograms'");
1518				$plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
1519				foreach my $cindex (0..@all_clusters_for_hist-1) {
1520				my $filename = basename($master_datafile);
1521				my $temp_file = "__temp1dhist_" . "$cindex" . "_" . $filename;
1522				unlink $temp_file if -e $temp_file;
1523				open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
1524				print OUTPUT "Xstep histval\n";
1525				my @histogram = (0) x 100;
1526				foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) {
1527				$histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++;
1528				}
1529				foreach my $i (0..@histogram-1) {
1530				print OUTPUT "$i $histogram[$i]\n";
1531				}
1532				# $arg_string .= "\"$temp_file\" using 1:2 ti col smooth frequency with boxes lc $cindex, ";
1533				$arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, ";
1534				close OUTPUT;
1535				}
1536				}
1537				$arg_string = $arg_string =~ /^(.*),[ ]+$/;
1538				$arg_string = $1;
1539				if ($visualization_data_field_width > 2) {
1540				$plot->gnuplot_cmd( 'set terminal png color',
1541				'set output "cluster_plot.png"');
1542				$plot->gnuplot_cmd( "splot $arg_string" );
1543				} elsif ($visualization_data_field_width == 2) {
1544				$plot->gnuplot_cmd('set terminal png',
1545				'set output "cluster_plot.png"');
1546				$plot->gnuplot_cmd( "plot $arg_string" );
1547				} elsif ($visualization_data_field_width == 1) {
1548				$plot->gnuplot_cmd('set terminal png',
1549				'set output "cluster_plot.png"');
1550				$plot->gnuplot_cmd( "plot $arg_string" );
1551				}
1552				}
1553
1554				# This method is for the visualization of the posterior class distributions. In
1555				# other words, this method allows us to see the soft clustering produced by the EM
1556				# algorithm. While much of the gnuplot logic here is the same as in the
1557				# visualize_clusters() method, there are significant differences in how the data is
1558				# pooled for the purpose of display.
1559				sub visualize_distributions {
1560				my $self = shift;
1561				my $v_mask;
1562				my $pause_time;
1563				if (@_ == 1) {
1564				$v_mask = shift \|\| die "visualization mask missing";
1565				} elsif (@_ == 2) {
1566				$v_mask = shift \|\| die "visualization mask missing";
1567				$pause_time = shift;
1568				} else {
1569				die "visualize_distributions() called with wrong args";
1570				}
1571				my @v_mask = split //, $v_mask;
1572				my $visualization_mask_width = @v_mask;
1573				my $original_data_mask = $self->{_mask};
1574				my @mask = split //, $original_data_mask;
1575				my $data_field_width = scalar grep {$_ eq '1'} @mask;
1576				die "\n\nABORTED: The width of the visualization mask (including " .
1577				"all its 1s and 0s) must equal the width of the original mask " .
1578				"used for reading the data file (counting only the 1's)"
1579				if $visualization_mask_width != $data_field_width;
1580				my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
1581				if ($visualization_data_field_width == 2) {
1582				foreach my $cluster_index (0..$self->{_K}-1) {
1583				my $contour_filename = "__contour2_" . $cluster_index . ".dat";
1584				my $mean = $self->{_cluster_means}->[$cluster_index];
1585				my $covariance = $self->{_cluster_covariances}->[$cluster_index];
1586				my ($mux,$muy) = $mean->as_list();
1587				my ($varx,$sigmaxy) = $covariance->row(0)->as_list();
1588				my ($sigmayx,$vary) = $covariance->row(1)->as_list();
1589				die "Your covariance matrix does not look right"
1590				unless $sigmaxy == $sigmayx;
1591				my ($sigmax,$sigmay) = (sqrt($varx),sqrt($vary));
1592				my $argstring = <<"END";
1593				set contour
1594				mux = $mux
1595				muy = $muy
1596				sigmax = $sigmax
1597				sigmay = $sigmay
1598				sigmaxy = $sigmaxy
1599				determinant = (sigmax*2)(sigmay2) - sigmaxy2
1600				exponent(x,y) = -0.5 * (1.0 / determinant) * ( ((x-mux)*2)sigmay2 + ((y-muy)2)sigmax2 - 2sigmaxy(x-mux)(y-muy) )
1601				f(x,y) = exp( exponent(x,y) ) - 0.2
1602				xmax = mux + 2 * sigmax
1603				xmin = mux - 2 * sigmax
1604				ymax = muy + 2 * sigmay
1605				ymin = muy - 2 * sigmay
1606				set xrange [ xmin : xmax ]
1607				set yrange [ ymin : ymax ]
1608				set isosamples 200
1609				unset surface
1610				set cntrparam levels discrete 0
1611				set table \"$contour_filename\"
1612				splot f(x,y)
1613				unset table
1614				END
1615				my $plot = Graphics::GnuplotIF->new();
1616				$plot->gnuplot_cmd( $argstring );
1617				}
1618				}
1619				my %visualization_data;
1620				while ( my ($record_id, $data) = each %{$self->{_data}} ) {
1621				my @fields = @$data;
1622				die "\nABORTED: Visualization mask size exceeds data record size"
1623				if $#v_mask > $#fields;
1624				my @data_fields;
1625				foreach my $i (0..@fields-1) {
1626				if ($v_mask[$i] eq '0') {
1627				next;
1628				} elsif ($v_mask[$i] eq '1') {
1629				push @data_fields, $fields[$i];
1630				} else {
1631				die "Misformed visualization mask. It can only have 1s and 0s";
1632				}
1633				}
1634				$visualization_data{ $record_id } = \@data_fields;
1635				}
1636				my $filename = basename($self->{_datafile});
1637				my $temp_file = "__temp2_" . $filename;
1638				unlink $temp_file if -e $temp_file;
1639				open OUTPUT, ">$temp_file"
1640				or die "Unable to open a temp file in this directory: $!";
1641				my @class_distributions;
1642				foreach my $cluster_index (0..$self->{_K}-1) {
1643				push @class_distributions, [];
1644				}
1645				foreach my $data_tag (@{$self->{_data_id_tags}}) {
1646				foreach my $cluster_index (0..$self->{_K}-1) {
1647				push @{$class_distributions[$cluster_index]}, $self->{_data}->{$data_tag}
1648				if $self->{_expected_class_probs}->{$data_tag}->[$cluster_index] > 0.2;
1649				}
1650				}
1651				foreach my $distribution (@class_distributions) {
1652				foreach my $item (@$distribution) {
1653				print OUTPUT "@$item";
1654				print OUTPUT "\n";
1655				}
1656				print OUTPUT "\n\n";
1657				}
1658				close OUTPUT;
1659				my $plot;
1660				if (!defined $pause_time) {
1661				$plot = Graphics::GnuplotIF->new( persist => 1 );
1662				} else {
1663				$plot = Graphics::GnuplotIF->new();
1664				}
1665				my $arg_string = "";
1666				if ($visualization_data_field_width > 2) {
1667				$plot->gnuplot_cmd( "set noclip" );
1668				$plot->gnuplot_cmd( "set pointsize 2" );
1669				foreach my $i (0..$self->{_K}-1) {
1670				my $j = $i + 1;
1671				$arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster $i (based on posterior probs)\" with points lt $j pt $j, ";
1672				}
1673				} elsif ($visualization_data_field_width == 2) {
1674				$plot->gnuplot_cmd( "set noclip" );
1675				$plot->gnuplot_cmd( "set pointsize 2" );
1676				foreach my $i (0..$self->{_K}-1) {
1677				my $j = $i + 1;
1678				$arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster $i (based on posterior probs)\" with points lt $j pt $j, ";
1679				my $ellipse_filename = "__contour2_" . $i . ".dat";
1680				$arg_string .= "\"$ellipse_filename\" with line lt $j title \"\", ";
1681				}
1682				} elsif ($visualization_data_field_width == 1 ) {
1683				open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
1684				my @all_data = ;
1685				close INPUT;
1686				@all_data = map {chomp $_; $_ =~ /\d/ ? $_ : "SEPERATOR" } @all_data;
1687				my $all_joined_data = join ':', @all_data;
1688				my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data;
1689				my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals);
1690				foreach my $i (0..@separated-1) {
1691				$separated[$i] =~ s/SEPERATOR//g;
1692				my @cluster_for_hist = split /:/, $separated[$i];
1693				@cluster_for_hist = grep $_, @cluster_for_hist;
1694				my ($minval,$maxval) = minmax(\@cluster_for_hist);
1695				push @all_minvals, $minval;
1696				push @all_maxvals, $maxval;
1697				push @all_clusters_for_hist, \@cluster_for_hist;
1698				}
1699				push @all_minmaxvals, @all_minvals;
1700				push @all_minmaxvals, @all_maxvals;
1701				my ($abs_minval,$abs_maxval) = minmax(\@all_minmaxvals);
1702				my $delta = ($abs_maxval - $abs_minval) / 100.0;
1703				$plot->gnuplot_cmd("set boxwidth 3");
1704				$plot->gnuplot_cmd("set style fill solid border -1");
1705				$plot->gnuplot_cmd("set ytics out nomirror");
1706				$plot->gnuplot_cmd("set style data histograms");
1707				$plot->gnuplot_cmd("set style histogram clustered");
1708				$plot->gnuplot_cmd("set title 'Individual distributions shown through histograms'");
1709				$plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
1710				foreach my $cindex (0..@all_clusters_for_hist-1) {
1711				my $localfilename = basename($filename);
1712				my $temp_file = "__temp1dhist_" . "$cindex" . "_" . $localfilename;
1713				unlink $temp_file if -e $temp_file;
1714				open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
1715				print OUTPUT "Xstep histval\n";
1716
1717				my @histogram = (0) x 100;
1718				foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) {
1719				$histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++;
1720				}
1721				foreach my $i (0..@histogram-1) {
1722				print OUTPUT "$i $histogram[$i]\n";
1723				}
1724				# $arg_string .= "\"$temp_file\" using 1:2 ti col smooth frequency with boxes lc $cindex, ";
1725				$arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, ";
1726				close OUTPUT;
1727				}
1728				}
1729				$arg_string = $arg_string =~ /^(.*),[ ]+$/;
1730				$arg_string = $1;
1731				if ($visualization_data_field_width > 2) {
1732				$plot->gnuplot_cmd( "splot $arg_string" );
1733				$plot->gnuplot_pause( $pause_time ) if defined $pause_time;
1734				} elsif ($visualization_data_field_width == 2) {
1735				$plot->gnuplot_cmd( "plot $arg_string" );
1736				$plot->gnuplot_pause( $pause_time ) if defined $pause_time;
1737				} elsif ($visualization_data_field_width == 1) {
1738				$plot->gnuplot_cmd( "plot $arg_string" );
1739				$plot->gnuplot_pause( $pause_time ) if defined $pause_time;
1740				}
1741				}
1742
1743				# This method is basically the same as the previous method, except that it is
1744				# intended for making PNG files from the distributions.
1745				sub plot_hardcopy_distributions {
1746				my $self = shift;
1747				my $v_mask;
1748				my $pause_time;
1749				if (@_ == 1) {
1750				$v_mask = shift \|\| die "visualization mask missing";
1751				} elsif (@_ == 2) {
1752				$v_mask = shift \|\| die "visualization mask missing";
1753				$pause_time = shift;
1754				} else {
1755				die "visualize_distributions() called with wrong args";
1756				}
1757				my @v_mask = split //, $v_mask;
1758				my $visualization_mask_width = @v_mask;
1759				my $original_data_mask = $self->{_mask};
1760				my @mask = split //, $original_data_mask;
1761				my $data_field_width = scalar grep {$_ eq '1'} @mask;
1762				die "\n\nABORTED: The width of the visualization mask (including " .
1763				"all its 1s and 0s) must equal the width of the original mask " .
1764				"used for reading the data file (counting only the 1's)"
1765				if $visualization_mask_width != $data_field_width;
1766				my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
1767				if ($visualization_data_field_width == 2) {
1768				foreach my $cluster_index (0..$self->{_K}-1) {
1769				my $contour_filename = "__contour2_" . $cluster_index . ".dat";
1770				my $mean = $self->{_cluster_means}->[$cluster_index];
1771				my $covariance = $self->{_cluster_covariances}->[$cluster_index];
1772				my ($mux,$muy) = $mean->as_list();
1773				my ($varx,$sigmaxy) = $covariance->row(0)->as_list();
1774				my ($sigmayx,$vary) = $covariance->row(1)->as_list();
1775				die "Your covariance matrix does not look right"
1776				unless $sigmaxy == $sigmayx;
1777				my ($sigmax,$sigmay) = (sqrt($varx),sqrt($vary));
1778				my $argstring = <<"END";
1779				set contour
1780				mux = $mux
1781				muy = $muy
1782				sigmax = $sigmax
1783				sigmay = $sigmay
1784				sigmaxy = $sigmaxy
1785				determinant = (sigmax*2)(sigmay2) - sigmaxy2
1786				exponent(x,y) = -0.5 * (1.0 / determinant) * ( ((x-mux)*2)sigmay2 + ((y-muy)2)sigmax2 - 2sigmaxy(x-mux)(y-muy) )
1787				f(x,y) = exp( exponent(x,y) ) - 0.2
1788				xmax = mux + 2 * sigmax
1789				xmin = mux - 2 * sigmax
1790				ymax = muy + 2 * sigmay
1791				ymin = muy - 2 * sigmay
1792				set xrange [ xmin : xmax ]
1793				set yrange [ ymin : ymax ]
1794				set isosamples 200
1795				unset surface
1796				set cntrparam levels discrete 0
1797				set table \"$contour_filename\"
1798				splot f(x,y)
1799				unset table
1800				END
1801				my $plot = Graphics::GnuplotIF->new();
1802				$plot->gnuplot_cmd( $argstring );
1803				}
1804				}
1805				my %visualization_data;
1806				while ( my ($record_id, $data) = each %{$self->{_data}} ) {
1807				my @fields = @$data;
1808				die "\nABORTED: Visualization mask size exceeds data record size"
1809				if $#v_mask > $#fields;
1810				my @data_fields;
1811				foreach my $i (0..@fields-1) {
1812				if ($v_mask[$i] eq '0') {
1813				next;
1814				} elsif ($v_mask[$i] eq '1') {
1815				push @data_fields, $fields[$i];
1816				} else {
1817				die "Misformed visualization mask. It can only have 1s and 0s";
1818				}
1819				}
1820				$visualization_data{ $record_id } = \@data_fields;
1821				}
1822				my $filename = basename($self->{_datafile});
1823				my $temp_file = "__temp2_" . $filename;
1824				unlink $temp_file if -e $temp_file;
1825				open OUTPUT, ">$temp_file"
1826				or die "Unable to open a temp file in this directory: $!";
1827				my @class_distributions;
1828				foreach my $cluster_index (0..$self->{_K}-1) {
1829				push @class_distributions, [];
1830				}
1831				foreach my $data_tag (@{$self->{_data_id_tags}}) {
1832				foreach my $cluster_index (0..$self->{_K}-1) {
1833				push @{$class_distributions[$cluster_index]}, $self->{_data}->{$data_tag}
1834				if $self->{_expected_class_probs}->{$data_tag}->[$cluster_index] > 0.2;
1835				}
1836				}
1837				foreach my $distribution (@class_distributions) {
1838				foreach my $item (@$distribution) {
1839				print OUTPUT "@$item";
1840				print OUTPUT "\n";
1841				}
1842				print OUTPUT "\n\n";
1843				}
1844				close OUTPUT;
1845				my $plot;
1846				if (!defined $pause_time) {
1847				$plot = Graphics::GnuplotIF->new( persist => 1 );
1848				} else {
1849				$plot = Graphics::GnuplotIF->new();
1850				}
1851				my $arg_string = "";
1852				if ($visualization_data_field_width > 2) {
1853				$plot->gnuplot_cmd( "set noclip" );
1854				$plot->gnuplot_cmd( "set pointsize 2" );
1855				foreach my $i (0..$self->{_K}-1) {
1856				my $j = $i + 1;
1857				$arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster $i (based on posterior probs)\" with points lt $j pt $j, ";
1858				}
1859				} elsif ($visualization_data_field_width == 2) {
1860				$plot->gnuplot_cmd( "set noclip" );
1861				$plot->gnuplot_cmd( "set pointsize 2" );
1862				foreach my $i (0..$self->{_K}-1) {
1863				my $j = $i + 1;
1864				$arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster $i (based on posterior probs)\" with points lt $j pt $j, ";
1865				my $ellipse_filename = "__contour2_" . $i . ".dat";
1866				$arg_string .= "\"$ellipse_filename\" with line lt $j title \"\", ";
1867				}
1868				} elsif ($visualization_data_field_width == 1 ) {
1869				open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
1870				my @all_data = ;
1871				close INPUT;
1872				@all_data = map {chomp $_; $_ =~ /\d/ ? $_ : "SEPERATOR" } @all_data;
1873				my $all_joined_data = join ':', @all_data;
1874				my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data;
1875				my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals);
1876				foreach my $i (0..@separated-1) {
1877				$separated[$i] =~ s/SEPERATOR//g;
1878				my @cluster_for_hist = split /:/, $separated[$i];
1879				@cluster_for_hist = grep $_, @cluster_for_hist;
1880				my ($minval,$maxval) = minmax(\@cluster_for_hist);
1881				push @all_minvals, $minval;
1882				push @all_maxvals, $maxval;
1883				push @all_clusters_for_hist, \@cluster_for_hist;
1884				}
1885				push @all_minmaxvals, @all_minvals;
1886				push @all_minmaxvals, @all_maxvals;
1887				my ($abs_minval,$abs_maxval) = minmax(\@all_minmaxvals);
1888				my $delta = ($abs_maxval - $abs_minval) / 100.0;
1889				$plot->gnuplot_cmd("set boxwidth 3");
1890				$plot->gnuplot_cmd("set style fill solid border -1");
1891				$plot->gnuplot_cmd("set ytics out nomirror");
1892				$plot->gnuplot_cmd("set style data histograms");
1893				$plot->gnuplot_cmd("set style histogram clustered");
1894				$plot->gnuplot_cmd("set title 'Individual distributions shown through histograms'");
1895				$plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
1896				foreach my $cindex (0..@all_clusters_for_hist-1) {
1897				my $localfilename = basename($filename);
1898				my $temp_file = "__temp1dhist_" . "$cindex" . "_" . $localfilename;
1899				unlink $temp_file if -e $temp_file;
1900				open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
1901				print OUTPUT "Xstep histval\n";
1902
1903				my @histogram = (0) x 100;
1904				foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) {
1905				$histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++;
1906				}
1907				foreach my $i (0..@histogram-1) {
1908				print OUTPUT "$i $histogram[$i]\n";
1909				}
1910				$arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, ";
1911				close OUTPUT;
1912				}
1913				}
1914				$arg_string = $arg_string =~ /^(.*),[ ]+$/;
1915				$arg_string = $1;
1916
1917				if ($visualization_data_field_width > 2) {
1918				$plot->gnuplot_cmd( 'set terminal png',
1919				'set output "posterior_prob_plot.png"');
1920				$plot->gnuplot_cmd( "splot $arg_string" );
1921				} elsif ($visualization_data_field_width == 2) {
1922				$plot->gnuplot_cmd( 'set terminal png',
1923				'set output "posterior_prob_plot.png"');
1924				$plot->gnuplot_cmd( "plot $arg_string" );
1925				} elsif ($visualization_data_field_width == 1) {
1926				$plot->gnuplot_cmd( 'set terminal png',
1927				'set output "posterior_prob_plot.png"');
1928				$plot->gnuplot_cmd( "plot $arg_string" );
1929				}
1930				}
1931
1932				# The method shown below should be called only AFTER you have called the method
1933				# read_data_from_file(). The visualize_data() is meant for the visualization of the
1934				# original data in its various 2D or 3D subspaces.
1935				sub visualize_data {
1936				my $self = shift;
1937				my $v_mask = shift \|\| die "visualization mask missing";
1938
1939				my $master_datafile = $self->{_datafile};
1940
1941				my @v_mask = split //, $v_mask;
1942				my $visualization_mask_width = @v_mask;
1943				my $original_data_mask = $self->{_mask};
1944				my @mask = split //, $original_data_mask;
1945				my $data_field_width = scalar grep {$_ eq '1'} @mask;
1946				die "\n\nABORTED: The width of the visualization mask (including " .
1947				"all its 1s and 0s) must equal the width of the original mask " .
1948				"used for reading the data file (counting only the 1's)"
1949				if $visualization_mask_width != $data_field_width;
1950				my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
1951				my %visualization_data;
1952				my $data_source = $self->{_data};
1953				while ( my ($record_id, $data) = each %{$data_source} ) {
1954				my @fields = @$data;
1955				die "\nABORTED: Visualization mask size exceeds data record size"
1956				if $#v_mask > $#fields;
1957				my @data_fields;
1958				foreach my $i (0..@fields-1) {
1959				if ($v_mask[$i] eq '0') {
1960				next;
1961				} elsif ($v_mask[$i] eq '1') {
1962				push @data_fields, $fields[$i];
1963				} else {
1964				die "Misformed visualization mask. It can only have 1s and 0s";
1965				}
1966				}
1967				$visualization_data{ $record_id } = \@data_fields;
1968				}
1969				my $filename = basename($master_datafile);
1970				my $temp_file;
1971				$temp_file = "__temp_data_" . $filename;
1972				unlink $temp_file if -e $temp_file;
1973				open OUTPUT, ">$temp_file"
1974				or die "Unable to open a temp file in this directory: $!";
1975				foreach my $datapoint (values %visualization_data) {
1976				print OUTPUT "@$datapoint";
1977				print OUTPUT "\n";
1978				}
1979				close OUTPUT;
1980				my $plot = Graphics::GnuplotIF->new( persist => 1 );
1981				$plot->gnuplot_cmd( "set noclip" );
1982				$plot->gnuplot_cmd( "set pointsize 2" );
1983				my $plot_title = '"original data provided for EM"';
1984				my $arg_string ;
1985				if ($visualization_data_field_width > 2) {
1986				$arg_string = "\"$temp_file\" using 1:2:3 title $plot_title with points lt -1 pt 1";
1987				} elsif ($visualization_data_field_width == 2) {
1988				$arg_string = "\"$temp_file\" using 1:2 title $plot_title with points lt -1 pt 1";
1989				} elsif ($visualization_data_field_width == 1 ) {
1990				open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
1991				my @all_data = ;
1992				close INPUT;
1993				@all_data = map {chomp $_; $_} @all_data;
1994				@all_data = grep $_, @all_data;
1995				my ($minval,$maxval) = minmax(\@all_data);
1996				my $delta = ($maxval - $minval) / 100.0;
1997				$plot->gnuplot_cmd("set boxwidth 3");
1998				$plot->gnuplot_cmd("set style fill solid border -1");
1999				$plot->gnuplot_cmd("set ytics out nomirror");
2000				$plot->gnuplot_cmd("set style data histograms");
2001				$plot->gnuplot_cmd("set style histogram clustered");
2002				$plot->gnuplot_cmd("set title 'Overall distribution of 1D data'");
2003				$plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
2004				my $localfilename = basename($filename);
2005				my $temp_file = "__temp1dhist_" . $localfilename;
2006				unlink $temp_file if -e $temp_file;
2007				open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
2008				print OUTPUT "Xstep histval\n";
2009				my @histogram = (0) x 100;
2010				foreach my $i (0..@all_data-1) {
2011				$histogram[int( ($all_data[$i] - $minval) / $delta )]++;
2012				}
2013				foreach my $i (0..@histogram-1) {
2014				print OUTPUT "$i $histogram[$i]\n";
2015				}
2016				$arg_string = "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc rgb 'green'";
2017				close OUTPUT;
2018				}
2019				if ($visualization_data_field_width > 2) {
2020				$plot->gnuplot_cmd( "splot $arg_string" );
2021				} elsif ($visualization_data_field_width == 2) {
2022				$plot->gnuplot_cmd( "plot $arg_string" );
2023				} elsif ($visualization_data_field_width == 1) {
2024				$plot->gnuplot_cmd( "plot $arg_string" );
2025				}
2026				}
2027
2028				# This method is the same as the one shown above, except that it is meant for
2029				# creating PNG files of the displays.
2030				sub plot_hardcopy_data {
2031				my $self = shift;
2032				my $v_mask = shift \|\| die "visualization mask missing";
2033				my $master_datafile = $self->{_datafile};
2034				my @v_mask = split //, $v_mask;
2035				my $visualization_mask_width = @v_mask;
2036				my $original_data_mask = $self->{_mask};
2037				my @mask = split //, $original_data_mask;
2038				my $data_field_width = scalar grep {$_ eq '1'} @mask;
2039				die "\n\nABORTED: The width of the visualization mask (including " .
2040				"all its 1s and 0s) must equal the width of the original mask " .
2041				"used for reading the data file (counting only the 1's)"
2042				if $visualization_mask_width != $data_field_width;
2043				my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
2044				my %visualization_data;
2045				my $data_source = $self->{_data};
2046				while ( my ($record_id, $data) = each %{$data_source} ) {
2047				my @fields = @$data;
2048				die "\nABORTED: Visualization mask size exceeds data record size"
2049				if $#v_mask > $#fields;
2050				my @data_fields;
2051				foreach my $i (0..@fields-1) {
2052				if ($v_mask[$i] eq '0') {
2053				next;
2054				} elsif ($v_mask[$i] eq '1') {
2055				push @data_fields, $fields[$i];
2056				} else {
2057				die "Misformed visualization mask. It can only have 1s and 0s";
2058				}
2059				}
2060				$visualization_data{ $record_id } = \@data_fields;
2061				}
2062				my $filename = basename($master_datafile);
2063				my $temp_file;
2064				$temp_file = "__temp_data_" . $filename;
2065				unlink $temp_file if -e $temp_file;
2066				open OUTPUT, ">$temp_file"
2067				or die "Unable to open a temp file in this directory: $!";
2068				foreach my $datapoint (values %visualization_data) {
2069				print OUTPUT "@$datapoint";
2070				print OUTPUT "\n";
2071				}
2072				close OUTPUT;
2073
2074				my $plot = Graphics::GnuplotIF->new( persist => 1 );
2075				$plot->gnuplot_cmd( "set noclip" );
2076				$plot->gnuplot_cmd( "set pointsize 2" );
2077				my $plot_title = '"original data provided for EM"';
2078				my $arg_string ;
2079				if ($visualization_data_field_width > 2) {
2080				$arg_string = "\"$temp_file\" using 1:2:3 title $plot_title with points lt -1 pt 1";
2081				} elsif ($visualization_data_field_width == 2) {
2082				$arg_string = "\"$temp_file\" using 1:2 title $plot_title with points lt -1 pt 1";
2083				} elsif ($visualization_data_field_width == 1 ) {
2084				open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
2085				my @all_data = ;
2086				close INPUT;
2087				@all_data = map {chomp $_; $_} @all_data;
2088				@all_data = grep $_, @all_data;
2089				my ($minval,$maxval) = minmax(\@all_data);
2090				my $delta = ($maxval - $minval) / 100.0;
2091				$plot->gnuplot_cmd("set boxwidth 3");
2092				$plot->gnuplot_cmd("set style fill solid border -1");
2093				$plot->gnuplot_cmd("set ytics out nomirror");
2094				$plot->gnuplot_cmd("set style data histograms");
2095				$plot->gnuplot_cmd("set style histogram clustered");
2096				$plot->gnuplot_cmd("set title 'Overall distribution of 1D data'");
2097				$plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
2098				my $localfilename = basename($filename);
2099				my $temp_file = "__temp1dhist_" . $localfilename;
2100				unlink $temp_file if -e $temp_file;
2101				open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
2102				print OUTPUT "Xstep histval\n";
2103				my @histogram = (0) x 100;
2104				foreach my $i (0..@all_data-1) {
2105				$histogram[int( ($all_data[$i] - $minval) / $delta )]++;
2106				}
2107				foreach my $i (0..@histogram-1) {
2108				print OUTPUT "$i $histogram[$i]\n";
2109				}
2110				$arg_string = "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc rgb 'green'";
2111				close OUTPUT;
2112				}
2113				if ($visualization_data_field_width > 2) {
2114				$plot->gnuplot_cmd( 'set terminal png',
2115				'set output "data_scatter_plot.png"');
2116				$plot->gnuplot_cmd( "splot $arg_string" );
2117				} elsif ($visualization_data_field_width == 2) {
2118				$plot->gnuplot_cmd( 'set terminal png',
2119				'set output "data_scatter_plot.png"');
2120				$plot->gnuplot_cmd( "plot $arg_string" );
2121				} elsif ($visualization_data_field_width == 1) {
2122				$plot->gnuplot_cmd( 'set terminal png',
2123				'set output "data_scatter_plot.png"');
2124				$plot->gnuplot_cmd( "plot $arg_string" );
2125				}
2126				}
2127
2128
2129				################### Generating Synthetic Data for Clustering ###################
2130
2131				# The data generated corresponds to a multivariate distribution. The mean and the
2132				# covariance of each Gaussian in the distribution are specified individually in a
2133				# parameter file. The parameter file must also state the prior probabilities to be
2134				# associated with each Gaussian. See the example parameter file param1.txt in the
2135				# examples directory. Just edit this file for your own needs.
2136				#
2137				# The multivariate random numbers are generated by calling the Math::Random module.
2138				# As you would expect, that module will insist that the covariance matrix you
2139				# specify be symmetric and positive definite.
2140				sub cluster_data_generator {
2141				my $class = shift;
2142				die "illegal call of a class method"
2143				unless $class eq 'Algorithm::ExpectationMaximization';
2144				my %args = @_;
2145				my $input_parameter_file = $args{input_parameter_file};
2146				my $output_file = $args{output_datafile};
2147				my $N = $args{total_number_of_data_points};
2148				my @all_params;
2149				my $param_string;
2150				if (defined $input_parameter_file) {
2151				open INPUT, $input_parameter_file \|\| "unable to open parameter file: $!";
2152				@all_params = ;
2153				@all_params = grep { $_ !~ /^[ ]*#/ } @all_params;
2154				chomp @all_params;
2155				$param_string = join ' ', @all_params;
2156				} else {
2157				# Just for testing. Used in t/test.t
2158				$param_string = "priors 0.34 0.33 0.33 " .
2159				"cluster 5 0 0 1 0 0 0 1 0 0 0 1 " .
2160				"cluster 0 5 0 1 0 0 0 1 0 0 0 1 " .
2161				"cluster 0 0 5 1 0 0 0 1 0 0 0 1";
2162				}
2163				my ($priors_string) = $param_string =~ /priors(.+?)cluster/;
2164				croak "You did not specify the prior probabilities in the parameter file"
2165				unless $priors_string;
2166				my @priors = split /\s+/, $priors_string;
2167				@priors = grep {/$_num_regex/; $_} @priors;
2168				my $sum = 0;
2169				foreach my $prior (@priors) {
2170				$sum += $prior;
2171				}
2172				croak "Your priors in the parameter file do not add up to 1" unless $sum == 1;
2173				my ($rest_of_string) = $param_string =~ /priors\s$priors_string(.)$/;
2174				my @cluster_strings = split /[ ]cluster[ ]/, $rest_of_string;
2175				@cluster_strings = grep $_, @cluster_strings;
2176
2177				my $K = @cluster_strings;
2178				croak "Too many clusters requested" if $K > 12;
2179				croak "Mismatch between the number of values for priors and the number " .
2180				"of clusters" unless $K == @priors;
2181				my @point_labels = ('a'..'z');
2182				print "Prior probabilities recorded from param file: @priors\n";
2183				print "Number of Gaussians used for the synthetic data: $K\n";
2184				my @means;
2185				my @covariances;
2186				my $data_dimension;
2187				foreach my $i (0..$K-1) {
2188				my @num_strings = split / /, $cluster_strings[$i];
2189				my @cluster_mean = map {/$_num_regex/;$_} split / /, $num_strings[0];
2190				$data_dimension = @cluster_mean;
2191				push @means, \@cluster_mean;
2192				my @covariance_nums = map {/$_num_regex/;$_} split / /, $num_strings[1];
2193				croak "dimensionality error" if @covariance_nums !=
2194				($data_dimension ** 2);
2195				my $cluster_covariance;
2196				foreach my $j (0..$data_dimension-1) {
2197				foreach my $k (0..$data_dimension-1) {
2198				$cluster_covariance->[$j]->[$k] =
2199				$covariance_nums[$j*$data_dimension + $k];
2200				}
2201				}
2202				push @covariances, $cluster_covariance;
2203				}
2204				random_seed_from_phrase( 'hellomellojello' );
2205				my @data_dump;
2206				foreach my $i (0..$K-1) {
2207				my @m = @{shift @means};
2208				my @covar = @{shift @covariances};
2209				my @new_data = Math::Random::random_multivariate_normal(
2210				int($N * $priors[$i]), @m, @covar );
2211				my $p = 0;
2212				my $label = $point_labels[$i];
2213				@new_data = map {unshift @$_, $label.$i; $i++; $_} @new_data;
2214				push @data_dump, @new_data;
2215				}
2216				fisher_yates_shuffle( \@data_dump );
2217				open OUTPUT, ">$output_file";
2218				print OUTPUT "\#Data generated from the parameter file: $input_parameter_file\n"
2219				if $input_parameter_file;
2220				print OUTPUT "\#Total number of data points in this file: $N\n";
2221				print OUTPUT "\#Prior class probabilities for this data: @priors\n";
2222				foreach my $ele (@data_dump) {
2223				foreach my $coord ( @$ele ) {
2224				print OUTPUT "$coord ";
2225				}
2226				print OUTPUT "\n";
2227				}
2228				print "Data written out to file $output_file\n";
2229				close OUTPUT;
2230				}
2231
2232				sub add_point_coords {
2233				my $self = shift;
2234				my @arr_of_ids = @{shift @_}; # array of data element names
2235				my @result;
2236				my $data_dimensionality = $self->{_data_dimensions};
2237				foreach my $i (0..$data_dimensionality-1) {
2238				$result[$i] = 0.0;
2239				}
2240				foreach my $id (@arr_of_ids) {
2241				my $ele = $self->{_data}->{$id};
2242				my $i = 0;
2243				foreach my $component (@$ele) {
2244				$result[$i] += $component;
2245				$i++;
2246				}
2247				}
2248				return \@result;
2249				}
2250
2251				###################### Support Routines ########################
2252
2253				# computer the outer product of two column vectors
2254				sub outer_product {
2255				my $vec1 = shift;
2256				my $vec2 = shift;
2257				my ($nrows1, $ncols1) = ($vec1->rows(), $vec1->cols());
2258				my ($nrows2, $ncols2) = ($vec2->rows(), $vec2->cols());
2259				die "Outer product operation called with non-matching vectors"
2260				unless $ncols1 == 1 && $ncols2 == 1 && $nrows1 == $nrows2;
2261				my @vec_arr1 = $vec1->as_list();
2262				my @vec_arr2 = $vec2->as_list();
2263				my $outer_product = Math::GSL::Matrix->new($nrows1, $nrows2);
2264				foreach my $index (0..$nrows1-1) {
2265				my @new_row = map $vec_arr1[$index] * $_, @vec_arr2;
2266				$outer_product->set_row($index, \@new_row);
2267				}
2268				return $outer_product;
2269				}
2270
2271				sub get_index_at_value {
2272				my $value = shift;
2273				my @array = @{shift @_};
2274				foreach my $i (0..@array-1) {
2275				return $i if $value == $array[$i];
2276				}
2277				}
2278
2279				# This routine is really not necessary in light of the new `~~' operator in Perl.
2280				# Will use the new operator in the next version.
2281				sub vector_equal {
2282				my $vec1 = shift;
2283				my $vec2 = shift;
2284				die "wrong data types for distance calculation" if @$vec1 != @$vec2;
2285				foreach my $i (0..@$vec1-1){
2286				return 0 if $vec1->[$i] != $vec2->[$i];
2287				}
2288				return 1;
2289				}
2290
2291				sub compare_array_floats {
2292				my $vec1 = shift;
2293				my $vec2 = shift;
2294				foreach my $i (0..@$vec1-1){
2295				return 0 if abs($vec1->[$i] - $vec2->[$i]) > 0.00001;
2296				}
2297				return 1;
2298				}
2299
2300				# Returns the minimum value and its positional index in an array
2301				sub minimum {
2302				my $arr = shift;
2303				my $min;
2304				my $index;
2305				foreach my $i (0..@{$arr}-1) {
2306				if ( (!defined $min) \|\| ($arr->[$i] < $min) ) {
2307				$index = $i;
2308				$min = $arr->[$i];
2309				}
2310				}
2311				return ($min, $index);
2312				}
2313
2314				sub minmax {
2315				my $arr = shift;
2316				my $min;
2317				my $max;
2318				foreach my $i (0..@{$arr}-1) {
2319				if ( (!defined $min) && (!defined $max) ) {
2320				$min = $arr->[$i];
2321				$max = $arr->[$i];
2322				} elsif ( $arr->[$i] < $min ) {
2323				$min = $arr->[$i];
2324				} elsif ( $arr->[$i] > $max ) {
2325				$max = $arr->[$i];
2326				}
2327				}
2328				return ($min, $max);
2329				}
2330
2331				# Meant only for constructing a deep copy of an array of
2332				# arrays:
2333				sub deep_copy_AoA {
2334				my $ref_in = shift;
2335				my $ref_out;
2336				foreach my $i (0..@{$ref_in}-1) {
2337				foreach my $j (0..@{$ref_in->[$i]}-1) {
2338				$ref_out->[$i]->[$j] = $ref_in->[$i]->[$j];
2339				}
2340				}
2341				return $ref_out;
2342				}
2343
2344				# Meant only for constructing a deep copy of an array of arrays for the case when
2345				# some elements of the top-level array may be undefined:
2346				sub deep_copy_AoA_with_nulls {
2347				my $ref_in = shift;
2348				my $ref_out;
2349				foreach my $i (0..@{$ref_in}-1) {
2350				if ( !defined $ref_in->[$i] ) {
2351				$ref_out->[$i] = undef;
2352				next;
2353				}
2354				foreach my $j (0..@{$ref_in->[$i]}-1) {
2355				$ref_out->[$i]->[$j] = $ref_in->[$i]->[$j];
2356				}
2357				}
2358				return $ref_out;
2359				}
2360
2361				# Meant only for constructing a deep copy of a hash in which each value is an
2362				# anonymous array of numbers:
2363				sub deep_copy_hash {
2364				my $ref_in = shift;
2365				my $ref_out;
2366				while ( my ($key, $value) = each( %$ref_in ) ) {
2367				$ref_out->{$key} = deep_copy_array( $value );
2368				}
2369				return $ref_out;
2370				}
2371
2372				# Meant only for an array of numbers:
2373				sub deep_copy_array {
2374				my $ref_in = shift;
2375				my $ref_out;
2376				foreach my $i (0..@{$ref_in}-1) {
2377				$ref_out->[$i] = $ref_in->[$i];
2378				}
2379				return $ref_out;
2380				}
2381
2382				# from perl docs:
2383				sub fisher_yates_shuffle {
2384				my $arr = shift;
2385				my $i = @$arr;
2386				while (--$i) {
2387				my $j = int rand( $i + 1 );
2388				@$arr[$i, $j] = @$arr[$j, $i];
2389				}
2390				}
2391
2392				sub mean_and_variance {
2393				my @data = @{shift @_};
2394				my ($mean, $variance);
2395				foreach my $i (1..@data) {
2396				if ($i == 1) {
2397				$mean = $data[0];
2398				$variance = 0;
2399				} else {
2400				# data[$i-1] because of zero-based indexing of vector
2401				$mean = ( (($i-1)/$i) * $mean ) + $data[$i-1] / $i;
2402				$variance = ( (($i-1)/$i) * $variance )
2403				+ ($data[$i-1]-$mean)**2 / ($i-1);
2404				}
2405				}
2406				return ($mean, $variance);
2407				}
2408
2409				sub check_for_illegal_params {
2410				my @params = @_;
2411				my @legal_params = qw / datafile
2412				mask
2413				K
2414				terminal_output
2415				max_em_iterations
2416				seeding
2417				class_priors
2418				seed_tags
2419				debug
2420				/;
2421				my $found_match_flag;
2422				foreach my $param (@params) {
2423				foreach my $legal (@legal_params) {
2424				$found_match_flag = 0;
2425				if ($param eq $legal) {
2426				$found_match_flag = 1;
2427				last;
2428				}
2429				}
2430				last if $found_match_flag == 0;
2431				}
2432				return $found_match_flag;
2433				}
2434
2435				sub get_value_index_hash {
2436				my $arr = shift;
2437				my %hash;
2438				foreach my $index (0..@$arr-1) {
2439				$hash{$arr->[$index]} = $index if $arr->[$index] > 0;
2440				}
2441				return \%hash;
2442				}
2443
2444				sub non_maximum_supression {
2445				my $arr = shift;
2446				my @output = (0) x @$arr;
2447				my @final_output = (0) x @$arr;
2448				my %hash;
2449				my @array_of_runs = ([$arr->[0]]);
2450				foreach my $index (1..@$arr-1) {
2451				if ($arr->[$index] == $arr->[$index-1]) {
2452				push @{$array_of_runs[-1]}, $arr->[$index];
2453				} else {
2454				push @array_of_runs, [$arr->[$index]];
2455				}
2456				}
2457				my $runstart_index = 0;
2458				foreach my $run_index (1..@array_of_runs-2) {
2459				$runstart_index += @{$array_of_runs[$run_index-1]};
2460				if ($array_of_runs[$run_index]->[0] >
2461				$array_of_runs[$run_index-1]->[0] &&
2462				$array_of_runs[$run_index]->[0] >
2463				$array_of_runs[$run_index+1]->[0]) {
2464				my $run_center = @{$array_of_runs[$run_index]} / 2;
2465				my $assignment_index = $runstart_index + $run_center;
2466				$output[$assignment_index] = $arr->[$assignment_index];
2467				}
2468				}
2469				if ($array_of_runs[-1]->[0] > $array_of_runs[-2]->[0]) {
2470				$runstart_index += @{$array_of_runs[-2]};
2471				my $run_center = @{$array_of_runs[-1]} / 2;
2472				my $assignment_index = $runstart_index + $run_center;
2473				$output[$assignment_index] = $arr->[$assignment_index];
2474				}
2475				if ($array_of_runs[0]->[0] > $array_of_runs[1]->[0]) {
2476				my $run_center = @{$array_of_runs[0]} / 2;
2477				$output[$run_center] = $arr->[$run_center];
2478				}
2479				return \@output;
2480				}
2481
2482				sub display_matrix {
2483				my $message = shift;
2484				my $matrix = shift;
2485				if (!defined blessed($matrix)) {
2486				print "display_matrix called on a scalar value: $matrix\n";
2487				return;
2488				}
2489				my $nrows = $matrix->rows();
2490				my $ncols = $matrix->cols();
2491				print "$message ($nrows rows and $ncols columns)\n";
2492				foreach my $i (0..$nrows-1) {
2493				my $row = $matrix->row($i);
2494				my @row_as_list = $row->as_list;
2495				print "@row_as_list\n";
2496				}
2497				print "\n";
2498				}
2499
2500				sub transpose {
2501				my $matrix = shift;
2502				my $num_rows = $matrix->rows();
2503				my $num_cols = $matrix->cols();
2504				my $transpose = Math::GSL::Matrix->new($num_cols, $num_rows);
2505				foreach my $i (0..$num_rows-1) {
2506				my @row = $matrix->row($i)->as_list;
2507				$transpose->set_col($i, \@row );
2508				}
2509				return $transpose;
2510				}
2511
2512				sub vector_multiply {
2513				my $vec1 = shift;
2514				my $vec2 = shift;
2515				die "vec_multiply called with two vectors of different sizes"
2516				unless @$vec1 == @$vec2;
2517				my $result = 0;
2518				foreach my $i (0..@$vec1-1) {
2519				$result += $vec1->[$i] * $vec2->[$i];
2520				}
2521				return $result;
2522				}
2523
2524				sub vector_2_vector_multiply {
2525				my $vec1 = shift;
2526				my $vec2 = shift;
2527				die "vec_multiply called with two vectors of different sizes"
2528				unless @$vec1 == @$vec2;
2529				my @result_vec;
2530				foreach my $i (0..@$vec1-1) {
2531				$result_vec[$i] = $vec1->[$i] * $vec2->[$i];
2532				}
2533				return \@result_vec;
2534				}
2535
2536				sub matrix_multiply {
2537				my $matrix1 = shift;
2538				my $matrix2 = shift;
2539				my ($nrows1, $ncols1) = ($matrix1->rows(), $matrix1->cols());
2540				my ($nrows2, $ncols2) = ($matrix2->rows(), $matrix2->cols());
2541				die "matrix multiplication called with non-matching matrix arguments"
2542				# unless $ncols1 == $nrows2;
2543				unless $nrows1 == $ncols2 && $ncols1 == $nrows2;
2544				if ($nrows1 == 1) {
2545				my @row = $matrix1->row(0)->as_list;
2546				my @col = $matrix2->col(0)->as_list;
2547				my $result;
2548				foreach my $j (0..$ncols1-1) {
2549				$result += $row[$j] * $col[$j];
2550				}
2551				return $result;
2552				}
2553				my $product = Math::GSL::Matrix->new($nrows1, $nrows1);
2554				foreach my $i (0..$nrows1-1) {
2555				my $row = $matrix1->row($i);
2556				my @product_row;
2557				foreach my $j (0..$ncols2-1) {
2558				my $col = $matrix2->col($j);
2559				my $row_times_col = matrix_multiply($row, $col);
2560				push @product_row, $row_times_col;
2561				}
2562				$product->set_row($i, \@product_row);
2563				}
2564				return $product;
2565				}
2566
2567				sub vector_matrix_multiply {
2568				my $matrix1 = shift;
2569				my $matrix2 = shift;
2570				my ($nrows1, $ncols1) = ($matrix1->rows, $matrix1->cols);
2571				my ($nrows2, $ncols2) = ($matrix2->rows, $matrix2->cols);
2572				die "matrix multiplication called with non-matching matrix arguments"
2573				unless $nrows1 == 1 && $ncols1 == $nrows2;
2574				if ($ncols2 == 1) {
2575				my @row = $matrix1->row(0)->as_list;
2576				my @col = $matrix2->col(0)->as_list;
2577				my $result;
2578				foreach my $j (0..$ncols1-1) {
2579				$result += $row[$j] * $col[$j];
2580				}
2581				return $result;
2582				}
2583				my $product = Math::GSL::Matrix->new(1, $ncols2);
2584				my $row = $matrix1->row(0);
2585				my @product_row;
2586				foreach my $j (0..$ncols2-1) {
2587				my $col = $matrix2->col($j);
2588				my $row_times_col = vector_matrix_multiply($row, $col);
2589				push @product_row, $row_times_col;
2590				}
2591				$product->set_row(0, \@product_row);
2592				return $product;
2593				}
2594
2595				sub matrix_vector_multiply {
2596				my $matrix1 = shift;
2597				my $matrix2 = shift;
2598				my ($nrows1, $ncols1) = ($matrix1->rows(), $matrix1->cols());
2599				my ($nrows2, $ncols2) = ($matrix2->rows(), $matrix2->cols());
2600				die "matrix multiplication called with non-matching matrix arguments"
2601				unless $ncols1 == $nrows2 && $ncols2 == 1;
2602				if ($nrows1 == 1) {
2603				my @row = $matrix1->row(0)->as_list;
2604				my @col = $matrix2->col(0)->as_list;
2605				my $result;
2606				foreach my $j (0..$ncols1-1) {
2607				$result += $row[$j] * $col[$j];
2608				}
2609				return $result;
2610				}
2611				my $product = Math::GSL::Matrix->new($nrows1, 1);
2612				my $col = $matrix2->col(0);
2613				my @product_col;
2614				foreach my $i (0..$nrows1-1) {
2615				my $row = $matrix1->row($i);
2616				my $row_times_col = matrix_vector_multiply($row, $col);
2617				push @product_col, $row_times_col;
2618				}
2619				$product->set_col(0, \@product_col);
2620				return $product;
2621				}
2622
2623				sub matrix_trace {
2624				my $matrix = shift;
2625				my ($nrows, $ncols) = ($matrix->rows(), $matrix->cols());
2626				die "trace can only be calculated for a square matrix"
2627				unless $ncols == $nrows;
2628				my @elements = $matrix->as_list;
2629				my $trace = 0;
2630				foreach my $i (0..$nrows-1) {
2631				$trace += $elements[$i + $i * $ncols];
2632				}
2633				return $trace;
2634				}
2635
2636				1;
2637
2638				=pod
2639
2640				=head1 NAME
2641
2642				Algorithm::ExpectationMaximization -- A Perl module for clustering numerical
2643				multi-dimensional data with the Expectation-Maximization algorithm.
2644
2645				=head1 SYNOPSIS
2646
2647				use Algorithm::ExpectationMaximization;
2648
2649				# First name the data file:
2650
2651				my $datafile = "mydatafile1.dat";
2652
2653				# Next, set the mask to indicate which columns of the datafile to use for
2654				# clustering and which column contains a symbolic ID for each data record. For
2655				# example, if the symbolic name is in the first column, you want the second column
2656				# to be ignored, and you want the next three columns to be used for 3D clustering:
2657
2658				my $mask = "N0111";
2659
2660				# Now construct an instance of the clusterer. The parameter `K' controls the
2661				# number of clusters. Here is an example call to the constructor for instance
2662				# creation:
2663
2664				my $clusterer = Algorithm::ExpectationMaximization->new(
2665				datafile => $datafile,
2666				mask => $mask,
2667				K => 3,
2668				max_em_iterations => 300,
2669				seeding => 'random',
2670				terminal_output => 1,
2671				);
2672
2673				# Note the choice for `seeding'. The choice `random' means that the clusterer will
2674				# randomly select `K' data points to serve as initial cluster centers. Other
2675				# possible choices for the constructor parameter `seeding' are `kmeans' and
2676				# `manual'. With the `kmeans' option for `seeding', the output of a K-means
2677				# clusterer is used for the cluster seeds and the initial cluster covariances. If
2678				# you use the `manual' option for seeding, you must also specify the data elements
2679				# to use for seeding the clusters.
2680
2681				# Here is an example of a call to the constructor when we choose the `manual'
2682				# option for seeding the clusters and for specifying the data elements for
2683				# seeding. The data elements are specified by their tag names. In this case,
2684				# these names are `a26', `b53', and `c49':
2685
2686				my $clusterer = Algorithm::ExpectationMaximization->new(
2687				datafile => $datafile,
2688				mask => $mask,
2689				class_priors => [0.6, 0.2, 0.2],
2690				K => 3,
2691				max_em_iterations => 300,
2692				seeding => 'manual',
2693				seed_tags => ['a26', 'b53', 'c49'],
2694				terminal_output => 1,
2695				);
2696
2697				# This example call to the constructor also illustrates how you can inject class
2698				# priors into the clustering process. The class priors are the prior probabilities
2699				# of the class distributions in your dataset. As explained later, injecting class
2700				# priors in the manner shown above makes statistical sense only for the case of
2701				# manual seeding. When you do inject class priors, the order in which the priors
2702				# are expressed must correspond to the manually specified seeds for the clusters.
2703
2704				# After the invocation of the constructor, the following calls are mandatory
2705				# for reasons that should be obvious from the names of the methods:
2706
2707				$clusterer->read_data_from_file();
2708				srand(time);
2709				$clusterer->seed_the_clusters();
2710				$clusterer->EM();
2711				$clusterer->run_bayes_classifier();
2712				my $clusters = $clusterer->return_disjoint_clusters();
2713
2714				# where the call to `EM()' is the invocation of the expectation-maximization
2715				# algorithm. The call to `srand(time)' is to seed the pseudo random number
2716				# generator afresh for each run of the cluster seeding procedure. If you want to
2717				# see repeatable results from one run to another of the algorithm with random
2718				# seeding, you would obviously not invoke `srand(time)'.
2719
2720				# The call `run_bayes_classifier()' shown above carries out a disjoint clustering
2721				# of all the data points using the naive Bayes' classifier. And the call
2722				# `return_disjoint_clusters()' returns the clusters thus formed to you. Once you
2723				# have obtained access to the clusters in this manner, you can display them in
2724				# your terminal window by
2725
2726				foreach my $index (0..@$clusters-1) {
2727				print "Cluster $index (Naive Bayes): @{$clusters->[$index]}\n\n"
2728				}
2729
2730				# If you would like to also see the clusters purely on the basis of the posterior
2731				# class probabilities exceeding a threshold, call
2732
2733				my $theta1 = 0.2;
2734				my $posterior_prob_clusters =
2735				$clusterer->return_clusters_with_posterior_probs_above_threshold($theta1);
2736
2737				# where you can obviously set the threshold $theta1 to any value you wish. Note
2738				# that now you may end up with clusters that overlap. You can display them in
2739				# your terminal window in the same manner as shown above for the naive Bayes'
2740				# clusters.
2741
2742				# You can write the naive Bayes' clusters out to files, one cluster per file, by
2743				# calling
2744
2745				$clusterer->write_naive_bayes_clusters_to_files();
2746
2747				# The clusters are placed in files with names like
2748
2749				naive_bayes_cluster1.dat
2750				naive_bayes_cluster2.dat
2751				...
2752
2753				# In the same manner, you can write out the posterior probability based possibly
2754				# overlapping clusters to files by calling:
2755
2756				$clusterer->write_posterior_prob_clusters_above_threshold_to_files($theta1);
2757
2758				# where the threshold $theta1 sets the probability threshold for deciding which
2759				# data elements to place in a cluster. These clusters are placed in files with
2760				# names like
2761
2762				posterior_prob_cluster1.dat
2763				posterior_prob_cluster2.dat
2764				...
2765
2766				# CLUSTER VISUALIZATION:
2767
2768				# You must first set the mask for cluster visualization. This mask tells the
2769				# module which 2D or 3D subspace of the original data space you wish to visualize
2770				# the clusters in:
2771
2772				my $visualization_mask = "111";
2773				$clusterer->visualize_clusters($visualization_mask);
2774				$clusterer->visualize_distributions($visualization_mask);
2775				$clusterer->plot_hardcopy_clusters($visualization_mask);
2776				$clusterer->plot_hardcopy_distributions($visualization_mask);
2777
2778				# where the last two invocations are for writing out the PNG plots of the
2779				# visualization displays to disk files. The PNG image of the posterior
2780				# probability distributions is written out to a file named posterior_prob_plot.png
2781				# and the PNG image of the disjoint clusters to a file called cluster_plot.png.
2782
2783				# SYNTHETIC DATA GENERATION:
2784
2785				# The module has been provided with a class method for generating multivariate
2786				# data for experimenting with the EM algorithm. The data generation is controlled
2787				# by the contents of a parameter file that is supplied as an argument to the data
2788				# generator method. The priors, the means, and the covariance matrices in the
2789				# parameter file must be according to the syntax shown in the `param1.txt' file in
2790				# the `examples' directory. It is best to edit a copy of this file for your
2791				# synthetic data generation needs.
2792
2793				my $parameter_file = "param1.txt";
2794				my $out_datafile = "mydatafile1.dat";
2795				Algorithm::ExpectationMaximization->cluster_data_generator(
2796				input_parameter_file => $parameter_file,
2797				output_datafile => $out_datafile,
2798				total_number_of_data_points => $N );
2799
2800				# where the value of $N is the total number of data points you would like to see
2801				# generated for all of the Gaussians. How this total number is divided up amongst
2802				# the Gaussians is decided by the prior probabilities for the Gaussian components
2803				# as declared in input parameter file. The synthetic data may be visualized in a
2804				# terminal window and the visualization written out as a PNG image to a diskfile
2805				# by
2806
2807				my $data_visualization_mask = "11";
2808				$clusterer->visualize_data($data_visualization_mask);
2809				$clusterer->plot_hardcopy_data($data_visualization_mask);
2810
2811
2812				=head1 CHANGES
2813
2814				Version 1.21 incorporates minor code clean up. Overall, the module implementation
2815				remains unchanged.
2816
2817				Version 1.2 allows the module to also be used for 1-D data. The visualization code
2818				for 1-D shows the clusters through their histograms.
2819
2820				Version 1.1 incorporates much cleanup of the documentation associated with the
2821				module. Both the top-level module documentation, especially the Description part,
2822				and the comments embedded in the code were revised for better utilization of the
2823				module. The basic implementation code remains unchanged.
2824
2825
2826				=head1 DESCRIPTION
2827
2828				B is a I module for the
2829				Expectation-Maximization (EM) method of clustering numerical data that lends itself
2830				to modeling as a Gaussian mixture. Since the module is entirely in Perl (in the
2831				sense that it is not a Perl wrapper around a C library that actually does the
2832				clustering), the code in the module can easily be modified to experiment with several
2833				aspects of EM.
2834
2835				Gaussian Mixture Modeling (GMM) is based on the assumption that the data consists of
2836				C Gaussian components, each characterized by its own mean vector and its own
2837				covariance matrix. Obviously, given observed data for clustering, we do not know
2838				which of the C Gaussian components was responsible for any of the data elements.
2839				GMM also associates a prior probability with each Gaussian component. In general,
2840				these priors will also be unknown. So the problem of clustering consists of
2841				estimating the posterior class probability at each data element and also estimating
2842				the class priors. Once these posterior class probabilities and the priors are
2843				estimated with EM, we can use the naive Bayes' classifier to partition the data into
2844				disjoint clusters. Or, for "soft" clustering, we can find all the data elements that
2845				belong to a Gaussian component on the basis of the posterior class probabilities at
2846				the data elements exceeding a prescribed threshold.
2847
2848				If you do not mind the fact that it is possible for the EM algorithm to occasionally
2849				get stuck in a local maximum and to, therefore, produce a wrong answer even when you
2850				know the data to be perfectly multimodal Gaussian, EM is probably the most magical
2851				approach to clustering multidimensional data. Consider the case of clustering
2852				three-dimensional data. Each Gaussian cluster in 3D space is characterized by the
2853				following 10 variables: the 6 unique elements of the C<3x3> covariance matrix (which
2854				must be symmetric positive-definite), the 3 unique elements of the mean, and the
2855				prior associated with the Gaussian. Now let's say you expect to see six Gaussians in
2856				your data. What that means is that you would want the values for 59 variables
2857				(remember the unit-summation constraint on the class priors which reduces the overall
2858				number of variables by one) to be estimated by the algorithm that seeks to discover
2859				the clusters in your data. What's amazing is that, despite the large number of
2860				variables that must be optimized simultaneously, the EM algorithm will very likely
2861				give you a good approximation to the right answer.
2862
2863				At its core, EM depends on the notion of unobserved data and the averaging of the
2864				log-likelihood of the data actually observed over all admissible probabilities for
2865				the unobserved data. But what is unobserved data? While in some cases where EM is
2866				used, the unobserved data is literally the missing data, in others, it is something
2867				that cannot be seen directly but that nonetheless is relevant to the data actually
2868				observed. For the case of clustering multidimensional numerical data that can be
2869				modeled as a Gaussian mixture, it turns out that the best way to think of the
2870				unobserved data is in terms of a sequence of random variables, one for each observed
2871				data point, whose values dictate the selection of the Gaussian for that data point.
2872				This point is explained in great detail in my on-line tutorial at
2873				L.
2874
2875				The EM algorithm in our context reduces to an iterative invocation of the following
2876				steps: (1) Given the current guess for the means and the covariances of the different
2877				Gaussians in our mixture model, use Bayes' Rule to update the posterior class
2878				probabilities at each of the data points; (2) Using the updated posterior class
2879				probabilities, first update the class priors; (3) Using the updated class priors,
2880				update the class means and the class covariances; and go back to Step (1). Ideally,
2881				the iterations should terminate when the expected log-likelihood of the observed data
2882				has reached a maximum and does not change with any further iterations. The stopping
2883				rule used in this module is the detection of no change over three consecutive
2884				iterations in the values calculated for the priors.
2885
2886				This module provides three different choices for seeding the clusters: (1) random,
2887				(2) kmeans, and (3) manual. When random seeding is chosen, the algorithm randomly
2888				selects C data elements as cluster seeds. That is, the data vectors associated
2889				with these seeds are treated as initial guesses for the means of the Gaussian
2890				distributions. The covariances are then set to the values calculated from the entire
2891				dataset with respect to the means corresponding to the seeds. With kmeans seeding, on
2892				the other hand, the means and the covariances are set to whatever values are returned
2893				by the kmeans algorithm. And, when seeding is set to manual, you are allowed to
2894				choose C data elements --- by specifying their tag names --- for the seeds. The
2895				rest of the EM initialization for the manual mode is the same as for the random mode.
2896				The algorithm allows for the initial priors to be specified for the manual mode of
2897				seeding.
2898
2899				Much of code for the kmeans based seeding of EM was drawn from the
2900				C module by me. The code from that module used here corresponds to
2901				the case when the C option in the C module is set
2902				to C. The C option for KMeans consists of subjecting the data to a
2903				principal components analysis (PCA) to discover the direction of maximum variance in
2904				the data space. The data points are then projected on to this direction and a
2905				histogram constructed from the projections. Centers of the C largest peaks in
2906				this smoothed histogram are used to seed the KMeans based clusterer. As you'd
2907				expect, the output of the KMeans used to seed the EM algorithm.
2908
2909				This module uses two different criteria to measure the quality of the clustering
2910				achieved. The first is the Minimum Description Length (MDL) proposed originally by
2911				Rissanen (J. Rissanen: "Modeling by Shortest Data Description," Automatica, 1978, and
2912				"A Universal Prior for Integers and Estimation by Minimum Description Length," Annals
2913				of Statistics, 1983.) The MDL criterion is a difference of a log-likelihood term for
2914				all of the observed data and a model-complexity penalty term. In general, both the
2915				log-likelihood and the model-complexity terms increase as the number of clusters
2916				increases. The form of the MDL criterion in this module uses for the penalty term
2917				the Bayesian Information Criterion (BIC) of G. Schwartz, "Estimating the Dimensions
2918				of a Model," The Annals of Statistics, 1978. In general, the smaller the value of
2919				MDL quality measure, the better the clustering of the data.
2920
2921				For our second measure of clustering quality, we use `trace( SW^-1 . SB)' where SW is
2922				the within-class scatter matrix, more commonly denoted S_w, and SB the between-class
2923				scatter matrix, more commonly denoted S_b (the underscore means subscript). This
2924				measure can be thought of as the normalized average distance between the clusters,
2925				the normalization being provided by average cluster covariance SW^-1. Therefore, the
2926				larger the value of this quality measure, the better the separation between the
2927				clusters. Since this measure has its roots in the Fisher linear discriminant
2928				function, we incorporate the word C in the name of the quality measure.
2929				I When the
2930				clusters exhibit significant overlap, the numbers produced by this quality measure
2931				tend to be generally meaningless.
2932
2933				=head1 METHODS
2934
2935				The module provides the following methods for EM based
2936				clustering, for cluster visualization, for data
2937				visualization, and for the generation of data for testing a
2938				clustering algorithm:
2939
2940				=over
2941
2942				=item B
2943
2944				A call to C constructs a new instance of the
2945				C class. A typical form
2946				of this call when you want to use random option for seeding
2947				the algorithm is:
2948
2949				my $clusterer = Algorithm::ExpectationMaximization->new(
2950				datafile => $datafile,
2951				mask => $mask,
2952				K => 3,
2953				max_em_iterations => 300,
2954				seeding => 'random',
2955				terminal_output => 1,
2956				);
2957
2958				where C is the expected number of clusters and
2959				C the maximum number of EM iterations
2960				that you want to allow until convergence is achieved.
2961				Depending on your dataset and on the choice of the initial
2962				seeds, the actual number of iterations used could be as few
2963				as 10 and as many as reaching 300. The output produced by
2964				the algorithm shows the actual number of iterations used to
2965				arrive at convergence.
2966
2967				The data file supplied through the C option is
2968				expected to contain entries in the following format
2969
2970				c20 0 10.7087017086940 9.63528386251712 10.9512155258108 ...
2971				c7 0 12.8025925026787 10.6126270065785 10.5228482095349 ...
2972				b9 0 7.60118206283120 5.05889245193079 5.82841781759102 ...
2973				....
2974				....
2975
2976				where the first column contains the symbolic ID tag for each
2977				data record and the rest of the columns the numerical
2978				information. As to which columns are actually used for
2979				clustering is decided by the string value of the mask. For
2980				example, if we wanted to cluster on the basis of the entries
2981				in just the 3rd, the 4th, and the 5th columns above, the
2982				mask value would be C where the character C
2983				indicates that the ID tag is in the first column, the
2984				character C<0> that the second column is to be ignored, and
2985				the C<1>'s that follow that the 3rd, the 4th, and the 5th
2986				columns are to be used for clustering.
2987
2988				If instead of random seeding, you wish to use the kmeans
2989				based seeding, just replace the option C supplied
2990				for C by C. You can also do manual seeding
2991				by designating a specified set of data elements to serve as
2992				cluster seeds. The call to the constructor in this case
2993				looks like
2994
2995				my $clusterer = Algorithm::ExpectationMaximization->new(
2996				datafile => $datafile,
2997				mask => $mask,
2998				K => 3,
2999				max_em_iterations => 300,
3000				seeding => 'manual',
3001				seed_tags => ['a26', 'b53', 'c49'],
3002				terminal_output => 1,
3003				);
3004
3005				where the option C is set to an anonymous array
3006				of symbolic names associated with the data elements.
3007
3008				If you know the class priors, you can supply them through an
3009				additional option to the constructor that looks like
3010
3011				class_priors => [0.6, 0.2, 0.2],
3012
3013				for the case of C equal to 3. B
3014				be a useful thing to do only for the case of manual
3015				seeding.> If you go for manual seeding, the order in which
3016				the priors are expressed should correspond to the order of
3017				the manually chosen tags supplied through the C
3018				option.
3019
3020				Note that the parameter C is boolean; when
3021				not supplied in the call to C it defaults to 0. When
3022				set, this parameter displays useful information in the
3023				window of the terminal screen in which you invoke the
3024				algorithm.
3025
3026				=item B
3027
3028				$clusterer->read_data_from_file()
3029
3030				This is a required call after the constructor is invoked. As
3031				you would expect, this call reads in the data for
3032				clustering.
3033
3034				=item B
3035
3036				$clusterer->seed_the_clusters();
3037
3038				This is also a required call. It processes the option you
3039				supplied for C in the constructor call to choose
3040				the data elements for seeding the C clusters.
3041
3042				=item B
3043
3044				$clusterer->EM();
3045
3046				This is the workhorse of the module, as you would expect.
3047				The means, the covariances, and the priors estimated by this
3048				method are stored in instance variables that are subsequently
3049				accessed by other methods for the purpose of displaying the
3050				clusters, the probability distributions, etc.
3051
3052				=item B
3053
3054				$clusterer->run_bayes_classifier();
3055
3056				Using the posterior probability distributions estimated by
3057				the C method, this method partitions the data into the
3058				C disjoint clusters using the naive Bayes' classifier.
3059
3060				=item B
3061
3062				my $clusters = $clusterer->return_disjoint_clusters();
3063
3064				This allows you to access the clusters obtained with the
3065				application of the naive Bayes' classifier in your own
3066				scripts. If, say, you wanted to see the data records placed
3067				in each cluster, you could subsequently invoke the following
3068				loop in your own script:
3069
3070				foreach my $index (0..@$clusters-1) {
3071				print "Cluster $index (Naive Bayes): @{$clusters->[$index]}\n\n"
3072				}
3073
3074				where C<$clusters> holds the array reference returned by the
3075				call to C.
3076
3077				=item B
3078
3079				$clusterer->write_naive_bayes_clusters_to_files();
3080
3081				This method writes the clusters obtained by applying the
3082				naive Bayes' classifier to disk files, one cluster per
3083				file. What is written out to each file consists of the
3084				symbolic names of the data records that belong to the
3085				cluster corresponding to that file. The clusters are placed
3086				in files with names like
3087
3088				naive_bayes_cluster1.dat
3089				naive_bayes_cluster2.dat
3090				...
3091
3092				=item B
3093
3094				my $theta1 = 0.2;
3095				my $posterior_prob_clusters =
3096				$clusterer->return_clusters_with_posterior_probs_above_threshold($theta1);
3097
3098				This method returns a reference to an array of C
3099				anonymous arrays, each consisting of the symbolic names for
3100				the data records where the posterior class probability
3101				exceeds the threshold as specified by C<$theta1>.
3102				Subsequently, you can access each of these
3103				posterior-probability based clusters through a loop
3104				construct such as
3105
3106				foreach my $index (0..@$posterior_prob_clusters-1) {
3107				print "Cluster $index (based on posterior probs exceeding $theta1): " .
3108				"@{$posterior_prob_clusters->[$index]}\n\n"
3109				}
3110
3111				=item B
3112
3113				$clusterer->write_posterior_prob_clusters_above_threshold_to_files($theta1);
3114
3115				This call writes out the posterior-probability based soft
3116				clusters to disk files. As in the previous method, the
3117				threshold C<$theta1> sets the probability threshold for
3118				deciding which data elements belong to a cluster. These
3119				clusters are placed in files with names like
3120
3121				posterior_prob_cluster1.dat
3122				posterior_prob_cluster2.dat
3123				...
3124
3125				=item B
3126
3127				my $theta2 = 0.00001;
3128				my $class_distributions =
3129				$clusterer->return_individual_class_distributions_above_given_threshold($theta2);
3130
3131				This is the method to call if you wish to see the individual
3132				Gaussians in your own script. The method returns a reference
3133				to an array of anonymous arrays, with each anonymous array
3134				representing data membership in each Gaussian. Only those
3135				data points are included in each Gaussian where the
3136				probability exceeds the threshold C<$theta2>. Note that the
3137				larger the covariance and the higher the data
3138				dimensionality, the smaller this threshold must be for you
3139				to see any of the data points in a Gaussian. After you have
3140				accessed the Gaussian mixture in this manner, you can
3141				display the data membership in each Gaussian through the
3142				following sort of a loop:
3143
3144				foreach my $index (0..@$class_distributions-1) {
3145				print "Gaussian Distribution $index (only shows data elements " .
3146				"whose probabilities exceed the threshold $theta2: " .
3147				"@{$class_distributions->[$index]}\n\n"
3148				}
3149
3150				=item B
3151
3152				my $visualization_mask = "11";
3153				$clusterer->visualize_clusters($visualization_mask);
3154
3155				The visualization mask here does not have to be identical to
3156				the one used for clustering, but must be a subset of that
3157				mask. This is convenient for visualizing the clusters in
3158				two- or three-dimensional subspaces of the original space.
3159				The subset is specified by placing `0's in the positions
3160				corresponding to the dimensions you do NOT want to see
3161				through visualization. Depending on the mask used, this
3162				method creates a 2D or a 3D scatter plot of the clusters
3163				obtained through the naive Bayes' classification rule.
3164
3165				=item B
3166
3167				$clusterer->visualize_distributions($visualization_mask);
3168
3169				This is the method to call if you want to visualize the soft
3170				clustering corresponding to the posterior class
3171				probabilities exceeding the threshold specified in the call
3172				to
3173				C.
3174				Again, depending on the visualization mask used, you will
3175				see either a 2D plot or a 3D scatter plot.
3176
3177				=item B
3178
3179				$clusterer->plot_hardcopy_clusters($visualization_mask);
3180
3181				This method create a PNG file from the C created
3182				display of the naive Bayes' clusters obtained from the data.
3183				The plotting functionality of C is accessed through
3184				the Perl wrappers provided by the C
3185				module.
3186
3187				=item B
3188
3189				$clusterer->plot_hardcopy_distributions($visualization_mask);
3190
3191				This method create a PNG file from the C created
3192				display of the clusters that correspond to the posterior
3193				class probabilities exceeding a specified threshold. The
3194				plotting functionality of C is accessed through the
3195				Perl wrappers provided by the C module.
3196
3197				=item B
3198
3199				$clusterer->display_fisher_quality_vs_iterations();
3200
3201				This method measures the quality of clustering by
3202				calculating the normalized average squared distance between
3203				the cluster centers, the normalization being provided by the
3204				average cluster covariance. See the Description for further
3205				details. In general, this measure is NOT useful for
3206				overlapping clusters.
3207
3208				=item B
3209
3210				$clusterer->display_mdl_quality_vs_iterations();
3211
3212				At the end of each iteration, this method measures the
3213				quality of clustering my calculating its MDL (Minimum
3214				Description Length). As stated earlier in Description, the
3215				MDL measure is a difference of a log-likelihood term for all
3216				of the observed data and a model-complexity penalty term.
3217				The smaller the value returned by this method, the better
3218				the clustering.
3219
3220				=item B
3221
3222				my $estimated_priors = $clusterer->return_estimated_priors();
3223				print "Estimated class priors: @$estimated_priors\n";
3224
3225				This method can be used to access the final values of the
3226				class priors as estimated by the EM algorithm.
3227
3228				=item B
3229
3230				Algorithm::ExpectationMaximization->cluster_data_generator(
3231				input_parameter_file => $parameter_file,
3232				output_datafile => $out_datafile,
3233				total_number_of_data_points => 300
3234				);
3235
3236				for generating multivariate data for clustering if you wish to play with synthetic
3237				data for experimenting with the EM algorithm. The input parameter file must specify
3238				the priors to be used for the Gaussians, their means, and their covariance matrices.
3239				The format of the information contained in the parameter file must be as shown in the
3240				file C provided in the C directory. It will be easiest for you
3241				to just edit a copy of this file for your data generation needs. In addition to the
3242				format of the parameter file, the main constraint you need to observe in specifying
3243				the parameters is that the dimensionality of the covariance matrices must correspond
3244				to the dimensionality of the mean vectors. The multivariate random numbers are
3245				generated by calling the C module. As you would expect, this module
3246				requires that the covariance matrices you specify in your parameter file be symmetric
3247				and positive definite. Should the covariances in your parameter file not obey this
3248				condition, the C module will let you know.
3249
3250				=item B
3251
3252				$clusterer->visualize_data($data_visualization_mask);
3253
3254				This is the method to call if you want to visualize the data
3255				you plan to cluster with the EM algorithm. You'd need to
3256				specify argument mask in a manner similar to the
3257				visualization of the clusters, as explained earlier.
3258
3259				=item B
3260
3261				$clusterer->plot_hardcopy_data($data_visualization_mask);
3262
3263				This method creates a PNG file that can be used to print out
3264				a hardcopy of the data in different 2D and 3D subspaces of
3265				the data space. The visualization mask is used to select the
3266				subspace for the PNG image.
3267
3268				=back
3269
3270				=head1 HOW THE CLUSTERS ARE OUTPUT
3271
3272				This module produces two different types of clusters: the "hard" clusters and the
3273				"soft" clusters. The hard clusters correspond to the naive Bayes' classification of
3274				the data points on the basis of the Gaussian distributions and the class priors
3275				estimated by the EM algorithm. Such clusters partition the data into disjoint
3276				subsets. On the other hand, the soft clusters correspond to the posterior class
3277				probabilities calculated by the EM algorithm. A data element belongs to a cluster if
3278				its posterior probability for that Gaussian exceeds a threshold.
3279
3280				After the EM algorithm has finished running, the hard clusters are created by
3281				invoking the method C on an instance of the module and then
3282				made user-accessible by calling C. These clusters may
3283				then be displayed in a terminal window by dereferencing each element of the array
3284				whose reference is returned b C. The hard clusters can
3285				be written out to disk files by invoking C.
3286				This method writes out the clusters to files, one cluster per file. What is written
3287				out to each file consists of the symbolic names of the data records that belong to
3288				the cluster corresponding to that file. The clusters are placed in files with names
3289				like
3290
3291				naive_bayes_cluster1.dat
3292				naive_bayes_cluster2.dat
3293				...
3294
3295				The soft clusters on the other hand are created by calling
3296				C
3297				on an instance of the module, where the argument C<$theta1>
3298				is the threshold for deciding whether a data element belongs
3299				in a soft cluster. The posterior class probability at a
3300				data element must exceed the threshold for the element to
3301				belong to the corresponding cluster. The soft cluster can
3302				be written out to disk files by calling
3303				C.
3304				As with the hard clusters, each cluster is placed in a separate
3305				file. The filenames for such clusters look like:
3306
3307				posterior_prob_cluster1.dat
3308				posterior_prob_cluster2.dat
3309				...
3310
3311				=head1 WHAT IF THE NUMBER OF CLUSTERS IS UNKNOWN?
3312
3313				The module constructor requires that you supply a value for the parameter C, which
3314				is the number of clusters you expect to see in the data. But what if you do not have
3315				a good value for C? Note that it is possible to search for the best C by using
3316				the two clustering quality criteria included in the module. However, I have
3317				intentionally not yet incorporated that feature in the module because it slows down
3318				the execution of the code --- especially when the dimensionality of the data
3319				increases. However, nothing prevents you from writing a script --- along the lines
3320				of the five "canned_example" scripts in the C directory --- that would use
3321				the two clustering quality metrics for finding the best choice for C for a given
3322				dataset. Obviously, you will now have to incorporate the call to the constructor in
3323				a loop and check the value of the quality measures for each value of C.
3324
3325				=head1 SOME RESULTS OBTAINED WITH THIS MODULE
3326
3327				If you would like to see some results that have been obtained with this module, check
3328				out Section 7 of the report
3329				L.
3330
3331				=head1 THE C DIRECTORY
3332
3333				Becoming familiar with this directory should be your best
3334				strategy to become comfortable with this module (and its
3335				future versions). You are urged to start by executing the
3336				following five example scripts:
3337
3338				=over 16
3339
3340				=item I
3341
3342				This example applies the EM algorithm to the data contained in the datafile
3343				C. The mixture data in the file corresponds to three overlapping
3344				Gaussian components in a star-shaped pattern. The EM based clustering for this data
3345				is shown in the files C and
3346				C, the former displaying the hard clusters
3347				obtained by using the naive Bayes' classifier and the latter showing the soft
3348				clusters obtained on the basis of the posterior class probabilities at the data
3349				points.
3350
3351				=item I
3352
3353				The datafile used in this example is C. This mixture data
3354				corresponds to two well-separated relatively isotropic Gaussians. EM based clustering for this
3355				data is shown in the files C and
3356				C, the former displaying the hard clusters
3357				obtained by using the naive Bayes' classifier and the latter showing the soft
3358				clusters obtained by using the posterior class probabilities at the data points.
3359
3360				=item I
3361
3362				Like the first example, this example again involves three Gaussians, but now their
3363				means are not co-located. Additionally, we now seed the clusters manually by
3364				specifying three selected data points as the initial guesses for the cluster means.
3365				The datafile used for this example is C. The EM based clustering
3366				for this data is shown in the files C and
3367				C, the former displaying the hard clusters
3368				obtained by using the naive Bayes' classifier and the latter showing the soft
3369				clusters obtained on the basis of the posterior class probabilities at the data
3370				points.
3371
3372				=item I
3373
3374				Whereas the three previous examples demonstrated EM based clustering of 2D data, we
3375				now present an example of clustering in 3D. The datafile used in this example is
3376				C. This mixture data corresponds to three well-separated but highly
3377				anisotropic Gaussians. The EM derived clustering for this data is shown in the files
3378				C and C, the
3379				former displaying the hard clusters obtained by using the naive Bayes' classifier and
3380				the latter showing the soft clusters obtained on the basis of the posterior class
3381				probabilities at the data points.
3382
3383				=item I
3384
3385				We again demonstrate clustering in 3D but now we have one Gaussian cluster that
3386				"cuts" through the other two Gaussian clusters. The datafile used in this example is
3387				C. The three Gaussians in this case are highly overlapping and
3388				highly anisotropic. The EM derived clustering for this data is shown in the files
3389				C and C, the
3390				former displaying the hard clusters obtained by using the naive Bayes' classifier and
3391				the latter showing the soft clusters obtained through the posterior class
3392				probabilities at the data points.
3393
3394				=item I
3395
3396				This example, added in Version 1.2, demonstrates the use of this module for 1-D data.
3397				In order to visualize the clusters for the 1-D case, we show them through their
3398				respective histograms. The datafile used in this example is C. The
3399				data consists of two overlapping Gaussians. The EM derived clustering for this data
3400				is shown in the files C and
3401				C, the former displaying the hard clusters
3402				obtained by using the naive Bayes' classifier and the latter showing the soft
3403				clusters obtained through the posterior class probabilities at the data points.
3404
3405				=back
3406
3407				Going through the six examples listed above will make you familiar with how to make
3408				the calls to the clustering and the visualization methods. The C directory
3409				also includes several parameter files with names like
3410
3411				param1.txt
3412				param2.txt
3413				param3.txt
3414				...
3415
3416				These were used to generate the synthetic data for which the results are shown in the
3417				C directory. Just make a copy of one of these files and edit it if you
3418				would like to generate your own multivariate data for clustering. Note that you can
3419				generate data with any dimensionality through appropriate entries in the parameter
3420				file.
3421
3422				=head1 CAVEATS
3423
3424				When you run the scripts in the C directory, your results will NOT always
3425				look like what I have shown in the PNG image files in the directory. As mentioned
3426				earlier in Description, the EM algorithm starting from randomly chosen initial
3427				guesses for the cluster means can get stuck in a local maximum.
3428
3429				That raises an interesting question of how one judges the correctness of clustering
3430				results when dealing with real experimental data. For real data, the best approach
3431				is to try the EM algorithm multiple times with all of the seeding options included in
3432				this module. It would be safe to say that, at least in low dimensional spaces and
3433				with sufficient data, a majority of your runs should yield "correct" results.
3434
3435				Also bear in mind that a pure Perl implementation is not meant for the clustering of
3436				very large data files. It is really designed more for researching issues related to
3437				EM based approaches to clustering.
3438
3439
3440				=head1 REQUIRED
3441
3442				This module requires the following three modules:
3443
3444				Math::Random
3445				Graphics::GnuplotIF
3446				Math::GSL::Matrix
3447
3448				the first for generating the multivariate random numbers, the second for the
3449				visualization of the clusters, and the last for access to the Perl wrappers for the
3450				GNU Scientific Library. The C module of this library is used for various
3451				algebraic operations on the covariance matrices of the Gaussians.
3452
3453				=head1 EXPORT
3454
3455				None by design.
3456
3457
3458				=head1 BUGS
3459
3460				Please notify the author if you encounter any bugs. When sending email, please place
3461				the string 'Algorithm EM' in the subject line.
3462
3463				=head1 INSTALLATION
3464
3465				Download the archive from CPAN in any directory of your choice. Unpack the archive
3466				with a command that on a Linux machine would look like:
3467
3468				tar zxvf Algorithm-ExpectationMaximization-1.21.tar.gz
3469
3470				This will create an installation directory for you whose name will be
3471				C. Enter this directory and execute the
3472				following commands for a standard install of the module if you have root privileges:
3473
3474				perl Makefile.PL
3475				make
3476				make test
3477				sudo make install
3478
3479				If you do not have root privileges, you can carry out a non-standard install the
3480				module in any directory of your choice by:
3481
3482				perl Makefile.PL prefix=/some/other/directory/
3483				make
3484				make test
3485				make install
3486
3487				With a non-standard install, you may also have to set your PERL5LIB environment
3488				variable so that this module can find the required other modules. How you do that
3489				would depend on what platform you are working on. In order to install this module in
3490				a Linux machine on which I use tcsh for the shell, I set the PERL5LIB environment
3491				variable by
3492
3493				setenv PERL5LIB /some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
3494
3495				If I used bash, I'd need to declare:
3496
3497				export PERL5LIB=/some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
3498
3499
3500				=head1 ACKNOWLEDGMENTS
3501
3502				Version 1.2 is a result of the feedback received from Paul
3503				May of University of Birmingham. Thanks, Paul!
3504
3505				=head1 AUTHOR
3506
3507				Avinash Kak, kak@purdue.edu
3508
3509				If you send email, please place the string "EM Algorithm" in your
3510				subject line to get past my spam filter.
3511
3512				=head1 COPYRIGHT
3513
3514				This library is free software; you can redistribute it and/or modify it under the
3515				same terms as Perl itself.
3516
3517				Copyright 2014 Avinash Kak
3518
3519				=cut
3520