File Coverage

blib/lib/Algorithm/ExpectationMaximization.pm

Criterion	Covered	Total	%
statement	22	24	91.6
branch			n/a
condition			n/a
subroutine	8	8	100.0
pod			n/a
total	30	32	93.7

line	stmt	sub	time	code
1				package Algorithm::ExpectationMaximization;
2
3				#---------------------------------------------------------------------------
4				# Copyright (c) 2014 Avinash Kak. All rights reserved. This program is free
5				# software. You may modify and/or distribute it under the same terms as Perl itself.
6				# This copyright notice must remain attached to the file.
7				#
8				# Algorithm::ExpectationMaximization is a pure Perl implementation for
9				# Expectation-Maximization based clustering of multi-dimensional data that can be
10				# modeled as a Gaussian mixture.
11				# ---------------------------------------------------------------------------
12
13	1	1	24009	use 5.10.0;
	1		4
	1		64
14	1	1	6	use strict;
	1		2
	1		157
15	1	1	7	use warnings;
	1		87
	1		59
16	1	1	7	use Carp;
	1		2
	1		119
17	1	1	7	use File::Basename;
	1		1
	1		106
18	1	1	911	use Math::Random;
	1		9382
	1		145
19	1	1	808	use Graphics::GnuplotIF;
	1		15118
	1		74
20	1	1	1351	use Math::GSL::Matrix;
	0
	0
21				use Scalar::Util 'blessed';
22
23				our $VERSION = '1.22';
24
25				# from perl docs:
26				my $_num_regex = '^[+-]?\ (\d+(\.\d)?\|\.\d+)([eE][+-]?\d+)?$';
27
28				# Constructor:
29				sub new {
30				my ($class, %args) = @_;
31				my @params = keys %args;
32				croak "\nYou have used a wrong name for a keyword argument " .
33				"--- perhaps a misspelling\n"
34				if check_for_illegal_params(@params) == 0;
35				bless {
36				_datafile => $args{datafile} \|\| croak("datafile required"),
37				_mask => $args{mask} \|\| croak("mask required"),
38				_K => $args{K} \|\| croak("number of clusters required"),
39				_terminal_output => $args{terminal_output} \|\| 0,
40				_seeding => $args{seeding} \|\| 'random',
41				_seed_tags => $args{seed_tags} \|\| [],
42				_max_em_iterations=> $args{max_em_iterations} \|\| 100,
43				_class_priors => $args{class_priors} \|\| [],
44				_debug => $args{debug} \|\| 0,
45				_N => 0,
46				_data => {},
47				_data_id_tags => [],
48				_clusters => [],
49				_cluster_centers => [],
50				_data_dimensions => 0,
51				_cluster_normalizers => [],
52				_cluster_means => [],
53				_cluster_covariances => [],
54				_class_labels_for_data => {},
55				_class_probs_at_each_data_point => {},
56				_expected_class_probs => {},
57				_old_priors => [],
58				_old_old_priors => [],
59				_fisher_quality_vs_iteration => [],
60				_mdl_quality_vs_iterations => [],
61				}, $class;
62				}
63
64
65				sub read_data_from_file {
66				my $self = shift;
67				my $filename = $self->{_datafile};
68				$self->read_data_from_file_csv() if $filename =~ /.csv$/;
69				$self->read_data_from_file_dat() if $filename =~ /.dat$/;
70				}
71
72				sub read_data_from_file_csv {
73				my $self = shift;
74				my $numregex = '[+-]?\ (\d+(\.\d)?\|\.\d+)([eE][+-]?\d+)?';
75				my $filename = $self->{_datafile} \|\| die "you did not specify a file with the data to be clustered";
76				my $mask = $self->{_mask};
77				my @mask = split //, $mask;
78				$self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
79				print "data dimensionality: $self->{_data_dimensions} \n"if $self->{_terminal_output};
80				open FILEIN, $filename or die "Unable to open $filename: $!";
81				die("Aborted. get_training_data_csv() is only for CSV files") unless $filename =~ /\.csv$/;
82				local $/ = undef;
83				my @all_data = split /\s+/, ;
84				my %data_hash = ();
85				my @data_tags = ();
86				foreach my $record (@all_data) {
87				my @splits = split /,/, $record;
88				die "\nYour mask size (including `N' and 1's and 0's) does not match\n" .
89				"the size of at least one of the data records in the file.\n"
90				unless scalar(@mask) == scalar(@splits);
91				my $record_name = shift @splits;
92				$data_hash{$record_name} = \@splits;
93				push @data_tags, $record_name;
94				}
95				$self->{_data} = \%data_hash;
96				$self->{_data_id_tags} = \@data_tags;
97				$self->{_N} = scalar @data_tags;
98				# Need to make the following call to set the global mean and covariance:
99				# my $covariance = $self->estimate_mean_and_covariance(\@data_tags);
100				# Need to make the following call to set the global eigenvec eigenval sets:
101				# $self->eigen_analysis_of_covariance($covariance);
102				if ( defined($self->{_K}) && ($self->{_K} > 0) ) {
103				carp "\n\nWARNING: YOUR K VALUE IS TOO LARGE.\n The number of data " .
104				"points must satisfy the relation N > 2xK**2 where K is " .
105				"the number of clusters requested for the clusters to be " .
106				"meaningful $!"
107				if ( $self->{_N} < (2 * $self->{_K} ** 2) );
108				print "\n\n\n";
109				}
110				}
111
112				sub read_data_from_file_dat {
113				my $self = shift;
114				my $datafile = $self->{_datafile};
115				my $mask = $self->{_mask};
116				my @mask = split //, $mask;
117				$self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
118				print "data dimensionality: $self->{_data_dimensions} \n"
119				if $self->{_terminal_output};
120				open INPUT, $datafile
121				or die "unable to open file $datafile: $!";
122				chomp( my @raw_data = );
123				close INPUT;
124				# Transform strings into number data
125				foreach my $record (@raw_data) {
126				next unless $record;
127				next if $record =~ /^#/;
128				my @data_fields;
129				my @fields = split /\s+/, $record;
130				die "\nABORTED: Mask size does not correspond to row record size"
131				if $#fields != $#mask;
132				my $record_id;
133				foreach my $i (0..@fields-1) {
134				if ($mask[$i] eq '0') {
135				next;
136				} elsif ($mask[$i] eq 'N') {
137				$record_id = $fields[$i];
138				} elsif ($mask[$i] eq '1') {
139				push @data_fields, $fields[$i];
140				} else {
141				die "misformed mask for reading the data file";
142				}
143				}
144				my @nums = map {/$_num_regex/;$_} @data_fields;
145				$self->{_data}->{ $record_id } = \@nums;
146				}
147				my @all_data_ids = keys %{$self->{_data}};
148				$self->{_data_id_tags} = \@all_data_ids;
149				$self->{_N} = scalar @all_data_ids;
150				if ( defined($self->{_K}) && ($self->{_K} > 0) ) {
151				carp "\n\nWARNING: YOUR K VALUE IS TOO LARGE.\n The number of data " .
152				"points must satisfy the relation N > 2xK**2 where K is " .
153				"the number of clusters requested for the clusters to be " .
154				"meaningful $!"
155				if ( $self->{_N} < (2 * $self->{_K} ** 2) );
156				}
157				}
158
159
160				# This is the heart of the module --- in the sense that this method implements the EM
161				# algorithm for the estimating the parameters of a Gaussian mixture model for the
162				# data. In the implementation shown below, we declare convergence for the EM
163				# algorithm when the change in the class priors over three iterations falls below a
164				# threshold. The current value of this threshold, as can be seen in the function
165				# compare_array_floats(), is 0.00001.
166				sub EM {
167				my $self = shift;
168				$self->initialize_class_priors();
169				for (my $em_iteration=0; $em_iteration < $self->{_max_em_iterations};
170				$em_iteration++) {
171				if ($em_iteration == 0) {
172				print "\nSeeding the EM algorithm with:\n";
173				$self->display_seeding_stats();
174				print "\nFinished displaying the seeding information\n";
175				print "\nWill print out a dot for each iteration of EM:\n\n";
176				}
177				my $progress_indicator = $em_iteration % 5 == 0 ? $em_iteration : ".";
178				print $progress_indicator;
179				foreach my $data_id (@{$self->{_data_id_tags}}) {
180				$self->{_class_probs_at_each_data_point}->{$data_id} = [];
181				$self->{_expected_class_probs}->{$data_id} = [];
182				}
183				# Calculate prob(x \| C_i) --- this is the prob of data point x as
184				# a member of class C_i. You must do this for all K classes:
185				foreach my $cluster_index(0..$self->{_K}-1) {
186				$self->find_prob_at_each_datapoint_for_given_mean_and_covar(
187				$self->{_cluster_means}->[$cluster_index],
188				$self->{_cluster_covariances}->[$cluster_index] );
189				}
190				$self->{_cluster_normalizers} = [];
191				if ($self->{_debug}) {
192				print "\n\nDisplaying prob of a data point vis-a-vis each class:\n\n";
193				foreach my $data_id (sort keys %{$self->{_data}}) {
194				my $class_probs_at_a_point =
195				$self->{_class_probs_at_each_data_point}->{$data_id};
196				print "Class probs for $data_id: @$class_probs_at_a_point\n"
197				}
198				}
199				# Calculate prob(C_i \| x) which is the posterior prob of class
200				# considered as a r.v. to be C_i at a given point x. For a given
201				# x, the sum of such probabilities over all C_i must add up to 1:
202				$self->find_expected_classes_at_each_datapoint();
203				if ($self->{_debug}) {
204				print "\n\nDisplaying expected class probs at each data point:\n\n";
205				foreach my $data_id (sort keys %{$self->{_expected_class_probs}}) {
206				my $expected_classes_at_a_point =
207				$self->{_expected_class_probs}->{$data_id};
208				print "Expected classes $data_id: @$expected_classes_at_a_point\n";
209				}
210				}
211				# 1. UPDATE MEANS:
212				my @new_means;
213				foreach my $cluster_index(0..$self->{_K}-1) {
214				$new_means[$cluster_index] =
215				Math::GSL::Matrix->new($self->{_data_dimensions},1);
216				$new_means[$cluster_index]->zero();
217				foreach my $data_id (keys %{$self->{_data}}) {
218				my $data_record = $self->{_data}->{$data_id};
219				my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
220				$data_vec->set_col(0,$data_record);
221				$new_means[$cluster_index] +=
222				$self->{_expected_class_probs}->{$data_id}->[$cluster_index] *
223				$data_vec->copy();
224				$self->{_cluster_normalizers}->[$cluster_index] +=
225				$self->{_expected_class_probs}->{$data_id}->[$cluster_index];
226				}
227				$new_means[$cluster_index] *= 1.0 /
228				$self->{_cluster_normalizers}->[$cluster_index];
229				}
230				if ($self->{_debug}) {
231				foreach my $meanvec (@new_means) {
232				display_matrix("At EM Iteration $em_iteration, new mean vector is",
233				$meanvec);
234				}
235				}
236				$self->{_cluster_means} = \@new_means;
237				# 2. UPDATE COVARIANCES:
238				my @new_covariances;
239				foreach my $cluster_index(0..$self->{_K}-1) {
240				$new_covariances[$cluster_index] =
241				Math::GSL::Matrix->new($self->{_data_dimensions},
242				$self->{_data_dimensions});
243				$new_covariances[$cluster_index]->zero();
244				my $normalizer = 0;
245				foreach my $data_id (keys %{$self->{_data}}) {
246				my $data_record = $self->{_data}->{$data_id};
247				my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
248				$data_vec->set_col(0,$data_record);
249				my $mean_subtracted_data =
250				$data_vec - $self->{_cluster_means}->[$cluster_index];
251				my $outer_product = outer_product($mean_subtracted_data,
252				$mean_subtracted_data);
253				$new_covariances[$cluster_index] +=
254				$self->{_expected_class_probs}->{$data_id}->[$cluster_index] *
255				$outer_product;
256				}
257				$new_covariances[$cluster_index] *=
258				1.0 /
259				$self->{_cluster_normalizers}->[$cluster_index];
260				}
261				$self->{_cluster_covariances} = \@new_covariances;
262				# 3. UPDATE PRIORS:
263				$self->{_old_old_priors} = deep_copy_array( $self->{_old_priors} )
264				if @{$self->{_old_priors}} > 0;
265				$self->{_old_priors} = deep_copy_array( $self->{_class_priors} );
266				foreach my $cluster_index(0..$self->{_K}-1) {
267				$self->{_class_priors}->[$cluster_index] =
268				$self->{_cluster_normalizers}->[$cluster_index] / $self->{_N};
269				}
270				my @priors = @{$self->{_class_priors}};
271				print "\nUpdated priors: @priors\n\n\n" if $self->{_debug};
272				push @{$self->{_fisher_quality_vs_iteration}},
273				$self->clustering_quality_fisher();
274				push @{$self->{_mdl_quality_vs_iteration}}, $self->clustering_quality_mdl();
275				if ( ($em_iteration > 5 && $self->reached_convergence())
276				\|\| ($em_iteration == $self->{_max_em_iterations} - 1) ) {
277				my @old_old_priors = @{$self->{_old_old_priors}};
278				my @old_priors = @{$self->{_old_priors}};
279				print "\n\nPrevious to previous priors: @old_old_priors\n";
280				print "Previous priors: @old_priors\n";
281				print "Current class priors: @{$self->{_class_priors}}\n";
282				print "\n\nCONVERGENCE ACHIEVED AT ITERATION $em_iteration\n\n"
283				if $em_iteration < $self->{_max_em_iterations} - 1;
284				last;
285				}
286				}
287				print "\n\n\n";
288				}
289
290				sub reached_convergence {
291				my $self = shift;
292				return 1 if compare_array_floats($self->{_old_old_priors},
293				$self->{_old_priors})
294				&&
295				compare_array_floats($self->{_old_priors},
296				$self->{_class_priors});
297				return 0;
298				}
299
300				# Classify the data into disjoint clusters using the Naive Bayes' classification:
301				sub run_bayes_classifier {
302				my $self = shift;
303				$self->classify_all_data_tuples_bayes($self->{_cluster_means},
304				$self->{_cluster_covariances});
305				}
306
307				# Should NOT be called before run_bayes_classifier is run
308				sub return_disjoint_clusters {
309				my $self = shift;
310				return $self->{_clusters};
311				}
312
313				sub return_clusters_with_posterior_probs_above_threshold {
314				my $self = shift;
315				my $theta = shift;
316				my @class_distributions;
317				foreach my $cluster_index (0..$self->{_K}-1) {
318				push @class_distributions, [];
319				}
320				foreach my $data_tag (@{$self->{_data_id_tags}}) {
321				foreach my $cluster_index (0..$self->{_K}-1) {
322				push @{$class_distributions[$cluster_index]}, $data_tag
323				if $self->{_expected_class_probs}->{$data_tag}->[$cluster_index]
324				> $theta;
325				}
326				}
327				return \@class_distributions;
328				}
329
330				sub return_individual_class_distributions_above_given_threshold {
331				my $self = shift;
332				my $theta = shift;
333				my @probability_distributions;
334				foreach my $cluster_index (0..$self->{_K}-1) {
335				push @probability_distributions, [];
336				}
337				foreach my $cluster_index (0..$self->{_K}-1) {
338				my $mean_vec = $self->{_cluster_means}->[$cluster_index];
339				my $covar = $self->{_cluster_covariances}->[$cluster_index];
340				foreach my $data_id (keys %{$self->{_data}}) {
341				my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
342				$data_vec->set_col( 0, $self->{_data}->{$data_id});
343				my $datavec_minus_mean = $data_vec - $mean_vec;
344				display_matrix( "datavec minus mean is ", $datavec_minus_mean )
345				if $self->{_debug};
346				my $exponent = undef;
347				if ($self->{_data_dimensions} > 1) {
348				$exponent = -0.5 * vector_matrix_multiply(
349				transpose($datavec_minus_mean),
350				matrix_vector_multiply($covar->inverse(), $datavec_minus_mean ) );
351				} else {
352				my @var_inverse = $covar->inverse()->as_list;
353				my $var_inverse_val = $var_inverse[0];
354				my @data_minus_mean = $datavec_minus_mean->as_list;
355				my $data_minus_mean_val = $data_minus_mean[0];
356				$exponent = -0.5 * ($data_minus_mean_val ** 2) * $var_inverse_val;
357				}
358				print "\nThe value of the exponent is: $exponent\n\n" if $self->{_debug};
359				my $coefficient = 1.0 / \
360				( (2 * $Math::GSL::Const::M_PI)*$self->{_data_dimensions} sqrt($covar->det()) );
361				my $prob = $coefficient * exp($exponent);
362				push @{$probability_distributions[$cluster_index]}, $data_id
363				if $prob > $theta;
364				}
365				}
366				return \@probability_distributions;
367				}
368
369				sub return_estimated_priors {
370				my $self = shift;
371				return $self->{_class_priors};
372				}
373
374				# Calculates the MDL (Minimum Description Length) clustering criterion according to
375				# Rissanen. (J. Rissanen: "Modeling by Shortest Data Description," Automatica, 1978,
376				# and "A Universal Prior for Integers and Estimation by Minimum Description Length,"
377				# Annals of Statistics, 1983.) The MDL criterion is a difference of a log-likelihood
378				# term for all of the observed data and a model-complexity penalty term. In general,
379				# both the log-likelihood and the model-complexity terms increase as the number of
380				# clusters increases. The form of the MDL criterion used in the implementation below
381				# uses for the penalty term the Bayesian Information Criterion (BIC) of G. Schwartz,
382				# "Estimating the Dimensions of a Model," The Annals of Statistics, 1978. In
383				# general, the smaller the value of the MDL quality measure calculated below, the
384				# better the clustering of the data.
385				sub clustering_quality_mdl {
386				my $self = shift;
387				# Calculate the inverses of all of the covariance matrices in order to avoid
388				# having to calculate them repeatedly inside the inner 'foreach' loop in the
389				# main part of this method. Here we go:
390				my @covar_inverses;
391				foreach my $cluster_index (0..$self->{_K}-1) {
392				my $covar = $self->{_cluster_covariances}->[$cluster_index];
393				push @covar_inverses, $covar->inverse();
394				}
395				# For the clustering quality, first calculate the log-likelihood of all the
396				# observed data:
397				my $log_likelihood = 0;
398				foreach my $tag (@{$self->{_data_id_tags}}) {
399				my $likelihood_for_each_tag = 0;
400				foreach my $cluster_index (0..$self->{_K}-1) {
401				my $mean_vec = $self->{_cluster_means}->[$cluster_index];
402				my $covar = $self->{_cluster_covariances}->[$cluster_index];
403				my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
404				$data_vec->set_col( 0, $self->{_data}->{$tag});
405				my $datavec_minus_mean = $data_vec - $mean_vec;
406				my $exponent = undef;
407				if ($self->{_data_dimensions} > 1) {
408				$exponent = -0.5 * vector_matrix_multiply(
409				transpose($datavec_minus_mean),
410				matrix_vector_multiply($covar_inverses[$cluster_index],
411				$datavec_minus_mean ) );
412				} else {
413				my @var_inverse = $covar_inverses[$cluster_index]->as_list;
414				my $var_inverse_val = $var_inverse[0];
415				my @data_minus_mean = $datavec_minus_mean->as_list;
416				my $data_minus_mean_val = $data_minus_mean[0];
417				$exponent = -0.5 * ($data_minus_mean_val ** 2) * $var_inverse_val;
418				}
419				next if $covar->det() < 0;
420				my $coefficient = 1.0 /
421				( (2 * $Math::GSL::Const::M_PI)**$self->{_data_dimensions}
422				* sqrt($covar->det()) );
423				my $prob = $coefficient * exp($exponent);
424				$likelihood_for_each_tag +=
425				$prob * $self->{_class_priors}->[$cluster_index];
426				}
427				$log_likelihood += log( $likelihood_for_each_tag );
428				}
429				# Now calculate the model complexity penalty. $L is the total number of
430				# parameters it takes to specify a mixture of K Gaussians. If d is the
431				# dimensionality of the data space, the covariance matrix of each Gaussian takes
432				# (d**2 -d)/2 + d = d(d+1)/2 parameters since this matrix must be symmetric. And
433				# then you need d mean value parameters, and one prior probability parameter
434				# for the Gaussian. So $L = K[1 + d + d(d+1)/2] - 1 where the final '1' that
435				# is subtracted is to account for the normalization on the class priors.
436				my $L = (0.5 * $self->{_K} *
437				($self->{_data_dimensions}*2 + 3$self->{_data_dimensions} + 2) ) - 1;
438				my $model_complexity_penalty = 0.5 * $L * log( $self->{_N} );
439				my $mdl_criterion = -1 * $log_likelihood + $model_complexity_penalty;
440				return $mdl_criterion;
441				}
442
443				# For our second measure of clustering quality, we use `trace( SW^-1 . SB)' where SW
444				# is the within-class scatter matrix, more commonly denoted S_w, and SB the
445				# between-class scatter matrix, more commonly denoted S_b (the underscore means
446				# subscript). This measure can be thought of as the normalized average distance
447				# between the clusters, the normalization being provided by average cluster
448				# covariance SW^-1. Therefore, the larger the value of this quality measure, the
449				# better the separation between the clusters. Since this measure has its roots in
450				# the Fisher linear discriminant function, we incorporate the word 'fisher' in the
451				# name of the quality measure. Note that this measure is good only when the clusters
452				# are disjoint. When the clusters exhibit significant overlap, the numbers produced
453				# by this quality measure tend to be generally meaningless. As an extreme case,
454				# let's say your data was produced by a set of Gaussians, all with the same mean
455				# vector, but each with a distinct covariance. For this extreme case, this measure
456				# will produce a value close to zero --- depending on the accuracy with which the
457				# means are estimated --- even when your clusterer is doing a good job of identifying
458				# the individual clusters.
459				sub clustering_quality_fisher {
460				my $self = shift;
461				my @cluster_quality_indices;
462				my $fisher_trace = 0;
463				my $S_w =
464				Math::GSL::Matrix->new($self->{_data_dimensions}, $self->{_data_dimensions});
465				$S_w->zero;
466				my $S_b =
467				Math::GSL::Matrix->new($self->{_data_dimensions}, $self->{_data_dimensions});
468				$S_b->zero;
469				my $global_mean = Math::GSL::Matrix->new($self->{_data_dimensions},1);
470				$global_mean->zero;
471				foreach my $cluster_index(0..$self->{_K}-1) {
472				$global_mean = $self->{_class_priors}->[$cluster_index] *
473				$self->{_cluster_means}->[$cluster_index];
474				}
475				foreach my $cluster_index(0..$self->{_K}-1) {
476				$S_w += $self->{_cluster_covariances}->[$cluster_index] *
477				$self->{_class_priors}->[$cluster_index];
478				my $class_mean_minus_global_mean = $self->{_cluster_means}->[$cluster_index]
479				- $global_mean;
480				my $outer_product = outer_product( $class_mean_minus_global_mean,
481				$class_mean_minus_global_mean );
482				$S_b += $self->{_class_priors}->[$cluster_index] * $outer_product;
483				}
484				my $fisher = matrix_multiply($S_w->inverse, $S_b);
485				return $fisher unless defined blessed($fisher);
486				return matrix_trace($fisher);
487				}
488
489				sub display_seeding_stats {
490				my $self = shift;
491				foreach my $cluster_index(0..$self->{_K}-1) {
492				print "\nSeeding for cluster $cluster_index:\n";
493				my $mean = $self->{_cluster_means}->[$cluster_index];
494				display_matrix("The mean is: ", $mean);
495				my $covariance = $self->{_cluster_covariances}->[$cluster_index];
496				display_matrix("The covariance is: ", $covariance);
497				}
498				}
499
500				sub display_fisher_quality_vs_iterations {
501				my $self = shift;
502				print "\n\nFisher Quality vs. Iterations: " .
503				"@{$self->{_fisher_quality_vs_iteration}}\n\n";
504				}
505
506				sub display_mdl_quality_vs_iterations {
507				my $self = shift;
508				print "\n\nMDL Quality vs. Iterations: @{$self->{_mdl_quality_vs_iteration}}\n\n";
509				}
510
511				sub find_prob_at_each_datapoint_for_given_mean_and_covar {
512				my $self = shift;
513				my $mean_vec_ref = shift;
514				my $covar_ref = shift;
515				foreach my $data_id (keys %{$self->{_data}}) {
516				my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
517				$data_vec->set_col( 0, $self->{_data}->{$data_id});
518				if ($self->{_debug}) {
519				display_matrix("data vec in find prob function", $data_vec);
520				display_matrix("mean vec in find prob function", $mean_vec_ref);
521				display_matrix("covariance in find prob function", $covar_ref);
522				}
523				my $datavec_minus_mean = $data_vec - $mean_vec_ref;
524				display_matrix( "datavec minus mean is ", $datavec_minus_mean ) if $self->{_debug};
525				my $exponent = undef;
526				if ($self->{_data_dimensions} > 1) {
527				$exponent = -0.5 * vector_matrix_multiply( transpose($datavec_minus_mean),
528				matrix_vector_multiply( $covar_ref->inverse(), $datavec_minus_mean ) );
529				} elsif (defined blessed($covar_ref)) {
530				my @data_minus_mean = $datavec_minus_mean->as_list;
531				my $data_minus_mean_val = $data_minus_mean[0];
532				my @covar_as_matrix = $covar_ref->as_list;
533				my $covar_val = $covar_as_matrix[0];
534				$exponent = -0.5 * ($data_minus_mean_val ** 2) / $covar_val;
535				} else {
536				my @data_minus_mean = $datavec_minus_mean->as_list;
537				my $data_minus_mean_val = $data_minus_mean[0];
538				$exponent = -0.5 * ($data_minus_mean_val ** 2) / $covar_ref;
539				}
540				print "\nThe value of the exponent is: $exponent\n\n" if $self->{_debug};
541				my $coefficient = undef;
542				if ($self->{_data_dimensions} > 1) {
543				$coefficient = 1.0 / sqrt( ((2 * $Math::GSL::Const::M_PI) ** $self->{_data_dimensions}) *
544				$covar_ref->det()) ;
545				} elsif (!defined blessed($covar_ref)) {
546				$coefficient = 1.0 / sqrt(2 * $covar_ref * $Math::GSL::Const::M_PI);
547				} else {
548				my @covar_as_matrix = $covar_ref->as_list;
549				my $covar_val = $covar_as_matrix[0];
550				$coefficient = 1.0 / sqrt(2 * $covar_val * $Math::GSL::Const::M_PI);
551				}
552				my $prob = $coefficient * exp($exponent);
553				push @{$self->{_class_probs_at_each_data_point}->{$data_id}}, $prob;
554				}
555				}
556
557				sub find_expected_classes_at_each_datapoint {
558				my $self = shift;
559				my @priors = @{$self->{_class_priors}};
560				foreach my $data_id (sort keys %{$self->{_class_probs_at_each_data_point}}) {
561				my $numerator =
562				vector_2_vector_multiply(
563				$self->{_class_probs_at_each_data_point}->{$data_id},
564				$self->{_class_priors} );
565				my $sum = 0;
566				foreach my $part (@$numerator) {
567				$sum += $part;
568				}
569				$self->{_expected_class_probs}->{$data_id} = [map $_/$sum, @{$numerator}];
570				}
571				}
572
573				sub initialize_class_priors {
574				my $self = shift;
575				if (@{$self->{_class_priors}} == 0) {
576				my $prior = 1.0 / $self->{_K};
577				foreach my $class_index (0..$self->{_K}-1) {
578				push @{$self->{_class_priors}}, $prior;
579				}
580				}
581				die "Mismatch between number of values for class priors " .
582				"and the number of clusters expected"
583				unless @{$self->{_class_priors}} == $self->{_K};
584				my $sum = 0;
585				foreach my $prior (@{$self->{_class_priors}}) {
586				$sum += $prior;
587				}
588				die "Your priors in the constructor call do not add up to 1"
589				unless abs($sum - 1) < 0.001;
590				print "\nInitially assumed class priors are: @{$self->{_class_priors}}\n";
591				}
592
593				sub estimate_class_priors {
594				my $self = shift;
595				foreach my $datatag (keys %{$self->{_data}}) {
596				my $class_label = $self->{_class_labels}->{$datatag};
597				$self->{_class_priors}[$class_label]++;
598				}
599				foreach my $prior (@{$self->{_class_priors}}) {
600				$prior /= $self->{_total_number_data_tuples};
601				}
602				foreach my $prior (@{$self->{_class_priors}}) {
603				print "class priors: @{$self->{_class_priors}}\n";
604				}
605				}
606
607				sub classify_all_data_tuples_bayes {
608				my $self = shift;
609				my $mean_vecs_ref = shift;
610				my $covariances_ref = shift;
611				my @new_clusters;
612				foreach my $index (0..$self->{_K}-1) {
613				push @new_clusters, [];
614				}
615				foreach my $data_id (@{$self->{_data_id_tags}}) {
616				my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
617				$data_vec->set_col( 0, deep_copy_array($self->{_data}->{$data_id}));
618				my $cluster_index_for_tuple =
619				$self->classify_a_data_point_bayes($data_vec,
620				$mean_vecs_ref, $covariances_ref);
621				$self->{_class_labels}->{$data_id} = $cluster_index_for_tuple;
622				push @{$new_clusters[$cluster_index_for_tuple]}, $data_id;
623				}
624				$self->{_clusters} = \@new_clusters;
625				}
626
627				sub classify_a_data_point_bayes {
628				my $self = shift;
629				my $data_vec = shift;
630				my $mean_vecs_ref = shift;
631				my $covariances_ref = shift;
632				my @cluster_mean_vecs = @$mean_vecs_ref;
633				my @cluster_covariances = @$covariances_ref;
634				my @log_likelihoods;
635				foreach my $cluster_index (0..@cluster_mean_vecs-1) {
636				my $mean = $cluster_mean_vecs[$cluster_index];
637				my $covariance = $cluster_covariances[$cluster_index];
638				my $datavec_minus_mean = $data_vec - $mean;
639				my $log_likely = undef;
640				if ($self->{_data_dimensions} > 1) {
641				$log_likely = -0.5 * vector_matrix_multiply(
642				transpose($datavec_minus_mean),
643				matrix_vector_multiply( $covariance->inverse(),
644				$datavec_minus_mean ) );
645				} else {
646
647				my @data_minus_mean = $datavec_minus_mean->as_list;
648				my $data_minus_mean_val = $data_minus_mean[0];
649				my @covar_as_matrix = $covariance->as_list;
650				my $covar_val = $covar_as_matrix[0];
651				$log_likely = -0.5 * ($data_minus_mean_val ** 2) / $covar_val;
652
653				}
654				my $posterior_log_likely = $log_likely +
655				log( $self->{_class_priors}[$cluster_index] );
656				push @log_likelihoods, $posterior_log_likely;
657				}
658				my ($minlikely, $maxlikely) = minmax(\@log_likelihoods);
659				my $cluster_index_for_data_point =
660				get_index_at_value( $maxlikely, \@log_likelihoods );
661				return $cluster_index_for_data_point;
662				}
663
664				sub find_cluster_means_and_covariances {
665				my $clusters = shift;
666				my $data_dimensions = find_data_dimensionality($clusters);
667				my (@cluster_mean_vecs, @cluster_covariances);
668				foreach my $cluster_index (0..@$clusters-1) {
669				my ($num_rows,$num_cols) =
670				($data_dimensions,scalar(@{$clusters->[$cluster_index]}));
671				print "\nFor cluster $cluster_index: rows: $num_rows and cols: $num_cols\n";
672				my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
673				my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
674				my $col_index = 0;
675				foreach my $ele (@{$clusters->[$cluster_index]}) {
676				$matrix->set_col($col_index++, $ele);
677				}
678				# display_matrix( "Displaying cluster matrix", $matrix );
679				foreach my $j (0..$num_cols-1) {
680				$mean_vec += $matrix->col($j);
681				}
682				$mean_vec *= 1.0 / $num_cols;
683				push @cluster_mean_vecs, $mean_vec;
684				display_matrix( "Displaying the mean vector", $mean_vec );
685				foreach my $j (0..$num_cols-1) {
686				my @new_col = ($matrix->col($j) - $mean_vec)->as_list;
687				$matrix->set_col($j, \@new_col);
688				}
689				# display_matrix("Displaying mean subtracted data as a matrix", $matrix );
690				my $transposed = transpose( $matrix );
691				# display_matrix("Displaying transposed matrix",$transposed);
692				my $covariance = matrix_multiply( $matrix, $transposed );
693				$covariance *= 1.0 / $num_cols;
694				push @cluster_covariances, $covariance;
695				display_matrix("Displaying the cluster covariance", $covariance );
696				}
697				return (\@cluster_mean_vecs, \@cluster_covariances);
698				}
699
700				sub find_data_dimensionality {
701				my $clusters = shift;
702				my @first_cluster = @{$clusters->[0]};
703				my @first_data_element = @{$first_cluster[0]};
704				return scalar(@first_data_element);
705				}
706
707				sub find_seed_centered_covariances {
708				my $self = shift;
709				my $seed_tags = shift;
710				my (@seed_mean_vecs, @seed_based_covariances);
711				foreach my $seed_tag (@$seed_tags) {
712				my ($num_rows,$num_cols) = ($self->{_data_dimensions}, $self->{_N});
713				my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
714				my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
715				$mean_vec->set_col(0, $self->{_data}->{$seed_tag});
716				push @seed_mean_vecs, $mean_vec;
717				display_matrix( "Displaying the seed mean vector", $mean_vec );
718				my $col_index = 0;
719				foreach my $tag (@{$self->{_data_id_tags}}) {
720				$matrix->set_col($col_index++, $self->{_data}->{$tag});
721				}
722				foreach my $j (0..$num_cols-1) {
723				my @new_col = ($matrix->col($j) - $mean_vec)->as_list;
724				$matrix->set_col($j, \@new_col);
725				}
726				my $transposed = transpose( $matrix );
727				my $covariance = matrix_multiply( $matrix, $transposed );
728				$covariance *= 1.0 / $num_cols;
729				push @seed_based_covariances, $covariance;
730				display_matrix("Displaying the seed covariance", $covariance )
731				if $self->{_debug};
732				}
733				return (\@seed_mean_vecs, \@seed_based_covariances);
734				}
735
736				# The most popular seeding mode for EM is random. We include two other seeding modes
737				# --- kmeans and manual --- since they do produce good results for specialized cases.
738				# For example, when the clusters in your data are non-overlapping and not too
739				# anisotropic, the kmeans based seeding should work at least as well as the random
740				# seeding. In such cases --- AND ONLY IN SUCH CASES --- the kmeans based seeding has
741				# the advantage of avoiding the getting stuck in a local-maximum problem of the EM
742				# algorithm.
743				sub seed_the_clusters {
744				my $self = shift;
745				if ($self->{_seeding} eq 'random') {
746				my @covariances;
747				my @means;
748				my @all_tags = @{$self->{_data_id_tags}};
749				my @seed_tags;
750				foreach my $i (0..$self->{_K}-1) {
751				push @seed_tags, $all_tags[int rand( $self->{_N} )];
752				}
753				print "Random Seeding: Randomly selected seeding tags are @seed_tags\n\n";
754				my ($seed_means, $seed_covars) =
755				$self->find_seed_centered_covariances(\@seed_tags);
756				$self->{_cluster_means} = $seed_means;
757				$self->{_cluster_covariances} = $seed_covars;
758				} elsif ($self->{_seeding} eq 'kmeans') {
759				$self->kmeans();
760				my $clusters = $self->{_clusters};
761				my @dataclusters;
762				foreach my $index (0..@$clusters-1) {
763				push @dataclusters, [];
764				}
765				foreach my $cluster_index (0..$self->{_K}-1) {
766				foreach my $tag (@{$clusters->[$cluster_index]}) {
767				my $data = $self->{_data}->{$tag};
768				push @{$dataclusters[$cluster_index]}, deep_copy_array($data);
769				}
770				}
771				($self->{_cluster_means}, $self->{_cluster_covariances}) =
772				find_cluster_means_and_covariances(\@dataclusters);
773				} elsif ($self->{_seeding} eq 'manual') {
774				die "You have not supplied the seeding tags for the option \"manual\""
775				unless @{$self->{_seed_tags}} > 0;
776				print "Manual Seeding: Seed tags are @{$self->{_seed_tags}}\n\n";
777				foreach my $tag (@{$self->{_seed_tags}}) {
778				die "invalid tag used for manual seeding"
779				unless exists $self->{_data}->{$tag};
780				}
781				my ($seed_means, $seed_covars) =
782				$self->find_seed_centered_covariances($self->{_seed_tags});
783				$self->{_cluster_means} = $seed_means;
784				$self->{_cluster_covariances} = $seed_covars;
785				} else {
786				die "Incorrect call syntax used. See documentation.";
787				}
788				}
789
790				# This is the top-level method for kmeans based initialization of the EM
791				# algorithm. The means and the covariances returned by kmeans are used to seed the EM
792				# algorithm.
793				sub kmeans {
794				my $self = shift;
795				my $K = $self->{_K};
796				$self->cluster_for_fixed_K_single_smart_try($K);
797				if ((defined $self->{_clusters}) && (defined $self->{_cluster_centers})){
798				return ($self->{_clusters}, $self->{_cluster_centers});
799				} else {
800				die "kmeans clustering failed.";
801				}
802				}
803
804				# Used by the kmeans algorithm for the initialization of the EM iterations. We do
805				# initial kmeans cluster seeding by subjecting the data to principal components
806				# analysis in order to discover the direction of maximum variance in the data space.
807				# Subsequently, we try to find the K largest peaks along this direction. The
808				# coordinates of these peaks serve as the seeds for the K clusters.
809				sub cluster_for_fixed_K_single_smart_try {
810				my $self = shift;
811				my $K = shift;
812				print "Clustering for K = $K\n" if $self->{_terminal_output};
813				my ($clusters, $cluster_centers) =
814				$self->cluster_for_given_K($K);
815				$self->{_clusters} = $clusters;
816				$self->{_cluster_centers} = $cluster_centers;
817				}
818
819				# Used by the kmeans part of the code for the initialization of the EM algorithm:
820				sub cluster_for_given_K {
821				my $self = shift;
822				my $K = shift;
823				my $cluster_centers = $self->get_initial_cluster_centers($K);
824				my $clusters = $self->assign_data_to_clusters_initial($cluster_centers);
825				my $cluster_nonexistant_flag = 0;
826				foreach my $trial (0..2) {
827				($clusters, $cluster_centers) =
828				$self->assign_data_to_clusters( $clusters, $K );
829				my $num_of_clusters_returned = @$clusters;
830				foreach my $cluster (@$clusters) {
831				$cluster_nonexistant_flag = 1 if ((!defined $cluster)
832				\|\| (@$cluster == 0));
833				}
834				last unless $cluster_nonexistant_flag;
835				}
836				return ($clusters, $cluster_centers);
837				}
838
839				# Used by the kmeans part of the code for the initialization of the EM algorithm:
840				sub get_initial_cluster_centers {
841				my $self = shift;
842				my $K = shift;
843				if ($self->{_data_dimensions} == 1) {
844				my @one_d_data;
845				foreach my $j (0..$self->{_N}-1) {
846				my $tag = $self->{_data_id_tags}[$j];
847				push @one_d_data, $self->{_data}->{$tag}->[0];
848				}
849				my @peak_points =
850				find_peak_points_in_given_direction(\@one_d_data,$K);
851				print "highest points at data values: @peak_points\n"
852				if $self->{_debug};
853				my @cluster_centers;
854				foreach my $peakpoint (@peak_points) {
855				push @cluster_centers, [$peakpoint];
856				}
857				return \@cluster_centers;
858				}
859				my ($num_rows,$num_cols) = ($self->{_data_dimensions},$self->{_N});
860				my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
861				my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
862				# All the record labels are stored in the array $self->{_data_id_tags}.
863				# The actual data for clustering is stored in a hash at $self->{_data}
864				# whose keys are the record labels; the value associated with each
865				# key is the array holding the corresponding numerical multidimensional
866				# data.
867				foreach my $j (0..$num_cols-1) {
868				my $tag = $self->{_data_id_tags}[$j];
869				my $data = $self->{_data}->{$tag};
870				$matrix->set_col($j, $data);
871				}
872				if ($self->{_debug}) {
873				print "\nDisplaying the original data as a matrix:";
874				display_matrix( $matrix );
875				}
876				foreach my $j (0..$num_cols-1) {
877				$mean_vec += $matrix->col($j);
878				}
879				$mean_vec *= 1.0 / $num_cols;
880				if ($self->{_debug}) {
881				print "Displaying the mean vector for the data:";
882				display_matrix( $mean_vec );
883				}
884				foreach my $j (0..$num_cols-1) {
885				my @new_col = ($matrix->col($j) - $mean_vec)->as_list;
886				$matrix->set_col($j, \@new_col);
887				}
888				if ($self->{_debug}) {
889				print "Displaying mean subtracted data as a matrix:";
890				display_matrix( $matrix );
891				}
892				my $transposed = transpose( $matrix );
893				if ($self->{_debug}) {
894				print "Displaying transposed data matrix:";
895				display_matrix( $transposed );
896				}
897				my $covariance = matrix_multiply( $matrix, $transposed );
898				$covariance *= 1.0 / $num_cols;
899				if ($self->{_debug}) {
900				print "\nDisplaying the Covariance Matrix for your data:";
901				display_matrix( $covariance );
902				}
903				my ($eigenvalues, $eigenvectors) = $covariance->eigenpair;
904				my $num_of_eigens = @$eigenvalues;
905				my $largest_eigen_index = 0;
906				my $smallest_eigen_index = 0;
907				print "Eigenvalue 0: $eigenvalues->[0]\n" if $self->{_debug};
908				foreach my $i (1..$num_of_eigens-1) {
909				$largest_eigen_index = $i if $eigenvalues->[$i] >
910				$eigenvalues->[$largest_eigen_index];
911				$smallest_eigen_index = $i if $eigenvalues->[$i] <
912				$eigenvalues->[$smallest_eigen_index];
913				print "Eigenvalue $i: $eigenvalues->[$i]\n" if $self->{_debug};
914				}
915				print "\nlargest eigen index: $largest_eigen_index\n" if $self->{_debug};
916				print "\nsmallest eigen index: $smallest_eigen_index\n\n"
917				if $self->{_debug};
918				foreach my $i (0..$num_of_eigens-1) {
919				my @vec = $eigenvectors->[$i]->as_list;
920				print "Eigenvector $i: @vec\n" if $self->{_debug};
921				}
922				my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list;
923				print "\nLargest eigenvector: @largest_eigen_vec\n" if $self->{_debug};
924				my @max_var_direction;
925				# Each element of the array @largest_eigen_vec is a Math::Complex object
926				foreach my $k (0..@largest_eigen_vec-1) {
927				my ($mag, $theta) = $largest_eigen_vec[$k] =~ /\[(\d*\.\d+),(\S+)\]/;
928				if ($theta eq '0') {
929				$max_var_direction[$k] = $mag;
930				} elsif ($theta eq 'pi') {
931				$max_var_direction[$k] = -1.0 * $mag;
932				} else {
933				die "eigendecomposition of covariance matrix produced a complex eigenvector --- something is wrong";
934				}
935				}
936				# "Maximum variance direction: @max_var_direction
937				print "\nMaximum Variance Direction: @max_var_direction\n\n"
938				if $self->{_debug};
939				# We now project all data points on the largest eigenvector.
940				# Each projection will yield a single point on the eigenvector.
941				my @projections;
942				foreach my $j (0..$self->{_N}-1) {
943				my $tag = $self->{_data_id_tags}[$j];
944				my $data = $self->{_data}->{$tag};
945				die "Dimensionality of the largest eigenvector does not "
946				. "match the dimensionality of the data"
947				unless @max_var_direction == $self->{_data_dimensions};
948				my $projection = vector_multiply($data, \@max_var_direction);
949				push @projections, $projection;
950				}
951				print "All projection points: @projections\n" if $self->{_debug};
952				my @peak_points = find_peak_points_in_given_direction(\@projections, $K);
953				print "highest points at points along largest eigenvec: @peak_points\n"
954				if $self->{_debug};
955				my @cluster_centers;
956				foreach my $peakpoint (@peak_points) {
957				my @actual_peak_coords = map {$peakpoint * $_} @max_var_direction;
958				push @cluster_centers, \@actual_peak_coords;
959				}
960				return \@cluster_centers;
961				}
962
963				# Used by the kmeans part of the code: This method is called by the previous method
964				# to locate K peaks in a smoothed histogram of the data points projected onto the
965				# maximal variance direction.
966				sub find_peak_points_in_given_direction {
967				my $dataref = shift;
968				my $how_many = shift;
969				my @data = @$dataref;
970				my ($min, $max) = minmax(\@data);
971				my $num_points = @data;
972				my @sorted_data = sort {$a <=> $b} @data;
973				#print "\n\nSorted data: @sorted_data\n";
974				my $scale = $max - $min;
975				foreach my $index (0..$#sorted_data-1) {
976				$sorted_data[$index] = ($sorted_data[$index] - $min) / $scale;
977				}
978				my $avg_diff = 0;
979				foreach my $index (0..$#sorted_data-1) {
980				my $diff = $sorted_data[$index+1] - $sorted_data[$index];
981				$avg_diff += ($diff - $avg_diff) / ($index + 1);
982				}
983				my $delta = 1.0 / 1000.0;
984				# It would be nice to set the delta adaptively, but I must
985				# change the number of cells in the next foreach loop accordingly
986				# my $delta = $avg_diff / 20;
987				my @accumulator = (0) x 1000;
988				foreach my $index (0..@sorted_data-1) {
989				my $cell_index = int($sorted_data[$index] / $delta);
990				my $smoothness = 40;
991				for my $index ($cell_index-$smoothness..$cell_index+$smoothness) {
992				next if $index < 0 \|\| $index > 999;
993				$accumulator[$index]++;
994				}
995				}
996				my $peaks_array = non_maximum_supression( \@accumulator );
997				my $peaks_index_hash = get_value_index_hash( $peaks_array );
998				my @K_highest_peak_locations;
999				my $k = 0;
1000				foreach my $peak (sort {$b <=> $a} keys %$peaks_index_hash) {
1001				my $unscaled_peak_point =
1002				$min + $peaks_index_hash->{$peak} * $scale * $delta;
1003				push @K_highest_peak_locations, $unscaled_peak_point
1004				if $k < $how_many;
1005				last if ++$k == $how_many;
1006				}
1007				return @K_highest_peak_locations;
1008				}
1009
1010				# Used by the kmeans part of the code: The purpose of this routine is to form initial
1011				# clusters by assigning the data samples to the initial clusters formed by the
1012				# previous routine on the basis of the best proximity of the data samples to the
1013				# different cluster centers.
1014				sub assign_data_to_clusters_initial {
1015				my $self = shift;
1016				my @cluster_centers = @{ shift @_ };
1017				my @clusters;
1018				foreach my $ele (@{$self->{_data_id_tags}}) {
1019				my $best_cluster;
1020				my @dist_from_clust_centers;
1021				foreach my $center (@cluster_centers) {
1022				push @dist_from_clust_centers, $self->distance($ele, $center);
1023				}
1024				my ($min, $best_center_index) = minimum( \@dist_from_clust_centers );
1025				push @{$clusters[$best_center_index]}, $ele;
1026				}
1027				return \@clusters;
1028				}
1029
1030				# Used by the kmeans part of the code: This is the main routine that along with the
1031				# update_cluster_centers() routine constitute the two key steps of the K-Means
1032				# algorithm. In most cases, the infinite while() loop will terminate automatically
1033				# when the cluster assignments of the data points remain unchanged. For the sake of
1034				# safety, we keep track of the number of iterations. If this number reaches 100, we
1035				# exit the while() loop anyway. In most cases, this limit will not be reached.
1036				sub assign_data_to_clusters {
1037				my $self = shift;
1038				my $clusters = shift;
1039				my $K = shift;
1040				my $final_cluster_centers;
1041				my $iteration_index = 0;
1042				while (1) {
1043				my $new_clusters;
1044				my $assignment_changed_flag = 0;
1045				my $current_cluster_center_index = 0;
1046				my $cluster_size_zero_condition = 0;
1047				my $how_many = @$clusters;
1048				my $cluster_centers = $self->update_cluster_centers(
1049				deep_copy_AoA_with_nulls( $clusters ) );
1050				$iteration_index++;
1051				foreach my $cluster (@$clusters) {
1052				my $current_cluster_center =
1053				$cluster_centers->[$current_cluster_center_index];
1054				foreach my $ele (@$cluster) {
1055				my @dist_from_clust_centers;
1056				foreach my $center (@$cluster_centers) {
1057				push @dist_from_clust_centers,
1058				$self->distance($ele, $center);
1059				}
1060				my ($min, $best_center_index) =
1061				minimum( \@dist_from_clust_centers );
1062				my $best_cluster_center =
1063				$cluster_centers->[$best_center_index];
1064				if (vector_equal($current_cluster_center,
1065				$best_cluster_center)){
1066				push @{$new_clusters->[$current_cluster_center_index]},
1067				$ele;
1068				} else {
1069				$assignment_changed_flag = 1;
1070				push @{$new_clusters->[$best_center_index]}, $ele;
1071				}
1072				}
1073				$current_cluster_center_index++;
1074				}
1075				# Now make sure that we still have K clusters since K is fixed:
1076				next if ((@$new_clusters != @$clusters) && ($iteration_index < 100));
1077				# Now make sure that none of the K clusters is an empty cluster:
1078				foreach my $newcluster (@$new_clusters) {
1079				$cluster_size_zero_condition = 1 if ((!defined $newcluster)
1080				or (@$newcluster == 0));
1081				}
1082				push @$new_clusters, (undef) x ($K - @$new_clusters)
1083				if @$new_clusters < $K;
1084				my $largest_cluster;
1085				foreach my $local_cluster (@$new_clusters) {
1086				next if !defined $local_cluster;
1087				$largest_cluster = $local_cluster if !defined $largest_cluster;
1088				if (@$local_cluster > @$largest_cluster) {
1089				$largest_cluster = $local_cluster;
1090				}
1091				}
1092				foreach my $local_cluster (@$new_clusters) {
1093				if ( (!defined $local_cluster) \|\| (@$local_cluster == 0) ) {
1094				push @$local_cluster, pop @$largest_cluster;
1095				}
1096				}
1097				next if (($cluster_size_zero_condition) && ($iteration_index < 100));
1098				last if $iteration_index == 100;
1099				# Now do a deep copy of new_clusters into clusters
1100				$clusters = deep_copy_AoA( $new_clusters );
1101				last if $assignment_changed_flag == 0;
1102				}
1103				$final_cluster_centers = $self->update_cluster_centers( $clusters );
1104				return ($clusters, $final_cluster_centers);
1105				}
1106
1107				# Used by the kmeans part of the code: After each new assignment of the data points
1108				# to the clusters on the basis of the current values for the cluster centers, we call
1109				# the routine shown here for updating the values of the cluster centers.
1110				sub update_cluster_centers {
1111				my $self = shift;
1112				my @clusters = @{ shift @_ };
1113				my @new_cluster_centers;
1114				my $largest_cluster;
1115				foreach my $cluster (@clusters) {
1116				next if !defined $cluster;
1117				$largest_cluster = $cluster if !defined $largest_cluster;
1118				if (@$cluster > @$largest_cluster) {
1119				$largest_cluster = $cluster;
1120				}
1121				}
1122				foreach my $cluster (@clusters) {
1123				if ( (!defined $cluster) \|\| (@$cluster == 0) ) {
1124				push @$cluster, pop @$largest_cluster;
1125				}
1126				}
1127				foreach my $cluster (@clusters) {
1128				die "Cluster became empty --- untenable condition " .
1129				"for a given K. Try again. " if !defined $cluster;
1130				my $cluster_size = @$cluster;
1131				die "Cluster size is zero --- untenable." if $cluster_size == 0;
1132				my @new_cluster_center = @{$self->add_point_coords( $cluster )};
1133				@new_cluster_center = map {my $x = $_/$cluster_size; $x}
1134				@new_cluster_center;
1135				push @new_cluster_centers, \@new_cluster_center;
1136				}
1137				return \@new_cluster_centers;
1138				}
1139
1140				# The following routine is for computing the distance between a data point specified
1141				# by its symbolic name in the master datafile and a point (such as the center of a
1142				# cluster) expressed as a vector of coordinates:
1143				sub distance {
1144				my $self = shift;
1145				my $ele1_id = shift @_; # symbolic name of data sample
1146				my @ele1 = @{$self->{_data}->{$ele1_id}};
1147				my @ele2 = @{shift @_};
1148				die "wrong data types for distance calculation" if @ele1 != @ele2;
1149				my $how_many = @ele1;
1150				my $squared_sum = 0;
1151				foreach my $i (0..$how_many-1) {
1152				$squared_sum += ($ele1[$i] - $ele2[$i])**2;
1153				}
1154				my $dist = sqrt $squared_sum;
1155				return $dist;
1156				}
1157
1158				# The following routine does the same as above but now both
1159				# arguments are expected to be arrays of numbers:
1160				sub distance2 {
1161				my $self = shift;
1162				my @ele1 = @{shift @_};
1163				my @ele2 = @{shift @_};
1164				die "wrong data types for distance calculation" if @ele1 != @ele2;
1165				my $how_many = @ele1;
1166				my $squared_sum = 0;
1167				foreach my $i (0..$how_many-1) {
1168				$squared_sum += ($ele1[$i] - $ele2[$i])**2;
1169				}
1170				return sqrt $squared_sum;
1171				}
1172
1173				sub write_naive_bayes_clusters_to_files {
1174				my $self = shift;
1175				my @clusters = @{$self->{_clusters}};
1176				unlink glob "naive_bayes_cluster*.txt";
1177				foreach my $i (1..@clusters) {
1178				my $filename = "naive_bayes_cluster" . $i . ".txt";
1179				print "Writing cluster $i to file $filename\n"
1180				if $self->{_terminal_output};
1181				open FILEHANDLE, "\| sort > $filename" or die "Unable to open file: $!";
1182				foreach my $ele (@{$clusters[$i-1]}) {
1183				print FILEHANDLE "$ele\n";
1184				}
1185				close FILEHANDLE;
1186				}
1187				}
1188
1189				sub write_posterior_prob_clusters_above_threshold_to_files {
1190				my $self = shift;
1191				my $theta = shift;
1192				my @class_distributions;
1193				foreach my $cluster_index (0..$self->{_K}-1) {
1194				push @class_distributions, [];
1195				}
1196				foreach my $data_tag (@{$self->{_data_id_tags}}) {
1197				foreach my $cluster_index (0..$self->{_K}-1) {
1198				push @{$class_distributions[$cluster_index]}, $data_tag
1199				if $self->{_expected_class_probs}->{$data_tag}->[$cluster_index]
1200				> $theta;
1201				}
1202				}
1203				unlink glob "posterior_prob_cluster*.txt";
1204				foreach my $i (1..@class_distributions) {
1205				my $filename = "posterior_prob_cluster" . $i . ".txt";
1206				print "Writing posterior prob cluster $i to file $filename\n"
1207				if $self->{_terminal_output};
1208				open FILEHANDLE, "\| sort > $filename" or die "Unable to open file: $!";
1209				foreach my $ele (@{$class_distributions[$i-1]}) {
1210				print FILEHANDLE "$ele\n";
1211				}
1212				close FILEHANDLE;
1213				}
1214				}
1215
1216				sub DESTROY {
1217				unlink "__temp_" . basename($_[0]->{_datafile});
1218				unlink "__temp_data_" . basename($_[0]->{_datafile});
1219				unlink "__temp2_" . basename($_[0]->{_datafile});
1220				unlink glob "__temp1dhist*";
1221				unlink glob "__contour*";
1222				}
1223
1224				############################# Visualization Code ###############################
1225
1226				# The visualize_clusters() implementation displays as a plot in your terminal window
1227				# the clusters constructed by the EM algorithm. It can show either 2D plots or
1228				# 3D plots that you can rotate interactively for better visualization. For
1229				# multidimensional data, as to which 2D or 3D dimensions are used for visualization
1230				# is controlled by the mask you must supply as an argument to the method. Should it
1231				# happen that only one on bit is specified for the mask, visualize_clusters()
1232				# aborts.
1233				#
1234				# The visualization code consists of first accessing each of clusters created by the
1235				# EM() subroutine. Note that the clusters contain only the symbolic names for the
1236				# individual records in the source data file. We therefore next reach into the
1237				# $self->{_data} hash and get the data coordinates associated with each symbolic
1238				# label in a cluster. The numerical data thus generated is then written out to a
1239				# temp file. When doing so we must remember to insert TWO BLANK LINES between the
1240				# data blocks corresponding to the different clusters. This constraint is imposed
1241				# on us by Gnuplot when plotting data from the same file since we want to use
1242				# different point styles for the data points in different cluster files.
1243				# Subsequently, we call upon the Perl interface provided by the Graphics::GnuplotIF
1244				# module to plot the data clusters.
1245				sub visualize_clusters {
1246				my $self = shift;
1247				my $v_mask;
1248				my $pause_time;
1249				if (@_ == 1) {
1250				$v_mask = shift \|\| die "visualization mask missing";
1251				} elsif (@_ == 2) {
1252				$v_mask = shift \|\| die "visualization mask missing";
1253				$pause_time = shift;
1254				} else {
1255				die "visualize_clusters() called with wrong args";
1256				}
1257				my $master_datafile = $self->{_datafile};
1258				my @v_mask = split //, $v_mask;
1259				my $visualization_mask_width = @v_mask;
1260				my $original_data_mask = $self->{_mask};
1261				my @mask = split //, $original_data_mask;
1262				my $data_field_width = scalar grep {$_ eq '1'} @mask;
1263				die "\n\nABORTED: The width of the visualization mask (including " .
1264				"all its 1s and 0s) must equal the width of the original mask " .
1265				"used for reading the data file (counting only the 1's)"
1266				if $visualization_mask_width != $data_field_width;
1267				my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
1268				# The following section is for the superimposed one-Mahalanobis-distance-unit
1269				# ellipses that are shown only for 2D plots:
1270				if ($visualization_data_field_width == 2) {
1271				foreach my $cluster_index (0..$self->{_K}-1) {
1272				my $contour_filename = "__contour_" . $cluster_index . ".dat";
1273				my $mean = $self->{_cluster_means}->[$cluster_index];
1274				my $covariance = $self->{_cluster_covariances}->[$cluster_index];
1275				my ($mux,$muy) = $mean->as_list();
1276				my ($varx,$sigmaxy) = $covariance->row(0)->as_list();
1277				my ($sigmayx,$vary) = $covariance->row(1)->as_list();
1278				die "Your covariance matrix does not look right"
1279				unless $sigmaxy == $sigmayx;
1280				my ($sigmax,$sigmay) = (sqrt($varx),sqrt($vary));
1281				my $argstring = <<"END";
1282				set contour
1283				mux = $mux
1284				muy = $muy
1285				sigmax = $sigmax
1286				sigmay = $sigmay
1287				sigmaxy = $sigmaxy
1288				determinant = (sigmax*2)(sigmay2) - sigmaxy2
1289				exponent(x,y) = -0.5 * (1.0 / determinant) * ( ((x-mux)*2)sigmay2 + ((y-muy)2)sigmax2 - 2sigmaxy(x-mux)(y-muy) )
1290				f(x,y) = exp( exponent(x,y) ) - 0.2
1291				xmax = mux + 2 * sigmax
1292				xmin = mux - 2 * sigmax
1293				ymax = muy + 2 * sigmay
1294				ymin = muy - 2 * sigmay
1295				set xrange [ xmin : xmax ]
1296				set yrange [ ymin : ymax ]
1297				set isosamples 200
1298				unset surface
1299				set cntrparam levels discrete 0
1300				set table \"$contour_filename\"
1301				splot f(x,y)
1302				unset table
1303				END
1304				my $plot = Graphics::GnuplotIF->new();
1305				$plot->gnuplot_cmd( $argstring );
1306				}
1307				}
1308				my %visualization_data;
1309				while ( my ($record_id, $data) = each %{$self->{_data}} ) {
1310				my @fields = @$data;
1311				die "\nABORTED: Visualization mask size exceeds data record size"
1312				if $#v_mask > $#fields;
1313				my @data_fields;
1314				foreach my $i (0..@fields-1) {
1315				if ($v_mask[$i] eq '0') {
1316				next;
1317				} elsif ($v_mask[$i] eq '1') {
1318				push @data_fields, $fields[$i];
1319				} else {
1320				die "Misformed visualization mask. It can only have 1s and 0s";
1321				}
1322				}
1323				$visualization_data{ $record_id } = \@data_fields;
1324				}
1325				my $K = scalar @{$self->{_clusters}};
1326				my $filename = basename($master_datafile);
1327				my $temp_file = "__temp_" . $filename;
1328				unlink $temp_file if -e $temp_file;
1329				open OUTPUT, ">$temp_file"
1330				or die "Unable to open a temp file in this directory: $!";
1331				foreach my $cluster (@{$self->{_clusters}}) {
1332				foreach my $item (@$cluster) {
1333				print OUTPUT "@{$visualization_data{$item}}";
1334				print OUTPUT "\n";
1335				}
1336				print OUTPUT "\n\n";
1337				}
1338				close OUTPUT;
1339				my $plot;
1340				if (!defined $pause_time) {
1341				$plot = Graphics::GnuplotIF->new( persist => 1 );
1342				} else {
1343				$plot = Graphics::GnuplotIF->new();
1344				}
1345				my $arg_string = "";
1346				if ($visualization_data_field_width > 2) {
1347				$plot->gnuplot_cmd("set noclip");
1348				$plot->gnuplot_cmd("set pointsize 2");
1349				foreach my $i (0..$K-1) {
1350				my $j = $i + 1;
1351				$arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster (naive Bayes) $i\" with points lt $j pt $j, ";
1352				}
1353				} elsif ($visualization_data_field_width == 2) {
1354				$plot->gnuplot_cmd("set noclip");
1355				$plot->gnuplot_cmd("set pointsize 2");
1356				foreach my $i (0..$K-1) {
1357				my $j = $i + 1;
1358				$arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster (naive Bayes) $i\" with points lt $j pt $j, ";
1359				my $ellipse_filename = "__contour_" . $i . ".dat";
1360				$arg_string .= "\"$ellipse_filename\" with line lt $j title \"\", ";
1361				}
1362				} elsif ($visualization_data_field_width == 1 ) {
1363				open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
1364				my @all_data = ;
1365				close INPUT;
1366				@all_data = map {chomp $_; $_ =~ /\d/ ? $_ : "SEPERATOR" } @all_data;
1367				my $all_joined_data = join ':', @all_data;
1368				my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data;
1369				my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals);
1370				foreach my $i (0..@separated-1) {
1371				$separated[$i] =~ s/SEPERATOR//g;
1372				my @cluster_for_hist = split /:/, $separated[$i];
1373				@cluster_for_hist = grep $_, @cluster_for_hist;
1374				my ($minval,$maxval) = minmax(\@cluster_for_hist);
1375				push @all_minvals, $minval;
1376				push @all_maxvals, $maxval;
1377				push @all_clusters_for_hist, \@cluster_for_hist;
1378				}
1379				push @all_minmaxvals, @all_minvals;
1380				push @all_minmaxvals, @all_maxvals;
1381				my ($abs_minval,$abs_maxval) = minmax(\@all_minmaxvals);
1382				my $delta = ($abs_maxval - $abs_minval) / 100.0;
1383				$plot->gnuplot_cmd("set boxwidth 3");
1384				$plot->gnuplot_cmd("set style fill solid border -1");
1385				$plot->gnuplot_cmd("set ytics out nomirror");
1386				$plot->gnuplot_cmd("set style data histograms");
1387				$plot->gnuplot_cmd("set style histogram clustered");
1388				$plot->gnuplot_cmd("set title 'Clusters shown through histograms'");
1389				$plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
1390				foreach my $cindex (0..@all_clusters_for_hist-1) {
1391				my $filename = basename($master_datafile);
1392				my $temp_file = "__temp1dhist_" . "$cindex" . "_" . $filename;
1393				unlink $temp_file if -e $temp_file;
1394				open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
1395				print OUTPUT "Xstep histval\n";
1396				my @histogram = (0) x 100;
1397				foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) {
1398				$histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++;
1399				}
1400				foreach my $i (0..@histogram-1) {
1401				print OUTPUT "$i $histogram[$i]\n";
1402				}
1403				$arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, ";
1404				close OUTPUT;
1405				}
1406				}
1407				$arg_string = $arg_string =~ /^(.*),[ ]+$/;
1408				$arg_string = $1;
1409				if ($visualization_data_field_width > 2) {
1410				$plot->gnuplot_cmd( "splot $arg_string" );
1411				$plot->gnuplot_pause( $pause_time ) if defined $pause_time;
1412				} elsif ($visualization_data_field_width == 2) {
1413				$plot->gnuplot_cmd( "plot $arg_string" );
1414				$plot->gnuplot_pause( $pause_time ) if defined $pause_time;
1415				} elsif ($visualization_data_field_width == 1) {
1416				$plot->gnuplot_cmd( "plot $arg_string" );
1417				$plot->gnuplot_pause( $pause_time ) if defined $pause_time;
1418				}
1419				}
1420
1421				# This subroutine is the same as above except that it makes PNG plots (for hardcopy
1422				# printing) of the clusters.
1423				sub plot_hardcopy_clusters {
1424				my $self = shift;
1425				my $v_mask;
1426				my $pause_time;
1427				if (@_ == 1) {
1428				$v_mask = shift \|\| die "visualization mask missing";
1429				} elsif (@_ == 2) {
1430				$v_mask = shift \|\| die "visualization mask missing";
1431				$pause_time = shift;
1432				} else {
1433				die "visualize_clusters() called with wrong args";
1434				}
1435				my $master_datafile = $self->{_datafile};
1436				my @v_mask = split //, $v_mask;
1437				my $visualization_mask_width = @v_mask;
1438				my $original_data_mask = $self->{_mask};
1439				my @mask = split //, $original_data_mask;
1440				my $data_field_width = scalar grep {$_ eq '1'} @mask;
1441				die "\n\nABORTED: The width of the visualization mask (including " .
1442				"all its 1s and 0s) must equal the width of the original mask " .
1443				"used for reading the data file (counting only the 1's)"
1444				if $visualization_mask_width != $data_field_width;
1445				my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
1446				if ($visualization_data_field_width == 2) {
1447				foreach my $cluster_index (0..$self->{_K}-1) {
1448				my $contour_filename = "__contour_" . $cluster_index . ".dat";
1449				my $mean = $self->{_cluster_means}->[$cluster_index];
1450				my $covariance = $self->{_cluster_covariances}->[$cluster_index];
1451				my ($mux,$muy) = $mean->as_list();
1452				my ($varx,$sigmaxy) = $covariance->row(0)->as_list();
1453				my ($sigmayx,$vary) = $covariance->row(1)->as_list();
1454				die "Your covariance matrix does not look right"
1455				unless $sigmaxy == $sigmayx;
1456				my ($sigmax,$sigmay) = (sqrt($varx),sqrt($vary));
1457				my $argstring = <<"END";
1458				set contour
1459				mux = $mux
1460				muy = $muy
1461				sigmax = $sigmax
1462				sigmay = $sigmay
1463				sigmaxy = $sigmaxy
1464				determinant = (sigmax*2)(sigmay2) - sigmaxy2
1465				exponent(x,y) = -0.5 * (1.0 / determinant) * ( ((x-mux)*2)sigmay2 + ((y-muy)2)sigmax2 - 2sigmaxy(x-mux)(y-muy) )
1466				f(x,y) = exp( exponent(x,y) ) - 0.2
1467				xmax = mux + 2 * sigmax
1468				xmin = mux - 2 * sigmax
1469				ymax = muy + 2 * sigmay
1470				ymin = muy - 2 * sigmay
1471				set xrange [ xmin : xmax ]
1472				set yrange [ ymin : ymax ]
1473				set isosamples 200
1474				unset surface
1475				set cntrparam levels discrete 0
1476				set table \"$contour_filename\"
1477				splot f(x,y)
1478				unset table
1479				END
1480				my $plot = Graphics::GnuplotIF->new();
1481				$plot->gnuplot_cmd( $argstring );
1482				}
1483				}
1484				my %visualization_data;
1485				while ( my ($record_id, $data) = each %{$self->{_data}} ) {
1486				my @fields = @$data;
1487				die "\nABORTED: Visualization mask size exceeds data record size"
1488				if $#v_mask > $#fields;
1489				my @data_fields;
1490				foreach my $i (0..@fields-1) {
1491				if ($v_mask[$i] eq '0') {
1492				next;
1493				} elsif ($v_mask[$i] eq '1') {
1494				push @data_fields, $fields[$i];
1495				} else {
1496				die "Misformed visualization mask. It can only have 1s and 0s";
1497				}
1498				}
1499				$visualization_data{ $record_id } = \@data_fields;
1500				}
1501				my $K = scalar @{$self->{_clusters}};
1502				my $filename = basename($master_datafile);
1503				my $temp_file = "__temp_" . $filename;
1504				unlink $temp_file if -e $temp_file;
1505				open OUTPUT, ">$temp_file"
1506				or die "Unable to open a temp file in this directory: $!";
1507				foreach my $cluster (@{$self->{_clusters}}) {
1508				foreach my $item (@$cluster) {
1509				print OUTPUT "@{$visualization_data{$item}}";
1510				print OUTPUT "\n";
1511				}
1512				print OUTPUT "\n\n";
1513				}
1514				close OUTPUT;
1515				my $plot;
1516				if (!defined $pause_time) {
1517				$plot = Graphics::GnuplotIF->new( persist => 1 );
1518				} else {
1519				$plot = Graphics::GnuplotIF->new();
1520				}
1521				my $arg_string = "";
1522				if ($visualization_data_field_width > 2) {
1523				$plot->gnuplot_cmd( "set noclip" );
1524				$plot->gnuplot_cmd( "set pointsize 2" );
1525				foreach my $i (0..$K-1) {
1526				my $j = $i + 1;
1527				$arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster (naive Bayes) $i\" with points lt $j pt $j, ";
1528				}
1529				} elsif ($visualization_data_field_width == 2) {
1530				$plot->gnuplot_cmd( "set noclip" );
1531				$plot->gnuplot_cmd( "set pointsize 2" );
1532				foreach my $i (0..$K-1) {
1533				my $j = $i + 1;
1534				$arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster (naive Bayes) $i\" with points lt $j pt $j, ";
1535				my $ellipse_filename = "__contour_" . $i . ".dat";
1536				$arg_string .= "\"$ellipse_filename\" with line lt $j title \"\", ";
1537				}
1538				} elsif ($visualization_data_field_width == 1 ) {
1539				open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
1540				my @all_data = ;
1541				close INPUT;
1542				@all_data = map {chomp $_; $_ =~ /\d/ ? $_ : "SEPERATOR" } @all_data;
1543				my $all_joined_data = join ':', @all_data;
1544				my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data;
1545				my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals);
1546				foreach my $i (0..@separated-1) {
1547				$separated[$i] =~ s/SEPERATOR//g;
1548				my @cluster_for_hist = split /:/, $separated[$i];
1549				@cluster_for_hist = grep $_, @cluster_for_hist;
1550				my ($minval,$maxval) = minmax(\@cluster_for_hist);
1551				push @all_minvals, $minval;
1552				push @all_maxvals, $maxval;
1553				push @all_clusters_for_hist, \@cluster_for_hist;
1554				}
1555				push @all_minmaxvals, @all_minvals;
1556				push @all_minmaxvals, @all_maxvals;
1557				my ($abs_minval,$abs_maxval) = minmax(\@all_minmaxvals);
1558				my $delta = ($abs_maxval - $abs_minval) / 100.0;
1559				$plot->gnuplot_cmd("set boxwidth 3");
1560				$plot->gnuplot_cmd("set style fill solid border -1");
1561				$plot->gnuplot_cmd("set ytics out nomirror");
1562				$plot->gnuplot_cmd("set style data histograms");
1563				$plot->gnuplot_cmd("set style histogram clustered");
1564				$plot->gnuplot_cmd("set title 'Clusters shown through histograms'");
1565				$plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
1566				foreach my $cindex (0..@all_clusters_for_hist-1) {
1567				my $filename = basename($master_datafile);
1568				my $temp_file = "__temp1dhist_" . "$cindex" . "_" . $filename;
1569				unlink $temp_file if -e $temp_file;
1570				open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
1571				print OUTPUT "Xstep histval\n";
1572				my @histogram = (0) x 100;
1573				foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) {
1574				$histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++;
1575				}
1576				foreach my $i (0..@histogram-1) {
1577				print OUTPUT "$i $histogram[$i]\n";
1578				}
1579				# $arg_string .= "\"$temp_file\" using 1:2 ti col smooth frequency with boxes lc $cindex, ";
1580				$arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, ";
1581				close OUTPUT;
1582				}
1583				}
1584				$arg_string = $arg_string =~ /^(.*),[ ]+$/;
1585				$arg_string = $1;
1586				if ($visualization_data_field_width > 2) {
1587				$plot->gnuplot_cmd( 'set terminal png color',
1588				'set output "cluster_plot.png"');
1589				$plot->gnuplot_cmd( "splot $arg_string" );
1590				} elsif ($visualization_data_field_width == 2) {
1591				$plot->gnuplot_cmd('set terminal png',
1592				'set output "cluster_plot.png"');
1593				$plot->gnuplot_cmd( "plot $arg_string" );
1594				} elsif ($visualization_data_field_width == 1) {
1595				$plot->gnuplot_cmd('set terminal png',
1596				'set output "cluster_plot.png"');
1597				$plot->gnuplot_cmd( "plot $arg_string" );
1598				}
1599				}
1600
1601				# This method is for the visualization of the posterior class distributions. In
1602				# other words, this method allows us to see the soft clustering produced by the EM
1603				# algorithm. While much of the gnuplot logic here is the same as in the
1604				# visualize_clusters() method, there are significant differences in how the data is
1605				# pooled for the purpose of display.
1606				sub visualize_distributions {
1607				my $self = shift;
1608				my $v_mask;
1609				my $pause_time;
1610				if (@_ == 1) {
1611				$v_mask = shift \|\| die "visualization mask missing";
1612				} elsif (@_ == 2) {
1613				$v_mask = shift \|\| die "visualization mask missing";
1614				$pause_time = shift;
1615				} else {
1616				die "visualize_distributions() called with wrong args";
1617				}
1618				my @v_mask = split //, $v_mask;
1619				my $visualization_mask_width = @v_mask;
1620				my $original_data_mask = $self->{_mask};
1621				my @mask = split //, $original_data_mask;
1622				my $data_field_width = scalar grep {$_ eq '1'} @mask;
1623				die "\n\nABORTED: The width of the visualization mask (including " .
1624				"all its 1s and 0s) must equal the width of the original mask " .
1625				"used for reading the data file (counting only the 1's)"
1626				if $visualization_mask_width != $data_field_width;
1627				my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
1628				if ($visualization_data_field_width == 2) {
1629				foreach my $cluster_index (0..$self->{_K}-1) {
1630				my $contour_filename = "__contour2_" . $cluster_index . ".dat";
1631				my $mean = $self->{_cluster_means}->[$cluster_index];
1632				my $covariance = $self->{_cluster_covariances}->[$cluster_index];
1633				my ($mux,$muy) = $mean->as_list();
1634				my ($varx,$sigmaxy) = $covariance->row(0)->as_list();
1635				my ($sigmayx,$vary) = $covariance->row(1)->as_list();
1636				die "Your covariance matrix does not look right"
1637				unless $sigmaxy == $sigmayx;
1638				my ($sigmax,$sigmay) = (sqrt($varx),sqrt($vary));
1639				my $argstring = <<"END";
1640				set contour
1641				mux = $mux
1642				muy = $muy
1643				sigmax = $sigmax
1644				sigmay = $sigmay
1645				sigmaxy = $sigmaxy
1646				determinant = (sigmax*2)(sigmay2) - sigmaxy2
1647				exponent(x,y) = -0.5 * (1.0 / determinant) * ( ((x-mux)*2)sigmay2 + ((y-muy)2)sigmax2 - 2sigmaxy(x-mux)(y-muy) )
1648				f(x,y) = exp( exponent(x,y) ) - 0.2
1649				xmax = mux + 2 * sigmax
1650				xmin = mux - 2 * sigmax
1651				ymax = muy + 2 * sigmay
1652				ymin = muy - 2 * sigmay
1653				set xrange [ xmin : xmax ]
1654				set yrange [ ymin : ymax ]
1655				set isosamples 200
1656				unset surface
1657				set cntrparam levels discrete 0
1658				set table \"$contour_filename\"
1659				splot f(x,y)
1660				unset table
1661				END
1662				my $plot = Graphics::GnuplotIF->new();
1663				$plot->gnuplot_cmd( $argstring );
1664				}
1665				}
1666				my %visualization_data;
1667				while ( my ($record_id, $data) = each %{$self->{_data}} ) {
1668				my @fields = @$data;
1669				die "\nABORTED: Visualization mask size exceeds data record size"
1670				if $#v_mask > $#fields;
1671				my @data_fields;
1672				foreach my $i (0..@fields-1) {
1673				if ($v_mask[$i] eq '0') {
1674				next;
1675				} elsif ($v_mask[$i] eq '1') {
1676				push @data_fields, $fields[$i];
1677				} else {
1678				die "Misformed visualization mask. It can only have 1s and 0s";
1679				}
1680				}
1681				$visualization_data{ $record_id } = \@data_fields;
1682				}
1683				my $filename = basename($self->{_datafile});
1684				my $temp_file = "__temp2_" . $filename;
1685				unlink $temp_file if -e $temp_file;
1686				open OUTPUT, ">$temp_file"
1687				or die "Unable to open a temp file in this directory: $!";
1688				my @class_distributions;
1689				foreach my $cluster_index (0..$self->{_K}-1) {
1690				push @class_distributions, [];
1691				}
1692				foreach my $data_tag (@{$self->{_data_id_tags}}) {
1693				foreach my $cluster_index (0..$self->{_K}-1) {
1694				push @{$class_distributions[$cluster_index]}, $self->{_data}->{$data_tag}
1695				if $self->{_expected_class_probs}->{$data_tag}->[$cluster_index] > 0.2;
1696				}
1697				}
1698				foreach my $distribution (@class_distributions) {
1699				foreach my $item (@$distribution) {
1700				print OUTPUT "@$item";
1701				print OUTPUT "\n";
1702				}
1703				print OUTPUT "\n\n";
1704				}
1705				close OUTPUT;
1706				my $plot;
1707				if (!defined $pause_time) {
1708				$plot = Graphics::GnuplotIF->new( persist => 1 );
1709				} else {
1710				$plot = Graphics::GnuplotIF->new();
1711				}
1712				my $arg_string = "";
1713				if ($visualization_data_field_width > 2) {
1714				$plot->gnuplot_cmd( "set noclip" );
1715				$plot->gnuplot_cmd( "set pointsize 2" );
1716				foreach my $i (0..$self->{_K}-1) {
1717				my $j = $i + 1;
1718				$arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster $i (based on posterior probs)\" with points lt $j pt $j, ";
1719				}
1720				} elsif ($visualization_data_field_width == 2) {
1721				$plot->gnuplot_cmd( "set noclip" );
1722				$plot->gnuplot_cmd( "set pointsize 2" );
1723				foreach my $i (0..$self->{_K}-1) {
1724				my $j = $i + 1;
1725				$arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster $i (based on posterior probs)\" with points lt $j pt $j, ";
1726				my $ellipse_filename = "__contour2_" . $i . ".dat";
1727				$arg_string .= "\"$ellipse_filename\" with line lt $j title \"\", ";
1728				}
1729				} elsif ($visualization_data_field_width == 1 ) {
1730				open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
1731				my @all_data = ;
1732				close INPUT;
1733				@all_data = map {chomp $_; $_ =~ /\d/ ? $_ : "SEPERATOR" } @all_data;
1734				my $all_joined_data = join ':', @all_data;
1735				my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data;
1736				my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals);
1737				foreach my $i (0..@separated-1) {
1738				$separated[$i] =~ s/SEPERATOR//g;
1739				my @cluster_for_hist = split /:/, $separated[$i];
1740				@cluster_for_hist = grep $_, @cluster_for_hist;
1741				my ($minval,$maxval) = minmax(\@cluster_for_hist);
1742				push @all_minvals, $minval;
1743				push @all_maxvals, $maxval;
1744				push @all_clusters_for_hist, \@cluster_for_hist;
1745				}
1746				push @all_minmaxvals, @all_minvals;
1747				push @all_minmaxvals, @all_maxvals;
1748				my ($abs_minval,$abs_maxval) = minmax(\@all_minmaxvals);
1749				my $delta = ($abs_maxval - $abs_minval) / 100.0;
1750				$plot->gnuplot_cmd("set boxwidth 3");
1751				$plot->gnuplot_cmd("set style fill solid border -1");
1752				$plot->gnuplot_cmd("set ytics out nomirror");
1753				$plot->gnuplot_cmd("set style data histograms");
1754				$plot->gnuplot_cmd("set style histogram clustered");
1755				$plot->gnuplot_cmd("set title 'Individual distributions shown through histograms'");
1756				$plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
1757				foreach my $cindex (0..@all_clusters_for_hist-1) {
1758				my $localfilename = basename($filename);
1759				my $temp_file = "__temp1dhist_" . "$cindex" . "_" . $localfilename;
1760				unlink $temp_file if -e $temp_file;
1761				open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
1762				print OUTPUT "Xstep histval\n";
1763
1764				my @histogram = (0) x 100;
1765				foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) {
1766				$histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++;
1767				}
1768				foreach my $i (0..@histogram-1) {
1769				print OUTPUT "$i $histogram[$i]\n";
1770				}
1771				# $arg_string .= "\"$temp_file\" using 1:2 ti col smooth frequency with boxes lc $cindex, ";
1772				$arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, ";
1773				close OUTPUT;
1774				}
1775				}
1776				$arg_string = $arg_string =~ /^(.*),[ ]+$/;
1777				$arg_string = $1;
1778				if ($visualization_data_field_width > 2) {
1779				$plot->gnuplot_cmd( "splot $arg_string" );
1780				$plot->gnuplot_pause( $pause_time ) if defined $pause_time;
1781				} elsif ($visualization_data_field_width == 2) {
1782				$plot->gnuplot_cmd( "plot $arg_string" );
1783				$plot->gnuplot_pause( $pause_time ) if defined $pause_time;
1784				} elsif ($visualization_data_field_width == 1) {
1785				$plot->gnuplot_cmd( "plot $arg_string" );
1786				$plot->gnuplot_pause( $pause_time ) if defined $pause_time;
1787				}
1788				}
1789
1790				# This method is basically the same as the previous method, except that it is
1791				# intended for making PNG files from the distributions.
1792				sub plot_hardcopy_distributions {
1793				my $self = shift;
1794				my $v_mask;
1795				my $pause_time;
1796				if (@_ == 1) {
1797				$v_mask = shift \|\| die "visualization mask missing";
1798				} elsif (@_ == 2) {
1799				$v_mask = shift \|\| die "visualization mask missing";
1800				$pause_time = shift;
1801				} else {
1802				die "visualize_distributions() called with wrong args";
1803				}
1804				my @v_mask = split //, $v_mask;
1805				my $visualization_mask_width = @v_mask;
1806				my $original_data_mask = $self->{_mask};
1807				my @mask = split //, $original_data_mask;
1808				my $data_field_width = scalar grep {$_ eq '1'} @mask;
1809				die "\n\nABORTED: The width of the visualization mask (including " .
1810				"all its 1s and 0s) must equal the width of the original mask " .
1811				"used for reading the data file (counting only the 1's)"
1812				if $visualization_mask_width != $data_field_width;
1813				my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
1814				if ($visualization_data_field_width == 2) {
1815				foreach my $cluster_index (0..$self->{_K}-1) {
1816				my $contour_filename = "__contour2_" . $cluster_index . ".dat";
1817				my $mean = $self->{_cluster_means}->[$cluster_index];
1818				my $covariance = $self->{_cluster_covariances}->[$cluster_index];
1819				my ($mux,$muy) = $mean->as_list();
1820				my ($varx,$sigmaxy) = $covariance->row(0)->as_list();
1821				my ($sigmayx,$vary) = $covariance->row(1)->as_list();
1822				die "Your covariance matrix does not look right"
1823				unless $sigmaxy == $sigmayx;
1824				my ($sigmax,$sigmay) = (sqrt($varx),sqrt($vary));
1825				my $argstring = <<"END";
1826				set contour
1827				mux = $mux
1828				muy = $muy
1829				sigmax = $sigmax
1830				sigmay = $sigmay
1831				sigmaxy = $sigmaxy
1832				determinant = (sigmax*2)(sigmay2) - sigmaxy2
1833				exponent(x,y) = -0.5 * (1.0 / determinant) * ( ((x-mux)*2)sigmay2 + ((y-muy)2)sigmax2 - 2sigmaxy(x-mux)(y-muy) )
1834				f(x,y) = exp( exponent(x,y) ) - 0.2
1835				xmax = mux + 2 * sigmax
1836				xmin = mux - 2 * sigmax
1837				ymax = muy + 2 * sigmay
1838				ymin = muy - 2 * sigmay
1839				set xrange [ xmin : xmax ]
1840				set yrange [ ymin : ymax ]
1841				set isosamples 200
1842				unset surface
1843				set cntrparam levels discrete 0
1844				set table \"$contour_filename\"
1845				splot f(x,y)
1846				unset table
1847				END
1848				my $plot = Graphics::GnuplotIF->new();
1849				$plot->gnuplot_cmd( $argstring );
1850				}
1851				}
1852				my %visualization_data;
1853				while ( my ($record_id, $data) = each %{$self->{_data}} ) {
1854				my @fields = @$data;
1855				die "\nABORTED: Visualization mask size exceeds data record size"
1856				if $#v_mask > $#fields;
1857				my @data_fields;
1858				foreach my $i (0..@fields-1) {
1859				if ($v_mask[$i] eq '0') {
1860				next;
1861				} elsif ($v_mask[$i] eq '1') {
1862				push @data_fields, $fields[$i];
1863				} else {
1864				die "Misformed visualization mask. It can only have 1s and 0s";
1865				}
1866				}
1867				$visualization_data{ $record_id } = \@data_fields;
1868				}
1869				my $filename = basename($self->{_datafile});
1870				my $temp_file = "__temp2_" . $filename;
1871				unlink $temp_file if -e $temp_file;
1872				open OUTPUT, ">$temp_file"
1873				or die "Unable to open a temp file in this directory: $!";
1874				my @class_distributions;
1875				foreach my $cluster_index (0..$self->{_K}-1) {
1876				push @class_distributions, [];
1877				}
1878				foreach my $data_tag (@{$self->{_data_id_tags}}) {
1879				foreach my $cluster_index (0..$self->{_K}-1) {
1880				push @{$class_distributions[$cluster_index]}, $self->{_data}->{$data_tag}
1881				if $self->{_expected_class_probs}->{$data_tag}->[$cluster_index] > 0.2;
1882				}
1883				}
1884				foreach my $distribution (@class_distributions) {
1885				foreach my $item (@$distribution) {
1886				print OUTPUT "@$item";
1887				print OUTPUT "\n";
1888				}
1889				print OUTPUT "\n\n";
1890				}
1891				close OUTPUT;
1892				my $plot;
1893				if (!defined $pause_time) {
1894				$plot = Graphics::GnuplotIF->new( persist => 1 );
1895				} else {
1896				$plot = Graphics::GnuplotIF->new();
1897				}
1898				my $arg_string = "";
1899				if ($visualization_data_field_width > 2) {
1900				$plot->gnuplot_cmd( "set noclip" );
1901				$plot->gnuplot_cmd( "set pointsize 2" );
1902				foreach my $i (0..$self->{_K}-1) {
1903				my $j = $i + 1;
1904				$arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster $i (based on posterior probs)\" with points lt $j pt $j, ";
1905				}
1906				} elsif ($visualization_data_field_width == 2) {
1907				$plot->gnuplot_cmd( "set noclip" );
1908				$plot->gnuplot_cmd( "set pointsize 2" );
1909				foreach my $i (0..$self->{_K}-1) {
1910				my $j = $i + 1;
1911				$arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster $i (based on posterior probs)\" with points lt $j pt $j, ";
1912				my $ellipse_filename = "__contour2_" . $i . ".dat";
1913				$arg_string .= "\"$ellipse_filename\" with line lt $j title \"\", ";
1914				}
1915				} elsif ($visualization_data_field_width == 1 ) {
1916				open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
1917				my @all_data = ;
1918				close INPUT;
1919				@all_data = map {chomp $_; $_ =~ /\d/ ? $_ : "SEPERATOR" } @all_data;
1920				my $all_joined_data = join ':', @all_data;
1921				my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data;
1922				my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals);
1923				foreach my $i (0..@separated-1) {
1924				$separated[$i] =~ s/SEPERATOR//g;
1925				my @cluster_for_hist = split /:/, $separated[$i];
1926				@cluster_for_hist = grep $_, @cluster_for_hist;
1927				my ($minval,$maxval) = minmax(\@cluster_for_hist);
1928				push @all_minvals, $minval;
1929				push @all_maxvals, $maxval;
1930				push @all_clusters_for_hist, \@cluster_for_hist;
1931				}
1932				push @all_minmaxvals, @all_minvals;
1933				push @all_minmaxvals, @all_maxvals;
1934				my ($abs_minval,$abs_maxval) = minmax(\@all_minmaxvals);
1935				my $delta = ($abs_maxval - $abs_minval) / 100.0;
1936				$plot->gnuplot_cmd("set boxwidth 3");
1937				$plot->gnuplot_cmd("set style fill solid border -1");
1938				$plot->gnuplot_cmd("set ytics out nomirror");
1939				$plot->gnuplot_cmd("set style data histograms");
1940				$plot->gnuplot_cmd("set style histogram clustered");
1941				$plot->gnuplot_cmd("set title 'Individual distributions shown through histograms'");
1942				$plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
1943				foreach my $cindex (0..@all_clusters_for_hist-1) {
1944				my $localfilename = basename($filename);
1945				my $temp_file = "__temp1dhist_" . "$cindex" . "_" . $localfilename;
1946				unlink $temp_file if -e $temp_file;
1947				open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
1948				print OUTPUT "Xstep histval\n";
1949
1950				my @histogram = (0) x 100;
1951				foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) {
1952				$histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++;
1953				}
1954				foreach my $i (0..@histogram-1) {
1955				print OUTPUT "$i $histogram[$i]\n";
1956				}
1957				$arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, ";
1958				close OUTPUT;
1959				}
1960				}
1961				$arg_string = $arg_string =~ /^(.*),[ ]+$/;
1962				$arg_string = $1;
1963
1964				if ($visualization_data_field_width > 2) {
1965				$plot->gnuplot_cmd( 'set terminal png',
1966				'set output "posterior_prob_plot.png"');
1967				$plot->gnuplot_cmd( "splot $arg_string" );
1968				} elsif ($visualization_data_field_width == 2) {
1969				$plot->gnuplot_cmd( 'set terminal png',
1970				'set output "posterior_prob_plot.png"');
1971				$plot->gnuplot_cmd( "plot $arg_string" );
1972				} elsif ($visualization_data_field_width == 1) {
1973				$plot->gnuplot_cmd( 'set terminal png',
1974				'set output "posterior_prob_plot.png"');
1975				$plot->gnuplot_cmd( "plot $arg_string" );
1976				}
1977				}
1978
1979				# The method shown below should be called only AFTER you have called the method
1980				# read_data_from_file(). The visualize_data() is meant for the visualization of the
1981				# original data in its various 2D or 3D subspaces.
1982				sub visualize_data {
1983				my $self = shift;
1984				my $v_mask = shift \|\| die "visualization mask missing";
1985
1986				my $master_datafile = $self->{_datafile};
1987
1988				my @v_mask = split //, $v_mask;
1989				my $visualization_mask_width = @v_mask;
1990				my $original_data_mask = $self->{_mask};
1991				my @mask = split //, $original_data_mask;
1992				my $data_field_width = scalar grep {$_ eq '1'} @mask;
1993				die "\n\nABORTED: The width of the visualization mask (including " .
1994				"all its 1s and 0s) must equal the width of the original mask " .
1995				"used for reading the data file (counting only the 1's)"
1996				if $visualization_mask_width != $data_field_width;
1997				my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
1998				my %visualization_data;
1999				my $data_source = $self->{_data};
2000				while ( my ($record_id, $data) = each %{$data_source} ) {
2001				my @fields = @$data;
2002				die "\nABORTED: Visualization mask size exceeds data record size"
2003				if $#v_mask > $#fields;
2004				my @data_fields;
2005				foreach my $i (0..@fields-1) {
2006				if ($v_mask[$i] eq '0') {
2007				next;
2008				} elsif ($v_mask[$i] eq '1') {
2009				push @data_fields, $fields[$i];
2010				} else {
2011				die "Misformed visualization mask. It can only have 1s and 0s";
2012				}
2013				}
2014				$visualization_data{ $record_id } = \@data_fields;
2015				}
2016				my $filename = basename($master_datafile);
2017				my $temp_file;
2018				$temp_file = "__temp_data_" . $filename;
2019				unlink $temp_file if -e $temp_file;
2020				open OUTPUT, ">$temp_file"
2021				or die "Unable to open a temp file in this directory: $!";
2022				foreach my $datapoint (values %visualization_data) {
2023				print OUTPUT "@$datapoint";
2024				print OUTPUT "\n";
2025				}
2026				close OUTPUT;
2027				my $plot = Graphics::GnuplotIF->new( persist => 1 );
2028				$plot->gnuplot_cmd( "set noclip" );
2029				$plot->gnuplot_cmd( "set pointsize 2" );
2030				my $plot_title = '"original data provided for EM"';
2031				my $arg_string ;
2032				if ($visualization_data_field_width > 2) {
2033				$arg_string = "\"$temp_file\" using 1:2:3 title $plot_title with points lt -1 pt 1";
2034				} elsif ($visualization_data_field_width == 2) {
2035				$arg_string = "\"$temp_file\" using 1:2 title $plot_title with points lt -1 pt 1";
2036				} elsif ($visualization_data_field_width == 1 ) {
2037				open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
2038				my @all_data = ;
2039				close INPUT;
2040				@all_data = map {chomp $_; $_} @all_data;
2041				@all_data = grep $_, @all_data;
2042				my ($minval,$maxval) = minmax(\@all_data);
2043				my $delta = ($maxval - $minval) / 100.0;
2044				$plot->gnuplot_cmd("set boxwidth 3");
2045				$plot->gnuplot_cmd("set style fill solid border -1");
2046				$plot->gnuplot_cmd("set ytics out nomirror");
2047				$plot->gnuplot_cmd("set style data histograms");
2048				$plot->gnuplot_cmd("set style histogram clustered");
2049				$plot->gnuplot_cmd("set title 'Overall distribution of 1D data'");
2050				$plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
2051				my $localfilename = basename($filename);
2052				my $temp_file = "__temp1dhist_" . $localfilename;
2053				unlink $temp_file if -e $temp_file;
2054				open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
2055				print OUTPUT "Xstep histval\n";
2056				my @histogram = (0) x 100;
2057				foreach my $i (0..@all_data-1) {
2058				$histogram[int( ($all_data[$i] - $minval) / $delta )]++;
2059				}
2060				foreach my $i (0..@histogram-1) {
2061				print OUTPUT "$i $histogram[$i]\n";
2062				}
2063				$arg_string = "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc rgb 'green'";
2064				close OUTPUT;
2065				}
2066				if ($visualization_data_field_width > 2) {
2067				$plot->gnuplot_cmd( "splot $arg_string" );
2068				} elsif ($visualization_data_field_width == 2) {
2069				$plot->gnuplot_cmd( "plot $arg_string" );
2070				} elsif ($visualization_data_field_width == 1) {
2071				$plot->gnuplot_cmd( "plot $arg_string" );
2072				}
2073				}
2074
2075				# This method is the same as the one shown above, except that it is meant for
2076				# creating PNG files of the displays.
2077				sub plot_hardcopy_data {
2078				my $self = shift;
2079				my $v_mask = shift \|\| die "visualization mask missing";
2080				my $master_datafile = $self->{_datafile};
2081				my @v_mask = split //, $v_mask;
2082				my $visualization_mask_width = @v_mask;
2083				my $original_data_mask = $self->{_mask};
2084				my @mask = split //, $original_data_mask;
2085				my $data_field_width = scalar grep {$_ eq '1'} @mask;
2086				die "\n\nABORTED: The width of the visualization mask (including " .
2087				"all its 1s and 0s) must equal the width of the original mask " .
2088				"used for reading the data file (counting only the 1's)"
2089				if $visualization_mask_width != $data_field_width;
2090				my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
2091				my %visualization_data;
2092				my $data_source = $self->{_data};
2093				while ( my ($record_id, $data) = each %{$data_source} ) {
2094				my @fields = @$data;
2095				die "\nABORTED: Visualization mask size exceeds data record size"
2096				if $#v_mask > $#fields;
2097				my @data_fields;
2098				foreach my $i (0..@fields-1) {
2099				if ($v_mask[$i] eq '0') {
2100				next;
2101				} elsif ($v_mask[$i] eq '1') {
2102				push @data_fields, $fields[$i];
2103				} else {
2104				die "Misformed visualization mask. It can only have 1s and 0s";
2105				}
2106				}
2107				$visualization_data{ $record_id } = \@data_fields;
2108				}
2109				my $filename = basename($master_datafile);
2110				my $temp_file;
2111				$temp_file = "__temp_data_" . $filename;
2112				unlink $temp_file if -e $temp_file;
2113				open OUTPUT, ">$temp_file"
2114				or die "Unable to open a temp file in this directory: $!";
2115				foreach my $datapoint (values %visualization_data) {
2116				print OUTPUT "@$datapoint";
2117				print OUTPUT "\n";
2118				}
2119				close OUTPUT;
2120
2121				my $plot = Graphics::GnuplotIF->new( persist => 1 );
2122				$plot->gnuplot_cmd( "set noclip" );
2123				$plot->gnuplot_cmd( "set pointsize 2" );
2124				my $plot_title = '"original data provided for EM"';
2125				my $arg_string ;
2126				if ($visualization_data_field_width > 2) {
2127				$arg_string = "\"$temp_file\" using 1:2:3 title $plot_title with points lt -1 pt 1";
2128				} elsif ($visualization_data_field_width == 2) {
2129				$arg_string = "\"$temp_file\" using 1:2 title $plot_title with points lt -1 pt 1";
2130				} elsif ($visualization_data_field_width == 1 ) {
2131				open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
2132				my @all_data = ;
2133				close INPUT;
2134				@all_data = map {chomp $_; $_} @all_data;
2135				@all_data = grep $_, @all_data;
2136				my ($minval,$maxval) = minmax(\@all_data);
2137				my $delta = ($maxval - $minval) / 100.0;
2138				$plot->gnuplot_cmd("set boxwidth 3");
2139				$plot->gnuplot_cmd("set style fill solid border -1");
2140				$plot->gnuplot_cmd("set ytics out nomirror");
2141				$plot->gnuplot_cmd("set style data histograms");
2142				$plot->gnuplot_cmd("set style histogram clustered");
2143				$plot->gnuplot_cmd("set title 'Overall distribution of 1D data'");
2144				$plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
2145				my $localfilename = basename($filename);
2146				my $temp_file = "__temp1dhist_" . $localfilename;
2147				unlink $temp_file if -e $temp_file;
2148				open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
2149				print OUTPUT "Xstep histval\n";
2150				my @histogram = (0) x 100;
2151				foreach my $i (0..@all_data-1) {
2152				$histogram[int( ($all_data[$i] - $minval) / $delta )]++;
2153				}
2154				foreach my $i (0..@histogram-1) {
2155				print OUTPUT "$i $histogram[$i]\n";
2156				}
2157				$arg_string = "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc rgb 'green'";
2158				close OUTPUT;
2159				}
2160				if ($visualization_data_field_width > 2) {
2161				$plot->gnuplot_cmd( 'set terminal png',
2162				'set output "data_scatter_plot.png"');
2163				$plot->gnuplot_cmd( "splot $arg_string" );
2164				} elsif ($visualization_data_field_width == 2) {
2165				$plot->gnuplot_cmd( 'set terminal png',
2166				'set output "data_scatter_plot.png"');
2167				$plot->gnuplot_cmd( "plot $arg_string" );
2168				} elsif ($visualization_data_field_width == 1) {
2169				$plot->gnuplot_cmd( 'set terminal png',
2170				'set output "data_scatter_plot.png"');
2171				$plot->gnuplot_cmd( "plot $arg_string" );
2172				}
2173				}
2174
2175
2176				################### Generating Synthetic Data for Clustering ###################
2177
2178				# The data generated corresponds to a multivariate distribution. The mean and the
2179				# covariance of each Gaussian in the distribution are specified individually in a
2180				# parameter file. The parameter file must also state the prior probabilities to be
2181				# associated with each Gaussian. See the example parameter file param1.txt in the
2182				# examples directory. Just edit this file for your own needs.
2183				#
2184				# The multivariate random numbers are generated by calling the Math::Random module.
2185				# As you would expect, that module will insist that the covariance matrix you
2186				# specify be symmetric and positive definite.
2187				sub cluster_data_generator {
2188				my $class = shift;
2189				die "illegal call of a class method"
2190				unless $class eq 'Algorithm::ExpectationMaximization';
2191				my %args = @_;
2192				my $input_parameter_file = $args{input_parameter_file};
2193				my $output_file = $args{output_datafile};
2194				my $N = $args{total_number_of_data_points};
2195				my @all_params;
2196				my $param_string;
2197				if (defined $input_parameter_file) {
2198				open INPUT, $input_parameter_file \|\| "unable to open parameter file: $!";
2199				@all_params = ;
2200				@all_params = grep { $_ !~ /^[ ]*#/ } @all_params;
2201				chomp @all_params;
2202				$param_string = join ' ', @all_params;
2203				} else {
2204				# Just for testing. Used in t/test.t
2205				$param_string = "priors 0.34 0.33 0.33 " .
2206				"cluster 5 0 0 1 0 0 0 1 0 0 0 1 " .
2207				"cluster 0 5 0 1 0 0 0 1 0 0 0 1 " .
2208				"cluster 0 0 5 1 0 0 0 1 0 0 0 1";
2209				}
2210				my ($priors_string) = $param_string =~ /priors(.+?)cluster/;
2211				croak "You did not specify the prior probabilities in the parameter file"
2212				unless $priors_string;
2213				my @priors = split /\s+/, $priors_string;
2214				@priors = grep {/$_num_regex/; $_} @priors;
2215				my $sum = 0;
2216				foreach my $prior (@priors) {
2217				$sum += $prior;
2218				}
2219				croak "Your priors in the parameter file do not add up to 1" unless $sum == 1;
2220				my ($rest_of_string) = $param_string =~ /priors\s$priors_string(.)$/;
2221				my @cluster_strings = split /[ ]cluster[ ]/, $rest_of_string;
2222				@cluster_strings = grep $_, @cluster_strings;
2223
2224				my $K = @cluster_strings;
2225				croak "Too many clusters requested" if $K > 12;
2226				croak "Mismatch between the number of values for priors and the number " .
2227				"of clusters" unless $K == @priors;
2228				my @point_labels = ('a'..'z');
2229				print "Prior probabilities recorded from param file: @priors\n";
2230				print "Number of Gaussians used for the synthetic data: $K\n";
2231				my @means;
2232				my @covariances;
2233				my $data_dimension;
2234				foreach my $i (0..$K-1) {
2235				my @num_strings = split / /, $cluster_strings[$i];
2236				my @cluster_mean = map {/$_num_regex/;$_} split / /, $num_strings[0];
2237				$data_dimension = @cluster_mean;
2238				push @means, \@cluster_mean;
2239				my @covariance_nums = map {/$_num_regex/;$_} split / /, $num_strings[1];
2240				croak "dimensionality error" if @covariance_nums !=
2241				($data_dimension ** 2);
2242				my $cluster_covariance;
2243				foreach my $j (0..$data_dimension-1) {
2244				foreach my $k (0..$data_dimension-1) {
2245				$cluster_covariance->[$j]->[$k] =
2246				$covariance_nums[$j*$data_dimension + $k];
2247				}
2248				}
2249				push @covariances, $cluster_covariance;
2250				}
2251				random_seed_from_phrase( 'hellomellojello' );
2252				my @data_dump;
2253				foreach my $i (0..$K-1) {
2254				my @m = @{shift @means};
2255				my @covar = @{shift @covariances};
2256				my @new_data = Math::Random::random_multivariate_normal(
2257				int($N * $priors[$i]), @m, @covar );
2258				my $p = 0;
2259				my $label = $point_labels[$i];
2260				@new_data = map {unshift @$_, $label.$i; $i++; $_} @new_data;
2261				push @data_dump, @new_data;
2262				}
2263				fisher_yates_shuffle( \@data_dump );
2264				open OUTPUT, ">$output_file";
2265				print OUTPUT "\#Data generated from the parameter file: $input_parameter_file\n"
2266				if $input_parameter_file;
2267				print OUTPUT "\#Total number of data points in this file: $N\n";
2268				print OUTPUT "\#Prior class probabilities for this data: @priors\n";
2269				foreach my $ele (@data_dump) {
2270				foreach my $coord ( @$ele ) {
2271				print OUTPUT "$coord ";
2272				}
2273				print OUTPUT "\n";
2274				}
2275				print "Data written out to file $output_file\n";
2276				close OUTPUT;
2277				}
2278
2279				sub add_point_coords {
2280				my $self = shift;
2281				my @arr_of_ids = @{shift @_}; # array of data element names
2282				my @result;
2283				my $data_dimensionality = $self->{_data_dimensions};
2284				foreach my $i (0..$data_dimensionality-1) {
2285				$result[$i] = 0.0;
2286				}
2287				foreach my $id (@arr_of_ids) {
2288				my $ele = $self->{_data}->{$id};
2289				my $i = 0;
2290				foreach my $component (@$ele) {
2291				$result[$i] += $component;
2292				$i++;
2293				}
2294				}
2295				return \@result;
2296				}
2297
2298				###################### Support Routines ########################
2299
2300				# computer the outer product of two column vectors
2301				sub outer_product {
2302				my $vec1 = shift;
2303				my $vec2 = shift;
2304				my ($nrows1, $ncols1) = ($vec1->rows(), $vec1->cols());
2305				my ($nrows2, $ncols2) = ($vec2->rows(), $vec2->cols());
2306				die "Outer product operation called with non-matching vectors"
2307				unless $ncols1 == 1 && $ncols2 == 1 && $nrows1 == $nrows2;
2308				my @vec_arr1 = $vec1->as_list();
2309				my @vec_arr2 = $vec2->as_list();
2310				my $outer_product = Math::GSL::Matrix->new($nrows1, $nrows2);
2311				foreach my $index (0..$nrows1-1) {
2312				my @new_row = map $vec_arr1[$index] * $_, @vec_arr2;
2313				$outer_product->set_row($index, \@new_row);
2314				}
2315				return $outer_product;
2316				}
2317
2318				sub get_index_at_value {
2319				my $value = shift;
2320				my @array = @{shift @_};
2321				foreach my $i (0..@array-1) {
2322				return $i if $value == $array[$i];
2323				}
2324				}
2325
2326				# This routine is really not necessary in light of the new `~~' operator in Perl.
2327				# Will use the new operator in the next version.
2328				sub vector_equal {
2329				my $vec1 = shift;
2330				my $vec2 = shift;
2331				die "wrong data types for distance calculation" if @$vec1 != @$vec2;
2332				foreach my $i (0..@$vec1-1){
2333				return 0 if $vec1->[$i] != $vec2->[$i];
2334				}
2335				return 1;
2336				}
2337
2338				sub compare_array_floats {
2339				my $vec1 = shift;
2340				my $vec2 = shift;
2341				foreach my $i (0..@$vec1-1){
2342				return 0 if abs($vec1->[$i] - $vec2->[$i]) > 0.00001;
2343				}
2344				return 1;
2345				}
2346
2347				# Returns the minimum value and its positional index in an array
2348				sub minimum {
2349				my $arr = shift;
2350				my $min;
2351				my $index;
2352				foreach my $i (0..@{$arr}-1) {
2353				if ( (!defined $min) \|\| ($arr->[$i] < $min) ) {
2354				$index = $i;
2355				$min = $arr->[$i];
2356				}
2357				}
2358				return ($min, $index);
2359				}
2360
2361				sub minmax {
2362				my $arr = shift;
2363				my $min;
2364				my $max;
2365				foreach my $i (0..@{$arr}-1) {
2366				if ( (!defined $min) && (!defined $max) ) {
2367				$min = $arr->[$i];
2368				$max = $arr->[$i];
2369				} elsif ( $arr->[$i] < $min ) {
2370				$min = $arr->[$i];
2371				} elsif ( $arr->[$i] > $max ) {
2372				$max = $arr->[$i];
2373				}
2374				}
2375				return ($min, $max);
2376				}
2377
2378				# Meant only for constructing a deep copy of an array of
2379				# arrays:
2380				sub deep_copy_AoA {
2381				my $ref_in = shift;
2382				my $ref_out;
2383				foreach my $i (0..@{$ref_in}-1) {
2384				foreach my $j (0..@{$ref_in->[$i]}-1) {
2385				$ref_out->[$i]->[$j] = $ref_in->[$i]->[$j];
2386				}
2387				}
2388				return $ref_out;
2389				}
2390
2391				# Meant only for constructing a deep copy of an array of arrays for the case when
2392				# some elements of the top-level array may be undefined:
2393				sub deep_copy_AoA_with_nulls {
2394				my $ref_in = shift;
2395				my $ref_out;
2396				foreach my $i (0..@{$ref_in}-1) {
2397				if ( !defined $ref_in->[$i] ) {
2398				$ref_out->[$i] = undef;
2399				next;
2400				}
2401				foreach my $j (0..@{$ref_in->[$i]}-1) {
2402				$ref_out->[$i]->[$j] = $ref_in->[$i]->[$j];
2403				}
2404				}
2405				return $ref_out;
2406				}
2407
2408				# Meant only for constructing a deep copy of a hash in which each value is an
2409				# anonymous array of numbers:
2410				sub deep_copy_hash {
2411				my $ref_in = shift;
2412				my $ref_out;
2413				while ( my ($key, $value) = each( %$ref_in ) ) {
2414				$ref_out->{$key} = deep_copy_array( $value );
2415				}
2416				return $ref_out;
2417				}
2418
2419				# Meant only for an array of numbers:
2420				sub deep_copy_array {
2421				my $ref_in = shift;
2422				my $ref_out;
2423				foreach my $i (0..@{$ref_in}-1) {
2424				$ref_out->[$i] = $ref_in->[$i];
2425				}
2426				return $ref_out;
2427				}
2428
2429				# from perl docs:
2430				sub fisher_yates_shuffle {
2431				my $arr = shift;
2432				my $i = @$arr;
2433				while (--$i) {
2434				my $j = int rand( $i + 1 );
2435				@$arr[$i, $j] = @$arr[$j, $i];
2436				}
2437				}
2438
2439				sub mean_and_variance {
2440				my @data = @{shift @_};
2441				my ($mean, $variance);
2442				foreach my $i (1..@data) {
2443				if ($i == 1) {
2444				$mean = $data[0];
2445				$variance = 0;
2446				} else {
2447				# data[$i-1] because of zero-based indexing of vector
2448				$mean = ( (($i-1)/$i) * $mean ) + $data[$i-1] / $i;
2449				$variance = ( (($i-1)/$i) * $variance )
2450				+ ($data[$i-1]-$mean)**2 / ($i-1);
2451				}
2452				}
2453				return ($mean, $variance);
2454				}
2455
2456				sub check_for_illegal_params {
2457				my @params = @_;
2458				my @legal_params = qw / datafile
2459				mask
2460				K
2461				terminal_output
2462				max_em_iterations
2463				seeding
2464				class_priors
2465				seed_tags
2466				debug
2467				/;
2468				my $found_match_flag;
2469				foreach my $param (@params) {
2470				foreach my $legal (@legal_params) {
2471				$found_match_flag = 0;
2472				if ($param eq $legal) {
2473				$found_match_flag = 1;
2474				last;
2475				}
2476				}
2477				last if $found_match_flag == 0;
2478				}
2479				return $found_match_flag;
2480				}
2481
2482				sub get_value_index_hash {
2483				my $arr = shift;
2484				my %hash;
2485				foreach my $index (0..@$arr-1) {
2486				$hash{$arr->[$index]} = $index if $arr->[$index] > 0;
2487				}
2488				return \%hash;
2489				}
2490
2491				sub non_maximum_supression {
2492				my $arr = shift;
2493				my @output = (0) x @$arr;
2494				my @final_output = (0) x @$arr;
2495				my %hash;
2496				my @array_of_runs = ([$arr->[0]]);
2497				foreach my $index (1..@$arr-1) {
2498				if ($arr->[$index] == $arr->[$index-1]) {
2499				push @{$array_of_runs[-1]}, $arr->[$index];
2500				} else {
2501				push @array_of_runs, [$arr->[$index]];
2502				}
2503				}
2504				my $runstart_index = 0;
2505				foreach my $run_index (1..@array_of_runs-2) {
2506				$runstart_index += @{$array_of_runs[$run_index-1]};
2507				if ($array_of_runs[$run_index]->[0] >
2508				$array_of_runs[$run_index-1]->[0] &&
2509				$array_of_runs[$run_index]->[0] >
2510				$array_of_runs[$run_index+1]->[0]) {
2511				my $run_center = @{$array_of_runs[$run_index]} / 2;
2512				my $assignment_index = $runstart_index + $run_center;
2513				$output[$assignment_index] = $arr->[$assignment_index];
2514				}
2515				}
2516				if ($array_of_runs[-1]->[0] > $array_of_runs[-2]->[0]) {
2517				$runstart_index += @{$array_of_runs[-2]};
2518				my $run_center = @{$array_of_runs[-1]} / 2;
2519				my $assignment_index = $runstart_index + $run_center;
2520				$output[$assignment_index] = $arr->[$assignment_index];
2521				}
2522				if ($array_of_runs[0]->[0] > $array_of_runs[1]->[0]) {
2523				my $run_center = @{$array_of_runs[0]} / 2;
2524				$output[$run_center] = $arr->[$run_center];
2525				}
2526				return \@output;
2527				}
2528
2529				sub display_matrix {
2530				my $message = shift;
2531				my $matrix = shift;
2532				if (!defined blessed($matrix)) {
2533				print "display_matrix called on a scalar value: $matrix\n";
2534				return;
2535				}
2536				my $nrows = $matrix->rows();
2537				my $ncols = $matrix->cols();
2538				print "$message ($nrows rows and $ncols columns)\n";
2539				foreach my $i (0..$nrows-1) {
2540				my $row = $matrix->row($i);
2541				my @row_as_list = $row->as_list;
2542				print "@row_as_list\n";
2543				}
2544				print "\n";
2545				}
2546
2547				sub transpose {
2548				my $matrix = shift;
2549				my $num_rows = $matrix->rows();
2550				my $num_cols = $matrix->cols();
2551				my $transpose = Math::GSL::Matrix->new($num_cols, $num_rows);
2552				foreach my $i (0..$num_rows-1) {
2553				my @row = $matrix->row($i)->as_list;
2554				$transpose->set_col($i, \@row );
2555				}
2556				return $transpose;
2557				}
2558
2559				sub vector_multiply {
2560				my $vec1 = shift;
2561				my $vec2 = shift;
2562				die "vec_multiply called with two vectors of different sizes"
2563				unless @$vec1 == @$vec2;
2564				my $result = 0;
2565				foreach my $i (0..@$vec1-1) {
2566				$result += $vec1->[$i] * $vec2->[$i];
2567				}
2568				return $result;
2569				}
2570
2571				sub vector_2_vector_multiply {
2572				my $vec1 = shift;
2573				my $vec2 = shift;
2574				die "vec_multiply called with two vectors of different sizes"
2575				unless @$vec1 == @$vec2;
2576				my @result_vec;
2577				foreach my $i (0..@$vec1-1) {
2578				$result_vec[$i] = $vec1->[$i] * $vec2->[$i];
2579				}
2580				return \@result_vec;
2581				}
2582
2583				sub matrix_multiply {
2584				my $matrix1 = shift;
2585				my $matrix2 = shift;
2586				my ($nrows1, $ncols1) = ($matrix1->rows(), $matrix1->cols());
2587				my ($nrows2, $ncols2) = ($matrix2->rows(), $matrix2->cols());
2588				die "matrix multiplication called with non-matching matrix arguments"
2589				# unless $ncols1 == $nrows2;
2590				unless $nrows1 == $ncols2 && $ncols1 == $nrows2;
2591				if ($nrows1 == 1) {
2592				my @row = $matrix1->row(0)->as_list;
2593				my @col = $matrix2->col(0)->as_list;
2594				my $result;
2595				foreach my $j (0..$ncols1-1) {
2596				$result += $row[$j] * $col[$j];
2597				}
2598				return $result;
2599				}
2600				my $product = Math::GSL::Matrix->new($nrows1, $nrows1);
2601				foreach my $i (0..$nrows1-1) {
2602				my $row = $matrix1->row($i);
2603				my @product_row;
2604				foreach my $j (0..$ncols2-1) {
2605				my $col = $matrix2->col($j);
2606				my $row_times_col = matrix_multiply($row, $col);
2607				push @product_row, $row_times_col;
2608				}
2609				$product->set_row($i, \@product_row);
2610				}
2611				return $product;
2612				}
2613
2614				sub vector_matrix_multiply {
2615				my $matrix1 = shift;
2616				my $matrix2 = shift;
2617				my ($nrows1, $ncols1) = ($matrix1->rows, $matrix1->cols);
2618				my ($nrows2, $ncols2) = ($matrix2->rows, $matrix2->cols);
2619				die "matrix multiplication called with non-matching matrix arguments"
2620				unless $nrows1 == 1 && $ncols1 == $nrows2;
2621				if ($ncols2 == 1) {
2622				my @row = $matrix1->row(0)->as_list;
2623				my @col = $matrix2->col(0)->as_list;
2624				my $result;
2625				foreach my $j (0..$ncols1-1) {
2626				$result += $row[$j] * $col[$j];
2627				}
2628				return $result;
2629				}
2630				my $product = Math::GSL::Matrix->new(1, $ncols2);
2631				my $row = $matrix1->row(0);
2632				my @product_row;
2633				foreach my $j (0..$ncols2-1) {
2634				my $col = $matrix2->col($j);
2635				my $row_times_col = vector_matrix_multiply($row, $col);
2636				push @product_row, $row_times_col;
2637				}
2638				$product->set_row(0, \@product_row);
2639				return $product;
2640				}
2641
2642				sub matrix_vector_multiply {
2643				my $matrix1 = shift;
2644				my $matrix2 = shift;
2645				my ($nrows1, $ncols1) = ($matrix1->rows(), $matrix1->cols());
2646				my ($nrows2, $ncols2) = ($matrix2->rows(), $matrix2->cols());
2647				die "matrix multiplication called with non-matching matrix arguments"
2648				unless $ncols1 == $nrows2 && $ncols2 == 1;
2649				if ($nrows1 == 1) {
2650				my @row = $matrix1->row(0)->as_list;
2651				my @col = $matrix2->col(0)->as_list;
2652				my $result;
2653				foreach my $j (0..$ncols1-1) {
2654				$result += $row[$j] * $col[$j];
2655				}
2656				return $result;
2657				}
2658				my $product = Math::GSL::Matrix->new($nrows1, 1);
2659				my $col = $matrix2->col(0);
2660				my @product_col;
2661				foreach my $i (0..$nrows1-1) {
2662				my $row = $matrix1->row($i);
2663				my $row_times_col = matrix_vector_multiply($row, $col);
2664				push @product_col, $row_times_col;
2665				}
2666				$product->set_col(0, \@product_col);
2667				return $product;
2668				}
2669
2670				sub matrix_trace {
2671				my $matrix = shift;
2672				my ($nrows, $ncols) = ($matrix->rows(), $matrix->cols());
2673				die "trace can only be calculated for a square matrix"
2674				unless $ncols == $nrows;
2675				my @elements = $matrix->as_list;
2676				my $trace = 0;
2677				foreach my $i (0..$nrows-1) {
2678				$trace += $elements[$i + $i * $ncols];
2679				}
2680				return $trace;
2681				}
2682
2683				1;
2684
2685				=pod
2686
2687				=head1 NAME
2688
2689				Algorithm::ExpectationMaximization -- A Perl module for clustering numerical
2690				multi-dimensional data with the Expectation-Maximization algorithm.
2691
2692				=head1 SYNOPSIS
2693
2694				use Algorithm::ExpectationMaximization;
2695
2696				# First name the data file:
2697
2698				my $datafile = "mydatafile.csv";
2699
2700				# Next, set the mask to indicate which columns of the datafile to use for
2701				# clustering and which column contains a symbolic ID for each data record. For
2702				# example, if the symbolic name is in the first column, you want the second column
2703				# to be ignored, and you want the next three columns to be used for 3D clustering:
2704
2705				my $mask = "N0111";
2706
2707				# Now construct an instance of the clusterer. The parameter `K' controls the
2708				# number of clusters. Here is an example call to the constructor for instance
2709				# creation:
2710
2711				my $clusterer = Algorithm::ExpectationMaximization->new(
2712				datafile => $datafile,
2713				mask => $mask,
2714				K => 3,
2715				max_em_iterations => 300,
2716				seeding => 'random',
2717				terminal_output => 1,
2718				);
2719
2720				# Note the choice for `seeding'. The choice `random' means that the clusterer will
2721				# randomly select `K' data points to serve as initial cluster centers. Other
2722				# possible choices for the constructor parameter `seeding' are `kmeans' and
2723				# `manual'. With the `kmeans' option for `seeding', the output of a K-means
2724				# clusterer is used for the cluster seeds and the initial cluster covariances. If
2725				# you use the `manual' option for seeding, you must also specify the data elements
2726				# to use for seeding the clusters.
2727
2728				# Here is an example of a call to the constructor when we choose the `manual'
2729				# option for seeding the clusters and for specifying the data elements for
2730				# seeding. The data elements are specified by their tag names. In this case,
2731				# these names are `a26', `b53', and `c49':
2732
2733				my $clusterer = Algorithm::ExpectationMaximization->new(
2734				datafile => $datafile,
2735				mask => $mask,
2736				class_priors => [0.6, 0.2, 0.2],
2737				K => 3,
2738				max_em_iterations => 300,
2739				seeding => 'manual',
2740				seed_tags => ['a26', 'b53', 'c49'],
2741				terminal_output => 1,
2742				);
2743
2744				# This example call to the constructor also illustrates how you can inject class
2745				# priors into the clustering process. The class priors are the prior probabilities
2746				# of the class distributions in your dataset. As explained later, injecting class
2747				# priors in the manner shown above makes statistical sense only for the case of
2748				# manual seeding. When you do inject class priors, the order in which the priors
2749				# are expressed must correspond to the manually specified seeds for the clusters.
2750
2751				# After the invocation of the constructor, the following calls are mandatory
2752				# for reasons that should be obvious from the names of the methods:
2753
2754				$clusterer->read_data_from_file();
2755				srand(time);
2756				$clusterer->seed_the_clusters();
2757				$clusterer->EM();
2758				$clusterer->run_bayes_classifier();
2759				my $clusters = $clusterer->return_disjoint_clusters();
2760
2761				# where the call to `EM()' is the invocation of the expectation-maximization
2762				# algorithm. The call to `srand(time)' is to seed the pseudo random number
2763				# generator afresh for each run of the cluster seeding procedure. If you want to
2764				# see repeatable results from one run to another of the algorithm with random
2765				# seeding, you would obviously not invoke `srand(time)'.
2766
2767				# The call `run_bayes_classifier()' shown above carries out a disjoint clustering
2768				# of all the data points using the naive Bayes' classifier. And the call
2769				# `return_disjoint_clusters()' returns the clusters thus formed to you. Once you
2770				# have obtained access to the clusters in this manner, you can display them in
2771				# your terminal window by
2772
2773				foreach my $index (0..@$clusters-1) {
2774				print "Cluster $index (Naive Bayes): @{$clusters->[$index]}\n\n"
2775				}
2776
2777				# If you would like to also see the clusters purely on the basis of the posterior
2778				# class probabilities exceeding a threshold, call
2779
2780				my $theta1 = 0.2;
2781				my $posterior_prob_clusters =
2782				$clusterer->return_clusters_with_posterior_probs_above_threshold($theta1);
2783
2784				# where you can obviously set the threshold $theta1 to any value you wish. Note
2785				# that now you may end up with clusters that overlap. You can display them in
2786				# your terminal window in the same manner as shown above for the naive Bayes'
2787				# clusters.
2788
2789				# You can write the naive Bayes' clusters out to files, one cluster per file, by
2790				# calling
2791
2792				$clusterer->write_naive_bayes_clusters_to_files();
2793
2794				# The clusters are placed in files with names like
2795
2796				naive_bayes_cluster1.txt
2797				naive_bayes_cluster2.txt
2798				...
2799
2800				# In the same manner, you can write out the posterior probability based possibly
2801				# overlapping clusters to files by calling:
2802
2803				$clusterer->write_posterior_prob_clusters_above_threshold_to_files($theta1);
2804
2805				# where the threshold $theta1 sets the probability threshold for deciding which
2806				# data elements to place in a cluster. These clusters are placed in files with
2807				# names like
2808
2809				posterior_prob_cluster1.txt
2810				posterior_prob_cluster2.txt
2811				...
2812
2813				# CLUSTER VISUALIZATION:
2814
2815				# You must first set the mask for cluster visualization. This mask tells the
2816				# module which 2D or 3D subspace of the original data space you wish to visualize
2817				# the clusters in:
2818
2819				my $visualization_mask = "111";
2820				$clusterer->visualize_clusters($visualization_mask);
2821				$clusterer->visualize_distributions($visualization_mask);
2822				$clusterer->plot_hardcopy_clusters($visualization_mask);
2823				$clusterer->plot_hardcopy_distributions($visualization_mask);
2824
2825				# where the last two invocations are for writing out the PNG plots of the
2826				# visualization displays to disk files. The PNG image of the posterior
2827				# probability distributions is written out to a file named posterior_prob_plot.png
2828				# and the PNG image of the disjoint clusters to a file called cluster_plot.png.
2829
2830				# SYNTHETIC DATA GENERATION:
2831
2832				# The module has been provided with a class method for generating multivariate
2833				# data for experimenting with the EM algorithm. The data generation is controlled
2834				# by the contents of a parameter file that is supplied as an argument to the data
2835				# generator method. The priors, the means, and the covariance matrices in the
2836				# parameter file must be according to the syntax shown in the `param1.txt' file in
2837				# the `examples' directory. It is best to edit a copy of this file for your
2838				# synthetic data generation needs.
2839
2840				my $parameter_file = "param1.txt";
2841				my $out_datafile = "mydatafile1.dat";
2842				Algorithm::ExpectationMaximization->cluster_data_generator(
2843				input_parameter_file => $parameter_file,
2844				output_datafile => $out_datafile,
2845				total_number_of_data_points => $N );
2846
2847				# where the value of $N is the total number of data points you would like to see
2848				# generated for all of the Gaussians. How this total number is divided up amongst
2849				# the Gaussians is decided by the prior probabilities for the Gaussian components
2850				# as declared in input parameter file. The synthetic data may be visualized in a
2851				# terminal window and the visualization written out as a PNG image to a diskfile
2852				# by
2853
2854				my $data_visualization_mask = "11";
2855				$clusterer->visualize_data($data_visualization_mask);
2856				$clusterer->plot_hardcopy_data($data_visualization_mask);
2857
2858
2859				=head1 CHANGES
2860
2861				Version 1.22 should work with data in CSV files.
2862
2863				Version 1.21 incorporates minor code clean up. Overall, the module implementation
2864				remains unchanged.
2865
2866				Version 1.2 allows the module to also be used for 1-D data. The visualization code
2867				for 1-D shows the clusters through their histograms.
2868
2869				Version 1.1 incorporates much cleanup of the documentation associated with the
2870				module. Both the top-level module documentation, especially the Description part,
2871				and the comments embedded in the code were revised for better utilization of the
2872				module. The basic implementation code remains unchanged.
2873
2874
2875				=head1 DESCRIPTION
2876
2877				B is a I module for the
2878				Expectation-Maximization (EM) method of clustering numerical data that lends itself
2879				to modeling as a Gaussian mixture. Since the module is entirely in Perl (in the
2880				sense that it is not a Perl wrapper around a C library that actually does the
2881				clustering), the code in the module can easily be modified to experiment with several
2882				aspects of EM.
2883
2884				Gaussian Mixture Modeling (GMM) is based on the assumption that the data consists of
2885				C Gaussian components, each characterized by its own mean vector and its own
2886				covariance matrix. Obviously, given observed data for clustering, we do not know
2887				which of the C Gaussian components was responsible for any of the data elements.
2888				GMM also associates a prior probability with each Gaussian component. In general,
2889				these priors will also be unknown. So the problem of clustering consists of
2890				estimating the posterior class probability at each data element and also estimating
2891				the class priors. Once these posterior class probabilities and the priors are
2892				estimated with EM, we can use the naive Bayes' classifier to partition the data into
2893				disjoint clusters. Or, for "soft" clustering, we can find all the data elements that
2894				belong to a Gaussian component on the basis of the posterior class probabilities at
2895				the data elements exceeding a prescribed threshold.
2896
2897				If you do not mind the fact that it is possible for the EM algorithm to occasionally
2898				get stuck in a local maximum and to, therefore, produce a wrong answer even when you
2899				know the data to be perfectly multimodal Gaussian, EM is probably the most magical
2900				approach to clustering multidimensional data. Consider the case of clustering
2901				three-dimensional data. Each Gaussian cluster in 3D space is characterized by the
2902				following 10 variables: the 6 unique elements of the C<3x3> covariance matrix (which
2903				must be symmetric positive-definite), the 3 unique elements of the mean, and the
2904				prior associated with the Gaussian. Now let's say you expect to see six Gaussians in
2905				your data. What that means is that you would want the values for 59 variables
2906				(remember the unit-summation constraint on the class priors which reduces the overall
2907				number of variables by one) to be estimated by the algorithm that seeks to discover
2908				the clusters in your data. What's amazing is that, despite the large number of
2909				variables that must be optimized simultaneously, the EM algorithm will very likely
2910				give you a good approximation to the right answer.
2911
2912				At its core, EM depends on the notion of unobserved data and the averaging of the
2913				log-likelihood of the data actually observed over all admissible probabilities for
2914				the unobserved data. But what is unobserved data? While in some cases where EM is
2915				used, the unobserved data is literally the missing data, in others, it is something
2916				that cannot be seen directly but that nonetheless is relevant to the data actually
2917				observed. For the case of clustering multidimensional numerical data that can be
2918				modeled as a Gaussian mixture, it turns out that the best way to think of the
2919				unobserved data is in terms of a sequence of random variables, one for each observed
2920				data point, whose values dictate the selection of the Gaussian for that data point.
2921				This point is explained in great detail in my on-line tutorial at
2922				L.
2923
2924				The EM algorithm in our context reduces to an iterative invocation of the following
2925				steps: (1) Given the current guess for the means and the covariances of the different
2926				Gaussians in our mixture model, use Bayes' Rule to update the posterior class
2927				probabilities at each of the data points; (2) Using the updated posterior class
2928				probabilities, first update the class priors; (3) Using the updated class priors,
2929				update the class means and the class covariances; and go back to Step (1). Ideally,
2930				the iterations should terminate when the expected log-likelihood of the observed data
2931				has reached a maximum and does not change with any further iterations. The stopping
2932				rule used in this module is the detection of no change over three consecutive
2933				iterations in the values calculated for the priors.
2934
2935				This module provides three different choices for seeding the clusters: (1) random,
2936				(2) kmeans, and (3) manual. When random seeding is chosen, the algorithm randomly
2937				selects C data elements as cluster seeds. That is, the data vectors associated
2938				with these seeds are treated as initial guesses for the means of the Gaussian
2939				distributions. The covariances are then set to the values calculated from the entire
2940				dataset with respect to the means corresponding to the seeds. With kmeans seeding, on
2941				the other hand, the means and the covariances are set to whatever values are returned
2942				by the kmeans algorithm. And, when seeding is set to manual, you are allowed to
2943				choose C data elements --- by specifying their tag names --- for the seeds. The
2944				rest of the EM initialization for the manual mode is the same as for the random mode.
2945				The algorithm allows for the initial priors to be specified for the manual mode of
2946				seeding.
2947
2948				Much of code for the kmeans based seeding of EM was drawn from the
2949				C module by me. The code from that module used here corresponds to
2950				the case when the C option in the C module is set
2951				to C. The C option for KMeans consists of subjecting the data to a
2952				principal components analysis (PCA) to discover the direction of maximum variance in
2953				the data space. The data points are then projected on to this direction and a
2954				histogram constructed from the projections. Centers of the C largest peaks in
2955				this smoothed histogram are used to seed the KMeans based clusterer. As you'd
2956				expect, the output of the KMeans used to seed the EM algorithm.
2957
2958				This module uses two different criteria to measure the quality of the clustering
2959				achieved. The first is the Minimum Description Length (MDL) proposed originally by
2960				Rissanen (J. Rissanen: "Modeling by Shortest Data Description," Automatica, 1978, and
2961				"A Universal Prior for Integers and Estimation by Minimum Description Length," Annals
2962				of Statistics, 1983.) The MDL criterion is a difference of a log-likelihood term for
2963				all of the observed data and a model-complexity penalty term. In general, both the
2964				log-likelihood and the model-complexity terms increase as the number of clusters
2965				increases. The form of the MDL criterion in this module uses for the penalty term
2966				the Bayesian Information Criterion (BIC) of G. Schwartz, "Estimating the Dimensions
2967				of a Model," The Annals of Statistics, 1978. In general, the smaller the value of
2968				MDL quality measure, the better the clustering of the data.
2969
2970				For our second measure of clustering quality, we use `trace( SW^-1 . SB)' where SW is
2971				the within-class scatter matrix, more commonly denoted S_w, and SB the between-class
2972				scatter matrix, more commonly denoted S_b (the underscore means subscript). This
2973				measure can be thought of as the normalized average distance between the clusters,
2974				the normalization being provided by average cluster covariance SW^-1. Therefore, the
2975				larger the value of this quality measure, the better the separation between the
2976				clusters. Since this measure has its roots in the Fisher linear discriminant
2977				function, we incorporate the word C in the name of the quality measure.
2978				I When the
2979				clusters exhibit significant overlap, the numbers produced by this quality measure
2980				tend to be generally meaningless.
2981
2982				=head1 METHODS
2983
2984				The module provides the following methods for EM based
2985				clustering, for cluster visualization, for data
2986				visualization, and for the generation of data for testing a
2987				clustering algorithm:
2988
2989				=over
2990
2991				=item B
2992
2993				A call to C constructs a new instance of the
2994				C class. A typical form
2995				of this call when you want to use random option for seeding
2996				the algorithm is:
2997
2998				my $clusterer = Algorithm::ExpectationMaximization->new(
2999				datafile => $datafile,
3000				mask => $mask,
3001				K => 3,
3002				max_em_iterations => 300,
3003				seeding => 'random',
3004				terminal_output => 1,
3005				);
3006
3007				where C is the expected number of clusters and
3008				C the maximum number of EM iterations
3009				that you want to allow until convergence is achieved.
3010				Depending on your dataset and on the choice of the initial
3011				seeds, the actual number of iterations used could be as few
3012				as 10 and as many as reaching 300. The output produced by
3013				the algorithm shows the actual number of iterations used to
3014				arrive at convergence.
3015
3016				The data file supplied through the C option is
3017				expected to contain entries in the following format
3018
3019				c20 0 10.7087017086940 9.63528386251712 10.9512155258108 ...
3020				c7 0 12.8025925026787 10.6126270065785 10.5228482095349 ...
3021				b9 0 7.60118206283120 5.05889245193079 5.82841781759102 ...
3022				....
3023				....
3024
3025				where the first column contains the symbolic ID tag for each
3026				data record and the rest of the columns the numerical
3027				information. As to which columns are actually used for
3028				clustering is decided by the string value of the mask. For
3029				example, if we wanted to cluster on the basis of the entries
3030				in just the 3rd, the 4th, and the 5th columns above, the
3031				mask value would be C where the character C
3032				indicates that the ID tag is in the first column, the
3033				character C<0> that the second column is to be ignored, and
3034				the C<1>'s that follow that the 3rd, the 4th, and the 5th
3035				columns are to be used for clustering.
3036
3037				If instead of random seeding, you wish to use the kmeans
3038				based seeding, just replace the option C supplied
3039				for C by C. You can also do manual seeding
3040				by designating a specified set of data elements to serve as
3041				cluster seeds. The call to the constructor in this case
3042				looks like
3043
3044				my $clusterer = Algorithm::ExpectationMaximization->new(
3045				datafile => $datafile,
3046				mask => $mask,
3047				K => 3,
3048				max_em_iterations => 300,
3049				seeding => 'manual',
3050				seed_tags => ['a26', 'b53', 'c49'],
3051				terminal_output => 1,
3052				);
3053
3054				where the option C is set to an anonymous array
3055				of symbolic names associated with the data elements.
3056
3057				If you know the class priors, you can supply them through an
3058				additional option to the constructor that looks like
3059
3060				class_priors => [0.6, 0.2, 0.2],
3061
3062				for the case of C equal to 3. B
3063				be a useful thing to do only for the case of manual
3064				seeding.> If you go for manual seeding, the order in which
3065				the priors are expressed should correspond to the order of
3066				the manually chosen tags supplied through the C
3067				option.
3068
3069				Note that the parameter C is boolean; when
3070				not supplied in the call to C it defaults to 0. When
3071				set, this parameter displays useful information in the
3072				window of the terminal screen in which you invoke the
3073				algorithm.
3074
3075				=item B
3076
3077				$clusterer->read_data_from_file()
3078
3079				This is a required call after the constructor is invoked. As
3080				you would expect, this call reads in the data for
3081				clustering.
3082
3083				=item B
3084
3085				$clusterer->seed_the_clusters();
3086
3087				This is also a required call. It processes the option you
3088				supplied for C in the constructor call to choose
3089				the data elements for seeding the C clusters.
3090
3091				=item B
3092
3093				$clusterer->EM();
3094
3095				This is the workhorse of the module, as you would expect.
3096				The means, the covariances, and the priors estimated by this
3097				method are stored in instance variables that are subsequently
3098				accessed by other methods for the purpose of displaying the
3099				clusters, the probability distributions, etc.
3100
3101				=item B
3102
3103				$clusterer->run_bayes_classifier();
3104
3105				Using the posterior probability distributions estimated by
3106				the C method, this method partitions the data into the
3107				C disjoint clusters using the naive Bayes' classifier.
3108
3109				=item B
3110
3111				my $clusters = $clusterer->return_disjoint_clusters();
3112
3113				This allows you to access the clusters obtained with the
3114				application of the naive Bayes' classifier in your own
3115				scripts. If, say, you wanted to see the data records placed
3116				in each cluster, you could subsequently invoke the following
3117				loop in your own script:
3118
3119				foreach my $index (0..@$clusters-1) {
3120				print "Cluster $index (Naive Bayes): @{$clusters->[$index]}\n\n"
3121				}
3122
3123				where C<$clusters> holds the array reference returned by the
3124				call to C.
3125
3126				=item B
3127
3128				$clusterer->write_naive_bayes_clusters_to_files();
3129
3130				This method writes the clusters obtained by applying the
3131				naive Bayes' classifier to disk files, one cluster per
3132				file. What is written out to each file consists of the
3133				symbolic names of the data records that belong to the
3134				cluster corresponding to that file. The clusters are placed
3135				in files with names like
3136
3137				naive_bayes_cluster1.txt
3138				naive_bayes_cluster2.txt
3139				...
3140
3141				=item B
3142
3143				my $theta1 = 0.2;
3144				my $posterior_prob_clusters =
3145				$clusterer->return_clusters_with_posterior_probs_above_threshold($theta1);
3146
3147				This method returns a reference to an array of C
3148				anonymous arrays, each consisting of the symbolic names for
3149				the data records where the posterior class probability
3150				exceeds the threshold as specified by C<$theta1>.
3151				Subsequently, you can access each of these
3152				posterior-probability based clusters through a loop
3153				construct such as
3154
3155				foreach my $index (0..@$posterior_prob_clusters-1) {
3156				print "Cluster $index (based on posterior probs exceeding $theta1): " .
3157				"@{$posterior_prob_clusters->[$index]}\n\n"
3158				}
3159
3160				=item B
3161
3162				$clusterer->write_posterior_prob_clusters_above_threshold_to_files($theta1);
3163
3164				This call writes out the posterior-probability based soft
3165				clusters to disk files. As in the previous method, the
3166				threshold C<$theta1> sets the probability threshold for
3167				deciding which data elements belong to a cluster. These
3168				clusters are placed in files with names like
3169
3170				posterior_prob_cluster1.txt
3171				posterior_prob_cluster2.txt
3172				...
3173
3174				=item B
3175
3176				my $theta2 = 0.00001;
3177				my $class_distributions =
3178				$clusterer->return_individual_class_distributions_above_given_threshold($theta2);
3179
3180				This is the method to call if you wish to see the individual
3181				Gaussians in your own script. The method returns a reference
3182				to an array of anonymous arrays, with each anonymous array
3183				representing data membership in each Gaussian. Only those
3184				data points are included in each Gaussian where the
3185				probability exceeds the threshold C<$theta2>. Note that the
3186				larger the covariance and the higher the data
3187				dimensionality, the smaller this threshold must be for you
3188				to see any of the data points in a Gaussian. After you have
3189				accessed the Gaussian mixture in this manner, you can
3190				display the data membership in each Gaussian through the
3191				following sort of a loop:
3192
3193				foreach my $index (0..@$class_distributions-1) {
3194				print "Gaussian Distribution $index (only shows data elements " .
3195				"whose probabilities exceed the threshold $theta2: " .
3196				"@{$class_distributions->[$index]}\n\n"
3197				}
3198
3199				=item B
3200
3201				my $visualization_mask = "11";
3202				$clusterer->visualize_clusters($visualization_mask);
3203
3204				The visualization mask here does not have to be identical to
3205				the one used for clustering, but must be a subset of that
3206				mask. This is convenient for visualizing the clusters in
3207				two- or three-dimensional subspaces of the original space.
3208				The subset is specified by placing `0's in the positions
3209				corresponding to the dimensions you do NOT want to see
3210				through visualization. Depending on the mask used, this
3211				method creates a 2D or a 3D scatter plot of the clusters
3212				obtained through the naive Bayes' classification rule.
3213
3214				=item B
3215
3216				$clusterer->visualize_distributions($visualization_mask);
3217
3218				This is the method to call if you want to visualize the soft
3219				clustering corresponding to the posterior class
3220				probabilities exceeding the threshold specified in the call
3221				to
3222				C.
3223				Again, depending on the visualization mask used, you will
3224				see either a 2D plot or a 3D scatter plot.
3225
3226				=item B
3227
3228				$clusterer->plot_hardcopy_clusters($visualization_mask);
3229
3230				This method create a PNG file from the C created
3231				display of the naive Bayes' clusters obtained from the data.
3232				The plotting functionality of C is accessed through
3233				the Perl wrappers provided by the C
3234				module.
3235
3236				=item B
3237
3238				$clusterer->plot_hardcopy_distributions($visualization_mask);
3239
3240				This method create a PNG file from the C created
3241				display of the clusters that correspond to the posterior
3242				class probabilities exceeding a specified threshold. The
3243				plotting functionality of C is accessed through the
3244				Perl wrappers provided by the C module.
3245
3246				=item B
3247
3248				$clusterer->display_fisher_quality_vs_iterations();
3249
3250				This method measures the quality of clustering by
3251				calculating the normalized average squared distance between
3252				the cluster centers, the normalization being provided by the
3253				average cluster covariance. See the Description for further
3254				details. In general, this measure is NOT useful for
3255				overlapping clusters.
3256
3257				=item B
3258
3259				$clusterer->display_mdl_quality_vs_iterations();
3260
3261				At the end of each iteration, this method measures the
3262				quality of clustering my calculating its MDL (Minimum
3263				Description Length). As stated earlier in Description, the
3264				MDL measure is a difference of a log-likelihood term for all
3265				of the observed data and a model-complexity penalty term.
3266				The smaller the value returned by this method, the better
3267				the clustering.
3268
3269				=item B
3270
3271				my $estimated_priors = $clusterer->return_estimated_priors();
3272				print "Estimated class priors: @$estimated_priors\n";
3273
3274				This method can be used to access the final values of the
3275				class priors as estimated by the EM algorithm.
3276
3277				=item B
3278
3279				Algorithm::ExpectationMaximization->cluster_data_generator(
3280				input_parameter_file => $parameter_file,
3281				output_datafile => $out_datafile,
3282				total_number_of_data_points => 300
3283				);
3284
3285				for generating multivariate data for clustering if you wish to play with synthetic
3286				data for experimenting with the EM algorithm. The input parameter file must specify
3287				the priors to be used for the Gaussians, their means, and their covariance matrices.
3288				The format of the information contained in the parameter file must be as shown in the
3289				file C provided in the C directory. It will be easiest for you
3290				to just edit a copy of this file for your data generation needs. In addition to the
3291				format of the parameter file, the main constraint you need to observe in specifying
3292				the parameters is that the dimensionality of the covariance matrices must correspond
3293				to the dimensionality of the mean vectors. The multivariate random numbers are
3294				generated by calling the C module. As you would expect, this module
3295				requires that the covariance matrices you specify in your parameter file be symmetric
3296				and positive definite. Should the covariances in your parameter file not obey this
3297				condition, the C module will let you know.
3298
3299				=item B
3300
3301				$clusterer->visualize_data($data_visualization_mask);
3302
3303				This is the method to call if you want to visualize the data
3304				you plan to cluster with the EM algorithm. You'd need to
3305				specify argument mask in a manner similar to the
3306				visualization of the clusters, as explained earlier.
3307
3308				=item B
3309
3310				$clusterer->plot_hardcopy_data($data_visualization_mask);
3311
3312				This method creates a PNG file that can be used to print out
3313				a hardcopy of the data in different 2D and 3D subspaces of
3314				the data space. The visualization mask is used to select the
3315				subspace for the PNG image.
3316
3317				=back
3318
3319				=head1 HOW THE CLUSTERS ARE OUTPUT
3320
3321				This module produces two different types of clusters: the "hard" clusters and the
3322				"soft" clusters. The hard clusters correspond to the naive Bayes' classification of
3323				the data points on the basis of the Gaussian distributions and the class priors
3324				estimated by the EM algorithm. Such clusters partition the data into disjoint
3325				subsets. On the other hand, the soft clusters correspond to the posterior class
3326				probabilities calculated by the EM algorithm. A data element belongs to a cluster if
3327				its posterior probability for that Gaussian exceeds a threshold.
3328
3329				After the EM algorithm has finished running, the hard clusters are created by
3330				invoking the method C on an instance of the module and then
3331				made user-accessible by calling C. These clusters may
3332				then be displayed in a terminal window by dereferencing each element of the array
3333				whose reference is returned b C. The hard clusters can
3334				be written out to disk files by invoking C.
3335				This method writes out the clusters to files, one cluster per file. What is written
3336				out to each file consists of the symbolic names of the data records that belong to
3337				the cluster corresponding to that file. The clusters are placed in files with names
3338				like
3339
3340				naive_bayes_cluster1.txt
3341				naive_bayes_cluster2.txt
3342				...
3343
3344				The soft clusters on the other hand are created by calling
3345				C
3346				on an instance of the module, where the argument C<$theta1>
3347				is the threshold for deciding whether a data element belongs
3348				in a soft cluster. The posterior class probability at a
3349				data element must exceed the threshold for the element to
3350				belong to the corresponding cluster. The soft cluster can
3351				be written out to disk files by calling
3352				C.
3353				As with the hard clusters, each cluster is placed in a separate
3354				file. The filenames for such clusters look like:
3355
3356				posterior_prob_cluster1.txt
3357				posterior_prob_cluster2.txt
3358				...
3359
3360				=head1 WHAT IF THE NUMBER OF CLUSTERS IS UNKNOWN?
3361
3362				The module constructor requires that you supply a value for the parameter C, which
3363				is the number of clusters you expect to see in the data. But what if you do not have
3364				a good value for C? Note that it is possible to search for the best C by using
3365				the two clustering quality criteria included in the module. However, I have
3366				intentionally not yet incorporated that feature in the module because it slows down
3367				the execution of the code --- especially when the dimensionality of the data
3368				increases. However, nothing prevents you from writing a script --- along the lines
3369				of the five "canned_example" scripts in the C directory --- that would use
3370				the two clustering quality metrics for finding the best choice for C for a given
3371				dataset. Obviously, you will now have to incorporate the call to the constructor in
3372				a loop and check the value of the quality measures for each value of C.
3373
3374				=head1 SOME RESULTS OBTAINED WITH THIS MODULE
3375
3376				If you would like to see some results that have been obtained with this module, check
3377				out Section 7 of the report
3378				L.
3379
3380				=head1 THE C DIRECTORY
3381
3382				Becoming familiar with this directory should be your best
3383				strategy to become comfortable with this module (and its
3384				future versions). You are urged to start by executing the
3385				following five example scripts:
3386
3387				=over 16
3388
3389				=item I
3390
3391				This example applies the EM algorithm to the data contained in the datafile
3392				C. The mixture data in the file corresponds to three overlapping
3393				Gaussian components in a star-shaped pattern. The EM based clustering for this data
3394				is shown in the files C and
3395				C, the former displaying the hard clusters
3396				obtained by using the naive Bayes' classifier and the latter showing the soft
3397				clusters obtained on the basis of the posterior class probabilities at the data
3398				points.
3399
3400				=item I
3401
3402				The datafile used in this example is C. This mixture data
3403				corresponds to two well-separated relatively isotropic Gaussians. EM based clustering for this
3404				data is shown in the files C and
3405				C, the former displaying the hard clusters
3406				obtained by using the naive Bayes' classifier and the latter showing the soft
3407				clusters obtained by using the posterior class probabilities at the data points.
3408
3409				=item I
3410
3411				Like the first example, this example again involves three Gaussians, but now their
3412				means are not co-located. Additionally, we now seed the clusters manually by
3413				specifying three selected data points as the initial guesses for the cluster means.
3414				The datafile used for this example is C. The EM based clustering
3415				for this data is shown in the files C and
3416				C, the former displaying the hard clusters
3417				obtained by using the naive Bayes' classifier and the latter showing the soft
3418				clusters obtained on the basis of the posterior class probabilities at the data
3419				points.
3420
3421				=item I
3422
3423				Whereas the three previous examples demonstrated EM based clustering of 2D data, we
3424				now present an example of clustering in 3D. The datafile used in this example is
3425				C. This mixture data corresponds to three well-separated but highly
3426				anisotropic Gaussians. The EM derived clustering for this data is shown in the files
3427				C and C, the
3428				former displaying the hard clusters obtained by using the naive Bayes' classifier and
3429				the latter showing the soft clusters obtained on the basis of the posterior class
3430				probabilities at the data points.
3431
3432				You may also wish to run this example on the data in a CSV file in the C
3433				directory. The name of the file is C.
3434
3435				=item I
3436
3437				We again demonstrate clustering in 3D but now we have one Gaussian cluster that
3438				"cuts" through the other two Gaussian clusters. The datafile used in this example is
3439				C. The three Gaussians in this case are highly overlapping and
3440				highly anisotropic. The EM derived clustering for this data is shown in the files
3441				C and C, the
3442				former displaying the hard clusters obtained by using the naive Bayes' classifier and
3443				the latter showing the soft clusters obtained through the posterior class
3444				probabilities at the data points.
3445
3446				=item I
3447
3448				This example, added in Version 1.2, demonstrates the use of this module for 1-D data.
3449				In order to visualize the clusters for the 1-D case, we show them through their
3450				respective histograms. The datafile used in this example is C. The
3451				data consists of two overlapping Gaussians. The EM derived clustering for this data
3452				is shown in the files C and
3453				C, the former displaying the hard clusters
3454				obtained by using the naive Bayes' classifier and the latter showing the soft
3455				clusters obtained through the posterior class probabilities at the data points.
3456
3457				=back
3458
3459				Going through the six examples listed above will make you familiar with how to make
3460				the calls to the clustering and the visualization methods. The C directory
3461				also includes several parameter files with names like
3462
3463				param1.txt
3464				param2.txt
3465				param3.txt
3466				...
3467
3468				These were used to generate the synthetic data for which the results are shown in the
3469				C directory. Just make a copy of one of these files and edit it if you
3470				would like to generate your own multivariate data for clustering. Note that you can
3471				generate data with any dimensionality through appropriate entries in the parameter
3472				file.
3473
3474				=head1 CAVEATS
3475
3476				When you run the scripts in the C directory, your results will NOT always
3477				look like what I have shown in the PNG image files in the directory. As mentioned
3478				earlier in Description, the EM algorithm starting from randomly chosen initial
3479				guesses for the cluster means can get stuck in a local maximum.
3480
3481				That raises an interesting question of how one judges the correctness of clustering
3482				results when dealing with real experimental data. For real data, the best approach
3483				is to try the EM algorithm multiple times with all of the seeding options included in
3484				this module. It would be safe to say that, at least in low dimensional spaces and
3485				with sufficient data, a majority of your runs should yield "correct" results.
3486
3487				Also bear in mind that a pure Perl implementation is not meant for the clustering of
3488				very large data files. It is really designed more for researching issues related to
3489				EM based approaches to clustering.
3490
3491
3492				=head1 REQUIRED
3493
3494				This module requires the following three modules:
3495
3496				Math::Random
3497				Graphics::GnuplotIF
3498				Math::GSL::Matrix
3499
3500				the first for generating the multivariate random numbers, the second for the
3501				visualization of the clusters, and the last for access to the Perl wrappers for the
3502				GNU Scientific Library. The C module of this library is used for various
3503				algebraic operations on the covariance matrices of the Gaussians.
3504
3505				=head1 EXPORT
3506
3507				None by design.
3508
3509
3510				=head1 BUGS
3511
3512				Please notify the author if you encounter any bugs. When sending email, please place
3513				the string 'Algorithm EM' in the subject line.
3514
3515				=head1 INSTALLATION
3516
3517				Download the archive from CPAN in any directory of your choice. Unpack the archive
3518				with a command that on a Linux machine would look like:
3519
3520				tar zxvf Algorithm-ExpectationMaximization-1.22.tar.gz
3521
3522				This will create an installation directory for you whose name will be
3523				C. Enter this directory and execute the
3524				following commands for a standard install of the module if you have root privileges:
3525
3526				perl Makefile.PL
3527				make
3528				make test
3529				sudo make install
3530
3531				If you do not have root privileges, you can carry out a non-standard install the
3532				module in any directory of your choice by:
3533
3534				perl Makefile.PL prefix=/some/other/directory/
3535				make
3536				make test
3537				make install
3538
3539				With a non-standard install, you may also have to set your PERL5LIB environment
3540				variable so that this module can find the required other modules. How you do that
3541				would depend on what platform you are working on. In order to install this module in
3542				a Linux machine on which I use tcsh for the shell, I set the PERL5LIB environment
3543				variable by
3544
3545				setenv PERL5LIB /some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
3546
3547				If I used bash, I'd need to declare:
3548
3549				export PERL5LIB=/some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
3550
3551
3552				=head1 ACKNOWLEDGMENTS
3553
3554				Version 1.2 is a result of the feedback received from Paul
3555				May of University of Birmingham. Thanks, Paul!
3556
3557				=head1 AUTHOR
3558
3559				Avinash Kak, kak@purdue.edu
3560
3561				If you send email, please place the string "EM Algorithm" in your
3562				subject line to get past my spam filter.
3563
3564				=head1 COPYRIGHT
3565
3566				This library is free software; you can redistribute it and/or modify it under the
3567				same terms as Perl itself.
3568
3569				Copyright 2014 Avinash Kak
3570
3571				=cut
3572