File Coverage

blib/lib/Algorithm/ExpectationMaximization.pm
Criterion Covered Total %
statement 22 24 91.6
branch n/a
condition n/a
subroutine 8 8 100.0
pod n/a
total 30 32 93.7


line stmt bran cond sub pod time code
1             package Algorithm::ExpectationMaximization;
2              
3             #---------------------------------------------------------------------------
4             # Copyright (c) 2014 Avinash Kak. All rights reserved. This program is free
5             # software. You may modify and/or distribute it under the same terms as Perl itself.
6             # This copyright notice must remain attached to the file.
7             #
8             # Algorithm::ExpectationMaximization is a pure Perl implementation for
9             # Expectation-Maximization based clustering of multi-dimensional data that can be
10             # modeled as a Gaussian mixture.
11             # ---------------------------------------------------------------------------
12              
13 1     1   12560 use 5.10.0;
  1         2  
  1         33  
14 1     1   4 use strict;
  1         1  
  1         21  
15 1     1   2 use warnings;
  1         9  
  1         27  
16 1     1   4 use Carp;
  1         0  
  1         69  
17 1     1   5 use File::Basename;
  1         1  
  1         61  
18 1     1   506 use Math::Random;
  1         4765  
  1         105  
19 1     1   514 use Graphics::GnuplotIF;
  1         8517  
  1         44  
20 1     1   1004 use Math::GSL::Matrix;
  0            
  0            
21             use Scalar::Util 'blessed';
22              
23             our $VERSION = '1.21';
24              
25             # from perl docs:
26             my $_num_regex = '^[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?$';
27              
28             # Constructor:
29             sub new {
30             my ($class, %args) = @_;
31             my @params = keys %args;
32             croak "\nYou have used a wrong name for a keyword argument " .
33             "--- perhaps a misspelling\n"
34             if check_for_illegal_params(@params) == 0;
35             bless {
36             _datafile => $args{datafile} || croak("datafile required"),
37             _mask => $args{mask} || croak("mask required"),
38             _K => $args{K} || croak("number of clusters required"),
39             _terminal_output => $args{terminal_output} || 0,
40             _seeding => $args{seeding} || 'random',
41             _seed_tags => $args{seed_tags} || [],
42             _max_em_iterations=> $args{max_em_iterations} || 100,
43             _class_priors => $args{class_priors} || [],
44             _debug => $args{debug} || 0,
45             _N => 0,
46             _data => {},
47             _data_id_tags => [],
48             _clusters => [],
49             _cluster_centers => [],
50             _data_dimensions => 0,
51             _cluster_normalizers => [],
52             _cluster_means => [],
53             _cluster_covariances => [],
54             _class_labels_for_data => {},
55             _class_probs_at_each_data_point => {},
56             _expected_class_probs => {},
57             _old_priors => [],
58             _old_old_priors => [],
59             _fisher_quality_vs_iteration => [],
60             _mdl_quality_vs_iterations => [],
61             }, $class;
62             }
63              
64             sub read_data_from_file {
65             my $self = shift;
66             my $datafile = $self->{_datafile};
67             my $mask = $self->{_mask};
68             my @mask = split //, $mask;
69             $self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
70             print "data dimensionality: $self->{_data_dimensions} \n"
71             if $self->{_terminal_output};
72             open INPUT, $datafile
73             or die "unable to open file $datafile: $!";
74             chomp( my @raw_data = );
75             close INPUT;
76             # Transform strings into number data
77             foreach my $record (@raw_data) {
78             next unless $record;
79             next if $record =~ /^#/;
80             my @data_fields;
81             my @fields = split /\s+/, $record;
82             die "\nABORTED: Mask size does not correspond to row record size"
83             if $#fields != $#mask;
84             my $record_id;
85             foreach my $i (0..@fields-1) {
86             if ($mask[$i] eq '0') {
87             next;
88             } elsif ($mask[$i] eq 'N') {
89             $record_id = $fields[$i];
90             } elsif ($mask[$i] eq '1') {
91             push @data_fields, $fields[$i];
92             } else {
93             die "misformed mask for reading the data file";
94             }
95             }
96             my @nums = map {/$_num_regex/;$_} @data_fields;
97             $self->{_data}->{ $record_id } = \@nums;
98             }
99             my @all_data_ids = keys %{$self->{_data}};
100             $self->{_data_id_tags} = \@all_data_ids;
101             $self->{_N} = scalar @all_data_ids;
102             if ( defined($self->{_K}) && ($self->{_K} > 0) ) {
103             carp "\n\nWARNING: YOUR K VALUE IS TOO LARGE.\n The number of data " .
104             "points must satisfy the relation N > 2xK**2 where K is " .
105             "the number of clusters requested for the clusters to be " .
106             "meaningful $!"
107             if ( $self->{_N} < (2 * $self->{_K} ** 2) );
108             }
109             srand(123456789);
110             }
111              
112              
113             # This is the heart of the module --- in the sense that this method implements the EM
114             # algorithm for the estimating the parameters of a Gaussian mixture model for the
115             # data. In the implementation shown below, we declare convergence for the EM
116             # algorithm when the change in the class priors over three iterations falls below a
117             # threshold. The current value of this threshold, as can be seen in the function
118             # compare_array_floats(), is 0.00001.
119             sub EM {
120             my $self = shift;
121             $self->initialize_class_priors();
122             for (my $em_iteration=0; $em_iteration < $self->{_max_em_iterations};
123             $em_iteration++) {
124             if ($em_iteration == 0) {
125             print "\nSeeding the EM algorithm with:\n";
126             $self->display_seeding_stats();
127             print "\nFinished displaying the seeding information\n";
128             print "\nWill print out a dot for each iteration of EM:\n\n";
129             }
130             my $progress_indicator = $em_iteration % 5 == 0 ? $em_iteration : ".";
131             print $progress_indicator;
132             foreach my $data_id (@{$self->{_data_id_tags}}) {
133             $self->{_class_probs_at_each_data_point}->{$data_id} = [];
134             $self->{_expected_class_probs}->{$data_id} = [];
135             }
136             # Calculate prob(x | C_i) --- this is the prob of data point x as
137             # a member of class C_i. You must do this for all K classes:
138             foreach my $cluster_index(0..$self->{_K}-1) {
139             $self->find_prob_at_each_datapoint_for_given_mean_and_covar(
140             $self->{_cluster_means}->[$cluster_index],
141             $self->{_cluster_covariances}->[$cluster_index] );
142             }
143             $self->{_cluster_normalizers} = [];
144             if ($self->{_debug}) {
145             print "\n\nDisplaying prob of a data point vis-a-vis each class:\n\n";
146             foreach my $data_id (sort keys %{$self->{_data}}) {
147             my $class_probs_at_a_point =
148             $self->{_class_probs_at_each_data_point}->{$data_id};
149             print "Class probs for $data_id: @$class_probs_at_a_point\n"
150             }
151             }
152             # Calculate prob(C_i | x) which is the posterior prob of class
153             # considered as a r.v. to be C_i at a given point x. For a given
154             # x, the sum of such probabilities over all C_i must add up to 1:
155             $self->find_expected_classes_at_each_datapoint();
156             if ($self->{_debug}) {
157             print "\n\nDisplaying expected class probs at each data point:\n\n";
158             foreach my $data_id (sort keys %{$self->{_expected_class_probs}}) {
159             my $expected_classes_at_a_point =
160             $self->{_expected_class_probs}->{$data_id};
161             print "Expected classes $data_id: @$expected_classes_at_a_point\n";
162             }
163             }
164             # 1. UPDATE MEANS:
165             my @new_means;
166             foreach my $cluster_index(0..$self->{_K}-1) {
167             $new_means[$cluster_index] =
168             Math::GSL::Matrix->new($self->{_data_dimensions},1);
169             $new_means[$cluster_index]->zero();
170             foreach my $data_id (keys %{$self->{_data}}) {
171             my $data_record = $self->{_data}->{$data_id};
172             my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
173             $data_vec->set_col(0,$data_record);
174             $new_means[$cluster_index] +=
175             $self->{_expected_class_probs}->{$data_id}->[$cluster_index] *
176             $data_vec->copy();
177             $self->{_cluster_normalizers}->[$cluster_index] +=
178             $self->{_expected_class_probs}->{$data_id}->[$cluster_index];
179             }
180             $new_means[$cluster_index] *= 1.0 /
181             $self->{_cluster_normalizers}->[$cluster_index];
182             }
183             if ($self->{_debug}) {
184             foreach my $meanvec (@new_means) {
185             display_matrix("At EM Iteration $em_iteration, new mean vector is",
186             $meanvec);
187             }
188             }
189             $self->{_cluster_means} = \@new_means;
190             # 2. UPDATE COVARIANCES:
191             my @new_covariances;
192             foreach my $cluster_index(0..$self->{_K}-1) {
193             $new_covariances[$cluster_index] =
194             Math::GSL::Matrix->new($self->{_data_dimensions},
195             $self->{_data_dimensions});
196             $new_covariances[$cluster_index]->zero();
197             my $normalizer = 0;
198             foreach my $data_id (keys %{$self->{_data}}) {
199             my $data_record = $self->{_data}->{$data_id};
200             my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
201             $data_vec->set_col(0,$data_record);
202             my $mean_subtracted_data =
203             $data_vec - $self->{_cluster_means}->[$cluster_index];
204             my $outer_product = outer_product($mean_subtracted_data,
205             $mean_subtracted_data);
206             $new_covariances[$cluster_index] +=
207             $self->{_expected_class_probs}->{$data_id}->[$cluster_index] *
208             $outer_product;
209             }
210             $new_covariances[$cluster_index] *=
211             1.0 /
212             $self->{_cluster_normalizers}->[$cluster_index];
213             }
214             $self->{_cluster_covariances} = \@new_covariances;
215             # 3. UPDATE PRIORS:
216             $self->{_old_old_priors} = deep_copy_array( $self->{_old_priors} )
217             if @{$self->{_old_priors}} > 0;
218             $self->{_old_priors} = deep_copy_array( $self->{_class_priors} );
219             foreach my $cluster_index(0..$self->{_K}-1) {
220             $self->{_class_priors}->[$cluster_index] =
221             $self->{_cluster_normalizers}->[$cluster_index] / $self->{_N};
222             }
223             my @priors = @{$self->{_class_priors}};
224             print "\nUpdated priors: @priors\n\n\n" if $self->{_debug};
225             push @{$self->{_fisher_quality_vs_iteration}},
226             $self->clustering_quality_fisher();
227             push @{$self->{_mdl_quality_vs_iteration}}, $self->clustering_quality_mdl();
228             if ( ($em_iteration > 5 && $self->reached_convergence())
229             || ($em_iteration == $self->{_max_em_iterations} - 1) ) {
230             my @old_old_priors = @{$self->{_old_old_priors}};
231             my @old_priors = @{$self->{_old_priors}};
232             print "\n\nPrevious to previous priors: @old_old_priors\n";
233             print "Previous priors: @old_priors\n";
234             print "Current class priors: @{$self->{_class_priors}}\n";
235             print "\n\nCONVERGENCE ACHIEVED AT ITERATION $em_iteration\n\n"
236             if $em_iteration < $self->{_max_em_iterations} - 1;
237             last;
238             }
239             }
240             print "\n\n\n";
241             }
242              
243             sub reached_convergence {
244             my $self = shift;
245             return 1 if compare_array_floats($self->{_old_old_priors},
246             $self->{_old_priors})
247             &&
248             compare_array_floats($self->{_old_priors},
249             $self->{_class_priors});
250             return 0;
251             }
252              
253             # Classify the data into disjoint clusters using the Naive Bayes' classification:
254             sub run_bayes_classifier {
255             my $self = shift;
256             $self->classify_all_data_tuples_bayes($self->{_cluster_means},
257             $self->{_cluster_covariances});
258             }
259              
260             # Should NOT be called before run_bayes_classifier is run
261             sub return_disjoint_clusters {
262             my $self = shift;
263             return $self->{_clusters};
264             }
265              
266             sub return_clusters_with_posterior_probs_above_threshold {
267             my $self = shift;
268             my $theta = shift;
269             my @class_distributions;
270             foreach my $cluster_index (0..$self->{_K}-1) {
271             push @class_distributions, [];
272             }
273             foreach my $data_tag (@{$self->{_data_id_tags}}) {
274             foreach my $cluster_index (0..$self->{_K}-1) {
275             push @{$class_distributions[$cluster_index]}, $data_tag
276             if $self->{_expected_class_probs}->{$data_tag}->[$cluster_index]
277             > $theta;
278             }
279             }
280             return \@class_distributions;
281             }
282              
283             sub return_individual_class_distributions_above_given_threshold {
284             my $self = shift;
285             my $theta = shift;
286             my @probability_distributions;
287             foreach my $cluster_index (0..$self->{_K}-1) {
288             push @probability_distributions, [];
289             }
290             foreach my $cluster_index (0..$self->{_K}-1) {
291             my $mean_vec = $self->{_cluster_means}->[$cluster_index];
292             my $covar = $self->{_cluster_covariances}->[$cluster_index];
293             foreach my $data_id (keys %{$self->{_data}}) {
294             my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
295             $data_vec->set_col( 0, $self->{_data}->{$data_id});
296             my $datavec_minus_mean = $data_vec - $mean_vec;
297             display_matrix( "datavec minus mean is ", $datavec_minus_mean )
298             if $self->{_debug};
299             my $exponent = undef;
300             if ($self->{_data_dimensions} > 1) {
301             $exponent = -0.5 * vector_matrix_multiply(
302             transpose($datavec_minus_mean),
303             matrix_vector_multiply($covar->inverse(), $datavec_minus_mean ) );
304             } else {
305             my @var_inverse = $covar->inverse()->as_list;
306             my $var_inverse_val = $var_inverse[0];
307             my @data_minus_mean = $datavec_minus_mean->as_list;
308             my $data_minus_mean_val = $data_minus_mean[0];
309             $exponent = -0.5 * ($data_minus_mean_val ** 2) * $var_inverse_val;
310             }
311             print "\nThe value of the exponent is: $exponent\n\n" if $self->{_debug};
312             my $coefficient = 1.0 / \
313             ( (2 * $Math::GSL::Const::M_PI)**$self->{_data_dimensions} * sqrt($covar->det()) );
314             my $prob = $coefficient * exp($exponent);
315             push @{$probability_distributions[$cluster_index]}, $data_id
316             if $prob > $theta;
317             }
318             }
319             return \@probability_distributions;
320             }
321              
322             sub return_estimated_priors {
323             my $self = shift;
324             return $self->{_class_priors};
325             }
326              
327             # Calculates the MDL (Minimum Description Length) clustering criterion according to
328             # Rissanen. (J. Rissanen: "Modeling by Shortest Data Description," Automatica, 1978,
329             # and "A Universal Prior for Integers and Estimation by Minimum Description Length,"
330             # Annals of Statistics, 1983.) The MDL criterion is a difference of a log-likelihood
331             # term for all of the observed data and a model-complexity penalty term. In general,
332             # both the log-likelihood and the model-complexity terms increase as the number of
333             # clusters increases. The form of the MDL criterion used in the implementation below
334             # uses for the penalty term the Bayesian Information Criterion (BIC) of G. Schwartz,
335             # "Estimating the Dimensions of a Model," The Annals of Statistics, 1978. In
336             # general, the smaller the value of the MDL quality measure calculated below, the
337             # better the clustering of the data.
338             sub clustering_quality_mdl {
339             my $self = shift;
340             # Calculate the inverses of all of the covariance matrices in order to avoid
341             # having to calculate them repeatedly inside the inner 'foreach' loop in the
342             # main part of this method. Here we go:
343             my @covar_inverses;
344             foreach my $cluster_index (0..$self->{_K}-1) {
345             my $covar = $self->{_cluster_covariances}->[$cluster_index];
346             push @covar_inverses, $covar->inverse();
347             }
348             # For the clustering quality, first calculate the log-likelihood of all the
349             # observed data:
350             my $log_likelihood = 0;
351             foreach my $tag (@{$self->{_data_id_tags}}) {
352             my $likelihood_for_each_tag = 0;
353             foreach my $cluster_index (0..$self->{_K}-1) {
354             my $mean_vec = $self->{_cluster_means}->[$cluster_index];
355             my $covar = $self->{_cluster_covariances}->[$cluster_index];
356             my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
357             $data_vec->set_col( 0, $self->{_data}->{$tag});
358             my $datavec_minus_mean = $data_vec - $mean_vec;
359             my $exponent = undef;
360             if ($self->{_data_dimensions} > 1) {
361             $exponent = -0.5 * vector_matrix_multiply(
362             transpose($datavec_minus_mean),
363             matrix_vector_multiply($covar_inverses[$cluster_index],
364             $datavec_minus_mean ) );
365             } else {
366             my @var_inverse = $covar_inverses[$cluster_index]->as_list;
367             my $var_inverse_val = $var_inverse[0];
368             my @data_minus_mean = $datavec_minus_mean->as_list;
369             my $data_minus_mean_val = $data_minus_mean[0];
370             $exponent = -0.5 * ($data_minus_mean_val ** 2) * $var_inverse_val;
371             }
372             next if $covar->det() < 0;
373             my $coefficient = 1.0 /
374             ( (2 * $Math::GSL::Const::M_PI)**$self->{_data_dimensions}
375             * sqrt($covar->det()) );
376             my $prob = $coefficient * exp($exponent);
377             $likelihood_for_each_tag +=
378             $prob * $self->{_class_priors}->[$cluster_index];
379             }
380             $log_likelihood += log( $likelihood_for_each_tag );
381             }
382             # Now calculate the model complexity penalty. $L is the total number of
383             # parameters it takes to specify a mixture of K Gaussians. If d is the
384             # dimensionality of the data space, the covariance matrix of each Gaussian takes
385             # (d**2 -d)/2 + d = d(d+1)/2 parameters since this matrix must be symmetric. And
386             # then you need d mean value parameters, and one prior probability parameter
387             # for the Gaussian. So $L = K[1 + d + d(d+1)/2] - 1 where the final '1' that
388             # is subtracted is to account for the normalization on the class priors.
389             my $L = (0.5 * $self->{_K} *
390             ($self->{_data_dimensions}**2 + 3*$self->{_data_dimensions} + 2) ) - 1;
391             my $model_complexity_penalty = 0.5 * $L * log( $self->{_N} );
392             my $mdl_criterion = -1 * $log_likelihood + $model_complexity_penalty;
393             return $mdl_criterion;
394             }
395              
396             # For our second measure of clustering quality, we use `trace( SW^-1 . SB)' where SW
397             # is the within-class scatter matrix, more commonly denoted S_w, and SB the
398             # between-class scatter matrix, more commonly denoted S_b (the underscore means
399             # subscript). This measure can be thought of as the normalized average distance
400             # between the clusters, the normalization being provided by average cluster
401             # covariance SW^-1. Therefore, the larger the value of this quality measure, the
402             # better the separation between the clusters. Since this measure has its roots in
403             # the Fisher linear discriminant function, we incorporate the word 'fisher' in the
404             # name of the quality measure. Note that this measure is good only when the clusters
405             # are disjoint. When the clusters exhibit significant overlap, the numbers produced
406             # by this quality measure tend to be generally meaningless. As an extreme case,
407             # let's say your data was produced by a set of Gaussians, all with the same mean
408             # vector, but each with a distinct covariance. For this extreme case, this measure
409             # will produce a value close to zero --- depending on the accuracy with which the
410             # means are estimated --- even when your clusterer is doing a good job of identifying
411             # the individual clusters.
412             sub clustering_quality_fisher {
413             my $self = shift;
414             my @cluster_quality_indices;
415             my $fisher_trace = 0;
416             my $S_w =
417             Math::GSL::Matrix->new($self->{_data_dimensions}, $self->{_data_dimensions});
418             $S_w->zero;
419             my $S_b =
420             Math::GSL::Matrix->new($self->{_data_dimensions}, $self->{_data_dimensions});
421             $S_b->zero;
422             my $global_mean = Math::GSL::Matrix->new($self->{_data_dimensions},1);
423             $global_mean->zero;
424             foreach my $cluster_index(0..$self->{_K}-1) {
425             $global_mean = $self->{_class_priors}->[$cluster_index] *
426             $self->{_cluster_means}->[$cluster_index];
427             }
428             foreach my $cluster_index(0..$self->{_K}-1) {
429             $S_w += $self->{_cluster_covariances}->[$cluster_index] *
430             $self->{_class_priors}->[$cluster_index];
431             my $class_mean_minus_global_mean = $self->{_cluster_means}->[$cluster_index]
432             - $global_mean;
433             my $outer_product = outer_product( $class_mean_minus_global_mean,
434             $class_mean_minus_global_mean );
435             $S_b += $self->{_class_priors}->[$cluster_index] * $outer_product;
436             }
437             my $fisher = matrix_multiply($S_w->inverse, $S_b);
438             return $fisher unless defined blessed($fisher);
439             return matrix_trace($fisher);
440             }
441              
442             sub display_seeding_stats {
443             my $self = shift;
444             foreach my $cluster_index(0..$self->{_K}-1) {
445             print "\nSeeding for cluster $cluster_index:\n";
446             my $mean = $self->{_cluster_means}->[$cluster_index];
447             display_matrix("The mean is: ", $mean);
448             my $covariance = $self->{_cluster_covariances}->[$cluster_index];
449             display_matrix("The covariance is: ", $covariance);
450             }
451             }
452              
453             sub display_fisher_quality_vs_iterations {
454             my $self = shift;
455             print "\n\nFisher Quality vs. Iterations: " .
456             "@{$self->{_fisher_quality_vs_iteration}}\n\n";
457             }
458              
459             sub display_mdl_quality_vs_iterations {
460             my $self = shift;
461             print "\n\nMDL Quality vs. Iterations: @{$self->{_mdl_quality_vs_iteration}}\n\n";
462             }
463              
464             sub find_prob_at_each_datapoint_for_given_mean_and_covar {
465             my $self = shift;
466             my $mean_vec_ref = shift;
467             my $covar_ref = shift;
468             foreach my $data_id (keys %{$self->{_data}}) {
469             my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
470             $data_vec->set_col( 0, $self->{_data}->{$data_id});
471             if ($self->{_debug}) {
472             display_matrix("data vec in find prob function", $data_vec);
473             display_matrix("mean vec in find prob function", $mean_vec_ref);
474             display_matrix("covariance in find prob function", $covar_ref);
475             }
476             my $datavec_minus_mean = $data_vec - $mean_vec_ref;
477             display_matrix( "datavec minus mean is ", $datavec_minus_mean ) if $self->{_debug};
478             my $exponent = undef;
479             if ($self->{_data_dimensions} > 1) {
480             $exponent = -0.5 * vector_matrix_multiply( transpose($datavec_minus_mean),
481             matrix_vector_multiply( $covar_ref->inverse(), $datavec_minus_mean ) );
482             } elsif (defined blessed($covar_ref)) {
483             my @data_minus_mean = $datavec_minus_mean->as_list;
484             my $data_minus_mean_val = $data_minus_mean[0];
485             my @covar_as_matrix = $covar_ref->as_list;
486             my $covar_val = $covar_as_matrix[0];
487             $exponent = -0.5 * ($data_minus_mean_val ** 2) / $covar_val;
488             } else {
489             my @data_minus_mean = $datavec_minus_mean->as_list;
490             my $data_minus_mean_val = $data_minus_mean[0];
491             $exponent = -0.5 * ($data_minus_mean_val ** 2) / $covar_ref;
492             }
493             print "\nThe value of the exponent is: $exponent\n\n" if $self->{_debug};
494             my $coefficient = undef;
495             if ($self->{_data_dimensions} > 1) {
496             $coefficient = 1.0 / sqrt( ((2 * $Math::GSL::Const::M_PI) ** $self->{_data_dimensions}) *
497             $covar_ref->det()) ;
498             } elsif (!defined blessed($covar_ref)) {
499             $coefficient = 1.0 / sqrt(2 * $covar_ref * $Math::GSL::Const::M_PI);
500             } else {
501             my @covar_as_matrix = $covar_ref->as_list;
502             my $covar_val = $covar_as_matrix[0];
503             $coefficient = 1.0 / sqrt(2 * $covar_val * $Math::GSL::Const::M_PI);
504             }
505             my $prob = $coefficient * exp($exponent);
506             push @{$self->{_class_probs_at_each_data_point}->{$data_id}}, $prob;
507             }
508             }
509              
510             sub find_expected_classes_at_each_datapoint {
511             my $self = shift;
512             my @priors = @{$self->{_class_priors}};
513             foreach my $data_id (sort keys %{$self->{_class_probs_at_each_data_point}}) {
514             my $numerator =
515             vector_2_vector_multiply(
516             $self->{_class_probs_at_each_data_point}->{$data_id},
517             $self->{_class_priors} );
518             my $sum = 0;
519             foreach my $part (@$numerator) {
520             $sum += $part;
521             }
522             $self->{_expected_class_probs}->{$data_id} = [map $_/$sum, @{$numerator}];
523             }
524             }
525              
526             sub initialize_class_priors {
527             my $self = shift;
528             if (@{$self->{_class_priors}} == 0) {
529             my $prior = 1.0 / $self->{_K};
530             foreach my $class_index (0..$self->{_K}-1) {
531             push @{$self->{_class_priors}}, $prior;
532             }
533             }
534             die "Mismatch between number of values for class priors " .
535             "and the number of clusters expected"
536             unless @{$self->{_class_priors}} == $self->{_K};
537             my $sum = 0;
538             foreach my $prior (@{$self->{_class_priors}}) {
539             $sum += $prior;
540             }
541             die "Your priors in the constructor call do not add up to 1"
542             unless abs($sum - 1) < 0.001;
543             print "\nInitially assumed class priors are: @{$self->{_class_priors}}\n";
544             }
545              
546             sub estimate_class_priors {
547             my $self = shift;
548             foreach my $datatag (keys %{$self->{_data}}) {
549             my $class_label = $self->{_class_labels}->{$datatag};
550             $self->{_class_priors}[$class_label]++;
551             }
552             foreach my $prior (@{$self->{_class_priors}}) {
553             $prior /= $self->{_total_number_data_tuples};
554             }
555             foreach my $prior (@{$self->{_class_priors}}) {
556             print "class priors: @{$self->{_class_priors}}\n";
557             }
558             }
559              
560             sub classify_all_data_tuples_bayes {
561             my $self = shift;
562             my $mean_vecs_ref = shift;
563             my $covariances_ref = shift;
564             my @new_clusters;
565             foreach my $index (0..$self->{_K}-1) {
566             push @new_clusters, [];
567             }
568             foreach my $data_id (@{$self->{_data_id_tags}}) {
569             my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
570             $data_vec->set_col( 0, deep_copy_array($self->{_data}->{$data_id}));
571             my $cluster_index_for_tuple =
572             $self->classify_a_data_point_bayes($data_vec,
573             $mean_vecs_ref, $covariances_ref);
574             $self->{_class_labels}->{$data_id} = $cluster_index_for_tuple;
575             push @{$new_clusters[$cluster_index_for_tuple]}, $data_id;
576             }
577             $self->{_clusters} = \@new_clusters;
578             }
579              
580             sub classify_a_data_point_bayes {
581             my $self = shift;
582             my $data_vec = shift;
583             my $mean_vecs_ref = shift;
584             my $covariances_ref = shift;
585             my @cluster_mean_vecs = @$mean_vecs_ref;
586             my @cluster_covariances = @$covariances_ref;
587             my @log_likelihoods;
588             foreach my $cluster_index (0..@cluster_mean_vecs-1) {
589             my $mean = $cluster_mean_vecs[$cluster_index];
590             my $covariance = $cluster_covariances[$cluster_index];
591             my $datavec_minus_mean = $data_vec - $mean;
592             my $log_likely = undef;
593             if ($self->{_data_dimensions} > 1) {
594             $log_likely = -0.5 * vector_matrix_multiply(
595             transpose($datavec_minus_mean),
596             matrix_vector_multiply( $covariance->inverse(),
597             $datavec_minus_mean ) );
598             } else {
599              
600             my @data_minus_mean = $datavec_minus_mean->as_list;
601             my $data_minus_mean_val = $data_minus_mean[0];
602             my @covar_as_matrix = $covariance->as_list;
603             my $covar_val = $covar_as_matrix[0];
604             $log_likely = -0.5 * ($data_minus_mean_val ** 2) / $covar_val;
605              
606             }
607             my $posterior_log_likely = $log_likely +
608             log( $self->{_class_priors}[$cluster_index] );
609             push @log_likelihoods, $posterior_log_likely;
610             }
611             my ($minlikely, $maxlikely) = minmax(\@log_likelihoods);
612             my $cluster_index_for_data_point =
613             get_index_at_value( $maxlikely, \@log_likelihoods );
614             return $cluster_index_for_data_point;
615             }
616              
617             sub find_cluster_means_and_covariances {
618             my $clusters = shift;
619             my $data_dimensions = find_data_dimensionality($clusters);
620             my (@cluster_mean_vecs, @cluster_covariances);
621             foreach my $cluster_index (0..@$clusters-1) {
622             my ($num_rows,$num_cols) =
623             ($data_dimensions,scalar(@{$clusters->[$cluster_index]}));
624             print "\nFor cluster $cluster_index: rows: $num_rows and cols: $num_cols\n";
625             my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
626             my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
627             my $col_index = 0;
628             foreach my $ele (@{$clusters->[$cluster_index]}) {
629             $matrix->set_col($col_index++, $ele);
630             }
631             # display_matrix( "Displaying cluster matrix", $matrix );
632             foreach my $j (0..$num_cols-1) {
633             $mean_vec += $matrix->col($j);
634             }
635             $mean_vec *= 1.0 / $num_cols;
636             push @cluster_mean_vecs, $mean_vec;
637             display_matrix( "Displaying the mean vector", $mean_vec );
638             foreach my $j (0..$num_cols-1) {
639             my @new_col = ($matrix->col($j) - $mean_vec)->as_list;
640             $matrix->set_col($j, \@new_col);
641             }
642             # display_matrix("Displaying mean subtracted data as a matrix", $matrix );
643             my $transposed = transpose( $matrix );
644             # display_matrix("Displaying transposed matrix",$transposed);
645             my $covariance = matrix_multiply( $matrix, $transposed );
646             $covariance *= 1.0 / $num_cols;
647             push @cluster_covariances, $covariance;
648             display_matrix("Displaying the cluster covariance", $covariance );
649             }
650             return (\@cluster_mean_vecs, \@cluster_covariances);
651             }
652              
653             sub find_data_dimensionality {
654             my $clusters = shift;
655             my @first_cluster = @{$clusters->[0]};
656             my @first_data_element = @{$first_cluster[0]};
657             return scalar(@first_data_element);
658             }
659              
660             sub find_seed_centered_covariances {
661             my $self = shift;
662             my $seed_tags = shift;
663             my (@seed_mean_vecs, @seed_based_covariances);
664             foreach my $seed_tag (@$seed_tags) {
665             my ($num_rows,$num_cols) = ($self->{_data_dimensions}, $self->{_N});
666             my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
667             my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
668             $mean_vec->set_col(0, $self->{_data}->{$seed_tag});
669             push @seed_mean_vecs, $mean_vec;
670             display_matrix( "Displaying the seed mean vector", $mean_vec );
671             my $col_index = 0;
672             foreach my $tag (@{$self->{_data_id_tags}}) {
673             $matrix->set_col($col_index++, $self->{_data}->{$tag});
674             }
675             foreach my $j (0..$num_cols-1) {
676             my @new_col = ($matrix->col($j) - $mean_vec)->as_list;
677             $matrix->set_col($j, \@new_col);
678             }
679             my $transposed = transpose( $matrix );
680             my $covariance = matrix_multiply( $matrix, $transposed );
681             $covariance *= 1.0 / $num_cols;
682             push @seed_based_covariances, $covariance;
683             display_matrix("Displaying the seed covariance", $covariance )
684             if $self->{_debug};
685             }
686             return (\@seed_mean_vecs, \@seed_based_covariances);
687             }
688              
689             # The most popular seeding mode for EM is random. We include two other seeding modes
690             # --- kmeans and manual --- since they do produce good results for specialized cases.
691             # For example, when the clusters in your data are non-overlapping and not too
692             # anisotropic, the kmeans based seeding should work at least as well as the random
693             # seeding. In such cases --- AND ONLY IN SUCH CASES --- the kmeans based seeding has
694             # the advantage of avoiding the getting stuck in a local-maximum problem of the EM
695             # algorithm.
696             sub seed_the_clusters {
697             my $self = shift;
698             if ($self->{_seeding} eq 'random') {
699             my @covariances;
700             my @means;
701             my @all_tags = @{$self->{_data_id_tags}};
702             my @seed_tags;
703             foreach my $i (0..$self->{_K}-1) {
704             push @seed_tags, $all_tags[int rand( $self->{_N} )];
705             }
706             print "Random Seeding: Randomly selected seeding tags are @seed_tags\n\n";
707             my ($seed_means, $seed_covars) =
708             $self->find_seed_centered_covariances(\@seed_tags);
709             $self->{_cluster_means} = $seed_means;
710             $self->{_cluster_covariances} = $seed_covars;
711             } elsif ($self->{_seeding} eq 'kmeans') {
712             $self->kmeans();
713             my $clusters = $self->{_clusters};
714             my @dataclusters;
715             foreach my $index (0..@$clusters-1) {
716             push @dataclusters, [];
717             }
718             foreach my $cluster_index (0..$self->{_K}-1) {
719             foreach my $tag (@{$clusters->[$cluster_index]}) {
720             my $data = $self->{_data}->{$tag};
721             push @{$dataclusters[$cluster_index]}, deep_copy_array($data);
722             }
723             }
724             ($self->{_cluster_means}, $self->{_cluster_covariances}) =
725             find_cluster_means_and_covariances(\@dataclusters);
726             } elsif ($self->{_seeding} eq 'manual') {
727             die "You have not supplied the seeding tags for the option \"manual\""
728             unless @{$self->{_seed_tags}} > 0;
729             print "Manual Seeding: Seed tags are @{$self->{_seed_tags}}\n\n";
730             foreach my $tag (@{$self->{_seed_tags}}) {
731             die "invalid tag used for manual seeding"
732             unless exists $self->{_data}->{$tag};
733             }
734             my ($seed_means, $seed_covars) =
735             $self->find_seed_centered_covariances($self->{_seed_tags});
736             $self->{_cluster_means} = $seed_means;
737             $self->{_cluster_covariances} = $seed_covars;
738             } else {
739             die "Incorrect call syntax used. See documentation.";
740             }
741             }
742              
743             # This is the top-level method for kmeans based initialization of the EM
744             # algorithm. The means and the covariances returned by kmeans are used to seed the EM
745             # algorithm.
746             sub kmeans {
747             my $self = shift;
748             my $K = $self->{_K};
749             $self->cluster_for_fixed_K_single_smart_try($K);
750             if ((defined $self->{_clusters}) && (defined $self->{_cluster_centers})){
751             return ($self->{_clusters}, $self->{_cluster_centers});
752             } else {
753             die "kmeans clustering failed.";
754             }
755             }
756              
757             # Used by the kmeans algorithm for the initialization of the EM iterations. We do
758             # initial kmeans cluster seeding by subjecting the data to principal components
759             # analysis in order to discover the direction of maximum variance in the data space.
760             # Subsequently, we try to find the K largest peaks along this direction. The
761             # coordinates of these peaks serve as the seeds for the K clusters.
762             sub cluster_for_fixed_K_single_smart_try {
763             my $self = shift;
764             my $K = shift;
765             print "Clustering for K = $K\n" if $self->{_terminal_output};
766             my ($clusters, $cluster_centers) =
767             $self->cluster_for_given_K($K);
768             $self->{_clusters} = $clusters;
769             $self->{_cluster_centers} = $cluster_centers;
770             }
771              
772             # Used by the kmeans part of the code for the initialization of the EM algorithm:
773             sub cluster_for_given_K {
774             my $self = shift;
775             my $K = shift;
776             my $cluster_centers = $self->get_initial_cluster_centers($K);
777             my $clusters = $self->assign_data_to_clusters_initial($cluster_centers);
778             my $cluster_nonexistant_flag = 0;
779             foreach my $trial (0..2) {
780             ($clusters, $cluster_centers) =
781             $self->assign_data_to_clusters( $clusters, $K );
782             my $num_of_clusters_returned = @$clusters;
783             foreach my $cluster (@$clusters) {
784             $cluster_nonexistant_flag = 1 if ((!defined $cluster)
785             || (@$cluster == 0));
786             }
787             last unless $cluster_nonexistant_flag;
788             }
789             return ($clusters, $cluster_centers);
790             }
791              
792             # Used by the kmeans part of the code for the initialization of the EM algorithm:
793             sub get_initial_cluster_centers {
794             my $self = shift;
795             my $K = shift;
796             if ($self->{_data_dimensions} == 1) {
797             my @one_d_data;
798             foreach my $j (0..$self->{_N}-1) {
799             my $tag = $self->{_data_id_tags}[$j];
800             push @one_d_data, $self->{_data}->{$tag}->[0];
801             }
802             my @peak_points =
803             find_peak_points_in_given_direction(\@one_d_data,$K);
804             print "highest points at data values: @peak_points\n"
805             if $self->{_debug};
806             my @cluster_centers;
807             foreach my $peakpoint (@peak_points) {
808             push @cluster_centers, [$peakpoint];
809             }
810             return \@cluster_centers;
811             }
812             my ($num_rows,$num_cols) = ($self->{_data_dimensions},$self->{_N});
813             my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
814             my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
815             # All the record labels are stored in the array $self->{_data_id_tags}.
816             # The actual data for clustering is stored in a hash at $self->{_data}
817             # whose keys are the record labels; the value associated with each
818             # key is the array holding the corresponding numerical multidimensional
819             # data.
820             foreach my $j (0..$num_cols-1) {
821             my $tag = $self->{_data_id_tags}[$j];
822             my $data = $self->{_data}->{$tag};
823             $matrix->set_col($j, $data);
824             }
825             if ($self->{_debug}) {
826             print "\nDisplaying the original data as a matrix:";
827             display_matrix( $matrix );
828             }
829             foreach my $j (0..$num_cols-1) {
830             $mean_vec += $matrix->col($j);
831             }
832             $mean_vec *= 1.0 / $num_cols;
833             if ($self->{_debug}) {
834             print "Displaying the mean vector for the data:";
835             display_matrix( $mean_vec );
836             }
837             foreach my $j (0..$num_cols-1) {
838             my @new_col = ($matrix->col($j) - $mean_vec)->as_list;
839             $matrix->set_col($j, \@new_col);
840             }
841             if ($self->{_debug}) {
842             print "Displaying mean subtracted data as a matrix:";
843             display_matrix( $matrix );
844             }
845             my $transposed = transpose( $matrix );
846             if ($self->{_debug}) {
847             print "Displaying transposed data matrix:";
848             display_matrix( $transposed );
849             }
850             my $covariance = matrix_multiply( $matrix, $transposed );
851             $covariance *= 1.0 / $num_cols;
852             if ($self->{_debug}) {
853             print "\nDisplaying the Covariance Matrix for your data:";
854             display_matrix( $covariance );
855             }
856             my ($eigenvalues, $eigenvectors) = $covariance->eigenpair;
857             my $num_of_eigens = @$eigenvalues;
858             my $largest_eigen_index = 0;
859             my $smallest_eigen_index = 0;
860             print "Eigenvalue 0: $eigenvalues->[0]\n" if $self->{_debug};
861             foreach my $i (1..$num_of_eigens-1) {
862             $largest_eigen_index = $i if $eigenvalues->[$i] >
863             $eigenvalues->[$largest_eigen_index];
864             $smallest_eigen_index = $i if $eigenvalues->[$i] <
865             $eigenvalues->[$smallest_eigen_index];
866             print "Eigenvalue $i: $eigenvalues->[$i]\n" if $self->{_debug};
867             }
868             print "\nlargest eigen index: $largest_eigen_index\n" if $self->{_debug};
869             print "\nsmallest eigen index: $smallest_eigen_index\n\n"
870             if $self->{_debug};
871             foreach my $i (0..$num_of_eigens-1) {
872             my @vec = $eigenvectors->[$i]->as_list;
873             print "Eigenvector $i: @vec\n" if $self->{_debug};
874             }
875             my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list;
876             print "\nLargest eigenvector: @largest_eigen_vec\n" if $self->{_debug};
877             my @max_var_direction;
878             # Each element of the array @largest_eigen_vec is a Math::Complex object
879             foreach my $k (0..@largest_eigen_vec-1) {
880             my ($mag, $theta) = $largest_eigen_vec[$k] =~ /\[(\d*\.\d+),(\S+)\]/;
881             if ($theta eq '0') {
882             $max_var_direction[$k] = $mag;
883             } elsif ($theta eq 'pi') {
884             $max_var_direction[$k] = -1.0 * $mag;
885             } else {
886             die "eigendecomposition of covariance matrix produced a complex eigenvector --- something is wrong";
887             }
888             }
889             # "Maximum variance direction: @max_var_direction
890             print "\nMaximum Variance Direction: @max_var_direction\n\n"
891             if $self->{_debug};
892             # We now project all data points on the largest eigenvector.
893             # Each projection will yield a single point on the eigenvector.
894             my @projections;
895             foreach my $j (0..$self->{_N}-1) {
896             my $tag = $self->{_data_id_tags}[$j];
897             my $data = $self->{_data}->{$tag};
898             die "Dimensionality of the largest eigenvector does not "
899             . "match the dimensionality of the data"
900             unless @max_var_direction == $self->{_data_dimensions};
901             my $projection = vector_multiply($data, \@max_var_direction);
902             push @projections, $projection;
903             }
904             print "All projection points: @projections\n" if $self->{_debug};
905             my @peak_points = find_peak_points_in_given_direction(\@projections, $K);
906             print "highest points at points along largest eigenvec: @peak_points\n"
907             if $self->{_debug};
908             my @cluster_centers;
909             foreach my $peakpoint (@peak_points) {
910             my @actual_peak_coords = map {$peakpoint * $_} @max_var_direction;
911             push @cluster_centers, \@actual_peak_coords;
912             }
913             return \@cluster_centers;
914             }
915              
916             # Used by the kmeans part of the code: This method is called by the previous method
917             # to locate K peaks in a smoothed histogram of the data points projected onto the
918             # maximal variance direction.
919             sub find_peak_points_in_given_direction {
920             my $dataref = shift;
921             my $how_many = shift;
922             my @data = @$dataref;
923             my ($min, $max) = minmax(\@data);
924             my $num_points = @data;
925             my @sorted_data = sort {$a <=> $b} @data;
926             #print "\n\nSorted data: @sorted_data\n";
927             my $scale = $max - $min;
928             foreach my $index (0..$#sorted_data-1) {
929             $sorted_data[$index] = ($sorted_data[$index] - $min) / $scale;
930             }
931             my $avg_diff = 0;
932             foreach my $index (0..$#sorted_data-1) {
933             my $diff = $sorted_data[$index+1] - $sorted_data[$index];
934             $avg_diff += ($diff - $avg_diff) / ($index + 1);
935             }
936             my $delta = 1.0 / 1000.0;
937             # It would be nice to set the delta adaptively, but I must
938             # change the number of cells in the next foreach loop accordingly
939             # my $delta = $avg_diff / 20;
940             my @accumulator = (0) x 1000;
941             foreach my $index (0..@sorted_data-1) {
942             my $cell_index = int($sorted_data[$index] / $delta);
943             my $smoothness = 40;
944             for my $index ($cell_index-$smoothness..$cell_index+$smoothness) {
945             next if $index < 0 || $index > 999;
946             $accumulator[$index]++;
947             }
948             }
949             my $peaks_array = non_maximum_supression( \@accumulator );
950             my $peaks_index_hash = get_value_index_hash( $peaks_array );
951             my @K_highest_peak_locations;
952             my $k = 0;
953             foreach my $peak (sort {$b <=> $a} keys %$peaks_index_hash) {
954             my $unscaled_peak_point =
955             $min + $peaks_index_hash->{$peak} * $scale * $delta;
956             push @K_highest_peak_locations, $unscaled_peak_point
957             if $k < $how_many;
958             last if ++$k == $how_many;
959             }
960             return @K_highest_peak_locations;
961             }
962              
963             # Used by the kmeans part of the code: The purpose of this routine is to form initial
964             # clusters by assigning the data samples to the initial clusters formed by the
965             # previous routine on the basis of the best proximity of the data samples to the
966             # different cluster centers.
967             sub assign_data_to_clusters_initial {
968             my $self = shift;
969             my @cluster_centers = @{ shift @_ };
970             my @clusters;
971             foreach my $ele (@{$self->{_data_id_tags}}) {
972             my $best_cluster;
973             my @dist_from_clust_centers;
974             foreach my $center (@cluster_centers) {
975             push @dist_from_clust_centers, $self->distance($ele, $center);
976             }
977             my ($min, $best_center_index) = minimum( \@dist_from_clust_centers );
978             push @{$clusters[$best_center_index]}, $ele;
979             }
980             return \@clusters;
981             }
982              
983             # Used by the kmeans part of the code: This is the main routine that along with the
984             # update_cluster_centers() routine constitute the two key steps of the K-Means
985             # algorithm. In most cases, the infinite while() loop will terminate automatically
986             # when the cluster assignments of the data points remain unchanged. For the sake of
987             # safety, we keep track of the number of iterations. If this number reaches 100, we
988             # exit the while() loop anyway. In most cases, this limit will not be reached.
989             sub assign_data_to_clusters {
990             my $self = shift;
991             my $clusters = shift;
992             my $K = shift;
993             my $final_cluster_centers;
994             my $iteration_index = 0;
995             while (1) {
996             my $new_clusters;
997             my $assignment_changed_flag = 0;
998             my $current_cluster_center_index = 0;
999             my $cluster_size_zero_condition = 0;
1000             my $how_many = @$clusters;
1001             my $cluster_centers = $self->update_cluster_centers(
1002             deep_copy_AoA_with_nulls( $clusters ) );
1003             $iteration_index++;
1004             foreach my $cluster (@$clusters) {
1005             my $current_cluster_center =
1006             $cluster_centers->[$current_cluster_center_index];
1007             foreach my $ele (@$cluster) {
1008             my @dist_from_clust_centers;
1009             foreach my $center (@$cluster_centers) {
1010             push @dist_from_clust_centers,
1011             $self->distance($ele, $center);
1012             }
1013             my ($min, $best_center_index) =
1014             minimum( \@dist_from_clust_centers );
1015             my $best_cluster_center =
1016             $cluster_centers->[$best_center_index];
1017             if (vector_equal($current_cluster_center,
1018             $best_cluster_center)){
1019             push @{$new_clusters->[$current_cluster_center_index]},
1020             $ele;
1021             } else {
1022             $assignment_changed_flag = 1;
1023             push @{$new_clusters->[$best_center_index]}, $ele;
1024             }
1025             }
1026             $current_cluster_center_index++;
1027             }
1028             # Now make sure that we still have K clusters since K is fixed:
1029             next if ((@$new_clusters != @$clusters) && ($iteration_index < 100));
1030             # Now make sure that none of the K clusters is an empty cluster:
1031             foreach my $newcluster (@$new_clusters) {
1032             $cluster_size_zero_condition = 1 if ((!defined $newcluster)
1033             or (@$newcluster == 0));
1034             }
1035             push @$new_clusters, (undef) x ($K - @$new_clusters)
1036             if @$new_clusters < $K;
1037             my $largest_cluster;
1038             foreach my $local_cluster (@$new_clusters) {
1039             next if !defined $local_cluster;
1040             $largest_cluster = $local_cluster if !defined $largest_cluster;
1041             if (@$local_cluster > @$largest_cluster) {
1042             $largest_cluster = $local_cluster;
1043             }
1044             }
1045             foreach my $local_cluster (@$new_clusters) {
1046             if ( (!defined $local_cluster) || (@$local_cluster == 0) ) {
1047             push @$local_cluster, pop @$largest_cluster;
1048             }
1049             }
1050             next if (($cluster_size_zero_condition) && ($iteration_index < 100));
1051             last if $iteration_index == 100;
1052             # Now do a deep copy of new_clusters into clusters
1053             $clusters = deep_copy_AoA( $new_clusters );
1054             last if $assignment_changed_flag == 0;
1055             }
1056             $final_cluster_centers = $self->update_cluster_centers( $clusters );
1057             return ($clusters, $final_cluster_centers);
1058             }
1059              
1060             # Used by the kmeans part of the code: After each new assignment of the data points
1061             # to the clusters on the basis of the current values for the cluster centers, we call
1062             # the routine shown here for updating the values of the cluster centers.
1063             sub update_cluster_centers {
1064             my $self = shift;
1065             my @clusters = @{ shift @_ };
1066             my @new_cluster_centers;
1067             my $largest_cluster;
1068             foreach my $cluster (@clusters) {
1069             next if !defined $cluster;
1070             $largest_cluster = $cluster if !defined $largest_cluster;
1071             if (@$cluster > @$largest_cluster) {
1072             $largest_cluster = $cluster;
1073             }
1074             }
1075             foreach my $cluster (@clusters) {
1076             if ( (!defined $cluster) || (@$cluster == 0) ) {
1077             push @$cluster, pop @$largest_cluster;
1078             }
1079             }
1080             foreach my $cluster (@clusters) {
1081             die "Cluster became empty --- untenable condition " .
1082             "for a given K. Try again. " if !defined $cluster;
1083             my $cluster_size = @$cluster;
1084             die "Cluster size is zero --- untenable." if $cluster_size == 0;
1085             my @new_cluster_center = @{$self->add_point_coords( $cluster )};
1086             @new_cluster_center = map {my $x = $_/$cluster_size; $x}
1087             @new_cluster_center;
1088             push @new_cluster_centers, \@new_cluster_center;
1089             }
1090             return \@new_cluster_centers;
1091             }
1092              
1093             # The following routine is for computing the distance between a data point specified
1094             # by its symbolic name in the master datafile and a point (such as the center of a
1095             # cluster) expressed as a vector of coordinates:
1096             sub distance {
1097             my $self = shift;
1098             my $ele1_id = shift @_; # symbolic name of data sample
1099             my @ele1 = @{$self->{_data}->{$ele1_id}};
1100             my @ele2 = @{shift @_};
1101             die "wrong data types for distance calculation" if @ele1 != @ele2;
1102             my $how_many = @ele1;
1103             my $squared_sum = 0;
1104             foreach my $i (0..$how_many-1) {
1105             $squared_sum += ($ele1[$i] - $ele2[$i])**2;
1106             }
1107             my $dist = sqrt $squared_sum;
1108             return $dist;
1109             }
1110              
1111             # The following routine does the same as above but now both
1112             # arguments are expected to be arrays of numbers:
1113             sub distance2 {
1114             my $self = shift;
1115             my @ele1 = @{shift @_};
1116             my @ele2 = @{shift @_};
1117             die "wrong data types for distance calculation" if @ele1 != @ele2;
1118             my $how_many = @ele1;
1119             my $squared_sum = 0;
1120             foreach my $i (0..$how_many-1) {
1121             $squared_sum += ($ele1[$i] - $ele2[$i])**2;
1122             }
1123             return sqrt $squared_sum;
1124             }
1125              
1126             sub write_naive_bayes_clusters_to_files {
1127             my $self = shift;
1128             my @clusters = @{$self->{_clusters}};
1129             unlink glob "naive_bayes_cluster*.dat";
1130             foreach my $i (1..@clusters) {
1131             my $filename = "naive_bayes_cluster" . $i . ".dat";
1132             print "Writing cluster $i to file $filename\n"
1133             if $self->{_terminal_output};
1134             open FILEHANDLE, "| sort > $filename" or die "Unable to open file: $!";
1135             foreach my $ele (@{$clusters[$i-1]}) {
1136             print FILEHANDLE "$ele\n";
1137             }
1138             close FILEHANDLE;
1139             }
1140             }
1141              
1142             sub write_posterior_prob_clusters_above_threshold_to_files {
1143             my $self = shift;
1144             my $theta = shift;
1145             my @class_distributions;
1146             foreach my $cluster_index (0..$self->{_K}-1) {
1147             push @class_distributions, [];
1148             }
1149             foreach my $data_tag (@{$self->{_data_id_tags}}) {
1150             foreach my $cluster_index (0..$self->{_K}-1) {
1151             push @{$class_distributions[$cluster_index]}, $data_tag
1152             if $self->{_expected_class_probs}->{$data_tag}->[$cluster_index]
1153             > $theta;
1154             }
1155             }
1156             unlink glob "posterior_prob_cluster*.dat";
1157             foreach my $i (1..@class_distributions) {
1158             my $filename = "posterior_prob_cluster" . $i . ".dat";
1159             print "Writing posterior prob cluster $i to file $filename\n"
1160             if $self->{_terminal_output};
1161             open FILEHANDLE, "| sort > $filename" or die "Unable to open file: $!";
1162             foreach my $ele (@{$class_distributions[$i-1]}) {
1163             print FILEHANDLE "$ele\n";
1164             }
1165             close FILEHANDLE;
1166             }
1167             }
1168              
1169             sub DESTROY {
1170             unlink "__temp_" . basename($_[0]->{_datafile});
1171             unlink "__temp_data_" . basename($_[0]->{_datafile});
1172             unlink "__temp2_" . basename($_[0]->{_datafile});
1173             unlink glob "__temp1dhist*";
1174             unlink glob "__contour*";
1175             }
1176              
1177             ############################# Visualization Code ###############################
1178              
1179             # The visualize_clusters() implementation displays as a plot in your terminal window
1180             # the clusters constructed by the EM algorithm. It can show either 2D plots or
1181             # 3D plots that you can rotate interactively for better visualization. For
1182             # multidimensional data, as to which 2D or 3D dimensions are used for visualization
1183             # is controlled by the mask you must supply as an argument to the method. Should it
1184             # happen that only one on bit is specified for the mask, visualize_clusters()
1185             # aborts.
1186             #
1187             # The visualization code consists of first accessing each of clusters created by the
1188             # EM() subroutine. Note that the clusters contain only the symbolic names for the
1189             # individual records in the source data file. We therefore next reach into the
1190             # $self->{_data} hash and get the data coordinates associated with each symbolic
1191             # label in a cluster. The numerical data thus generated is then written out to a
1192             # temp file. When doing so we must remember to insert TWO BLANK LINES between the
1193             # data blocks corresponding to the different clusters. This constraint is imposed
1194             # on us by Gnuplot when plotting data from the same file since we want to use
1195             # different point styles for the data points in different cluster files.
1196             # Subsequently, we call upon the Perl interface provided by the Graphics::GnuplotIF
1197             # module to plot the data clusters.
1198             sub visualize_clusters {
1199             my $self = shift;
1200             my $v_mask;
1201             my $pause_time;
1202             if (@_ == 1) {
1203             $v_mask = shift || die "visualization mask missing";
1204             } elsif (@_ == 2) {
1205             $v_mask = shift || die "visualization mask missing";
1206             $pause_time = shift;
1207             } else {
1208             die "visualize_clusters() called with wrong args";
1209             }
1210             my $master_datafile = $self->{_datafile};
1211             my @v_mask = split //, $v_mask;
1212             my $visualization_mask_width = @v_mask;
1213             my $original_data_mask = $self->{_mask};
1214             my @mask = split //, $original_data_mask;
1215             my $data_field_width = scalar grep {$_ eq '1'} @mask;
1216             die "\n\nABORTED: The width of the visualization mask (including " .
1217             "all its 1s and 0s) must equal the width of the original mask " .
1218             "used for reading the data file (counting only the 1's)"
1219             if $visualization_mask_width != $data_field_width;
1220             my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
1221             # The following section is for the superimposed one-Mahalanobis-distance-unit
1222             # ellipses that are shown only for 2D plots:
1223             if ($visualization_data_field_width == 2) {
1224             foreach my $cluster_index (0..$self->{_K}-1) {
1225             my $contour_filename = "__contour_" . $cluster_index . ".dat";
1226             my $mean = $self->{_cluster_means}->[$cluster_index];
1227             my $covariance = $self->{_cluster_covariances}->[$cluster_index];
1228             my ($mux,$muy) = $mean->as_list();
1229             my ($varx,$sigmaxy) = $covariance->row(0)->as_list();
1230             my ($sigmayx,$vary) = $covariance->row(1)->as_list();
1231             die "Your covariance matrix does not look right"
1232             unless $sigmaxy == $sigmayx;
1233             my ($sigmax,$sigmay) = (sqrt($varx),sqrt($vary));
1234             my $argstring = <<"END";
1235             set contour
1236             mux = $mux
1237             muy = $muy
1238             sigmax = $sigmax
1239             sigmay = $sigmay
1240             sigmaxy = $sigmaxy
1241             determinant = (sigmax**2)*(sigmay**2) - sigmaxy**2
1242             exponent(x,y) = -0.5 * (1.0 / determinant) * ( ((x-mux)**2)*sigmay**2 + ((y-muy)**2)*sigmax**2 - 2*sigmaxy*(x-mux)*(y-muy) )
1243             f(x,y) = exp( exponent(x,y) ) - 0.2
1244             xmax = mux + 2 * sigmax
1245             xmin = mux - 2 * sigmax
1246             ymax = muy + 2 * sigmay
1247             ymin = muy - 2 * sigmay
1248             set xrange [ xmin : xmax ]
1249             set yrange [ ymin : ymax ]
1250             set isosamples 200
1251             unset surface
1252             set cntrparam levels discrete 0
1253             set table \"$contour_filename\"
1254             splot f(x,y)
1255             unset table
1256             END
1257             my $plot = Graphics::GnuplotIF->new();
1258             $plot->gnuplot_cmd( $argstring );
1259             }
1260             }
1261             my %visualization_data;
1262             while ( my ($record_id, $data) = each %{$self->{_data}} ) {
1263             my @fields = @$data;
1264             die "\nABORTED: Visualization mask size exceeds data record size"
1265             if $#v_mask > $#fields;
1266             my @data_fields;
1267             foreach my $i (0..@fields-1) {
1268             if ($v_mask[$i] eq '0') {
1269             next;
1270             } elsif ($v_mask[$i] eq '1') {
1271             push @data_fields, $fields[$i];
1272             } else {
1273             die "Misformed visualization mask. It can only have 1s and 0s";
1274             }
1275             }
1276             $visualization_data{ $record_id } = \@data_fields;
1277             }
1278             my $K = scalar @{$self->{_clusters}};
1279             my $filename = basename($master_datafile);
1280             my $temp_file = "__temp_" . $filename;
1281             unlink $temp_file if -e $temp_file;
1282             open OUTPUT, ">$temp_file"
1283             or die "Unable to open a temp file in this directory: $!";
1284             foreach my $cluster (@{$self->{_clusters}}) {
1285             foreach my $item (@$cluster) {
1286             print OUTPUT "@{$visualization_data{$item}}";
1287             print OUTPUT "\n";
1288             }
1289             print OUTPUT "\n\n";
1290             }
1291             close OUTPUT;
1292             my $plot;
1293             if (!defined $pause_time) {
1294             $plot = Graphics::GnuplotIF->new( persist => 1 );
1295             } else {
1296             $plot = Graphics::GnuplotIF->new();
1297             }
1298             my $arg_string = "";
1299             if ($visualization_data_field_width > 2) {
1300             $plot->gnuplot_cmd("set noclip");
1301             $plot->gnuplot_cmd("set pointsize 2");
1302             foreach my $i (0..$K-1) {
1303             my $j = $i + 1;
1304             $arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster (naive Bayes) $i\" with points lt $j pt $j, ";
1305             }
1306             } elsif ($visualization_data_field_width == 2) {
1307             $plot->gnuplot_cmd("set noclip");
1308             $plot->gnuplot_cmd("set pointsize 2");
1309             foreach my $i (0..$K-1) {
1310             my $j = $i + 1;
1311             $arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster (naive Bayes) $i\" with points lt $j pt $j, ";
1312             my $ellipse_filename = "__contour_" . $i . ".dat";
1313             $arg_string .= "\"$ellipse_filename\" with line lt $j title \"\", ";
1314             }
1315             } elsif ($visualization_data_field_width == 1 ) {
1316             open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
1317             my @all_data = ;
1318             close INPUT;
1319             @all_data = map {chomp $_; $_ =~ /\d/ ? $_ : "SEPERATOR" } @all_data;
1320             my $all_joined_data = join ':', @all_data;
1321             my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data;
1322             my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals);
1323             foreach my $i (0..@separated-1) {
1324             $separated[$i] =~ s/SEPERATOR//g;
1325             my @cluster_for_hist = split /:/, $separated[$i];
1326             @cluster_for_hist = grep $_, @cluster_for_hist;
1327             my ($minval,$maxval) = minmax(\@cluster_for_hist);
1328             push @all_minvals, $minval;
1329             push @all_maxvals, $maxval;
1330             push @all_clusters_for_hist, \@cluster_for_hist;
1331             }
1332             push @all_minmaxvals, @all_minvals;
1333             push @all_minmaxvals, @all_maxvals;
1334             my ($abs_minval,$abs_maxval) = minmax(\@all_minmaxvals);
1335             my $delta = ($abs_maxval - $abs_minval) / 100.0;
1336             $plot->gnuplot_cmd("set boxwidth 3");
1337             $plot->gnuplot_cmd("set style fill solid border -1");
1338             $plot->gnuplot_cmd("set ytics out nomirror");
1339             $plot->gnuplot_cmd("set style data histograms");
1340             $plot->gnuplot_cmd("set style histogram clustered");
1341             $plot->gnuplot_cmd("set title 'Clusters shown through histograms'");
1342             $plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
1343             foreach my $cindex (0..@all_clusters_for_hist-1) {
1344             my $filename = basename($master_datafile);
1345             my $temp_file = "__temp1dhist_" . "$cindex" . "_" . $filename;
1346             unlink $temp_file if -e $temp_file;
1347             open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
1348             print OUTPUT "Xstep histval\n";
1349             my @histogram = (0) x 100;
1350             foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) {
1351             $histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++;
1352             }
1353             foreach my $i (0..@histogram-1) {
1354             print OUTPUT "$i $histogram[$i]\n";
1355             }
1356             $arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, ";
1357             close OUTPUT;
1358             }
1359             }
1360             $arg_string = $arg_string =~ /^(.*),[ ]+$/;
1361             $arg_string = $1;
1362             if ($visualization_data_field_width > 2) {
1363             $plot->gnuplot_cmd( "splot $arg_string" );
1364             $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
1365             } elsif ($visualization_data_field_width == 2) {
1366             $plot->gnuplot_cmd( "plot $arg_string" );
1367             $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
1368             } elsif ($visualization_data_field_width == 1) {
1369             $plot->gnuplot_cmd( "plot $arg_string" );
1370             $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
1371             }
1372             }
1373              
1374             # This subroutine is the same as above except that it makes PNG plots (for hardcopy
1375             # printing) of the clusters.
1376             sub plot_hardcopy_clusters {
1377             my $self = shift;
1378             my $v_mask;
1379             my $pause_time;
1380             if (@_ == 1) {
1381             $v_mask = shift || die "visualization mask missing";
1382             } elsif (@_ == 2) {
1383             $v_mask = shift || die "visualization mask missing";
1384             $pause_time = shift;
1385             } else {
1386             die "visualize_clusters() called with wrong args";
1387             }
1388             my $master_datafile = $self->{_datafile};
1389             my @v_mask = split //, $v_mask;
1390             my $visualization_mask_width = @v_mask;
1391             my $original_data_mask = $self->{_mask};
1392             my @mask = split //, $original_data_mask;
1393             my $data_field_width = scalar grep {$_ eq '1'} @mask;
1394             die "\n\nABORTED: The width of the visualization mask (including " .
1395             "all its 1s and 0s) must equal the width of the original mask " .
1396             "used for reading the data file (counting only the 1's)"
1397             if $visualization_mask_width != $data_field_width;
1398             my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
1399             if ($visualization_data_field_width == 2) {
1400             foreach my $cluster_index (0..$self->{_K}-1) {
1401             my $contour_filename = "__contour_" . $cluster_index . ".dat";
1402             my $mean = $self->{_cluster_means}->[$cluster_index];
1403             my $covariance = $self->{_cluster_covariances}->[$cluster_index];
1404             my ($mux,$muy) = $mean->as_list();
1405             my ($varx,$sigmaxy) = $covariance->row(0)->as_list();
1406             my ($sigmayx,$vary) = $covariance->row(1)->as_list();
1407             die "Your covariance matrix does not look right"
1408             unless $sigmaxy == $sigmayx;
1409             my ($sigmax,$sigmay) = (sqrt($varx),sqrt($vary));
1410             my $argstring = <<"END";
1411             set contour
1412             mux = $mux
1413             muy = $muy
1414             sigmax = $sigmax
1415             sigmay = $sigmay
1416             sigmaxy = $sigmaxy
1417             determinant = (sigmax**2)*(sigmay**2) - sigmaxy**2
1418             exponent(x,y) = -0.5 * (1.0 / determinant) * ( ((x-mux)**2)*sigmay**2 + ((y-muy)**2)*sigmax**2 - 2*sigmaxy*(x-mux)*(y-muy) )
1419             f(x,y) = exp( exponent(x,y) ) - 0.2
1420             xmax = mux + 2 * sigmax
1421             xmin = mux - 2 * sigmax
1422             ymax = muy + 2 * sigmay
1423             ymin = muy - 2 * sigmay
1424             set xrange [ xmin : xmax ]
1425             set yrange [ ymin : ymax ]
1426             set isosamples 200
1427             unset surface
1428             set cntrparam levels discrete 0
1429             set table \"$contour_filename\"
1430             splot f(x,y)
1431             unset table
1432             END
1433             my $plot = Graphics::GnuplotIF->new();
1434             $plot->gnuplot_cmd( $argstring );
1435             }
1436             }
1437             my %visualization_data;
1438             while ( my ($record_id, $data) = each %{$self->{_data}} ) {
1439             my @fields = @$data;
1440             die "\nABORTED: Visualization mask size exceeds data record size"
1441             if $#v_mask > $#fields;
1442             my @data_fields;
1443             foreach my $i (0..@fields-1) {
1444             if ($v_mask[$i] eq '0') {
1445             next;
1446             } elsif ($v_mask[$i] eq '1') {
1447             push @data_fields, $fields[$i];
1448             } else {
1449             die "Misformed visualization mask. It can only have 1s and 0s";
1450             }
1451             }
1452             $visualization_data{ $record_id } = \@data_fields;
1453             }
1454             my $K = scalar @{$self->{_clusters}};
1455             my $filename = basename($master_datafile);
1456             my $temp_file = "__temp_" . $filename;
1457             unlink $temp_file if -e $temp_file;
1458             open OUTPUT, ">$temp_file"
1459             or die "Unable to open a temp file in this directory: $!";
1460             foreach my $cluster (@{$self->{_clusters}}) {
1461             foreach my $item (@$cluster) {
1462             print OUTPUT "@{$visualization_data{$item}}";
1463             print OUTPUT "\n";
1464             }
1465             print OUTPUT "\n\n";
1466             }
1467             close OUTPUT;
1468             my $plot;
1469             if (!defined $pause_time) {
1470             $plot = Graphics::GnuplotIF->new( persist => 1 );
1471             } else {
1472             $plot = Graphics::GnuplotIF->new();
1473             }
1474             my $arg_string = "";
1475             if ($visualization_data_field_width > 2) {
1476             $plot->gnuplot_cmd( "set noclip" );
1477             $plot->gnuplot_cmd( "set pointsize 2" );
1478             foreach my $i (0..$K-1) {
1479             my $j = $i + 1;
1480             $arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster (naive Bayes) $i\" with points lt $j pt $j, ";
1481             }
1482             } elsif ($visualization_data_field_width == 2) {
1483             $plot->gnuplot_cmd( "set noclip" );
1484             $plot->gnuplot_cmd( "set pointsize 2" );
1485             foreach my $i (0..$K-1) {
1486             my $j = $i + 1;
1487             $arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster (naive Bayes) $i\" with points lt $j pt $j, ";
1488             my $ellipse_filename = "__contour_" . $i . ".dat";
1489             $arg_string .= "\"$ellipse_filename\" with line lt $j title \"\", ";
1490             }
1491             } elsif ($visualization_data_field_width == 1 ) {
1492             open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
1493             my @all_data = ;
1494             close INPUT;
1495             @all_data = map {chomp $_; $_ =~ /\d/ ? $_ : "SEPERATOR" } @all_data;
1496             my $all_joined_data = join ':', @all_data;
1497             my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data;
1498             my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals);
1499             foreach my $i (0..@separated-1) {
1500             $separated[$i] =~ s/SEPERATOR//g;
1501             my @cluster_for_hist = split /:/, $separated[$i];
1502             @cluster_for_hist = grep $_, @cluster_for_hist;
1503             my ($minval,$maxval) = minmax(\@cluster_for_hist);
1504             push @all_minvals, $minval;
1505             push @all_maxvals, $maxval;
1506             push @all_clusters_for_hist, \@cluster_for_hist;
1507             }
1508             push @all_minmaxvals, @all_minvals;
1509             push @all_minmaxvals, @all_maxvals;
1510             my ($abs_minval,$abs_maxval) = minmax(\@all_minmaxvals);
1511             my $delta = ($abs_maxval - $abs_minval) / 100.0;
1512             $plot->gnuplot_cmd("set boxwidth 3");
1513             $plot->gnuplot_cmd("set style fill solid border -1");
1514             $plot->gnuplot_cmd("set ytics out nomirror");
1515             $plot->gnuplot_cmd("set style data histograms");
1516             $plot->gnuplot_cmd("set style histogram clustered");
1517             $plot->gnuplot_cmd("set title 'Clusters shown through histograms'");
1518             $plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
1519             foreach my $cindex (0..@all_clusters_for_hist-1) {
1520             my $filename = basename($master_datafile);
1521             my $temp_file = "__temp1dhist_" . "$cindex" . "_" . $filename;
1522             unlink $temp_file if -e $temp_file;
1523             open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
1524             print OUTPUT "Xstep histval\n";
1525             my @histogram = (0) x 100;
1526             foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) {
1527             $histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++;
1528             }
1529             foreach my $i (0..@histogram-1) {
1530             print OUTPUT "$i $histogram[$i]\n";
1531             }
1532             # $arg_string .= "\"$temp_file\" using 1:2 ti col smooth frequency with boxes lc $cindex, ";
1533             $arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, ";
1534             close OUTPUT;
1535             }
1536             }
1537             $arg_string = $arg_string =~ /^(.*),[ ]+$/;
1538             $arg_string = $1;
1539             if ($visualization_data_field_width > 2) {
1540             $plot->gnuplot_cmd( 'set terminal png color',
1541             'set output "cluster_plot.png"');
1542             $plot->gnuplot_cmd( "splot $arg_string" );
1543             } elsif ($visualization_data_field_width == 2) {
1544             $plot->gnuplot_cmd('set terminal png',
1545             'set output "cluster_plot.png"');
1546             $plot->gnuplot_cmd( "plot $arg_string" );
1547             } elsif ($visualization_data_field_width == 1) {
1548             $plot->gnuplot_cmd('set terminal png',
1549             'set output "cluster_plot.png"');
1550             $plot->gnuplot_cmd( "plot $arg_string" );
1551             }
1552             }
1553              
1554             # This method is for the visualization of the posterior class distributions. In
1555             # other words, this method allows us to see the soft clustering produced by the EM
1556             # algorithm. While much of the gnuplot logic here is the same as in the
1557             # visualize_clusters() method, there are significant differences in how the data is
1558             # pooled for the purpose of display.
1559             sub visualize_distributions {
1560             my $self = shift;
1561             my $v_mask;
1562             my $pause_time;
1563             if (@_ == 1) {
1564             $v_mask = shift || die "visualization mask missing";
1565             } elsif (@_ == 2) {
1566             $v_mask = shift || die "visualization mask missing";
1567             $pause_time = shift;
1568             } else {
1569             die "visualize_distributions() called with wrong args";
1570             }
1571             my @v_mask = split //, $v_mask;
1572             my $visualization_mask_width = @v_mask;
1573             my $original_data_mask = $self->{_mask};
1574             my @mask = split //, $original_data_mask;
1575             my $data_field_width = scalar grep {$_ eq '1'} @mask;
1576             die "\n\nABORTED: The width of the visualization mask (including " .
1577             "all its 1s and 0s) must equal the width of the original mask " .
1578             "used for reading the data file (counting only the 1's)"
1579             if $visualization_mask_width != $data_field_width;
1580             my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
1581             if ($visualization_data_field_width == 2) {
1582             foreach my $cluster_index (0..$self->{_K}-1) {
1583             my $contour_filename = "__contour2_" . $cluster_index . ".dat";
1584             my $mean = $self->{_cluster_means}->[$cluster_index];
1585             my $covariance = $self->{_cluster_covariances}->[$cluster_index];
1586             my ($mux,$muy) = $mean->as_list();
1587             my ($varx,$sigmaxy) = $covariance->row(0)->as_list();
1588             my ($sigmayx,$vary) = $covariance->row(1)->as_list();
1589             die "Your covariance matrix does not look right"
1590             unless $sigmaxy == $sigmayx;
1591             my ($sigmax,$sigmay) = (sqrt($varx),sqrt($vary));
1592             my $argstring = <<"END";
1593             set contour
1594             mux = $mux
1595             muy = $muy
1596             sigmax = $sigmax
1597             sigmay = $sigmay
1598             sigmaxy = $sigmaxy
1599             determinant = (sigmax**2)*(sigmay**2) - sigmaxy**2
1600             exponent(x,y) = -0.5 * (1.0 / determinant) * ( ((x-mux)**2)*sigmay**2 + ((y-muy)**2)*sigmax**2 - 2*sigmaxy*(x-mux)*(y-muy) )
1601             f(x,y) = exp( exponent(x,y) ) - 0.2
1602             xmax = mux + 2 * sigmax
1603             xmin = mux - 2 * sigmax
1604             ymax = muy + 2 * sigmay
1605             ymin = muy - 2 * sigmay
1606             set xrange [ xmin : xmax ]
1607             set yrange [ ymin : ymax ]
1608             set isosamples 200
1609             unset surface
1610             set cntrparam levels discrete 0
1611             set table \"$contour_filename\"
1612             splot f(x,y)
1613             unset table
1614             END
1615             my $plot = Graphics::GnuplotIF->new();
1616             $plot->gnuplot_cmd( $argstring );
1617             }
1618             }
1619             my %visualization_data;
1620             while ( my ($record_id, $data) = each %{$self->{_data}} ) {
1621             my @fields = @$data;
1622             die "\nABORTED: Visualization mask size exceeds data record size"
1623             if $#v_mask > $#fields;
1624             my @data_fields;
1625             foreach my $i (0..@fields-1) {
1626             if ($v_mask[$i] eq '0') {
1627             next;
1628             } elsif ($v_mask[$i] eq '1') {
1629             push @data_fields, $fields[$i];
1630             } else {
1631             die "Misformed visualization mask. It can only have 1s and 0s";
1632             }
1633             }
1634             $visualization_data{ $record_id } = \@data_fields;
1635             }
1636             my $filename = basename($self->{_datafile});
1637             my $temp_file = "__temp2_" . $filename;
1638             unlink $temp_file if -e $temp_file;
1639             open OUTPUT, ">$temp_file"
1640             or die "Unable to open a temp file in this directory: $!";
1641             my @class_distributions;
1642             foreach my $cluster_index (0..$self->{_K}-1) {
1643             push @class_distributions, [];
1644             }
1645             foreach my $data_tag (@{$self->{_data_id_tags}}) {
1646             foreach my $cluster_index (0..$self->{_K}-1) {
1647             push @{$class_distributions[$cluster_index]}, $self->{_data}->{$data_tag}
1648             if $self->{_expected_class_probs}->{$data_tag}->[$cluster_index] > 0.2;
1649             }
1650             }
1651             foreach my $distribution (@class_distributions) {
1652             foreach my $item (@$distribution) {
1653             print OUTPUT "@$item";
1654             print OUTPUT "\n";
1655             }
1656             print OUTPUT "\n\n";
1657             }
1658             close OUTPUT;
1659             my $plot;
1660             if (!defined $pause_time) {
1661             $plot = Graphics::GnuplotIF->new( persist => 1 );
1662             } else {
1663             $plot = Graphics::GnuplotIF->new();
1664             }
1665             my $arg_string = "";
1666             if ($visualization_data_field_width > 2) {
1667             $plot->gnuplot_cmd( "set noclip" );
1668             $plot->gnuplot_cmd( "set pointsize 2" );
1669             foreach my $i (0..$self->{_K}-1) {
1670             my $j = $i + 1;
1671             $arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster $i (based on posterior probs)\" with points lt $j pt $j, ";
1672             }
1673             } elsif ($visualization_data_field_width == 2) {
1674             $plot->gnuplot_cmd( "set noclip" );
1675             $plot->gnuplot_cmd( "set pointsize 2" );
1676             foreach my $i (0..$self->{_K}-1) {
1677             my $j = $i + 1;
1678             $arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster $i (based on posterior probs)\" with points lt $j pt $j, ";
1679             my $ellipse_filename = "__contour2_" . $i . ".dat";
1680             $arg_string .= "\"$ellipse_filename\" with line lt $j title \"\", ";
1681             }
1682             } elsif ($visualization_data_field_width == 1 ) {
1683             open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
1684             my @all_data = ;
1685             close INPUT;
1686             @all_data = map {chomp $_; $_ =~ /\d/ ? $_ : "SEPERATOR" } @all_data;
1687             my $all_joined_data = join ':', @all_data;
1688             my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data;
1689             my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals);
1690             foreach my $i (0..@separated-1) {
1691             $separated[$i] =~ s/SEPERATOR//g;
1692             my @cluster_for_hist = split /:/, $separated[$i];
1693             @cluster_for_hist = grep $_, @cluster_for_hist;
1694             my ($minval,$maxval) = minmax(\@cluster_for_hist);
1695             push @all_minvals, $minval;
1696             push @all_maxvals, $maxval;
1697             push @all_clusters_for_hist, \@cluster_for_hist;
1698             }
1699             push @all_minmaxvals, @all_minvals;
1700             push @all_minmaxvals, @all_maxvals;
1701             my ($abs_minval,$abs_maxval) = minmax(\@all_minmaxvals);
1702             my $delta = ($abs_maxval - $abs_minval) / 100.0;
1703             $plot->gnuplot_cmd("set boxwidth 3");
1704             $plot->gnuplot_cmd("set style fill solid border -1");
1705             $plot->gnuplot_cmd("set ytics out nomirror");
1706             $plot->gnuplot_cmd("set style data histograms");
1707             $plot->gnuplot_cmd("set style histogram clustered");
1708             $plot->gnuplot_cmd("set title 'Individual distributions shown through histograms'");
1709             $plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
1710             foreach my $cindex (0..@all_clusters_for_hist-1) {
1711             my $localfilename = basename($filename);
1712             my $temp_file = "__temp1dhist_" . "$cindex" . "_" . $localfilename;
1713             unlink $temp_file if -e $temp_file;
1714             open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
1715             print OUTPUT "Xstep histval\n";
1716              
1717             my @histogram = (0) x 100;
1718             foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) {
1719             $histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++;
1720             }
1721             foreach my $i (0..@histogram-1) {
1722             print OUTPUT "$i $histogram[$i]\n";
1723             }
1724             # $arg_string .= "\"$temp_file\" using 1:2 ti col smooth frequency with boxes lc $cindex, ";
1725             $arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, ";
1726             close OUTPUT;
1727             }
1728             }
1729             $arg_string = $arg_string =~ /^(.*),[ ]+$/;
1730             $arg_string = $1;
1731             if ($visualization_data_field_width > 2) {
1732             $plot->gnuplot_cmd( "splot $arg_string" );
1733             $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
1734             } elsif ($visualization_data_field_width == 2) {
1735             $plot->gnuplot_cmd( "plot $arg_string" );
1736             $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
1737             } elsif ($visualization_data_field_width == 1) {
1738             $plot->gnuplot_cmd( "plot $arg_string" );
1739             $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
1740             }
1741             }
1742              
1743             # This method is basically the same as the previous method, except that it is
1744             # intended for making PNG files from the distributions.
1745             sub plot_hardcopy_distributions {
1746             my $self = shift;
1747             my $v_mask;
1748             my $pause_time;
1749             if (@_ == 1) {
1750             $v_mask = shift || die "visualization mask missing";
1751             } elsif (@_ == 2) {
1752             $v_mask = shift || die "visualization mask missing";
1753             $pause_time = shift;
1754             } else {
1755             die "visualize_distributions() called with wrong args";
1756             }
1757             my @v_mask = split //, $v_mask;
1758             my $visualization_mask_width = @v_mask;
1759             my $original_data_mask = $self->{_mask};
1760             my @mask = split //, $original_data_mask;
1761             my $data_field_width = scalar grep {$_ eq '1'} @mask;
1762             die "\n\nABORTED: The width of the visualization mask (including " .
1763             "all its 1s and 0s) must equal the width of the original mask " .
1764             "used for reading the data file (counting only the 1's)"
1765             if $visualization_mask_width != $data_field_width;
1766             my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
1767             if ($visualization_data_field_width == 2) {
1768             foreach my $cluster_index (0..$self->{_K}-1) {
1769             my $contour_filename = "__contour2_" . $cluster_index . ".dat";
1770             my $mean = $self->{_cluster_means}->[$cluster_index];
1771             my $covariance = $self->{_cluster_covariances}->[$cluster_index];
1772             my ($mux,$muy) = $mean->as_list();
1773             my ($varx,$sigmaxy) = $covariance->row(0)->as_list();
1774             my ($sigmayx,$vary) = $covariance->row(1)->as_list();
1775             die "Your covariance matrix does not look right"
1776             unless $sigmaxy == $sigmayx;
1777             my ($sigmax,$sigmay) = (sqrt($varx),sqrt($vary));
1778             my $argstring = <<"END";
1779             set contour
1780             mux = $mux
1781             muy = $muy
1782             sigmax = $sigmax
1783             sigmay = $sigmay
1784             sigmaxy = $sigmaxy
1785             determinant = (sigmax**2)*(sigmay**2) - sigmaxy**2
1786             exponent(x,y) = -0.5 * (1.0 / determinant) * ( ((x-mux)**2)*sigmay**2 + ((y-muy)**2)*sigmax**2 - 2*sigmaxy*(x-mux)*(y-muy) )
1787             f(x,y) = exp( exponent(x,y) ) - 0.2
1788             xmax = mux + 2 * sigmax
1789             xmin = mux - 2 * sigmax
1790             ymax = muy + 2 * sigmay
1791             ymin = muy - 2 * sigmay
1792             set xrange [ xmin : xmax ]
1793             set yrange [ ymin : ymax ]
1794             set isosamples 200
1795             unset surface
1796             set cntrparam levels discrete 0
1797             set table \"$contour_filename\"
1798             splot f(x,y)
1799             unset table
1800             END
1801             my $plot = Graphics::GnuplotIF->new();
1802             $plot->gnuplot_cmd( $argstring );
1803             }
1804             }
1805             my %visualization_data;
1806             while ( my ($record_id, $data) = each %{$self->{_data}} ) {
1807             my @fields = @$data;
1808             die "\nABORTED: Visualization mask size exceeds data record size"
1809             if $#v_mask > $#fields;
1810             my @data_fields;
1811             foreach my $i (0..@fields-1) {
1812             if ($v_mask[$i] eq '0') {
1813             next;
1814             } elsif ($v_mask[$i] eq '1') {
1815             push @data_fields, $fields[$i];
1816             } else {
1817             die "Misformed visualization mask. It can only have 1s and 0s";
1818             }
1819             }
1820             $visualization_data{ $record_id } = \@data_fields;
1821             }
1822             my $filename = basename($self->{_datafile});
1823             my $temp_file = "__temp2_" . $filename;
1824             unlink $temp_file if -e $temp_file;
1825             open OUTPUT, ">$temp_file"
1826             or die "Unable to open a temp file in this directory: $!";
1827             my @class_distributions;
1828             foreach my $cluster_index (0..$self->{_K}-1) {
1829             push @class_distributions, [];
1830             }
1831             foreach my $data_tag (@{$self->{_data_id_tags}}) {
1832             foreach my $cluster_index (0..$self->{_K}-1) {
1833             push @{$class_distributions[$cluster_index]}, $self->{_data}->{$data_tag}
1834             if $self->{_expected_class_probs}->{$data_tag}->[$cluster_index] > 0.2;
1835             }
1836             }
1837             foreach my $distribution (@class_distributions) {
1838             foreach my $item (@$distribution) {
1839             print OUTPUT "@$item";
1840             print OUTPUT "\n";
1841             }
1842             print OUTPUT "\n\n";
1843             }
1844             close OUTPUT;
1845             my $plot;
1846             if (!defined $pause_time) {
1847             $plot = Graphics::GnuplotIF->new( persist => 1 );
1848             } else {
1849             $plot = Graphics::GnuplotIF->new();
1850             }
1851             my $arg_string = "";
1852             if ($visualization_data_field_width > 2) {
1853             $plot->gnuplot_cmd( "set noclip" );
1854             $plot->gnuplot_cmd( "set pointsize 2" );
1855             foreach my $i (0..$self->{_K}-1) {
1856             my $j = $i + 1;
1857             $arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster $i (based on posterior probs)\" with points lt $j pt $j, ";
1858             }
1859             } elsif ($visualization_data_field_width == 2) {
1860             $plot->gnuplot_cmd( "set noclip" );
1861             $plot->gnuplot_cmd( "set pointsize 2" );
1862             foreach my $i (0..$self->{_K}-1) {
1863             my $j = $i + 1;
1864             $arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster $i (based on posterior probs)\" with points lt $j pt $j, ";
1865             my $ellipse_filename = "__contour2_" . $i . ".dat";
1866             $arg_string .= "\"$ellipse_filename\" with line lt $j title \"\", ";
1867             }
1868             } elsif ($visualization_data_field_width == 1 ) {
1869             open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
1870             my @all_data = ;
1871             close INPUT;
1872             @all_data = map {chomp $_; $_ =~ /\d/ ? $_ : "SEPERATOR" } @all_data;
1873             my $all_joined_data = join ':', @all_data;
1874             my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data;
1875             my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals);
1876             foreach my $i (0..@separated-1) {
1877             $separated[$i] =~ s/SEPERATOR//g;
1878             my @cluster_for_hist = split /:/, $separated[$i];
1879             @cluster_for_hist = grep $_, @cluster_for_hist;
1880             my ($minval,$maxval) = minmax(\@cluster_for_hist);
1881             push @all_minvals, $minval;
1882             push @all_maxvals, $maxval;
1883             push @all_clusters_for_hist, \@cluster_for_hist;
1884             }
1885             push @all_minmaxvals, @all_minvals;
1886             push @all_minmaxvals, @all_maxvals;
1887             my ($abs_minval,$abs_maxval) = minmax(\@all_minmaxvals);
1888             my $delta = ($abs_maxval - $abs_minval) / 100.0;
1889             $plot->gnuplot_cmd("set boxwidth 3");
1890             $plot->gnuplot_cmd("set style fill solid border -1");
1891             $plot->gnuplot_cmd("set ytics out nomirror");
1892             $plot->gnuplot_cmd("set style data histograms");
1893             $plot->gnuplot_cmd("set style histogram clustered");
1894             $plot->gnuplot_cmd("set title 'Individual distributions shown through histograms'");
1895             $plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
1896             foreach my $cindex (0..@all_clusters_for_hist-1) {
1897             my $localfilename = basename($filename);
1898             my $temp_file = "__temp1dhist_" . "$cindex" . "_" . $localfilename;
1899             unlink $temp_file if -e $temp_file;
1900             open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
1901             print OUTPUT "Xstep histval\n";
1902              
1903             my @histogram = (0) x 100;
1904             foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) {
1905             $histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++;
1906             }
1907             foreach my $i (0..@histogram-1) {
1908             print OUTPUT "$i $histogram[$i]\n";
1909             }
1910             $arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, ";
1911             close OUTPUT;
1912             }
1913             }
1914             $arg_string = $arg_string =~ /^(.*),[ ]+$/;
1915             $arg_string = $1;
1916              
1917             if ($visualization_data_field_width > 2) {
1918             $plot->gnuplot_cmd( 'set terminal png',
1919             'set output "posterior_prob_plot.png"');
1920             $plot->gnuplot_cmd( "splot $arg_string" );
1921             } elsif ($visualization_data_field_width == 2) {
1922             $plot->gnuplot_cmd( 'set terminal png',
1923             'set output "posterior_prob_plot.png"');
1924             $plot->gnuplot_cmd( "plot $arg_string" );
1925             } elsif ($visualization_data_field_width == 1) {
1926             $plot->gnuplot_cmd( 'set terminal png',
1927             'set output "posterior_prob_plot.png"');
1928             $plot->gnuplot_cmd( "plot $arg_string" );
1929             }
1930             }
1931              
1932             # The method shown below should be called only AFTER you have called the method
1933             # read_data_from_file(). The visualize_data() is meant for the visualization of the
1934             # original data in its various 2D or 3D subspaces.
1935             sub visualize_data {
1936             my $self = shift;
1937             my $v_mask = shift || die "visualization mask missing";
1938              
1939             my $master_datafile = $self->{_datafile};
1940              
1941             my @v_mask = split //, $v_mask;
1942             my $visualization_mask_width = @v_mask;
1943             my $original_data_mask = $self->{_mask};
1944             my @mask = split //, $original_data_mask;
1945             my $data_field_width = scalar grep {$_ eq '1'} @mask;
1946             die "\n\nABORTED: The width of the visualization mask (including " .
1947             "all its 1s and 0s) must equal the width of the original mask " .
1948             "used for reading the data file (counting only the 1's)"
1949             if $visualization_mask_width != $data_field_width;
1950             my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
1951             my %visualization_data;
1952             my $data_source = $self->{_data};
1953             while ( my ($record_id, $data) = each %{$data_source} ) {
1954             my @fields = @$data;
1955             die "\nABORTED: Visualization mask size exceeds data record size"
1956             if $#v_mask > $#fields;
1957             my @data_fields;
1958             foreach my $i (0..@fields-1) {
1959             if ($v_mask[$i] eq '0') {
1960             next;
1961             } elsif ($v_mask[$i] eq '1') {
1962             push @data_fields, $fields[$i];
1963             } else {
1964             die "Misformed visualization mask. It can only have 1s and 0s";
1965             }
1966             }
1967             $visualization_data{ $record_id } = \@data_fields;
1968             }
1969             my $filename = basename($master_datafile);
1970             my $temp_file;
1971             $temp_file = "__temp_data_" . $filename;
1972             unlink $temp_file if -e $temp_file;
1973             open OUTPUT, ">$temp_file"
1974             or die "Unable to open a temp file in this directory: $!";
1975             foreach my $datapoint (values %visualization_data) {
1976             print OUTPUT "@$datapoint";
1977             print OUTPUT "\n";
1978             }
1979             close OUTPUT;
1980             my $plot = Graphics::GnuplotIF->new( persist => 1 );
1981             $plot->gnuplot_cmd( "set noclip" );
1982             $plot->gnuplot_cmd( "set pointsize 2" );
1983             my $plot_title = '"original data provided for EM"';
1984             my $arg_string ;
1985             if ($visualization_data_field_width > 2) {
1986             $arg_string = "\"$temp_file\" using 1:2:3 title $plot_title with points lt -1 pt 1";
1987             } elsif ($visualization_data_field_width == 2) {
1988             $arg_string = "\"$temp_file\" using 1:2 title $plot_title with points lt -1 pt 1";
1989             } elsif ($visualization_data_field_width == 1 ) {
1990             open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
1991             my @all_data = ;
1992             close INPUT;
1993             @all_data = map {chomp $_; $_} @all_data;
1994             @all_data = grep $_, @all_data;
1995             my ($minval,$maxval) = minmax(\@all_data);
1996             my $delta = ($maxval - $minval) / 100.0;
1997             $plot->gnuplot_cmd("set boxwidth 3");
1998             $plot->gnuplot_cmd("set style fill solid border -1");
1999             $plot->gnuplot_cmd("set ytics out nomirror");
2000             $plot->gnuplot_cmd("set style data histograms");
2001             $plot->gnuplot_cmd("set style histogram clustered");
2002             $plot->gnuplot_cmd("set title 'Overall distribution of 1D data'");
2003             $plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
2004             my $localfilename = basename($filename);
2005             my $temp_file = "__temp1dhist_" . $localfilename;
2006             unlink $temp_file if -e $temp_file;
2007             open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
2008             print OUTPUT "Xstep histval\n";
2009             my @histogram = (0) x 100;
2010             foreach my $i (0..@all_data-1) {
2011             $histogram[int( ($all_data[$i] - $minval) / $delta )]++;
2012             }
2013             foreach my $i (0..@histogram-1) {
2014             print OUTPUT "$i $histogram[$i]\n";
2015             }
2016             $arg_string = "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc rgb 'green'";
2017             close OUTPUT;
2018             }
2019             if ($visualization_data_field_width > 2) {
2020             $plot->gnuplot_cmd( "splot $arg_string" );
2021             } elsif ($visualization_data_field_width == 2) {
2022             $plot->gnuplot_cmd( "plot $arg_string" );
2023             } elsif ($visualization_data_field_width == 1) {
2024             $plot->gnuplot_cmd( "plot $arg_string" );
2025             }
2026             }
2027              
2028             # This method is the same as the one shown above, except that it is meant for
2029             # creating PNG files of the displays.
2030             sub plot_hardcopy_data {
2031             my $self = shift;
2032             my $v_mask = shift || die "visualization mask missing";
2033             my $master_datafile = $self->{_datafile};
2034             my @v_mask = split //, $v_mask;
2035             my $visualization_mask_width = @v_mask;
2036             my $original_data_mask = $self->{_mask};
2037             my @mask = split //, $original_data_mask;
2038             my $data_field_width = scalar grep {$_ eq '1'} @mask;
2039             die "\n\nABORTED: The width of the visualization mask (including " .
2040             "all its 1s and 0s) must equal the width of the original mask " .
2041             "used for reading the data file (counting only the 1's)"
2042             if $visualization_mask_width != $data_field_width;
2043             my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
2044             my %visualization_data;
2045             my $data_source = $self->{_data};
2046             while ( my ($record_id, $data) = each %{$data_source} ) {
2047             my @fields = @$data;
2048             die "\nABORTED: Visualization mask size exceeds data record size"
2049             if $#v_mask > $#fields;
2050             my @data_fields;
2051             foreach my $i (0..@fields-1) {
2052             if ($v_mask[$i] eq '0') {
2053             next;
2054             } elsif ($v_mask[$i] eq '1') {
2055             push @data_fields, $fields[$i];
2056             } else {
2057             die "Misformed visualization mask. It can only have 1s and 0s";
2058             }
2059             }
2060             $visualization_data{ $record_id } = \@data_fields;
2061             }
2062             my $filename = basename($master_datafile);
2063             my $temp_file;
2064             $temp_file = "__temp_data_" . $filename;
2065             unlink $temp_file if -e $temp_file;
2066             open OUTPUT, ">$temp_file"
2067             or die "Unable to open a temp file in this directory: $!";
2068             foreach my $datapoint (values %visualization_data) {
2069             print OUTPUT "@$datapoint";
2070             print OUTPUT "\n";
2071             }
2072             close OUTPUT;
2073              
2074             my $plot = Graphics::GnuplotIF->new( persist => 1 );
2075             $plot->gnuplot_cmd( "set noclip" );
2076             $plot->gnuplot_cmd( "set pointsize 2" );
2077             my $plot_title = '"original data provided for EM"';
2078             my $arg_string ;
2079             if ($visualization_data_field_width > 2) {
2080             $arg_string = "\"$temp_file\" using 1:2:3 title $plot_title with points lt -1 pt 1";
2081             } elsif ($visualization_data_field_width == 2) {
2082             $arg_string = "\"$temp_file\" using 1:2 title $plot_title with points lt -1 pt 1";
2083             } elsif ($visualization_data_field_width == 1 ) {
2084             open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
2085             my @all_data = ;
2086             close INPUT;
2087             @all_data = map {chomp $_; $_} @all_data;
2088             @all_data = grep $_, @all_data;
2089             my ($minval,$maxval) = minmax(\@all_data);
2090             my $delta = ($maxval - $minval) / 100.0;
2091             $plot->gnuplot_cmd("set boxwidth 3");
2092             $plot->gnuplot_cmd("set style fill solid border -1");
2093             $plot->gnuplot_cmd("set ytics out nomirror");
2094             $plot->gnuplot_cmd("set style data histograms");
2095             $plot->gnuplot_cmd("set style histogram clustered");
2096             $plot->gnuplot_cmd("set title 'Overall distribution of 1D data'");
2097             $plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
2098             my $localfilename = basename($filename);
2099             my $temp_file = "__temp1dhist_" . $localfilename;
2100             unlink $temp_file if -e $temp_file;
2101             open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
2102             print OUTPUT "Xstep histval\n";
2103             my @histogram = (0) x 100;
2104             foreach my $i (0..@all_data-1) {
2105             $histogram[int( ($all_data[$i] - $minval) / $delta )]++;
2106             }
2107             foreach my $i (0..@histogram-1) {
2108             print OUTPUT "$i $histogram[$i]\n";
2109             }
2110             $arg_string = "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc rgb 'green'";
2111             close OUTPUT;
2112             }
2113             if ($visualization_data_field_width > 2) {
2114             $plot->gnuplot_cmd( 'set terminal png',
2115             'set output "data_scatter_plot.png"');
2116             $plot->gnuplot_cmd( "splot $arg_string" );
2117             } elsif ($visualization_data_field_width == 2) {
2118             $plot->gnuplot_cmd( 'set terminal png',
2119             'set output "data_scatter_plot.png"');
2120             $plot->gnuplot_cmd( "plot $arg_string" );
2121             } elsif ($visualization_data_field_width == 1) {
2122             $plot->gnuplot_cmd( 'set terminal png',
2123             'set output "data_scatter_plot.png"');
2124             $plot->gnuplot_cmd( "plot $arg_string" );
2125             }
2126             }
2127              
2128              
2129             ################### Generating Synthetic Data for Clustering ###################
2130              
2131             # The data generated corresponds to a multivariate distribution. The mean and the
2132             # covariance of each Gaussian in the distribution are specified individually in a
2133             # parameter file. The parameter file must also state the prior probabilities to be
2134             # associated with each Gaussian. See the example parameter file param1.txt in the
2135             # examples directory. Just edit this file for your own needs.
2136             #
2137             # The multivariate random numbers are generated by calling the Math::Random module.
2138             # As you would expect, that module will insist that the covariance matrix you
2139             # specify be symmetric and positive definite.
2140             sub cluster_data_generator {
2141             my $class = shift;
2142             die "illegal call of a class method"
2143             unless $class eq 'Algorithm::ExpectationMaximization';
2144             my %args = @_;
2145             my $input_parameter_file = $args{input_parameter_file};
2146             my $output_file = $args{output_datafile};
2147             my $N = $args{total_number_of_data_points};
2148             my @all_params;
2149             my $param_string;
2150             if (defined $input_parameter_file) {
2151             open INPUT, $input_parameter_file || "unable to open parameter file: $!";
2152             @all_params = ;
2153             @all_params = grep { $_ !~ /^[ ]*#/ } @all_params;
2154             chomp @all_params;
2155             $param_string = join ' ', @all_params;
2156             } else {
2157             # Just for testing. Used in t/test.t
2158             $param_string = "priors 0.34 0.33 0.33 " .
2159             "cluster 5 0 0 1 0 0 0 1 0 0 0 1 " .
2160             "cluster 0 5 0 1 0 0 0 1 0 0 0 1 " .
2161             "cluster 0 0 5 1 0 0 0 1 0 0 0 1";
2162             }
2163             my ($priors_string) = $param_string =~ /priors(.+?)cluster/;
2164             croak "You did not specify the prior probabilities in the parameter file"
2165             unless $priors_string;
2166             my @priors = split /\s+/, $priors_string;
2167             @priors = grep {/$_num_regex/; $_} @priors;
2168             my $sum = 0;
2169             foreach my $prior (@priors) {
2170             $sum += $prior;
2171             }
2172             croak "Your priors in the parameter file do not add up to 1" unless $sum == 1;
2173             my ($rest_of_string) = $param_string =~ /priors\s*$priors_string(.*)$/;
2174             my @cluster_strings = split /[ ]*cluster[ ]*/, $rest_of_string;
2175             @cluster_strings = grep $_, @cluster_strings;
2176              
2177             my $K = @cluster_strings;
2178             croak "Too many clusters requested" if $K > 12;
2179             croak "Mismatch between the number of values for priors and the number " .
2180             "of clusters" unless $K == @priors;
2181             my @point_labels = ('a'..'z');
2182             print "Prior probabilities recorded from param file: @priors\n";
2183             print "Number of Gaussians used for the synthetic data: $K\n";
2184             my @means;
2185             my @covariances;
2186             my $data_dimension;
2187             foreach my $i (0..$K-1) {
2188             my @num_strings = split / /, $cluster_strings[$i];
2189             my @cluster_mean = map {/$_num_regex/;$_} split / /, $num_strings[0];
2190             $data_dimension = @cluster_mean;
2191             push @means, \@cluster_mean;
2192             my @covariance_nums = map {/$_num_regex/;$_} split / /, $num_strings[1];
2193             croak "dimensionality error" if @covariance_nums !=
2194             ($data_dimension ** 2);
2195             my $cluster_covariance;
2196             foreach my $j (0..$data_dimension-1) {
2197             foreach my $k (0..$data_dimension-1) {
2198             $cluster_covariance->[$j]->[$k] =
2199             $covariance_nums[$j*$data_dimension + $k];
2200             }
2201             }
2202             push @covariances, $cluster_covariance;
2203             }
2204             random_seed_from_phrase( 'hellomellojello' );
2205             my @data_dump;
2206             foreach my $i (0..$K-1) {
2207             my @m = @{shift @means};
2208             my @covar = @{shift @covariances};
2209             my @new_data = Math::Random::random_multivariate_normal(
2210             int($N * $priors[$i]), @m, @covar );
2211             my $p = 0;
2212             my $label = $point_labels[$i];
2213             @new_data = map {unshift @$_, $label.$i; $i++; $_} @new_data;
2214             push @data_dump, @new_data;
2215             }
2216             fisher_yates_shuffle( \@data_dump );
2217             open OUTPUT, ">$output_file";
2218             print OUTPUT "\#Data generated from the parameter file: $input_parameter_file\n"
2219             if $input_parameter_file;
2220             print OUTPUT "\#Total number of data points in this file: $N\n";
2221             print OUTPUT "\#Prior class probabilities for this data: @priors\n";
2222             foreach my $ele (@data_dump) {
2223             foreach my $coord ( @$ele ) {
2224             print OUTPUT "$coord ";
2225             }
2226             print OUTPUT "\n";
2227             }
2228             print "Data written out to file $output_file\n";
2229             close OUTPUT;
2230             }
2231              
2232             sub add_point_coords {
2233             my $self = shift;
2234             my @arr_of_ids = @{shift @_}; # array of data element names
2235             my @result;
2236             my $data_dimensionality = $self->{_data_dimensions};
2237             foreach my $i (0..$data_dimensionality-1) {
2238             $result[$i] = 0.0;
2239             }
2240             foreach my $id (@arr_of_ids) {
2241             my $ele = $self->{_data}->{$id};
2242             my $i = 0;
2243             foreach my $component (@$ele) {
2244             $result[$i] += $component;
2245             $i++;
2246             }
2247             }
2248             return \@result;
2249             }
2250              
2251             ###################### Support Routines ########################
2252              
2253             # computer the outer product of two column vectors
2254             sub outer_product {
2255             my $vec1 = shift;
2256             my $vec2 = shift;
2257             my ($nrows1, $ncols1) = ($vec1->rows(), $vec1->cols());
2258             my ($nrows2, $ncols2) = ($vec2->rows(), $vec2->cols());
2259             die "Outer product operation called with non-matching vectors"
2260             unless $ncols1 == 1 && $ncols2 == 1 && $nrows1 == $nrows2;
2261             my @vec_arr1 = $vec1->as_list();
2262             my @vec_arr2 = $vec2->as_list();
2263             my $outer_product = Math::GSL::Matrix->new($nrows1, $nrows2);
2264             foreach my $index (0..$nrows1-1) {
2265             my @new_row = map $vec_arr1[$index] * $_, @vec_arr2;
2266             $outer_product->set_row($index, \@new_row);
2267             }
2268             return $outer_product;
2269             }
2270              
2271             sub get_index_at_value {
2272             my $value = shift;
2273             my @array = @{shift @_};
2274             foreach my $i (0..@array-1) {
2275             return $i if $value == $array[$i];
2276             }
2277             }
2278              
2279             # This routine is really not necessary in light of the new `~~' operator in Perl.
2280             # Will use the new operator in the next version.
2281             sub vector_equal {
2282             my $vec1 = shift;
2283             my $vec2 = shift;
2284             die "wrong data types for distance calculation" if @$vec1 != @$vec2;
2285             foreach my $i (0..@$vec1-1){
2286             return 0 if $vec1->[$i] != $vec2->[$i];
2287             }
2288             return 1;
2289             }
2290              
2291             sub compare_array_floats {
2292             my $vec1 = shift;
2293             my $vec2 = shift;
2294             foreach my $i (0..@$vec1-1){
2295             return 0 if abs($vec1->[$i] - $vec2->[$i]) > 0.00001;
2296             }
2297             return 1;
2298             }
2299              
2300             # Returns the minimum value and its positional index in an array
2301             sub minimum {
2302             my $arr = shift;
2303             my $min;
2304             my $index;
2305             foreach my $i (0..@{$arr}-1) {
2306             if ( (!defined $min) || ($arr->[$i] < $min) ) {
2307             $index = $i;
2308             $min = $arr->[$i];
2309             }
2310             }
2311             return ($min, $index);
2312             }
2313              
2314             sub minmax {
2315             my $arr = shift;
2316             my $min;
2317             my $max;
2318             foreach my $i (0..@{$arr}-1) {
2319             if ( (!defined $min) && (!defined $max) ) {
2320             $min = $arr->[$i];
2321             $max = $arr->[$i];
2322             } elsif ( $arr->[$i] < $min ) {
2323             $min = $arr->[$i];
2324             } elsif ( $arr->[$i] > $max ) {
2325             $max = $arr->[$i];
2326             }
2327             }
2328             return ($min, $max);
2329             }
2330              
2331             # Meant only for constructing a deep copy of an array of
2332             # arrays:
2333             sub deep_copy_AoA {
2334             my $ref_in = shift;
2335             my $ref_out;
2336             foreach my $i (0..@{$ref_in}-1) {
2337             foreach my $j (0..@{$ref_in->[$i]}-1) {
2338             $ref_out->[$i]->[$j] = $ref_in->[$i]->[$j];
2339             }
2340             }
2341             return $ref_out;
2342             }
2343              
2344             # Meant only for constructing a deep copy of an array of arrays for the case when
2345             # some elements of the top-level array may be undefined:
2346             sub deep_copy_AoA_with_nulls {
2347             my $ref_in = shift;
2348             my $ref_out;
2349             foreach my $i (0..@{$ref_in}-1) {
2350             if ( !defined $ref_in->[$i] ) {
2351             $ref_out->[$i] = undef;
2352             next;
2353             }
2354             foreach my $j (0..@{$ref_in->[$i]}-1) {
2355             $ref_out->[$i]->[$j] = $ref_in->[$i]->[$j];
2356             }
2357             }
2358             return $ref_out;
2359             }
2360              
2361             # Meant only for constructing a deep copy of a hash in which each value is an
2362             # anonymous array of numbers:
2363             sub deep_copy_hash {
2364             my $ref_in = shift;
2365             my $ref_out;
2366             while ( my ($key, $value) = each( %$ref_in ) ) {
2367             $ref_out->{$key} = deep_copy_array( $value );
2368             }
2369             return $ref_out;
2370             }
2371              
2372             # Meant only for an array of numbers:
2373             sub deep_copy_array {
2374             my $ref_in = shift;
2375             my $ref_out;
2376             foreach my $i (0..@{$ref_in}-1) {
2377             $ref_out->[$i] = $ref_in->[$i];
2378             }
2379             return $ref_out;
2380             }
2381              
2382             # from perl docs:
2383             sub fisher_yates_shuffle {
2384             my $arr = shift;
2385             my $i = @$arr;
2386             while (--$i) {
2387             my $j = int rand( $i + 1 );
2388             @$arr[$i, $j] = @$arr[$j, $i];
2389             }
2390             }
2391              
2392             sub mean_and_variance {
2393             my @data = @{shift @_};
2394             my ($mean, $variance);
2395             foreach my $i (1..@data) {
2396             if ($i == 1) {
2397             $mean = $data[0];
2398             $variance = 0;
2399             } else {
2400             # data[$i-1] because of zero-based indexing of vector
2401             $mean = ( (($i-1)/$i) * $mean ) + $data[$i-1] / $i;
2402             $variance = ( (($i-1)/$i) * $variance )
2403             + ($data[$i-1]-$mean)**2 / ($i-1);
2404             }
2405             }
2406             return ($mean, $variance);
2407             }
2408              
2409             sub check_for_illegal_params {
2410             my @params = @_;
2411             my @legal_params = qw / datafile
2412             mask
2413             K
2414             terminal_output
2415             max_em_iterations
2416             seeding
2417             class_priors
2418             seed_tags
2419             debug
2420             /;
2421             my $found_match_flag;
2422             foreach my $param (@params) {
2423             foreach my $legal (@legal_params) {
2424             $found_match_flag = 0;
2425             if ($param eq $legal) {
2426             $found_match_flag = 1;
2427             last;
2428             }
2429             }
2430             last if $found_match_flag == 0;
2431             }
2432             return $found_match_flag;
2433             }
2434              
2435             sub get_value_index_hash {
2436             my $arr = shift;
2437             my %hash;
2438             foreach my $index (0..@$arr-1) {
2439             $hash{$arr->[$index]} = $index if $arr->[$index] > 0;
2440             }
2441             return \%hash;
2442             }
2443              
2444             sub non_maximum_supression {
2445             my $arr = shift;
2446             my @output = (0) x @$arr;
2447             my @final_output = (0) x @$arr;
2448             my %hash;
2449             my @array_of_runs = ([$arr->[0]]);
2450             foreach my $index (1..@$arr-1) {
2451             if ($arr->[$index] == $arr->[$index-1]) {
2452             push @{$array_of_runs[-1]}, $arr->[$index];
2453             } else {
2454             push @array_of_runs, [$arr->[$index]];
2455             }
2456             }
2457             my $runstart_index = 0;
2458             foreach my $run_index (1..@array_of_runs-2) {
2459             $runstart_index += @{$array_of_runs[$run_index-1]};
2460             if ($array_of_runs[$run_index]->[0] >
2461             $array_of_runs[$run_index-1]->[0] &&
2462             $array_of_runs[$run_index]->[0] >
2463             $array_of_runs[$run_index+1]->[0]) {
2464             my $run_center = @{$array_of_runs[$run_index]} / 2;
2465             my $assignment_index = $runstart_index + $run_center;
2466             $output[$assignment_index] = $arr->[$assignment_index];
2467             }
2468             }
2469             if ($array_of_runs[-1]->[0] > $array_of_runs[-2]->[0]) {
2470             $runstart_index += @{$array_of_runs[-2]};
2471             my $run_center = @{$array_of_runs[-1]} / 2;
2472             my $assignment_index = $runstart_index + $run_center;
2473             $output[$assignment_index] = $arr->[$assignment_index];
2474             }
2475             if ($array_of_runs[0]->[0] > $array_of_runs[1]->[0]) {
2476             my $run_center = @{$array_of_runs[0]} / 2;
2477             $output[$run_center] = $arr->[$run_center];
2478             }
2479             return \@output;
2480             }
2481              
2482             sub display_matrix {
2483             my $message = shift;
2484             my $matrix = shift;
2485             if (!defined blessed($matrix)) {
2486             print "display_matrix called on a scalar value: $matrix\n";
2487             return;
2488             }
2489             my $nrows = $matrix->rows();
2490             my $ncols = $matrix->cols();
2491             print "$message ($nrows rows and $ncols columns)\n";
2492             foreach my $i (0..$nrows-1) {
2493             my $row = $matrix->row($i);
2494             my @row_as_list = $row->as_list;
2495             print "@row_as_list\n";
2496             }
2497             print "\n";
2498             }
2499              
2500             sub transpose {
2501             my $matrix = shift;
2502             my $num_rows = $matrix->rows();
2503             my $num_cols = $matrix->cols();
2504             my $transpose = Math::GSL::Matrix->new($num_cols, $num_rows);
2505             foreach my $i (0..$num_rows-1) {
2506             my @row = $matrix->row($i)->as_list;
2507             $transpose->set_col($i, \@row );
2508             }
2509             return $transpose;
2510             }
2511              
2512             sub vector_multiply {
2513             my $vec1 = shift;
2514             my $vec2 = shift;
2515             die "vec_multiply called with two vectors of different sizes"
2516             unless @$vec1 == @$vec2;
2517             my $result = 0;
2518             foreach my $i (0..@$vec1-1) {
2519             $result += $vec1->[$i] * $vec2->[$i];
2520             }
2521             return $result;
2522             }
2523              
2524             sub vector_2_vector_multiply {
2525             my $vec1 = shift;
2526             my $vec2 = shift;
2527             die "vec_multiply called with two vectors of different sizes"
2528             unless @$vec1 == @$vec2;
2529             my @result_vec;
2530             foreach my $i (0..@$vec1-1) {
2531             $result_vec[$i] = $vec1->[$i] * $vec2->[$i];
2532             }
2533             return \@result_vec;
2534             }
2535              
2536             sub matrix_multiply {
2537             my $matrix1 = shift;
2538             my $matrix2 = shift;
2539             my ($nrows1, $ncols1) = ($matrix1->rows(), $matrix1->cols());
2540             my ($nrows2, $ncols2) = ($matrix2->rows(), $matrix2->cols());
2541             die "matrix multiplication called with non-matching matrix arguments"
2542             # unless $ncols1 == $nrows2;
2543             unless $nrows1 == $ncols2 && $ncols1 == $nrows2;
2544             if ($nrows1 == 1) {
2545             my @row = $matrix1->row(0)->as_list;
2546             my @col = $matrix2->col(0)->as_list;
2547             my $result;
2548             foreach my $j (0..$ncols1-1) {
2549             $result += $row[$j] * $col[$j];
2550             }
2551             return $result;
2552             }
2553             my $product = Math::GSL::Matrix->new($nrows1, $nrows1);
2554             foreach my $i (0..$nrows1-1) {
2555             my $row = $matrix1->row($i);
2556             my @product_row;
2557             foreach my $j (0..$ncols2-1) {
2558             my $col = $matrix2->col($j);
2559             my $row_times_col = matrix_multiply($row, $col);
2560             push @product_row, $row_times_col;
2561             }
2562             $product->set_row($i, \@product_row);
2563             }
2564             return $product;
2565             }
2566              
2567             sub vector_matrix_multiply {
2568             my $matrix1 = shift;
2569             my $matrix2 = shift;
2570             my ($nrows1, $ncols1) = ($matrix1->rows, $matrix1->cols);
2571             my ($nrows2, $ncols2) = ($matrix2->rows, $matrix2->cols);
2572             die "matrix multiplication called with non-matching matrix arguments"
2573             unless $nrows1 == 1 && $ncols1 == $nrows2;
2574             if ($ncols2 == 1) {
2575             my @row = $matrix1->row(0)->as_list;
2576             my @col = $matrix2->col(0)->as_list;
2577             my $result;
2578             foreach my $j (0..$ncols1-1) {
2579             $result += $row[$j] * $col[$j];
2580             }
2581             return $result;
2582             }
2583             my $product = Math::GSL::Matrix->new(1, $ncols2);
2584             my $row = $matrix1->row(0);
2585             my @product_row;
2586             foreach my $j (0..$ncols2-1) {
2587             my $col = $matrix2->col($j);
2588             my $row_times_col = vector_matrix_multiply($row, $col);
2589             push @product_row, $row_times_col;
2590             }
2591             $product->set_row(0, \@product_row);
2592             return $product;
2593             }
2594              
2595             sub matrix_vector_multiply {
2596             my $matrix1 = shift;
2597             my $matrix2 = shift;
2598             my ($nrows1, $ncols1) = ($matrix1->rows(), $matrix1->cols());
2599             my ($nrows2, $ncols2) = ($matrix2->rows(), $matrix2->cols());
2600             die "matrix multiplication called with non-matching matrix arguments"
2601             unless $ncols1 == $nrows2 && $ncols2 == 1;
2602             if ($nrows1 == 1) {
2603             my @row = $matrix1->row(0)->as_list;
2604             my @col = $matrix2->col(0)->as_list;
2605             my $result;
2606             foreach my $j (0..$ncols1-1) {
2607             $result += $row[$j] * $col[$j];
2608             }
2609             return $result;
2610             }
2611             my $product = Math::GSL::Matrix->new($nrows1, 1);
2612             my $col = $matrix2->col(0);
2613             my @product_col;
2614             foreach my $i (0..$nrows1-1) {
2615             my $row = $matrix1->row($i);
2616             my $row_times_col = matrix_vector_multiply($row, $col);
2617             push @product_col, $row_times_col;
2618             }
2619             $product->set_col(0, \@product_col);
2620             return $product;
2621             }
2622              
2623             sub matrix_trace {
2624             my $matrix = shift;
2625             my ($nrows, $ncols) = ($matrix->rows(), $matrix->cols());
2626             die "trace can only be calculated for a square matrix"
2627             unless $ncols == $nrows;
2628             my @elements = $matrix->as_list;
2629             my $trace = 0;
2630             foreach my $i (0..$nrows-1) {
2631             $trace += $elements[$i + $i * $ncols];
2632             }
2633             return $trace;
2634             }
2635              
2636             1;
2637              
2638             =pod
2639              
2640             =head1 NAME
2641              
2642             Algorithm::ExpectationMaximization -- A Perl module for clustering numerical
2643             multi-dimensional data with the Expectation-Maximization algorithm.
2644              
2645             =head1 SYNOPSIS
2646              
2647             use Algorithm::ExpectationMaximization;
2648              
2649             # First name the data file:
2650              
2651             my $datafile = "mydatafile1.dat";
2652              
2653             # Next, set the mask to indicate which columns of the datafile to use for
2654             # clustering and which column contains a symbolic ID for each data record. For
2655             # example, if the symbolic name is in the first column, you want the second column
2656             # to be ignored, and you want the next three columns to be used for 3D clustering:
2657              
2658             my $mask = "N0111";
2659              
2660             # Now construct an instance of the clusterer. The parameter `K' controls the
2661             # number of clusters. Here is an example call to the constructor for instance
2662             # creation:
2663              
2664             my $clusterer = Algorithm::ExpectationMaximization->new(
2665             datafile => $datafile,
2666             mask => $mask,
2667             K => 3,
2668             max_em_iterations => 300,
2669             seeding => 'random',
2670             terminal_output => 1,
2671             );
2672            
2673             # Note the choice for `seeding'. The choice `random' means that the clusterer will
2674             # randomly select `K' data points to serve as initial cluster centers. Other
2675             # possible choices for the constructor parameter `seeding' are `kmeans' and
2676             # `manual'. With the `kmeans' option for `seeding', the output of a K-means
2677             # clusterer is used for the cluster seeds and the initial cluster covariances. If
2678             # you use the `manual' option for seeding, you must also specify the data elements
2679             # to use for seeding the clusters.
2680              
2681             # Here is an example of a call to the constructor when we choose the `manual'
2682             # option for seeding the clusters and for specifying the data elements for
2683             # seeding. The data elements are specified by their tag names. In this case,
2684             # these names are `a26', `b53', and `c49':
2685              
2686             my $clusterer = Algorithm::ExpectationMaximization->new(
2687             datafile => $datafile,
2688             mask => $mask,
2689             class_priors => [0.6, 0.2, 0.2],
2690             K => 3,
2691             max_em_iterations => 300,
2692             seeding => 'manual',
2693             seed_tags => ['a26', 'b53', 'c49'],
2694             terminal_output => 1,
2695             );
2696              
2697             # This example call to the constructor also illustrates how you can inject class
2698             # priors into the clustering process. The class priors are the prior probabilities
2699             # of the class distributions in your dataset. As explained later, injecting class
2700             # priors in the manner shown above makes statistical sense only for the case of
2701             # manual seeding. When you do inject class priors, the order in which the priors
2702             # are expressed must correspond to the manually specified seeds for the clusters.
2703              
2704             # After the invocation of the constructor, the following calls are mandatory
2705             # for reasons that should be obvious from the names of the methods:
2706              
2707             $clusterer->read_data_from_file();
2708             srand(time);
2709             $clusterer->seed_the_clusters();
2710             $clusterer->EM();
2711             $clusterer->run_bayes_classifier();
2712             my $clusters = $clusterer->return_disjoint_clusters();
2713              
2714             # where the call to `EM()' is the invocation of the expectation-maximization
2715             # algorithm. The call to `srand(time)' is to seed the pseudo random number
2716             # generator afresh for each run of the cluster seeding procedure. If you want to
2717             # see repeatable results from one run to another of the algorithm with random
2718             # seeding, you would obviously not invoke `srand(time)'.
2719              
2720             # The call `run_bayes_classifier()' shown above carries out a disjoint clustering
2721             # of all the data points using the naive Bayes' classifier. And the call
2722             # `return_disjoint_clusters()' returns the clusters thus formed to you. Once you
2723             # have obtained access to the clusters in this manner, you can display them in
2724             # your terminal window by
2725              
2726             foreach my $index (0..@$clusters-1) {
2727             print "Cluster $index (Naive Bayes): @{$clusters->[$index]}\n\n"
2728             }
2729              
2730             # If you would like to also see the clusters purely on the basis of the posterior
2731             # class probabilities exceeding a threshold, call
2732              
2733             my $theta1 = 0.2;
2734             my $posterior_prob_clusters =
2735             $clusterer->return_clusters_with_posterior_probs_above_threshold($theta1);
2736              
2737             # where you can obviously set the threshold $theta1 to any value you wish. Note
2738             # that now you may end up with clusters that overlap. You can display them in
2739             # your terminal window in the same manner as shown above for the naive Bayes'
2740             # clusters.
2741              
2742             # You can write the naive Bayes' clusters out to files, one cluster per file, by
2743             # calling
2744              
2745             $clusterer->write_naive_bayes_clusters_to_files();
2746              
2747             # The clusters are placed in files with names like
2748              
2749             naive_bayes_cluster1.dat
2750             naive_bayes_cluster2.dat
2751             ...
2752              
2753             # In the same manner, you can write out the posterior probability based possibly
2754             # overlapping clusters to files by calling:
2755              
2756             $clusterer->write_posterior_prob_clusters_above_threshold_to_files($theta1);
2757              
2758             # where the threshold $theta1 sets the probability threshold for deciding which
2759             # data elements to place in a cluster. These clusters are placed in files with
2760             # names like
2761              
2762             posterior_prob_cluster1.dat
2763             posterior_prob_cluster2.dat
2764             ...
2765              
2766             # CLUSTER VISUALIZATION:
2767              
2768             # You must first set the mask for cluster visualization. This mask tells the
2769             # module which 2D or 3D subspace of the original data space you wish to visualize
2770             # the clusters in:
2771              
2772             my $visualization_mask = "111";
2773             $clusterer->visualize_clusters($visualization_mask);
2774             $clusterer->visualize_distributions($visualization_mask);
2775             $clusterer->plot_hardcopy_clusters($visualization_mask);
2776             $clusterer->plot_hardcopy_distributions($visualization_mask);
2777              
2778             # where the last two invocations are for writing out the PNG plots of the
2779             # visualization displays to disk files. The PNG image of the posterior
2780             # probability distributions is written out to a file named posterior_prob_plot.png
2781             # and the PNG image of the disjoint clusters to a file called cluster_plot.png.
2782              
2783             # SYNTHETIC DATA GENERATION:
2784              
2785             # The module has been provided with a class method for generating multivariate
2786             # data for experimenting with the EM algorithm. The data generation is controlled
2787             # by the contents of a parameter file that is supplied as an argument to the data
2788             # generator method. The priors, the means, and the covariance matrices in the
2789             # parameter file must be according to the syntax shown in the `param1.txt' file in
2790             # the `examples' directory. It is best to edit a copy of this file for your
2791             # synthetic data generation needs.
2792              
2793             my $parameter_file = "param1.txt";
2794             my $out_datafile = "mydatafile1.dat";
2795             Algorithm::ExpectationMaximization->cluster_data_generator(
2796             input_parameter_file => $parameter_file,
2797             output_datafile => $out_datafile,
2798             total_number_of_data_points => $N );
2799              
2800             # where the value of $N is the total number of data points you would like to see
2801             # generated for all of the Gaussians. How this total number is divided up amongst
2802             # the Gaussians is decided by the prior probabilities for the Gaussian components
2803             # as declared in input parameter file. The synthetic data may be visualized in a
2804             # terminal window and the visualization written out as a PNG image to a diskfile
2805             # by
2806              
2807             my $data_visualization_mask = "11";
2808             $clusterer->visualize_data($data_visualization_mask);
2809             $clusterer->plot_hardcopy_data($data_visualization_mask);
2810              
2811              
2812             =head1 CHANGES
2813              
2814             Version 1.21 incorporates minor code clean up. Overall, the module implementation
2815             remains unchanged.
2816              
2817             Version 1.2 allows the module to also be used for 1-D data. The visualization code
2818             for 1-D shows the clusters through their histograms.
2819              
2820             Version 1.1 incorporates much cleanup of the documentation associated with the
2821             module. Both the top-level module documentation, especially the Description part,
2822             and the comments embedded in the code were revised for better utilization of the
2823             module. The basic implementation code remains unchanged.
2824              
2825              
2826             =head1 DESCRIPTION
2827              
2828             B is a I module for the
2829             Expectation-Maximization (EM) method of clustering numerical data that lends itself
2830             to modeling as a Gaussian mixture. Since the module is entirely in Perl (in the
2831             sense that it is not a Perl wrapper around a C library that actually does the
2832             clustering), the code in the module can easily be modified to experiment with several
2833             aspects of EM.
2834              
2835             Gaussian Mixture Modeling (GMM) is based on the assumption that the data consists of
2836             C Gaussian components, each characterized by its own mean vector and its own
2837             covariance matrix. Obviously, given observed data for clustering, we do not know
2838             which of the C Gaussian components was responsible for any of the data elements.
2839             GMM also associates a prior probability with each Gaussian component. In general,
2840             these priors will also be unknown. So the problem of clustering consists of
2841             estimating the posterior class probability at each data element and also estimating
2842             the class priors. Once these posterior class probabilities and the priors are
2843             estimated with EM, we can use the naive Bayes' classifier to partition the data into
2844             disjoint clusters. Or, for "soft" clustering, we can find all the data elements that
2845             belong to a Gaussian component on the basis of the posterior class probabilities at
2846             the data elements exceeding a prescribed threshold.
2847              
2848             If you do not mind the fact that it is possible for the EM algorithm to occasionally
2849             get stuck in a local maximum and to, therefore, produce a wrong answer even when you
2850             know the data to be perfectly multimodal Gaussian, EM is probably the most magical
2851             approach to clustering multidimensional data. Consider the case of clustering
2852             three-dimensional data. Each Gaussian cluster in 3D space is characterized by the
2853             following 10 variables: the 6 unique elements of the C<3x3> covariance matrix (which
2854             must be symmetric positive-definite), the 3 unique elements of the mean, and the
2855             prior associated with the Gaussian. Now let's say you expect to see six Gaussians in
2856             your data. What that means is that you would want the values for 59 variables
2857             (remember the unit-summation constraint on the class priors which reduces the overall
2858             number of variables by one) to be estimated by the algorithm that seeks to discover
2859             the clusters in your data. What's amazing is that, despite the large number of
2860             variables that must be optimized simultaneously, the EM algorithm will very likely
2861             give you a good approximation to the right answer.
2862              
2863             At its core, EM depends on the notion of unobserved data and the averaging of the
2864             log-likelihood of the data actually observed over all admissible probabilities for
2865             the unobserved data. But what is unobserved data? While in some cases where EM is
2866             used, the unobserved data is literally the missing data, in others, it is something
2867             that cannot be seen directly but that nonetheless is relevant to the data actually
2868             observed. For the case of clustering multidimensional numerical data that can be
2869             modeled as a Gaussian mixture, it turns out that the best way to think of the
2870             unobserved data is in terms of a sequence of random variables, one for each observed
2871             data point, whose values dictate the selection of the Gaussian for that data point.
2872             This point is explained in great detail in my on-line tutorial at
2873             L.
2874              
2875             The EM algorithm in our context reduces to an iterative invocation of the following
2876             steps: (1) Given the current guess for the means and the covariances of the different
2877             Gaussians in our mixture model, use Bayes' Rule to update the posterior class
2878             probabilities at each of the data points; (2) Using the updated posterior class
2879             probabilities, first update the class priors; (3) Using the updated class priors,
2880             update the class means and the class covariances; and go back to Step (1). Ideally,
2881             the iterations should terminate when the expected log-likelihood of the observed data
2882             has reached a maximum and does not change with any further iterations. The stopping
2883             rule used in this module is the detection of no change over three consecutive
2884             iterations in the values calculated for the priors.
2885              
2886             This module provides three different choices for seeding the clusters: (1) random,
2887             (2) kmeans, and (3) manual. When random seeding is chosen, the algorithm randomly
2888             selects C data elements as cluster seeds. That is, the data vectors associated
2889             with these seeds are treated as initial guesses for the means of the Gaussian
2890             distributions. The covariances are then set to the values calculated from the entire
2891             dataset with respect to the means corresponding to the seeds. With kmeans seeding, on
2892             the other hand, the means and the covariances are set to whatever values are returned
2893             by the kmeans algorithm. And, when seeding is set to manual, you are allowed to
2894             choose C data elements --- by specifying their tag names --- for the seeds. The
2895             rest of the EM initialization for the manual mode is the same as for the random mode.
2896             The algorithm allows for the initial priors to be specified for the manual mode of
2897             seeding.
2898              
2899             Much of code for the kmeans based seeding of EM was drawn from the
2900             C module by me. The code from that module used here corresponds to
2901             the case when the C option in the C module is set
2902             to C. The C option for KMeans consists of subjecting the data to a
2903             principal components analysis (PCA) to discover the direction of maximum variance in
2904             the data space. The data points are then projected on to this direction and a
2905             histogram constructed from the projections. Centers of the C largest peaks in
2906             this smoothed histogram are used to seed the KMeans based clusterer. As you'd
2907             expect, the output of the KMeans used to seed the EM algorithm.
2908              
2909             This module uses two different criteria to measure the quality of the clustering
2910             achieved. The first is the Minimum Description Length (MDL) proposed originally by
2911             Rissanen (J. Rissanen: "Modeling by Shortest Data Description," Automatica, 1978, and
2912             "A Universal Prior for Integers and Estimation by Minimum Description Length," Annals
2913             of Statistics, 1983.) The MDL criterion is a difference of a log-likelihood term for
2914             all of the observed data and a model-complexity penalty term. In general, both the
2915             log-likelihood and the model-complexity terms increase as the number of clusters
2916             increases. The form of the MDL criterion in this module uses for the penalty term
2917             the Bayesian Information Criterion (BIC) of G. Schwartz, "Estimating the Dimensions
2918             of a Model," The Annals of Statistics, 1978. In general, the smaller the value of
2919             MDL quality measure, the better the clustering of the data.
2920              
2921             For our second measure of clustering quality, we use `trace( SW^-1 . SB)' where SW is
2922             the within-class scatter matrix, more commonly denoted S_w, and SB the between-class
2923             scatter matrix, more commonly denoted S_b (the underscore means subscript). This
2924             measure can be thought of as the normalized average distance between the clusters,
2925             the normalization being provided by average cluster covariance SW^-1. Therefore, the
2926             larger the value of this quality measure, the better the separation between the
2927             clusters. Since this measure has its roots in the Fisher linear discriminant
2928             function, we incorporate the word C in the name of the quality measure.
2929             I When the
2930             clusters exhibit significant overlap, the numbers produced by this quality measure
2931             tend to be generally meaningless.
2932              
2933             =head1 METHODS
2934              
2935             The module provides the following methods for EM based
2936             clustering, for cluster visualization, for data
2937             visualization, and for the generation of data for testing a
2938             clustering algorithm:
2939              
2940             =over
2941              
2942             =item B
2943              
2944             A call to C constructs a new instance of the
2945             C class. A typical form
2946             of this call when you want to use random option for seeding
2947             the algorithm is:
2948              
2949             my $clusterer = Algorithm::ExpectationMaximization->new(
2950             datafile => $datafile,
2951             mask => $mask,
2952             K => 3,
2953             max_em_iterations => 300,
2954             seeding => 'random',
2955             terminal_output => 1,
2956             );
2957              
2958             where C is the expected number of clusters and
2959             C the maximum number of EM iterations
2960             that you want to allow until convergence is achieved.
2961             Depending on your dataset and on the choice of the initial
2962             seeds, the actual number of iterations used could be as few
2963             as 10 and as many as reaching 300. The output produced by
2964             the algorithm shows the actual number of iterations used to
2965             arrive at convergence.
2966              
2967             The data file supplied through the C option is
2968             expected to contain entries in the following format
2969              
2970             c20 0 10.7087017086940 9.63528386251712 10.9512155258108 ...
2971             c7 0 12.8025925026787 10.6126270065785 10.5228482095349 ...
2972             b9 0 7.60118206283120 5.05889245193079 5.82841781759102 ...
2973             ....
2974             ....
2975              
2976             where the first column contains the symbolic ID tag for each
2977             data record and the rest of the columns the numerical
2978             information. As to which columns are actually used for
2979             clustering is decided by the string value of the mask. For
2980             example, if we wanted to cluster on the basis of the entries
2981             in just the 3rd, the 4th, and the 5th columns above, the
2982             mask value would be C where the character C
2983             indicates that the ID tag is in the first column, the
2984             character C<0> that the second column is to be ignored, and
2985             the C<1>'s that follow that the 3rd, the 4th, and the 5th
2986             columns are to be used for clustering.
2987              
2988             If instead of random seeding, you wish to use the kmeans
2989             based seeding, just replace the option C supplied
2990             for C by C. You can also do manual seeding
2991             by designating a specified set of data elements to serve as
2992             cluster seeds. The call to the constructor in this case
2993             looks like
2994              
2995             my $clusterer = Algorithm::ExpectationMaximization->new(
2996             datafile => $datafile,
2997             mask => $mask,
2998             K => 3,
2999             max_em_iterations => 300,
3000             seeding => 'manual',
3001             seed_tags => ['a26', 'b53', 'c49'],
3002             terminal_output => 1,
3003             );
3004              
3005             where the option C is set to an anonymous array
3006             of symbolic names associated with the data elements.
3007              
3008             If you know the class priors, you can supply them through an
3009             additional option to the constructor that looks like
3010              
3011             class_priors => [0.6, 0.2, 0.2],
3012              
3013             for the case of C equal to 3. B
3014             be a useful thing to do only for the case of manual
3015             seeding.> If you go for manual seeding, the order in which
3016             the priors are expressed should correspond to the order of
3017             the manually chosen tags supplied through the C
3018             option.
3019              
3020             Note that the parameter C is boolean; when
3021             not supplied in the call to C it defaults to 0. When
3022             set, this parameter displays useful information in the
3023             window of the terminal screen in which you invoke the
3024             algorithm.
3025              
3026             =item B
3027              
3028             $clusterer->read_data_from_file()
3029              
3030             This is a required call after the constructor is invoked. As
3031             you would expect, this call reads in the data for
3032             clustering.
3033              
3034             =item B
3035              
3036             $clusterer->seed_the_clusters();
3037              
3038             This is also a required call. It processes the option you
3039             supplied for C in the constructor call to choose
3040             the data elements for seeding the C clusters.
3041              
3042             =item B
3043              
3044             $clusterer->EM();
3045              
3046             This is the workhorse of the module, as you would expect.
3047             The means, the covariances, and the priors estimated by this
3048             method are stored in instance variables that are subsequently
3049             accessed by other methods for the purpose of displaying the
3050             clusters, the probability distributions, etc.
3051              
3052             =item B
3053              
3054             $clusterer->run_bayes_classifier();
3055              
3056             Using the posterior probability distributions estimated by
3057             the C method, this method partitions the data into the
3058             C disjoint clusters using the naive Bayes' classifier.
3059              
3060             =item B
3061              
3062             my $clusters = $clusterer->return_disjoint_clusters();
3063              
3064             This allows you to access the clusters obtained with the
3065             application of the naive Bayes' classifier in your own
3066             scripts. If, say, you wanted to see the data records placed
3067             in each cluster, you could subsequently invoke the following
3068             loop in your own script:
3069              
3070             foreach my $index (0..@$clusters-1) {
3071             print "Cluster $index (Naive Bayes): @{$clusters->[$index]}\n\n"
3072             }
3073              
3074             where C<$clusters> holds the array reference returned by the
3075             call to C.
3076              
3077             =item B
3078              
3079             $clusterer->write_naive_bayes_clusters_to_files();
3080              
3081             This method writes the clusters obtained by applying the
3082             naive Bayes' classifier to disk files, one cluster per
3083             file. What is written out to each file consists of the
3084             symbolic names of the data records that belong to the
3085             cluster corresponding to that file. The clusters are placed
3086             in files with names like
3087              
3088             naive_bayes_cluster1.dat
3089             naive_bayes_cluster2.dat
3090             ...
3091              
3092             =item B
3093              
3094             my $theta1 = 0.2;
3095             my $posterior_prob_clusters =
3096             $clusterer->return_clusters_with_posterior_probs_above_threshold($theta1);
3097              
3098             This method returns a reference to an array of C
3099             anonymous arrays, each consisting of the symbolic names for
3100             the data records where the posterior class probability
3101             exceeds the threshold as specified by C<$theta1>.
3102             Subsequently, you can access each of these
3103             posterior-probability based clusters through a loop
3104             construct such as
3105              
3106             foreach my $index (0..@$posterior_prob_clusters-1) {
3107             print "Cluster $index (based on posterior probs exceeding $theta1): " .
3108             "@{$posterior_prob_clusters->[$index]}\n\n"
3109             }
3110              
3111             =item B
3112              
3113             $clusterer->write_posterior_prob_clusters_above_threshold_to_files($theta1);
3114              
3115             This call writes out the posterior-probability based soft
3116             clusters to disk files. As in the previous method, the
3117             threshold C<$theta1> sets the probability threshold for
3118             deciding which data elements belong to a cluster. These
3119             clusters are placed in files with names like
3120              
3121             posterior_prob_cluster1.dat
3122             posterior_prob_cluster2.dat
3123             ...
3124              
3125             =item B
3126              
3127             my $theta2 = 0.00001;
3128             my $class_distributions =
3129             $clusterer->return_individual_class_distributions_above_given_threshold($theta2);
3130              
3131             This is the method to call if you wish to see the individual
3132             Gaussians in your own script. The method returns a reference
3133             to an array of anonymous arrays, with each anonymous array
3134             representing data membership in each Gaussian. Only those
3135             data points are included in each Gaussian where the
3136             probability exceeds the threshold C<$theta2>. Note that the
3137             larger the covariance and the higher the data
3138             dimensionality, the smaller this threshold must be for you
3139             to see any of the data points in a Gaussian. After you have
3140             accessed the Gaussian mixture in this manner, you can
3141             display the data membership in each Gaussian through the
3142             following sort of a loop:
3143              
3144             foreach my $index (0..@$class_distributions-1) {
3145             print "Gaussian Distribution $index (only shows data elements " .
3146             "whose probabilities exceed the threshold $theta2: " .
3147             "@{$class_distributions->[$index]}\n\n"
3148             }
3149              
3150             =item B
3151              
3152             my $visualization_mask = "11";
3153             $clusterer->visualize_clusters($visualization_mask);
3154              
3155             The visualization mask here does not have to be identical to
3156             the one used for clustering, but must be a subset of that
3157             mask. This is convenient for visualizing the clusters in
3158             two- or three-dimensional subspaces of the original space.
3159             The subset is specified by placing `0's in the positions
3160             corresponding to the dimensions you do NOT want to see
3161             through visualization. Depending on the mask used, this
3162             method creates a 2D or a 3D scatter plot of the clusters
3163             obtained through the naive Bayes' classification rule.
3164              
3165             =item B
3166              
3167             $clusterer->visualize_distributions($visualization_mask);
3168              
3169             This is the method to call if you want to visualize the soft
3170             clustering corresponding to the posterior class
3171             probabilities exceeding the threshold specified in the call
3172             to
3173             C.
3174             Again, depending on the visualization mask used, you will
3175             see either a 2D plot or a 3D scatter plot.
3176              
3177             =item B
3178              
3179             $clusterer->plot_hardcopy_clusters($visualization_mask);
3180              
3181             This method create a PNG file from the C created
3182             display of the naive Bayes' clusters obtained from the data.
3183             The plotting functionality of C is accessed through
3184             the Perl wrappers provided by the C
3185             module.
3186              
3187             =item B
3188              
3189             $clusterer->plot_hardcopy_distributions($visualization_mask);
3190              
3191             This method create a PNG file from the C created
3192             display of the clusters that correspond to the posterior
3193             class probabilities exceeding a specified threshold. The
3194             plotting functionality of C is accessed through the
3195             Perl wrappers provided by the C module.
3196              
3197             =item B
3198              
3199             $clusterer->display_fisher_quality_vs_iterations();
3200              
3201             This method measures the quality of clustering by
3202             calculating the normalized average squared distance between
3203             the cluster centers, the normalization being provided by the
3204             average cluster covariance. See the Description for further
3205             details. In general, this measure is NOT useful for
3206             overlapping clusters.
3207              
3208             =item B
3209              
3210             $clusterer->display_mdl_quality_vs_iterations();
3211              
3212             At the end of each iteration, this method measures the
3213             quality of clustering my calculating its MDL (Minimum
3214             Description Length). As stated earlier in Description, the
3215             MDL measure is a difference of a log-likelihood term for all
3216             of the observed data and a model-complexity penalty term.
3217             The smaller the value returned by this method, the better
3218             the clustering.
3219              
3220             =item B
3221              
3222             my $estimated_priors = $clusterer->return_estimated_priors();
3223             print "Estimated class priors: @$estimated_priors\n";
3224              
3225             This method can be used to access the final values of the
3226             class priors as estimated by the EM algorithm.
3227              
3228             =item B
3229              
3230             Algorithm::ExpectationMaximization->cluster_data_generator(
3231             input_parameter_file => $parameter_file,
3232             output_datafile => $out_datafile,
3233             total_number_of_data_points => 300
3234             );
3235              
3236             for generating multivariate data for clustering if you wish to play with synthetic
3237             data for experimenting with the EM algorithm. The input parameter file must specify
3238             the priors to be used for the Gaussians, their means, and their covariance matrices.
3239             The format of the information contained in the parameter file must be as shown in the
3240             file C provided in the C directory. It will be easiest for you
3241             to just edit a copy of this file for your data generation needs. In addition to the
3242             format of the parameter file, the main constraint you need to observe in specifying
3243             the parameters is that the dimensionality of the covariance matrices must correspond
3244             to the dimensionality of the mean vectors. The multivariate random numbers are
3245             generated by calling the C module. As you would expect, this module
3246             requires that the covariance matrices you specify in your parameter file be symmetric
3247             and positive definite. Should the covariances in your parameter file not obey this
3248             condition, the C module will let you know.
3249              
3250             =item B
3251              
3252             $clusterer->visualize_data($data_visualization_mask);
3253              
3254             This is the method to call if you want to visualize the data
3255             you plan to cluster with the EM algorithm. You'd need to
3256             specify argument mask in a manner similar to the
3257             visualization of the clusters, as explained earlier.
3258              
3259             =item B
3260              
3261             $clusterer->plot_hardcopy_data($data_visualization_mask);
3262              
3263             This method creates a PNG file that can be used to print out
3264             a hardcopy of the data in different 2D and 3D subspaces of
3265             the data space. The visualization mask is used to select the
3266             subspace for the PNG image.
3267              
3268             =back
3269              
3270             =head1 HOW THE CLUSTERS ARE OUTPUT
3271              
3272             This module produces two different types of clusters: the "hard" clusters and the
3273             "soft" clusters. The hard clusters correspond to the naive Bayes' classification of
3274             the data points on the basis of the Gaussian distributions and the class priors
3275             estimated by the EM algorithm. Such clusters partition the data into disjoint
3276             subsets. On the other hand, the soft clusters correspond to the posterior class
3277             probabilities calculated by the EM algorithm. A data element belongs to a cluster if
3278             its posterior probability for that Gaussian exceeds a threshold.
3279              
3280             After the EM algorithm has finished running, the hard clusters are created by
3281             invoking the method C on an instance of the module and then
3282             made user-accessible by calling C. These clusters may
3283             then be displayed in a terminal window by dereferencing each element of the array
3284             whose reference is returned b C. The hard clusters can
3285             be written out to disk files by invoking C.
3286             This method writes out the clusters to files, one cluster per file. What is written
3287             out to each file consists of the symbolic names of the data records that belong to
3288             the cluster corresponding to that file. The clusters are placed in files with names
3289             like
3290              
3291             naive_bayes_cluster1.dat
3292             naive_bayes_cluster2.dat
3293             ...
3294              
3295             The soft clusters on the other hand are created by calling
3296             C
3297             on an instance of the module, where the argument C<$theta1>
3298             is the threshold for deciding whether a data element belongs
3299             in a soft cluster. The posterior class probability at a
3300             data element must exceed the threshold for the element to
3301             belong to the corresponding cluster. The soft cluster can
3302             be written out to disk files by calling
3303             C.
3304             As with the hard clusters, each cluster is placed in a separate
3305             file. The filenames for such clusters look like:
3306              
3307             posterior_prob_cluster1.dat
3308             posterior_prob_cluster2.dat
3309             ...
3310              
3311             =head1 WHAT IF THE NUMBER OF CLUSTERS IS UNKNOWN?
3312              
3313             The module constructor requires that you supply a value for the parameter C, which
3314             is the number of clusters you expect to see in the data. But what if you do not have
3315             a good value for C? Note that it is possible to search for the best C by using
3316             the two clustering quality criteria included in the module. However, I have
3317             intentionally not yet incorporated that feature in the module because it slows down
3318             the execution of the code --- especially when the dimensionality of the data
3319             increases. However, nothing prevents you from writing a script --- along the lines
3320             of the five "canned_example" scripts in the C directory --- that would use
3321             the two clustering quality metrics for finding the best choice for C for a given
3322             dataset. Obviously, you will now have to incorporate the call to the constructor in
3323             a loop and check the value of the quality measures for each value of C.
3324              
3325             =head1 SOME RESULTS OBTAINED WITH THIS MODULE
3326              
3327             If you would like to see some results that have been obtained with this module, check
3328             out Section 7 of the report
3329             L.
3330              
3331             =head1 THE C DIRECTORY
3332              
3333             Becoming familiar with this directory should be your best
3334             strategy to become comfortable with this module (and its
3335             future versions). You are urged to start by executing the
3336             following five example scripts:
3337              
3338             =over 16
3339              
3340             =item I
3341              
3342             This example applies the EM algorithm to the data contained in the datafile
3343             C. The mixture data in the file corresponds to three overlapping
3344             Gaussian components in a star-shaped pattern. The EM based clustering for this data
3345             is shown in the files C and
3346             C, the former displaying the hard clusters
3347             obtained by using the naive Bayes' classifier and the latter showing the soft
3348             clusters obtained on the basis of the posterior class probabilities at the data
3349             points.
3350              
3351             =item I
3352              
3353             The datafile used in this example is C. This mixture data
3354             corresponds to two well-separated relatively isotropic Gaussians. EM based clustering for this
3355             data is shown in the files C and
3356             C, the former displaying the hard clusters
3357             obtained by using the naive Bayes' classifier and the latter showing the soft
3358             clusters obtained by using the posterior class probabilities at the data points.
3359              
3360             =item I
3361              
3362             Like the first example, this example again involves three Gaussians, but now their
3363             means are not co-located. Additionally, we now seed the clusters manually by
3364             specifying three selected data points as the initial guesses for the cluster means.
3365             The datafile used for this example is C. The EM based clustering
3366             for this data is shown in the files C and
3367             C, the former displaying the hard clusters
3368             obtained by using the naive Bayes' classifier and the latter showing the soft
3369             clusters obtained on the basis of the posterior class probabilities at the data
3370             points.
3371              
3372             =item I
3373              
3374             Whereas the three previous examples demonstrated EM based clustering of 2D data, we
3375             now present an example of clustering in 3D. The datafile used in this example is
3376             C. This mixture data corresponds to three well-separated but highly
3377             anisotropic Gaussians. The EM derived clustering for this data is shown in the files
3378             C and C, the
3379             former displaying the hard clusters obtained by using the naive Bayes' classifier and
3380             the latter showing the soft clusters obtained on the basis of the posterior class
3381             probabilities at the data points.
3382              
3383             =item I
3384              
3385             We again demonstrate clustering in 3D but now we have one Gaussian cluster that
3386             "cuts" through the other two Gaussian clusters. The datafile used in this example is
3387             C. The three Gaussians in this case are highly overlapping and
3388             highly anisotropic. The EM derived clustering for this data is shown in the files
3389             C and C, the
3390             former displaying the hard clusters obtained by using the naive Bayes' classifier and
3391             the latter showing the soft clusters obtained through the posterior class
3392             probabilities at the data points.
3393              
3394             =item I
3395              
3396             This example, added in Version 1.2, demonstrates the use of this module for 1-D data.
3397             In order to visualize the clusters for the 1-D case, we show them through their
3398             respective histograms. The datafile used in this example is C. The
3399             data consists of two overlapping Gaussians. The EM derived clustering for this data
3400             is shown in the files C and
3401             C, the former displaying the hard clusters
3402             obtained by using the naive Bayes' classifier and the latter showing the soft
3403             clusters obtained through the posterior class probabilities at the data points.
3404              
3405             =back
3406              
3407             Going through the six examples listed above will make you familiar with how to make
3408             the calls to the clustering and the visualization methods. The C directory
3409             also includes several parameter files with names like
3410              
3411             param1.txt
3412             param2.txt
3413             param3.txt
3414             ...
3415              
3416             These were used to generate the synthetic data for which the results are shown in the
3417             C directory. Just make a copy of one of these files and edit it if you
3418             would like to generate your own multivariate data for clustering. Note that you can
3419             generate data with any dimensionality through appropriate entries in the parameter
3420             file.
3421              
3422             =head1 CAVEATS
3423              
3424             When you run the scripts in the C directory, your results will NOT always
3425             look like what I have shown in the PNG image files in the directory. As mentioned
3426             earlier in Description, the EM algorithm starting from randomly chosen initial
3427             guesses for the cluster means can get stuck in a local maximum.
3428              
3429             That raises an interesting question of how one judges the correctness of clustering
3430             results when dealing with real experimental data. For real data, the best approach
3431             is to try the EM algorithm multiple times with all of the seeding options included in
3432             this module. It would be safe to say that, at least in low dimensional spaces and
3433             with sufficient data, a majority of your runs should yield "correct" results.
3434              
3435             Also bear in mind that a pure Perl implementation is not meant for the clustering of
3436             very large data files. It is really designed more for researching issues related to
3437             EM based approaches to clustering.
3438              
3439              
3440             =head1 REQUIRED
3441              
3442             This module requires the following three modules:
3443              
3444             Math::Random
3445             Graphics::GnuplotIF
3446             Math::GSL::Matrix
3447              
3448             the first for generating the multivariate random numbers, the second for the
3449             visualization of the clusters, and the last for access to the Perl wrappers for the
3450             GNU Scientific Library. The C module of this library is used for various
3451             algebraic operations on the covariance matrices of the Gaussians.
3452              
3453             =head1 EXPORT
3454              
3455             None by design.
3456              
3457              
3458             =head1 BUGS
3459              
3460             Please notify the author if you encounter any bugs. When sending email, please place
3461             the string 'Algorithm EM' in the subject line.
3462              
3463             =head1 INSTALLATION
3464              
3465             Download the archive from CPAN in any directory of your choice. Unpack the archive
3466             with a command that on a Linux machine would look like:
3467              
3468             tar zxvf Algorithm-ExpectationMaximization-1.21.tar.gz
3469              
3470             This will create an installation directory for you whose name will be
3471             C. Enter this directory and execute the
3472             following commands for a standard install of the module if you have root privileges:
3473              
3474             perl Makefile.PL
3475             make
3476             make test
3477             sudo make install
3478              
3479             If you do not have root privileges, you can carry out a non-standard install the
3480             module in any directory of your choice by:
3481              
3482             perl Makefile.PL prefix=/some/other/directory/
3483             make
3484             make test
3485             make install
3486              
3487             With a non-standard install, you may also have to set your PERL5LIB environment
3488             variable so that this module can find the required other modules. How you do that
3489             would depend on what platform you are working on. In order to install this module in
3490             a Linux machine on which I use tcsh for the shell, I set the PERL5LIB environment
3491             variable by
3492              
3493             setenv PERL5LIB /some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
3494              
3495             If I used bash, I'd need to declare:
3496              
3497             export PERL5LIB=/some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
3498              
3499              
3500             =head1 ACKNOWLEDGMENTS
3501              
3502             Version 1.2 is a result of the feedback received from Paul
3503             May of University of Birmingham. Thanks, Paul!
3504              
3505             =head1 AUTHOR
3506              
3507             Avinash Kak, kak@purdue.edu
3508              
3509             If you send email, please place the string "EM Algorithm" in your
3510             subject line to get past my spam filter.
3511              
3512             =head1 COPYRIGHT
3513              
3514             This library is free software; you can redistribute it and/or modify it under the
3515             same terms as Perl itself.
3516              
3517             Copyright 2014 Avinash Kak
3518              
3519             =cut
3520