File Coverage

blib/lib/Algorithm/ExpectationMaximization.pm
Criterion Covered Total %
statement 22 24 91.6
branch n/a
condition n/a
subroutine 8 8 100.0
pod n/a
total 30 32 93.7


line stmt bran cond sub pod time code
1             package Algorithm::ExpectationMaximization;
2              
3             #---------------------------------------------------------------------------
4             # Copyright (c) 2013 Avinash Kak. All rights reserved. This program is free
5             # software. You may modify and/or distribute it under the same terms as Perl itself.
6             # This copyright notice must remain attached to the file.
7             #
8             # Algorithm::ExpectationMaximization is a pure Perl implementation for
9             # Expectation-Maximization based clustering of multi-dimensional data that can be
10             # modeled as a Gaussian mixture.
11             # ---------------------------------------------------------------------------
12              
13 1     1   22045 use 5.10.0;
  1         4  
  1         54  
14 1     1   6 use strict;
  1         2  
  1         33  
15 1     1   5 use warnings;
  1         14  
  1         43  
16 1     1   6 use Carp;
  1         1  
  1         117  
17 1     1   6 use File::Basename;
  1         2  
  1         111  
18 1     1   1254 use Math::Random;
  1         8695  
  1         130  
19 1     1   1257 use Graphics::GnuplotIF;
  1         17022  
  1         214  
20 1     1   1722 use Math::GSL::Matrix;
  0            
  0            
21             use Scalar::Util 'blessed';
22              
23             our $VERSION = '1.2';
24              
25             # from perl docs:
26             my $_num_regex = '^[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?$';
27              
28             # Constructor:
29             sub new {
30             my ($class, %args) = @_;
31             my @params = keys %args;
32             croak "\nYou have used a wrong name for a keyword argument " .
33             "--- perhaps a misspelling\n"
34             if check_for_illegal_params(@params) == 0;
35             bless {
36             _datafile => $args{datafile} || croak("datafile required"),
37             _mask => $args{mask} || croak("mask required"),
38             _K => $args{K} || croak("number of clusters required"),
39             _terminal_output => $args{terminal_output} || 0,
40             _seeding => $args{seeding} || 'random',
41             _seed_tags => $args{seed_tags} || [],
42             _max_em_iterations=> $args{max_em_iterations} || 100,
43             _class_priors => $args{class_priors} || [],
44             _debug => $args{debug} || 0,
45             _N => 0,
46             _data => {},
47             _data_id_tags => [],
48             _clusters => [],
49             _cluster_centers => [],
50             _data_dimensions => 0,
51             _cluster_normalizers => [],
52             _cluster_means => [],
53             _cluster_covariances => [],
54             _class_labels_for_data => {},
55             _class_probs_at_each_data_point => {},
56             _expected_class_probs => {},
57             _old_priors => [],
58             _old_old_priors => [],
59             _fisher_quality_vs_iteration => [],
60             _mdl_quality_vs_iterations => [],
61             }, $class;
62             }
63              
64             sub read_data_from_file {
65             my $self = shift;
66             my $datafile = $self->{_datafile};
67             my $mask = $self->{_mask};
68             my @mask = split //, $mask;
69             $self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
70             print "data dimensionality: $self->{_data_dimensions} \n"
71             if $self->{_terminal_output};
72             open INPUT, $datafile
73             or die "unable to open file $datafile: $!\n";
74             chomp( my @raw_data = );
75             close INPUT;
76             # Transform strings into number data
77             foreach my $record (@raw_data) {
78             next unless $record;
79             next if $record =~ /^#/;
80             my @data_fields;
81             my @fields = split /\s+/, $record;
82             die "\nABORTED: Mask size does not correspond to row record size\n"
83             if $#fields != $#mask;
84             my $record_id;
85             foreach my $i (0..@fields-1) {
86             if ($mask[$i] eq '0') {
87             next;
88             } elsif ($mask[$i] eq 'N') {
89             $record_id = $fields[$i];
90             } elsif ($mask[$i] eq '1') {
91             push @data_fields, $fields[$i];
92             } else {
93             die "misformed mask for reading the data file\n";
94             }
95             }
96             my @nums = map {/$_num_regex/;$_} @data_fields;
97             $self->{_data}->{ $record_id } = \@nums;
98             }
99             my @all_data_ids = keys %{$self->{_data}};
100             $self->{_data_id_tags} = \@all_data_ids;
101             $self->{_N} = scalar @all_data_ids;
102             if ( defined($self->{_K}) && ($self->{_K} > 0) ) {
103             carp "\n\nWARNING: YOUR K VALUE IS TOO LARGE.\n The number of data " .
104             "points must satisfy the relation N > 2xK**2 where K is " .
105             "the number of clusters requested for the clusters to be " .
106             "meaningful $!"
107             if ( $self->{_N} < (2 * $self->{_K} ** 2) );
108             }
109             srand(123456789);
110             }
111              
112              
113             # This is the heart of the module --- in the sense that this method implements the EM
114             # algorithm for the estimating the parameters of a Gaussian mixture model for the
115             # data. In the implementation shown below, we declare convergence for the EM
116             # algorithm when the change in the class priors over three iterations falls below a
117             # threshold. The current value of this threshold, as can be seen in the function
118             # compare_array_floats(), is 0.00001.
119             sub EM {
120             my $self = shift;
121             $self->initialize_class_priors();
122             for (my $em_iteration=0; $em_iteration < $self->{_max_em_iterations};
123             $em_iteration++) {
124             if ($em_iteration == 0) {
125             print "\nSeeding the EM algorithm with:\n";
126             $self->display_seeding_stats();
127             print "\nFinished displaying the seeding information\n";
128             print "\nWill print out a dot for each iteration of EM:\n\n";
129             }
130             my $progress_indicator = $em_iteration % 5 == 0 ? $em_iteration : ".";
131             print $progress_indicator;
132             foreach my $data_id (@{$self->{_data_id_tags}}) {
133             $self->{_class_probs_at_each_data_point}->{$data_id} = [];
134             $self->{_expected_class_probs}->{$data_id} = [];
135             }
136             # Calculate prob(x | C_i) --- this is the prob of data point x as
137             # a member of class C_i. You must do this for all K classes:
138             foreach my $cluster_index(0..$self->{_K}-1) {
139             $self->find_prob_at_each_datapoint_for_given_mean_and_covar(
140             $self->{_cluster_means}->[$cluster_index],
141             $self->{_cluster_covariances}->[$cluster_index] );
142             }
143             $self->{_cluster_normalizers} = [];
144             if ($self->{_debug}) {
145             print "\n\nDisplaying prob of a data point vis-a-vis each class:\n\n";
146             foreach my $data_id (sort keys %{$self->{_data}}) {
147             my $class_probs_at_a_point =
148             $self->{_class_probs_at_each_data_point}->{$data_id};
149             print "Class probs for $data_id: @$class_probs_at_a_point\n"
150             }
151             }
152             # Calculate prob(C_i | x) which is the posterior prob of class
153             # considered as a r.v. to be C_i at a given point x. For a given
154             # x, the sum of such probabilities over all C_i must add up to 1:
155             $self->find_expected_classes_at_each_datapoint();
156             if ($self->{_debug}) {
157             print "\n\nDisplaying expected class probs at each data point:\n\n";
158             foreach my $data_id (sort keys %{$self->{_expected_class_probs}}) {
159             my $expected_classes_at_a_point =
160             $self->{_expected_class_probs}->{$data_id};
161             print "Expected classes $data_id: @$expected_classes_at_a_point\n";
162             }
163             }
164             # 1. UPDATE MEANS:
165             my @new_means;
166             foreach my $cluster_index(0..$self->{_K}-1) {
167             $new_means[$cluster_index] =
168             Math::GSL::Matrix->new($self->{_data_dimensions},1);
169             $new_means[$cluster_index]->zero();
170             foreach my $data_id (keys %{$self->{_data}}) {
171             my $data_record = $self->{_data}->{$data_id};
172             my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
173             $data_vec->set_col(0,$data_record);
174             $new_means[$cluster_index] +=
175             $self->{_expected_class_probs}->{$data_id}->[$cluster_index] *
176             $data_vec->copy();
177             $self->{_cluster_normalizers}->[$cluster_index] +=
178             $self->{_expected_class_probs}->{$data_id}->[$cluster_index];
179             }
180             $new_means[$cluster_index] *= 1.0 /
181             $self->{_cluster_normalizers}->[$cluster_index];
182             }
183             if ($self->{_debug}) {
184             foreach my $meanvec (@new_means) {
185             display_matrix("At EM Iteration $em_iteration, new mean vector is",
186             $meanvec);
187             }
188             }
189             $self->{_cluster_means} = \@new_means;
190             # 2. UPDATE COVARIANCES:
191             my @new_covariances;
192             foreach my $cluster_index(0..$self->{_K}-1) {
193             $new_covariances[$cluster_index] =
194             Math::GSL::Matrix->new($self->{_data_dimensions},
195             $self->{_data_dimensions});
196             $new_covariances[$cluster_index]->zero();
197             my $normalizer = 0;
198             foreach my $data_id (keys %{$self->{_data}}) {
199             my $data_record = $self->{_data}->{$data_id};
200             my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
201             $data_vec->set_col(0,$data_record);
202             my $mean_subtracted_data =
203             $data_vec - $self->{_cluster_means}->[$cluster_index];
204             my $outer_product = outer_product($mean_subtracted_data,
205             $mean_subtracted_data);
206             $new_covariances[$cluster_index] +=
207             $self->{_expected_class_probs}->{$data_id}->[$cluster_index] *
208             $outer_product;
209             }
210             $new_covariances[$cluster_index] *=
211             1.0 /
212             $self->{_cluster_normalizers}->[$cluster_index];
213             }
214             $self->{_cluster_covariances} = \@new_covariances;
215             # 3. UPDATE PRIORS:
216             $self->{_old_old_priors} = deep_copy_array( $self->{_old_priors} )
217             if @{$self->{_old_priors}} > 0;
218             $self->{_old_priors} = deep_copy_array( $self->{_class_priors} );
219             foreach my $cluster_index(0..$self->{_K}-1) {
220             $self->{_class_priors}->[$cluster_index] =
221             $self->{_cluster_normalizers}->[$cluster_index] / $self->{_N};
222             }
223             my @priors = @{$self->{_class_priors}};
224             print "\nUpdated priors: @priors\n\n\n" if $self->{_debug};
225             push @{$self->{_fisher_quality_vs_iteration}},
226             $self->clustering_quality_fisher();
227             push @{$self->{_mdl_quality_vs_iteration}}, $self->clustering_quality_mdl();
228             if ( ($em_iteration > 5 && $self->reached_convergence())
229             || ($em_iteration == $self->{_max_em_iterations} - 1) ) {
230             my @old_old_priors = @{$self->{_old_old_priors}};
231             my @old_priors = @{$self->{_old_priors}};
232             print "\n\nPrevious to previous priors: @old_old_priors\n";
233             print "Previous priors: @old_priors\n";
234             print "Current class priors: @{$self->{_class_priors}}\n";
235             print "\n\nCONVERGENCE ACHIEVED AT ITERATION $em_iteration\n\n"
236             if $em_iteration < $self->{_max_em_iterations} - 1;
237             last;
238             }
239             }
240             print "\n\n\n";
241             }
242              
243             sub reached_convergence {
244             my $self = shift;
245             return 1 if compare_array_floats($self->{_old_old_priors},
246             $self->{_old_priors})
247             &&
248             compare_array_floats($self->{_old_priors},
249             $self->{_class_priors});
250             return 0;
251             }
252              
253             # Classify the data into disjoint clusters using the Naive Bayes' classification:
254             sub run_bayes_classifier {
255             my $self = shift;
256             $self->classify_all_data_tuples_bayes($self->{_cluster_means},
257             $self->{_cluster_covariances});
258             }
259              
260             # Should NOT be called before run_bayes_classifier is run
261             sub return_disjoint_clusters {
262             my $self = shift;
263             return $self->{_clusters};
264             }
265              
266             sub return_clusters_with_posterior_probs_above_threshold {
267             my $self = shift;
268             my $theta = shift;
269             my @class_distributions;
270             foreach my $cluster_index (0..$self->{_K}-1) {
271             push @class_distributions, [];
272             }
273             foreach my $data_tag (@{$self->{_data_id_tags}}) {
274             foreach my $cluster_index (0..$self->{_K}-1) {
275             push @{$class_distributions[$cluster_index]}, $data_tag
276             if $self->{_expected_class_probs}->{$data_tag}->[$cluster_index]
277             > $theta;
278             }
279             }
280             return \@class_distributions;
281             }
282              
283             sub return_individual_class_distributions_above_given_threshold {
284             my $self = shift;
285             my $theta = shift;
286             my @probability_distributions;
287             foreach my $cluster_index (0..$self->{_K}-1) {
288             push @probability_distributions, [];
289             }
290             foreach my $cluster_index (0..$self->{_K}-1) {
291             my $mean_vec = $self->{_cluster_means}->[$cluster_index];
292             my $covar = $self->{_cluster_covariances}->[$cluster_index];
293             foreach my $data_id (keys %{$self->{_data}}) {
294             my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
295             $data_vec->set_col( 0, $self->{_data}->{$data_id});
296             my $datavec_minus_mean = $data_vec - $mean_vec;
297             display_matrix( "datavec minus mean is ", $datavec_minus_mean )
298             if $self->{_debug};
299             my $exponent = undef;
300             if ($self->{_data_dimensions} > 1) {
301             $exponent = -0.5 * vector_matrix_multiply(
302             transpose($datavec_minus_mean),
303             matrix_vector_multiply($covar->inverse(), $datavec_minus_mean ) );
304             } else {
305             my @var_inverse = $covar->inverse()->as_list;
306             my $var_inverse_val = $var_inverse[0];
307             my @data_minus_mean = $datavec_minus_mean->as_list;
308             my $data_minus_mean_val = $data_minus_mean[0];
309             $exponent = -0.5 * ($data_minus_mean_val ** 2) * $var_inverse_val;
310             }
311             print "\nThe value of the exponent is: $exponent\n\n" if $self->{_debug};
312             my $coefficient = 1.0 / \
313             ( (2 * $Math::GSL::Const::M_PI)**$self->{_data_dimensions} * sqrt($covar->det()) );
314             my $prob = $coefficient * exp($exponent);
315             push @{$probability_distributions[$cluster_index]}, $data_id
316             if $prob > $theta;
317             }
318             }
319             return \@probability_distributions;
320             }
321              
322             sub return_estimated_priors {
323             my $self = shift;
324             return $self->{_class_priors};
325             }
326              
327             # Calculates the MDL (Minimum Description Length) clustering criterion according to
328             # Rissanen. (J. Rissanen: "Modeling by Shortest Data Description," Automatica, 1978,
329             # and "A Universal Prior for Integers and Estimation by Minimum Description Length,"
330             # Annals of Statistics, 1983.) The MDL criterion is a difference of a log-likelihood
331             # term for all of the observed data and a model-complexity penalty term. In general,
332             # both the log-likelihood and the model-complexity terms increase as the number of
333             # clusters increases. The form of the MDL criterion used in the implementation below
334             # uses for the penalty term the Bayesian Information Criterion (BIC) of G. Schwartz,
335             # "Estimating the Dimensions of a Model," The Annals of Statistics, 1978. In
336             # general, the smaller the value of the MDL quality measure calculated below, the
337             # better the clustering of the data.
338             sub clustering_quality_mdl {
339             my $self = shift;
340             # Calculate the inverses of all of the covariance matrices in order to avoid
341             # having to calculate them repeatedly inside the inner 'foreach' loop in the
342             # main part of this method. Here we go:
343             my @covar_inverses;
344             foreach my $cluster_index (0..$self->{_K}-1) {
345             my $covar = $self->{_cluster_covariances}->[$cluster_index];
346             push @covar_inverses, $covar->inverse();
347             }
348             # For the clustering quality, first calculate the log-likelihood of all the
349             # observed data:
350             my $log_likelihood = 0;
351             foreach my $tag (@{$self->{_data_id_tags}}) {
352             my $likelihood_for_each_tag = 0;
353             foreach my $cluster_index (0..$self->{_K}-1) {
354             my $mean_vec = $self->{_cluster_means}->[$cluster_index];
355             my $covar = $self->{_cluster_covariances}->[$cluster_index];
356             my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
357             $data_vec->set_col( 0, $self->{_data}->{$tag});
358             my $datavec_minus_mean = $data_vec - $mean_vec;
359             my $exponent = undef;
360             if ($self->{_data_dimensions} > 1) {
361             $exponent = -0.5 * vector_matrix_multiply(
362             transpose($datavec_minus_mean),
363             matrix_vector_multiply($covar_inverses[$cluster_index],
364             $datavec_minus_mean ) );
365             } else {
366             my @var_inverse = $covar_inverses[$cluster_index]->as_list;
367             my $var_inverse_val = $var_inverse[0];
368             my @data_minus_mean = $datavec_minus_mean->as_list;
369             my $data_minus_mean_val = $data_minus_mean[0];
370             $exponent = -0.5 * ($data_minus_mean_val ** 2) * $var_inverse_val;
371             }
372             next if $covar->det() < 0;
373             my $coefficient = 1.0 /
374             ( (2 * $Math::GSL::Const::M_PI)**$self->{_data_dimensions}
375             * sqrt($covar->det()) );
376             my $prob = $coefficient * exp($exponent);
377             $likelihood_for_each_tag +=
378             $prob * $self->{_class_priors}->[$cluster_index];
379             }
380             $log_likelihood += log( $likelihood_for_each_tag );
381             }
382             # Now calculate the model complexity penalty. $L is the total number of
383             # parameters it takes to specify a mixture of K Gaussians. If d is the
384             # dimensionality of the data space, the covariance matrix of each Gaussian takes
385             # (d**2 -d)/2 + d = d(d+1)/2 parameters since this matrix must be symmetric. And
386             # then you need d mean value parameters, and one prior probability parameter
387             # for the Gaussian. So $L = K[1 + d + d(d+1)/2] - 1 where the final '1' that
388             # is subtracted is to account for the normalization on the class priors.
389             my $L = (0.5 * $self->{_K} *
390             ($self->{_data_dimensions}**2 + 3*$self->{_data_dimensions} + 2) ) - 1;
391             my $model_complexity_penalty = 0.5 * $L * log( $self->{_N} );
392             my $mdl_criterion = -1 * $log_likelihood + $model_complexity_penalty;
393             return $mdl_criterion;
394             }
395              
396             # For our second measure of clustering quality, we use `trace( SW^-1 . SB)' where SW
397             # is the within-class scatter matrix, more commonly denoted S_w, and SB the
398             # between-class scatter matrix, more commonly denoted S_b (the underscore means
399             # subscript). This measure can be thought of as the normalized average distance
400             # between the clusters, the normalization being provided by average cluster
401             # covariance SW^-1. Therefore, the larger the value of this quality measure, the
402             # better the separation between the clusters. Since this measure has its roots in
403             # the Fisher linear discriminant function, we incorporate the word 'fisher' in the
404             # name of the quality measure. Note that this measure is good only when the clusters
405             # are disjoint. When the clusters exhibit significant overlap, the numbers produced
406             # by this quality measure tend to be generally meaningless. As an extreme case,
407             # let's say your data was produced by a set of Gaussians, all with the same mean
408             # vector, but each with a distinct covariance. For this extreme case, this measure
409             # will produce a value close to zero --- depending on the accuracy with which the
410             # means are estimated --- even when your clusterer is doing a good job of identifying
411             # the individual clusters.
412             sub clustering_quality_fisher {
413             my $self = shift;
414             my @cluster_quality_indices;
415             my $fisher_trace = 0;
416             my $S_w =
417             Math::GSL::Matrix->new($self->{_data_dimensions}, $self->{_data_dimensions});
418             $S_w->zero;
419             my $S_b =
420             Math::GSL::Matrix->new($self->{_data_dimensions}, $self->{_data_dimensions});
421             $S_b->zero;
422             my $global_mean = Math::GSL::Matrix->new($self->{_data_dimensions},1);
423             $global_mean->zero;
424             foreach my $cluster_index(0..$self->{_K}-1) {
425             $global_mean = $self->{_class_priors}->[$cluster_index] *
426             $self->{_cluster_means}->[$cluster_index];
427             }
428             foreach my $cluster_index(0..$self->{_K}-1) {
429             $S_w += $self->{_cluster_covariances}->[$cluster_index] *
430             $self->{_class_priors}->[$cluster_index];
431             my $class_mean_minus_global_mean = $self->{_cluster_means}->[$cluster_index]
432             - $global_mean;
433             my $outer_product = outer_product( $class_mean_minus_global_mean,
434             $class_mean_minus_global_mean );
435             $S_b += $self->{_class_priors}->[$cluster_index] * $outer_product;
436             }
437             my $fisher = matrix_multiply($S_w->inverse, $S_b);
438             return $fisher unless defined blessed($fisher);
439             return matrix_trace($fisher);
440             }
441              
442             sub display_seeding_stats {
443             my $self = shift;
444             foreach my $cluster_index(0..$self->{_K}-1) {
445             print "\nSeeding for cluster $cluster_index:\n";
446             my $mean = $self->{_cluster_means}->[$cluster_index];
447             display_matrix("The mean is: ", $mean);
448             my $covariance = $self->{_cluster_covariances}->[$cluster_index];
449             display_matrix("The covariance is: ", $covariance);
450             }
451             }
452              
453             sub display_fisher_quality_vs_iterations {
454             my $self = shift;
455             print "\n\nFisher Quality vs. Iterations: " .
456             "@{$self->{_fisher_quality_vs_iteration}}\n\n";
457             }
458              
459             sub display_mdl_quality_vs_iterations {
460             my $self = shift;
461             print "\n\nMDL Quality vs. Iterations: @{$self->{_mdl_quality_vs_iteration}}\n\n";
462             }
463              
464             sub find_prob_at_each_datapoint_for_given_mean_and_covar {
465             my $self = shift;
466             my $mean_vec_ref = shift;
467             my $covar_ref = shift;
468             foreach my $data_id (keys %{$self->{_data}}) {
469             my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
470             $data_vec->set_col( 0, $self->{_data}->{$data_id});
471             if ($self->{_debug}) {
472             display_matrix("data vec in find prob function", $data_vec);
473             display_matrix("mean vec in find prob function", $mean_vec_ref);
474             display_matrix("covariance in find prob function", $covar_ref);
475             }
476             my $datavec_minus_mean = $data_vec - $mean_vec_ref;
477             display_matrix( "datavec minus mean is ", $datavec_minus_mean ) if $self->{_debug};
478             my $exponent = undef;
479             if ($self->{_data_dimensions} > 1) {
480             $exponent = -0.5 * vector_matrix_multiply( transpose($datavec_minus_mean),
481             matrix_vector_multiply( $covar_ref->inverse(), $datavec_minus_mean ) );
482             } elsif (defined blessed($covar_ref)) {
483             my @data_minus_mean = $datavec_minus_mean->as_list;
484             my $data_minus_mean_val = $data_minus_mean[0];
485             my @covar_as_matrix = $covar_ref->as_list;
486             my $covar_val = $covar_as_matrix[0];
487             $exponent = -0.5 * ($data_minus_mean_val ** 2) / $covar_val;
488             } else {
489             my @data_minus_mean = $datavec_minus_mean->as_list;
490             my $data_minus_mean_val = $data_minus_mean[0];
491             $exponent = -0.5 * ($data_minus_mean_val ** 2) / $covar_ref;
492             }
493             print "\nThe value of the exponent is: $exponent\n\n" if $self->{_debug};
494             my $coefficient = undef;
495             if ($self->{_data_dimensions} > 1) {
496             $coefficient = 1.0 / sqrt( ((2 * $Math::GSL::Const::M_PI) ** $self->{_data_dimensions}) *
497             $covar_ref->det()) ;
498             } elsif (!defined blessed($covar_ref)) {
499             $coefficient = 1.0 / sqrt(2 * $covar_ref * $Math::GSL::Const::M_PI);
500             } else {
501             my @covar_as_matrix = $covar_ref->as_list;
502             my $covar_val = $covar_as_matrix[0];
503             $coefficient = 1.0 / sqrt(2 * $covar_val * $Math::GSL::Const::M_PI);
504             }
505             my $prob = $coefficient * exp($exponent);
506             push @{$self->{_class_probs_at_each_data_point}->{$data_id}}, $prob;
507             }
508             }
509              
510             sub find_expected_classes_at_each_datapoint {
511             my $self = shift;
512             my @priors = @{$self->{_class_priors}};
513             foreach my $data_id (sort keys %{$self->{_class_probs_at_each_data_point}}) {
514             my $numerator =
515             vector_2_vector_multiply(
516             $self->{_class_probs_at_each_data_point}->{$data_id},
517             $self->{_class_priors} );
518             my $sum = 0;
519             foreach my $part (@$numerator) {
520             $sum += $part;
521             }
522             $self->{_expected_class_probs}->{$data_id} = [map $_/$sum, @{$numerator}];
523             }
524             }
525              
526             sub initialize_class_priors {
527             my $self = shift;
528             if (@{$self->{_class_priors}} == 0) {
529             my $prior = 1.0 / $self->{_K};
530             foreach my $class_index (0..$self->{_K}-1) {
531             push @{$self->{_class_priors}}, $prior;
532             }
533             }
534             die "Mismatch between number of values for class priors " .
535             "and the number of clusters expected"
536             unless @{$self->{_class_priors}} == $self->{_K};
537             my $sum = 0;
538             foreach my $prior (@{$self->{_class_priors}}) {
539             $sum += $prior;
540             }
541             die "Your priors in the constructor call do not add up to 1"
542             unless abs($sum - 1) < 0.001;
543             print "\nInitially assumed class priors are: @{$self->{_class_priors}}\n";
544             }
545              
546             sub estimate_class_priors {
547             my $self = shift;
548             foreach my $datatag (keys %{$self->{_data}}) {
549             my $class_label = $self->{_class_labels}->{$datatag};
550             $self->{_class_priors}[$class_label]++;
551             }
552             foreach my $prior (@{$self->{_class_priors}}) {
553             $prior /= $self->{_total_number_data_tuples};
554             }
555             foreach my $prior (@{$self->{_class_priors}}) {
556             print "class priors: @{$self->{_class_priors}}\n";
557             }
558             }
559              
560             sub classify_all_data_tuples_bayes {
561             my $self = shift;
562             my $mean_vecs_ref = shift;
563             my $covariances_ref = shift;
564             my @new_clusters;
565             foreach my $index (0..$self->{_K}-1) {
566             push @new_clusters, [];
567             }
568             foreach my $data_id (@{$self->{_data_id_tags}}) {
569             my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
570             $data_vec->set_col( 0, deep_copy_array($self->{_data}->{$data_id}));
571             my $cluster_index_for_tuple =
572             $self->classify_a_data_point_bayes($data_vec,
573             $mean_vecs_ref, $covariances_ref);
574             $self->{_class_labels}->{$data_id} = $cluster_index_for_tuple;
575             push @{$new_clusters[$cluster_index_for_tuple]}, $data_id;
576             }
577             $self->{_clusters} = \@new_clusters;
578             }
579              
580             sub classify_a_data_point_bayes {
581             my $self = shift;
582             my $data_vec = shift;
583             my $mean_vecs_ref = shift;
584             my $covariances_ref = shift;
585             my @cluster_mean_vecs = @$mean_vecs_ref;
586             my @cluster_covariances = @$covariances_ref;
587             my @log_likelihoods;
588             foreach my $cluster_index (0..@cluster_mean_vecs-1) {
589             my $mean = $cluster_mean_vecs[$cluster_index];
590             my $covariance = $cluster_covariances[$cluster_index];
591             my $datavec_minus_mean = $data_vec - $mean;
592             my $log_likely = undef;
593             if ($self->{_data_dimensions} > 1) {
594             $log_likely = -0.5 * vector_matrix_multiply(
595             transpose($datavec_minus_mean),
596             matrix_vector_multiply( $covariance->inverse(),
597             $datavec_minus_mean ) );
598             } else {
599              
600             my @data_minus_mean = $datavec_minus_mean->as_list;
601             my $data_minus_mean_val = $data_minus_mean[0];
602             my @covar_as_matrix = $covariance->as_list;
603             my $covar_val = $covar_as_matrix[0];
604             $log_likely = -0.5 * ($data_minus_mean_val ** 2) / $covar_val;
605              
606             }
607             my $posterior_log_likely = $log_likely +
608             log( $self->{_class_priors}[$cluster_index] );
609             push @log_likelihoods, $posterior_log_likely;
610             }
611             my ($minlikely, $maxlikely) = minmax(\@log_likelihoods);
612             my $cluster_index_for_data_point =
613             get_index_at_value( $maxlikely, \@log_likelihoods );
614             return $cluster_index_for_data_point;
615             }
616              
617             sub find_cluster_means_and_covariances {
618             my $clusters = shift;
619             my $data_dimensions = find_data_dimensionality($clusters);
620             my (@cluster_mean_vecs, @cluster_covariances);
621             foreach my $cluster_index (0..@$clusters-1) {
622             my ($num_rows,$num_cols) =
623             ($data_dimensions,scalar(@{$clusters->[$cluster_index]}));
624             print "\nFor cluster $cluster_index: rows: $num_rows and cols: $num_cols\n";
625             my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
626             my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
627             my $col_index = 0;
628             foreach my $ele (@{$clusters->[$cluster_index]}) {
629             $matrix->set_col($col_index++, $ele);
630             }
631             # display_matrix( "Displaying cluster matrix", $matrix );
632             foreach my $j (0..$num_cols-1) {
633             $mean_vec += $matrix->col($j);
634             }
635             $mean_vec *= 1.0 / $num_cols;
636             push @cluster_mean_vecs, $mean_vec;
637             display_matrix( "Displaying the mean vector", $mean_vec );
638             foreach my $j (0..$num_cols-1) {
639             my @new_col = ($matrix->col($j) - $mean_vec)->as_list;
640             $matrix->set_col($j, \@new_col);
641             }
642             # display_matrix("Displaying mean subtracted data as a matrix", $matrix );
643             my $transposed = transpose( $matrix );
644             # display_matrix("Displaying transposed matrix",$transposed);
645             my $covariance = matrix_multiply( $matrix, $transposed );
646             $covariance *= 1.0 / $num_cols;
647             push @cluster_covariances, $covariance;
648             display_matrix("Displaying the cluster covariance", $covariance );
649             }
650             return (\@cluster_mean_vecs, \@cluster_covariances);
651             }
652              
653             sub find_data_dimensionality {
654             my $clusters = shift;
655             my @first_cluster = @{$clusters->[0]};
656             my @first_data_element = @{$first_cluster[0]};
657             return scalar(@first_data_element);
658             }
659              
660             sub find_seed_centered_covariances {
661             my $self = shift;
662             my $seed_tags = shift;
663             my (@seed_mean_vecs, @seed_based_covariances);
664             foreach my $seed_tag (@$seed_tags) {
665             my ($num_rows,$num_cols) = ($self->{_data_dimensions}, $self->{_N});
666             my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
667             my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
668             $mean_vec->set_col(0, $self->{_data}->{$seed_tag});
669             push @seed_mean_vecs, $mean_vec;
670             display_matrix( "Displaying the seed mean vector", $mean_vec );
671             my $col_index = 0;
672             foreach my $tag (@{$self->{_data_id_tags}}) {
673             $matrix->set_col($col_index++, $self->{_data}->{$tag});
674             }
675             foreach my $j (0..$num_cols-1) {
676             my @new_col = ($matrix->col($j) - $mean_vec)->as_list;
677             $matrix->set_col($j, \@new_col);
678             }
679             my $transposed = transpose( $matrix );
680             my $covariance = matrix_multiply( $matrix, $transposed );
681             $covariance *= 1.0 / $num_cols;
682             push @seed_based_covariances, $covariance;
683             display_matrix("Displaying the seed covariance", $covariance )
684             if $self->{_debug};
685             }
686             return (\@seed_mean_vecs, \@seed_based_covariances);
687             }
688              
689             # The most popular seeding mode for EM is random. We include two other seeding modes
690             # --- kmeans and manual --- since they do produce good results for specialized cases.
691             # For example, when the clusters in your data are non-overlapping and not too
692             # anisotropic, the kmeans based seeding should work at least as well as the random
693             # seeding. In such cases --- AND ONLY IN SUCH CASES --- the kmeans based seeding has
694             # the advantage of avoiding the getting stuck in a local-maximum problem of the EM
695             # algorithm.
696              
697             sub seed_the_clusters {
698             my $self = shift;
699             if ($self->{_seeding} eq 'random') {
700             my @covariances;
701             my @means;
702             my @all_tags = @{$self->{_data_id_tags}};
703             my @seed_tags;
704             foreach my $i (0..$self->{_K}-1) {
705             push @seed_tags, $all_tags[int rand( $self->{_N} )];
706             }
707             print "Random Seeding: Randomly selected seeding tags are @seed_tags\n\n";
708             my ($seed_means, $seed_covars) =
709             $self->find_seed_centered_covariances(\@seed_tags);
710             $self->{_cluster_means} = $seed_means;
711             $self->{_cluster_covariances} = $seed_covars;
712             } elsif ($self->{_seeding} eq 'kmeans') {
713             $self->kmeans();
714             my $clusters = $self->{_clusters};
715             my @dataclusters;
716             foreach my $index (0..@$clusters-1) {
717             push @dataclusters, [];
718             }
719             foreach my $cluster_index (0..$self->{_K}-1) {
720             foreach my $tag (@{$clusters->[$cluster_index]}) {
721             my $data = $self->{_data}->{$tag};
722             push @{$dataclusters[$cluster_index]}, deep_copy_array($data);
723             }
724             }
725             ($self->{_cluster_means}, $self->{_cluster_covariances}) =
726             find_cluster_means_and_covariances(\@dataclusters);
727             } elsif ($self->{_seeding} eq 'manual') {
728             die "You have not supplied the seeding tags for the option \"manual\""
729             unless @{$self->{_seed_tags}} > 0;
730             print "Manual Seeding: Seed tags are @{$self->{_seed_tags}}\n\n";
731             foreach my $tag (@{$self->{_seed_tags}}) {
732             die "invalid tag used for manual seeding"
733             unless exists $self->{_data}->{$tag};
734             }
735             my ($seed_means, $seed_covars) =
736             $self->find_seed_centered_covariances($self->{_seed_tags});
737             $self->{_cluster_means} = $seed_means;
738             $self->{_cluster_covariances} = $seed_covars;
739             } else {
740             die "Incorrect call syntax used. See documentation.";
741             }
742             }
743              
744             # This is the top-level method for kmeans based initialization of the EM
745             # algorithm. The means and the covariances returned by kmeans are used to seed the EM
746             # algorithm.
747             sub kmeans {
748             my $self = shift;
749             my $K = $self->{_K};
750             $self->cluster_for_fixed_K_single_smart_try($K);
751             if ((defined $self->{_clusters}) && (defined $self->{_cluster_centers})){
752             return ($self->{_clusters}, $self->{_cluster_centers});
753             } else {
754             die "kmeans clustering failed.";
755             }
756             }
757              
758             # Used by the kmeans algorithm for the initialization of the EM iterations. We do
759             # initial kmeans cluster seeding by subjecting the data to principal components
760             # analysis in order to discover the direction of maximum variance in the data space.
761             # Subsequently, we try to find the K largest peaks along this direction. The
762             # coordinates of these peaks serve as the seeds for the K clusters.
763             sub cluster_for_fixed_K_single_smart_try {
764             my $self = shift;
765             my $K = shift;
766             print "Clustering for K = $K\n" if $self->{_terminal_output};
767             my ($clusters, $cluster_centers) =
768             $self->cluster_for_given_K($K);
769             $self->{_clusters} = $clusters;
770             $self->{_cluster_centers} = $cluster_centers;
771             }
772              
773             # Used by the kmeans part of the code for the initialization of the EM algorithm:
774             sub cluster_for_given_K {
775             my $self = shift;
776             my $K = shift;
777             my $cluster_centers = $self->get_initial_cluster_centers($K);
778             my $clusters = $self->assign_data_to_clusters_initial($cluster_centers);
779             my $cluster_nonexistant_flag = 0;
780             foreach my $trial (0..2) {
781             ($clusters, $cluster_centers) =
782             $self->assign_data_to_clusters( $clusters, $K );
783             my $num_of_clusters_returned = @$clusters;
784             foreach my $cluster (@$clusters) {
785             $cluster_nonexistant_flag = 1 if ((!defined $cluster)
786             || (@$cluster == 0));
787             }
788             last unless $cluster_nonexistant_flag;
789             }
790             return ($clusters, $cluster_centers);
791             }
792              
793             # Used by the kmeans part of the code for the initialization of the EM algorithm:
794             sub get_initial_cluster_centers {
795             my $self = shift;
796             my $K = shift;
797             if ($self->{_data_dimensions} == 1) {
798             my @one_d_data;
799             foreach my $j (0..$self->{_N}-1) {
800             my $tag = $self->{_data_id_tags}[$j];
801             push @one_d_data, $self->{_data}->{$tag}->[0];
802             }
803             my @peak_points =
804             find_peak_points_in_given_direction(\@one_d_data,$K);
805             print "highest points at data values: @peak_points\n"
806             if $self->{_debug};
807             my @cluster_centers;
808             foreach my $peakpoint (@peak_points) {
809             push @cluster_centers, [$peakpoint];
810             }
811             return \@cluster_centers;
812             }
813             my ($num_rows,$num_cols) = ($self->{_data_dimensions},$self->{_N});
814             my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
815             my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
816             # All the record labels are stored in the array $self->{_data_id_tags}.
817             # The actual data for clustering is stored in a hash at $self->{_data}
818             # whose keys are the record labels; the value associated with each
819             # key is the array holding the corresponding numerical multidimensional
820             # data.
821             foreach my $j (0..$num_cols-1) {
822             my $tag = $self->{_data_id_tags}[$j];
823             my $data = $self->{_data}->{$tag};
824             $matrix->set_col($j, $data);
825             }
826             if ($self->{_debug}) {
827             print "\nDisplaying the original data as a matrix:";
828             display_matrix( $matrix );
829             }
830             foreach my $j (0..$num_cols-1) {
831             $mean_vec += $matrix->col($j);
832             }
833             $mean_vec *= 1.0 / $num_cols;
834             if ($self->{_debug}) {
835             print "Displaying the mean vector for the data:";
836             display_matrix( $mean_vec );
837             }
838             foreach my $j (0..$num_cols-1) {
839             my @new_col = ($matrix->col($j) - $mean_vec)->as_list;
840             $matrix->set_col($j, \@new_col);
841             }
842             if ($self->{_debug}) {
843             print "Displaying mean subtracted data as a matrix:";
844             display_matrix( $matrix );
845             }
846             my $transposed = transpose( $matrix );
847             if ($self->{_debug}) {
848             print "Displaying transposed data matrix:";
849             display_matrix( $transposed );
850             }
851             my $covariance = matrix_multiply( $matrix, $transposed );
852             $covariance *= 1.0 / $num_cols;
853             if ($self->{_debug}) {
854             print "\nDisplaying the Covariance Matrix for your data:";
855             display_matrix( $covariance );
856             }
857             my ($eigenvalues, $eigenvectors) = $covariance->eigenpair;
858             my $num_of_eigens = @$eigenvalues;
859             my $largest_eigen_index = 0;
860             my $smallest_eigen_index = 0;
861             print "Eigenvalue 0: $eigenvalues->[0]\n" if $self->{_debug};
862             foreach my $i (1..$num_of_eigens-1) {
863             $largest_eigen_index = $i if $eigenvalues->[$i] >
864             $eigenvalues->[$largest_eigen_index];
865             $smallest_eigen_index = $i if $eigenvalues->[$i] <
866             $eigenvalues->[$smallest_eigen_index];
867             print "Eigenvalue $i: $eigenvalues->[$i]\n" if $self->{_debug};
868             }
869             print "\nlargest eigen index: $largest_eigen_index\n" if $self->{_debug};
870             print "\nsmallest eigen index: $smallest_eigen_index\n\n"
871             if $self->{_debug};
872             foreach my $i (0..$num_of_eigens-1) {
873             my @vec = $eigenvectors->[$i]->as_list;
874             print "Eigenvector $i: @vec\n" if $self->{_debug};
875             }
876             my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list;
877             print "\nLargest eigenvector: @largest_eigen_vec\n" if $self->{_debug};
878             my @max_var_direction;
879             # Each element of the array @largest_eigen_vec is a Math::Complex object
880             foreach my $k (0..@largest_eigen_vec-1) {
881             my ($mag, $theta) = $largest_eigen_vec[$k] =~ /\[(\d*\.\d+),(\S+)\]/;
882             if ($theta eq '0') {
883             $max_var_direction[$k] = $mag;
884             } elsif ($theta eq 'pi') {
885             $max_var_direction[$k] = -1.0 * $mag;
886             } else {
887             die "eigendecomposition of covariance matrix produced a complex eigenvector --- something is wrong";
888             }
889             }
890             # "Maximum variance direction: @max_var_direction
891             print "\nMaximum Variance Direction: @max_var_direction\n\n"
892             if $self->{_debug};
893             # We now project all data points on the largest eigenvector.
894             # Each projection will yield a single point on the eigenvector.
895             my @projections;
896             foreach my $j (0..$self->{_N}-1) {
897             my $tag = $self->{_data_id_tags}[$j];
898             my $data = $self->{_data}->{$tag};
899             die "Dimensionality of the largest eigenvector does not "
900             . "match the dimensionality of the data"
901             unless @max_var_direction == $self->{_data_dimensions};
902             my $projection = vector_multiply($data, \@max_var_direction);
903             push @projections, $projection;
904             }
905             print "All projection points: @projections\n" if $self->{_debug};
906             my @peak_points = find_peak_points_in_given_direction(\@projections, $K);
907             print "highest points at points along largest eigenvec: @peak_points\n"
908             if $self->{_debug};
909             my @cluster_centers;
910             foreach my $peakpoint (@peak_points) {
911             my @actual_peak_coords = map {$peakpoint * $_} @max_var_direction;
912             push @cluster_centers, \@actual_peak_coords;
913             }
914             return \@cluster_centers;
915             }
916              
917             # Used by the kmeans part of the code: This method is called by the previous method
918             # to locate K peaks in a smoothed histogram of the data points projected onto the
919             # maximal variance direction.
920             sub find_peak_points_in_given_direction {
921             my $dataref = shift;
922             my $how_many = shift;
923             my @data = @$dataref;
924             my ($min, $max) = minmax(\@data);
925             my $num_points = @data;
926             my @sorted_data = sort {$a <=> $b} @data;
927             #print "\n\nSorted data: @sorted_data\n";
928             my $scale = $max - $min;
929             foreach my $index (0..$#sorted_data-1) {
930             $sorted_data[$index] = ($sorted_data[$index] - $min) / $scale;
931             }
932             my $avg_diff = 0;
933             foreach my $index (0..$#sorted_data-1) {
934             my $diff = $sorted_data[$index+1] - $sorted_data[$index];
935             $avg_diff += ($diff - $avg_diff) / ($index + 1);
936             }
937             my $delta = 1.0 / 1000.0;
938             # It would be nice to set the delta adaptively, but I must
939             # change the number of cells in the next foreach loop accordingly
940             # my $delta = $avg_diff / 20;
941             my @accumulator = (0) x 1000;
942             foreach my $index (0..@sorted_data-1) {
943             my $cell_index = int($sorted_data[$index] / $delta);
944             my $smoothness = 40;
945             for my $index ($cell_index-$smoothness..$cell_index+$smoothness) {
946             next if $index < 0 || $index > 999;
947             $accumulator[$index]++;
948             }
949             }
950             my $peaks_array = non_maximum_supression( \@accumulator );
951             my $peaks_index_hash = get_value_index_hash( $peaks_array );
952             my @K_highest_peak_locations;
953             my $k = 0;
954             foreach my $peak (sort {$b <=> $a} keys %$peaks_index_hash) {
955             my $unscaled_peak_point =
956             $min + $peaks_index_hash->{$peak} * $scale * $delta;
957             push @K_highest_peak_locations, $unscaled_peak_point
958             if $k < $how_many;
959             last if ++$k == $how_many;
960             }
961             return @K_highest_peak_locations;
962             }
963              
964             # Used by the kmeans part of the code: The purpose of this routine is to form initial
965             # clusters by assigning the data samples to the initial clusters formed by the
966             # previous routine on the basis of the best proximity of the data samples to the
967             # different cluster centers.
968             sub assign_data_to_clusters_initial {
969             my $self = shift;
970             my @cluster_centers = @{ shift @_ };
971             my @clusters;
972             foreach my $ele (@{$self->{_data_id_tags}}) {
973             my $best_cluster;
974             my @dist_from_clust_centers;
975             foreach my $center (@cluster_centers) {
976             push @dist_from_clust_centers, $self->distance($ele, $center);
977             }
978             my ($min, $best_center_index) = minimum( \@dist_from_clust_centers );
979             push @{$clusters[$best_center_index]}, $ele;
980             }
981             return \@clusters;
982             }
983              
984             # Used by the kmeans part of the code: This is the main routine that along with the
985             # update_cluster_centers() routine constitute the two key steps of the K-Means
986             # algorithm. In most cases, the infinite while() loop will terminate automatically
987             # when the cluster assignments of the data points remain unchanged. For the sake of
988             # safety, we keep track of the number of iterations. If this number reaches 100, we
989             # exit the while() loop anyway. In most cases, this limit will not be reached.
990             sub assign_data_to_clusters {
991             my $self = shift;
992             my $clusters = shift;
993             my $K = shift;
994             my $final_cluster_centers;
995             my $iteration_index = 0;
996             while (1) {
997             my $new_clusters;
998             my $assignment_changed_flag = 0;
999             my $current_cluster_center_index = 0;
1000             my $cluster_size_zero_condition = 0;
1001             my $how_many = @$clusters;
1002             my $cluster_centers = $self->update_cluster_centers(
1003             deep_copy_AoA_with_nulls( $clusters ) );
1004             $iteration_index++;
1005             foreach my $cluster (@$clusters) {
1006             my $current_cluster_center =
1007             $cluster_centers->[$current_cluster_center_index];
1008             foreach my $ele (@$cluster) {
1009             my @dist_from_clust_centers;
1010             foreach my $center (@$cluster_centers) {
1011             push @dist_from_clust_centers,
1012             $self->distance($ele, $center);
1013             }
1014             my ($min, $best_center_index) =
1015             minimum( \@dist_from_clust_centers );
1016             my $best_cluster_center =
1017             $cluster_centers->[$best_center_index];
1018             if (vector_equal($current_cluster_center,
1019             $best_cluster_center)){
1020             push @{$new_clusters->[$current_cluster_center_index]},
1021             $ele;
1022             } else {
1023             $assignment_changed_flag = 1;
1024             push @{$new_clusters->[$best_center_index]}, $ele;
1025             }
1026             }
1027             $current_cluster_center_index++;
1028             }
1029             # Now make sure that we still have K clusters since K is fixed:
1030             next if ((@$new_clusters != @$clusters) && ($iteration_index < 100));
1031             # Now make sure that none of the K clusters is an empty cluster:
1032             foreach my $newcluster (@$new_clusters) {
1033             $cluster_size_zero_condition = 1 if ((!defined $newcluster)
1034             or (@$newcluster == 0));
1035             }
1036             push @$new_clusters, (undef) x ($K - @$new_clusters)
1037             if @$new_clusters < $K;
1038             my $largest_cluster;
1039             foreach my $local_cluster (@$new_clusters) {
1040             next if !defined $local_cluster;
1041             $largest_cluster = $local_cluster if !defined $largest_cluster;
1042             if (@$local_cluster > @$largest_cluster) {
1043             $largest_cluster = $local_cluster;
1044             }
1045             }
1046             foreach my $local_cluster (@$new_clusters) {
1047             if ( (!defined $local_cluster) || (@$local_cluster == 0) ) {
1048             push @$local_cluster, pop @$largest_cluster;
1049             }
1050             }
1051             next if (($cluster_size_zero_condition) && ($iteration_index < 100));
1052             last if $iteration_index == 100;
1053             # Now do a deep copy of new_clusters into clusters
1054             $clusters = deep_copy_AoA( $new_clusters );
1055             last if $assignment_changed_flag == 0;
1056             }
1057             $final_cluster_centers = $self->update_cluster_centers( $clusters );
1058             return ($clusters, $final_cluster_centers);
1059             }
1060              
1061             # Used by the kmeans part of the code: After each new assignment of the data points
1062             # to the clusters on the basis of the current values for the cluster centers, we call
1063             # the routine shown here for updating the values of the cluster centers.
1064             sub update_cluster_centers {
1065             my $self = shift;
1066             my @clusters = @{ shift @_ };
1067             my @new_cluster_centers;
1068             my $largest_cluster;
1069             foreach my $cluster (@clusters) {
1070             next if !defined $cluster;
1071             $largest_cluster = $cluster if !defined $largest_cluster;
1072             if (@$cluster > @$largest_cluster) {
1073             $largest_cluster = $cluster;
1074             }
1075             }
1076             foreach my $cluster (@clusters) {
1077             if ( (!defined $cluster) || (@$cluster == 0) ) {
1078             push @$cluster, pop @$largest_cluster;
1079             }
1080             }
1081             foreach my $cluster (@clusters) {
1082             die "Cluster became empty --- untenable condition " .
1083             "for a given K. Try again. \n" if !defined $cluster;
1084             my $cluster_size = @$cluster;
1085             die "Cluster size is zero --- untenable.\n" if $cluster_size == 0;
1086             my @new_cluster_center = @{$self->add_point_coords( $cluster )};
1087             @new_cluster_center = map {my $x = $_/$cluster_size; $x}
1088             @new_cluster_center;
1089             push @new_cluster_centers, \@new_cluster_center;
1090             }
1091             return \@new_cluster_centers;
1092             }
1093              
1094             # The following routine is for computing the distance between a data point specified
1095             # by its symbolic name in the master datafile and a point (such as the center of a
1096             # cluster) expressed as a vector of coordinates:
1097             sub distance {
1098             my $self = shift;
1099             my $ele1_id = shift @_; # symbolic name of data sample
1100             my @ele1 = @{$self->{_data}->{$ele1_id}};
1101             my @ele2 = @{shift @_};
1102             die "wrong data types for distance calculation\n" if @ele1 != @ele2;
1103             my $how_many = @ele1;
1104             my $squared_sum = 0;
1105             foreach my $i (0..$how_many-1) {
1106             $squared_sum += ($ele1[$i] - $ele2[$i])**2;
1107             }
1108             my $dist = sqrt $squared_sum;
1109             return $dist;
1110             }
1111              
1112             # The following routine does the same as above but now both
1113             # arguments are expected to be arrays of numbers:
1114             sub distance2 {
1115             my $self = shift;
1116             my @ele1 = @{shift @_};
1117             my @ele2 = @{shift @_};
1118             die "wrong data types for distance calculation\n" if @ele1 != @ele2;
1119             my $how_many = @ele1;
1120             my $squared_sum = 0;
1121             foreach my $i (0..$how_many-1) {
1122             $squared_sum += ($ele1[$i] - $ele2[$i])**2;
1123             }
1124             return sqrt $squared_sum;
1125             }
1126              
1127             sub write_naive_bayes_clusters_to_files {
1128             my $self = shift;
1129             my @clusters = @{$self->{_clusters}};
1130             unlink glob "naive_bayes_cluster*.dat";
1131             foreach my $i (1..@clusters) {
1132             my $filename = "naive_bayes_cluster" . $i . ".dat";
1133             print "Writing cluster $i to file $filename\n"
1134             if $self->{_terminal_output};
1135             open FILEHANDLE, "| sort > $filename" or die "Unable to open file: $!";
1136             foreach my $ele (@{$clusters[$i-1]}) {
1137             print FILEHANDLE "$ele\n";
1138             }
1139             close FILEHANDLE;
1140             }
1141             }
1142              
1143             sub write_posterior_prob_clusters_above_threshold_to_files {
1144             my $self = shift;
1145             my $theta = shift;
1146             my @class_distributions;
1147             foreach my $cluster_index (0..$self->{_K}-1) {
1148             push @class_distributions, [];
1149             }
1150             foreach my $data_tag (@{$self->{_data_id_tags}}) {
1151             foreach my $cluster_index (0..$self->{_K}-1) {
1152             push @{$class_distributions[$cluster_index]}, $data_tag
1153             if $self->{_expected_class_probs}->{$data_tag}->[$cluster_index]
1154             > $theta;
1155             }
1156             }
1157             unlink glob "posterior_prob_cluster*.dat";
1158             foreach my $i (1..@class_distributions) {
1159             my $filename = "posterior_prob_cluster" . $i . ".dat";
1160             print "Writing posterior prob cluster $i to file $filename\n"
1161             if $self->{_terminal_output};
1162             open FILEHANDLE, "| sort > $filename" or die "Unable to open file: $!";
1163             foreach my $ele (@{$class_distributions[$i-1]}) {
1164             print FILEHANDLE "$ele\n";
1165             }
1166             close FILEHANDLE;
1167             }
1168             }
1169              
1170             sub DESTROY {
1171             unlink "__temp_" . basename($_[0]->{_datafile});
1172             unlink "__temp_data_" . basename($_[0]->{_datafile});
1173             unlink "__temp2_" . basename($_[0]->{_datafile});
1174             unlink glob "__temp1dhist*";
1175             unlink glob "__contour*";
1176             }
1177              
1178             ############################# Visualization Code ###############################
1179              
1180             # The visualize_clusters() implementation displays as a plot in your terminal window
1181             # the clusters constructed by the EM algorithm. It can show either 2D plots or
1182             # 3D plots that you can rotate interactively for better visualization. For
1183             # multidimensional data, as to which 2D or 3D dimensions are used for visualization
1184             # is controlled by the mask you must supply as an argument to the method. Should it
1185             # happen that only one on bit is specified for the mask, visualize_clusters()
1186             # aborts.
1187             #
1188             # The visualization code consists of first accessing each of clusters created by the
1189             # EM() subroutine. Note that the clusters contain only the symbolic names for the
1190             # individual records in the source data file. We therefore next reach into the
1191             # $self->{_data} hash and get the data coordinates associated with each symbolic
1192             # label in a cluster. The numerical data thus generated is then written out to a
1193             # temp file. When doing so we must remember to insert TWO BLANK LINES between the
1194             # data blocks corresponding to the different clusters. This constraint is imposed
1195             # on us by Gnuplot when plotting data from the same file since we want to use
1196             # different point styles for the data points in different cluster files.
1197             # Subsequently, we call upon the Perl interface provided by the Graphics::GnuplotIF
1198             # module to plot the data clusters.
1199             sub visualize_clusters {
1200             my $self = shift;
1201             my $v_mask;
1202             my $pause_time;
1203             if (@_ == 1) {
1204             $v_mask = shift || die "visualization mask missing";
1205             } elsif (@_ == 2) {
1206             $v_mask = shift || die "visualization mask missing";
1207             $pause_time = shift;
1208             } else {
1209             die "visualize_clusters() called with wrong args";
1210             }
1211             my $master_datafile = $self->{_datafile};
1212             my @v_mask = split //, $v_mask;
1213             my $visualization_mask_width = @v_mask;
1214             my $original_data_mask = $self->{_mask};
1215             my @mask = split //, $original_data_mask;
1216             my $data_field_width = scalar grep {$_ eq '1'} @mask;
1217             die "\n\nABORTED: The width of the visualization mask (including " .
1218             "all its 1s and 0s) must equal the width of the original mask " .
1219             "used for reading the data file (counting only the 1's)"
1220             if $visualization_mask_width != $data_field_width;
1221             my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
1222             # The following section is for the superimposed one-Mahalanobis-distance-unit
1223             # ellipses that are shown only for 2D plots:
1224             if ($visualization_data_field_width == 2) {
1225             foreach my $cluster_index (0..$self->{_K}-1) {
1226             my $contour_filename = "__contour_" . $cluster_index . ".dat";
1227             my $mean = $self->{_cluster_means}->[$cluster_index];
1228             my $covariance = $self->{_cluster_covariances}->[$cluster_index];
1229             my ($mux,$muy) = $mean->as_list();
1230             my ($varx,$sigmaxy) = $covariance->row(0)->as_list();
1231             my ($sigmayx,$vary) = $covariance->row(1)->as_list();
1232             die "Your covariance matrix does not look right"
1233             unless $sigmaxy == $sigmayx;
1234             my ($sigmax,$sigmay) = (sqrt($varx),sqrt($vary));
1235             my $argstring = <<"END";
1236             set contour
1237             mux = $mux
1238             muy = $muy
1239             sigmax = $sigmax
1240             sigmay = $sigmay
1241             sigmaxy = $sigmaxy
1242             determinant = (sigmax**2)*(sigmay**2) - sigmaxy**2
1243             exponent(x,y) = -0.5 * (1.0 / determinant) * ( ((x-mux)**2)*sigmay**2 + ((y-muy)**2)*sigmax**2 - 2*sigmaxy*(x-mux)*(y-muy) )
1244             f(x,y) = exp( exponent(x,y) ) - 0.2
1245             xmax = mux + 2 * sigmax
1246             xmin = mux - 2 * sigmax
1247             ymax = muy + 2 * sigmay
1248             ymin = muy - 2 * sigmay
1249             set xrange [ xmin : xmax ]
1250             set yrange [ ymin : ymax ]
1251             set isosamples 200
1252             unset surface
1253             set cntrparam levels discrete 0
1254             set table \"$contour_filename\"
1255             splot f(x,y)
1256             unset table
1257             END
1258             my $plot = Graphics::GnuplotIF->new();
1259             $plot->gnuplot_cmd( $argstring );
1260             }
1261             }
1262             my %visualization_data;
1263             while ( my ($record_id, $data) = each %{$self->{_data}} ) {
1264             my @fields = @$data;
1265             die "\nABORTED: Visualization mask size exceeds data record size"
1266             if $#v_mask > $#fields;
1267             my @data_fields;
1268             foreach my $i (0..@fields-1) {
1269             if ($v_mask[$i] eq '0') {
1270             next;
1271             } elsif ($v_mask[$i] eq '1') {
1272             push @data_fields, $fields[$i];
1273             } else {
1274             die "Misformed visualization mask. It can only have 1s and 0s";
1275             }
1276             }
1277             $visualization_data{ $record_id } = \@data_fields;
1278             }
1279             my $K = scalar @{$self->{_clusters}};
1280             my $filename = basename($master_datafile);
1281             my $temp_file = "__temp_" . $filename;
1282             unlink $temp_file if -e $temp_file;
1283             open OUTPUT, ">$temp_file"
1284             or die "Unable to open a temp file in this directory: $!\n";
1285             foreach my $cluster (@{$self->{_clusters}}) {
1286             foreach my $item (@$cluster) {
1287             print OUTPUT "@{$visualization_data{$item}}";
1288             print OUTPUT "\n";
1289             }
1290             print OUTPUT "\n\n";
1291             }
1292             close OUTPUT;
1293             my $plot;
1294             if (!defined $pause_time) {
1295             $plot = Graphics::GnuplotIF->new( persist => 1 );
1296             } else {
1297             $plot = Graphics::GnuplotIF->new();
1298             }
1299             my $arg_string = "";
1300             if ($visualization_data_field_width > 2) {
1301             $plot->gnuplot_cmd("set noclip");
1302             $plot->gnuplot_cmd("set pointsize 2");
1303             foreach my $i (0..$K-1) {
1304             my $j = $i + 1;
1305             $arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster (naive Bayes) $i\" with points lt $j pt $j, ";
1306             }
1307             } elsif ($visualization_data_field_width == 2) {
1308             $plot->gnuplot_cmd("set noclip");
1309             $plot->gnuplot_cmd("set pointsize 2");
1310             foreach my $i (0..$K-1) {
1311             my $j = $i + 1;
1312             $arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster (naive Bayes) $i\" with points lt $j pt $j, ";
1313             my $ellipse_filename = "__contour_" . $i . ".dat";
1314             $arg_string .= "\"$ellipse_filename\" with line lt $j title \"\", ";
1315             }
1316             } elsif ($visualization_data_field_width == 1 ) {
1317             open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!\n";
1318             my @all_data = ;
1319             close INPUT;
1320             @all_data = map {chomp $_; $_ =~ /\d/ ? $_ : "SEPERATOR" } @all_data;
1321             my $all_joined_data = join ':', @all_data;
1322             my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data;
1323             my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals);
1324             foreach my $i (0..@separated-1) {
1325             $separated[$i] =~ s/SEPERATOR//g;
1326             my @cluster_for_hist = split /:/, $separated[$i];
1327             @cluster_for_hist = grep $_, @cluster_for_hist;
1328             my ($minval,$maxval) = minmax(\@cluster_for_hist);
1329             push @all_minvals, $minval;
1330             push @all_maxvals, $maxval;
1331             push @all_clusters_for_hist, \@cluster_for_hist;
1332             }
1333             push @all_minmaxvals, @all_minvals;
1334             push @all_minmaxvals, @all_maxvals;
1335             my ($abs_minval,$abs_maxval) = minmax(\@all_minmaxvals);
1336             my $delta = ($abs_maxval - $abs_minval) / 100.0;
1337             $plot->gnuplot_cmd("set boxwidth 3");
1338             $plot->gnuplot_cmd("set style fill solid border -1");
1339             $plot->gnuplot_cmd("set ytics out nomirror");
1340             $plot->gnuplot_cmd("set style data histograms");
1341             $plot->gnuplot_cmd("set style histogram clustered");
1342             $plot->gnuplot_cmd("set title 'Clusters shown through histograms'");
1343             $plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
1344             foreach my $cindex (0..@all_clusters_for_hist-1) {
1345             my $filename = basename($master_datafile);
1346             my $temp_file = "__temp1dhist_" . "$cindex" . "_" . $filename;
1347             unlink $temp_file if -e $temp_file;
1348             open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!\n";
1349             print OUTPUT "Xstep histval\n";
1350             my @histogram = (0) x 100;
1351             foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) {
1352             $histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++;
1353             }
1354             foreach my $i (0..@histogram-1) {
1355             print OUTPUT "$i $histogram[$i]\n";
1356             }
1357             $arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, ";
1358             close OUTPUT;
1359             }
1360             }
1361             $arg_string = $arg_string =~ /^(.*),[ ]+$/;
1362             $arg_string = $1;
1363             if ($visualization_data_field_width > 2) {
1364             $plot->gnuplot_cmd( "splot $arg_string" );
1365             $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
1366             } elsif ($visualization_data_field_width == 2) {
1367             $plot->gnuplot_cmd( "plot $arg_string" );
1368             $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
1369             } elsif ($visualization_data_field_width == 1) {
1370             $plot->gnuplot_cmd( "plot $arg_string" );
1371             $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
1372             }
1373             }
1374              
1375             # This subroutine is the same as above except that it makes PNG plots (for hardcopy
1376             # printing) of the clusters.
1377             sub plot_hardcopy_clusters {
1378             my $self = shift;
1379             my $v_mask;
1380             my $pause_time;
1381             if (@_ == 1) {
1382             $v_mask = shift || die "visualization mask missing";
1383             } elsif (@_ == 2) {
1384             $v_mask = shift || die "visualization mask missing";
1385             $pause_time = shift;
1386             } else {
1387             die "visualize_clusters() called with wrong args";
1388             }
1389             my $master_datafile = $self->{_datafile};
1390             my @v_mask = split //, $v_mask;
1391             my $visualization_mask_width = @v_mask;
1392             my $original_data_mask = $self->{_mask};
1393             my @mask = split //, $original_data_mask;
1394             my $data_field_width = scalar grep {$_ eq '1'} @mask;
1395             die "\n\nABORTED: The width of the visualization mask (including " .
1396             "all its 1s and 0s) must equal the width of the original mask " .
1397             "used for reading the data file (counting only the 1's)"
1398             if $visualization_mask_width != $data_field_width;
1399             my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
1400             if ($visualization_data_field_width == 2) {
1401             foreach my $cluster_index (0..$self->{_K}-1) {
1402             my $contour_filename = "__contour_" . $cluster_index . ".dat";
1403             my $mean = $self->{_cluster_means}->[$cluster_index];
1404             my $covariance = $self->{_cluster_covariances}->[$cluster_index];
1405             my ($mux,$muy) = $mean->as_list();
1406             my ($varx,$sigmaxy) = $covariance->row(0)->as_list();
1407             my ($sigmayx,$vary) = $covariance->row(1)->as_list();
1408             die "Your covariance matrix does not look right"
1409             unless $sigmaxy == $sigmayx;
1410             my ($sigmax,$sigmay) = (sqrt($varx),sqrt($vary));
1411             my $argstring = <<"END";
1412             set contour
1413             mux = $mux
1414             muy = $muy
1415             sigmax = $sigmax
1416             sigmay = $sigmay
1417             sigmaxy = $sigmaxy
1418             determinant = (sigmax**2)*(sigmay**2) - sigmaxy**2
1419             exponent(x,y) = -0.5 * (1.0 / determinant) * ( ((x-mux)**2)*sigmay**2 + ((y-muy)**2)*sigmax**2 - 2*sigmaxy*(x-mux)*(y-muy) )
1420             f(x,y) = exp( exponent(x,y) ) - 0.2
1421             xmax = mux + 2 * sigmax
1422             xmin = mux - 2 * sigmax
1423             ymax = muy + 2 * sigmay
1424             ymin = muy - 2 * sigmay
1425             set xrange [ xmin : xmax ]
1426             set yrange [ ymin : ymax ]
1427             set isosamples 200
1428             unset surface
1429             set cntrparam levels discrete 0
1430             set table \"$contour_filename\"
1431             splot f(x,y)
1432             unset table
1433             END
1434             my $plot = Graphics::GnuplotIF->new();
1435             $plot->gnuplot_cmd( $argstring );
1436             }
1437             }
1438             my %visualization_data;
1439             while ( my ($record_id, $data) = each %{$self->{_data}} ) {
1440             my @fields = @$data;
1441             die "\nABORTED: Visualization mask size exceeds data record size"
1442             if $#v_mask > $#fields;
1443             my @data_fields;
1444             foreach my $i (0..@fields-1) {
1445             if ($v_mask[$i] eq '0') {
1446             next;
1447             } elsif ($v_mask[$i] eq '1') {
1448             push @data_fields, $fields[$i];
1449             } else {
1450             die "Misformed visualization mask. It can only have 1s and 0s";
1451             }
1452             }
1453             $visualization_data{ $record_id } = \@data_fields;
1454             }
1455             my $K = scalar @{$self->{_clusters}};
1456             my $filename = basename($master_datafile);
1457             my $temp_file = "__temp_" . $filename;
1458             unlink $temp_file if -e $temp_file;
1459             open OUTPUT, ">$temp_file"
1460             or die "Unable to open a temp file in this directory: $!\n";
1461             foreach my $cluster (@{$self->{_clusters}}) {
1462             foreach my $item (@$cluster) {
1463             print OUTPUT "@{$visualization_data{$item}}";
1464             print OUTPUT "\n";
1465             }
1466             print OUTPUT "\n\n";
1467             }
1468             close OUTPUT;
1469             my $plot;
1470             if (!defined $pause_time) {
1471             $plot = Graphics::GnuplotIF->new( persist => 1 );
1472             } else {
1473             $plot = Graphics::GnuplotIF->new();
1474             }
1475             my $arg_string = "";
1476             if ($visualization_data_field_width > 2) {
1477             $plot->gnuplot_cmd( "set noclip" );
1478             $plot->gnuplot_cmd( "set pointsize 2" );
1479             foreach my $i (0..$K-1) {
1480             my $j = $i + 1;
1481             $arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster (naive Bayes) $i\" with points lt $j pt $j, ";
1482             }
1483             } elsif ($visualization_data_field_width == 2) {
1484             $plot->gnuplot_cmd( "set noclip" );
1485             $plot->gnuplot_cmd( "set pointsize 2" );
1486             foreach my $i (0..$K-1) {
1487             my $j = $i + 1;
1488             $arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster (naive Bayes) $i\" with points lt $j pt $j, ";
1489             my $ellipse_filename = "__contour_" . $i . ".dat";
1490             $arg_string .= "\"$ellipse_filename\" with line lt $j title \"\", ";
1491             }
1492             } elsif ($visualization_data_field_width == 1 ) {
1493             open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!\n";
1494             my @all_data = ;
1495             close INPUT;
1496             @all_data = map {chomp $_; $_ =~ /\d/ ? $_ : "SEPERATOR" } @all_data;
1497             my $all_joined_data = join ':', @all_data;
1498             my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data;
1499             my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals);
1500             foreach my $i (0..@separated-1) {
1501             $separated[$i] =~ s/SEPERATOR//g;
1502             my @cluster_for_hist = split /:/, $separated[$i];
1503             @cluster_for_hist = grep $_, @cluster_for_hist;
1504             my ($minval,$maxval) = minmax(\@cluster_for_hist);
1505             push @all_minvals, $minval;
1506             push @all_maxvals, $maxval;
1507             push @all_clusters_for_hist, \@cluster_for_hist;
1508             }
1509             push @all_minmaxvals, @all_minvals;
1510             push @all_minmaxvals, @all_maxvals;
1511             my ($abs_minval,$abs_maxval) = minmax(\@all_minmaxvals);
1512             my $delta = ($abs_maxval - $abs_minval) / 100.0;
1513             $plot->gnuplot_cmd("set boxwidth 3");
1514             $plot->gnuplot_cmd("set style fill solid border -1");
1515             $plot->gnuplot_cmd("set ytics out nomirror");
1516             $plot->gnuplot_cmd("set style data histograms");
1517             $plot->gnuplot_cmd("set style histogram clustered");
1518             $plot->gnuplot_cmd("set title 'Clusters shown through histograms'");
1519             $plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
1520             foreach my $cindex (0..@all_clusters_for_hist-1) {
1521             my $filename = basename($master_datafile);
1522             my $temp_file = "__temp1dhist_" . "$cindex" . "_" . $filename;
1523             unlink $temp_file if -e $temp_file;
1524             open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!\n";
1525             print OUTPUT "Xstep histval\n";
1526             my @histogram = (0) x 100;
1527             foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) {
1528             $histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++;
1529             }
1530             foreach my $i (0..@histogram-1) {
1531             print OUTPUT "$i $histogram[$i]\n";
1532             }
1533             # $arg_string .= "\"$temp_file\" using 1:2 ti col smooth frequency with boxes lc $cindex, ";
1534             $arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, ";
1535             close OUTPUT;
1536             }
1537             }
1538             $arg_string = $arg_string =~ /^(.*),[ ]+$/;
1539             $arg_string = $1;
1540             if ($visualization_data_field_width > 2) {
1541             $plot->gnuplot_cmd( 'set terminal png color',
1542             'set output "cluster_plot.png"');
1543             $plot->gnuplot_cmd( "splot $arg_string" );
1544             } elsif ($visualization_data_field_width == 2) {
1545             $plot->gnuplot_cmd('set terminal png',
1546             'set output "cluster_plot.png"');
1547             $plot->gnuplot_cmd( "plot $arg_string" );
1548             } elsif ($visualization_data_field_width == 1) {
1549             $plot->gnuplot_cmd('set terminal png',
1550             'set output "cluster_plot.png"');
1551             $plot->gnuplot_cmd( "plot $arg_string" );
1552             }
1553             }
1554              
1555             # This method is for the visualization of the posterior class distributions. In
1556             # other words, this method allows us to see the soft clustering produced by the EM
1557             # algorithm. While much of the gnuplot logic here is the same as in the
1558             # visualize_clusters() method, there are significant differences in how the data is
1559             # pooled for the purpose of display.
1560             sub visualize_distributions {
1561             my $self = shift;
1562             my $v_mask;
1563             my $pause_time;
1564             if (@_ == 1) {
1565             $v_mask = shift || die "visualization mask missing";
1566             } elsif (@_ == 2) {
1567             $v_mask = shift || die "visualization mask missing";
1568             $pause_time = shift;
1569             } else {
1570             die "visualize_distributions() called with wrong args";
1571             }
1572             my @v_mask = split //, $v_mask;
1573             my $visualization_mask_width = @v_mask;
1574             my $original_data_mask = $self->{_mask};
1575             my @mask = split //, $original_data_mask;
1576             my $data_field_width = scalar grep {$_ eq '1'} @mask;
1577             die "\n\nABORTED: The width of the visualization mask (including " .
1578             "all its 1s and 0s) must equal the width of the original mask " .
1579             "used for reading the data file (counting only the 1's)"
1580             if $visualization_mask_width != $data_field_width;
1581             my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
1582             if ($visualization_data_field_width == 2) {
1583             foreach my $cluster_index (0..$self->{_K}-1) {
1584             my $contour_filename = "__contour2_" . $cluster_index . ".dat";
1585             my $mean = $self->{_cluster_means}->[$cluster_index];
1586             my $covariance = $self->{_cluster_covariances}->[$cluster_index];
1587             my ($mux,$muy) = $mean->as_list();
1588             my ($varx,$sigmaxy) = $covariance->row(0)->as_list();
1589             my ($sigmayx,$vary) = $covariance->row(1)->as_list();
1590             die "Your covariance matrix does not look right"
1591             unless $sigmaxy == $sigmayx;
1592             my ($sigmax,$sigmay) = (sqrt($varx),sqrt($vary));
1593             my $argstring = <<"END";
1594             set contour
1595             mux = $mux
1596             muy = $muy
1597             sigmax = $sigmax
1598             sigmay = $sigmay
1599             sigmaxy = $sigmaxy
1600             determinant = (sigmax**2)*(sigmay**2) - sigmaxy**2
1601             exponent(x,y) = -0.5 * (1.0 / determinant) * ( ((x-mux)**2)*sigmay**2 + ((y-muy)**2)*sigmax**2 - 2*sigmaxy*(x-mux)*(y-muy) )
1602             f(x,y) = exp( exponent(x,y) ) - 0.2
1603             xmax = mux + 2 * sigmax
1604             xmin = mux - 2 * sigmax
1605             ymax = muy + 2 * sigmay
1606             ymin = muy - 2 * sigmay
1607             set xrange [ xmin : xmax ]
1608             set yrange [ ymin : ymax ]
1609             set isosamples 200
1610             unset surface
1611             set cntrparam levels discrete 0
1612             set table \"$contour_filename\"
1613             splot f(x,y)
1614             unset table
1615             END
1616             my $plot = Graphics::GnuplotIF->new();
1617             $plot->gnuplot_cmd( $argstring );
1618             }
1619             }
1620             my %visualization_data;
1621             while ( my ($record_id, $data) = each %{$self->{_data}} ) {
1622             my @fields = @$data;
1623             die "\nABORTED: Visualization mask size exceeds data record size"
1624             if $#v_mask > $#fields;
1625             my @data_fields;
1626             foreach my $i (0..@fields-1) {
1627             if ($v_mask[$i] eq '0') {
1628             next;
1629             } elsif ($v_mask[$i] eq '1') {
1630             push @data_fields, $fields[$i];
1631             } else {
1632             die "Misformed visualization mask. It can only have 1s and 0s";
1633             }
1634             }
1635             $visualization_data{ $record_id } = \@data_fields;
1636             }
1637             my $filename = basename($self->{_datafile});
1638             my $temp_file = "__temp2_" . $filename;
1639             unlink $temp_file if -e $temp_file;
1640             open OUTPUT, ">$temp_file"
1641             or die "Unable to open a temp file in this directory: $!\n";
1642             my @class_distributions;
1643             foreach my $cluster_index (0..$self->{_K}-1) {
1644             push @class_distributions, [];
1645             }
1646             foreach my $data_tag (@{$self->{_data_id_tags}}) {
1647             foreach my $cluster_index (0..$self->{_K}-1) {
1648             push @{$class_distributions[$cluster_index]}, $self->{_data}->{$data_tag}
1649             if $self->{_expected_class_probs}->{$data_tag}->[$cluster_index] > 0.2;
1650             }
1651             }
1652             foreach my $distribution (@class_distributions) {
1653             foreach my $item (@$distribution) {
1654             print OUTPUT "@$item";
1655             print OUTPUT "\n";
1656             }
1657             print OUTPUT "\n\n";
1658             }
1659             close OUTPUT;
1660             my $plot;
1661             if (!defined $pause_time) {
1662             $plot = Graphics::GnuplotIF->new( persist => 1 );
1663             } else {
1664             $plot = Graphics::GnuplotIF->new();
1665             }
1666             my $arg_string = "";
1667             if ($visualization_data_field_width > 2) {
1668             $plot->gnuplot_cmd( "set noclip" );
1669             $plot->gnuplot_cmd( "set pointsize 2" );
1670             foreach my $i (0..$self->{_K}-1) {
1671             my $j = $i + 1;
1672             $arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster $i (based on posterior probs)\" with points lt $j pt $j, ";
1673             }
1674             } elsif ($visualization_data_field_width == 2) {
1675             $plot->gnuplot_cmd( "set noclip" );
1676             $plot->gnuplot_cmd( "set pointsize 2" );
1677             foreach my $i (0..$self->{_K}-1) {
1678             my $j = $i + 1;
1679             $arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster $i (based on posterior probs)\" with points lt $j pt $j, ";
1680             my $ellipse_filename = "__contour2_" . $i . ".dat";
1681             $arg_string .= "\"$ellipse_filename\" with line lt $j title \"\", ";
1682             }
1683             } elsif ($visualization_data_field_width == 1 ) {
1684             open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!\n";
1685             my @all_data = ;
1686             close INPUT;
1687             @all_data = map {chomp $_; $_ =~ /\d/ ? $_ : "SEPERATOR" } @all_data;
1688             my $all_joined_data = join ':', @all_data;
1689             my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data;
1690             my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals);
1691             foreach my $i (0..@separated-1) {
1692             $separated[$i] =~ s/SEPERATOR//g;
1693             my @cluster_for_hist = split /:/, $separated[$i];
1694             @cluster_for_hist = grep $_, @cluster_for_hist;
1695             my ($minval,$maxval) = minmax(\@cluster_for_hist);
1696             push @all_minvals, $minval;
1697             push @all_maxvals, $maxval;
1698             push @all_clusters_for_hist, \@cluster_for_hist;
1699             }
1700             push @all_minmaxvals, @all_minvals;
1701             push @all_minmaxvals, @all_maxvals;
1702             my ($abs_minval,$abs_maxval) = minmax(\@all_minmaxvals);
1703             my $delta = ($abs_maxval - $abs_minval) / 100.0;
1704             $plot->gnuplot_cmd("set boxwidth 3");
1705             $plot->gnuplot_cmd("set style fill solid border -1");
1706             $plot->gnuplot_cmd("set ytics out nomirror");
1707             $plot->gnuplot_cmd("set style data histograms");
1708             $plot->gnuplot_cmd("set style histogram clustered");
1709             $plot->gnuplot_cmd("set title 'Individual distributions shown through histograms'");
1710             $plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
1711             foreach my $cindex (0..@all_clusters_for_hist-1) {
1712             my $localfilename = basename($filename);
1713             my $temp_file = "__temp1dhist_" . "$cindex" . "_" . $localfilename;
1714             unlink $temp_file if -e $temp_file;
1715             open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!\n";
1716             print OUTPUT "Xstep histval\n";
1717              
1718             my @histogram = (0) x 100;
1719             foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) {
1720             $histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++;
1721             }
1722             foreach my $i (0..@histogram-1) {
1723             print OUTPUT "$i $histogram[$i]\n";
1724             }
1725             # $arg_string .= "\"$temp_file\" using 1:2 ti col smooth frequency with boxes lc $cindex, ";
1726             $arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, ";
1727             close OUTPUT;
1728             }
1729             }
1730             $arg_string = $arg_string =~ /^(.*),[ ]+$/;
1731             $arg_string = $1;
1732             if ($visualization_data_field_width > 2) {
1733             $plot->gnuplot_cmd( "splot $arg_string" );
1734             $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
1735             } elsif ($visualization_data_field_width == 2) {
1736             $plot->gnuplot_cmd( "plot $arg_string" );
1737             $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
1738             } elsif ($visualization_data_field_width == 1) {
1739             $plot->gnuplot_cmd( "plot $arg_string" );
1740             $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
1741             }
1742             }
1743              
1744             # This method is basically the same as the previous method, except that it is
1745             # intended for making PNG files from the distributions.
1746             sub plot_hardcopy_distributions {
1747             my $self = shift;
1748             my $v_mask;
1749             my $pause_time;
1750             if (@_ == 1) {
1751             $v_mask = shift || die "visualization mask missing";
1752             } elsif (@_ == 2) {
1753             $v_mask = shift || die "visualization mask missing";
1754             $pause_time = shift;
1755             } else {
1756             die "visualize_distributions() called with wrong args";
1757             }
1758             my @v_mask = split //, $v_mask;
1759             my $visualization_mask_width = @v_mask;
1760             my $original_data_mask = $self->{_mask};
1761             my @mask = split //, $original_data_mask;
1762             my $data_field_width = scalar grep {$_ eq '1'} @mask;
1763             die "\n\nABORTED: The width of the visualization mask (including " .
1764             "all its 1s and 0s) must equal the width of the original mask " .
1765             "used for reading the data file (counting only the 1's)"
1766             if $visualization_mask_width != $data_field_width;
1767             my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
1768             if ($visualization_data_field_width == 2) {
1769             foreach my $cluster_index (0..$self->{_K}-1) {
1770             my $contour_filename = "__contour2_" . $cluster_index . ".dat";
1771             my $mean = $self->{_cluster_means}->[$cluster_index];
1772             my $covariance = $self->{_cluster_covariances}->[$cluster_index];
1773             my ($mux,$muy) = $mean->as_list();
1774             my ($varx,$sigmaxy) = $covariance->row(0)->as_list();
1775             my ($sigmayx,$vary) = $covariance->row(1)->as_list();
1776             die "Your covariance matrix does not look right"
1777             unless $sigmaxy == $sigmayx;
1778             my ($sigmax,$sigmay) = (sqrt($varx),sqrt($vary));
1779             my $argstring = <<"END";
1780             set contour
1781             mux = $mux
1782             muy = $muy
1783             sigmax = $sigmax
1784             sigmay = $sigmay
1785             sigmaxy = $sigmaxy
1786             determinant = (sigmax**2)*(sigmay**2) - sigmaxy**2
1787             exponent(x,y) = -0.5 * (1.0 / determinant) * ( ((x-mux)**2)*sigmay**2 + ((y-muy)**2)*sigmax**2 - 2*sigmaxy*(x-mux)*(y-muy) )
1788             f(x,y) = exp( exponent(x,y) ) - 0.2
1789             xmax = mux + 2 * sigmax
1790             xmin = mux - 2 * sigmax
1791             ymax = muy + 2 * sigmay
1792             ymin = muy - 2 * sigmay
1793             set xrange [ xmin : xmax ]
1794             set yrange [ ymin : ymax ]
1795             set isosamples 200
1796             unset surface
1797             set cntrparam levels discrete 0
1798             set table \"$contour_filename\"
1799             splot f(x,y)
1800             unset table
1801             END
1802             my $plot = Graphics::GnuplotIF->new();
1803             $plot->gnuplot_cmd( $argstring );
1804             }
1805             }
1806             my %visualization_data;
1807             while ( my ($record_id, $data) = each %{$self->{_data}} ) {
1808             my @fields = @$data;
1809             die "\nABORTED: Visualization mask size exceeds data record size"
1810             if $#v_mask > $#fields;
1811             my @data_fields;
1812             foreach my $i (0..@fields-1) {
1813             if ($v_mask[$i] eq '0') {
1814             next;
1815             } elsif ($v_mask[$i] eq '1') {
1816             push @data_fields, $fields[$i];
1817             } else {
1818             die "Misformed visualization mask. It can only have 1s and 0s";
1819             }
1820             }
1821             $visualization_data{ $record_id } = \@data_fields;
1822             }
1823             my $filename = basename($self->{_datafile});
1824             my $temp_file = "__temp2_" . $filename;
1825             unlink $temp_file if -e $temp_file;
1826             open OUTPUT, ">$temp_file"
1827             or die "Unable to open a temp file in this directory: $!\n";
1828             my @class_distributions;
1829             foreach my $cluster_index (0..$self->{_K}-1) {
1830             push @class_distributions, [];
1831             }
1832             foreach my $data_tag (@{$self->{_data_id_tags}}) {
1833             foreach my $cluster_index (0..$self->{_K}-1) {
1834             push @{$class_distributions[$cluster_index]}, $self->{_data}->{$data_tag}
1835             if $self->{_expected_class_probs}->{$data_tag}->[$cluster_index] > 0.2;
1836             }
1837             }
1838             foreach my $distribution (@class_distributions) {
1839             foreach my $item (@$distribution) {
1840             print OUTPUT "@$item";
1841             print OUTPUT "\n";
1842             }
1843             print OUTPUT "\n\n";
1844             }
1845             close OUTPUT;
1846             my $plot;
1847             if (!defined $pause_time) {
1848             $plot = Graphics::GnuplotIF->new( persist => 1 );
1849             } else {
1850             $plot = Graphics::GnuplotIF->new();
1851             }
1852             my $arg_string = "";
1853             if ($visualization_data_field_width > 2) {
1854             $plot->gnuplot_cmd( "set noclip" );
1855             $plot->gnuplot_cmd( "set pointsize 2" );
1856             foreach my $i (0..$self->{_K}-1) {
1857             my $j = $i + 1;
1858             $arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster $i (based on posterior probs)\" with points lt $j pt $j, ";
1859             }
1860             } elsif ($visualization_data_field_width == 2) {
1861             $plot->gnuplot_cmd( "set noclip" );
1862             $plot->gnuplot_cmd( "set pointsize 2" );
1863             foreach my $i (0..$self->{_K}-1) {
1864             my $j = $i + 1;
1865             $arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster $i (based on posterior probs)\" with points lt $j pt $j, ";
1866             my $ellipse_filename = "__contour2_" . $i . ".dat";
1867             $arg_string .= "\"$ellipse_filename\" with line lt $j title \"\", ";
1868             }
1869             } elsif ($visualization_data_field_width == 1 ) {
1870             open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!\n";
1871             my @all_data = ;
1872             close INPUT;
1873             @all_data = map {chomp $_; $_ =~ /\d/ ? $_ : "SEPERATOR" } @all_data;
1874             my $all_joined_data = join ':', @all_data;
1875             my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data;
1876             my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals);
1877             foreach my $i (0..@separated-1) {
1878             $separated[$i] =~ s/SEPERATOR//g;
1879             my @cluster_for_hist = split /:/, $separated[$i];
1880             @cluster_for_hist = grep $_, @cluster_for_hist;
1881             my ($minval,$maxval) = minmax(\@cluster_for_hist);
1882             push @all_minvals, $minval;
1883             push @all_maxvals, $maxval;
1884             push @all_clusters_for_hist, \@cluster_for_hist;
1885             }
1886             push @all_minmaxvals, @all_minvals;
1887             push @all_minmaxvals, @all_maxvals;
1888             my ($abs_minval,$abs_maxval) = minmax(\@all_minmaxvals);
1889             my $delta = ($abs_maxval - $abs_minval) / 100.0;
1890             $plot->gnuplot_cmd("set boxwidth 3");
1891             $plot->gnuplot_cmd("set style fill solid border -1");
1892             $plot->gnuplot_cmd("set ytics out nomirror");
1893             $plot->gnuplot_cmd("set style data histograms");
1894             $plot->gnuplot_cmd("set style histogram clustered");
1895             $plot->gnuplot_cmd("set title 'Individual distributions shown through histograms'");
1896             $plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
1897             foreach my $cindex (0..@all_clusters_for_hist-1) {
1898             my $localfilename = basename($filename);
1899             my $temp_file = "__temp1dhist_" . "$cindex" . "_" . $localfilename;
1900             unlink $temp_file if -e $temp_file;
1901             open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!\n";
1902             print OUTPUT "Xstep histval\n";
1903              
1904             my @histogram = (0) x 100;
1905             foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) {
1906             $histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++;
1907             }
1908             foreach my $i (0..@histogram-1) {
1909             print OUTPUT "$i $histogram[$i]\n";
1910             }
1911             $arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, ";
1912             close OUTPUT;
1913             }
1914             }
1915             $arg_string = $arg_string =~ /^(.*),[ ]+$/;
1916             $arg_string = $1;
1917              
1918             if ($visualization_data_field_width > 2) {
1919             $plot->gnuplot_cmd( 'set terminal png',
1920             'set output "posterior_prob_plot.png"');
1921             $plot->gnuplot_cmd( "splot $arg_string" );
1922             } elsif ($visualization_data_field_width == 2) {
1923             $plot->gnuplot_cmd( 'set terminal png',
1924             'set output "posterior_prob_plot.png"');
1925             $plot->gnuplot_cmd( "plot $arg_string" );
1926             } elsif ($visualization_data_field_width == 1) {
1927             $plot->gnuplot_cmd( 'set terminal png',
1928             'set output "posterior_prob_plot.png"');
1929             $plot->gnuplot_cmd( "plot $arg_string" );
1930             }
1931             }
1932              
1933             # The method shown below should be called only AFTER you have called the method
1934             # read_data_from_file(). The visualize_data() is meant for the visualization of the
1935             # original data in its various 2D or 3D subspaces.
1936             sub visualize_data {
1937             my $self = shift;
1938             my $v_mask = shift || die "visualization mask missing";
1939              
1940             my $master_datafile = $self->{_datafile};
1941              
1942             my @v_mask = split //, $v_mask;
1943             my $visualization_mask_width = @v_mask;
1944             my $original_data_mask = $self->{_mask};
1945             my @mask = split //, $original_data_mask;
1946             my $data_field_width = scalar grep {$_ eq '1'} @mask;
1947             die "\n\nABORTED: The width of the visualization mask (including " .
1948             "all its 1s and 0s) must equal the width of the original mask " .
1949             "used for reading the data file (counting only the 1's)"
1950             if $visualization_mask_width != $data_field_width;
1951             my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
1952             my %visualization_data;
1953             my $data_source = $self->{_data};
1954             while ( my ($record_id, $data) = each %{$data_source} ) {
1955             my @fields = @$data;
1956             die "\nABORTED: Visualization mask size exceeds data record size"
1957             if $#v_mask > $#fields;
1958             my @data_fields;
1959             foreach my $i (0..@fields-1) {
1960             if ($v_mask[$i] eq '0') {
1961             next;
1962             } elsif ($v_mask[$i] eq '1') {
1963             push @data_fields, $fields[$i];
1964             } else {
1965             die "Misformed visualization mask. It can only have 1s and 0s";
1966             }
1967             }
1968             $visualization_data{ $record_id } = \@data_fields;
1969             }
1970             my $filename = basename($master_datafile);
1971             my $temp_file;
1972             $temp_file = "__temp_data_" . $filename;
1973             unlink $temp_file if -e $temp_file;
1974             open OUTPUT, ">$temp_file"
1975             or die "Unable to open a temp file in this directory: $!\n";
1976             foreach my $datapoint (values %visualization_data) {
1977             print OUTPUT "@$datapoint";
1978             print OUTPUT "\n";
1979             }
1980             close OUTPUT;
1981             my $plot = Graphics::GnuplotIF->new( persist => 1 );
1982             $plot->gnuplot_cmd( "set noclip" );
1983             $plot->gnuplot_cmd( "set pointsize 2" );
1984             my $plot_title = '"original data provided for EM"';
1985             my $arg_string ;
1986             if ($visualization_data_field_width > 2) {
1987             $arg_string = "\"$temp_file\" using 1:2:3 title $plot_title with points lt -1 pt 1";
1988             } elsif ($visualization_data_field_width == 2) {
1989             $arg_string = "\"$temp_file\" using 1:2 title $plot_title with points lt -1 pt 1";
1990             } elsif ($visualization_data_field_width == 1 ) {
1991             open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!\n";
1992             my @all_data = ;
1993             close INPUT;
1994             @all_data = map {chomp $_; $_} @all_data;
1995             @all_data = grep $_, @all_data;
1996             my ($minval,$maxval) = minmax(\@all_data);
1997             my $delta = ($maxval - $minval) / 100.0;
1998             $plot->gnuplot_cmd("set boxwidth 3");
1999             $plot->gnuplot_cmd("set style fill solid border -1");
2000             $plot->gnuplot_cmd("set ytics out nomirror");
2001             $plot->gnuplot_cmd("set style data histograms");
2002             $plot->gnuplot_cmd("set style histogram clustered");
2003             $plot->gnuplot_cmd("set title 'Overall distribution of 1D data'");
2004             $plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
2005             my $localfilename = basename($filename);
2006             my $temp_file = "__temp1dhist_" . $localfilename;
2007             unlink $temp_file if -e $temp_file;
2008             open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!\n";
2009             print OUTPUT "Xstep histval\n";
2010             my @histogram = (0) x 100;
2011             foreach my $i (0..@all_data-1) {
2012             $histogram[int( ($all_data[$i] - $minval) / $delta )]++;
2013             }
2014             foreach my $i (0..@histogram-1) {
2015             print OUTPUT "$i $histogram[$i]\n";
2016             }
2017             $arg_string = "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc rgb 'green'";
2018             close OUTPUT;
2019             }
2020             if ($visualization_data_field_width > 2) {
2021             $plot->gnuplot_cmd( "splot $arg_string" );
2022             } elsif ($visualization_data_field_width == 2) {
2023             $plot->gnuplot_cmd( "plot $arg_string" );
2024             } elsif ($visualization_data_field_width == 1) {
2025             $plot->gnuplot_cmd( "plot $arg_string" );
2026             }
2027             }
2028              
2029             # This method is the same as the one shown above, except that it is meant for
2030             # creating PNG files of the displays.
2031             sub plot_hardcopy_data {
2032             my $self = shift;
2033             my $v_mask = shift || die "visualization mask missing";
2034             my $master_datafile = $self->{_datafile};
2035             my @v_mask = split //, $v_mask;
2036             my $visualization_mask_width = @v_mask;
2037             my $original_data_mask = $self->{_mask};
2038             my @mask = split //, $original_data_mask;
2039             my $data_field_width = scalar grep {$_ eq '1'} @mask;
2040             die "\n\nABORTED: The width of the visualization mask (including " .
2041             "all its 1s and 0s) must equal the width of the original mask " .
2042             "used for reading the data file (counting only the 1's)"
2043             if $visualization_mask_width != $data_field_width;
2044             my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
2045             my %visualization_data;
2046             my $data_source = $self->{_data};
2047             while ( my ($record_id, $data) = each %{$data_source} ) {
2048             my @fields = @$data;
2049             die "\nABORTED: Visualization mask size exceeds data record size"
2050             if $#v_mask > $#fields;
2051             my @data_fields;
2052             foreach my $i (0..@fields-1) {
2053             if ($v_mask[$i] eq '0') {
2054             next;
2055             } elsif ($v_mask[$i] eq '1') {
2056             push @data_fields, $fields[$i];
2057             } else {
2058             die "Misformed visualization mask. It can only have 1s and 0s";
2059             }
2060             }
2061             $visualization_data{ $record_id } = \@data_fields;
2062             }
2063             my $filename = basename($master_datafile);
2064             my $temp_file;
2065             $temp_file = "__temp_data_" . $filename;
2066             unlink $temp_file if -e $temp_file;
2067             open OUTPUT, ">$temp_file"
2068             or die "Unable to open a temp file in this directory: $!\n";
2069             foreach my $datapoint (values %visualization_data) {
2070             print OUTPUT "@$datapoint";
2071             print OUTPUT "\n";
2072             }
2073             close OUTPUT;
2074              
2075             my $plot = Graphics::GnuplotIF->new( persist => 1 );
2076             $plot->gnuplot_cmd( "set noclip" );
2077             $plot->gnuplot_cmd( "set pointsize 2" );
2078             my $plot_title = '"original data provided for EM"';
2079             my $arg_string ;
2080             if ($visualization_data_field_width > 2) {
2081             $arg_string = "\"$temp_file\" using 1:2:3 title $plot_title with points lt -1 pt 1";
2082             } elsif ($visualization_data_field_width == 2) {
2083             $arg_string = "\"$temp_file\" using 1:2 title $plot_title with points lt -1 pt 1";
2084             } elsif ($visualization_data_field_width == 1 ) {
2085             open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!\n";
2086             my @all_data = ;
2087             close INPUT;
2088             @all_data = map {chomp $_; $_} @all_data;
2089             @all_data = grep $_, @all_data;
2090             my ($minval,$maxval) = minmax(\@all_data);
2091             my $delta = ($maxval - $minval) / 100.0;
2092             $plot->gnuplot_cmd("set boxwidth 3");
2093             $plot->gnuplot_cmd("set style fill solid border -1");
2094             $plot->gnuplot_cmd("set ytics out nomirror");
2095             $plot->gnuplot_cmd("set style data histograms");
2096             $plot->gnuplot_cmd("set style histogram clustered");
2097             $plot->gnuplot_cmd("set title 'Overall distribution of 1D data'");
2098             $plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
2099             my $localfilename = basename($filename);
2100             my $temp_file = "__temp1dhist_" . $localfilename;
2101             unlink $temp_file if -e $temp_file;
2102             open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!\n";
2103             print OUTPUT "Xstep histval\n";
2104             my @histogram = (0) x 100;
2105             foreach my $i (0..@all_data-1) {
2106             $histogram[int( ($all_data[$i] - $minval) / $delta )]++;
2107             }
2108             foreach my $i (0..@histogram-1) {
2109             print OUTPUT "$i $histogram[$i]\n";
2110             }
2111             $arg_string = "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc rgb 'green'";
2112             close OUTPUT;
2113             }
2114             if ($visualization_data_field_width > 2) {
2115             $plot->gnuplot_cmd( 'set terminal png',
2116             'set output "data_scatter_plot.png"');
2117             $plot->gnuplot_cmd( "splot $arg_string" );
2118             } elsif ($visualization_data_field_width == 2) {
2119             $plot->gnuplot_cmd( 'set terminal png',
2120             'set output "data_scatter_plot.png"');
2121             $plot->gnuplot_cmd( "plot $arg_string" );
2122             } elsif ($visualization_data_field_width == 1) {
2123             $plot->gnuplot_cmd( 'set terminal png',
2124             'set output "data_scatter_plot.png"');
2125             $plot->gnuplot_cmd( "plot $arg_string" );
2126             }
2127             }
2128              
2129              
2130             ################### Generating Synthetic Data for Clustering ###################
2131              
2132             # The data generated corresponds to a multivariate distribution. The mean and the
2133             # covariance of each Gaussian in the distribution are specified individually in a
2134             # parameter file. The parameter file must also state the prior probabilities to be
2135             # associated with each Gaussian. See the example parameter file param1.txt in the
2136             # examples directory. Just edit this file for your own needs.
2137             #
2138             # The multivariate random numbers are generated by calling the Math::Random module.
2139             # As you would expect, that module will insist that the covariance matrix you
2140             # specify be symmetric and positive definite.
2141             sub cluster_data_generator {
2142             my $class = shift;
2143             die "illegal call of a class method"
2144             unless $class eq 'Algorithm::ExpectationMaximization';
2145             my %args = @_;
2146             my $input_parameter_file = $args{input_parameter_file};
2147             my $output_file = $args{output_datafile};
2148             my $N = $args{total_number_of_data_points};
2149             my @all_params;
2150             my $param_string;
2151             if (defined $input_parameter_file) {
2152             open INPUT, $input_parameter_file || "unable to open parameter file: $!";
2153             @all_params = ;
2154             @all_params = grep { $_ !~ /^[ ]*#/ } @all_params;
2155             chomp @all_params;
2156             $param_string = join ' ', @all_params;
2157             } else {
2158             # Just for testing. Used in t/test.t
2159             $param_string = "priors 0.34 0.33 0.33 " .
2160             "cluster 5 0 0 1 0 0 0 1 0 0 0 1 " .
2161             "cluster 0 5 0 1 0 0 0 1 0 0 0 1 " .
2162             "cluster 0 0 5 1 0 0 0 1 0 0 0 1";
2163             }
2164             my ($priors_string) = $param_string =~ /priors(.+?)cluster/;
2165             croak "You did not specify the prior probabilities in the parameter file"
2166             unless $priors_string;
2167             my @priors = split /\s+/, $priors_string;
2168             @priors = grep {/$_num_regex/; $_} @priors;
2169             my $sum = 0;
2170             foreach my $prior (@priors) {
2171             $sum += $prior;
2172             }
2173             croak "Your priors in the parameter file do not add up to 1" unless $sum == 1;
2174             my ($rest_of_string) = $param_string =~ /priors\s*$priors_string(.*)$/;
2175             my @cluster_strings = split /[ ]*cluster[ ]*/, $rest_of_string;
2176             @cluster_strings = grep $_, @cluster_strings;
2177              
2178             my $K = @cluster_strings;
2179             croak "Too many clusters requested" if $K > 12;
2180             croak "Mismatch between the number of values for priors and the number " .
2181             "of clusters" unless $K == @priors;
2182             my @point_labels = ('a'..'z');
2183             print "Prior probabilities recorded from param file: @priors\n";
2184             print "Number of Gaussians used for the synthetic data: $K\n";
2185             my @means;
2186             my @covariances;
2187             my $data_dimension;
2188             foreach my $i (0..$K-1) {
2189             my @num_strings = split / /, $cluster_strings[$i];
2190             my @cluster_mean = map {/$_num_regex/;$_} split / /, $num_strings[0];
2191             $data_dimension = @cluster_mean;
2192             push @means, \@cluster_mean;
2193             my @covariance_nums = map {/$_num_regex/;$_} split / /, $num_strings[1];
2194             croak "dimensionality error" if @covariance_nums !=
2195             ($data_dimension ** 2);
2196             my $cluster_covariance;
2197             foreach my $j (0..$data_dimension-1) {
2198             foreach my $k (0..$data_dimension-1) {
2199             $cluster_covariance->[$j]->[$k] =
2200             $covariance_nums[$j*$data_dimension + $k];
2201             }
2202             }
2203             push @covariances, $cluster_covariance;
2204             }
2205             random_seed_from_phrase( 'hellomellojello' );
2206             my @data_dump;
2207             foreach my $i (0..$K-1) {
2208             my @m = @{shift @means};
2209             my @covar = @{shift @covariances};
2210             my @new_data = Math::Random::random_multivariate_normal(
2211             int($N * $priors[$i]), @m, @covar );
2212             my $p = 0;
2213             my $label = $point_labels[$i];
2214             @new_data = map {unshift @$_, $label.$i; $i++; $_} @new_data;
2215             push @data_dump, @new_data;
2216             }
2217             fisher_yates_shuffle( \@data_dump );
2218             open OUTPUT, ">$output_file";
2219             print OUTPUT "\#Data generated from the parameter file: $input_parameter_file\n"
2220             if $input_parameter_file;
2221             print OUTPUT "\#Total number of data points in this file: $N\n";
2222             print OUTPUT "\#Prior class probabilities for this data: @priors\n";
2223             foreach my $ele (@data_dump) {
2224             foreach my $coord ( @$ele ) {
2225             print OUTPUT "$coord ";
2226             }
2227             print OUTPUT "\n";
2228             }
2229             print "Data written out to file $output_file\n";
2230             close OUTPUT;
2231             }
2232              
2233             sub add_point_coords {
2234             my $self = shift;
2235             my @arr_of_ids = @{shift @_}; # array of data element names
2236             my @result;
2237             my $data_dimensionality = $self->{_data_dimensions};
2238             foreach my $i (0..$data_dimensionality-1) {
2239             $result[$i] = 0.0;
2240             }
2241             foreach my $id (@arr_of_ids) {
2242             my $ele = $self->{_data}->{$id};
2243             my $i = 0;
2244             foreach my $component (@$ele) {
2245             $result[$i] += $component;
2246             $i++;
2247             }
2248             }
2249             return \@result;
2250             }
2251              
2252             ###################### Support Routines ########################
2253              
2254             # computer the outer product of two column vectors
2255             sub outer_product {
2256             my $vec1 = shift;
2257             my $vec2 = shift;
2258             my ($nrows1, $ncols1) = ($vec1->rows(), $vec1->cols());
2259             my ($nrows2, $ncols2) = ($vec2->rows(), $vec2->cols());
2260             die "Outer product operation called with non-matching vectors"
2261             unless $ncols1 == 1 && $ncols2 == 1 && $nrows1 == $nrows2;
2262             my @vec_arr1 = $vec1->as_list();
2263             my @vec_arr2 = $vec2->as_list();
2264             my $outer_product = Math::GSL::Matrix->new($nrows1, $nrows2);
2265             foreach my $index (0..$nrows1-1) {
2266             my @new_row = map $vec_arr1[$index] * $_, @vec_arr2;
2267             $outer_product->set_row($index, \@new_row);
2268             }
2269             return $outer_product;
2270             }
2271              
2272             sub get_index_at_value {
2273             my $value = shift;
2274             my @array = @{shift @_};
2275             foreach my $i (0..@array-1) {
2276             return $i if $value == $array[$i];
2277             }
2278             }
2279              
2280             # This routine is really not necessary in light of the new `~~' operator in Perl.
2281             # Will use the new operator in the next version.
2282             sub vector_equal {
2283             my $vec1 = shift;
2284             my $vec2 = shift;
2285             die "wrong data types for distance calculation\n" if @$vec1 != @$vec2;
2286             foreach my $i (0..@$vec1-1){
2287             return 0 if $vec1->[$i] != $vec2->[$i];
2288             }
2289             return 1;
2290             }
2291              
2292             sub compare_array_floats {
2293             my $vec1 = shift;
2294             my $vec2 = shift;
2295             foreach my $i (0..@$vec1-1){
2296             return 0 if abs($vec1->[$i] - $vec2->[$i]) > 0.00001;
2297             }
2298             return 1;
2299             }
2300              
2301             # Returns the minimum value and its positional index in an array
2302             sub minimum {
2303             my $arr = shift;
2304             my $min;
2305             my $index;
2306             foreach my $i (0..@{$arr}-1) {
2307             if ( (!defined $min) || ($arr->[$i] < $min) ) {
2308             $index = $i;
2309             $min = $arr->[$i];
2310             }
2311             }
2312             return ($min, $index);
2313             }
2314              
2315             sub minmax {
2316             my $arr = shift;
2317             my $min;
2318             my $max;
2319             foreach my $i (0..@{$arr}-1) {
2320             if ( (!defined $min) && (!defined $max) ) {
2321             $min = $arr->[$i];
2322             $max = $arr->[$i];
2323             } elsif ( $arr->[$i] < $min ) {
2324             $min = $arr->[$i];
2325             } elsif ( $arr->[$i] > $max ) {
2326             $max = $arr->[$i];
2327             }
2328             }
2329             return ($min, $max);
2330             }
2331              
2332             # Meant only for constructing a deep copy of an array of
2333             # arrays:
2334             sub deep_copy_AoA {
2335             my $ref_in = shift;
2336             my $ref_out;
2337             foreach my $i (0..@{$ref_in}-1) {
2338             foreach my $j (0..@{$ref_in->[$i]}-1) {
2339             $ref_out->[$i]->[$j] = $ref_in->[$i]->[$j];
2340             }
2341             }
2342             return $ref_out;
2343             }
2344              
2345             # Meant only for constructing a deep copy of an array of arrays for the case when
2346             # some elements of the top-level array may be undefined:
2347             sub deep_copy_AoA_with_nulls {
2348             my $ref_in = shift;
2349             my $ref_out;
2350             foreach my $i (0..@{$ref_in}-1) {
2351             if ( !defined $ref_in->[$i] ) {
2352             $ref_out->[$i] = undef;
2353             next;
2354             }
2355             foreach my $j (0..@{$ref_in->[$i]}-1) {
2356             $ref_out->[$i]->[$j] = $ref_in->[$i]->[$j];
2357             }
2358             }
2359             return $ref_out;
2360             }
2361              
2362             # Meant only for constructing a deep copy of a hash in which each value is an
2363             # anonymous array of numbers:
2364             sub deep_copy_hash {
2365             my $ref_in = shift;
2366             my $ref_out;
2367             while ( my ($key, $value) = each( %$ref_in ) ) {
2368             $ref_out->{$key} = deep_copy_array( $value );
2369             }
2370             return $ref_out;
2371             }
2372              
2373             # Meant only for an array of numbers:
2374             sub deep_copy_array {
2375             my $ref_in = shift;
2376             my $ref_out;
2377             foreach my $i (0..@{$ref_in}-1) {
2378             $ref_out->[$i] = $ref_in->[$i];
2379             }
2380             return $ref_out;
2381             }
2382              
2383             # from perl docs:
2384             sub fisher_yates_shuffle {
2385             my $arr = shift;
2386             my $i = @$arr;
2387             while (--$i) {
2388             my $j = int rand( $i + 1 );
2389             @$arr[$i, $j] = @$arr[$j, $i];
2390             }
2391             }
2392              
2393             sub mean_and_variance {
2394             my @data = @{shift @_};
2395             my ($mean, $variance);
2396             foreach my $i (1..@data) {
2397             if ($i == 1) {
2398             $mean = $data[0];
2399             $variance = 0;
2400             } else {
2401             # data[$i-1] because of zero-based indexing of vector
2402             $mean = ( (($i-1)/$i) * $mean ) + $data[$i-1] / $i;
2403             $variance = ( (($i-1)/$i) * $variance )
2404             + ($data[$i-1]-$mean)**2 / ($i-1);
2405             }
2406             }
2407             return ($mean, $variance);
2408             }
2409              
2410             sub check_for_illegal_params {
2411             my @params = @_;
2412             my @legal_params = qw / datafile
2413             mask
2414             K
2415             terminal_output
2416             max_em_iterations
2417             seeding
2418             class_priors
2419             seed_tags
2420             debug
2421             /;
2422             my $found_match_flag;
2423             foreach my $param (@params) {
2424             foreach my $legal (@legal_params) {
2425             $found_match_flag = 0;
2426             if ($param eq $legal) {
2427             $found_match_flag = 1;
2428             last;
2429             }
2430             }
2431             last if $found_match_flag == 0;
2432             }
2433             return $found_match_flag;
2434             }
2435              
2436             sub get_value_index_hash {
2437             my $arr = shift;
2438             my %hash;
2439             foreach my $index (0..@$arr-1) {
2440             $hash{$arr->[$index]} = $index if $arr->[$index] > 0;
2441             }
2442             return \%hash;
2443             }
2444              
2445             sub non_maximum_supression {
2446             my $arr = shift;
2447             my @output = (0) x @$arr;
2448             my @final_output = (0) x @$arr;
2449             my %hash;
2450             my @array_of_runs = ([$arr->[0]]);
2451             foreach my $index (1..@$arr-1) {
2452             if ($arr->[$index] == $arr->[$index-1]) {
2453             push @{$array_of_runs[-1]}, $arr->[$index];
2454             } else {
2455             push @array_of_runs, [$arr->[$index]];
2456             }
2457             }
2458             my $runstart_index = 0;
2459             foreach my $run_index (1..@array_of_runs-2) {
2460             $runstart_index += @{$array_of_runs[$run_index-1]};
2461             if ($array_of_runs[$run_index]->[0] >
2462             $array_of_runs[$run_index-1]->[0] &&
2463             $array_of_runs[$run_index]->[0] >
2464             $array_of_runs[$run_index+1]->[0]) {
2465             my $run_center = @{$array_of_runs[$run_index]} / 2;
2466             my $assignment_index = $runstart_index + $run_center;
2467             $output[$assignment_index] = $arr->[$assignment_index];
2468             }
2469             }
2470             if ($array_of_runs[-1]->[0] > $array_of_runs[-2]->[0]) {
2471             $runstart_index += @{$array_of_runs[-2]};
2472             my $run_center = @{$array_of_runs[-1]} / 2;
2473             my $assignment_index = $runstart_index + $run_center;
2474             $output[$assignment_index] = $arr->[$assignment_index];
2475             }
2476             if ($array_of_runs[0]->[0] > $array_of_runs[1]->[0]) {
2477             my $run_center = @{$array_of_runs[0]} / 2;
2478             $output[$run_center] = $arr->[$run_center];
2479             }
2480             return \@output;
2481             }
2482              
2483             sub display_matrix {
2484             my $message = shift;
2485             my $matrix = shift;
2486             if (!defined blessed($matrix)) {
2487             print "display_matrix called on a scalar value: $matrix\n";
2488             return;
2489             }
2490             my $nrows = $matrix->rows();
2491             my $ncols = $matrix->cols();
2492             print "$message ($nrows rows and $ncols columns)\n";
2493             foreach my $i (0..$nrows-1) {
2494             my $row = $matrix->row($i);
2495             my @row_as_list = $row->as_list;
2496             print "@row_as_list\n";
2497             }
2498             print "\n";
2499             }
2500              
2501             sub transpose {
2502             my $matrix = shift;
2503             my $num_rows = $matrix->rows();
2504             my $num_cols = $matrix->cols();
2505             my $transpose = Math::GSL::Matrix->new($num_cols, $num_rows);
2506             foreach my $i (0..$num_rows-1) {
2507             my @row = $matrix->row($i)->as_list;
2508             $transpose->set_col($i, \@row );
2509             }
2510             return $transpose;
2511             }
2512              
2513             sub vector_multiply {
2514             my $vec1 = shift;
2515             my $vec2 = shift;
2516             die "vec_multiply called with two vectors of different sizes"
2517             unless @$vec1 == @$vec2;
2518             my $result = 0;
2519             foreach my $i (0..@$vec1-1) {
2520             $result += $vec1->[$i] * $vec2->[$i];
2521             }
2522             return $result;
2523             }
2524              
2525             sub vector_2_vector_multiply {
2526             my $vec1 = shift;
2527             my $vec2 = shift;
2528             die "vec_multiply called with two vectors of different sizes"
2529             unless @$vec1 == @$vec2;
2530             my @result_vec;
2531             foreach my $i (0..@$vec1-1) {
2532             $result_vec[$i] = $vec1->[$i] * $vec2->[$i];
2533             }
2534             return \@result_vec;
2535             }
2536              
2537             sub matrix_multiply {
2538             my $matrix1 = shift;
2539             my $matrix2 = shift;
2540             my ($nrows1, $ncols1) = ($matrix1->rows(), $matrix1->cols());
2541             my ($nrows2, $ncols2) = ($matrix2->rows(), $matrix2->cols());
2542             die "matrix multiplication called with non-matching matrix arguments"
2543             # unless $ncols1 == $nrows2;
2544             unless $nrows1 == $ncols2 && $ncols1 == $nrows2;
2545             if ($nrows1 == 1) {
2546             my @row = $matrix1->row(0)->as_list;
2547             my @col = $matrix2->col(0)->as_list;
2548             my $result;
2549             foreach my $j (0..$ncols1-1) {
2550             $result += $row[$j] * $col[$j];
2551             }
2552             return $result;
2553             }
2554             my $product = Math::GSL::Matrix->new($nrows1, $nrows1);
2555             foreach my $i (0..$nrows1-1) {
2556             my $row = $matrix1->row($i);
2557             my @product_row;
2558             foreach my $j (0..$ncols2-1) {
2559             my $col = $matrix2->col($j);
2560             my $row_times_col = matrix_multiply($row, $col);
2561             push @product_row, $row_times_col;
2562             }
2563             $product->set_row($i, \@product_row);
2564             }
2565             return $product;
2566             }
2567              
2568             sub vector_matrix_multiply {
2569             my $matrix1 = shift;
2570             my $matrix2 = shift;
2571             my ($nrows1, $ncols1) = ($matrix1->rows, $matrix1->cols);
2572             my ($nrows2, $ncols2) = ($matrix2->rows, $matrix2->cols);
2573             die "matrix multiplication called with non-matching matrix arguments"
2574             unless $nrows1 == 1 && $ncols1 == $nrows2;
2575             if ($ncols2 == 1) {
2576             my @row = $matrix1->row(0)->as_list;
2577             my @col = $matrix2->col(0)->as_list;
2578             my $result;
2579             foreach my $j (0..$ncols1-1) {
2580             $result += $row[$j] * $col[$j];
2581             }
2582             return $result;
2583             }
2584             my $product = Math::GSL::Matrix->new(1, $ncols2);
2585             my $row = $matrix1->row(0);
2586             my @product_row;
2587             foreach my $j (0..$ncols2-1) {
2588             my $col = $matrix2->col($j);
2589             my $row_times_col = vector_matrix_multiply($row, $col);
2590             push @product_row, $row_times_col;
2591             }
2592             $product->set_row(0, \@product_row);
2593             return $product;
2594             }
2595              
2596             sub matrix_vector_multiply {
2597             my $matrix1 = shift;
2598             my $matrix2 = shift;
2599             my ($nrows1, $ncols1) = ($matrix1->rows(), $matrix1->cols());
2600             my ($nrows2, $ncols2) = ($matrix2->rows(), $matrix2->cols());
2601             die "matrix multiplication called with non-matching matrix arguments"
2602             unless $ncols1 == $nrows2 && $ncols2 == 1;
2603             if ($nrows1 == 1) {
2604             my @row = $matrix1->row(0)->as_list;
2605             my @col = $matrix2->col(0)->as_list;
2606             my $result;
2607             foreach my $j (0..$ncols1-1) {
2608             $result += $row[$j] * $col[$j];
2609             }
2610             return $result;
2611             }
2612             my $product = Math::GSL::Matrix->new($nrows1, 1);
2613             my $col = $matrix2->col(0);
2614             my @product_col;
2615             foreach my $i (0..$nrows1-1) {
2616             my $row = $matrix1->row($i);
2617             my $row_times_col = matrix_vector_multiply($row, $col);
2618             push @product_col, $row_times_col;
2619             }
2620             $product->set_col(0, \@product_col);
2621             return $product;
2622             }
2623              
2624             sub matrix_trace {
2625             my $matrix = shift;
2626             my ($nrows, $ncols) = ($matrix->rows(), $matrix->cols());
2627             die "trace can only be calculated for a square matrix"
2628             unless $ncols == $nrows;
2629             my @elements = $matrix->as_list;
2630             my $trace = 0;
2631             foreach my $i (0..$nrows-1) {
2632             $trace += $elements[$i + $i * $ncols];
2633             }
2634             return $trace;
2635             }
2636              
2637             1;
2638              
2639             =pod
2640             =head1 NAME
2641              
2642             Algorithm::ExpectationMaximization -- A pure-Perl implementation for clustering
2643             numerical multi-dimensional data with the Expectation-Maximization algorithm.
2644              
2645             =head1 SYNOPSIS
2646              
2647             use Algorithm::ExpectationMaximization;
2648              
2649             # First name the data file:
2650              
2651             my $datafile = "mydatafile1.dat";
2652              
2653             # Next, set the mask to indicate which columns of the datafile to use for
2654             # clustering and which column contains a symbolic ID for each data record. For
2655             # example, if the symbolic name is in the first column, you want the second column
2656             # to be ignored, and you want the next three columns to be used for 3D clustering:
2657              
2658             my $mask = "N0111";
2659              
2660             # Now construct an instance of the clusterer. The parameter `K' controls the
2661             # number of clusters. Here is an example call to the constructor for instance
2662             # creation:
2663              
2664             my $clusterer = Algorithm::ExpectationMaximization->new(
2665             datafile => $datafile,
2666             mask => $mask,
2667             K => 3,
2668             max_em_iterations => 300,
2669             seeding => 'random',
2670             terminal_output => 1,
2671             debug => 0,
2672             );
2673            
2674             # Note the choice for `seeding'. The choice `random' means that the clusterer will
2675             # randomly select `K' data points to serve as initial cluster centers. Other
2676             # possible choices for the constructor parameter `seeding' are `kmeans' and
2677             # `manual'. With the `kmeans' option for `seeding', the output of a K-means
2678             # clusterer is used for the cluster seeds and the initial cluster covariances. If
2679             # you use the `manual' option for seeding, you must also specify the data elements
2680             # to use for seeding the clusters.
2681              
2682             # Here is an example of a call to the constructor when we choose the `manual'
2683             # option for seeding the clusters and for specifying the data elements for
2684             # seeding. The data elements are specified by their tag names. In this case,
2685             # these names are `a26', `b53', and `c49':
2686              
2687             my $clusterer = Algorithm::ExpectationMaximization->new(
2688             datafile => $datafile,
2689             mask => $mask,
2690             class_priors => [0.6, 0.2, 0.2],
2691             K => 3,
2692             max_em_iterations => 300,
2693             seeding => 'manual',
2694             seed_tags => ['a26', 'b53', 'c49'],
2695             terminal_output => 1,
2696             debug => 0,
2697             );
2698              
2699             # This example call to the constructor also illustrates how you can inject class
2700             # priors into the clustering process. The class priors are the prior probabilities
2701             # of the class distributions in your dataset. As explained later, injecting class
2702             # priors in the manner shown above makes statistical sense only for the case of
2703             # manual seeding. When you do inject class priors, the order in which the priors
2704             # are expressed must correspond to the manually specified seeds for the clusters.
2705              
2706             # After the invocation of the constructor, the following calls are mandatory
2707             # for reasons that should be obvious from the names of the methods:
2708              
2709             $clusterer->read_data_from_file();
2710             srand(time);
2711             $clusterer->seed_the_clusters();
2712             $clusterer->EM();
2713             $clusterer->run_bayes_classifier();
2714             my $clusters = $clusterer->return_disjoint_clusters();
2715              
2716             # where the call to `EM()' is the invocation of the expectation-maximization
2717             # algorithm. The call to `srand(time)' is to seed the pseudo random number
2718             # generator afresh for each run of the cluster seeding procedure. If you want to
2719             # see repeatable results from one run to another of the algorithm with random
2720             # seeding, you would obviously not invoke `srand(time)'.
2721              
2722             # The call `run_bayes_classifier()' shown above carries out a disjoint clustering
2723             # of all the data points using the naive Bayes' classifier. And the call
2724             # `return_disjoint_clusters()' returns the clusters thus formed to you. Once you
2725             # have obtained access to the clusters in this manner, you can display them in
2726             # your terminal window by
2727              
2728             foreach my $index (0..@$clusters-1) {
2729             print "Cluster $index (Naive Bayes): @{$clusters->[$index]}\n\n"
2730             }
2731              
2732             # If you would like to also see the clusters purely on the basis of the posterior
2733             # class probabilities exceeding a threshold, call
2734              
2735             my $theta1 = 0.2;
2736             my $posterior_prob_clusters =
2737             $clusterer->return_clusters_with_posterior_probs_above_threshold($theta1);
2738              
2739             # where you can obviously set the threshold $theta1 to any value you wish. Note
2740             # that now you may end up with clusters that overlap. You can display them in
2741             # your terminal window in the same manner as shown above for the naive Bayes'
2742             # clusters.
2743              
2744             # You can write the naive Bayes' clusters out to files, one cluster per file, by
2745             # calling
2746              
2747             $clusterer->write_naive_bayes_clusters_to_files();
2748              
2749             # The clusters are placed in files with names like
2750              
2751             naive_bayes_cluster1.dat
2752             naive_bayes_cluster2.dat
2753             ...
2754              
2755             # In the same manner, you can write out the posterior probability based possibly
2756             # overlapping clusters to files by calling:
2757              
2758             $clusterer->write_posterior_prob_clusters_above_threshold_to_files($theta1);
2759              
2760             # where the threshold $theta1 sets the probability threshold for deciding which
2761             # data elements to place in a cluster. These clusters are placed in files with
2762             # names like
2763              
2764             posterior_prob_cluster1.dat
2765             posterior_prob_cluster2.dat
2766             ...
2767              
2768             # CLUSTER VISUALIZATION:
2769              
2770             # You must first set the mask for cluster visualization. This mask tells the
2771             # module which 2D or 3D subspace of the original data space you wish to visualize
2772             # the clusters in:
2773              
2774             my $visualization_mask = "111";
2775             $clusterer->visualize_clusters($visualization_mask);
2776             $clusterer->visualize_distributions($visualization_mask);
2777             $clusterer->plot_hardcopy_clusters($visualization_mask);
2778             $clusterer->plot_hardcopy_distributions($visualization_mask);
2779              
2780             # where the last two invocations are for writing out the PNG plots of the
2781             # visualization displays to disk files. The PNG image of the posterior
2782             # probability distributions is written out to a file named posterior_prob_plot.png
2783             # and the PNG image of the disjoint clusters to a file called cluster_plot.png.
2784              
2785             # SYNTHETIC DATA GENERATION:
2786              
2787             # The module has been provided with a class method for generating multivariate
2788             # data for experimenting with the EM algorithm. The data generation is controlled
2789             # by the contents of a parameter file that is supplied as an argument to the data
2790             # generator method. The priors, the means, and the covariance matrices in the
2791             # parameter file must be according to the syntax shown in the `param1.txt' file in
2792             # the `examples' directory. It is best to edit a copy of this file for your
2793             # synthetic data generation needs.
2794              
2795             my $parameter_file = "param1.txt";
2796             my $out_datafile = "mydatafile1.dat";
2797             Algorithm::ExpectationMaximization->cluster_data_generator(
2798             input_parameter_file => $parameter_file,
2799             output_datafile => $out_datafile,
2800             total_number_of_data_points => $N );
2801              
2802             # where the value of $N is the total number of data points you would like to see
2803             # generated for all of the Gaussians. How this total number is divided up amongst
2804             # the Gaussians is decided by the prior probabilities for the Gaussian components
2805             # as declared in input parameter file. The synthetic data may be visualized in a
2806             # terminal window and the visualization written out as a PNG image to a diskfile
2807             # by
2808              
2809             my $data_visualization_mask = "11";
2810             $clusterer->visualize_data($data_visualization_mask);
2811             $clusterer->plot_hardcopy_data($data_visualization_mask);
2812              
2813              
2814             =head1 CHANGES
2815              
2816             Version 1.2 allows the module to also be used for 1-D data. The visualization code
2817             for 1-D shows the clusters through their histograms.
2818              
2819             Version 1.1 incorporates much cleanup of the documentation associated with the
2820             module. Both the top-level module documentation, especially the Description part,
2821             and the comments embedded in the code were revised for better utilization of the
2822             module. The basic implementation code remains unchanged.
2823              
2824              
2825             =head1 DESCRIPTION
2826              
2827             B is a I module for the
2828             Expectation-Maximization (EM) method of clustering numerical data that lends itself
2829             to modeling as a Gaussian mixture. Since the module is entirely in Perl (in the
2830             sense that it is not a Perl wrapper around a C library that actually does the
2831             clustering), the code in the module can easily be modified to experiment with several
2832             aspects of EM.
2833              
2834             Gaussian Mixture Modeling (GMM) is based on the assumption that the data consists of
2835             C Gaussian components, each characterized by its own mean vector and its own
2836             covariance matrix. Obviously, given observed data for clustering, we do not know
2837             which of the C Gaussian components was responsible for any of the data elements.
2838             GMM also associates a prior probability with each Gaussian component. In general,
2839             these priors will also be unknown. So the problem of clustering consists of
2840             estimating the posterior class probability at each data element and also estimating
2841             the class priors. Once these posterior class probabilities and the priors are
2842             estimated with EM, we can use the naive Bayes' classifier to partition the data into
2843             disjoint clusters. Or, for "soft" clustering, we can find all the data elements that
2844             belong to a Gaussian component on the basis of the posterior class probabilities at
2845             the data elements exceeding a prescribed threshold.
2846              
2847             If you do not mind the fact that it is possible for the EM algorithm to occasionally
2848             get stuck in a local maximum and to, therefore, produce a wrong answer even when you
2849             know the data to be perfectly multimodal Gaussian, EM is probably the most magical
2850             approach to clustering multidimensional data. Consider the case of clustering
2851             three-dimensional data. Each Gaussian cluster in 3D space is characterized by the
2852             following 10 variables: the 6 unique elements of the C<3x3> covariance matrix (which
2853             must be symmetric positive-definite), the 3 unique elements of the mean, and the
2854             prior associated with the Gaussian. Now let's say you expect to see six Gaussians in
2855             your data. What that means is that you would want the values for 59 variables
2856             (remember the unit-summation constraint on the class priors which reduces the overall
2857             number of variables by one) to be estimated by the algorithm that seeks to discover
2858             the clusters in your data. What's amazing is that despite the large number of
2859             variables that are being simultaneously optimized, the chances are that the EM
2860             algorithm will give you a good approximation to the right answer.
2861              
2862             At its core, EM depends on the notion of unobserved data and the averaging of the
2863             log-likelihood of the data actually observed over all admissible probabilities for
2864             the unobserved data. But what is unobserved data? While in some cases where EM is
2865             used, the unobserved data is literally the missing data, in others, it is something
2866             that cannot be seen directly but that nonetheless is relevant to the data actually
2867             observed. For the case of clustering multidimensional numerical data that can be
2868             modeled as a Gaussian mixture, it turns out that the best way to think of the
2869             unobserved data is in terms of a sequence of random variables, one for each observed
2870             data point, whose values dictate the selection of the Gaussian for that data point.
2871             This point is explained in great detail in my on-line tutorial at
2872             L.
2873              
2874             The EM algorithm in our context reduces to an iterative invocation of the following
2875             steps: (1) Given the current guess for the means and the covariances of the different
2876             Gaussians in our mixture model, use Bayes' Rule to update the posterior class
2877             probabilities at each of the data points; (2) Using the updated posterior class
2878             probabilities, first update the class priors; (3) Using the updated class priors,
2879             update the class means and the class covariances; and go back to Step (1). Ideally,
2880             the iterations should terminate when the expected log-likelihood of the observed data
2881             has reached a maximum and does not change with any further iterations. The stopping
2882             rule used in this module is the detection of no change over three consecutive
2883             iterations in the values calculated for the priors.
2884              
2885             This module provides three different choices for seeding the clusters: (1) random,
2886             (2) kmeans, and (3) manual. When random seeding is chosen, the algorithm randomly
2887             selects C data elements as cluster seeds. That is, the data vectors associated
2888             with these seeds are treated as initial guesses for the means of the Gaussian
2889             distributions. The covariances are then set to the values calculated from the entire
2890             dataset with respect to the means corresponding to the seeds. With kmeans seeding, on
2891             the other hand, the means and the covariances are set to whatever values are returned
2892             by the kmeans algorithm. And, when seeding is set to manual, you are allowed to
2893             choose C data elements --- by specifying their tag names --- for the seeds. The
2894             rest of the EM initialization for the manual mode is the same as for the random mode.
2895             The algorithm allows for the initial priors to be specified for the manual mode of
2896             seeding.
2897              
2898             Much of code for the kmeans based seeding of EM was drawn from the
2899             C module by me. The code from that module used here corresponds to
2900             the case when the C option in the C module is set
2901             to C. The C option for KMeans consists of subjecting the data to a
2902             principal components analysis (PCA) to discover the direction of maximum variance in
2903             the data space. The data points are then projected on to this direction and a
2904             histogram constructed from the projections. Centers of the C largest peaks in
2905             this smoothed histogram are used to seed the KMeans based clusterer. As you'd
2906             expect, the output of the KMeans used to seed the EM algorithm.
2907              
2908             This module uses two different criteria to measure the quality of the clustering
2909             achieved. The first is the Minimum Description Length (MDL) proposed originally by
2910             Rissanen (J. Rissanen: "Modeling by Shortest Data Description," Automatica, 1978, and
2911             "A Universal Prior for Integers and Estimation by Minimum Description Length," Annals
2912             of Statistics, 1983.) The MDL criterion is a difference of a log-likelihood term for
2913             all of the observed data and a model-complexity penalty term. In general, both the
2914             log-likelihood and the model-complexity terms increase as the number of clusters
2915             increases. The form of the MDL criterion in this module uses for the penalty term
2916             the Bayesian Information Criterion (BIC) of G. Schwartz, "Estimating the Dimensions
2917             of a Model," The Annals of Statistics, 1978. In general, the smaller the value of
2918             MDL quality measure, the better the clustering of the data.
2919              
2920             For our second measure of clustering quality, we use `trace( SW^-1 . SB)' where SW is
2921             the within-class scatter matrix, more commonly denoted S_w, and SB the between-class
2922             scatter matrix, more commonly denoted S_b (the underscore means subscript). This
2923             measure can be thought of as the normalized average distance between the clusters,
2924             the normalization being provided by average cluster covariance SW^-1. Therefore, the
2925             larger the value of this quality measure, the better the separation between the
2926             clusters. Since this measure has its roots in the Fisher linear discriminant
2927             function, we incorporate the word C in the name of the quality measure.
2928             I When the
2929             clusters exhibit significant overlap, the numbers produced by this quality measure
2930             tend to be generally meaningless.
2931              
2932             =head1 METHODS
2933              
2934             The module provides the following methods for EM based
2935             clustering, for cluster visualization, for data
2936             visualization, and for the generation of data for testing a
2937             clustering algorithm:
2938              
2939             =over
2940              
2941             =item B
2942              
2943             A call to C constructs a new instance of the
2944             C class. A typical form
2945             of this call when you want to use random option for seeding
2946             the algorithm is:
2947              
2948             my $clusterer = Algorithm::ExpectationMaximization->new(
2949             datafile => $datafile,
2950             mask => $mask,
2951             K => 3,
2952             max_em_iterations => 300,
2953             seeding => 'random',
2954             terminal_output => 1,
2955             debug => 0,
2956             );
2957              
2958             where C is the expected number of clusters and
2959             C the maximum number of EM iterations
2960             that you want to allow until convergence is achieved.
2961             Depending on your dataset and on the choice of the initial
2962             seeds, the actual number of iterations used could be as few
2963             as 10 and as many as reaching 300. The output produced by
2964             the algorithm shows the actual number of iterations used to
2965             arrive at convergence.
2966              
2967             The data file supplied through the C option is
2968             expected to contain entries in the following format
2969              
2970             c20 0 10.7087017086940 9.63528386251712 10.9512155258108 ...
2971             c7 0 12.8025925026787 10.6126270065785 10.5228482095349 ...
2972             b9 0 7.60118206283120 5.05889245193079 5.82841781759102 ...
2973             ....
2974             ....
2975              
2976             where the first column contains the symbolic ID tag for each
2977             data record and the rest of the columns the numerical
2978             information. As to which columns are actually used for
2979             clustering is decided by the string value of the mask. For
2980             example, if we wanted to cluster on the basis of the entries
2981             in just the 3rd, the 4th, and the 5th columns above, the
2982             mask value would be C where the character C
2983             indicates that the ID tag is in the first column, the
2984             character C<0> that the second column is to be ignored, and
2985             the C<1>'s that follow that the 3rd, the 4th, and the 5th
2986             columns are to be used for clustering.
2987              
2988             If instead of random seeding, you wish to use the kmeans
2989             based seeding, just replace the option C supplied
2990             for C by C. You can also do manual seeding
2991             by designating a specified set of data elements to serve as
2992             cluster seeds. The call to the constructor in this case
2993             looks like
2994              
2995             my $clusterer = Algorithm::ExpectationMaximization->new(
2996             datafile => $datafile,
2997             mask => $mask,
2998             K => 3,
2999             max_em_iterations => 300,
3000             seeding => 'manual',
3001             seed_tags => ['a26', 'b53', 'c49'],
3002             terminal_output => 1,
3003             debug => 0,
3004             );
3005              
3006             where the option C is set to an anonymous array
3007             of symbolic names associated with the data elements.
3008              
3009             If you know the class priors, you can supply them through an
3010             additional option to the constructor that looks like
3011              
3012             class_priors => [0.6, 0.2, 0.2],
3013              
3014             for the case of C equal to 3. B
3015             be a useful thing to do only for the case of manual
3016             seeding.> If you go for manual seeding, the order in which
3017             the priors are expressed should correspond to the order of
3018             the manually chosen tags supplied through the C
3019             option.
3020              
3021             Note that the parameter C is boolean; when
3022             not supplied in the call to C it defaults to 0. When
3023             set, this parameter displays useful information in the
3024             window of the terminal screen in which you invoke the
3025             algorithm.
3026              
3027             =item B
3028              
3029             $clusterer->read_data_from_file()
3030              
3031             This is a required call after the constructor is invoked. As
3032             you would expect, this call reads in the data for
3033             clustering.
3034              
3035             =item B
3036              
3037             $clusterer->seed_the_clusters();
3038              
3039             This is also a required call. It processes the option you
3040             supplied for C in the constructor call to choose
3041             the data elements for seeding the C clusters.
3042              
3043             =item B
3044              
3045             $clusterer->EM();
3046              
3047             This is the workhorse of the module, as you would expect.
3048             The means, the covariances, and the priors estimated by this
3049             method are stored in instance variables that are subsequently
3050             accessed by other methods for the purpose of displaying the
3051             clusters, the probability distributions, etc.
3052              
3053             =item B
3054              
3055             $clusterer->run_bayes_classifier();
3056              
3057             Using the posterior probability distributions estimated by
3058             the C method, this method partitions the data into the
3059             C disjoint clusters using the naive Bayes' classifier.
3060              
3061             =item B
3062              
3063             my $clusters = $clusterer->return_disjoint_clusters();
3064              
3065             This allows you to access the clusters obtained with the
3066             application of the naive Bayes' classifier in your own
3067             scripts. If, say, you wanted to see the data records placed
3068             in each cluster, you could subsequently invoke the following
3069             loop in your own script:
3070              
3071             foreach my $index (0..@$clusters-1) {
3072             print "Cluster $index (Naive Bayes): @{$clusters->[$index]}\n\n"
3073             }
3074              
3075             where C<$clusters> holds the array reference returned by the
3076             call to C.
3077              
3078             =item B
3079              
3080             $clusterer->write_naive_bayes_clusters_to_files();
3081              
3082             This method writes the clusters obtained by applying the
3083             naive Bayes' classifier to disk files, one cluster per
3084             file. What is written out to each file consists of the
3085             symbolic names of the data records that belong to the
3086             cluster corresponding to that file. The clusters are placed
3087             in files with names like
3088              
3089             naive_bayes_cluster1.dat
3090             naive_bayes_cluster2.dat
3091             ...
3092              
3093             =item B
3094              
3095             my $theta1 = 0.2;
3096             my $posterior_prob_clusters =
3097             $clusterer->return_clusters_with_posterior_probs_above_threshold($theta1);
3098              
3099             This method returns a reference to an array of C
3100             anonymous arrays, each consisting of the symbolic names for
3101             the data records where the posterior class probability
3102             exceeds the threshold as specified by C<$theta1>.
3103             Subsequently, you can access each of these
3104             posterior-probability based clusters through a loop
3105             construct such as
3106              
3107             foreach my $index (0..@$posterior_prob_clusters-1) {
3108             print "Cluster $index (based on posterior probs exceeding $theta1): " .
3109             "@{$posterior_prob_clusters->[$index]}\n\n"
3110             }
3111              
3112             =item B
3113              
3114             $clusterer->write_posterior_prob_clusters_above_threshold_to_files($theta1);
3115              
3116             This call writes out the posterior-probability based soft
3117             clusters to disk files. As in the previous method, the
3118             threshold C<$theta1> sets the probability threshold for
3119             deciding which data elements belong to a cluster. These
3120             clusters are placed in files with names like
3121              
3122             posterior_prob_cluster1.dat
3123             posterior_prob_cluster2.dat
3124             ...
3125              
3126             =item B
3127              
3128             my $theta2 = 0.00001;
3129             my $class_distributions =
3130             $clusterer->return_individual_class_distributions_above_given_threshold($theta2);
3131              
3132             This is the method to call if you wish to see the individual
3133             Gaussians in your own script. The method returns a reference
3134             to an array of anonymous arrays, with each anonymous array
3135             representing data membership in each Gaussian. Only those
3136             data points are included in each Gaussian where the
3137             probability exceeds the threshold C<$theta2>. Note that the
3138             larger the covariance and the higher the data
3139             dimensionality, the smaller this threshold must be for you
3140             to see any of the data points in a Gaussian. After you have
3141             accessed the Gaussian mixture in this manner, you can
3142             display the data membership in each Gaussian through the
3143             following sort of a loop:
3144              
3145             foreach my $index (0..@$class_distributions-1) {
3146             print "Gaussian Distribution $index (only shows data elements " .
3147             "whose probabilities exceed the threshold $theta2: " .
3148             "@{$class_distributions->[$index]}\n\n"
3149             }
3150              
3151             =item B
3152              
3153             my $visualization_mask = "11";
3154             $clusterer->visualize_clusters($visualization_mask);
3155              
3156             The visualization mask here does not have to be identical to
3157             the one used for clustering, but must be a subset of that
3158             mask. This is convenient for visualizing the clusters in
3159             two- or three-dimensional subspaces of the original space.
3160             The subset is specified by placing `0's in the positions
3161             corresponding to the dimensions you do NOT want to see
3162             through visualization. Depending on the mask used, this
3163             method creates a 2D or a 3D scatter plot of the clusters
3164             obtained through the naive Bayes' classification rule.
3165              
3166             =item B
3167              
3168             $clusterer->visualize_distributions($visualization_mask);
3169              
3170             This is the method to call if you want to visualize the soft
3171             clustering corresponding to the posterior class
3172             probabilities exceeding the threshold specified in the call
3173             to
3174             C.
3175             Again, depending on the visualization mask used, you will
3176             see either a 2D plot or a 3D scatter plot.
3177              
3178             =item B
3179              
3180             $clusterer->plot_hardcopy_clusters($visualization_mask);
3181              
3182             This method create a PNG file from the C created
3183             display of the naive Bayes' clusters obtained from the data.
3184             The plotting functionality of C is accessed through
3185             the Perl wrappers provided by the C
3186             module.
3187              
3188             =item B
3189              
3190             $clusterer->plot_hardcopy_distributions($visualization_mask);
3191              
3192             This method create a PNG file from the C created
3193             display of the clusters that correspond to the posterior
3194             class probabilities exceeding a specified threshold. The
3195             plotting functionality of C is accessed through the
3196             Perl wrappers provided by the C module.
3197              
3198             =item B
3199              
3200             $clusterer->display_fisher_quality_vs_iterations();
3201              
3202             This method measures the quality of clustering by
3203             calculating the normalized average squared distance between
3204             the cluster centers, the normalization being provided by the
3205             average cluster covariance. See the Description for further
3206             details. In general, this measure is NOT useful for
3207             overlapping clusters.
3208              
3209             =item B
3210              
3211             $clusterer->display_mdl_quality_vs_iterations();
3212              
3213             At the end of each iteration, this method measures the
3214             quality of clustering my calculating its MDL (Minimum
3215             Description Length). As stated earlier in Description, the
3216             MDL measure is a difference of a log-likelihood term for all
3217             of the observed data and a model-complexity penalty term.
3218             The smaller the value returned by this method, the better
3219             the clustering.
3220              
3221             =item B
3222              
3223             my $estimated_priors = $clusterer->return_estimated_priors();
3224             print "Estimated class priors: @$estimated_priors\n";
3225              
3226             This method can be used to access the final values of the
3227             class priors as estimated by the EM algorithm.
3228              
3229             =item B
3230              
3231             Algorithm::ExpectationMaximization->cluster_data_generator(
3232             input_parameter_file => $parameter_file,
3233             output_datafile => $out_datafile,
3234             total_number_of_data_points => 300
3235             );
3236              
3237             for generating multivariate data for clustering if you wish to play with synthetic
3238             data for experimenting with the EM algorithm. The input parameter file must specify
3239             the priors to be used for the Gaussians, their means, and their covariance matrices.
3240             The format of the information contained in the parameter file must be as shown in the
3241             file C provided in the C directory. It will be easiest for you
3242             to just edit a copy of this file for your data generation needs. In addition to the
3243             format of the parameter file, the main constraint you need to observe in specifying
3244             the parameters is that the dimensionality of the covariance matrices must correspond
3245             to the dimensionality of the mean vectors. The multivariate random numbers are
3246             generated by calling the C module. As you would expect, this module
3247             requires that the covariance matrices you specify in your parameter file be symmetric
3248             and positive definite. Should the covariances in your parameter file not obey this
3249             condition, the C module will let you know.
3250              
3251             =item B
3252              
3253             $clusterer->visualize_data($data_visualization_mask);
3254              
3255             This is the method to call if you want to visualize the data
3256             you plan to cluster with the EM algorithm. You'd need to
3257             specify argument mask in a manner similar to the
3258             visualization of the clusters, as explained earlier.
3259              
3260             =item B
3261              
3262             $clusterer->plot_hardcopy_data($data_visualization_mask);
3263              
3264             This method creates a PNG file that can be used to print out
3265             a hardcopy of the data in different 2D and 3D subspaces of
3266             the data space. The visualization mask is used to select the
3267             subspace for the PNG image.
3268              
3269             =back
3270              
3271             =head1 HOW THE CLUSTERS ARE OUTPUT
3272              
3273             This module produces two different types of clusters: the "hard" clusters and the
3274             "soft" clusters. The hard clusters correspond to the naive Bayes' classification of
3275             the data points on the basis of the Gaussian distributions and the class priors
3276             estimated by the EM algorithm. Such clusters partition the data into disjoint
3277             subsets. On the other hand, the soft clusters correspond to the posterior class
3278             probabilities calculated by the EM algorithm. A data element belongs to a cluster if
3279             its posterior probability for that Gaussian exceeds a threshold.
3280              
3281             After the EM algorithm has finished running, the hard clusters are created by
3282             invoking the method C on an instance of the module and then
3283             made user-accessible by calling C. These clusters may
3284             then be displayed in a terminal window by dereferencing each element of the array
3285             whose reference is returned b C. The hard clusters can
3286             be written out to disk files by invoking C.
3287             This method writes out the clusters to files, one cluster per file. What is written
3288             out to each file consists of the symbolic names of the data records that belong to
3289             the cluster corresponding to that file. The clusters are placed in files with names
3290             like
3291              
3292             naive_bayes_cluster1.dat
3293             naive_bayes_cluster2.dat
3294             ...
3295              
3296             The soft clusters on the other hand are created by calling
3297             C
3298             on an instance of the module, where the argument C<$theta1>
3299             is the threshold for deciding whether a data element belongs
3300             in a soft cluster. The posterior class probability at a
3301             data element must exceed the threshold for the element to
3302             belong to the corresponding cluster. The soft cluster can
3303             be written out to disk files by calling
3304             C.
3305             As with the hard clusters, each cluster is placed in a separate
3306             file. The filenames for such clusters look like:
3307              
3308             posterior_prob_cluster1.dat
3309             posterior_prob_cluster2.dat
3310             ...
3311              
3312             =head1 WHAT IF THE NUMBER OF CLUSTERS IS UNKNOWN?
3313              
3314             The module constructor requires that you supply a value for the parameter C, which
3315             is the number of clusters you expect to see in the data. But what if you do not have
3316             a good value for C? Note that it is possible to search for the best C by using
3317             the two clustering quality criteria included in the module. However, I have
3318             intentionally not yet incorporated that feature in the module because it slows down
3319             the execution of the code --- especially when the dimensionality of the data
3320             increases. However, nothing prevents you from writing a script --- along the lines
3321             of the five "canned_example" scripts in the C directory --- that would use
3322             the two clustering quality metrics for finding the best choice for C for a given
3323             dataset. Obviously, you will now have to incorporate the call to the constructor in
3324             a loop and check the value of the quality measures for each value of C.
3325              
3326             =head1 SOME RESULTS OBTAINED WITH THIS MODULE
3327              
3328             If you would like to see some results that have been obtained with this module, check
3329             out Section 7 of the report
3330             L.
3331              
3332             =head1 THE C DIRECTORY
3333              
3334             Becoming familiar with this directory should be your best
3335             strategy to become comfortable with this module (and its
3336             future versions). You are urged to start by executing the
3337             following five example scripts:
3338              
3339             =over 16
3340              
3341             =item I
3342              
3343             This example applies the EM algorithm to the data contained in the datafile
3344             C. The mixture data in the file corresponds to three overlapping
3345             Gaussian components in a star-shaped pattern. The EM based clustering for this data
3346             is shown in the files C and
3347             C, the former displaying the hard clusters
3348             obtained by using the naive Bayes' classifier and the latter showing the soft
3349             clusters obtained on the basis of the posterior class probabilities at the data
3350             points.
3351              
3352             =item I
3353              
3354             The datafile used in this example is C. This mixture data
3355             corresponds to two well-separated relatively isotropic Gaussians. EM based clustering for this
3356             data is shown in the files C and
3357             C, the former displaying the hard clusters
3358             obtained by using the naive Bayes' classifier and the latter showing the soft
3359             clusters obtained by using the posterior class probabilities at the data points.
3360              
3361             =item I
3362              
3363             Like the first example, this example again involves three Gaussians, but now their
3364             means are not co-located. Additionally, we now seed the clusters manually by
3365             specifying three selected data points as the initial guesses for the cluster means.
3366             The datafile used for this example is C. The EM based clustering
3367             for this data is shown in the files C and
3368             C, the former displaying the hard clusters
3369             obtained by using the naive Bayes' classifier and the latter showing the soft
3370             clusters obtained on the basis of the posterior class probabilities at the data
3371             points.
3372              
3373             =item I
3374              
3375             Whereas the three previous examples demonstrated EM based clustering of 2D data, we
3376             now present an example of clustering in 3D. The datafile used in this example is
3377             C. This mixture data corresponds to three well-separated but highly
3378             anisotropic Gaussians. The EM derived clustering for this data is shown in the files
3379             C and C, the
3380             former displaying the hard clusters obtained by using the naive Bayes' classifier and
3381             the latter showing the soft clusters obtained on the basis of the posterior class
3382             probabilities at the data points.
3383              
3384             =item I
3385              
3386             We again demonstrate clustering in 3D but now we have one Gaussian cluster that
3387             "cuts" through the other two Gaussian clusters. The datafile used in this example is
3388             C. The three Gaussians in this case are highly overlapping and
3389             highly anisotropic. The EM derived clustering for this data is shown in the files
3390             C and C, the
3391             former displaying the hard clusters obtained by using the naive Bayes' classifier and
3392             the latter showing the soft clusters obtained through the posterior class
3393             probabilities at the data points.
3394              
3395             =item I
3396              
3397             This example, added in Version 1.2, demonstrates the use of this module for 1-D data.
3398             In order to visualize the clusters for the 1-D case, we show them through their
3399             respective histograms. The datafile used in this example is C. The
3400             data consists of two overlapping Gaussians. The EM derived clustering for this data
3401             is shown in the files C and
3402             C, the former displaying the hard clusters
3403             obtained by using the naive Bayes' classifier and the latter showing the soft
3404             clusters obtained through the posterior class probabilities at the data points.
3405              
3406             =back
3407              
3408             Going through the six examples listed above will make you familiar with how to make
3409             the calls to the clustering and the visualization methods. The C directory
3410             also includes several parameter files with names like
3411              
3412             param1.txt
3413             param2.txt
3414             param3.txt
3415             ...
3416              
3417             These were used to generate the synthetic data for which the results are shown in the
3418             C directory. Just make a copy of one of these files and edit it if you
3419             would like to generate your own multivariate data for clustering. Note that you can
3420             generate data with any dimensionality through appropriate entries in the parameter
3421             file.
3422              
3423             =head1 CAVEATS
3424              
3425             When you run the scripts in the C directory, your results will NOT always
3426             look like what I have shown in the PNG image files in the directory. As mentioned
3427             earlier in Description, the EM algorithm starting from randomly chosen initial
3428             guesses for the cluster means can get stuck in a local maximum.
3429              
3430             That raises an interesting question of how one judges the correctness of clustering
3431             results when dealing with real experimental data. For real data, the best approach
3432             is to try the EM algorithm multiple times with all of the seeding options included in
3433             this module. It would be safe to say that, at least in low dimensional spaces and
3434             with sufficient data, a majority of your runs should yield "correct" results.
3435              
3436             Also bear in mind that a pure Perl implementation is not meant for the clustering of
3437             very large data files. It is really designed more for researching issues related to
3438             EM based approaches to clustering.
3439              
3440              
3441             =head1 REQUIRED
3442              
3443             This module requires the following three modules:
3444              
3445             Math::Random
3446             Graphics::GnuplotIF
3447             Math::GSL::Matrix
3448              
3449             the first for generating the multivariate random numbers, the second for the
3450             visualization of the clusters, and the last for access to the Perl wrappers for the
3451             GNU Scientific Library. The C module of this library is used for various
3452             algebraic operations on the covariance matrices of the Gaussians.
3453              
3454             =head1 EXPORT
3455              
3456             None by design.
3457              
3458              
3459             =head1 BUGS
3460              
3461             Please notify the author if you encounter any bugs. When sending email, please place
3462             the string 'Algorithm EM' in the subject line.
3463              
3464             =head1 INSTALLATION
3465              
3466             The usual
3467              
3468             perl Makefile.PL
3469             make
3470             make test
3471             make install
3472              
3473             if you have root access. If not,
3474              
3475             perl Makefile.PL prefix=/some/other/directory/
3476             make
3477             make test
3478             make install
3479              
3480             =head1 ACKNOWLEDGMENTS
3481              
3482             Version 1.2 is a result of the feedback received from Paul
3483             May of University of Birmingham. Thanks, Paul!
3484              
3485             =head1 AUTHOR
3486              
3487             Avinash Kak, kak@purdue.edu
3488              
3489             If you send email, please place the string "EM Algorithm" in your
3490             subject line to get past my spam filter.
3491              
3492             =head1 COPYRIGHT
3493              
3494             This library is free software; you can redistribute it and/or modify it under the
3495             same terms as Perl itself.
3496              
3497             Copyright 2013 Avinash Kak
3498              
3499             =cut
3500