File Coverage

blib/lib/Algorithm/ExpectationMaximization.pm
Criterion Covered Total %
statement 22 24 91.6
branch n/a
condition n/a
subroutine 8 8 100.0
pod n/a
total 30 32 93.7


line stmt bran cond sub pod time code
1             package Algorithm::ExpectationMaximization;
2              
3             #---------------------------------------------------------------------------
4             # Copyright (c) 2014 Avinash Kak. All rights reserved. This program is free
5             # software. You may modify and/or distribute it under the same terms as Perl itself.
6             # This copyright notice must remain attached to the file.
7             #
8             # Algorithm::ExpectationMaximization is a pure Perl implementation for
9             # Expectation-Maximization based clustering of multi-dimensional data that can be
10             # modeled as a Gaussian mixture.
11             # ---------------------------------------------------------------------------
12              
13 1     1   24009 use 5.10.0;
  1         4  
  1         64  
14 1     1   6 use strict;
  1         2  
  1         157  
15 1     1   7 use warnings;
  1         87  
  1         59  
16 1     1   7 use Carp;
  1         2  
  1         119  
17 1     1   7 use File::Basename;
  1         1  
  1         106  
18 1     1   911 use Math::Random;
  1         9382  
  1         145  
19 1     1   808 use Graphics::GnuplotIF;
  1         15118  
  1         74  
20 1     1   1351 use Math::GSL::Matrix;
  0            
  0            
21             use Scalar::Util 'blessed';
22              
23             our $VERSION = '1.22';
24              
25             # from perl docs:
26             my $_num_regex = '^[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?$';
27              
28             # Constructor:
29             sub new {
30             my ($class, %args) = @_;
31             my @params = keys %args;
32             croak "\nYou have used a wrong name for a keyword argument " .
33             "--- perhaps a misspelling\n"
34             if check_for_illegal_params(@params) == 0;
35             bless {
36             _datafile => $args{datafile} || croak("datafile required"),
37             _mask => $args{mask} || croak("mask required"),
38             _K => $args{K} || croak("number of clusters required"),
39             _terminal_output => $args{terminal_output} || 0,
40             _seeding => $args{seeding} || 'random',
41             _seed_tags => $args{seed_tags} || [],
42             _max_em_iterations=> $args{max_em_iterations} || 100,
43             _class_priors => $args{class_priors} || [],
44             _debug => $args{debug} || 0,
45             _N => 0,
46             _data => {},
47             _data_id_tags => [],
48             _clusters => [],
49             _cluster_centers => [],
50             _data_dimensions => 0,
51             _cluster_normalizers => [],
52             _cluster_means => [],
53             _cluster_covariances => [],
54             _class_labels_for_data => {},
55             _class_probs_at_each_data_point => {},
56             _expected_class_probs => {},
57             _old_priors => [],
58             _old_old_priors => [],
59             _fisher_quality_vs_iteration => [],
60             _mdl_quality_vs_iterations => [],
61             }, $class;
62             }
63              
64              
65             sub read_data_from_file {
66             my $self = shift;
67             my $filename = $self->{_datafile};
68             $self->read_data_from_file_csv() if $filename =~ /.csv$/;
69             $self->read_data_from_file_dat() if $filename =~ /.dat$/;
70             }
71              
72             sub read_data_from_file_csv {
73             my $self = shift;
74             my $numregex = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
75             my $filename = $self->{_datafile} || die "you did not specify a file with the data to be clustered";
76             my $mask = $self->{_mask};
77             my @mask = split //, $mask;
78             $self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
79             print "data dimensionality: $self->{_data_dimensions} \n"if $self->{_terminal_output};
80             open FILEIN, $filename or die "Unable to open $filename: $!";
81             die("Aborted. get_training_data_csv() is only for CSV files") unless $filename =~ /\.csv$/;
82             local $/ = undef;
83             my @all_data = split /\s+/, ;
84             my %data_hash = ();
85             my @data_tags = ();
86             foreach my $record (@all_data) {
87             my @splits = split /,/, $record;
88             die "\nYour mask size (including `N' and 1's and 0's) does not match\n" .
89             "the size of at least one of the data records in the file.\n"
90             unless scalar(@mask) == scalar(@splits);
91             my $record_name = shift @splits;
92             $data_hash{$record_name} = \@splits;
93             push @data_tags, $record_name;
94             }
95             $self->{_data} = \%data_hash;
96             $self->{_data_id_tags} = \@data_tags;
97             $self->{_N} = scalar @data_tags;
98             # Need to make the following call to set the global mean and covariance:
99             # my $covariance = $self->estimate_mean_and_covariance(\@data_tags);
100             # Need to make the following call to set the global eigenvec eigenval sets:
101             # $self->eigen_analysis_of_covariance($covariance);
102             if ( defined($self->{_K}) && ($self->{_K} > 0) ) {
103             carp "\n\nWARNING: YOUR K VALUE IS TOO LARGE.\n The number of data " .
104             "points must satisfy the relation N > 2xK**2 where K is " .
105             "the number of clusters requested for the clusters to be " .
106             "meaningful $!"
107             if ( $self->{_N} < (2 * $self->{_K} ** 2) );
108             print "\n\n\n";
109             }
110             }
111              
112             sub read_data_from_file_dat {
113             my $self = shift;
114             my $datafile = $self->{_datafile};
115             my $mask = $self->{_mask};
116             my @mask = split //, $mask;
117             $self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
118             print "data dimensionality: $self->{_data_dimensions} \n"
119             if $self->{_terminal_output};
120             open INPUT, $datafile
121             or die "unable to open file $datafile: $!";
122             chomp( my @raw_data = );
123             close INPUT;
124             # Transform strings into number data
125             foreach my $record (@raw_data) {
126             next unless $record;
127             next if $record =~ /^#/;
128             my @data_fields;
129             my @fields = split /\s+/, $record;
130             die "\nABORTED: Mask size does not correspond to row record size"
131             if $#fields != $#mask;
132             my $record_id;
133             foreach my $i (0..@fields-1) {
134             if ($mask[$i] eq '0') {
135             next;
136             } elsif ($mask[$i] eq 'N') {
137             $record_id = $fields[$i];
138             } elsif ($mask[$i] eq '1') {
139             push @data_fields, $fields[$i];
140             } else {
141             die "misformed mask for reading the data file";
142             }
143             }
144             my @nums = map {/$_num_regex/;$_} @data_fields;
145             $self->{_data}->{ $record_id } = \@nums;
146             }
147             my @all_data_ids = keys %{$self->{_data}};
148             $self->{_data_id_tags} = \@all_data_ids;
149             $self->{_N} = scalar @all_data_ids;
150             if ( defined($self->{_K}) && ($self->{_K} > 0) ) {
151             carp "\n\nWARNING: YOUR K VALUE IS TOO LARGE.\n The number of data " .
152             "points must satisfy the relation N > 2xK**2 where K is " .
153             "the number of clusters requested for the clusters to be " .
154             "meaningful $!"
155             if ( $self->{_N} < (2 * $self->{_K} ** 2) );
156             }
157             }
158              
159              
160             # This is the heart of the module --- in the sense that this method implements the EM
161             # algorithm for the estimating the parameters of a Gaussian mixture model for the
162             # data. In the implementation shown below, we declare convergence for the EM
163             # algorithm when the change in the class priors over three iterations falls below a
164             # threshold. The current value of this threshold, as can be seen in the function
165             # compare_array_floats(), is 0.00001.
166             sub EM {
167             my $self = shift;
168             $self->initialize_class_priors();
169             for (my $em_iteration=0; $em_iteration < $self->{_max_em_iterations};
170             $em_iteration++) {
171             if ($em_iteration == 0) {
172             print "\nSeeding the EM algorithm with:\n";
173             $self->display_seeding_stats();
174             print "\nFinished displaying the seeding information\n";
175             print "\nWill print out a dot for each iteration of EM:\n\n";
176             }
177             my $progress_indicator = $em_iteration % 5 == 0 ? $em_iteration : ".";
178             print $progress_indicator;
179             foreach my $data_id (@{$self->{_data_id_tags}}) {
180             $self->{_class_probs_at_each_data_point}->{$data_id} = [];
181             $self->{_expected_class_probs}->{$data_id} = [];
182             }
183             # Calculate prob(x | C_i) --- this is the prob of data point x as
184             # a member of class C_i. You must do this for all K classes:
185             foreach my $cluster_index(0..$self->{_K}-1) {
186             $self->find_prob_at_each_datapoint_for_given_mean_and_covar(
187             $self->{_cluster_means}->[$cluster_index],
188             $self->{_cluster_covariances}->[$cluster_index] );
189             }
190             $self->{_cluster_normalizers} = [];
191             if ($self->{_debug}) {
192             print "\n\nDisplaying prob of a data point vis-a-vis each class:\n\n";
193             foreach my $data_id (sort keys %{$self->{_data}}) {
194             my $class_probs_at_a_point =
195             $self->{_class_probs_at_each_data_point}->{$data_id};
196             print "Class probs for $data_id: @$class_probs_at_a_point\n"
197             }
198             }
199             # Calculate prob(C_i | x) which is the posterior prob of class
200             # considered as a r.v. to be C_i at a given point x. For a given
201             # x, the sum of such probabilities over all C_i must add up to 1:
202             $self->find_expected_classes_at_each_datapoint();
203             if ($self->{_debug}) {
204             print "\n\nDisplaying expected class probs at each data point:\n\n";
205             foreach my $data_id (sort keys %{$self->{_expected_class_probs}}) {
206             my $expected_classes_at_a_point =
207             $self->{_expected_class_probs}->{$data_id};
208             print "Expected classes $data_id: @$expected_classes_at_a_point\n";
209             }
210             }
211             # 1. UPDATE MEANS:
212             my @new_means;
213             foreach my $cluster_index(0..$self->{_K}-1) {
214             $new_means[$cluster_index] =
215             Math::GSL::Matrix->new($self->{_data_dimensions},1);
216             $new_means[$cluster_index]->zero();
217             foreach my $data_id (keys %{$self->{_data}}) {
218             my $data_record = $self->{_data}->{$data_id};
219             my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
220             $data_vec->set_col(0,$data_record);
221             $new_means[$cluster_index] +=
222             $self->{_expected_class_probs}->{$data_id}->[$cluster_index] *
223             $data_vec->copy();
224             $self->{_cluster_normalizers}->[$cluster_index] +=
225             $self->{_expected_class_probs}->{$data_id}->[$cluster_index];
226             }
227             $new_means[$cluster_index] *= 1.0 /
228             $self->{_cluster_normalizers}->[$cluster_index];
229             }
230             if ($self->{_debug}) {
231             foreach my $meanvec (@new_means) {
232             display_matrix("At EM Iteration $em_iteration, new mean vector is",
233             $meanvec);
234             }
235             }
236             $self->{_cluster_means} = \@new_means;
237             # 2. UPDATE COVARIANCES:
238             my @new_covariances;
239             foreach my $cluster_index(0..$self->{_K}-1) {
240             $new_covariances[$cluster_index] =
241             Math::GSL::Matrix->new($self->{_data_dimensions},
242             $self->{_data_dimensions});
243             $new_covariances[$cluster_index]->zero();
244             my $normalizer = 0;
245             foreach my $data_id (keys %{$self->{_data}}) {
246             my $data_record = $self->{_data}->{$data_id};
247             my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
248             $data_vec->set_col(0,$data_record);
249             my $mean_subtracted_data =
250             $data_vec - $self->{_cluster_means}->[$cluster_index];
251             my $outer_product = outer_product($mean_subtracted_data,
252             $mean_subtracted_data);
253             $new_covariances[$cluster_index] +=
254             $self->{_expected_class_probs}->{$data_id}->[$cluster_index] *
255             $outer_product;
256             }
257             $new_covariances[$cluster_index] *=
258             1.0 /
259             $self->{_cluster_normalizers}->[$cluster_index];
260             }
261             $self->{_cluster_covariances} = \@new_covariances;
262             # 3. UPDATE PRIORS:
263             $self->{_old_old_priors} = deep_copy_array( $self->{_old_priors} )
264             if @{$self->{_old_priors}} > 0;
265             $self->{_old_priors} = deep_copy_array( $self->{_class_priors} );
266             foreach my $cluster_index(0..$self->{_K}-1) {
267             $self->{_class_priors}->[$cluster_index] =
268             $self->{_cluster_normalizers}->[$cluster_index] / $self->{_N};
269             }
270             my @priors = @{$self->{_class_priors}};
271             print "\nUpdated priors: @priors\n\n\n" if $self->{_debug};
272             push @{$self->{_fisher_quality_vs_iteration}},
273             $self->clustering_quality_fisher();
274             push @{$self->{_mdl_quality_vs_iteration}}, $self->clustering_quality_mdl();
275             if ( ($em_iteration > 5 && $self->reached_convergence())
276             || ($em_iteration == $self->{_max_em_iterations} - 1) ) {
277             my @old_old_priors = @{$self->{_old_old_priors}};
278             my @old_priors = @{$self->{_old_priors}};
279             print "\n\nPrevious to previous priors: @old_old_priors\n";
280             print "Previous priors: @old_priors\n";
281             print "Current class priors: @{$self->{_class_priors}}\n";
282             print "\n\nCONVERGENCE ACHIEVED AT ITERATION $em_iteration\n\n"
283             if $em_iteration < $self->{_max_em_iterations} - 1;
284             last;
285             }
286             }
287             print "\n\n\n";
288             }
289              
290             sub reached_convergence {
291             my $self = shift;
292             return 1 if compare_array_floats($self->{_old_old_priors},
293             $self->{_old_priors})
294             &&
295             compare_array_floats($self->{_old_priors},
296             $self->{_class_priors});
297             return 0;
298             }
299              
300             # Classify the data into disjoint clusters using the Naive Bayes' classification:
301             sub run_bayes_classifier {
302             my $self = shift;
303             $self->classify_all_data_tuples_bayes($self->{_cluster_means},
304             $self->{_cluster_covariances});
305             }
306              
307             # Should NOT be called before run_bayes_classifier is run
308             sub return_disjoint_clusters {
309             my $self = shift;
310             return $self->{_clusters};
311             }
312              
313             sub return_clusters_with_posterior_probs_above_threshold {
314             my $self = shift;
315             my $theta = shift;
316             my @class_distributions;
317             foreach my $cluster_index (0..$self->{_K}-1) {
318             push @class_distributions, [];
319             }
320             foreach my $data_tag (@{$self->{_data_id_tags}}) {
321             foreach my $cluster_index (0..$self->{_K}-1) {
322             push @{$class_distributions[$cluster_index]}, $data_tag
323             if $self->{_expected_class_probs}->{$data_tag}->[$cluster_index]
324             > $theta;
325             }
326             }
327             return \@class_distributions;
328             }
329              
330             sub return_individual_class_distributions_above_given_threshold {
331             my $self = shift;
332             my $theta = shift;
333             my @probability_distributions;
334             foreach my $cluster_index (0..$self->{_K}-1) {
335             push @probability_distributions, [];
336             }
337             foreach my $cluster_index (0..$self->{_K}-1) {
338             my $mean_vec = $self->{_cluster_means}->[$cluster_index];
339             my $covar = $self->{_cluster_covariances}->[$cluster_index];
340             foreach my $data_id (keys %{$self->{_data}}) {
341             my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
342             $data_vec->set_col( 0, $self->{_data}->{$data_id});
343             my $datavec_minus_mean = $data_vec - $mean_vec;
344             display_matrix( "datavec minus mean is ", $datavec_minus_mean )
345             if $self->{_debug};
346             my $exponent = undef;
347             if ($self->{_data_dimensions} > 1) {
348             $exponent = -0.5 * vector_matrix_multiply(
349             transpose($datavec_minus_mean),
350             matrix_vector_multiply($covar->inverse(), $datavec_minus_mean ) );
351             } else {
352             my @var_inverse = $covar->inverse()->as_list;
353             my $var_inverse_val = $var_inverse[0];
354             my @data_minus_mean = $datavec_minus_mean->as_list;
355             my $data_minus_mean_val = $data_minus_mean[0];
356             $exponent = -0.5 * ($data_minus_mean_val ** 2) * $var_inverse_val;
357             }
358             print "\nThe value of the exponent is: $exponent\n\n" if $self->{_debug};
359             my $coefficient = 1.0 / \
360             ( (2 * $Math::GSL::Const::M_PI)**$self->{_data_dimensions} * sqrt($covar->det()) );
361             my $prob = $coefficient * exp($exponent);
362             push @{$probability_distributions[$cluster_index]}, $data_id
363             if $prob > $theta;
364             }
365             }
366             return \@probability_distributions;
367             }
368              
369             sub return_estimated_priors {
370             my $self = shift;
371             return $self->{_class_priors};
372             }
373              
374             # Calculates the MDL (Minimum Description Length) clustering criterion according to
375             # Rissanen. (J. Rissanen: "Modeling by Shortest Data Description," Automatica, 1978,
376             # and "A Universal Prior for Integers and Estimation by Minimum Description Length,"
377             # Annals of Statistics, 1983.) The MDL criterion is a difference of a log-likelihood
378             # term for all of the observed data and a model-complexity penalty term. In general,
379             # both the log-likelihood and the model-complexity terms increase as the number of
380             # clusters increases. The form of the MDL criterion used in the implementation below
381             # uses for the penalty term the Bayesian Information Criterion (BIC) of G. Schwartz,
382             # "Estimating the Dimensions of a Model," The Annals of Statistics, 1978. In
383             # general, the smaller the value of the MDL quality measure calculated below, the
384             # better the clustering of the data.
385             sub clustering_quality_mdl {
386             my $self = shift;
387             # Calculate the inverses of all of the covariance matrices in order to avoid
388             # having to calculate them repeatedly inside the inner 'foreach' loop in the
389             # main part of this method. Here we go:
390             my @covar_inverses;
391             foreach my $cluster_index (0..$self->{_K}-1) {
392             my $covar = $self->{_cluster_covariances}->[$cluster_index];
393             push @covar_inverses, $covar->inverse();
394             }
395             # For the clustering quality, first calculate the log-likelihood of all the
396             # observed data:
397             my $log_likelihood = 0;
398             foreach my $tag (@{$self->{_data_id_tags}}) {
399             my $likelihood_for_each_tag = 0;
400             foreach my $cluster_index (0..$self->{_K}-1) {
401             my $mean_vec = $self->{_cluster_means}->[$cluster_index];
402             my $covar = $self->{_cluster_covariances}->[$cluster_index];
403             my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
404             $data_vec->set_col( 0, $self->{_data}->{$tag});
405             my $datavec_minus_mean = $data_vec - $mean_vec;
406             my $exponent = undef;
407             if ($self->{_data_dimensions} > 1) {
408             $exponent = -0.5 * vector_matrix_multiply(
409             transpose($datavec_minus_mean),
410             matrix_vector_multiply($covar_inverses[$cluster_index],
411             $datavec_minus_mean ) );
412             } else {
413             my @var_inverse = $covar_inverses[$cluster_index]->as_list;
414             my $var_inverse_val = $var_inverse[0];
415             my @data_minus_mean = $datavec_minus_mean->as_list;
416             my $data_minus_mean_val = $data_minus_mean[0];
417             $exponent = -0.5 * ($data_minus_mean_val ** 2) * $var_inverse_val;
418             }
419             next if $covar->det() < 0;
420             my $coefficient = 1.0 /
421             ( (2 * $Math::GSL::Const::M_PI)**$self->{_data_dimensions}
422             * sqrt($covar->det()) );
423             my $prob = $coefficient * exp($exponent);
424             $likelihood_for_each_tag +=
425             $prob * $self->{_class_priors}->[$cluster_index];
426             }
427             $log_likelihood += log( $likelihood_for_each_tag );
428             }
429             # Now calculate the model complexity penalty. $L is the total number of
430             # parameters it takes to specify a mixture of K Gaussians. If d is the
431             # dimensionality of the data space, the covariance matrix of each Gaussian takes
432             # (d**2 -d)/2 + d = d(d+1)/2 parameters since this matrix must be symmetric. And
433             # then you need d mean value parameters, and one prior probability parameter
434             # for the Gaussian. So $L = K[1 + d + d(d+1)/2] - 1 where the final '1' that
435             # is subtracted is to account for the normalization on the class priors.
436             my $L = (0.5 * $self->{_K} *
437             ($self->{_data_dimensions}**2 + 3*$self->{_data_dimensions} + 2) ) - 1;
438             my $model_complexity_penalty = 0.5 * $L * log( $self->{_N} );
439             my $mdl_criterion = -1 * $log_likelihood + $model_complexity_penalty;
440             return $mdl_criterion;
441             }
442              
443             # For our second measure of clustering quality, we use `trace( SW^-1 . SB)' where SW
444             # is the within-class scatter matrix, more commonly denoted S_w, and SB the
445             # between-class scatter matrix, more commonly denoted S_b (the underscore means
446             # subscript). This measure can be thought of as the normalized average distance
447             # between the clusters, the normalization being provided by average cluster
448             # covariance SW^-1. Therefore, the larger the value of this quality measure, the
449             # better the separation between the clusters. Since this measure has its roots in
450             # the Fisher linear discriminant function, we incorporate the word 'fisher' in the
451             # name of the quality measure. Note that this measure is good only when the clusters
452             # are disjoint. When the clusters exhibit significant overlap, the numbers produced
453             # by this quality measure tend to be generally meaningless. As an extreme case,
454             # let's say your data was produced by a set of Gaussians, all with the same mean
455             # vector, but each with a distinct covariance. For this extreme case, this measure
456             # will produce a value close to zero --- depending on the accuracy with which the
457             # means are estimated --- even when your clusterer is doing a good job of identifying
458             # the individual clusters.
459             sub clustering_quality_fisher {
460             my $self = shift;
461             my @cluster_quality_indices;
462             my $fisher_trace = 0;
463             my $S_w =
464             Math::GSL::Matrix->new($self->{_data_dimensions}, $self->{_data_dimensions});
465             $S_w->zero;
466             my $S_b =
467             Math::GSL::Matrix->new($self->{_data_dimensions}, $self->{_data_dimensions});
468             $S_b->zero;
469             my $global_mean = Math::GSL::Matrix->new($self->{_data_dimensions},1);
470             $global_mean->zero;
471             foreach my $cluster_index(0..$self->{_K}-1) {
472             $global_mean = $self->{_class_priors}->[$cluster_index] *
473             $self->{_cluster_means}->[$cluster_index];
474             }
475             foreach my $cluster_index(0..$self->{_K}-1) {
476             $S_w += $self->{_cluster_covariances}->[$cluster_index] *
477             $self->{_class_priors}->[$cluster_index];
478             my $class_mean_minus_global_mean = $self->{_cluster_means}->[$cluster_index]
479             - $global_mean;
480             my $outer_product = outer_product( $class_mean_minus_global_mean,
481             $class_mean_minus_global_mean );
482             $S_b += $self->{_class_priors}->[$cluster_index] * $outer_product;
483             }
484             my $fisher = matrix_multiply($S_w->inverse, $S_b);
485             return $fisher unless defined blessed($fisher);
486             return matrix_trace($fisher);
487             }
488              
489             sub display_seeding_stats {
490             my $self = shift;
491             foreach my $cluster_index(0..$self->{_K}-1) {
492             print "\nSeeding for cluster $cluster_index:\n";
493             my $mean = $self->{_cluster_means}->[$cluster_index];
494             display_matrix("The mean is: ", $mean);
495             my $covariance = $self->{_cluster_covariances}->[$cluster_index];
496             display_matrix("The covariance is: ", $covariance);
497             }
498             }
499              
500             sub display_fisher_quality_vs_iterations {
501             my $self = shift;
502             print "\n\nFisher Quality vs. Iterations: " .
503             "@{$self->{_fisher_quality_vs_iteration}}\n\n";
504             }
505              
506             sub display_mdl_quality_vs_iterations {
507             my $self = shift;
508             print "\n\nMDL Quality vs. Iterations: @{$self->{_mdl_quality_vs_iteration}}\n\n";
509             }
510              
511             sub find_prob_at_each_datapoint_for_given_mean_and_covar {
512             my $self = shift;
513             my $mean_vec_ref = shift;
514             my $covar_ref = shift;
515             foreach my $data_id (keys %{$self->{_data}}) {
516             my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
517             $data_vec->set_col( 0, $self->{_data}->{$data_id});
518             if ($self->{_debug}) {
519             display_matrix("data vec in find prob function", $data_vec);
520             display_matrix("mean vec in find prob function", $mean_vec_ref);
521             display_matrix("covariance in find prob function", $covar_ref);
522             }
523             my $datavec_minus_mean = $data_vec - $mean_vec_ref;
524             display_matrix( "datavec minus mean is ", $datavec_minus_mean ) if $self->{_debug};
525             my $exponent = undef;
526             if ($self->{_data_dimensions} > 1) {
527             $exponent = -0.5 * vector_matrix_multiply( transpose($datavec_minus_mean),
528             matrix_vector_multiply( $covar_ref->inverse(), $datavec_minus_mean ) );
529             } elsif (defined blessed($covar_ref)) {
530             my @data_minus_mean = $datavec_minus_mean->as_list;
531             my $data_minus_mean_val = $data_minus_mean[0];
532             my @covar_as_matrix = $covar_ref->as_list;
533             my $covar_val = $covar_as_matrix[0];
534             $exponent = -0.5 * ($data_minus_mean_val ** 2) / $covar_val;
535             } else {
536             my @data_minus_mean = $datavec_minus_mean->as_list;
537             my $data_minus_mean_val = $data_minus_mean[0];
538             $exponent = -0.5 * ($data_minus_mean_val ** 2) / $covar_ref;
539             }
540             print "\nThe value of the exponent is: $exponent\n\n" if $self->{_debug};
541             my $coefficient = undef;
542             if ($self->{_data_dimensions} > 1) {
543             $coefficient = 1.0 / sqrt( ((2 * $Math::GSL::Const::M_PI) ** $self->{_data_dimensions}) *
544             $covar_ref->det()) ;
545             } elsif (!defined blessed($covar_ref)) {
546             $coefficient = 1.0 / sqrt(2 * $covar_ref * $Math::GSL::Const::M_PI);
547             } else {
548             my @covar_as_matrix = $covar_ref->as_list;
549             my $covar_val = $covar_as_matrix[0];
550             $coefficient = 1.0 / sqrt(2 * $covar_val * $Math::GSL::Const::M_PI);
551             }
552             my $prob = $coefficient * exp($exponent);
553             push @{$self->{_class_probs_at_each_data_point}->{$data_id}}, $prob;
554             }
555             }
556              
557             sub find_expected_classes_at_each_datapoint {
558             my $self = shift;
559             my @priors = @{$self->{_class_priors}};
560             foreach my $data_id (sort keys %{$self->{_class_probs_at_each_data_point}}) {
561             my $numerator =
562             vector_2_vector_multiply(
563             $self->{_class_probs_at_each_data_point}->{$data_id},
564             $self->{_class_priors} );
565             my $sum = 0;
566             foreach my $part (@$numerator) {
567             $sum += $part;
568             }
569             $self->{_expected_class_probs}->{$data_id} = [map $_/$sum, @{$numerator}];
570             }
571             }
572              
573             sub initialize_class_priors {
574             my $self = shift;
575             if (@{$self->{_class_priors}} == 0) {
576             my $prior = 1.0 / $self->{_K};
577             foreach my $class_index (0..$self->{_K}-1) {
578             push @{$self->{_class_priors}}, $prior;
579             }
580             }
581             die "Mismatch between number of values for class priors " .
582             "and the number of clusters expected"
583             unless @{$self->{_class_priors}} == $self->{_K};
584             my $sum = 0;
585             foreach my $prior (@{$self->{_class_priors}}) {
586             $sum += $prior;
587             }
588             die "Your priors in the constructor call do not add up to 1"
589             unless abs($sum - 1) < 0.001;
590             print "\nInitially assumed class priors are: @{$self->{_class_priors}}\n";
591             }
592              
593             sub estimate_class_priors {
594             my $self = shift;
595             foreach my $datatag (keys %{$self->{_data}}) {
596             my $class_label = $self->{_class_labels}->{$datatag};
597             $self->{_class_priors}[$class_label]++;
598             }
599             foreach my $prior (@{$self->{_class_priors}}) {
600             $prior /= $self->{_total_number_data_tuples};
601             }
602             foreach my $prior (@{$self->{_class_priors}}) {
603             print "class priors: @{$self->{_class_priors}}\n";
604             }
605             }
606              
607             sub classify_all_data_tuples_bayes {
608             my $self = shift;
609             my $mean_vecs_ref = shift;
610             my $covariances_ref = shift;
611             my @new_clusters;
612             foreach my $index (0..$self->{_K}-1) {
613             push @new_clusters, [];
614             }
615             foreach my $data_id (@{$self->{_data_id_tags}}) {
616             my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
617             $data_vec->set_col( 0, deep_copy_array($self->{_data}->{$data_id}));
618             my $cluster_index_for_tuple =
619             $self->classify_a_data_point_bayes($data_vec,
620             $mean_vecs_ref, $covariances_ref);
621             $self->{_class_labels}->{$data_id} = $cluster_index_for_tuple;
622             push @{$new_clusters[$cluster_index_for_tuple]}, $data_id;
623             }
624             $self->{_clusters} = \@new_clusters;
625             }
626              
627             sub classify_a_data_point_bayes {
628             my $self = shift;
629             my $data_vec = shift;
630             my $mean_vecs_ref = shift;
631             my $covariances_ref = shift;
632             my @cluster_mean_vecs = @$mean_vecs_ref;
633             my @cluster_covariances = @$covariances_ref;
634             my @log_likelihoods;
635             foreach my $cluster_index (0..@cluster_mean_vecs-1) {
636             my $mean = $cluster_mean_vecs[$cluster_index];
637             my $covariance = $cluster_covariances[$cluster_index];
638             my $datavec_minus_mean = $data_vec - $mean;
639             my $log_likely = undef;
640             if ($self->{_data_dimensions} > 1) {
641             $log_likely = -0.5 * vector_matrix_multiply(
642             transpose($datavec_minus_mean),
643             matrix_vector_multiply( $covariance->inverse(),
644             $datavec_minus_mean ) );
645             } else {
646              
647             my @data_minus_mean = $datavec_minus_mean->as_list;
648             my $data_minus_mean_val = $data_minus_mean[0];
649             my @covar_as_matrix = $covariance->as_list;
650             my $covar_val = $covar_as_matrix[0];
651             $log_likely = -0.5 * ($data_minus_mean_val ** 2) / $covar_val;
652              
653             }
654             my $posterior_log_likely = $log_likely +
655             log( $self->{_class_priors}[$cluster_index] );
656             push @log_likelihoods, $posterior_log_likely;
657             }
658             my ($minlikely, $maxlikely) = minmax(\@log_likelihoods);
659             my $cluster_index_for_data_point =
660             get_index_at_value( $maxlikely, \@log_likelihoods );
661             return $cluster_index_for_data_point;
662             }
663              
664             sub find_cluster_means_and_covariances {
665             my $clusters = shift;
666             my $data_dimensions = find_data_dimensionality($clusters);
667             my (@cluster_mean_vecs, @cluster_covariances);
668             foreach my $cluster_index (0..@$clusters-1) {
669             my ($num_rows,$num_cols) =
670             ($data_dimensions,scalar(@{$clusters->[$cluster_index]}));
671             print "\nFor cluster $cluster_index: rows: $num_rows and cols: $num_cols\n";
672             my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
673             my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
674             my $col_index = 0;
675             foreach my $ele (@{$clusters->[$cluster_index]}) {
676             $matrix->set_col($col_index++, $ele);
677             }
678             # display_matrix( "Displaying cluster matrix", $matrix );
679             foreach my $j (0..$num_cols-1) {
680             $mean_vec += $matrix->col($j);
681             }
682             $mean_vec *= 1.0 / $num_cols;
683             push @cluster_mean_vecs, $mean_vec;
684             display_matrix( "Displaying the mean vector", $mean_vec );
685             foreach my $j (0..$num_cols-1) {
686             my @new_col = ($matrix->col($j) - $mean_vec)->as_list;
687             $matrix->set_col($j, \@new_col);
688             }
689             # display_matrix("Displaying mean subtracted data as a matrix", $matrix );
690             my $transposed = transpose( $matrix );
691             # display_matrix("Displaying transposed matrix",$transposed);
692             my $covariance = matrix_multiply( $matrix, $transposed );
693             $covariance *= 1.0 / $num_cols;
694             push @cluster_covariances, $covariance;
695             display_matrix("Displaying the cluster covariance", $covariance );
696             }
697             return (\@cluster_mean_vecs, \@cluster_covariances);
698             }
699              
700             sub find_data_dimensionality {
701             my $clusters = shift;
702             my @first_cluster = @{$clusters->[0]};
703             my @first_data_element = @{$first_cluster[0]};
704             return scalar(@first_data_element);
705             }
706              
707             sub find_seed_centered_covariances {
708             my $self = shift;
709             my $seed_tags = shift;
710             my (@seed_mean_vecs, @seed_based_covariances);
711             foreach my $seed_tag (@$seed_tags) {
712             my ($num_rows,$num_cols) = ($self->{_data_dimensions}, $self->{_N});
713             my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
714             my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
715             $mean_vec->set_col(0, $self->{_data}->{$seed_tag});
716             push @seed_mean_vecs, $mean_vec;
717             display_matrix( "Displaying the seed mean vector", $mean_vec );
718             my $col_index = 0;
719             foreach my $tag (@{$self->{_data_id_tags}}) {
720             $matrix->set_col($col_index++, $self->{_data}->{$tag});
721             }
722             foreach my $j (0..$num_cols-1) {
723             my @new_col = ($matrix->col($j) - $mean_vec)->as_list;
724             $matrix->set_col($j, \@new_col);
725             }
726             my $transposed = transpose( $matrix );
727             my $covariance = matrix_multiply( $matrix, $transposed );
728             $covariance *= 1.0 / $num_cols;
729             push @seed_based_covariances, $covariance;
730             display_matrix("Displaying the seed covariance", $covariance )
731             if $self->{_debug};
732             }
733             return (\@seed_mean_vecs, \@seed_based_covariances);
734             }
735              
736             # The most popular seeding mode for EM is random. We include two other seeding modes
737             # --- kmeans and manual --- since they do produce good results for specialized cases.
738             # For example, when the clusters in your data are non-overlapping and not too
739             # anisotropic, the kmeans based seeding should work at least as well as the random
740             # seeding. In such cases --- AND ONLY IN SUCH CASES --- the kmeans based seeding has
741             # the advantage of avoiding the getting stuck in a local-maximum problem of the EM
742             # algorithm.
743             sub seed_the_clusters {
744             my $self = shift;
745             if ($self->{_seeding} eq 'random') {
746             my @covariances;
747             my @means;
748             my @all_tags = @{$self->{_data_id_tags}};
749             my @seed_tags;
750             foreach my $i (0..$self->{_K}-1) {
751             push @seed_tags, $all_tags[int rand( $self->{_N} )];
752             }
753             print "Random Seeding: Randomly selected seeding tags are @seed_tags\n\n";
754             my ($seed_means, $seed_covars) =
755             $self->find_seed_centered_covariances(\@seed_tags);
756             $self->{_cluster_means} = $seed_means;
757             $self->{_cluster_covariances} = $seed_covars;
758             } elsif ($self->{_seeding} eq 'kmeans') {
759             $self->kmeans();
760             my $clusters = $self->{_clusters};
761             my @dataclusters;
762             foreach my $index (0..@$clusters-1) {
763             push @dataclusters, [];
764             }
765             foreach my $cluster_index (0..$self->{_K}-1) {
766             foreach my $tag (@{$clusters->[$cluster_index]}) {
767             my $data = $self->{_data}->{$tag};
768             push @{$dataclusters[$cluster_index]}, deep_copy_array($data);
769             }
770             }
771             ($self->{_cluster_means}, $self->{_cluster_covariances}) =
772             find_cluster_means_and_covariances(\@dataclusters);
773             } elsif ($self->{_seeding} eq 'manual') {
774             die "You have not supplied the seeding tags for the option \"manual\""
775             unless @{$self->{_seed_tags}} > 0;
776             print "Manual Seeding: Seed tags are @{$self->{_seed_tags}}\n\n";
777             foreach my $tag (@{$self->{_seed_tags}}) {
778             die "invalid tag used for manual seeding"
779             unless exists $self->{_data}->{$tag};
780             }
781             my ($seed_means, $seed_covars) =
782             $self->find_seed_centered_covariances($self->{_seed_tags});
783             $self->{_cluster_means} = $seed_means;
784             $self->{_cluster_covariances} = $seed_covars;
785             } else {
786             die "Incorrect call syntax used. See documentation.";
787             }
788             }
789              
790             # This is the top-level method for kmeans based initialization of the EM
791             # algorithm. The means and the covariances returned by kmeans are used to seed the EM
792             # algorithm.
793             sub kmeans {
794             my $self = shift;
795             my $K = $self->{_K};
796             $self->cluster_for_fixed_K_single_smart_try($K);
797             if ((defined $self->{_clusters}) && (defined $self->{_cluster_centers})){
798             return ($self->{_clusters}, $self->{_cluster_centers});
799             } else {
800             die "kmeans clustering failed.";
801             }
802             }
803              
804             # Used by the kmeans algorithm for the initialization of the EM iterations. We do
805             # initial kmeans cluster seeding by subjecting the data to principal components
806             # analysis in order to discover the direction of maximum variance in the data space.
807             # Subsequently, we try to find the K largest peaks along this direction. The
808             # coordinates of these peaks serve as the seeds for the K clusters.
809             sub cluster_for_fixed_K_single_smart_try {
810             my $self = shift;
811             my $K = shift;
812             print "Clustering for K = $K\n" if $self->{_terminal_output};
813             my ($clusters, $cluster_centers) =
814             $self->cluster_for_given_K($K);
815             $self->{_clusters} = $clusters;
816             $self->{_cluster_centers} = $cluster_centers;
817             }
818              
819             # Used by the kmeans part of the code for the initialization of the EM algorithm:
820             sub cluster_for_given_K {
821             my $self = shift;
822             my $K = shift;
823             my $cluster_centers = $self->get_initial_cluster_centers($K);
824             my $clusters = $self->assign_data_to_clusters_initial($cluster_centers);
825             my $cluster_nonexistant_flag = 0;
826             foreach my $trial (0..2) {
827             ($clusters, $cluster_centers) =
828             $self->assign_data_to_clusters( $clusters, $K );
829             my $num_of_clusters_returned = @$clusters;
830             foreach my $cluster (@$clusters) {
831             $cluster_nonexistant_flag = 1 if ((!defined $cluster)
832             || (@$cluster == 0));
833             }
834             last unless $cluster_nonexistant_flag;
835             }
836             return ($clusters, $cluster_centers);
837             }
838              
839             # Used by the kmeans part of the code for the initialization of the EM algorithm:
840             sub get_initial_cluster_centers {
841             my $self = shift;
842             my $K = shift;
843             if ($self->{_data_dimensions} == 1) {
844             my @one_d_data;
845             foreach my $j (0..$self->{_N}-1) {
846             my $tag = $self->{_data_id_tags}[$j];
847             push @one_d_data, $self->{_data}->{$tag}->[0];
848             }
849             my @peak_points =
850             find_peak_points_in_given_direction(\@one_d_data,$K);
851             print "highest points at data values: @peak_points\n"
852             if $self->{_debug};
853             my @cluster_centers;
854             foreach my $peakpoint (@peak_points) {
855             push @cluster_centers, [$peakpoint];
856             }
857             return \@cluster_centers;
858             }
859             my ($num_rows,$num_cols) = ($self->{_data_dimensions},$self->{_N});
860             my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
861             my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
862             # All the record labels are stored in the array $self->{_data_id_tags}.
863             # The actual data for clustering is stored in a hash at $self->{_data}
864             # whose keys are the record labels; the value associated with each
865             # key is the array holding the corresponding numerical multidimensional
866             # data.
867             foreach my $j (0..$num_cols-1) {
868             my $tag = $self->{_data_id_tags}[$j];
869             my $data = $self->{_data}->{$tag};
870             $matrix->set_col($j, $data);
871             }
872             if ($self->{_debug}) {
873             print "\nDisplaying the original data as a matrix:";
874             display_matrix( $matrix );
875             }
876             foreach my $j (0..$num_cols-1) {
877             $mean_vec += $matrix->col($j);
878             }
879             $mean_vec *= 1.0 / $num_cols;
880             if ($self->{_debug}) {
881             print "Displaying the mean vector for the data:";
882             display_matrix( $mean_vec );
883             }
884             foreach my $j (0..$num_cols-1) {
885             my @new_col = ($matrix->col($j) - $mean_vec)->as_list;
886             $matrix->set_col($j, \@new_col);
887             }
888             if ($self->{_debug}) {
889             print "Displaying mean subtracted data as a matrix:";
890             display_matrix( $matrix );
891             }
892             my $transposed = transpose( $matrix );
893             if ($self->{_debug}) {
894             print "Displaying transposed data matrix:";
895             display_matrix( $transposed );
896             }
897             my $covariance = matrix_multiply( $matrix, $transposed );
898             $covariance *= 1.0 / $num_cols;
899             if ($self->{_debug}) {
900             print "\nDisplaying the Covariance Matrix for your data:";
901             display_matrix( $covariance );
902             }
903             my ($eigenvalues, $eigenvectors) = $covariance->eigenpair;
904             my $num_of_eigens = @$eigenvalues;
905             my $largest_eigen_index = 0;
906             my $smallest_eigen_index = 0;
907             print "Eigenvalue 0: $eigenvalues->[0]\n" if $self->{_debug};
908             foreach my $i (1..$num_of_eigens-1) {
909             $largest_eigen_index = $i if $eigenvalues->[$i] >
910             $eigenvalues->[$largest_eigen_index];
911             $smallest_eigen_index = $i if $eigenvalues->[$i] <
912             $eigenvalues->[$smallest_eigen_index];
913             print "Eigenvalue $i: $eigenvalues->[$i]\n" if $self->{_debug};
914             }
915             print "\nlargest eigen index: $largest_eigen_index\n" if $self->{_debug};
916             print "\nsmallest eigen index: $smallest_eigen_index\n\n"
917             if $self->{_debug};
918             foreach my $i (0..$num_of_eigens-1) {
919             my @vec = $eigenvectors->[$i]->as_list;
920             print "Eigenvector $i: @vec\n" if $self->{_debug};
921             }
922             my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list;
923             print "\nLargest eigenvector: @largest_eigen_vec\n" if $self->{_debug};
924             my @max_var_direction;
925             # Each element of the array @largest_eigen_vec is a Math::Complex object
926             foreach my $k (0..@largest_eigen_vec-1) {
927             my ($mag, $theta) = $largest_eigen_vec[$k] =~ /\[(\d*\.\d+),(\S+)\]/;
928             if ($theta eq '0') {
929             $max_var_direction[$k] = $mag;
930             } elsif ($theta eq 'pi') {
931             $max_var_direction[$k] = -1.0 * $mag;
932             } else {
933             die "eigendecomposition of covariance matrix produced a complex eigenvector --- something is wrong";
934             }
935             }
936             # "Maximum variance direction: @max_var_direction
937             print "\nMaximum Variance Direction: @max_var_direction\n\n"
938             if $self->{_debug};
939             # We now project all data points on the largest eigenvector.
940             # Each projection will yield a single point on the eigenvector.
941             my @projections;
942             foreach my $j (0..$self->{_N}-1) {
943             my $tag = $self->{_data_id_tags}[$j];
944             my $data = $self->{_data}->{$tag};
945             die "Dimensionality of the largest eigenvector does not "
946             . "match the dimensionality of the data"
947             unless @max_var_direction == $self->{_data_dimensions};
948             my $projection = vector_multiply($data, \@max_var_direction);
949             push @projections, $projection;
950             }
951             print "All projection points: @projections\n" if $self->{_debug};
952             my @peak_points = find_peak_points_in_given_direction(\@projections, $K);
953             print "highest points at points along largest eigenvec: @peak_points\n"
954             if $self->{_debug};
955             my @cluster_centers;
956             foreach my $peakpoint (@peak_points) {
957             my @actual_peak_coords = map {$peakpoint * $_} @max_var_direction;
958             push @cluster_centers, \@actual_peak_coords;
959             }
960             return \@cluster_centers;
961             }
962              
963             # Used by the kmeans part of the code: This method is called by the previous method
964             # to locate K peaks in a smoothed histogram of the data points projected onto the
965             # maximal variance direction.
966             sub find_peak_points_in_given_direction {
967             my $dataref = shift;
968             my $how_many = shift;
969             my @data = @$dataref;
970             my ($min, $max) = minmax(\@data);
971             my $num_points = @data;
972             my @sorted_data = sort {$a <=> $b} @data;
973             #print "\n\nSorted data: @sorted_data\n";
974             my $scale = $max - $min;
975             foreach my $index (0..$#sorted_data-1) {
976             $sorted_data[$index] = ($sorted_data[$index] - $min) / $scale;
977             }
978             my $avg_diff = 0;
979             foreach my $index (0..$#sorted_data-1) {
980             my $diff = $sorted_data[$index+1] - $sorted_data[$index];
981             $avg_diff += ($diff - $avg_diff) / ($index + 1);
982             }
983             my $delta = 1.0 / 1000.0;
984             # It would be nice to set the delta adaptively, but I must
985             # change the number of cells in the next foreach loop accordingly
986             # my $delta = $avg_diff / 20;
987             my @accumulator = (0) x 1000;
988             foreach my $index (0..@sorted_data-1) {
989             my $cell_index = int($sorted_data[$index] / $delta);
990             my $smoothness = 40;
991             for my $index ($cell_index-$smoothness..$cell_index+$smoothness) {
992             next if $index < 0 || $index > 999;
993             $accumulator[$index]++;
994             }
995             }
996             my $peaks_array = non_maximum_supression( \@accumulator );
997             my $peaks_index_hash = get_value_index_hash( $peaks_array );
998             my @K_highest_peak_locations;
999             my $k = 0;
1000             foreach my $peak (sort {$b <=> $a} keys %$peaks_index_hash) {
1001             my $unscaled_peak_point =
1002             $min + $peaks_index_hash->{$peak} * $scale * $delta;
1003             push @K_highest_peak_locations, $unscaled_peak_point
1004             if $k < $how_many;
1005             last if ++$k == $how_many;
1006             }
1007             return @K_highest_peak_locations;
1008             }
1009              
1010             # Used by the kmeans part of the code: The purpose of this routine is to form initial
1011             # clusters by assigning the data samples to the initial clusters formed by the
1012             # previous routine on the basis of the best proximity of the data samples to the
1013             # different cluster centers.
1014             sub assign_data_to_clusters_initial {
1015             my $self = shift;
1016             my @cluster_centers = @{ shift @_ };
1017             my @clusters;
1018             foreach my $ele (@{$self->{_data_id_tags}}) {
1019             my $best_cluster;
1020             my @dist_from_clust_centers;
1021             foreach my $center (@cluster_centers) {
1022             push @dist_from_clust_centers, $self->distance($ele, $center);
1023             }
1024             my ($min, $best_center_index) = minimum( \@dist_from_clust_centers );
1025             push @{$clusters[$best_center_index]}, $ele;
1026             }
1027             return \@clusters;
1028             }
1029              
1030             # Used by the kmeans part of the code: This is the main routine that along with the
1031             # update_cluster_centers() routine constitute the two key steps of the K-Means
1032             # algorithm. In most cases, the infinite while() loop will terminate automatically
1033             # when the cluster assignments of the data points remain unchanged. For the sake of
1034             # safety, we keep track of the number of iterations. If this number reaches 100, we
1035             # exit the while() loop anyway. In most cases, this limit will not be reached.
1036             sub assign_data_to_clusters {
1037             my $self = shift;
1038             my $clusters = shift;
1039             my $K = shift;
1040             my $final_cluster_centers;
1041             my $iteration_index = 0;
1042             while (1) {
1043             my $new_clusters;
1044             my $assignment_changed_flag = 0;
1045             my $current_cluster_center_index = 0;
1046             my $cluster_size_zero_condition = 0;
1047             my $how_many = @$clusters;
1048             my $cluster_centers = $self->update_cluster_centers(
1049             deep_copy_AoA_with_nulls( $clusters ) );
1050             $iteration_index++;
1051             foreach my $cluster (@$clusters) {
1052             my $current_cluster_center =
1053             $cluster_centers->[$current_cluster_center_index];
1054             foreach my $ele (@$cluster) {
1055             my @dist_from_clust_centers;
1056             foreach my $center (@$cluster_centers) {
1057             push @dist_from_clust_centers,
1058             $self->distance($ele, $center);
1059             }
1060             my ($min, $best_center_index) =
1061             minimum( \@dist_from_clust_centers );
1062             my $best_cluster_center =
1063             $cluster_centers->[$best_center_index];
1064             if (vector_equal($current_cluster_center,
1065             $best_cluster_center)){
1066             push @{$new_clusters->[$current_cluster_center_index]},
1067             $ele;
1068             } else {
1069             $assignment_changed_flag = 1;
1070             push @{$new_clusters->[$best_center_index]}, $ele;
1071             }
1072             }
1073             $current_cluster_center_index++;
1074             }
1075             # Now make sure that we still have K clusters since K is fixed:
1076             next if ((@$new_clusters != @$clusters) && ($iteration_index < 100));
1077             # Now make sure that none of the K clusters is an empty cluster:
1078             foreach my $newcluster (@$new_clusters) {
1079             $cluster_size_zero_condition = 1 if ((!defined $newcluster)
1080             or (@$newcluster == 0));
1081             }
1082             push @$new_clusters, (undef) x ($K - @$new_clusters)
1083             if @$new_clusters < $K;
1084             my $largest_cluster;
1085             foreach my $local_cluster (@$new_clusters) {
1086             next if !defined $local_cluster;
1087             $largest_cluster = $local_cluster if !defined $largest_cluster;
1088             if (@$local_cluster > @$largest_cluster) {
1089             $largest_cluster = $local_cluster;
1090             }
1091             }
1092             foreach my $local_cluster (@$new_clusters) {
1093             if ( (!defined $local_cluster) || (@$local_cluster == 0) ) {
1094             push @$local_cluster, pop @$largest_cluster;
1095             }
1096             }
1097             next if (($cluster_size_zero_condition) && ($iteration_index < 100));
1098             last if $iteration_index == 100;
1099             # Now do a deep copy of new_clusters into clusters
1100             $clusters = deep_copy_AoA( $new_clusters );
1101             last if $assignment_changed_flag == 0;
1102             }
1103             $final_cluster_centers = $self->update_cluster_centers( $clusters );
1104             return ($clusters, $final_cluster_centers);
1105             }
1106              
1107             # Used by the kmeans part of the code: After each new assignment of the data points
1108             # to the clusters on the basis of the current values for the cluster centers, we call
1109             # the routine shown here for updating the values of the cluster centers.
1110             sub update_cluster_centers {
1111             my $self = shift;
1112             my @clusters = @{ shift @_ };
1113             my @new_cluster_centers;
1114             my $largest_cluster;
1115             foreach my $cluster (@clusters) {
1116             next if !defined $cluster;
1117             $largest_cluster = $cluster if !defined $largest_cluster;
1118             if (@$cluster > @$largest_cluster) {
1119             $largest_cluster = $cluster;
1120             }
1121             }
1122             foreach my $cluster (@clusters) {
1123             if ( (!defined $cluster) || (@$cluster == 0) ) {
1124             push @$cluster, pop @$largest_cluster;
1125             }
1126             }
1127             foreach my $cluster (@clusters) {
1128             die "Cluster became empty --- untenable condition " .
1129             "for a given K. Try again. " if !defined $cluster;
1130             my $cluster_size = @$cluster;
1131             die "Cluster size is zero --- untenable." if $cluster_size == 0;
1132             my @new_cluster_center = @{$self->add_point_coords( $cluster )};
1133             @new_cluster_center = map {my $x = $_/$cluster_size; $x}
1134             @new_cluster_center;
1135             push @new_cluster_centers, \@new_cluster_center;
1136             }
1137             return \@new_cluster_centers;
1138             }
1139              
1140             # The following routine is for computing the distance between a data point specified
1141             # by its symbolic name in the master datafile and a point (such as the center of a
1142             # cluster) expressed as a vector of coordinates:
1143             sub distance {
1144             my $self = shift;
1145             my $ele1_id = shift @_; # symbolic name of data sample
1146             my @ele1 = @{$self->{_data}->{$ele1_id}};
1147             my @ele2 = @{shift @_};
1148             die "wrong data types for distance calculation" if @ele1 != @ele2;
1149             my $how_many = @ele1;
1150             my $squared_sum = 0;
1151             foreach my $i (0..$how_many-1) {
1152             $squared_sum += ($ele1[$i] - $ele2[$i])**2;
1153             }
1154             my $dist = sqrt $squared_sum;
1155             return $dist;
1156             }
1157              
1158             # The following routine does the same as above but now both
1159             # arguments are expected to be arrays of numbers:
1160             sub distance2 {
1161             my $self = shift;
1162             my @ele1 = @{shift @_};
1163             my @ele2 = @{shift @_};
1164             die "wrong data types for distance calculation" if @ele1 != @ele2;
1165             my $how_many = @ele1;
1166             my $squared_sum = 0;
1167             foreach my $i (0..$how_many-1) {
1168             $squared_sum += ($ele1[$i] - $ele2[$i])**2;
1169             }
1170             return sqrt $squared_sum;
1171             }
1172              
1173             sub write_naive_bayes_clusters_to_files {
1174             my $self = shift;
1175             my @clusters = @{$self->{_clusters}};
1176             unlink glob "naive_bayes_cluster*.txt";
1177             foreach my $i (1..@clusters) {
1178             my $filename = "naive_bayes_cluster" . $i . ".txt";
1179             print "Writing cluster $i to file $filename\n"
1180             if $self->{_terminal_output};
1181             open FILEHANDLE, "| sort > $filename" or die "Unable to open file: $!";
1182             foreach my $ele (@{$clusters[$i-1]}) {
1183             print FILEHANDLE "$ele\n";
1184             }
1185             close FILEHANDLE;
1186             }
1187             }
1188              
1189             sub write_posterior_prob_clusters_above_threshold_to_files {
1190             my $self = shift;
1191             my $theta = shift;
1192             my @class_distributions;
1193             foreach my $cluster_index (0..$self->{_K}-1) {
1194             push @class_distributions, [];
1195             }
1196             foreach my $data_tag (@{$self->{_data_id_tags}}) {
1197             foreach my $cluster_index (0..$self->{_K}-1) {
1198             push @{$class_distributions[$cluster_index]}, $data_tag
1199             if $self->{_expected_class_probs}->{$data_tag}->[$cluster_index]
1200             > $theta;
1201             }
1202             }
1203             unlink glob "posterior_prob_cluster*.txt";
1204             foreach my $i (1..@class_distributions) {
1205             my $filename = "posterior_prob_cluster" . $i . ".txt";
1206             print "Writing posterior prob cluster $i to file $filename\n"
1207             if $self->{_terminal_output};
1208             open FILEHANDLE, "| sort > $filename" or die "Unable to open file: $!";
1209             foreach my $ele (@{$class_distributions[$i-1]}) {
1210             print FILEHANDLE "$ele\n";
1211             }
1212             close FILEHANDLE;
1213             }
1214             }
1215              
1216             sub DESTROY {
1217             unlink "__temp_" . basename($_[0]->{_datafile});
1218             unlink "__temp_data_" . basename($_[0]->{_datafile});
1219             unlink "__temp2_" . basename($_[0]->{_datafile});
1220             unlink glob "__temp1dhist*";
1221             unlink glob "__contour*";
1222             }
1223              
1224             ############################# Visualization Code ###############################
1225              
1226             # The visualize_clusters() implementation displays as a plot in your terminal window
1227             # the clusters constructed by the EM algorithm. It can show either 2D plots or
1228             # 3D plots that you can rotate interactively for better visualization. For
1229             # multidimensional data, as to which 2D or 3D dimensions are used for visualization
1230             # is controlled by the mask you must supply as an argument to the method. Should it
1231             # happen that only one on bit is specified for the mask, visualize_clusters()
1232             # aborts.
1233             #
1234             # The visualization code consists of first accessing each of clusters created by the
1235             # EM() subroutine. Note that the clusters contain only the symbolic names for the
1236             # individual records in the source data file. We therefore next reach into the
1237             # $self->{_data} hash and get the data coordinates associated with each symbolic
1238             # label in a cluster. The numerical data thus generated is then written out to a
1239             # temp file. When doing so we must remember to insert TWO BLANK LINES between the
1240             # data blocks corresponding to the different clusters. This constraint is imposed
1241             # on us by Gnuplot when plotting data from the same file since we want to use
1242             # different point styles for the data points in different cluster files.
1243             # Subsequently, we call upon the Perl interface provided by the Graphics::GnuplotIF
1244             # module to plot the data clusters.
1245             sub visualize_clusters {
1246             my $self = shift;
1247             my $v_mask;
1248             my $pause_time;
1249             if (@_ == 1) {
1250             $v_mask = shift || die "visualization mask missing";
1251             } elsif (@_ == 2) {
1252             $v_mask = shift || die "visualization mask missing";
1253             $pause_time = shift;
1254             } else {
1255             die "visualize_clusters() called with wrong args";
1256             }
1257             my $master_datafile = $self->{_datafile};
1258             my @v_mask = split //, $v_mask;
1259             my $visualization_mask_width = @v_mask;
1260             my $original_data_mask = $self->{_mask};
1261             my @mask = split //, $original_data_mask;
1262             my $data_field_width = scalar grep {$_ eq '1'} @mask;
1263             die "\n\nABORTED: The width of the visualization mask (including " .
1264             "all its 1s and 0s) must equal the width of the original mask " .
1265             "used for reading the data file (counting only the 1's)"
1266             if $visualization_mask_width != $data_field_width;
1267             my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
1268             # The following section is for the superimposed one-Mahalanobis-distance-unit
1269             # ellipses that are shown only for 2D plots:
1270             if ($visualization_data_field_width == 2) {
1271             foreach my $cluster_index (0..$self->{_K}-1) {
1272             my $contour_filename = "__contour_" . $cluster_index . ".dat";
1273             my $mean = $self->{_cluster_means}->[$cluster_index];
1274             my $covariance = $self->{_cluster_covariances}->[$cluster_index];
1275             my ($mux,$muy) = $mean->as_list();
1276             my ($varx,$sigmaxy) = $covariance->row(0)->as_list();
1277             my ($sigmayx,$vary) = $covariance->row(1)->as_list();
1278             die "Your covariance matrix does not look right"
1279             unless $sigmaxy == $sigmayx;
1280             my ($sigmax,$sigmay) = (sqrt($varx),sqrt($vary));
1281             my $argstring = <<"END";
1282             set contour
1283             mux = $mux
1284             muy = $muy
1285             sigmax = $sigmax
1286             sigmay = $sigmay
1287             sigmaxy = $sigmaxy
1288             determinant = (sigmax**2)*(sigmay**2) - sigmaxy**2
1289             exponent(x,y) = -0.5 * (1.0 / determinant) * ( ((x-mux)**2)*sigmay**2 + ((y-muy)**2)*sigmax**2 - 2*sigmaxy*(x-mux)*(y-muy) )
1290             f(x,y) = exp( exponent(x,y) ) - 0.2
1291             xmax = mux + 2 * sigmax
1292             xmin = mux - 2 * sigmax
1293             ymax = muy + 2 * sigmay
1294             ymin = muy - 2 * sigmay
1295             set xrange [ xmin : xmax ]
1296             set yrange [ ymin : ymax ]
1297             set isosamples 200
1298             unset surface
1299             set cntrparam levels discrete 0
1300             set table \"$contour_filename\"
1301             splot f(x,y)
1302             unset table
1303             END
1304             my $plot = Graphics::GnuplotIF->new();
1305             $plot->gnuplot_cmd( $argstring );
1306             }
1307             }
1308             my %visualization_data;
1309             while ( my ($record_id, $data) = each %{$self->{_data}} ) {
1310             my @fields = @$data;
1311             die "\nABORTED: Visualization mask size exceeds data record size"
1312             if $#v_mask > $#fields;
1313             my @data_fields;
1314             foreach my $i (0..@fields-1) {
1315             if ($v_mask[$i] eq '0') {
1316             next;
1317             } elsif ($v_mask[$i] eq '1') {
1318             push @data_fields, $fields[$i];
1319             } else {
1320             die "Misformed visualization mask. It can only have 1s and 0s";
1321             }
1322             }
1323             $visualization_data{ $record_id } = \@data_fields;
1324             }
1325             my $K = scalar @{$self->{_clusters}};
1326             my $filename = basename($master_datafile);
1327             my $temp_file = "__temp_" . $filename;
1328             unlink $temp_file if -e $temp_file;
1329             open OUTPUT, ">$temp_file"
1330             or die "Unable to open a temp file in this directory: $!";
1331             foreach my $cluster (@{$self->{_clusters}}) {
1332             foreach my $item (@$cluster) {
1333             print OUTPUT "@{$visualization_data{$item}}";
1334             print OUTPUT "\n";
1335             }
1336             print OUTPUT "\n\n";
1337             }
1338             close OUTPUT;
1339             my $plot;
1340             if (!defined $pause_time) {
1341             $plot = Graphics::GnuplotIF->new( persist => 1 );
1342             } else {
1343             $plot = Graphics::GnuplotIF->new();
1344             }
1345             my $arg_string = "";
1346             if ($visualization_data_field_width > 2) {
1347             $plot->gnuplot_cmd("set noclip");
1348             $plot->gnuplot_cmd("set pointsize 2");
1349             foreach my $i (0..$K-1) {
1350             my $j = $i + 1;
1351             $arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster (naive Bayes) $i\" with points lt $j pt $j, ";
1352             }
1353             } elsif ($visualization_data_field_width == 2) {
1354             $plot->gnuplot_cmd("set noclip");
1355             $plot->gnuplot_cmd("set pointsize 2");
1356             foreach my $i (0..$K-1) {
1357             my $j = $i + 1;
1358             $arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster (naive Bayes) $i\" with points lt $j pt $j, ";
1359             my $ellipse_filename = "__contour_" . $i . ".dat";
1360             $arg_string .= "\"$ellipse_filename\" with line lt $j title \"\", ";
1361             }
1362             } elsif ($visualization_data_field_width == 1 ) {
1363             open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
1364             my @all_data = ;
1365             close INPUT;
1366             @all_data = map {chomp $_; $_ =~ /\d/ ? $_ : "SEPERATOR" } @all_data;
1367             my $all_joined_data = join ':', @all_data;
1368             my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data;
1369             my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals);
1370             foreach my $i (0..@separated-1) {
1371             $separated[$i] =~ s/SEPERATOR//g;
1372             my @cluster_for_hist = split /:/, $separated[$i];
1373             @cluster_for_hist = grep $_, @cluster_for_hist;
1374             my ($minval,$maxval) = minmax(\@cluster_for_hist);
1375             push @all_minvals, $minval;
1376             push @all_maxvals, $maxval;
1377             push @all_clusters_for_hist, \@cluster_for_hist;
1378             }
1379             push @all_minmaxvals, @all_minvals;
1380             push @all_minmaxvals, @all_maxvals;
1381             my ($abs_minval,$abs_maxval) = minmax(\@all_minmaxvals);
1382             my $delta = ($abs_maxval - $abs_minval) / 100.0;
1383             $plot->gnuplot_cmd("set boxwidth 3");
1384             $plot->gnuplot_cmd("set style fill solid border -1");
1385             $plot->gnuplot_cmd("set ytics out nomirror");
1386             $plot->gnuplot_cmd("set style data histograms");
1387             $plot->gnuplot_cmd("set style histogram clustered");
1388             $plot->gnuplot_cmd("set title 'Clusters shown through histograms'");
1389             $plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
1390             foreach my $cindex (0..@all_clusters_for_hist-1) {
1391             my $filename = basename($master_datafile);
1392             my $temp_file = "__temp1dhist_" . "$cindex" . "_" . $filename;
1393             unlink $temp_file if -e $temp_file;
1394             open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
1395             print OUTPUT "Xstep histval\n";
1396             my @histogram = (0) x 100;
1397             foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) {
1398             $histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++;
1399             }
1400             foreach my $i (0..@histogram-1) {
1401             print OUTPUT "$i $histogram[$i]\n";
1402             }
1403             $arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, ";
1404             close OUTPUT;
1405             }
1406             }
1407             $arg_string = $arg_string =~ /^(.*),[ ]+$/;
1408             $arg_string = $1;
1409             if ($visualization_data_field_width > 2) {
1410             $plot->gnuplot_cmd( "splot $arg_string" );
1411             $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
1412             } elsif ($visualization_data_field_width == 2) {
1413             $plot->gnuplot_cmd( "plot $arg_string" );
1414             $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
1415             } elsif ($visualization_data_field_width == 1) {
1416             $plot->gnuplot_cmd( "plot $arg_string" );
1417             $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
1418             }
1419             }
1420              
1421             # This subroutine is the same as above except that it makes PNG plots (for hardcopy
1422             # printing) of the clusters.
1423             sub plot_hardcopy_clusters {
1424             my $self = shift;
1425             my $v_mask;
1426             my $pause_time;
1427             if (@_ == 1) {
1428             $v_mask = shift || die "visualization mask missing";
1429             } elsif (@_ == 2) {
1430             $v_mask = shift || die "visualization mask missing";
1431             $pause_time = shift;
1432             } else {
1433             die "visualize_clusters() called with wrong args";
1434             }
1435             my $master_datafile = $self->{_datafile};
1436             my @v_mask = split //, $v_mask;
1437             my $visualization_mask_width = @v_mask;
1438             my $original_data_mask = $self->{_mask};
1439             my @mask = split //, $original_data_mask;
1440             my $data_field_width = scalar grep {$_ eq '1'} @mask;
1441             die "\n\nABORTED: The width of the visualization mask (including " .
1442             "all its 1s and 0s) must equal the width of the original mask " .
1443             "used for reading the data file (counting only the 1's)"
1444             if $visualization_mask_width != $data_field_width;
1445             my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
1446             if ($visualization_data_field_width == 2) {
1447             foreach my $cluster_index (0..$self->{_K}-1) {
1448             my $contour_filename = "__contour_" . $cluster_index . ".dat";
1449             my $mean = $self->{_cluster_means}->[$cluster_index];
1450             my $covariance = $self->{_cluster_covariances}->[$cluster_index];
1451             my ($mux,$muy) = $mean->as_list();
1452             my ($varx,$sigmaxy) = $covariance->row(0)->as_list();
1453             my ($sigmayx,$vary) = $covariance->row(1)->as_list();
1454             die "Your covariance matrix does not look right"
1455             unless $sigmaxy == $sigmayx;
1456             my ($sigmax,$sigmay) = (sqrt($varx),sqrt($vary));
1457             my $argstring = <<"END";
1458             set contour
1459             mux = $mux
1460             muy = $muy
1461             sigmax = $sigmax
1462             sigmay = $sigmay
1463             sigmaxy = $sigmaxy
1464             determinant = (sigmax**2)*(sigmay**2) - sigmaxy**2
1465             exponent(x,y) = -0.5 * (1.0 / determinant) * ( ((x-mux)**2)*sigmay**2 + ((y-muy)**2)*sigmax**2 - 2*sigmaxy*(x-mux)*(y-muy) )
1466             f(x,y) = exp( exponent(x,y) ) - 0.2
1467             xmax = mux + 2 * sigmax
1468             xmin = mux - 2 * sigmax
1469             ymax = muy + 2 * sigmay
1470             ymin = muy - 2 * sigmay
1471             set xrange [ xmin : xmax ]
1472             set yrange [ ymin : ymax ]
1473             set isosamples 200
1474             unset surface
1475             set cntrparam levels discrete 0
1476             set table \"$contour_filename\"
1477             splot f(x,y)
1478             unset table
1479             END
1480             my $plot = Graphics::GnuplotIF->new();
1481             $plot->gnuplot_cmd( $argstring );
1482             }
1483             }
1484             my %visualization_data;
1485             while ( my ($record_id, $data) = each %{$self->{_data}} ) {
1486             my @fields = @$data;
1487             die "\nABORTED: Visualization mask size exceeds data record size"
1488             if $#v_mask > $#fields;
1489             my @data_fields;
1490             foreach my $i (0..@fields-1) {
1491             if ($v_mask[$i] eq '0') {
1492             next;
1493             } elsif ($v_mask[$i] eq '1') {
1494             push @data_fields, $fields[$i];
1495             } else {
1496             die "Misformed visualization mask. It can only have 1s and 0s";
1497             }
1498             }
1499             $visualization_data{ $record_id } = \@data_fields;
1500             }
1501             my $K = scalar @{$self->{_clusters}};
1502             my $filename = basename($master_datafile);
1503             my $temp_file = "__temp_" . $filename;
1504             unlink $temp_file if -e $temp_file;
1505             open OUTPUT, ">$temp_file"
1506             or die "Unable to open a temp file in this directory: $!";
1507             foreach my $cluster (@{$self->{_clusters}}) {
1508             foreach my $item (@$cluster) {
1509             print OUTPUT "@{$visualization_data{$item}}";
1510             print OUTPUT "\n";
1511             }
1512             print OUTPUT "\n\n";
1513             }
1514             close OUTPUT;
1515             my $plot;
1516             if (!defined $pause_time) {
1517             $plot = Graphics::GnuplotIF->new( persist => 1 );
1518             } else {
1519             $plot = Graphics::GnuplotIF->new();
1520             }
1521             my $arg_string = "";
1522             if ($visualization_data_field_width > 2) {
1523             $plot->gnuplot_cmd( "set noclip" );
1524             $plot->gnuplot_cmd( "set pointsize 2" );
1525             foreach my $i (0..$K-1) {
1526             my $j = $i + 1;
1527             $arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster (naive Bayes) $i\" with points lt $j pt $j, ";
1528             }
1529             } elsif ($visualization_data_field_width == 2) {
1530             $plot->gnuplot_cmd( "set noclip" );
1531             $plot->gnuplot_cmd( "set pointsize 2" );
1532             foreach my $i (0..$K-1) {
1533             my $j = $i + 1;
1534             $arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster (naive Bayes) $i\" with points lt $j pt $j, ";
1535             my $ellipse_filename = "__contour_" . $i . ".dat";
1536             $arg_string .= "\"$ellipse_filename\" with line lt $j title \"\", ";
1537             }
1538             } elsif ($visualization_data_field_width == 1 ) {
1539             open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
1540             my @all_data = ;
1541             close INPUT;
1542             @all_data = map {chomp $_; $_ =~ /\d/ ? $_ : "SEPERATOR" } @all_data;
1543             my $all_joined_data = join ':', @all_data;
1544             my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data;
1545             my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals);
1546             foreach my $i (0..@separated-1) {
1547             $separated[$i] =~ s/SEPERATOR//g;
1548             my @cluster_for_hist = split /:/, $separated[$i];
1549             @cluster_for_hist = grep $_, @cluster_for_hist;
1550             my ($minval,$maxval) = minmax(\@cluster_for_hist);
1551             push @all_minvals, $minval;
1552             push @all_maxvals, $maxval;
1553             push @all_clusters_for_hist, \@cluster_for_hist;
1554             }
1555             push @all_minmaxvals, @all_minvals;
1556             push @all_minmaxvals, @all_maxvals;
1557             my ($abs_minval,$abs_maxval) = minmax(\@all_minmaxvals);
1558             my $delta = ($abs_maxval - $abs_minval) / 100.0;
1559             $plot->gnuplot_cmd("set boxwidth 3");
1560             $plot->gnuplot_cmd("set style fill solid border -1");
1561             $plot->gnuplot_cmd("set ytics out nomirror");
1562             $plot->gnuplot_cmd("set style data histograms");
1563             $plot->gnuplot_cmd("set style histogram clustered");
1564             $plot->gnuplot_cmd("set title 'Clusters shown through histograms'");
1565             $plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
1566             foreach my $cindex (0..@all_clusters_for_hist-1) {
1567             my $filename = basename($master_datafile);
1568             my $temp_file = "__temp1dhist_" . "$cindex" . "_" . $filename;
1569             unlink $temp_file if -e $temp_file;
1570             open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
1571             print OUTPUT "Xstep histval\n";
1572             my @histogram = (0) x 100;
1573             foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) {
1574             $histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++;
1575             }
1576             foreach my $i (0..@histogram-1) {
1577             print OUTPUT "$i $histogram[$i]\n";
1578             }
1579             # $arg_string .= "\"$temp_file\" using 1:2 ti col smooth frequency with boxes lc $cindex, ";
1580             $arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, ";
1581             close OUTPUT;
1582             }
1583             }
1584             $arg_string = $arg_string =~ /^(.*),[ ]+$/;
1585             $arg_string = $1;
1586             if ($visualization_data_field_width > 2) {
1587             $plot->gnuplot_cmd( 'set terminal png color',
1588             'set output "cluster_plot.png"');
1589             $plot->gnuplot_cmd( "splot $arg_string" );
1590             } elsif ($visualization_data_field_width == 2) {
1591             $plot->gnuplot_cmd('set terminal png',
1592             'set output "cluster_plot.png"');
1593             $plot->gnuplot_cmd( "plot $arg_string" );
1594             } elsif ($visualization_data_field_width == 1) {
1595             $plot->gnuplot_cmd('set terminal png',
1596             'set output "cluster_plot.png"');
1597             $plot->gnuplot_cmd( "plot $arg_string" );
1598             }
1599             }
1600              
1601             # This method is for the visualization of the posterior class distributions. In
1602             # other words, this method allows us to see the soft clustering produced by the EM
1603             # algorithm. While much of the gnuplot logic here is the same as in the
1604             # visualize_clusters() method, there are significant differences in how the data is
1605             # pooled for the purpose of display.
1606             sub visualize_distributions {
1607             my $self = shift;
1608             my $v_mask;
1609             my $pause_time;
1610             if (@_ == 1) {
1611             $v_mask = shift || die "visualization mask missing";
1612             } elsif (@_ == 2) {
1613             $v_mask = shift || die "visualization mask missing";
1614             $pause_time = shift;
1615             } else {
1616             die "visualize_distributions() called with wrong args";
1617             }
1618             my @v_mask = split //, $v_mask;
1619             my $visualization_mask_width = @v_mask;
1620             my $original_data_mask = $self->{_mask};
1621             my @mask = split //, $original_data_mask;
1622             my $data_field_width = scalar grep {$_ eq '1'} @mask;
1623             die "\n\nABORTED: The width of the visualization mask (including " .
1624             "all its 1s and 0s) must equal the width of the original mask " .
1625             "used for reading the data file (counting only the 1's)"
1626             if $visualization_mask_width != $data_field_width;
1627             my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
1628             if ($visualization_data_field_width == 2) {
1629             foreach my $cluster_index (0..$self->{_K}-1) {
1630             my $contour_filename = "__contour2_" . $cluster_index . ".dat";
1631             my $mean = $self->{_cluster_means}->[$cluster_index];
1632             my $covariance = $self->{_cluster_covariances}->[$cluster_index];
1633             my ($mux,$muy) = $mean->as_list();
1634             my ($varx,$sigmaxy) = $covariance->row(0)->as_list();
1635             my ($sigmayx,$vary) = $covariance->row(1)->as_list();
1636             die "Your covariance matrix does not look right"
1637             unless $sigmaxy == $sigmayx;
1638             my ($sigmax,$sigmay) = (sqrt($varx),sqrt($vary));
1639             my $argstring = <<"END";
1640             set contour
1641             mux = $mux
1642             muy = $muy
1643             sigmax = $sigmax
1644             sigmay = $sigmay
1645             sigmaxy = $sigmaxy
1646             determinant = (sigmax**2)*(sigmay**2) - sigmaxy**2
1647             exponent(x,y) = -0.5 * (1.0 / determinant) * ( ((x-mux)**2)*sigmay**2 + ((y-muy)**2)*sigmax**2 - 2*sigmaxy*(x-mux)*(y-muy) )
1648             f(x,y) = exp( exponent(x,y) ) - 0.2
1649             xmax = mux + 2 * sigmax
1650             xmin = mux - 2 * sigmax
1651             ymax = muy + 2 * sigmay
1652             ymin = muy - 2 * sigmay
1653             set xrange [ xmin : xmax ]
1654             set yrange [ ymin : ymax ]
1655             set isosamples 200
1656             unset surface
1657             set cntrparam levels discrete 0
1658             set table \"$contour_filename\"
1659             splot f(x,y)
1660             unset table
1661             END
1662             my $plot = Graphics::GnuplotIF->new();
1663             $plot->gnuplot_cmd( $argstring );
1664             }
1665             }
1666             my %visualization_data;
1667             while ( my ($record_id, $data) = each %{$self->{_data}} ) {
1668             my @fields = @$data;
1669             die "\nABORTED: Visualization mask size exceeds data record size"
1670             if $#v_mask > $#fields;
1671             my @data_fields;
1672             foreach my $i (0..@fields-1) {
1673             if ($v_mask[$i] eq '0') {
1674             next;
1675             } elsif ($v_mask[$i] eq '1') {
1676             push @data_fields, $fields[$i];
1677             } else {
1678             die "Misformed visualization mask. It can only have 1s and 0s";
1679             }
1680             }
1681             $visualization_data{ $record_id } = \@data_fields;
1682             }
1683             my $filename = basename($self->{_datafile});
1684             my $temp_file = "__temp2_" . $filename;
1685             unlink $temp_file if -e $temp_file;
1686             open OUTPUT, ">$temp_file"
1687             or die "Unable to open a temp file in this directory: $!";
1688             my @class_distributions;
1689             foreach my $cluster_index (0..$self->{_K}-1) {
1690             push @class_distributions, [];
1691             }
1692             foreach my $data_tag (@{$self->{_data_id_tags}}) {
1693             foreach my $cluster_index (0..$self->{_K}-1) {
1694             push @{$class_distributions[$cluster_index]}, $self->{_data}->{$data_tag}
1695             if $self->{_expected_class_probs}->{$data_tag}->[$cluster_index] > 0.2;
1696             }
1697             }
1698             foreach my $distribution (@class_distributions) {
1699             foreach my $item (@$distribution) {
1700             print OUTPUT "@$item";
1701             print OUTPUT "\n";
1702             }
1703             print OUTPUT "\n\n";
1704             }
1705             close OUTPUT;
1706             my $plot;
1707             if (!defined $pause_time) {
1708             $plot = Graphics::GnuplotIF->new( persist => 1 );
1709             } else {
1710             $plot = Graphics::GnuplotIF->new();
1711             }
1712             my $arg_string = "";
1713             if ($visualization_data_field_width > 2) {
1714             $plot->gnuplot_cmd( "set noclip" );
1715             $plot->gnuplot_cmd( "set pointsize 2" );
1716             foreach my $i (0..$self->{_K}-1) {
1717             my $j = $i + 1;
1718             $arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster $i (based on posterior probs)\" with points lt $j pt $j, ";
1719             }
1720             } elsif ($visualization_data_field_width == 2) {
1721             $plot->gnuplot_cmd( "set noclip" );
1722             $plot->gnuplot_cmd( "set pointsize 2" );
1723             foreach my $i (0..$self->{_K}-1) {
1724             my $j = $i + 1;
1725             $arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster $i (based on posterior probs)\" with points lt $j pt $j, ";
1726             my $ellipse_filename = "__contour2_" . $i . ".dat";
1727             $arg_string .= "\"$ellipse_filename\" with line lt $j title \"\", ";
1728             }
1729             } elsif ($visualization_data_field_width == 1 ) {
1730             open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
1731             my @all_data = ;
1732             close INPUT;
1733             @all_data = map {chomp $_; $_ =~ /\d/ ? $_ : "SEPERATOR" } @all_data;
1734             my $all_joined_data = join ':', @all_data;
1735             my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data;
1736             my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals);
1737             foreach my $i (0..@separated-1) {
1738             $separated[$i] =~ s/SEPERATOR//g;
1739             my @cluster_for_hist = split /:/, $separated[$i];
1740             @cluster_for_hist = grep $_, @cluster_for_hist;
1741             my ($minval,$maxval) = minmax(\@cluster_for_hist);
1742             push @all_minvals, $minval;
1743             push @all_maxvals, $maxval;
1744             push @all_clusters_for_hist, \@cluster_for_hist;
1745             }
1746             push @all_minmaxvals, @all_minvals;
1747             push @all_minmaxvals, @all_maxvals;
1748             my ($abs_minval,$abs_maxval) = minmax(\@all_minmaxvals);
1749             my $delta = ($abs_maxval - $abs_minval) / 100.0;
1750             $plot->gnuplot_cmd("set boxwidth 3");
1751             $plot->gnuplot_cmd("set style fill solid border -1");
1752             $plot->gnuplot_cmd("set ytics out nomirror");
1753             $plot->gnuplot_cmd("set style data histograms");
1754             $plot->gnuplot_cmd("set style histogram clustered");
1755             $plot->gnuplot_cmd("set title 'Individual distributions shown through histograms'");
1756             $plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
1757             foreach my $cindex (0..@all_clusters_for_hist-1) {
1758             my $localfilename = basename($filename);
1759             my $temp_file = "__temp1dhist_" . "$cindex" . "_" . $localfilename;
1760             unlink $temp_file if -e $temp_file;
1761             open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
1762             print OUTPUT "Xstep histval\n";
1763              
1764             my @histogram = (0) x 100;
1765             foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) {
1766             $histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++;
1767             }
1768             foreach my $i (0..@histogram-1) {
1769             print OUTPUT "$i $histogram[$i]\n";
1770             }
1771             # $arg_string .= "\"$temp_file\" using 1:2 ti col smooth frequency with boxes lc $cindex, ";
1772             $arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, ";
1773             close OUTPUT;
1774             }
1775             }
1776             $arg_string = $arg_string =~ /^(.*),[ ]+$/;
1777             $arg_string = $1;
1778             if ($visualization_data_field_width > 2) {
1779             $plot->gnuplot_cmd( "splot $arg_string" );
1780             $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
1781             } elsif ($visualization_data_field_width == 2) {
1782             $plot->gnuplot_cmd( "plot $arg_string" );
1783             $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
1784             } elsif ($visualization_data_field_width == 1) {
1785             $plot->gnuplot_cmd( "plot $arg_string" );
1786             $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
1787             }
1788             }
1789              
1790             # This method is basically the same as the previous method, except that it is
1791             # intended for making PNG files from the distributions.
1792             sub plot_hardcopy_distributions {
1793             my $self = shift;
1794             my $v_mask;
1795             my $pause_time;
1796             if (@_ == 1) {
1797             $v_mask = shift || die "visualization mask missing";
1798             } elsif (@_ == 2) {
1799             $v_mask = shift || die "visualization mask missing";
1800             $pause_time = shift;
1801             } else {
1802             die "visualize_distributions() called with wrong args";
1803             }
1804             my @v_mask = split //, $v_mask;
1805             my $visualization_mask_width = @v_mask;
1806             my $original_data_mask = $self->{_mask};
1807             my @mask = split //, $original_data_mask;
1808             my $data_field_width = scalar grep {$_ eq '1'} @mask;
1809             die "\n\nABORTED: The width of the visualization mask (including " .
1810             "all its 1s and 0s) must equal the width of the original mask " .
1811             "used for reading the data file (counting only the 1's)"
1812             if $visualization_mask_width != $data_field_width;
1813             my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
1814             if ($visualization_data_field_width == 2) {
1815             foreach my $cluster_index (0..$self->{_K}-1) {
1816             my $contour_filename = "__contour2_" . $cluster_index . ".dat";
1817             my $mean = $self->{_cluster_means}->[$cluster_index];
1818             my $covariance = $self->{_cluster_covariances}->[$cluster_index];
1819             my ($mux,$muy) = $mean->as_list();
1820             my ($varx,$sigmaxy) = $covariance->row(0)->as_list();
1821             my ($sigmayx,$vary) = $covariance->row(1)->as_list();
1822             die "Your covariance matrix does not look right"
1823             unless $sigmaxy == $sigmayx;
1824             my ($sigmax,$sigmay) = (sqrt($varx),sqrt($vary));
1825             my $argstring = <<"END";
1826             set contour
1827             mux = $mux
1828             muy = $muy
1829             sigmax = $sigmax
1830             sigmay = $sigmay
1831             sigmaxy = $sigmaxy
1832             determinant = (sigmax**2)*(sigmay**2) - sigmaxy**2
1833             exponent(x,y) = -0.5 * (1.0 / determinant) * ( ((x-mux)**2)*sigmay**2 + ((y-muy)**2)*sigmax**2 - 2*sigmaxy*(x-mux)*(y-muy) )
1834             f(x,y) = exp( exponent(x,y) ) - 0.2
1835             xmax = mux + 2 * sigmax
1836             xmin = mux - 2 * sigmax
1837             ymax = muy + 2 * sigmay
1838             ymin = muy - 2 * sigmay
1839             set xrange [ xmin : xmax ]
1840             set yrange [ ymin : ymax ]
1841             set isosamples 200
1842             unset surface
1843             set cntrparam levels discrete 0
1844             set table \"$contour_filename\"
1845             splot f(x,y)
1846             unset table
1847             END
1848             my $plot = Graphics::GnuplotIF->new();
1849             $plot->gnuplot_cmd( $argstring );
1850             }
1851             }
1852             my %visualization_data;
1853             while ( my ($record_id, $data) = each %{$self->{_data}} ) {
1854             my @fields = @$data;
1855             die "\nABORTED: Visualization mask size exceeds data record size"
1856             if $#v_mask > $#fields;
1857             my @data_fields;
1858             foreach my $i (0..@fields-1) {
1859             if ($v_mask[$i] eq '0') {
1860             next;
1861             } elsif ($v_mask[$i] eq '1') {
1862             push @data_fields, $fields[$i];
1863             } else {
1864             die "Misformed visualization mask. It can only have 1s and 0s";
1865             }
1866             }
1867             $visualization_data{ $record_id } = \@data_fields;
1868             }
1869             my $filename = basename($self->{_datafile});
1870             my $temp_file = "__temp2_" . $filename;
1871             unlink $temp_file if -e $temp_file;
1872             open OUTPUT, ">$temp_file"
1873             or die "Unable to open a temp file in this directory: $!";
1874             my @class_distributions;
1875             foreach my $cluster_index (0..$self->{_K}-1) {
1876             push @class_distributions, [];
1877             }
1878             foreach my $data_tag (@{$self->{_data_id_tags}}) {
1879             foreach my $cluster_index (0..$self->{_K}-1) {
1880             push @{$class_distributions[$cluster_index]}, $self->{_data}->{$data_tag}
1881             if $self->{_expected_class_probs}->{$data_tag}->[$cluster_index] > 0.2;
1882             }
1883             }
1884             foreach my $distribution (@class_distributions) {
1885             foreach my $item (@$distribution) {
1886             print OUTPUT "@$item";
1887             print OUTPUT "\n";
1888             }
1889             print OUTPUT "\n\n";
1890             }
1891             close OUTPUT;
1892             my $plot;
1893             if (!defined $pause_time) {
1894             $plot = Graphics::GnuplotIF->new( persist => 1 );
1895             } else {
1896             $plot = Graphics::GnuplotIF->new();
1897             }
1898             my $arg_string = "";
1899             if ($visualization_data_field_width > 2) {
1900             $plot->gnuplot_cmd( "set noclip" );
1901             $plot->gnuplot_cmd( "set pointsize 2" );
1902             foreach my $i (0..$self->{_K}-1) {
1903             my $j = $i + 1;
1904             $arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster $i (based on posterior probs)\" with points lt $j pt $j, ";
1905             }
1906             } elsif ($visualization_data_field_width == 2) {
1907             $plot->gnuplot_cmd( "set noclip" );
1908             $plot->gnuplot_cmd( "set pointsize 2" );
1909             foreach my $i (0..$self->{_K}-1) {
1910             my $j = $i + 1;
1911             $arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster $i (based on posterior probs)\" with points lt $j pt $j, ";
1912             my $ellipse_filename = "__contour2_" . $i . ".dat";
1913             $arg_string .= "\"$ellipse_filename\" with line lt $j title \"\", ";
1914             }
1915             } elsif ($visualization_data_field_width == 1 ) {
1916             open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
1917             my @all_data = ;
1918             close INPUT;
1919             @all_data = map {chomp $_; $_ =~ /\d/ ? $_ : "SEPERATOR" } @all_data;
1920             my $all_joined_data = join ':', @all_data;
1921             my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data;
1922             my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals);
1923             foreach my $i (0..@separated-1) {
1924             $separated[$i] =~ s/SEPERATOR//g;
1925             my @cluster_for_hist = split /:/, $separated[$i];
1926             @cluster_for_hist = grep $_, @cluster_for_hist;
1927             my ($minval,$maxval) = minmax(\@cluster_for_hist);
1928             push @all_minvals, $minval;
1929             push @all_maxvals, $maxval;
1930             push @all_clusters_for_hist, \@cluster_for_hist;
1931             }
1932             push @all_minmaxvals, @all_minvals;
1933             push @all_minmaxvals, @all_maxvals;
1934             my ($abs_minval,$abs_maxval) = minmax(\@all_minmaxvals);
1935             my $delta = ($abs_maxval - $abs_minval) / 100.0;
1936             $plot->gnuplot_cmd("set boxwidth 3");
1937             $plot->gnuplot_cmd("set style fill solid border -1");
1938             $plot->gnuplot_cmd("set ytics out nomirror");
1939             $plot->gnuplot_cmd("set style data histograms");
1940             $plot->gnuplot_cmd("set style histogram clustered");
1941             $plot->gnuplot_cmd("set title 'Individual distributions shown through histograms'");
1942             $plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
1943             foreach my $cindex (0..@all_clusters_for_hist-1) {
1944             my $localfilename = basename($filename);
1945             my $temp_file = "__temp1dhist_" . "$cindex" . "_" . $localfilename;
1946             unlink $temp_file if -e $temp_file;
1947             open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
1948             print OUTPUT "Xstep histval\n";
1949              
1950             my @histogram = (0) x 100;
1951             foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) {
1952             $histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++;
1953             }
1954             foreach my $i (0..@histogram-1) {
1955             print OUTPUT "$i $histogram[$i]\n";
1956             }
1957             $arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, ";
1958             close OUTPUT;
1959             }
1960             }
1961             $arg_string = $arg_string =~ /^(.*),[ ]+$/;
1962             $arg_string = $1;
1963              
1964             if ($visualization_data_field_width > 2) {
1965             $plot->gnuplot_cmd( 'set terminal png',
1966             'set output "posterior_prob_plot.png"');
1967             $plot->gnuplot_cmd( "splot $arg_string" );
1968             } elsif ($visualization_data_field_width == 2) {
1969             $plot->gnuplot_cmd( 'set terminal png',
1970             'set output "posterior_prob_plot.png"');
1971             $plot->gnuplot_cmd( "plot $arg_string" );
1972             } elsif ($visualization_data_field_width == 1) {
1973             $plot->gnuplot_cmd( 'set terminal png',
1974             'set output "posterior_prob_plot.png"');
1975             $plot->gnuplot_cmd( "plot $arg_string" );
1976             }
1977             }
1978              
1979             # The method shown below should be called only AFTER you have called the method
1980             # read_data_from_file(). The visualize_data() is meant for the visualization of the
1981             # original data in its various 2D or 3D subspaces.
1982             sub visualize_data {
1983             my $self = shift;
1984             my $v_mask = shift || die "visualization mask missing";
1985              
1986             my $master_datafile = $self->{_datafile};
1987              
1988             my @v_mask = split //, $v_mask;
1989             my $visualization_mask_width = @v_mask;
1990             my $original_data_mask = $self->{_mask};
1991             my @mask = split //, $original_data_mask;
1992             my $data_field_width = scalar grep {$_ eq '1'} @mask;
1993             die "\n\nABORTED: The width of the visualization mask (including " .
1994             "all its 1s and 0s) must equal the width of the original mask " .
1995             "used for reading the data file (counting only the 1's)"
1996             if $visualization_mask_width != $data_field_width;
1997             my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
1998             my %visualization_data;
1999             my $data_source = $self->{_data};
2000             while ( my ($record_id, $data) = each %{$data_source} ) {
2001             my @fields = @$data;
2002             die "\nABORTED: Visualization mask size exceeds data record size"
2003             if $#v_mask > $#fields;
2004             my @data_fields;
2005             foreach my $i (0..@fields-1) {
2006             if ($v_mask[$i] eq '0') {
2007             next;
2008             } elsif ($v_mask[$i] eq '1') {
2009             push @data_fields, $fields[$i];
2010             } else {
2011             die "Misformed visualization mask. It can only have 1s and 0s";
2012             }
2013             }
2014             $visualization_data{ $record_id } = \@data_fields;
2015             }
2016             my $filename = basename($master_datafile);
2017             my $temp_file;
2018             $temp_file = "__temp_data_" . $filename;
2019             unlink $temp_file if -e $temp_file;
2020             open OUTPUT, ">$temp_file"
2021             or die "Unable to open a temp file in this directory: $!";
2022             foreach my $datapoint (values %visualization_data) {
2023             print OUTPUT "@$datapoint";
2024             print OUTPUT "\n";
2025             }
2026             close OUTPUT;
2027             my $plot = Graphics::GnuplotIF->new( persist => 1 );
2028             $plot->gnuplot_cmd( "set noclip" );
2029             $plot->gnuplot_cmd( "set pointsize 2" );
2030             my $plot_title = '"original data provided for EM"';
2031             my $arg_string ;
2032             if ($visualization_data_field_width > 2) {
2033             $arg_string = "\"$temp_file\" using 1:2:3 title $plot_title with points lt -1 pt 1";
2034             } elsif ($visualization_data_field_width == 2) {
2035             $arg_string = "\"$temp_file\" using 1:2 title $plot_title with points lt -1 pt 1";
2036             } elsif ($visualization_data_field_width == 1 ) {
2037             open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
2038             my @all_data = ;
2039             close INPUT;
2040             @all_data = map {chomp $_; $_} @all_data;
2041             @all_data = grep $_, @all_data;
2042             my ($minval,$maxval) = minmax(\@all_data);
2043             my $delta = ($maxval - $minval) / 100.0;
2044             $plot->gnuplot_cmd("set boxwidth 3");
2045             $plot->gnuplot_cmd("set style fill solid border -1");
2046             $plot->gnuplot_cmd("set ytics out nomirror");
2047             $plot->gnuplot_cmd("set style data histograms");
2048             $plot->gnuplot_cmd("set style histogram clustered");
2049             $plot->gnuplot_cmd("set title 'Overall distribution of 1D data'");
2050             $plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
2051             my $localfilename = basename($filename);
2052             my $temp_file = "__temp1dhist_" . $localfilename;
2053             unlink $temp_file if -e $temp_file;
2054             open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
2055             print OUTPUT "Xstep histval\n";
2056             my @histogram = (0) x 100;
2057             foreach my $i (0..@all_data-1) {
2058             $histogram[int( ($all_data[$i] - $minval) / $delta )]++;
2059             }
2060             foreach my $i (0..@histogram-1) {
2061             print OUTPUT "$i $histogram[$i]\n";
2062             }
2063             $arg_string = "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc rgb 'green'";
2064             close OUTPUT;
2065             }
2066             if ($visualization_data_field_width > 2) {
2067             $plot->gnuplot_cmd( "splot $arg_string" );
2068             } elsif ($visualization_data_field_width == 2) {
2069             $plot->gnuplot_cmd( "plot $arg_string" );
2070             } elsif ($visualization_data_field_width == 1) {
2071             $plot->gnuplot_cmd( "plot $arg_string" );
2072             }
2073             }
2074              
2075             # This method is the same as the one shown above, except that it is meant for
2076             # creating PNG files of the displays.
2077             sub plot_hardcopy_data {
2078             my $self = shift;
2079             my $v_mask = shift || die "visualization mask missing";
2080             my $master_datafile = $self->{_datafile};
2081             my @v_mask = split //, $v_mask;
2082             my $visualization_mask_width = @v_mask;
2083             my $original_data_mask = $self->{_mask};
2084             my @mask = split //, $original_data_mask;
2085             my $data_field_width = scalar grep {$_ eq '1'} @mask;
2086             die "\n\nABORTED: The width of the visualization mask (including " .
2087             "all its 1s and 0s) must equal the width of the original mask " .
2088             "used for reading the data file (counting only the 1's)"
2089             if $visualization_mask_width != $data_field_width;
2090             my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
2091             my %visualization_data;
2092             my $data_source = $self->{_data};
2093             while ( my ($record_id, $data) = each %{$data_source} ) {
2094             my @fields = @$data;
2095             die "\nABORTED: Visualization mask size exceeds data record size"
2096             if $#v_mask > $#fields;
2097             my @data_fields;
2098             foreach my $i (0..@fields-1) {
2099             if ($v_mask[$i] eq '0') {
2100             next;
2101             } elsif ($v_mask[$i] eq '1') {
2102             push @data_fields, $fields[$i];
2103             } else {
2104             die "Misformed visualization mask. It can only have 1s and 0s";
2105             }
2106             }
2107             $visualization_data{ $record_id } = \@data_fields;
2108             }
2109             my $filename = basename($master_datafile);
2110             my $temp_file;
2111             $temp_file = "__temp_data_" . $filename;
2112             unlink $temp_file if -e $temp_file;
2113             open OUTPUT, ">$temp_file"
2114             or die "Unable to open a temp file in this directory: $!";
2115             foreach my $datapoint (values %visualization_data) {
2116             print OUTPUT "@$datapoint";
2117             print OUTPUT "\n";
2118             }
2119             close OUTPUT;
2120              
2121             my $plot = Graphics::GnuplotIF->new( persist => 1 );
2122             $plot->gnuplot_cmd( "set noclip" );
2123             $plot->gnuplot_cmd( "set pointsize 2" );
2124             my $plot_title = '"original data provided for EM"';
2125             my $arg_string ;
2126             if ($visualization_data_field_width > 2) {
2127             $arg_string = "\"$temp_file\" using 1:2:3 title $plot_title with points lt -1 pt 1";
2128             } elsif ($visualization_data_field_width == 2) {
2129             $arg_string = "\"$temp_file\" using 1:2 title $plot_title with points lt -1 pt 1";
2130             } elsif ($visualization_data_field_width == 1 ) {
2131             open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
2132             my @all_data = ;
2133             close INPUT;
2134             @all_data = map {chomp $_; $_} @all_data;
2135             @all_data = grep $_, @all_data;
2136             my ($minval,$maxval) = minmax(\@all_data);
2137             my $delta = ($maxval - $minval) / 100.0;
2138             $plot->gnuplot_cmd("set boxwidth 3");
2139             $plot->gnuplot_cmd("set style fill solid border -1");
2140             $plot->gnuplot_cmd("set ytics out nomirror");
2141             $plot->gnuplot_cmd("set style data histograms");
2142             $plot->gnuplot_cmd("set style histogram clustered");
2143             $plot->gnuplot_cmd("set title 'Overall distribution of 1D data'");
2144             $plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
2145             my $localfilename = basename($filename);
2146             my $temp_file = "__temp1dhist_" . $localfilename;
2147             unlink $temp_file if -e $temp_file;
2148             open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
2149             print OUTPUT "Xstep histval\n";
2150             my @histogram = (0) x 100;
2151             foreach my $i (0..@all_data-1) {
2152             $histogram[int( ($all_data[$i] - $minval) / $delta )]++;
2153             }
2154             foreach my $i (0..@histogram-1) {
2155             print OUTPUT "$i $histogram[$i]\n";
2156             }
2157             $arg_string = "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc rgb 'green'";
2158             close OUTPUT;
2159             }
2160             if ($visualization_data_field_width > 2) {
2161             $plot->gnuplot_cmd( 'set terminal png',
2162             'set output "data_scatter_plot.png"');
2163             $plot->gnuplot_cmd( "splot $arg_string" );
2164             } elsif ($visualization_data_field_width == 2) {
2165             $plot->gnuplot_cmd( 'set terminal png',
2166             'set output "data_scatter_plot.png"');
2167             $plot->gnuplot_cmd( "plot $arg_string" );
2168             } elsif ($visualization_data_field_width == 1) {
2169             $plot->gnuplot_cmd( 'set terminal png',
2170             'set output "data_scatter_plot.png"');
2171             $plot->gnuplot_cmd( "plot $arg_string" );
2172             }
2173             }
2174              
2175              
2176             ################### Generating Synthetic Data for Clustering ###################
2177              
2178             # The data generated corresponds to a multivariate distribution. The mean and the
2179             # covariance of each Gaussian in the distribution are specified individually in a
2180             # parameter file. The parameter file must also state the prior probabilities to be
2181             # associated with each Gaussian. See the example parameter file param1.txt in the
2182             # examples directory. Just edit this file for your own needs.
2183             #
2184             # The multivariate random numbers are generated by calling the Math::Random module.
2185             # As you would expect, that module will insist that the covariance matrix you
2186             # specify be symmetric and positive definite.
2187             sub cluster_data_generator {
2188             my $class = shift;
2189             die "illegal call of a class method"
2190             unless $class eq 'Algorithm::ExpectationMaximization';
2191             my %args = @_;
2192             my $input_parameter_file = $args{input_parameter_file};
2193             my $output_file = $args{output_datafile};
2194             my $N = $args{total_number_of_data_points};
2195             my @all_params;
2196             my $param_string;
2197             if (defined $input_parameter_file) {
2198             open INPUT, $input_parameter_file || "unable to open parameter file: $!";
2199             @all_params = ;
2200             @all_params = grep { $_ !~ /^[ ]*#/ } @all_params;
2201             chomp @all_params;
2202             $param_string = join ' ', @all_params;
2203             } else {
2204             # Just for testing. Used in t/test.t
2205             $param_string = "priors 0.34 0.33 0.33 " .
2206             "cluster 5 0 0 1 0 0 0 1 0 0 0 1 " .
2207             "cluster 0 5 0 1 0 0 0 1 0 0 0 1 " .
2208             "cluster 0 0 5 1 0 0 0 1 0 0 0 1";
2209             }
2210             my ($priors_string) = $param_string =~ /priors(.+?)cluster/;
2211             croak "You did not specify the prior probabilities in the parameter file"
2212             unless $priors_string;
2213             my @priors = split /\s+/, $priors_string;
2214             @priors = grep {/$_num_regex/; $_} @priors;
2215             my $sum = 0;
2216             foreach my $prior (@priors) {
2217             $sum += $prior;
2218             }
2219             croak "Your priors in the parameter file do not add up to 1" unless $sum == 1;
2220             my ($rest_of_string) = $param_string =~ /priors\s*$priors_string(.*)$/;
2221             my @cluster_strings = split /[ ]*cluster[ ]*/, $rest_of_string;
2222             @cluster_strings = grep $_, @cluster_strings;
2223              
2224             my $K = @cluster_strings;
2225             croak "Too many clusters requested" if $K > 12;
2226             croak "Mismatch between the number of values for priors and the number " .
2227             "of clusters" unless $K == @priors;
2228             my @point_labels = ('a'..'z');
2229             print "Prior probabilities recorded from param file: @priors\n";
2230             print "Number of Gaussians used for the synthetic data: $K\n";
2231             my @means;
2232             my @covariances;
2233             my $data_dimension;
2234             foreach my $i (0..$K-1) {
2235             my @num_strings = split / /, $cluster_strings[$i];
2236             my @cluster_mean = map {/$_num_regex/;$_} split / /, $num_strings[0];
2237             $data_dimension = @cluster_mean;
2238             push @means, \@cluster_mean;
2239             my @covariance_nums = map {/$_num_regex/;$_} split / /, $num_strings[1];
2240             croak "dimensionality error" if @covariance_nums !=
2241             ($data_dimension ** 2);
2242             my $cluster_covariance;
2243             foreach my $j (0..$data_dimension-1) {
2244             foreach my $k (0..$data_dimension-1) {
2245             $cluster_covariance->[$j]->[$k] =
2246             $covariance_nums[$j*$data_dimension + $k];
2247             }
2248             }
2249             push @covariances, $cluster_covariance;
2250             }
2251             random_seed_from_phrase( 'hellomellojello' );
2252             my @data_dump;
2253             foreach my $i (0..$K-1) {
2254             my @m = @{shift @means};
2255             my @covar = @{shift @covariances};
2256             my @new_data = Math::Random::random_multivariate_normal(
2257             int($N * $priors[$i]), @m, @covar );
2258             my $p = 0;
2259             my $label = $point_labels[$i];
2260             @new_data = map {unshift @$_, $label.$i; $i++; $_} @new_data;
2261             push @data_dump, @new_data;
2262             }
2263             fisher_yates_shuffle( \@data_dump );
2264             open OUTPUT, ">$output_file";
2265             print OUTPUT "\#Data generated from the parameter file: $input_parameter_file\n"
2266             if $input_parameter_file;
2267             print OUTPUT "\#Total number of data points in this file: $N\n";
2268             print OUTPUT "\#Prior class probabilities for this data: @priors\n";
2269             foreach my $ele (@data_dump) {
2270             foreach my $coord ( @$ele ) {
2271             print OUTPUT "$coord ";
2272             }
2273             print OUTPUT "\n";
2274             }
2275             print "Data written out to file $output_file\n";
2276             close OUTPUT;
2277             }
2278              
2279             sub add_point_coords {
2280             my $self = shift;
2281             my @arr_of_ids = @{shift @_}; # array of data element names
2282             my @result;
2283             my $data_dimensionality = $self->{_data_dimensions};
2284             foreach my $i (0..$data_dimensionality-1) {
2285             $result[$i] = 0.0;
2286             }
2287             foreach my $id (@arr_of_ids) {
2288             my $ele = $self->{_data}->{$id};
2289             my $i = 0;
2290             foreach my $component (@$ele) {
2291             $result[$i] += $component;
2292             $i++;
2293             }
2294             }
2295             return \@result;
2296             }
2297              
2298             ###################### Support Routines ########################
2299              
2300             # computer the outer product of two column vectors
2301             sub outer_product {
2302             my $vec1 = shift;
2303             my $vec2 = shift;
2304             my ($nrows1, $ncols1) = ($vec1->rows(), $vec1->cols());
2305             my ($nrows2, $ncols2) = ($vec2->rows(), $vec2->cols());
2306             die "Outer product operation called with non-matching vectors"
2307             unless $ncols1 == 1 && $ncols2 == 1 && $nrows1 == $nrows2;
2308             my @vec_arr1 = $vec1->as_list();
2309             my @vec_arr2 = $vec2->as_list();
2310             my $outer_product = Math::GSL::Matrix->new($nrows1, $nrows2);
2311             foreach my $index (0..$nrows1-1) {
2312             my @new_row = map $vec_arr1[$index] * $_, @vec_arr2;
2313             $outer_product->set_row($index, \@new_row);
2314             }
2315             return $outer_product;
2316             }
2317              
2318             sub get_index_at_value {
2319             my $value = shift;
2320             my @array = @{shift @_};
2321             foreach my $i (0..@array-1) {
2322             return $i if $value == $array[$i];
2323             }
2324             }
2325              
2326             # This routine is really not necessary in light of the new `~~' operator in Perl.
2327             # Will use the new operator in the next version.
2328             sub vector_equal {
2329             my $vec1 = shift;
2330             my $vec2 = shift;
2331             die "wrong data types for distance calculation" if @$vec1 != @$vec2;
2332             foreach my $i (0..@$vec1-1){
2333             return 0 if $vec1->[$i] != $vec2->[$i];
2334             }
2335             return 1;
2336             }
2337              
2338             sub compare_array_floats {
2339             my $vec1 = shift;
2340             my $vec2 = shift;
2341             foreach my $i (0..@$vec1-1){
2342             return 0 if abs($vec1->[$i] - $vec2->[$i]) > 0.00001;
2343             }
2344             return 1;
2345             }
2346              
2347             # Returns the minimum value and its positional index in an array
2348             sub minimum {
2349             my $arr = shift;
2350             my $min;
2351             my $index;
2352             foreach my $i (0..@{$arr}-1) {
2353             if ( (!defined $min) || ($arr->[$i] < $min) ) {
2354             $index = $i;
2355             $min = $arr->[$i];
2356             }
2357             }
2358             return ($min, $index);
2359             }
2360              
2361             sub minmax {
2362             my $arr = shift;
2363             my $min;
2364             my $max;
2365             foreach my $i (0..@{$arr}-1) {
2366             if ( (!defined $min) && (!defined $max) ) {
2367             $min = $arr->[$i];
2368             $max = $arr->[$i];
2369             } elsif ( $arr->[$i] < $min ) {
2370             $min = $arr->[$i];
2371             } elsif ( $arr->[$i] > $max ) {
2372             $max = $arr->[$i];
2373             }
2374             }
2375             return ($min, $max);
2376             }
2377              
2378             # Meant only for constructing a deep copy of an array of
2379             # arrays:
2380             sub deep_copy_AoA {
2381             my $ref_in = shift;
2382             my $ref_out;
2383             foreach my $i (0..@{$ref_in}-1) {
2384             foreach my $j (0..@{$ref_in->[$i]}-1) {
2385             $ref_out->[$i]->[$j] = $ref_in->[$i]->[$j];
2386             }
2387             }
2388             return $ref_out;
2389             }
2390              
2391             # Meant only for constructing a deep copy of an array of arrays for the case when
2392             # some elements of the top-level array may be undefined:
2393             sub deep_copy_AoA_with_nulls {
2394             my $ref_in = shift;
2395             my $ref_out;
2396             foreach my $i (0..@{$ref_in}-1) {
2397             if ( !defined $ref_in->[$i] ) {
2398             $ref_out->[$i] = undef;
2399             next;
2400             }
2401             foreach my $j (0..@{$ref_in->[$i]}-1) {
2402             $ref_out->[$i]->[$j] = $ref_in->[$i]->[$j];
2403             }
2404             }
2405             return $ref_out;
2406             }
2407              
2408             # Meant only for constructing a deep copy of a hash in which each value is an
2409             # anonymous array of numbers:
2410             sub deep_copy_hash {
2411             my $ref_in = shift;
2412             my $ref_out;
2413             while ( my ($key, $value) = each( %$ref_in ) ) {
2414             $ref_out->{$key} = deep_copy_array( $value );
2415             }
2416             return $ref_out;
2417             }
2418              
2419             # Meant only for an array of numbers:
2420             sub deep_copy_array {
2421             my $ref_in = shift;
2422             my $ref_out;
2423             foreach my $i (0..@{$ref_in}-1) {
2424             $ref_out->[$i] = $ref_in->[$i];
2425             }
2426             return $ref_out;
2427             }
2428              
2429             # from perl docs:
2430             sub fisher_yates_shuffle {
2431             my $arr = shift;
2432             my $i = @$arr;
2433             while (--$i) {
2434             my $j = int rand( $i + 1 );
2435             @$arr[$i, $j] = @$arr[$j, $i];
2436             }
2437             }
2438              
2439             sub mean_and_variance {
2440             my @data = @{shift @_};
2441             my ($mean, $variance);
2442             foreach my $i (1..@data) {
2443             if ($i == 1) {
2444             $mean = $data[0];
2445             $variance = 0;
2446             } else {
2447             # data[$i-1] because of zero-based indexing of vector
2448             $mean = ( (($i-1)/$i) * $mean ) + $data[$i-1] / $i;
2449             $variance = ( (($i-1)/$i) * $variance )
2450             + ($data[$i-1]-$mean)**2 / ($i-1);
2451             }
2452             }
2453             return ($mean, $variance);
2454             }
2455              
2456             sub check_for_illegal_params {
2457             my @params = @_;
2458             my @legal_params = qw / datafile
2459             mask
2460             K
2461             terminal_output
2462             max_em_iterations
2463             seeding
2464             class_priors
2465             seed_tags
2466             debug
2467             /;
2468             my $found_match_flag;
2469             foreach my $param (@params) {
2470             foreach my $legal (@legal_params) {
2471             $found_match_flag = 0;
2472             if ($param eq $legal) {
2473             $found_match_flag = 1;
2474             last;
2475             }
2476             }
2477             last if $found_match_flag == 0;
2478             }
2479             return $found_match_flag;
2480             }
2481              
2482             sub get_value_index_hash {
2483             my $arr = shift;
2484             my %hash;
2485             foreach my $index (0..@$arr-1) {
2486             $hash{$arr->[$index]} = $index if $arr->[$index] > 0;
2487             }
2488             return \%hash;
2489             }
2490              
2491             sub non_maximum_supression {
2492             my $arr = shift;
2493             my @output = (0) x @$arr;
2494             my @final_output = (0) x @$arr;
2495             my %hash;
2496             my @array_of_runs = ([$arr->[0]]);
2497             foreach my $index (1..@$arr-1) {
2498             if ($arr->[$index] == $arr->[$index-1]) {
2499             push @{$array_of_runs[-1]}, $arr->[$index];
2500             } else {
2501             push @array_of_runs, [$arr->[$index]];
2502             }
2503             }
2504             my $runstart_index = 0;
2505             foreach my $run_index (1..@array_of_runs-2) {
2506             $runstart_index += @{$array_of_runs[$run_index-1]};
2507             if ($array_of_runs[$run_index]->[0] >
2508             $array_of_runs[$run_index-1]->[0] &&
2509             $array_of_runs[$run_index]->[0] >
2510             $array_of_runs[$run_index+1]->[0]) {
2511             my $run_center = @{$array_of_runs[$run_index]} / 2;
2512             my $assignment_index = $runstart_index + $run_center;
2513             $output[$assignment_index] = $arr->[$assignment_index];
2514             }
2515             }
2516             if ($array_of_runs[-1]->[0] > $array_of_runs[-2]->[0]) {
2517             $runstart_index += @{$array_of_runs[-2]};
2518             my $run_center = @{$array_of_runs[-1]} / 2;
2519             my $assignment_index = $runstart_index + $run_center;
2520             $output[$assignment_index] = $arr->[$assignment_index];
2521             }
2522             if ($array_of_runs[0]->[0] > $array_of_runs[1]->[0]) {
2523             my $run_center = @{$array_of_runs[0]} / 2;
2524             $output[$run_center] = $arr->[$run_center];
2525             }
2526             return \@output;
2527             }
2528              
2529             sub display_matrix {
2530             my $message = shift;
2531             my $matrix = shift;
2532             if (!defined blessed($matrix)) {
2533             print "display_matrix called on a scalar value: $matrix\n";
2534             return;
2535             }
2536             my $nrows = $matrix->rows();
2537             my $ncols = $matrix->cols();
2538             print "$message ($nrows rows and $ncols columns)\n";
2539             foreach my $i (0..$nrows-1) {
2540             my $row = $matrix->row($i);
2541             my @row_as_list = $row->as_list;
2542             print "@row_as_list\n";
2543             }
2544             print "\n";
2545             }
2546              
2547             sub transpose {
2548             my $matrix = shift;
2549             my $num_rows = $matrix->rows();
2550             my $num_cols = $matrix->cols();
2551             my $transpose = Math::GSL::Matrix->new($num_cols, $num_rows);
2552             foreach my $i (0..$num_rows-1) {
2553             my @row = $matrix->row($i)->as_list;
2554             $transpose->set_col($i, \@row );
2555             }
2556             return $transpose;
2557             }
2558              
2559             sub vector_multiply {
2560             my $vec1 = shift;
2561             my $vec2 = shift;
2562             die "vec_multiply called with two vectors of different sizes"
2563             unless @$vec1 == @$vec2;
2564             my $result = 0;
2565             foreach my $i (0..@$vec1-1) {
2566             $result += $vec1->[$i] * $vec2->[$i];
2567             }
2568             return $result;
2569             }
2570              
2571             sub vector_2_vector_multiply {
2572             my $vec1 = shift;
2573             my $vec2 = shift;
2574             die "vec_multiply called with two vectors of different sizes"
2575             unless @$vec1 == @$vec2;
2576             my @result_vec;
2577             foreach my $i (0..@$vec1-1) {
2578             $result_vec[$i] = $vec1->[$i] * $vec2->[$i];
2579             }
2580             return \@result_vec;
2581             }
2582              
2583             sub matrix_multiply {
2584             my $matrix1 = shift;
2585             my $matrix2 = shift;
2586             my ($nrows1, $ncols1) = ($matrix1->rows(), $matrix1->cols());
2587             my ($nrows2, $ncols2) = ($matrix2->rows(), $matrix2->cols());
2588             die "matrix multiplication called with non-matching matrix arguments"
2589             # unless $ncols1 == $nrows2;
2590             unless $nrows1 == $ncols2 && $ncols1 == $nrows2;
2591             if ($nrows1 == 1) {
2592             my @row = $matrix1->row(0)->as_list;
2593             my @col = $matrix2->col(0)->as_list;
2594             my $result;
2595             foreach my $j (0..$ncols1-1) {
2596             $result += $row[$j] * $col[$j];
2597             }
2598             return $result;
2599             }
2600             my $product = Math::GSL::Matrix->new($nrows1, $nrows1);
2601             foreach my $i (0..$nrows1-1) {
2602             my $row = $matrix1->row($i);
2603             my @product_row;
2604             foreach my $j (0..$ncols2-1) {
2605             my $col = $matrix2->col($j);
2606             my $row_times_col = matrix_multiply($row, $col);
2607             push @product_row, $row_times_col;
2608             }
2609             $product->set_row($i, \@product_row);
2610             }
2611             return $product;
2612             }
2613              
2614             sub vector_matrix_multiply {
2615             my $matrix1 = shift;
2616             my $matrix2 = shift;
2617             my ($nrows1, $ncols1) = ($matrix1->rows, $matrix1->cols);
2618             my ($nrows2, $ncols2) = ($matrix2->rows, $matrix2->cols);
2619             die "matrix multiplication called with non-matching matrix arguments"
2620             unless $nrows1 == 1 && $ncols1 == $nrows2;
2621             if ($ncols2 == 1) {
2622             my @row = $matrix1->row(0)->as_list;
2623             my @col = $matrix2->col(0)->as_list;
2624             my $result;
2625             foreach my $j (0..$ncols1-1) {
2626             $result += $row[$j] * $col[$j];
2627             }
2628             return $result;
2629             }
2630             my $product = Math::GSL::Matrix->new(1, $ncols2);
2631             my $row = $matrix1->row(0);
2632             my @product_row;
2633             foreach my $j (0..$ncols2-1) {
2634             my $col = $matrix2->col($j);
2635             my $row_times_col = vector_matrix_multiply($row, $col);
2636             push @product_row, $row_times_col;
2637             }
2638             $product->set_row(0, \@product_row);
2639             return $product;
2640             }
2641              
2642             sub matrix_vector_multiply {
2643             my $matrix1 = shift;
2644             my $matrix2 = shift;
2645             my ($nrows1, $ncols1) = ($matrix1->rows(), $matrix1->cols());
2646             my ($nrows2, $ncols2) = ($matrix2->rows(), $matrix2->cols());
2647             die "matrix multiplication called with non-matching matrix arguments"
2648             unless $ncols1 == $nrows2 && $ncols2 == 1;
2649             if ($nrows1 == 1) {
2650             my @row = $matrix1->row(0)->as_list;
2651             my @col = $matrix2->col(0)->as_list;
2652             my $result;
2653             foreach my $j (0..$ncols1-1) {
2654             $result += $row[$j] * $col[$j];
2655             }
2656             return $result;
2657             }
2658             my $product = Math::GSL::Matrix->new($nrows1, 1);
2659             my $col = $matrix2->col(0);
2660             my @product_col;
2661             foreach my $i (0..$nrows1-1) {
2662             my $row = $matrix1->row($i);
2663             my $row_times_col = matrix_vector_multiply($row, $col);
2664             push @product_col, $row_times_col;
2665             }
2666             $product->set_col(0, \@product_col);
2667             return $product;
2668             }
2669              
2670             sub matrix_trace {
2671             my $matrix = shift;
2672             my ($nrows, $ncols) = ($matrix->rows(), $matrix->cols());
2673             die "trace can only be calculated for a square matrix"
2674             unless $ncols == $nrows;
2675             my @elements = $matrix->as_list;
2676             my $trace = 0;
2677             foreach my $i (0..$nrows-1) {
2678             $trace += $elements[$i + $i * $ncols];
2679             }
2680             return $trace;
2681             }
2682              
2683             1;
2684              
2685             =pod
2686              
2687             =head1 NAME
2688              
2689             Algorithm::ExpectationMaximization -- A Perl module for clustering numerical
2690             multi-dimensional data with the Expectation-Maximization algorithm.
2691              
2692             =head1 SYNOPSIS
2693              
2694             use Algorithm::ExpectationMaximization;
2695              
2696             # First name the data file:
2697              
2698             my $datafile = "mydatafile.csv";
2699              
2700             # Next, set the mask to indicate which columns of the datafile to use for
2701             # clustering and which column contains a symbolic ID for each data record. For
2702             # example, if the symbolic name is in the first column, you want the second column
2703             # to be ignored, and you want the next three columns to be used for 3D clustering:
2704              
2705             my $mask = "N0111";
2706              
2707             # Now construct an instance of the clusterer. The parameter `K' controls the
2708             # number of clusters. Here is an example call to the constructor for instance
2709             # creation:
2710              
2711             my $clusterer = Algorithm::ExpectationMaximization->new(
2712             datafile => $datafile,
2713             mask => $mask,
2714             K => 3,
2715             max_em_iterations => 300,
2716             seeding => 'random',
2717             terminal_output => 1,
2718             );
2719            
2720             # Note the choice for `seeding'. The choice `random' means that the clusterer will
2721             # randomly select `K' data points to serve as initial cluster centers. Other
2722             # possible choices for the constructor parameter `seeding' are `kmeans' and
2723             # `manual'. With the `kmeans' option for `seeding', the output of a K-means
2724             # clusterer is used for the cluster seeds and the initial cluster covariances. If
2725             # you use the `manual' option for seeding, you must also specify the data elements
2726             # to use for seeding the clusters.
2727              
2728             # Here is an example of a call to the constructor when we choose the `manual'
2729             # option for seeding the clusters and for specifying the data elements for
2730             # seeding. The data elements are specified by their tag names. In this case,
2731             # these names are `a26', `b53', and `c49':
2732              
2733             my $clusterer = Algorithm::ExpectationMaximization->new(
2734             datafile => $datafile,
2735             mask => $mask,
2736             class_priors => [0.6, 0.2, 0.2],
2737             K => 3,
2738             max_em_iterations => 300,
2739             seeding => 'manual',
2740             seed_tags => ['a26', 'b53', 'c49'],
2741             terminal_output => 1,
2742             );
2743              
2744             # This example call to the constructor also illustrates how you can inject class
2745             # priors into the clustering process. The class priors are the prior probabilities
2746             # of the class distributions in your dataset. As explained later, injecting class
2747             # priors in the manner shown above makes statistical sense only for the case of
2748             # manual seeding. When you do inject class priors, the order in which the priors
2749             # are expressed must correspond to the manually specified seeds for the clusters.
2750              
2751             # After the invocation of the constructor, the following calls are mandatory
2752             # for reasons that should be obvious from the names of the methods:
2753              
2754             $clusterer->read_data_from_file();
2755             srand(time);
2756             $clusterer->seed_the_clusters();
2757             $clusterer->EM();
2758             $clusterer->run_bayes_classifier();
2759             my $clusters = $clusterer->return_disjoint_clusters();
2760              
2761             # where the call to `EM()' is the invocation of the expectation-maximization
2762             # algorithm. The call to `srand(time)' is to seed the pseudo random number
2763             # generator afresh for each run of the cluster seeding procedure. If you want to
2764             # see repeatable results from one run to another of the algorithm with random
2765             # seeding, you would obviously not invoke `srand(time)'.
2766              
2767             # The call `run_bayes_classifier()' shown above carries out a disjoint clustering
2768             # of all the data points using the naive Bayes' classifier. And the call
2769             # `return_disjoint_clusters()' returns the clusters thus formed to you. Once you
2770             # have obtained access to the clusters in this manner, you can display them in
2771             # your terminal window by
2772              
2773             foreach my $index (0..@$clusters-1) {
2774             print "Cluster $index (Naive Bayes): @{$clusters->[$index]}\n\n"
2775             }
2776              
2777             # If you would like to also see the clusters purely on the basis of the posterior
2778             # class probabilities exceeding a threshold, call
2779              
2780             my $theta1 = 0.2;
2781             my $posterior_prob_clusters =
2782             $clusterer->return_clusters_with_posterior_probs_above_threshold($theta1);
2783              
2784             # where you can obviously set the threshold $theta1 to any value you wish. Note
2785             # that now you may end up with clusters that overlap. You can display them in
2786             # your terminal window in the same manner as shown above for the naive Bayes'
2787             # clusters.
2788              
2789             # You can write the naive Bayes' clusters out to files, one cluster per file, by
2790             # calling
2791              
2792             $clusterer->write_naive_bayes_clusters_to_files();
2793              
2794             # The clusters are placed in files with names like
2795              
2796             naive_bayes_cluster1.txt
2797             naive_bayes_cluster2.txt
2798             ...
2799              
2800             # In the same manner, you can write out the posterior probability based possibly
2801             # overlapping clusters to files by calling:
2802              
2803             $clusterer->write_posterior_prob_clusters_above_threshold_to_files($theta1);
2804              
2805             # where the threshold $theta1 sets the probability threshold for deciding which
2806             # data elements to place in a cluster. These clusters are placed in files with
2807             # names like
2808              
2809             posterior_prob_cluster1.txt
2810             posterior_prob_cluster2.txt
2811             ...
2812              
2813             # CLUSTER VISUALIZATION:
2814              
2815             # You must first set the mask for cluster visualization. This mask tells the
2816             # module which 2D or 3D subspace of the original data space you wish to visualize
2817             # the clusters in:
2818              
2819             my $visualization_mask = "111";
2820             $clusterer->visualize_clusters($visualization_mask);
2821             $clusterer->visualize_distributions($visualization_mask);
2822             $clusterer->plot_hardcopy_clusters($visualization_mask);
2823             $clusterer->plot_hardcopy_distributions($visualization_mask);
2824              
2825             # where the last two invocations are for writing out the PNG plots of the
2826             # visualization displays to disk files. The PNG image of the posterior
2827             # probability distributions is written out to a file named posterior_prob_plot.png
2828             # and the PNG image of the disjoint clusters to a file called cluster_plot.png.
2829              
2830             # SYNTHETIC DATA GENERATION:
2831              
2832             # The module has been provided with a class method for generating multivariate
2833             # data for experimenting with the EM algorithm. The data generation is controlled
2834             # by the contents of a parameter file that is supplied as an argument to the data
2835             # generator method. The priors, the means, and the covariance matrices in the
2836             # parameter file must be according to the syntax shown in the `param1.txt' file in
2837             # the `examples' directory. It is best to edit a copy of this file for your
2838             # synthetic data generation needs.
2839              
2840             my $parameter_file = "param1.txt";
2841             my $out_datafile = "mydatafile1.dat";
2842             Algorithm::ExpectationMaximization->cluster_data_generator(
2843             input_parameter_file => $parameter_file,
2844             output_datafile => $out_datafile,
2845             total_number_of_data_points => $N );
2846              
2847             # where the value of $N is the total number of data points you would like to see
2848             # generated for all of the Gaussians. How this total number is divided up amongst
2849             # the Gaussians is decided by the prior probabilities for the Gaussian components
2850             # as declared in input parameter file. The synthetic data may be visualized in a
2851             # terminal window and the visualization written out as a PNG image to a diskfile
2852             # by
2853              
2854             my $data_visualization_mask = "11";
2855             $clusterer->visualize_data($data_visualization_mask);
2856             $clusterer->plot_hardcopy_data($data_visualization_mask);
2857              
2858              
2859             =head1 CHANGES
2860              
2861             Version 1.22 should work with data in CSV files.
2862              
2863             Version 1.21 incorporates minor code clean up. Overall, the module implementation
2864             remains unchanged.
2865              
2866             Version 1.2 allows the module to also be used for 1-D data. The visualization code
2867             for 1-D shows the clusters through their histograms.
2868              
2869             Version 1.1 incorporates much cleanup of the documentation associated with the
2870             module. Both the top-level module documentation, especially the Description part,
2871             and the comments embedded in the code were revised for better utilization of the
2872             module. The basic implementation code remains unchanged.
2873              
2874              
2875             =head1 DESCRIPTION
2876              
2877             B is a I module for the
2878             Expectation-Maximization (EM) method of clustering numerical data that lends itself
2879             to modeling as a Gaussian mixture. Since the module is entirely in Perl (in the
2880             sense that it is not a Perl wrapper around a C library that actually does the
2881             clustering), the code in the module can easily be modified to experiment with several
2882             aspects of EM.
2883              
2884             Gaussian Mixture Modeling (GMM) is based on the assumption that the data consists of
2885             C Gaussian components, each characterized by its own mean vector and its own
2886             covariance matrix. Obviously, given observed data for clustering, we do not know
2887             which of the C Gaussian components was responsible for any of the data elements.
2888             GMM also associates a prior probability with each Gaussian component. In general,
2889             these priors will also be unknown. So the problem of clustering consists of
2890             estimating the posterior class probability at each data element and also estimating
2891             the class priors. Once these posterior class probabilities and the priors are
2892             estimated with EM, we can use the naive Bayes' classifier to partition the data into
2893             disjoint clusters. Or, for "soft" clustering, we can find all the data elements that
2894             belong to a Gaussian component on the basis of the posterior class probabilities at
2895             the data elements exceeding a prescribed threshold.
2896              
2897             If you do not mind the fact that it is possible for the EM algorithm to occasionally
2898             get stuck in a local maximum and to, therefore, produce a wrong answer even when you
2899             know the data to be perfectly multimodal Gaussian, EM is probably the most magical
2900             approach to clustering multidimensional data. Consider the case of clustering
2901             three-dimensional data. Each Gaussian cluster in 3D space is characterized by the
2902             following 10 variables: the 6 unique elements of the C<3x3> covariance matrix (which
2903             must be symmetric positive-definite), the 3 unique elements of the mean, and the
2904             prior associated with the Gaussian. Now let's say you expect to see six Gaussians in
2905             your data. What that means is that you would want the values for 59 variables
2906             (remember the unit-summation constraint on the class priors which reduces the overall
2907             number of variables by one) to be estimated by the algorithm that seeks to discover
2908             the clusters in your data. What's amazing is that, despite the large number of
2909             variables that must be optimized simultaneously, the EM algorithm will very likely
2910             give you a good approximation to the right answer.
2911              
2912             At its core, EM depends on the notion of unobserved data and the averaging of the
2913             log-likelihood of the data actually observed over all admissible probabilities for
2914             the unobserved data. But what is unobserved data? While in some cases where EM is
2915             used, the unobserved data is literally the missing data, in others, it is something
2916             that cannot be seen directly but that nonetheless is relevant to the data actually
2917             observed. For the case of clustering multidimensional numerical data that can be
2918             modeled as a Gaussian mixture, it turns out that the best way to think of the
2919             unobserved data is in terms of a sequence of random variables, one for each observed
2920             data point, whose values dictate the selection of the Gaussian for that data point.
2921             This point is explained in great detail in my on-line tutorial at
2922             L.
2923              
2924             The EM algorithm in our context reduces to an iterative invocation of the following
2925             steps: (1) Given the current guess for the means and the covariances of the different
2926             Gaussians in our mixture model, use Bayes' Rule to update the posterior class
2927             probabilities at each of the data points; (2) Using the updated posterior class
2928             probabilities, first update the class priors; (3) Using the updated class priors,
2929             update the class means and the class covariances; and go back to Step (1). Ideally,
2930             the iterations should terminate when the expected log-likelihood of the observed data
2931             has reached a maximum and does not change with any further iterations. The stopping
2932             rule used in this module is the detection of no change over three consecutive
2933             iterations in the values calculated for the priors.
2934              
2935             This module provides three different choices for seeding the clusters: (1) random,
2936             (2) kmeans, and (3) manual. When random seeding is chosen, the algorithm randomly
2937             selects C data elements as cluster seeds. That is, the data vectors associated
2938             with these seeds are treated as initial guesses for the means of the Gaussian
2939             distributions. The covariances are then set to the values calculated from the entire
2940             dataset with respect to the means corresponding to the seeds. With kmeans seeding, on
2941             the other hand, the means and the covariances are set to whatever values are returned
2942             by the kmeans algorithm. And, when seeding is set to manual, you are allowed to
2943             choose C data elements --- by specifying their tag names --- for the seeds. The
2944             rest of the EM initialization for the manual mode is the same as for the random mode.
2945             The algorithm allows for the initial priors to be specified for the manual mode of
2946             seeding.
2947              
2948             Much of code for the kmeans based seeding of EM was drawn from the
2949             C module by me. The code from that module used here corresponds to
2950             the case when the C option in the C module is set
2951             to C. The C option for KMeans consists of subjecting the data to a
2952             principal components analysis (PCA) to discover the direction of maximum variance in
2953             the data space. The data points are then projected on to this direction and a
2954             histogram constructed from the projections. Centers of the C largest peaks in
2955             this smoothed histogram are used to seed the KMeans based clusterer. As you'd
2956             expect, the output of the KMeans used to seed the EM algorithm.
2957              
2958             This module uses two different criteria to measure the quality of the clustering
2959             achieved. The first is the Minimum Description Length (MDL) proposed originally by
2960             Rissanen (J. Rissanen: "Modeling by Shortest Data Description," Automatica, 1978, and
2961             "A Universal Prior for Integers and Estimation by Minimum Description Length," Annals
2962             of Statistics, 1983.) The MDL criterion is a difference of a log-likelihood term for
2963             all of the observed data and a model-complexity penalty term. In general, both the
2964             log-likelihood and the model-complexity terms increase as the number of clusters
2965             increases. The form of the MDL criterion in this module uses for the penalty term
2966             the Bayesian Information Criterion (BIC) of G. Schwartz, "Estimating the Dimensions
2967             of a Model," The Annals of Statistics, 1978. In general, the smaller the value of
2968             MDL quality measure, the better the clustering of the data.
2969              
2970             For our second measure of clustering quality, we use `trace( SW^-1 . SB)' where SW is
2971             the within-class scatter matrix, more commonly denoted S_w, and SB the between-class
2972             scatter matrix, more commonly denoted S_b (the underscore means subscript). This
2973             measure can be thought of as the normalized average distance between the clusters,
2974             the normalization being provided by average cluster covariance SW^-1. Therefore, the
2975             larger the value of this quality measure, the better the separation between the
2976             clusters. Since this measure has its roots in the Fisher linear discriminant
2977             function, we incorporate the word C in the name of the quality measure.
2978             I When the
2979             clusters exhibit significant overlap, the numbers produced by this quality measure
2980             tend to be generally meaningless.
2981              
2982             =head1 METHODS
2983              
2984             The module provides the following methods for EM based
2985             clustering, for cluster visualization, for data
2986             visualization, and for the generation of data for testing a
2987             clustering algorithm:
2988              
2989             =over
2990              
2991             =item B
2992              
2993             A call to C constructs a new instance of the
2994             C class. A typical form
2995             of this call when you want to use random option for seeding
2996             the algorithm is:
2997              
2998             my $clusterer = Algorithm::ExpectationMaximization->new(
2999             datafile => $datafile,
3000             mask => $mask,
3001             K => 3,
3002             max_em_iterations => 300,
3003             seeding => 'random',
3004             terminal_output => 1,
3005             );
3006              
3007             where C is the expected number of clusters and
3008             C the maximum number of EM iterations
3009             that you want to allow until convergence is achieved.
3010             Depending on your dataset and on the choice of the initial
3011             seeds, the actual number of iterations used could be as few
3012             as 10 and as many as reaching 300. The output produced by
3013             the algorithm shows the actual number of iterations used to
3014             arrive at convergence.
3015              
3016             The data file supplied through the C option is
3017             expected to contain entries in the following format
3018              
3019             c20 0 10.7087017086940 9.63528386251712 10.9512155258108 ...
3020             c7 0 12.8025925026787 10.6126270065785 10.5228482095349 ...
3021             b9 0 7.60118206283120 5.05889245193079 5.82841781759102 ...
3022             ....
3023             ....
3024              
3025             where the first column contains the symbolic ID tag for each
3026             data record and the rest of the columns the numerical
3027             information. As to which columns are actually used for
3028             clustering is decided by the string value of the mask. For
3029             example, if we wanted to cluster on the basis of the entries
3030             in just the 3rd, the 4th, and the 5th columns above, the
3031             mask value would be C where the character C
3032             indicates that the ID tag is in the first column, the
3033             character C<0> that the second column is to be ignored, and
3034             the C<1>'s that follow that the 3rd, the 4th, and the 5th
3035             columns are to be used for clustering.
3036              
3037             If instead of random seeding, you wish to use the kmeans
3038             based seeding, just replace the option C supplied
3039             for C by C. You can also do manual seeding
3040             by designating a specified set of data elements to serve as
3041             cluster seeds. The call to the constructor in this case
3042             looks like
3043              
3044             my $clusterer = Algorithm::ExpectationMaximization->new(
3045             datafile => $datafile,
3046             mask => $mask,
3047             K => 3,
3048             max_em_iterations => 300,
3049             seeding => 'manual',
3050             seed_tags => ['a26', 'b53', 'c49'],
3051             terminal_output => 1,
3052             );
3053              
3054             where the option C is set to an anonymous array
3055             of symbolic names associated with the data elements.
3056              
3057             If you know the class priors, you can supply them through an
3058             additional option to the constructor that looks like
3059              
3060             class_priors => [0.6, 0.2, 0.2],
3061              
3062             for the case of C equal to 3. B
3063             be a useful thing to do only for the case of manual
3064             seeding.> If you go for manual seeding, the order in which
3065             the priors are expressed should correspond to the order of
3066             the manually chosen tags supplied through the C
3067             option.
3068              
3069             Note that the parameter C is boolean; when
3070             not supplied in the call to C it defaults to 0. When
3071             set, this parameter displays useful information in the
3072             window of the terminal screen in which you invoke the
3073             algorithm.
3074              
3075             =item B
3076              
3077             $clusterer->read_data_from_file()
3078              
3079             This is a required call after the constructor is invoked. As
3080             you would expect, this call reads in the data for
3081             clustering.
3082              
3083             =item B
3084              
3085             $clusterer->seed_the_clusters();
3086              
3087             This is also a required call. It processes the option you
3088             supplied for C in the constructor call to choose
3089             the data elements for seeding the C clusters.
3090              
3091             =item B
3092              
3093             $clusterer->EM();
3094              
3095             This is the workhorse of the module, as you would expect.
3096             The means, the covariances, and the priors estimated by this
3097             method are stored in instance variables that are subsequently
3098             accessed by other methods for the purpose of displaying the
3099             clusters, the probability distributions, etc.
3100              
3101             =item B
3102              
3103             $clusterer->run_bayes_classifier();
3104              
3105             Using the posterior probability distributions estimated by
3106             the C method, this method partitions the data into the
3107             C disjoint clusters using the naive Bayes' classifier.
3108              
3109             =item B
3110              
3111             my $clusters = $clusterer->return_disjoint_clusters();
3112              
3113             This allows you to access the clusters obtained with the
3114             application of the naive Bayes' classifier in your own
3115             scripts. If, say, you wanted to see the data records placed
3116             in each cluster, you could subsequently invoke the following
3117             loop in your own script:
3118              
3119             foreach my $index (0..@$clusters-1) {
3120             print "Cluster $index (Naive Bayes): @{$clusters->[$index]}\n\n"
3121             }
3122              
3123             where C<$clusters> holds the array reference returned by the
3124             call to C.
3125              
3126             =item B
3127              
3128             $clusterer->write_naive_bayes_clusters_to_files();
3129              
3130             This method writes the clusters obtained by applying the
3131             naive Bayes' classifier to disk files, one cluster per
3132             file. What is written out to each file consists of the
3133             symbolic names of the data records that belong to the
3134             cluster corresponding to that file. The clusters are placed
3135             in files with names like
3136              
3137             naive_bayes_cluster1.txt
3138             naive_bayes_cluster2.txt
3139             ...
3140              
3141             =item B
3142              
3143             my $theta1 = 0.2;
3144             my $posterior_prob_clusters =
3145             $clusterer->return_clusters_with_posterior_probs_above_threshold($theta1);
3146              
3147             This method returns a reference to an array of C
3148             anonymous arrays, each consisting of the symbolic names for
3149             the data records where the posterior class probability
3150             exceeds the threshold as specified by C<$theta1>.
3151             Subsequently, you can access each of these
3152             posterior-probability based clusters through a loop
3153             construct such as
3154              
3155             foreach my $index (0..@$posterior_prob_clusters-1) {
3156             print "Cluster $index (based on posterior probs exceeding $theta1): " .
3157             "@{$posterior_prob_clusters->[$index]}\n\n"
3158             }
3159              
3160             =item B
3161              
3162             $clusterer->write_posterior_prob_clusters_above_threshold_to_files($theta1);
3163              
3164             This call writes out the posterior-probability based soft
3165             clusters to disk files. As in the previous method, the
3166             threshold C<$theta1> sets the probability threshold for
3167             deciding which data elements belong to a cluster. These
3168             clusters are placed in files with names like
3169              
3170             posterior_prob_cluster1.txt
3171             posterior_prob_cluster2.txt
3172             ...
3173              
3174             =item B
3175              
3176             my $theta2 = 0.00001;
3177             my $class_distributions =
3178             $clusterer->return_individual_class_distributions_above_given_threshold($theta2);
3179              
3180             This is the method to call if you wish to see the individual
3181             Gaussians in your own script. The method returns a reference
3182             to an array of anonymous arrays, with each anonymous array
3183             representing data membership in each Gaussian. Only those
3184             data points are included in each Gaussian where the
3185             probability exceeds the threshold C<$theta2>. Note that the
3186             larger the covariance and the higher the data
3187             dimensionality, the smaller this threshold must be for you
3188             to see any of the data points in a Gaussian. After you have
3189             accessed the Gaussian mixture in this manner, you can
3190             display the data membership in each Gaussian through the
3191             following sort of a loop:
3192              
3193             foreach my $index (0..@$class_distributions-1) {
3194             print "Gaussian Distribution $index (only shows data elements " .
3195             "whose probabilities exceed the threshold $theta2: " .
3196             "@{$class_distributions->[$index]}\n\n"
3197             }
3198              
3199             =item B
3200              
3201             my $visualization_mask = "11";
3202             $clusterer->visualize_clusters($visualization_mask);
3203              
3204             The visualization mask here does not have to be identical to
3205             the one used for clustering, but must be a subset of that
3206             mask. This is convenient for visualizing the clusters in
3207             two- or three-dimensional subspaces of the original space.
3208             The subset is specified by placing `0's in the positions
3209             corresponding to the dimensions you do NOT want to see
3210             through visualization. Depending on the mask used, this
3211             method creates a 2D or a 3D scatter plot of the clusters
3212             obtained through the naive Bayes' classification rule.
3213              
3214             =item B
3215              
3216             $clusterer->visualize_distributions($visualization_mask);
3217              
3218             This is the method to call if you want to visualize the soft
3219             clustering corresponding to the posterior class
3220             probabilities exceeding the threshold specified in the call
3221             to
3222             C.
3223             Again, depending on the visualization mask used, you will
3224             see either a 2D plot or a 3D scatter plot.
3225              
3226             =item B
3227              
3228             $clusterer->plot_hardcopy_clusters($visualization_mask);
3229              
3230             This method create a PNG file from the C created
3231             display of the naive Bayes' clusters obtained from the data.
3232             The plotting functionality of C is accessed through
3233             the Perl wrappers provided by the C
3234             module.
3235              
3236             =item B
3237              
3238             $clusterer->plot_hardcopy_distributions($visualization_mask);
3239              
3240             This method create a PNG file from the C created
3241             display of the clusters that correspond to the posterior
3242             class probabilities exceeding a specified threshold. The
3243             plotting functionality of C is accessed through the
3244             Perl wrappers provided by the C module.
3245              
3246             =item B
3247              
3248             $clusterer->display_fisher_quality_vs_iterations();
3249              
3250             This method measures the quality of clustering by
3251             calculating the normalized average squared distance between
3252             the cluster centers, the normalization being provided by the
3253             average cluster covariance. See the Description for further
3254             details. In general, this measure is NOT useful for
3255             overlapping clusters.
3256              
3257             =item B
3258              
3259             $clusterer->display_mdl_quality_vs_iterations();
3260              
3261             At the end of each iteration, this method measures the
3262             quality of clustering my calculating its MDL (Minimum
3263             Description Length). As stated earlier in Description, the
3264             MDL measure is a difference of a log-likelihood term for all
3265             of the observed data and a model-complexity penalty term.
3266             The smaller the value returned by this method, the better
3267             the clustering.
3268              
3269             =item B
3270              
3271             my $estimated_priors = $clusterer->return_estimated_priors();
3272             print "Estimated class priors: @$estimated_priors\n";
3273              
3274             This method can be used to access the final values of the
3275             class priors as estimated by the EM algorithm.
3276              
3277             =item B
3278              
3279             Algorithm::ExpectationMaximization->cluster_data_generator(
3280             input_parameter_file => $parameter_file,
3281             output_datafile => $out_datafile,
3282             total_number_of_data_points => 300
3283             );
3284              
3285             for generating multivariate data for clustering if you wish to play with synthetic
3286             data for experimenting with the EM algorithm. The input parameter file must specify
3287             the priors to be used for the Gaussians, their means, and their covariance matrices.
3288             The format of the information contained in the parameter file must be as shown in the
3289             file C provided in the C directory. It will be easiest for you
3290             to just edit a copy of this file for your data generation needs. In addition to the
3291             format of the parameter file, the main constraint you need to observe in specifying
3292             the parameters is that the dimensionality of the covariance matrices must correspond
3293             to the dimensionality of the mean vectors. The multivariate random numbers are
3294             generated by calling the C module. As you would expect, this module
3295             requires that the covariance matrices you specify in your parameter file be symmetric
3296             and positive definite. Should the covariances in your parameter file not obey this
3297             condition, the C module will let you know.
3298              
3299             =item B
3300              
3301             $clusterer->visualize_data($data_visualization_mask);
3302              
3303             This is the method to call if you want to visualize the data
3304             you plan to cluster with the EM algorithm. You'd need to
3305             specify argument mask in a manner similar to the
3306             visualization of the clusters, as explained earlier.
3307              
3308             =item B
3309              
3310             $clusterer->plot_hardcopy_data($data_visualization_mask);
3311              
3312             This method creates a PNG file that can be used to print out
3313             a hardcopy of the data in different 2D and 3D subspaces of
3314             the data space. The visualization mask is used to select the
3315             subspace for the PNG image.
3316              
3317             =back
3318              
3319             =head1 HOW THE CLUSTERS ARE OUTPUT
3320              
3321             This module produces two different types of clusters: the "hard" clusters and the
3322             "soft" clusters. The hard clusters correspond to the naive Bayes' classification of
3323             the data points on the basis of the Gaussian distributions and the class priors
3324             estimated by the EM algorithm. Such clusters partition the data into disjoint
3325             subsets. On the other hand, the soft clusters correspond to the posterior class
3326             probabilities calculated by the EM algorithm. A data element belongs to a cluster if
3327             its posterior probability for that Gaussian exceeds a threshold.
3328              
3329             After the EM algorithm has finished running, the hard clusters are created by
3330             invoking the method C on an instance of the module and then
3331             made user-accessible by calling C. These clusters may
3332             then be displayed in a terminal window by dereferencing each element of the array
3333             whose reference is returned b C. The hard clusters can
3334             be written out to disk files by invoking C.
3335             This method writes out the clusters to files, one cluster per file. What is written
3336             out to each file consists of the symbolic names of the data records that belong to
3337             the cluster corresponding to that file. The clusters are placed in files with names
3338             like
3339              
3340             naive_bayes_cluster1.txt
3341             naive_bayes_cluster2.txt
3342             ...
3343              
3344             The soft clusters on the other hand are created by calling
3345             C
3346             on an instance of the module, where the argument C<$theta1>
3347             is the threshold for deciding whether a data element belongs
3348             in a soft cluster. The posterior class probability at a
3349             data element must exceed the threshold for the element to
3350             belong to the corresponding cluster. The soft cluster can
3351             be written out to disk files by calling
3352             C.
3353             As with the hard clusters, each cluster is placed in a separate
3354             file. The filenames for such clusters look like:
3355              
3356             posterior_prob_cluster1.txt
3357             posterior_prob_cluster2.txt
3358             ...
3359              
3360             =head1 WHAT IF THE NUMBER OF CLUSTERS IS UNKNOWN?
3361              
3362             The module constructor requires that you supply a value for the parameter C, which
3363             is the number of clusters you expect to see in the data. But what if you do not have
3364             a good value for C? Note that it is possible to search for the best C by using
3365             the two clustering quality criteria included in the module. However, I have
3366             intentionally not yet incorporated that feature in the module because it slows down
3367             the execution of the code --- especially when the dimensionality of the data
3368             increases. However, nothing prevents you from writing a script --- along the lines
3369             of the five "canned_example" scripts in the C directory --- that would use
3370             the two clustering quality metrics for finding the best choice for C for a given
3371             dataset. Obviously, you will now have to incorporate the call to the constructor in
3372             a loop and check the value of the quality measures for each value of C.
3373              
3374             =head1 SOME RESULTS OBTAINED WITH THIS MODULE
3375              
3376             If you would like to see some results that have been obtained with this module, check
3377             out Section 7 of the report
3378             L.
3379              
3380             =head1 THE C DIRECTORY
3381              
3382             Becoming familiar with this directory should be your best
3383             strategy to become comfortable with this module (and its
3384             future versions). You are urged to start by executing the
3385             following five example scripts:
3386              
3387             =over 16
3388              
3389             =item I
3390              
3391             This example applies the EM algorithm to the data contained in the datafile
3392             C. The mixture data in the file corresponds to three overlapping
3393             Gaussian components in a star-shaped pattern. The EM based clustering for this data
3394             is shown in the files C and
3395             C, the former displaying the hard clusters
3396             obtained by using the naive Bayes' classifier and the latter showing the soft
3397             clusters obtained on the basis of the posterior class probabilities at the data
3398             points.
3399              
3400             =item I
3401              
3402             The datafile used in this example is C. This mixture data
3403             corresponds to two well-separated relatively isotropic Gaussians. EM based clustering for this
3404             data is shown in the files C and
3405             C, the former displaying the hard clusters
3406             obtained by using the naive Bayes' classifier and the latter showing the soft
3407             clusters obtained by using the posterior class probabilities at the data points.
3408              
3409             =item I
3410              
3411             Like the first example, this example again involves three Gaussians, but now their
3412             means are not co-located. Additionally, we now seed the clusters manually by
3413             specifying three selected data points as the initial guesses for the cluster means.
3414             The datafile used for this example is C. The EM based clustering
3415             for this data is shown in the files C and
3416             C, the former displaying the hard clusters
3417             obtained by using the naive Bayes' classifier and the latter showing the soft
3418             clusters obtained on the basis of the posterior class probabilities at the data
3419             points.
3420              
3421             =item I
3422              
3423             Whereas the three previous examples demonstrated EM based clustering of 2D data, we
3424             now present an example of clustering in 3D. The datafile used in this example is
3425             C. This mixture data corresponds to three well-separated but highly
3426             anisotropic Gaussians. The EM derived clustering for this data is shown in the files
3427             C and C, the
3428             former displaying the hard clusters obtained by using the naive Bayes' classifier and
3429             the latter showing the soft clusters obtained on the basis of the posterior class
3430             probabilities at the data points.
3431              
3432             You may also wish to run this example on the data in a CSV file in the C
3433             directory. The name of the file is C.
3434              
3435             =item I
3436              
3437             We again demonstrate clustering in 3D but now we have one Gaussian cluster that
3438             "cuts" through the other two Gaussian clusters. The datafile used in this example is
3439             C. The three Gaussians in this case are highly overlapping and
3440             highly anisotropic. The EM derived clustering for this data is shown in the files
3441             C and C, the
3442             former displaying the hard clusters obtained by using the naive Bayes' classifier and
3443             the latter showing the soft clusters obtained through the posterior class
3444             probabilities at the data points.
3445              
3446             =item I
3447              
3448             This example, added in Version 1.2, demonstrates the use of this module for 1-D data.
3449             In order to visualize the clusters for the 1-D case, we show them through their
3450             respective histograms. The datafile used in this example is C. The
3451             data consists of two overlapping Gaussians. The EM derived clustering for this data
3452             is shown in the files C and
3453             C, the former displaying the hard clusters
3454             obtained by using the naive Bayes' classifier and the latter showing the soft
3455             clusters obtained through the posterior class probabilities at the data points.
3456              
3457             =back
3458              
3459             Going through the six examples listed above will make you familiar with how to make
3460             the calls to the clustering and the visualization methods. The C directory
3461             also includes several parameter files with names like
3462              
3463             param1.txt
3464             param2.txt
3465             param3.txt
3466             ...
3467              
3468             These were used to generate the synthetic data for which the results are shown in the
3469             C directory. Just make a copy of one of these files and edit it if you
3470             would like to generate your own multivariate data for clustering. Note that you can
3471             generate data with any dimensionality through appropriate entries in the parameter
3472             file.
3473              
3474             =head1 CAVEATS
3475              
3476             When you run the scripts in the C directory, your results will NOT always
3477             look like what I have shown in the PNG image files in the directory. As mentioned
3478             earlier in Description, the EM algorithm starting from randomly chosen initial
3479             guesses for the cluster means can get stuck in a local maximum.
3480              
3481             That raises an interesting question of how one judges the correctness of clustering
3482             results when dealing with real experimental data. For real data, the best approach
3483             is to try the EM algorithm multiple times with all of the seeding options included in
3484             this module. It would be safe to say that, at least in low dimensional spaces and
3485             with sufficient data, a majority of your runs should yield "correct" results.
3486              
3487             Also bear in mind that a pure Perl implementation is not meant for the clustering of
3488             very large data files. It is really designed more for researching issues related to
3489             EM based approaches to clustering.
3490              
3491              
3492             =head1 REQUIRED
3493              
3494             This module requires the following three modules:
3495              
3496             Math::Random
3497             Graphics::GnuplotIF
3498             Math::GSL::Matrix
3499              
3500             the first for generating the multivariate random numbers, the second for the
3501             visualization of the clusters, and the last for access to the Perl wrappers for the
3502             GNU Scientific Library. The C module of this library is used for various
3503             algebraic operations on the covariance matrices of the Gaussians.
3504              
3505             =head1 EXPORT
3506              
3507             None by design.
3508              
3509              
3510             =head1 BUGS
3511              
3512             Please notify the author if you encounter any bugs. When sending email, please place
3513             the string 'Algorithm EM' in the subject line.
3514              
3515             =head1 INSTALLATION
3516              
3517             Download the archive from CPAN in any directory of your choice. Unpack the archive
3518             with a command that on a Linux machine would look like:
3519              
3520             tar zxvf Algorithm-ExpectationMaximization-1.22.tar.gz
3521              
3522             This will create an installation directory for you whose name will be
3523             C. Enter this directory and execute the
3524             following commands for a standard install of the module if you have root privileges:
3525              
3526             perl Makefile.PL
3527             make
3528             make test
3529             sudo make install
3530              
3531             If you do not have root privileges, you can carry out a non-standard install the
3532             module in any directory of your choice by:
3533              
3534             perl Makefile.PL prefix=/some/other/directory/
3535             make
3536             make test
3537             make install
3538              
3539             With a non-standard install, you may also have to set your PERL5LIB environment
3540             variable so that this module can find the required other modules. How you do that
3541             would depend on what platform you are working on. In order to install this module in
3542             a Linux machine on which I use tcsh for the shell, I set the PERL5LIB environment
3543             variable by
3544              
3545             setenv PERL5LIB /some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
3546              
3547             If I used bash, I'd need to declare:
3548              
3549             export PERL5LIB=/some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
3550              
3551              
3552             =head1 ACKNOWLEDGMENTS
3553              
3554             Version 1.2 is a result of the feedback received from Paul
3555             May of University of Birmingham. Thanks, Paul!
3556              
3557             =head1 AUTHOR
3558              
3559             Avinash Kak, kak@purdue.edu
3560              
3561             If you send email, please place the string "EM Algorithm" in your
3562             subject line to get past my spam filter.
3563              
3564             =head1 COPYRIGHT
3565              
3566             This library is free software; you can redistribute it and/or modify it under the
3567             same terms as Perl itself.
3568              
3569             Copyright 2014 Avinash Kak
3570              
3571             =cut
3572