File Coverage

blib/lib/Algorithm/LinearManifoldDataClusterer.pm
Criterion Covered Total %
statement 25 27 92.5
branch n/a
condition n/a
subroutine 9 9 100.0
pod n/a
total 34 36 94.4


line stmt bran cond sub pod time code
1             package Algorithm::LinearManifoldDataClusterer;
2              
3             #------------------------------------------------------------------------------------
4             # Copyright (c) 2015 Avinash Kak. All rights reserved. This program is free
5             # software. You may modify and/or distribute it under the same terms as Perl itself.
6             # This copyright notice must remain attached to the file.
7             #
8             # Algorithm::LinearManifoldDataClusterer is a Perl module for clustering data that
9             # resides on a low-dimensional manifold in a high-dimensional measurement space.
10             # -----------------------------------------------------------------------------------
11              
12 1     1   13460 use 5.10.0;
  1         3  
  1         48  
13 1     1   5 use strict;
  1         1  
  1         33  
14 1     1   5 use warnings;
  1         5  
  1         30  
15 1     1   4 use Carp;
  1         3  
  1         81  
16 1     1   7 use List::Util qw(reduce any);
  1         2  
  1         123  
17 1     1   6 use File::Basename;
  1         1  
  1         79  
18 1     1   1009 use Math::Random;
  1         5355  
  1         98  
19 1     1   579 use Graphics::GnuplotIF;
  1         10104  
  1         66  
20 1     1   1020 use Math::GSL::Matrix;
  0            
  0            
21             use POSIX ();
22              
23             our $VERSION = '1.0';
24              
25             # Constructor:
26             sub new {
27             my ($class, %args) = @_;
28             my @params = keys %args;
29             croak "\nYou have used a wrong name for a keyword argument " .
30             "--- perhaps a misspelling\n"
31             if check_for_illegal_params(@params) == 0;
32             bless {
33             _datafile => $args{datafile} || croak("datafile required"),
34             _mask => $args{mask} || croak("mask required"),
35             _K => $args{K} || 0,
36             _P => $args{P} || 0,
37             _terminal_output => $args{terminal_output} || 0,
38             _max_iterations => $args{max_iterations} || 0,
39             _delta_reconstruction_error => $args{delta_reconstruction_error} || 0.001,
40             _delta_normalized_error => undef,
41             _cluster_search_multiplier => $args{cluster_search_multiplier} || 1,
42             _visualize_each_iteration => $args{visualize_each_iteration} == 0 ? 0 : 1,
43             _show_hidden_in_3D_plots => $args{show_hidden_in_3D_plots} == 0 ? 0 : 1,
44             _make_png_for_each_iteration => $args{make_png_for_each_iteration} == 0 ? 0 : 1,
45             _debug => $args{debug} || 0,
46             _N => 0,
47             _KM => $args{K} * $args{cluster_search_multiplier},
48             _data_hash => {},
49             _data_tags => [],
50             _data_dimensions => 0,
51             _final_clusters => [],
52             _auto_retry_flag => 0,
53             _num_iterations_actually_used => undef,
54             _scale_factor => undef,
55             _data_tags_to_cluster_label_hash => {},
56             _final_reference_vecs_for_all_subspaces => [],
57             _reconstruction_error_as_a_function_of_iteration => [],
58             _final_trailing_eigenvec_matrices_for_all_subspaces => [],
59             _subspace_construction_error_as_a_function_of_iteration => [],
60             }, $class;
61             }
62              
63             sub get_data_from_csv {
64             my $self = shift;
65             my $filename = $self->{_datafile} || die "you did not specify a file with the data to be clustered";
66             my $mask = $self->{_mask};
67             my @mask = split //, $mask;
68             $self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
69             print "data dimensionality: $self->{_data_dimensions} \n" if $self->{_terminal_output};
70             open FILEIN, $filename or die "Unable to open $filename: $!";
71             die("Aborted. get_training_data_csv() is only for CSV files") unless $filename =~ /\.csv$/;
72             local $/ = undef;
73             my @all_data = split /\s+/, <FILEIN>;
74             my %data_hash = ();
75             my @data_tags = ();
76             foreach my $record (@all_data) {
77             my @splits = split /,/, $record;
78             my $record_name = shift @splits;
79             $data_hash{$record_name} = \@splits;
80             push @data_tags, $record_name;
81             }
82             $self->{_data_hash} = \%data_hash;
83             $self->{_data_tags} = \@data_tags;
84             $self->{_N} = scalar @data_tags;
85             }
86              
87             sub estimate_mean_and_covariance {
88             my $self = shift;
89             my $tag_set = shift;
90             my $cluster_size = @$tag_set;
91             my @cluster_center = @{$self->add_point_coords($tag_set)};
92             @cluster_center = map {my $x = $_/$cluster_size; $x} @cluster_center;
93             # for covariance calculation:
94             my ($num_rows,$num_cols) = ($self->{_data_dimensions}, scalar(@$tag_set));
95             print "\nThe data will be stuffed into a matrix of $num_rows rows and $num_cols columns\n\n"
96             if $self->{_debug};
97             my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
98             my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
99             # All the record labels are stored in the array $self->{_data_tags}. The actual
100             # data for clustering is stored in a hash at $self->{_data_hash} whose keys are
101             # the record labels; the value associated with each key is the array holding the
102             # corresponding numerical multidimensional data.
103             $mean_vec->set_col(0, \@cluster_center);
104             if ($self->{_debug}) {
105             print "\nDisplaying the mean vector for the cluster:\n";
106             display_matrix( $mean_vec ) if $self->{_terminal_output};
107             }
108             foreach my $j (0..$num_cols-1) {
109             my $tag = $tag_set->[$j];
110             my $data = $self->{_data_hash}->{$tag};
111             my @diff_from_mean = vector_subtract($data, \@cluster_center);
112             $matrix->set_col($j, \@diff_from_mean);
113             }
114             my $transposed = transpose( $matrix );
115             my $covariance = $matrix * $transposed;
116             $covariance *= 1.0 / $num_cols;
117             if ($self->{_debug}) {
118             print "\nDisplaying the Covariance Matrix for cluster:";
119             display_matrix( $covariance ) if $self->{_terminal_output};
120             }
121             return ($mean_vec, $covariance);
122             }
123              
124             sub eigen_analysis_of_covariance {
125             my $self = shift;
126             my $covariance = shift;
127             my ($eigenvalues, $eigenvectors) = $covariance->eigenpair;
128             my $num_of_eigens = @$eigenvalues;
129             my $largest_eigen_index = 0;
130             my $smallest_eigen_index = 0;
131             print "Eigenvalue 0: $eigenvalues->[0]\n" if $self->{_debug};
132             foreach my $i (1..$num_of_eigens-1) {
133             $largest_eigen_index = $i if $eigenvalues->[$i] > $eigenvalues->[$largest_eigen_index];
134             $smallest_eigen_index = $i if $eigenvalues->[$i] < $eigenvalues->[$smallest_eigen_index];
135             print "Eigenvalue $i: $eigenvalues->[$i]\n" if $self->{_debug};
136             }
137             print "\nlargest eigen index: $largest_eigen_index\n" if $self->{_debug};
138             print "\nsmallest eigen index: $smallest_eigen_index\n\n" if $self->{_debug};
139             my @all_my_eigenvecs;
140             foreach my $i (0..$num_of_eigens-1) {
141             my @vec = $eigenvectors->[$i]->as_list;
142             my @eigenvec;
143             foreach my $ele (@vec) {
144             my ($mag, $theta) = $ele =~ /\[(\d*\.?\d*e?[+-]?\d*),(\S+)\]/;
145             if ($theta eq "0") {
146             push @eigenvec, $mag;
147             } elsif ($theta eq "pi") {
148             push @eigenvec, -1.0 * $mag;
149             } else {
150             die "Eigendecomposition produced a complex eigenvector -- " .
151             "which should not happen for a covariance matrix!";
152             }
153             }
154             print "Eigenvector $i: @eigenvec\n" if $self->{_debug};
155             push @all_my_eigenvecs, \@eigenvec;
156             }
157             my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list;
158             print "\nLargest eigenvector: @largest_eigen_vec\n" if $self->{_debug};
159             my @sorted_eigenvec_indexes = sort {$eigenvalues->[$b] <=> $eigenvalues->[$a]} 0..@all_my_eigenvecs-1;
160             my @sorted_eigenvecs;
161             my @sorted_eigenvals;
162             foreach my $i (0..@sorted_eigenvec_indexes-1) {
163             $sorted_eigenvecs[$i] = $all_my_eigenvecs[$sorted_eigenvec_indexes[$i]];
164             $sorted_eigenvals[$i] = $eigenvalues->[$sorted_eigenvec_indexes[$i]];
165             }
166             if ($self->{_debug}) {
167             print "\nHere come sorted eigenvectors --- from the largest to the smallest:\n";
168             foreach my $i (0..@sorted_eigenvecs-1) {
169             print "eigenvec: @{$sorted_eigenvecs[$i]} eigenvalue: $sorted_eigenvals[$i]\n";
170             }
171             }
172             return (\@sorted_eigenvecs, \@sorted_eigenvals);
173             }
174              
175             sub auto_retry_clusterer {
176             my $self = shift;
177             $self->{_auto_retry_flag} = 1;
178             my $clusters;
179             $@ = 1;
180             my $retry_attempts = 1;
181             while ($@) {
182             eval {
183             $clusters = $self->linear_manifold_clusterer();
184             };
185             if ($@) {
186             if ($self->{_terminal_output}) {
187             print "Clustering failed. Trying again. --- $@";
188             print "\n\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n";
189             print "VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV\n\n";
190             }
191             $retry_attempts++;
192             } else {
193             print "\n\nNumber of retry attempts: $retry_attempts\n\n" if $self->{_terminal_output};
194             return $clusters;
195             }
196             }
197             }
198              
199             sub linear_manifold_clusterer {
200             my $self = shift;
201             my $KM = $self->{_KM};
202             my @initial_cluster_center_tags;
203             my $visualization_msg;
204             my @initial_cluster_center_indexes = $self->initialize_cluster_centers($KM, $self->{_N});
205             print "Randomly selected indexes for cluster center tags: @initial_cluster_center_indexes\n"
206             if $self->{_debug};
207             @initial_cluster_center_tags = map {$self->{_data_tags}->[$_]} @initial_cluster_center_indexes;
208             my @initial_cluster_center_coords = map {$self->{_data_hash}->{$_}} @initial_cluster_center_tags;
209             if ($self->{_debug}) {
210             foreach my $centroid (@initial_cluster_center_coords) {
211             print "Initial cluster center coords: @{$centroid}\n";
212             }
213             }
214             my $initial_clusters = $self->assign_data_to_clusters_initial(\@initial_cluster_center_coords);
215             if ($self->{_data_dimensions} == 3) {
216             $visualization_msg = "initial_clusters";
217             $self->visualize_clusters_on_sphere($visualization_msg, $initial_clusters)
218             if $self->{_visualize_each_iteration};
219             $self->visualize_clusters_on_sphere($visualization_msg, $initial_clusters, "png")
220             if $self->{_make_png_for_each_iteration};
221             }
222             foreach my $cluster (@$initial_clusters) {
223             my ($mean, $covariance) = $self->estimate_mean_and_covariance($cluster);
224             display_mean_and_covariance($mean, $covariance) if $self->{_debug};
225             }
226             my @clusters = @$initial_clusters;
227             display_clusters(\@clusters) if $self->{_debug};
228             my $iteration_index = 0;
229             my $unimodal_correction_flag;
230             my $previous_min_value_for_unimodality_quotient;
231             while ($iteration_index < $self->{_max_iterations}) {
232             print "\n\n========================== STARTING ITERATION $iteration_index =====================\n\n"
233             if $self->{_terminal_output};
234             my $total_reconstruction_error_this_iteration = 0;
235             my @subspace_construction_errors_this_iteration;
236             my @trailing_eigenvec_matrices_for_all_subspaces;
237             my @reference_vecs_for_all_subspaces;
238             foreach my $cluster (@clusters) {
239             next if @$cluster == 0;
240             my ($mean, $covariance) = $self->estimate_mean_and_covariance($cluster);
241             display_mean_and_covariance($mean, $covariance) if $self->{_debug};
242             print "--------------end of displaying mean and covariance\n\n" if $self->{_debug};
243             my ($eigenvecs, $eigenvals) = $self->eigen_analysis_of_covariance($covariance);
244             my @trailing_eigenvecs = @{$eigenvecs}[$self->{_P} .. $self->{_data_dimensions}-1];
245             my @trailing_eigenvals = @{$eigenvals}[$self->{_P} .. $self->{_data_dimensions}-1];
246             my $subspace_construction_error = reduce {abs($a) + abs($b)} @trailing_eigenvals;
247             push @subspace_construction_errors_this_iteration, $subspace_construction_error;
248             my $trailing_eigenvec_matrix = Math::GSL::Matrix->new($self->{_data_dimensions},
249             scalar(@trailing_eigenvecs));
250             foreach my $j (0..@trailing_eigenvecs-1) {
251             print "trailing eigenvec column: @{$trailing_eigenvecs[$j]}\n" if $self->{_debug};
252             $trailing_eigenvec_matrix->set_col($j, $trailing_eigenvecs[$j]);
253             }
254             push @trailing_eigenvec_matrices_for_all_subspaces,$trailing_eigenvec_matrix;
255             push @reference_vecs_for_all_subspaces, $mean;
256             }
257             $self->set_termination_reconstruction_error_threshold(\@reference_vecs_for_all_subspaces);
258             my %best_subspace_based_partition_of_data;
259             foreach my $i (0..$self->{_KM}-1) {
260             $best_subspace_based_partition_of_data{$i} = [];
261             }
262             foreach my $data_tag (@{$self->{_data_tags}}) {
263             my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
264             $data_vec->set_col(0, $self->{_data_hash}->{$data_tag});
265             my @errors = map {$self->reconstruction_error($data_vec,
266             $trailing_eigenvec_matrices_for_all_subspaces[$_],
267             $reference_vecs_for_all_subspaces[$_])}
268             0 .. $self->{_KM}-1;
269             my ($minval, $index_for_closest_subspace) = minimum(\@errors);
270             $total_reconstruction_error_this_iteration += $minval;
271             push @{$best_subspace_based_partition_of_data{$index_for_closest_subspace}},
272             $data_tag;
273             }
274             print "Finished calculating the eigenvectors for the clusters produced by the previous\n" .
275             "iteration and re-assigning the data samples to the new subspaces on the basis of\n".
276             "the least reconstruction error.\n\n" .
277             "Total reconstruction error in this iteration: $total_reconstruction_error_this_iteration\n"
278             if $self->{_terminal_output};
279             foreach my $i (0..$self->{_KM}-1) {
280             $clusters[$i] = $best_subspace_based_partition_of_data{$i};
281             }
282             display_clusters(\@clusters) if $self->{_terminal_output};
283             # Check if any cluster has lost all its elements. If so, fragment the worst
284             # existing cluster to create the additional clusters needed:
285             if (any {@$_ == 0} @clusters) {
286             die "empty cluster found" if $self->{_auto_retry_flag};
287             print "\nOne or more clusters have become empty. Will carve out the needed clusters\n" .
288             "from the cluster with the largest subspace construction error.\n\n";
289             $total_reconstruction_error_this_iteration = 0;
290             @subspace_construction_errors_this_iteration = ();
291             my $how_many_extra_clusters_needed = $self->{_KM} - scalar(grep {@$_ != 0} @clusters);
292             print "number of extra clusters needed at iteration $iteration_index: $how_many_extra_clusters_needed\n";
293             my $max = List::Util::max @subspace_construction_errors_this_iteration;
294             my $maxindex = List::Util::first {$_ == $max} @subspace_construction_errors_this_iteration;
295             my @cluster_fragments = cluster_split($clusters[$maxindex],
296             $how_many_extra_clusters_needed + 1);
297             my @newclusters;
298             push @newclusters, @clusters[0 .. $maxindex-1];
299             push @newclusters, @clusters[$maxindex+1 .. $self->{_KM}-1];
300             push @newclusters, @cluster_fragments;
301             @newclusters = grep {@$_ != 0} @newclusters;
302             die "something went wrong with cluster fragmentation"
303             unless $self->{_KM} = @newclusters;
304             @trailing_eigenvec_matrices_for_all_subspaces = ();
305             @reference_vecs_for_all_subspaces = ();
306             foreach my $cluster (@newclusters) {
307             die "Linear Manifold Clustering did not work $!" if @$cluster == 0;
308             my ($mean, $covariance) = estimate_mean_and_covariance($cluster);
309             my ($eigenvecs, $eigenvals) = eigen_analysis_of_covariance($covariance);
310             my @trailing_eigenvecs = @{$eigenvecs}[$self->{_P} .. $self->{_data_dimensions}-1];
311             my @trailing_eigenvals = @{$eigenvals}[$self->{_P} .. $self->{_data_dimensions}-1];
312             my $subspace_construction_error = reduce {abs($a) + abs($b)} @trailing_eigenvals;
313             push @subspace_construction_errors_this_iteration, $subspace_construction_error;
314             my $trailing_eigenvec_matrix = Math::GSL::Matrix->new($self->{_data_dimensions},
315             scalar(@trailing_eigenvecs));
316             foreach my $j (0..@trailing_eigenvecs-1) {
317             $trailing_eigenvec_matrix->set_col($j, $trailing_eigenvecs[$j]);
318             }
319             push @trailing_eigenvec_matrices_for_all_subspaces,$trailing_eigenvec_matrix;
320             push @reference_vecs_for_all_subspaces, $mean;
321             }
322             my %best_subspace_based_partition_of_data;
323             foreach my $i (0..$self->{_KM}-1) {
324             $best_subspace_based_partition_of_data{$i} = [];
325             }
326             foreach my $data_tag (@{$self->{_data_tags}}) {
327             my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
328             $data_vec->set_col(0, $self->{_data_hash}->{$data_tag});
329             my @errors = map {reconstruction_error($data_vec,
330             $trailing_eigenvec_matrices_for_all_subspaces[$_],
331             $reference_vecs_for_all_subspaces[$_])}
332             0 .. $self->{_KM}-1;
333             my ($minval, $index_for_closest_subspace) = minimum(\@errors);
334             $total_reconstruction_error_this_iteration += $minval;
335             push @{$best_subspace_based_partition_of_data{$index_for_closest_subspace}},
336             $data_tag;
337             }
338             print "empty-cluster jag: total reconstruction error in this iteration: \n" .
339             "$total_reconstruction_error_this_iteration\n"
340             if $self->{_debug};
341             foreach my $i (0..$self->{_KM}-1) {
342             $clusters[$i] = $best_subspace_based_partition_of_data{$i};
343             }
344             display_clusters(\@newclusters) if $self->{_terminal_output};
345             @clusters = grep {@$_ != 0} @newclusters;
346             die "linear manifold based algorithm does not appear to work in this case $!"
347             unless @clusters == $self->{_KM};
348             }# end of foreach my $cluster (@clusters) ... loop followed by if clause for empty clusters
349             if ($self->{_data_dimensions} == 3) {
350             $visualization_msg = "clustering_at_iteration_$iteration_index";
351             $self->visualize_clusters_on_sphere($visualization_msg, \@clusters)
352             if $self->{_visualize_each_iteration};
353             $self->visualize_clusters_on_sphere($visualization_msg, \@clusters, "png")
354             if $self->{_make_png_for_each_iteration};
355             }
356             my @cluster_unimodality_quotients = map {$self->cluster_unimodality_quotient($clusters[$_],
357             $reference_vecs_for_all_subspaces[$_])} 0..@clusters-1;
358             my $min_value_for_unimodality_quotient = List::Util::min @cluster_unimodality_quotients;
359             print "\nCluster unimodality quotients: @cluster_unimodality_quotients\n" if $self->{_terminal_output};
360             die "\n\nBailing out!\n" .
361             "It does not look like these iterations will lead to a good clustering result.\n" .
362             "Program terminating. Try running again.\n"
363             if defined($previous_min_value_for_unimodality_quotient)
364             && ($min_value_for_unimodality_quotient < 0.4)
365             && ($min_value_for_unimodality_quotient < (0.5 * $previous_min_value_for_unimodality_quotient));
366             if ( $min_value_for_unimodality_quotient < 0.5 ) {
367             $unimodal_correction_flag = 1;
368             print "\nApplying unimodality correction:\n\n" if $self->{_terminal_output};
369             my @sorted_cluster_indexes =
370             sort {$cluster_unimodality_quotients[$b] <=> $cluster_unimodality_quotients[$a]} 0..@clusters-1;
371             my @newclusters;
372             foreach my $cluster_index (0..@clusters - 1) {
373             push @newclusters, $clusters[$sorted_cluster_indexes[$cluster_index]];
374             }
375             @clusters = @newclusters;
376             my $worst_cluster = pop @clusters;
377             print "\nthe worst cluster: @$worst_cluster\n" if $self->{_terminal_output};
378             my $second_worst_cluster = pop @clusters;
379             print "\nthe second worst cluster: @$second_worst_cluster\n" if $self->{_terminal_output};
380             push @$worst_cluster, @$second_worst_cluster;
381             fisher_yates_shuffle($worst_cluster);
382             my @first_half = @$worst_cluster[0 .. int(scalar(@$worst_cluster)/2) - 1];
383             my @second_half = @$worst_cluster[int(scalar(@$worst_cluster)/2) .. @$worst_cluster - 1];
384             push @clusters, \@first_half;
385             push @clusters, \@second_half;
386             if ($self->{_terminal_output}) {
387             print "\n\nShowing the clusters obtained after applying the unimodality correction:\n";
388             display_clusters(\@clusters);
389             }
390             }
391             if (@{$self->{_reconstruction_error_as_a_function_of_iteration}} > 0) {
392             my $last_recon_error = pop @{$self->{_reconstruction_error_as_a_function_of_iteration}};
393             push @{$self->{_reconstruction_error_as_a_function_of_iteration}}, $last_recon_error;
394             if (($last_recon_error - $total_reconstruction_error_this_iteration)
395             < $self->{_delta_normalized_error}) {
396             push @{$self->{_reconstruction_error_as_a_function_of_iteration}},
397             $total_reconstruction_error_this_iteration;
398             last;
399             }
400             }
401             push @{$self->{_reconstruction_error_as_a_function_of_iteration}},
402             $total_reconstruction_error_this_iteration;
403             $iteration_index++;
404             $previous_min_value_for_unimodality_quotient = $min_value_for_unimodality_quotient;
405             } # end of while loop on iteration_index
406             $self->{_num_iterations_actually_used} =
407             scalar @{$self->{_reconstruction_error_as_a_function_of_iteration}};
408             if ($self->{_terminal_output}) {
409             print "\nIterations of the main loop terminated at iteration number $iteration_index.\n";
410             print "Will now invoke graph partitioning to discover dominant clusters and to\n" .
411             "merge small clusters.\n\n" if $self->{_cluster_search_multiplier} > 1;
412             print "Total reconstruction error as a function of iterations: " .
413             "@{$self->{_reconstruction_error_as_a_function_of_iteration}}";
414             }
415             # now merge sub-clusters if cluster_search_multiplier > 1
416             my @final_clusters;
417             if ($self->{_cluster_search_multiplier} > 1) {
418             print "\n\nInvoking recursive graph partitioning to merge small clusters\n\n";
419             my @array_of_partitioned_cluster_groups = (\@clusters);
420             my @partitioned_cluster_groups;
421             my $how_many_clusters_looking_for = $self->{_K};
422             while (scalar(@final_clusters) < $self->{_K}) {
423             @partitioned_cluster_groups =
424             $self->graph_partition(shift @array_of_partitioned_cluster_groups,
425             $how_many_clusters_looking_for );
426             if (@{$partitioned_cluster_groups[0]} == 1) {
427             my $singular_cluster = shift @{$partitioned_cluster_groups[0]};
428             push @final_clusters, $singular_cluster;
429             $how_many_clusters_looking_for--;
430             push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[1];
431             } elsif (@{$partitioned_cluster_groups[1]} == 1) {
432             my $singular_cluster = shift @{$partitioned_cluster_groups[1]};
433             push @final_clusters, $singular_cluster;
434             $how_many_clusters_looking_for--;
435             push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[0];
436             } else {
437             push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[0];
438             push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[1];
439             }
440             }
441             my @data_clustered;
442             foreach my $cluster (@final_clusters) {
443             push @data_clustered, @$cluster;
444             }
445             unless (scalar(@data_clustered) == scalar(@{$self->{_data_tags}})) {
446             $self->{_final_clusters} = \@final_clusters;
447             my %data_clustered = map {$_ => 1} @data_clustered;
448             my @data_tags_not_clustered =
449             grep {$_} map {exists $data_clustered{$_} ? undef : $_} @{$self->{_data_tags}};
450             if ($self->{_terminal_output}) {
451             print "\n\nNot all data clustered. The most reliable clusters found by graph partitioning:\n";
452             display_clusters(\@final_clusters);
453             print "\n\nData not yet clustered:\n\n@data_tags_not_clustered\n";
454             }
455             if ($self->{_data_dimensions} == 3) {
456             $visualization_msg = "$self->{_K}_best_clusters_produced_by_graph_partitioning";
457             $self->visualize_clusters_on_sphere($visualization_msg, \@final_clusters)
458             if $self->{_visualize_each_iteration};
459             $self->visualize_clusters_on_sphere($visualization_msg, \@final_clusters, "png")
460             if $self->{_make_png_for_each_iteration};
461             }
462             my %data_tags_to_cluster_label_hash;
463             foreach my $i (0..@final_clusters-1) {
464             map {$data_tags_to_cluster_label_hash{$_} = $i} @{$final_clusters[$i]};
465             }
466             $self->{_data_tags_to_cluster_label_hash} = \%data_tags_to_cluster_label_hash;
467             foreach my $tag (@data_tags_not_clustered) {
468             my $which_cluster = $self->which_cluster_for_new_element($tag);
469             $self->{_data_tags_to_cluster_label_hash}->{$tag} = $which_cluster;
470             }
471             die "Some data elements are still missing from the final tally"
472             unless scalar(keys %{$self->{_data_tags_to_cluster_label_hash}}) ==
473             scalar(@{$self->{_data_tags}});
474             my @new_final_clusters;
475             map { foreach my $ele (keys %{$self->{_data_tags_to_cluster_label_hash}}) {
476             push @{$new_final_clusters[$_]}, $ele
477             if $self->{_data_tags_to_cluster_label_hash}->{$ele} == $_ }
478             } 0..$self->{_K}-1;
479             if ($self->{_debug}) {
480             print "\ndisplaying the final clusters after accounting for unclustered data:\n";
481             display_clusters(\@new_final_clusters);
482             }
483             $self->{_final_clusters} = \@new_final_clusters;
484             @final_clusters = @new_final_clusters;
485             }
486             } else {
487             @final_clusters = @clusters;
488             }
489             print "\n\nDisplaying final clustering results:\n\n" if $self->{_terminal_output};
490             display_clusters(\@final_clusters) if $self->{_terminal_output};
491             return \@final_clusters;
492             }
493              
494             sub display_reconstruction_errors_as_a_function_of_iterations {
495             my $self = shift;
496             print "\n\nNumber of iterations used in Phase 1: $self->{_num_iterations_actually_used}\n";
497             print "\nTotal reconstruction error as a function of iterations in Phase 1: " .
498             "@{$self->{_reconstruction_error_as_a_function_of_iteration}}\n";
499             }
500              
501             sub set_termination_reconstruction_error_threshold {
502             my $self = shift;
503             my $all_ref_vecs = shift;
504             my @mean_vecs = @$all_ref_vecs;
505             my $sum_of_mean_magnitudes = reduce {$a+$b} map { my $result = transpose($_) * $_;
506             my @result = $result->as_list;
507             sqrt($result[0])
508             } @mean_vecs;
509             $self->{_scale_factor} = $sum_of_mean_magnitudes / @mean_vecs;
510             $self->{_delta_normalized_error} = ($sum_of_mean_magnitudes / @mean_vecs ) *
511             $self->{_delta_reconstruction_error};
512             }
513              
# This method is called only in the `unless' clause at the end of the main
# linear_manifold_clusterer() method. It is called to find the cluster labels for
# those data elements that were left unclustered by the main part of the algorithm
# when graph partitioning is used to merge similar sub-clusters. The operating logic
# here is that graph partition yields K main clusters even though each main cluster
# may not yet be fully populated.
#
# Arguments: $data_tag, the symbolic name of the data element to be assigned.
# Returns: the index (0 .. K-1) of the final cluster whose subspace gives the
# smallest reconstruction error for the element.
sub which_cluster_for_new_element {
    my $self = shift;
    my $data_tag = shift;
    # The following `unless' clause is called only the first time the current method
    # is called: it caches, for each final cluster, the matrix of trailing
    # eigenvectors (the directions indexed _P .. D-1, i.e. those orthogonal to
    # the cluster's manifold subspace) and the cluster mean.
    unless (@{$self->{_final_trailing_eigenvec_matrices_for_all_subspaces}} > 0) {
        my @trailing_eigenvec_matrices_for_all_subspaces;
        my @reference_vecs_for_all_subspaces;
        foreach my $cluster (@{$self->{_final_clusters}}) {
            my ($mean, $covariance) = $self->estimate_mean_and_covariance($cluster);
            my ($eigenvecs, $eigenvals) = $self->eigen_analysis_of_covariance($covariance);
            my @trailing_eigenvecs = @{$eigenvecs}[$self->{_P} .. $self->{_data_dimensions}-1];
            # Pack the trailing eigenvectors as the columns of a D x (D - P) matrix:
            my $trailing_eigenvec_matrix = Math::GSL::Matrix->new($self->{_data_dimensions},
                                                                  scalar(@trailing_eigenvecs));
            foreach my $j (0..@trailing_eigenvecs-1) {
                $trailing_eigenvec_matrix->set_col($j, $trailing_eigenvecs[$j]);
            }
            push @trailing_eigenvec_matrices_for_all_subspaces,$trailing_eigenvec_matrix;
            push @reference_vecs_for_all_subspaces, $mean;
        }
        $self->{_final_trailing_eigenvec_matrices_for_all_subspaces} =
                                           \@trailing_eigenvec_matrices_for_all_subspaces;
        $self->{_final_reference_vecs_for_all_subspaces} = \@reference_vecs_for_all_subspaces;
    }
    # Package the data element as a D x 1 column vector:
    my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
    $data_vec->set_col(0, $self->{_data_hash}->{$data_tag});
    # Reconstruction error of the element against each of the K cached subspaces;
    # the element is assigned to the subspace it is closest to.
    my @errors = map {$self->reconstruction_error($data_vec,
                             $self->{_final_trailing_eigenvec_matrices_for_all_subspaces}->[$_],
                             $self->{_final_reference_vecs_for_all_subspaces}->[$_])}
                 0 .. $self->{_K}-1;
    my ($minval, $index_for_closest_subspace) = minimum(\@errors);
    return $index_for_closest_subspace;
}
553              
# Spectral partitioning of the "cluster graph": each node is one of the current
# sub-clusters and edge weights express pairwise cluster similarity.  Builds a
# similarity matrix W, forms the symmetric normalized Laplacian
# A = D^{-1/2} (D - W) D^{-1/2}, and splits the node set by the signs of the
# eigenvector associated with the second-smallest eigenvalue of A.
#
# Arguments: $clusters (ref to array of clusters, each a ref to an array of
# data tags) and $how_many_clusters_looking_for.
# Returns a pair of array refs whose interpretation depends on the branch taken
# at the bottom (finalized-cluster list + remaining clusters, or the
# positive-side and negative-side cluster lists).  May die when the clusters
# are too degenerate to partition.
sub graph_partition {
    my $self = shift;
    my $clusters = shift;
    my $how_many_clusters_looking_for = shift;
    print "\n\nGraph partitioning looking for $how_many_clusters_looking_for clusters\n\n";
    my $num_nodes = scalar @$clusters;
    my $W = Math::GSL::Matrix->new($num_nodes,$num_nodes);
    my $D = Math::GSL::Matrix->new($num_nodes,$num_nodes);
    $D->identity;
    my $neg_sqrt_of_D = Math::GSL::Matrix->new($num_nodes,$num_nodes);
    $neg_sqrt_of_D->identity;
    my @subspace_construction_errors;
    my @trailing_eigenvec_matrices_for_all_subspaces;
    my @reference_vecs_for_all_subspaces;
    # For each sub-cluster, characterize its manifold subspace by the cluster
    # mean and the matrix of trailing eigenvectors (indexes _P .. D-1) of the
    # cluster covariance:
    foreach my $cluster (@$clusters) {
        my ($mean, $covariance) = $self->estimate_mean_and_covariance($cluster);
        my ($eigenvecs, $eigenvals) = $self->eigen_analysis_of_covariance($covariance);
        my @trailing_eigenvecs = @{$eigenvecs}[$self->{_P} .. $self->{_data_dimensions}-1];
        my @trailing_eigenvals = @{$eigenvals}[$self->{_P} .. $self->{_data_dimensions}-1];
        # Sum of absolute trailing eigenvalues = energy not captured by the subspace:
        my $subspace_construction_error = reduce {abs($a) + abs($b)} @trailing_eigenvals;
        push @subspace_construction_errors, $subspace_construction_error;
        my $trailing_eigenvec_matrix = Math::GSL::Matrix->new($self->{_data_dimensions},
                                                              scalar(@trailing_eigenvecs));
        foreach my $j (0..@trailing_eigenvecs-1) {
            print "trailing eigenvec column: @{$trailing_eigenvecs[$j]}\n" if $self->{_debug};
            $trailing_eigenvec_matrix->set_col($j, $trailing_eigenvecs[$j]);
        }
        push @trailing_eigenvec_matrices_for_all_subspaces,$trailing_eigenvec_matrix;
        push @reference_vecs_for_all_subspaces, $mean;
    }
    # We consider the similarity matrix W to be a sum of two parts W_recon_based and
    # W_dist_bet_means based. IMPORTANT: For coding reasons, we first store the two
    # similarity measures separately in W_recon_based and W_dist_bet_means based. Our
    # goal is to fill up these component matrices with the raw values while at the
    # same time collecting information needed for normalizing these two separate
    # measures of similarity.
    my $W_reconstruction_error_based = Math::GSL::Matrix->new($num_nodes,$num_nodes);
    my $W_dist_between_means_based = Math::GSL::Matrix->new($num_nodes,$num_nodes);
    my @all_pairwise_reconstruction_errors;
    my @all_dist_between_means_errors;
    foreach my $i (0..$num_nodes-1) {
        foreach my $j (0..$num_nodes-1) {
            my ($recon_error_similarity, $dist_bet_means) = $self->pairwise_cluster_similarity(
                                              $clusters->[$i],
                                              $trailing_eigenvec_matrices_for_all_subspaces[$i],
                                              $reference_vecs_for_all_subspaces[$i],
                                              $clusters->[$j],
                                              $trailing_eigenvec_matrices_for_all_subspaces[$j],
                                              $reference_vecs_for_all_subspaces[$j]);
            $W_reconstruction_error_based->set_elem($i, $j, $recon_error_similarity);
            $W_dist_between_means_based->set_elem($i, $j, $dist_bet_means);
            push @all_pairwise_reconstruction_errors, $recon_error_similarity;
            push @all_dist_between_means_errors, $dist_bet_means;
        }
    }
    # Normalizers are the mean of each raw measure over all node pairs:
    my $recon_error_normalizer = (reduce {$a + $b} @all_pairwise_reconstruction_errors) /
                                 (scalar @all_pairwise_reconstruction_errors);
    my $dist_bet_means_based_normalizer = (reduce {$a + $b} @all_dist_between_means_errors ) /
                                          (scalar @all_dist_between_means_errors );
    die "\n\nBailing out!\n" .
        "Dealing with highly defective clusters. Try again.\n"
        if ($recon_error_normalizer == 0) || ($dist_bet_means_based_normalizer == 0);
    # Convert each raw dissimilarity x into a similarity exp(-x / normalizer):
    foreach my $i (0..$num_nodes-1) {
        foreach my $j (0..$num_nodes-1) {
            my $recon_val = $W_reconstruction_error_based->get_elem($i,$j);
            my $new_recon_val = exp( -1.0 * $recon_val / $recon_error_normalizer );
            $W_reconstruction_error_based->set_elem($i,$j,$new_recon_val);
            my $mean_dist_val = $W_dist_between_means_based->get_elem($i,$j);
            my $new_mean_dist_val = exp( -1.0 * $mean_dist_val / $dist_bet_means_based_normalizer );
            $W_dist_between_means_based->set_elem($i,$j,$new_mean_dist_val);
        }
    }
    $W = $W_reconstruction_error_based + $W_dist_between_means_based;
    if ($self->{_debug}) {
        print "\nDisplaying the similarity matrix W for the cluster graph:\n";
        display_matrix($W) if $self->{_terminal_output};
    }
    # Degree matrix D: each diagonal entry is the sum of the corresponding row
    # of W (accumulated here column by column); neg_sqrt_of_D holds D^{-1/2}.
    my $add_all_columns = Math::GSL::Matrix->new($num_nodes,1);
    foreach my $col (0..$num_nodes-1) {
        $add_all_columns += $W->col($col);
    }
    foreach my $i (0..$num_nodes-1) {
        $D->set_elem($i,$i, $add_all_columns->get_elem($i,0));
        $neg_sqrt_of_D->set_elem($i,$i, 1.0 / sqrt($add_all_columns->get_elem($i,0)));
    }
    # the Laplacian matrix:
    my $Laplacian = $D - $W;
    # the Symmetric Normalized Laplacian matrix A:
    my $A = $neg_sqrt_of_D * $Laplacian * $neg_sqrt_of_D;
    # Zero out numerically negligible entries before the eigendecomposition:
    foreach my $i (0..$num_nodes-1) {
        foreach my $j (0..$num_nodes-1) {
            $A->set_elem($i,$j,0) if abs($A->get_elem($i,$j)) < 0.01;
        }
    }
    if ($self->{_terminal_output}) {
        print "\nDisplaying the Symmetric Normalized Laplacian matrix A:\n" .
              "A = neg_sqrt(D) * Laplacian_matrix * neg_sqrt(D)\n";
        display_matrix( $A );
    }
    my ($eigenvalues, $eigenvectors) = $A->eigenpair;
    my $num_of_eigens = @$eigenvalues;
    my $largest_eigen_index = 0;
    my $smallest_eigen_index = 0;
    if ($self->{_debug2}) {
        print "Eigenvalue 0: $eigenvalues->[0]\n";
        foreach my $i (1..$num_of_eigens-1) {
            $largest_eigen_index = $i if $eigenvalues->[$i] > $eigenvalues->[$largest_eigen_index];
            $smallest_eigen_index = $i if $eigenvalues->[$i] < $eigenvalues->[$smallest_eigen_index];
            print "Eigenvalue $i: $eigenvalues->[$i]\n";
        }
        print "\nlargest eigen index: $largest_eigen_index\n";
        print "\nsmallest eigen index: $smallest_eigen_index\n\n";
    }
    # GSL returns eigenvector components in complex polar form "[mag,theta]";
    # a theta of "0" means +mag and "pi" means -mag.  Anything else would mean
    # a genuinely complex eigenvector, which is fatal here.
    my @all_my_eigenvecs;
    foreach my $i (0..$num_of_eigens-1) {
        my @vec = $eigenvectors->[$i]->as_list;
        my @eigenvec;
        foreach my $ele (@vec) {
            my ($mag, $theta) = $ele =~ /\[(\d*\.?\d*e?[+-]?\d*),(\S+)\]/;
            if ($theta eq "0") {
                push @eigenvec, $mag;
            } elsif ($theta eq "pi") {
                push @eigenvec, -1.0 * $mag;
            } else {
                die "Eigendecomposition produced a complex eigenvector!";
            }
        }
        print "Eigenvector $i: @eigenvec\n" if $self->{_debug2};
        push @all_my_eigenvecs, \@eigenvec;
    }
    if ($self->{_debug2}) {
        my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list;
        print "\nLargest eigenvector of A: @largest_eigen_vec\n";
    }
    # Sort eigenvectors by eigenvalue, largest first:
    my @sorted_eigenvec_indexes = sort {$eigenvalues->[$b] <=> $eigenvalues->[$a]} 0..@all_my_eigenvecs-1;
    print "sorted eigenvec indexes for A: @sorted_eigenvec_indexes\n" if $self->{_debug2};
    my @sorted_eigenvecs;
    my @sorted_eigenvals;
    foreach my $i (0..@sorted_eigenvec_indexes-1) {
        $sorted_eigenvecs[$i] = $all_my_eigenvecs[$sorted_eigenvec_indexes[$i]];
        $sorted_eigenvals[$i] = $eigenvalues->[$sorted_eigenvec_indexes[$i]];
    }
    if ($self->{_debug2}) {
        print "\nHere come sorted eigenvectors for A --- from the largest to the smallest:\n";
        foreach my $i (0..@sorted_eigenvecs-1) {
            print "eigenvec: @{$sorted_eigenvecs[$i]} eigenvalue: $sorted_eigenvals[$i]\n";
        }
    }
    # The partitioning vector is the eigenvector for the SECOND-SMALLEST
    # eigenvalue (second from the end of the descending-sorted list); its signs
    # split the graph nodes into two groups.
    my $best_partitioning_eigenvec = $sorted_eigenvecs[@sorted_eigenvec_indexes-2];
    print "\nBest graph partitioning eigenvector: @$best_partitioning_eigenvec\n" if $self->{_terminal_output};
    my $how_many_positive = reduce {$a + $b} map {$_ > 0 ? 1 : 0 } @$best_partitioning_eigenvec;
    my $how_many_negative = scalar(@$best_partitioning_eigenvec) - $how_many_positive;
    print "Have $how_many_positive positive and $how_many_negative negative elements in the partitioning vec\n"
        if $self->{_terminal_output};
    if ($how_many_clusters_looking_for <= 3) {
        # Near the end of the search: if one side of the partition is a single
        # node, that node becomes a finalized cluster; if one side is small
        # enough (at most _cluster_search_multiplier nodes), its members are
        # merged into one finalized cluster.  Otherwise neither side is
        # dominant and we give up.
        my @merged_cluster;
        my $final_cluster;
        my @newclusters;
        if ($how_many_positive == 1) {
            foreach my $i (0..@$clusters-1) {
                if ($best_partitioning_eigenvec->[$i] > 0) {
                    $final_cluster = $clusters->[$i];
                } else {
                    push @newclusters, $clusters->[$i];
                }
            }
            return ([$final_cluster], \@newclusters);
        } elsif ($how_many_negative == 1) {
            foreach my $i (0..@$clusters-1) {
                if ($best_partitioning_eigenvec->[$i] < 0) {
                    $final_cluster = $clusters->[$i];
                } else {
                    push @newclusters, $clusters->[$i];
                }
            }
            return ([$final_cluster], \@newclusters);
        } elsif ($how_many_positive <= $self->{_cluster_search_multiplier}) {
            foreach my $i (0..@$clusters-1) {
                if ($best_partitioning_eigenvec->[$i] > 0) {
                    push @merged_cluster, @{$clusters->[$i]};
                } else {
                    push @newclusters, $clusters->[$i];
                }
            }
            return ([\@merged_cluster], \@newclusters);
        } elsif ($how_many_negative <= $self->{_cluster_search_multiplier}) {
            foreach my $i (0..@$clusters-1) {
                if ($best_partitioning_eigenvec->[$i] < 0) {
                    push @merged_cluster, @{$clusters->[$i]};
                } else {
                    push @newclusters, $clusters->[$i];
                }
            }
            return ([\@merged_cluster], \@newclusters);
        } else {
            die "\n\nBailing out!\n\n" .
                "No consensus support for dominant clusters in the graph partitioning step\n" .
                "of the algorithm. This can be caused by bad random selection of initial\n" .
                "cluster centers. Please run this program again.\n";
        }
    } else {
        # Still early in the search: simply split the clusters into the
        # positive-sign and negative-sign groups and return both.
        my @positive_clusters;
        my @negative_clusters;
        foreach my $i (0..@$clusters-1) {
            if ($best_partitioning_eigenvec->[$i] > 0) {
                push @positive_clusters, $clusters->[$i];
            } else {
                push @negative_clusters, $clusters->[$i];
            }
        }
        return (\@positive_clusters, \@negative_clusters);
    }
}
767              
# Computes two raw dissimilarity measures between a pair of clusters:
# (1) the total cross-reconstruction error -- the summed error of projecting
# every point of cluster 1 onto cluster 2's subspace plus every point of
# cluster 2 onto cluster 1's subspace -- and (2) the squared Euclidean distance
# between the two cluster means.  Both are returned raw; the caller
# (graph_partition) normalizes and exponentiates them into similarities.
sub pairwise_cluster_similarity {
    my $self = shift;
    my $cluster1 = shift;                           # ref to array of data tags
    my $trailing_eigenvec_matrix_cluster1 = shift;  # trailing eigenvectors of cluster 1's subspace
    my $reference_vec_cluster1 = shift;             # mean of cluster 1 (column matrix)
    my $cluster2 = shift;
    my $trailing_eigenvec_matrix_cluster2 = shift;
    my $reference_vec_cluster2 = shift;
    # NOTE(review): this accumulator is never updated or read -- dead variable.
    my $total_reconstruction_error_in_this_iteration = 0;
    # Error of each cluster-1 point against cluster 2's subspace:
    my @errors_for_1_on_2 = map {my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
                                 $data_vec->set_col(0, $self->{_data_hash}->{$_});
                                 $self->reconstruction_error($data_vec,
                                                             $trailing_eigenvec_matrix_cluster2,
                                                             $reference_vec_cluster2)}
                            @$cluster1;
    # Error of each cluster-2 point against cluster 1's subspace:
    my @errors_for_2_on_1 = map {my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
                                 $data_vec->set_col(0, $self->{_data_hash}->{$_});
                                 $self->reconstruction_error($data_vec,
                                                             $trailing_eigenvec_matrix_cluster1,
                                                             $reference_vec_cluster1)}
                            @$cluster2;
    # Sum of absolute errors in each direction:
    my $type_1_error = reduce {abs($a) + abs($b)} @errors_for_1_on_2;
    my $type_2_error = reduce {abs($a) + abs($b)} @errors_for_2_on_1;
    my $total_reconstruction_error = $type_1_error + $type_2_error;
    # Squared distance between the two means: (m1 - m2)' * (m1 - m2) is a 1x1 matrix.
    my $diff_between_the_means = $reference_vec_cluster1 - $reference_vec_cluster2;
    my $dist_squared = transpose($diff_between_the_means) * $diff_between_the_means;
    my @dist_squared_as_list = $dist_squared->as_list();
    my $dist_between_means_based_error = shift @dist_squared_as_list;
    return ($total_reconstruction_error, $dist_between_means_based_error);
}
798              
# Measures how concentrated a cluster is around its mean: for each dimension,
# collect the tags whose coordinate lies within a "delta ball" window of
# +/- ($delta * per-dimension range) around the mean, intersect those tag sets
# across all dimensions, and return |intersection| / |cluster|.
#
# Arguments: $cluster (ref to array of data tags) and $mean (a column matrix
# supporting ->as_list).  Returns a quotient in [0, 1].
sub cluster_unimodality_quotient {
    my $self = shift;
    my $cluster = shift;
    my $mean = shift;
    my $delta = 0.4 * $self->{_scale_factor}; # Radius of the delta ball along each dimension
    my @mean = $mean->as_list;
    my @data_tags_for_range_tests;
    foreach my $dimen (0..$self->{_data_dimensions}-1) {
        my @values = map {$_->[$dimen]} map {$self->{_data_hash}->{$_}} @$cluster;
        my ($min, $max) = (List::Util::min(@values), List::Util::max(@values));
        my $range = $max - $min;
        my $mean_along_this_dimen = $mean[$dimen];
        # BUGFIX: filter with `defined' rather than plain truth so that a data
        # tag named "0" (a false but defined value) is not silently dropped.
        my @tags = grep {defined $_}
                   map { ( ($self->{_data_hash}->{$_}->[$dimen] > $mean_along_this_dimen - $delta * $range)
                           &&
                           ($self->{_data_hash}->{$_}->[$dimen] < $mean_along_this_dimen + $delta * $range) )
                         ? $_ : undef }
                   @$cluster;
        push @data_tags_for_range_tests, \@tags;
    }
    # Now find the intersection of the tag sets for each of the dimensions
    my %intersection_hash;
    foreach my $dimen (0..$self->{_data_dimensions}-1) {
        my %tag_hash_for_this_dimen = map {$_ => 1} @{$data_tags_for_range_tests[$dimen]};
        if ($dimen == 0) {
            %intersection_hash = %tag_hash_for_this_dimen;
        } else {
            %intersection_hash = map {$_ => 1} grep {$tag_hash_for_this_dimen{$_}}
                                 keys %intersection_hash;
        }
    }
    my @intersection_set = keys %intersection_hash;
    my $cluster_unimodality_index = scalar(@intersection_set) / scalar(@$cluster);
    return $cluster_unimodality_index;
}
835              
# Searches for the reference vector that minimizes the total reconstruction
# error with respect to the subspace defined by $trailing_eigenvec_matrix.
# Candidates are 100 points drawn uniformly from the cluster's per-dimension
# bounding box, plus the cluster mean itself (candidate 0).  Note that the
# error for each candidate is summed over ALL data tags in $self->{_data_tags},
# not just over the given cluster -- preserved from the original logic.
#
# Returns a ref to the coordinate array of the best candidate.
sub find_best_ref_vector {
    my $self = shift;
    my $cluster = shift;                  # ref to array of data tags
    my $trailing_eigenvec_matrix = shift;
    my $mean = shift;                     # a GSL matrix ref (column vector)
    my @min_bounds;
    my @max_bounds;
    my @ranges;
    # Per-dimension bounding box of the cluster's data points:
    foreach my $dimen (0..$self->{_data_dimensions}-1) {
        my @values = map {$_->[$dimen]} map {$self->{_data_hash}->{$_}} @$cluster;
        my ($min, $max) = (List::Util::min(@values), List::Util::max(@values));
        push @min_bounds, $min;
        push @max_bounds, $max;
        push @ranges, $max - $min;
    }
    print "min bounds are: @min_bounds\n";
    print "max bounds are: @max_bounds\n";
    my $max_iterations = 100;
    my @random_points;
    my $iteration = 0;
    while ($iteration++ < $max_iterations) {
        my @coordinate_vec;
        foreach my $dimen (0..$self->{_data_dimensions}-1) {
            push @coordinate_vec, $min_bounds[$dimen] + rand($ranges[$dimen]);
        }
        push @random_points, \@coordinate_vec;
    }
    if ($self->{_debug}) {
        print "\nrandom points\n";
        map {print "@$_\n"} @random_points;
    }
    # The cluster mean is candidate 0, so it competes with the random points:
    my @mean = $mean->as_list;
    unshift @random_points, \@mean;
    my @reconstruction_errors;
    foreach my $candidate_ref_vec (@random_points) {
        my $ref_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
        $ref_vec->set_col(0, $candidate_ref_vec);
        my $reconstruction_error_for_a_ref_vec = 0;
        foreach my $data_tag (@{$self->{_data_tags}}) {
            my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
            $data_vec->set_col(0, $self->{_data_hash}->{$data_tag});
            my $error = $self->reconstruction_error($data_vec,$trailing_eigenvec_matrix,$ref_vec);
            $reconstruction_error_for_a_ref_vec += $error;
        }
        push @reconstruction_errors, $reconstruction_error_for_a_ref_vec;
    }
    # BUGFIX: the original used List::Util::first(), which returns the minimum
    # *value* (a float) rather than its index, and its bookkeeping was off by
    # one after shifting the mean's error off the list; it also never actually
    # compared the mean against the random candidates.  Here minimum() gives
    # the positional index of the smallest total error over ALL candidates
    # (index 0 being the mean), kept aligned with @random_points.
    my ($smallest_error, $minindex) = minimum(\@reconstruction_errors);
    return $random_points[$minindex];
}
889              
## The reconstruction error relates to the size of the perpendicular from a data
## point X to the hyperplane that defines a given subspace on the manifold.
sub reconstruction_error {
    my ($self, $data_vec, $trailing_eigenvecs, $ref_vec) = @_;
    # Project the mean-shifted data vector onto the trailing eigenvectors (the
    # directions orthogonal to the subspace); the quadratic form below is a
    # 1x1 matrix holding the squared magnitude of that projection.
    my $shifted = $data_vec - $ref_vec;
    my $error_squared = transpose($shifted) * $trailing_eigenvecs *
                        transpose($trailing_eigenvecs) * $shifted;
    return ($error_squared->as_list())[0];
}
903              
# Returns a set of K random integers that serve as indices into the data array;
# the data elements at those indices become the initial cluster centers.  The
# returned indices are guaranteed distinct.  As a quality check, the span of
# the chosen indices (max - min) must exceed 30% of the data store size;
# otherwise the whole set is discarded and drawn afresh.
sub initialize_cluster_centers {
    my $self = shift;
    my $K = shift;                 # set to the parameter KM by the caller
    my $data_store_size = shift;
    my @cluster_center_indices;
    while (1) {
        # Draw K *distinct* random indices.  BUGFIX: the original re-drew a
        # colliding index only against the one collision it had just found,
        # which could re-introduce a collision with an earlier index; here
        # every accepted draw is checked against all previous ones.
        my %already_chosen;
        @cluster_center_indices = ();
        while (@cluster_center_indices < $K) {
            my $candidate = int rand($data_store_size);
            next if $already_chosen{$candidate}++;
            push @cluster_center_indices, $candidate;
        }
        # Quality check: require the indices to span a reasonable fraction of
        # the data array; otherwise try a completely fresh draw.
        my $min = List::Util::min(@cluster_center_indices);
        my $max = List::Util::max(@cluster_center_indices);
        my $quality = ($max - $min) / $data_store_size;
        last if $quality > 0.3;
    }
    return @cluster_center_indices;
}
935              
# The purpose of this routine is to form initial clusters by assigning every
# data sample to whichever of the given cluster centers it is closest to.
# Argument: ref to an array of center coordinate vectors.  Returns a ref to an
# array of clusters (each a ref to an array of data tags).
sub assign_data_to_clusters_initial {
    my ($self, $centers_ref) = @_;
    my @cluster_centers = @$centers_ref;
    my @clusters;
    foreach my $tag (@{$self->{_data_tags}}) {
        # Distance from this sample to every center, in center order:
        my @distances = map { $self->distance($tag, $_) } @cluster_centers;
        my ($closest_distance, $winning_index) = minimum(\@distances);
        push @{$clusters[$winning_index]}, $tag;
    }
    return \@clusters;
}
954              
# The following routine is for computing the distance between a data point specified
# by its symbolic name in the master datafile and a point (such as the center of a
# cluster) expressed as a vector of coordinates:
sub distance {
    my ($self, $ele1_id, $ele2_ref) = @_;
    my @point1 = @{$self->{_data_hash}->{$ele1_id}};
    my @point2 = @$ele2_ref;
    die "wrong data types for distance calculation\n" if @point1 != @point2;
    # Euclidean distance between the two coordinate vectors:
    my $squared_sum = 0;
    foreach my $i (0 .. $#point1) {
        $squared_sum += ($point1[$i] - $point2[$i]) ** 2;
    }
    return sqrt $squared_sum;
}
972              
# Writes each cluster's data tags, sorted and space-separated, to its own file
# cluster0.txt, cluster1.txt, ...  Any cluster*.txt files left over from a
# previous run are removed first.
sub write_clusters_to_files {
    my $self = shift;
    my $clusters = shift;
    my @clusters = @$clusters;
    unlink glob "cluster*.txt";    # remove stale output from earlier runs
    foreach my $i (0..@clusters-1) {
        my $filename = "cluster" . $i . ".txt";
        print "\nWriting cluster $i to file $filename\n" if $self->{_terminal_output};
        # BUGFIX/SECURITY: the original used a bareword filehandle and a 2-arg
        # open that piped through the external `sort' command -- which, since
        # the tags were printed without newlines, sorted nothing.  Sort the
        # tags in Perl and write with a lexical 3-arg open instead.
        open my $fh, '>', $filename or die "Unable to open file: $!";
        foreach my $ele (sort @{$clusters[$i]}) {
            print {$fh} "$ele ";
        }
        print {$fh} "\n";
        close $fh or die "Unable to close file: $!";
    }
}
988              
# Destructor: removes the "__temp_<datafile>.txt" scratch file created for the
# gnuplot visualization step.
sub DESTROY {
    my $self = shift;
    my $filename = basename($self->{_datafile});
    $filename =~ s/\.\w+$/\.txt/;
    unlink "__temp_" . $filename;
}
994              
995             ################################## Visualization Code ###################################
996              
# Component-wise sum of the data vectors named by the given list of data tags.
# Argument: ref to an array of data element names.  Returns a ref to the sum
# vector (initialized to 0.0 in each of the _data_dimensions components).
sub add_point_coords {
    my ($self, $ids_ref) = @_;
    my $dims = $self->{_data_dimensions};
    my @sum = (0.0) x $dims;
    foreach my $id (@$ids_ref) {
        my $point = $self->{_data_hash}->{$id};
        $sum[$_] += $point->[$_] for 0 .. $#{$point};
    }
    return \@sum;
}
1015              
# This is the main module version: renders the clusters as colored point sets
# on a wireframe sphere via gnuplot.  Writes the (relabeled) cluster points to
# a temp file, one gnuplot "index" block per cluster, then issues the plot
# commands.  Arguments: a title string, the clusters (array of arrays of data
# tags; deep-copied so the caller's structure is untouched), an optional
# hardcopy format (e.g. "png") and an optional pause time in seconds.
sub visualize_clusters_on_sphere {
    my $self = shift;
    my $visualization_msg = shift;
    my $clusters = deep_copy_AoA(shift);
    my $hardcopy_format = shift;
    my $pause_time = shift;
    my $d = $self->{_data_dimensions};
    my $temp_file = "__temp_" . $self->{_datafile};
    $temp_file =~ s/\.\w+$/\.txt/;
    unlink $temp_file if -e $temp_file;
    open OUTPUT, ">$temp_file"
        or die "Unable to open a temp file in this directory: $!";
    # Re-tag each cluster's members with a single-letter cluster label (A, B,
    # ...) and append the point's coordinates to the tag via the /e substitution:
    my @all_tags = "A".."Z";
    my @retagged_clusters;
    foreach my $cluster (@$clusters) {
        my $label = shift @all_tags;
        my @retagged_cluster =
            map {$_ =~ s/^(\w+?)_(\w+)/$label . "_$2 @{$self->{_data_hash}->{$_}}"/e;$_} @$cluster;
        push @retagged_clusters, \@retagged_cluster;
    }
    # Group coordinate triples by their single-letter cluster label:
    my %clusters;
    foreach my $cluster (@retagged_clusters) {
        foreach my $record (@$cluster) {
            my @splits = grep $_, split /\s+/, $record;
            $splits[0] =~ /(\w+?)_.*/;
            my $primary_cluster_label = $1;
            my @coords = @splits[1..$d];
            push @{$clusters{$primary_cluster_label}}, \@coords;
        }
    }
    # One blank-line-separated block per cluster, in case-insensitive label
    # order, so gnuplot can address each cluster by its "index":
    foreach my $key (sort {"\L$a" cmp "\L$b"} keys %clusters) {
        map {print OUTPUT "$_"} map {"@$_\n"} @{$clusters{$key}};
        print OUTPUT "\n\n";
    }
    my @sorted_cluster_keys = sort {"\L$a" cmp "\L$b"} keys %clusters;
    close OUTPUT;
    # Without a pause time the plot window must persist after the script exits:
    my $plot;
    unless (defined $pause_time) {
        $plot = Graphics::GnuplotIF->new( persist => 1 );
    } else {
        $plot = Graphics::GnuplotIF->new();
    }
    my $arg_string = "";
    $plot->gnuplot_cmd( "set hidden3d" ) unless $self->{_show_hidden_in_3D_plots};
    $plot->gnuplot_cmd( "set title \"$visualization_msg\"" );
    $plot->gnuplot_cmd( "set noclip" );
    $plot->gnuplot_cmd( "set pointsize 2" );
    $plot->gnuplot_cmd( "set parametric" );
    $plot->gnuplot_cmd( "set size ratio 1" );
    $plot->gnuplot_cmd( "set xlabel \"X\"" );
    $plot->gnuplot_cmd( "set ylabel \"Y\"" );
    $plot->gnuplot_cmd( "set zlabel \"Z\"" );
    if ($hardcopy_format) {
        $plot->gnuplot_cmd( "set terminal png" );
        my $image_file_name = "$visualization_msg\.$hardcopy_format";
        $plot->gnuplot_cmd( "set output \"$image_file_name\"" );
        $plot->gnuplot_cmd( "unset hidden3d" );
    }
    # set the range for azimuth angles:
    $plot->gnuplot_cmd( "set urange [0:2*pi]" );
    # set the range for the elevation angles:
    $plot->gnuplot_cmd( "set vrange [-pi/2:pi/2]" );
    # Parametric functions for the sphere; its radius is the data scale factor
    # when one has been computed, and 1 otherwise:
    # $plot->gnuplot_cmd( "r=1" );
    if ($self->{_scale_factor}) {
        $plot->gnuplot_cmd( "r=$self->{_scale_factor}" );
    } else {
        $plot->gnuplot_cmd( "r=1" );
    }
    $plot->gnuplot_cmd( "fx(v,u) = r*cos(v)*cos(u)" );
    $plot->gnuplot_cmd( "fy(v,u) = r*cos(v)*sin(u)" );
    $plot->gnuplot_cmd( "fz(v) = r*sin(v)" );
    my $sphere_arg_str = "fx(v,u),fy(v,u),fz(v) notitle with lines lt 0,";
    foreach my $i (0..scalar(keys %clusters)-1) {
        my $j = $i + 1;
        # The following statement puts the titles on the data points
        $arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"$sorted_cluster_keys[$i] \" with points lt $j pt $j, ";
    }
    # NOTE(review): the next statement assigns the boolean *result* of the
    # match to $arg_string; it only works because $1 (the string minus its
    # trailing ", ") is read back on the following line.
    $arg_string = $arg_string =~ /^(.*),[ ]+$/;
    $arg_string = $1;
    $plot->gnuplot_cmd( "splot $sphere_arg_str $arg_string" );
    $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
}
1100              
1101              
1102             ################################### Support Routines ########################################
1103              
# Splits a cluster (ref to array of data tags) into $how_many contiguous
# fragments of roughly equal size and returns them as a list of array refs.
# When the cluster size is not a multiple of $how_many, the leftover elements
# go into the last fragment.  Dies if the split is impossible or if any element
# would be lost.
sub cluster_split {
    my $cluster = shift;
    my $how_many = shift;
    # Guard against a zero $delta below (which was a division-by-zero crash in
    # the original when $how_many exceeded the cluster size):
    die "cannot split a cluster of size " . scalar(@$cluster) .
        " into $how_many fragments\n" if $how_many < 1 || $how_many > @$cluster;
    my @cluster_fragments = map { [] } 0 .. $how_many - 1;
    my $delta = int( scalar(@$cluster) / $how_many );
    foreach my $i (0..@$cluster-1) {
        my $slot = int($i / $delta);
        # BUGFIX: the original could index past the last fragment (returning
        # more fragments than requested); clamp so leftovers land in the final
        # fragment.
        $slot = $how_many - 1 if $slot > $how_many - 1;
        push @{$cluster_fragments[$slot]}, $cluster->[$i];
    }
    my $how_many_accounted_for = reduce {$a + $b} map {scalar(@$_)} @cluster_fragments;
    foreach my $frag (@cluster_fragments) {
        print "fragment: @$frag\n";
    }
    die "the fragmentation could not account for all the data"
        unless @$cluster == $how_many_accounted_for;
    return @cluster_fragments;
}
1124              
# Returns a two-element list (value, index) for the smallest element of the
# given arrayref.  Ties keep the earliest occurrence; an empty array yields
# (undef, undef).
sub minimum {
    my $arr = shift;
    my ($best_val, $best_pos);
    my $pos = 0;
    for my $val (@$arr) {
        if (!defined($best_val) or $val < $best_val) {
            $best_val = $val;
            $best_pos = $pos;
        }
        $pos++;
    }
    return ($best_val, $best_pos);
}
1138              
# Returns a two-element list (min, max) over the defined elements of the given
# arrayref.  Undefined entries are skipped; if no element is defined, both
# returned values are undef.
sub minmax {
    my $arr = shift;
    my ($low, $high);
    for my $val (@$arr) {
        next unless defined $val;
        if (!defined $low) {
            # first defined element seeds both extremes:
            ($low, $high) = ($val, $val);
            next;
        }
        $low  = $val if $val < $low;
        $high = $val if $val > $high;
    }
    return ($low, $high);
}
1156              
# Meant only for constructing a deep copy of an array of arrays: returns a new
# arrayref whose rows are fresh arrayrefs holding copies of the input's scalars,
# so mutating the copy never touches the original.
#
# Fix: the previous element-by-element loop left empty inner rows (and an empty
# outer array) as undef rather than as empty arrayrefs, which broke later
# dereferencing; every row is now copied structurally.
sub deep_copy_AoA {
    my $ref_in = shift;
    return [ map { [ @$_ ] } @$ref_in ];
}
1168              
# For displaying the individual clusters on a terminal screen.  Each cluster is
# displayed through the symbolic names associated with the data points.  Note
# that each cluster is sorted in place, so the caller's arrays are mutated.
sub display_clusters {
    my $clusters = shift;
    for my $idx (0 .. $#{$clusters}) {
        my $cluster = $clusters->[$idx];
        @$cluster = sort @$cluster;          # in-place sort, visible to caller
        my $num_records = scalar @$cluster;
        print "\n\nCluster $idx ($num_records records):\n";
        print " $_" for @$cluster;
    }
    print "\n\n";
}
1185              
# Convenience wrapper: prints a labeled dump of a cluster's mean vector followed
# by its covariance matrix, delegating the actual formatting to display_matrix().
sub display_mean_and_covariance {
    my ($mean, $covariance) = @_;
    print "\nDisplay the mean:\n";
    display_matrix($mean);
    print "\nDisplay the covariance:\n";
    display_matrix($covariance);
}
1194              
# Validates the constructor's keyword-argument names.  Returns 1 when every
# name in @params is on the legal list, 0 as soon as one is not.
#
# Fixes: (1) an empty @params used to return undef, which the caller's
# "== 0" test treated as illegal and croaked spuriously; it now returns 1.
# (2) The hand-rolled double loop with a reset flag is replaced by List::Util's
# any(), which the file already imports.
sub check_for_illegal_params {
    my @params = @_;
    my @legal_params = qw / datafile
                            mask
                            K
                            P
                            terminal_output
                            cluster_search_multiplier
                            max_iterations
                            delta_reconstruction_error
                            visualize_each_iteration
                            show_hidden_in_3D_plots
                            make_png_for_each_iteration
                            debug
                          /;
    foreach my $param (@params) {
        return 0 unless any { $param eq $_ } @legal_params;
    }
    return 1;
}
1223              
# Pretty-prints a Math::GSL::Matrix to STDOUT, four decimal places per entry,
# one matrix row per output line, with a size banner first.
#
# Fixes: map in void context (used purely for its printf side effect) replaced
# by a statement-modifier for loop; stale commented-out debug line removed.
sub display_matrix {
    my $matrix = shift;
    my $nrows = $matrix->rows();
    my $ncols = $matrix->cols();
    print "\nDisplaying a matrix of size $nrows rows and $ncols columns:\n";
    foreach my $i (0..$nrows-1) {
        printf("%.4f ", $_) for $matrix->row($i)->as_list;
        print "\n";
    }
    print "\n\n";
}
1238              
# Returns the transpose of a Math::GSL::Matrix as a new matrix: row i of the
# input becomes column i of the result.  The input matrix is not modified.
sub transpose {
    my $matrix = shift;
    my $nrows = $matrix->rows();
    my $ncols = $matrix->cols();
    # dimensions swap: an (nrows x ncols) input yields an (ncols x nrows) result
    my $result = Math::GSL::Matrix->new($ncols, $nrows);
    for my $row_index (0 .. $nrows - 1) {
        my @row_values = $matrix->row($row_index)->as_list;
        $result->set_col($row_index, \@row_values);
    }
    return $result;
}
1250              
# Element-wise difference of two equal-length arrayrefs, returned as a flat
# list ($vec1 minus $vec2).  Dies when the lengths disagree.
sub vector_subtract {
    my ($vec1, $vec2) = @_;
    die "wrong data types for vector subtract calculation\n" if @$vec1 != @$vec2;
    return map { $vec1->[$_] - $vec2->[$_] } 0 .. $#$vec1;
}
1261              
# Classic Fisher-Yates in-place shuffle (from perl docs): walks the array from
# the last index down to 1, swapping each slot with a uniformly chosen earlier
# (or same) slot.  The rand() call sequence matches the original exactly.
sub fisher_yates_shuffle {
    my $arr = shift;
    for (my $i = $#$arr; $i > 0; $i--) {
        my $swap_with = int rand($i + 1);
        ($arr->[$i], $arr->[$swap_with]) = ($arr->[$swap_with], $arr->[$i]);
    }
}
1271              
1272             ######################### Generating Synthetic Data for Manifold Clustering ##########################
1273              
1274             ################################## Class DataGenerator ########################################
1275              
1276             ## The embedded class defined below is for generating synthetic data for
1277             ## experimenting with linear manifold clustering when the data resides on the
1278             ## surface of a sphere. See the script generate_data_on_a_sphere.pl in the
1279             ## `examples' directory for how to specify the number of clusters and the spread of
1280             ## each cluster in the data that is generated.
1281              
1282             package DataGenerator;
1283              
1284             use strict;
1285             use Carp;
1286              
# Constructor for the synthetic-data generator.  Required: output_file and
# total_number_of_samples_needed (croaks if missing).  Optional with defaults:
# number_of_clusters_on_sphere (3), cluster_width (0.1),
# show_hidden_in_3D_plots (1), debug (0).
#
# Fix: show_hidden_in_3D_plots and debug now default with the defined-or
# operator.  The previous "|| 1" silently turned an explicit
# show_hidden_in_3D_plots => 0 (the documented usage in the SYNOPSIS) back
# into 1, making it impossible to disable.  The file already requires 5.10,
# so // is available.
sub new {
    my ($class, %args) = @_;
    my @params = keys %args;
    croak "\nYou have used a wrong name for a keyword argument " .
          "--- perhaps a misspelling\n"
          if _check_for_illegal_params3(@params) == 0;
    bless {
        _output_file                    => $args{output_file}
                                           || croak("name for output_file required"),
        _total_number_of_samples_needed => $args{total_number_of_samples_needed}
                                           || croak("total_number_of_samples_needed required"),
        _number_of_clusters_on_sphere   => $args{number_of_clusters_on_sphere} || 3,
        _cluster_width                  => $args{cluster_width} || 0.1,
        _show_hidden_in_3D_plots        => $args{show_hidden_in_3D_plots} // 1,
        _debug                          => $args{debug} // 0,
    }, $class;
}
1304              
# Validates the DataGenerator constructor's keyword-argument names.  Returns 1
# when every name in @params is legal, 0 otherwise.
#
# Fixes: (1) 'debug' was missing from the legal list even though new() accepts
# and stores $args{debug}, so any caller passing debug => ... croaked; it is
# now legal.  (2) An empty @params used to return undef, which the caller's
# "== 0" test treated as illegal; it now returns 1.
sub _check_for_illegal_params3 {
    my @params = @_;
    my @legal_params = qw / output_file
                            total_number_of_samples_needed
                            number_of_clusters_on_sphere
                            cluster_width
                            show_hidden_in_3D_plots
                            debug
                          /;
    foreach my $param (@params) {
        my $is_legal = 0;
        foreach my $legal (@legal_params) {
            if ($param eq $legal) {
                $is_legal = 1;
                last;
            }
        }
        return 0 unless $is_legal;
    }
    return 1;
}
1326              
1327             ## We first generate a set of points randomly on the unit sphere --- the number of
1328             ## points being equal to the number of clusters desired. These points will serve as
1329             ## cluster means (or, as cluster centroids) subsequently when we ask
1330             ## Math::Random::random_multivariate_normal($N, @m, @covar) to return $N number of
1331             ## points on the sphere. The second argument is the cluster mean and the third
1332             ## argument the cluster covariance. For the synthetic data, we set the cluster
1333             ## covariance to a 2x2 diagonal matrix, with the (0,0) element corresponding to the
1334             ## variance along the azimuth direction and the (1,1) element corresponding to the
1335             ## variance along the elevation direction.
1336             ##
1337             ## When you generate the points in the 2D spherical coordinates of
1338             ## (azimuth,elevation), you also need `wrap-around' logic for those points yielded by
1339             ## the multivariate-normal function whose azimuth angle is outside the interval
1340             ## (0,360) and/or whose elevation angle is outside the interval (-90,90).
1341             ##
1342             ## Note that the first of the two dimensions for which the multivariate-normal
1343             ## function returns the points is for the azimuth angle and the second for the
1344             ## elevation angle.
1345             ##
1346             ## With regard to the relationship of the Cartesian coordinates to the spherical
1347             ## (azimuth, elevation) coordinates, we assume that (x,y) is the horizontal plane
## and z the vertical axis. The elevation angle theta is measured with respect to
1349             ## the XY-plane. The highest point on the sphere (the Zenith) corresponds to the
1350             ## elevation angle of +90 and the lowest points on the sphere (the Nadir)
## corresponds to the elevation angle of -90. The azimuth is measured with respect
## to the X-axis. The range of the azimuth is from 0 to 360 degrees. The elevation is
1353             ## measured from the XY plane and its range is (-90,90) degrees.
# Generates the synthetic data set: $K Gaussian clusters in (azimuth,
# elevation) space, wraps out-of-range angles back onto the sphere, labels and
# shuffles the points, and writes them as "label,x,y,z" CSV rows to the
# configured output file.
sub gen_data_and_write_to_csv {
    my $self = shift;
    my $K = $self->{_number_of_clusters_on_sphere};
    # $N is the number of samples to be generated for each cluster:
    my $N = int($self->{_total_number_of_samples_needed} / $K);
    my $output_file = $self->{_output_file};
    # Designate all of the data elements in a cluster by a letter that is followed by
    # an integer that identifies a specific data element.
    my @point_labels = ('a'..'z');
    # Our first job is to define $K random points in the 2D space (azimuth,
    # elevation) to serve as cluster centers on the surface of the sphere. This we
    # do by calling a uniformly distributed 1-D random number generator, first for
    # the azimuth and then for the elevation in the loop shown below:
    my @cluster_centers;
    my @covariances;
    foreach my $i (0..$K-1) {
        my $azimuth = rand(360);
        # NOTE(review): rand(90) - 90 yields elevations in (-90,0) only, i.e.
        # every cluster center lands in the lower hemisphere, although the
        # header comment above states the elevation range is (-90,90) ---
        # confirm whether rand(180) - 90 was intended.
        my $elevation = rand(90) - 90;
        my @mean = ($azimuth, $elevation);
        push @cluster_centers, \@mean;
        my $cluster_covariance;
        # The j-th dimension is for azimuth and k-th for elevation for the directions
        # to surface of the sphere: a diagonal 2x2 covariance whose standard
        # deviations are _cluster_width scaled by the full angular ranges
        # (360 for azimuth, 180 for elevation).
        foreach my $j (0..1) {
            foreach my $k (0..1) {
                $cluster_covariance->[$j]->[$k] = ($self->{_cluster_width} * 360.0) ** 2
                    if $j == 0 && $k == 0;
                $cluster_covariance->[$j]->[$k] = ($self->{_cluster_width} * 180.0) ** 2
                    if $j == 1 && $k == 1;
                $cluster_covariance->[$j]->[$k] = 0.0 if $j != $k;
            }
        }
        push @covariances, $cluster_covariance;
    }
    if ($self->{_debug}) {
        foreach my $i (0..$K-1) {
            print "\n\nCluster center: @{$cluster_centers[$i]}\n";
            print "\nCovariance:\n";
            foreach my $j (0..1) {
                foreach my $k (0..1) {
                    print "$covariances[$i]->[$j]->[$k] ";
                }
                print "\n";
            }
        }
    }
    my @data_dump;
    foreach my $i (0..$K-1) {
        my @m = @{shift @cluster_centers};
        my @covar = @{shift @covariances};
        # $N samples drawn around this cluster's (azimuth, elevation) mean:
        my @new_data = Math::Random::random_multivariate_normal($N, @m, @covar);
        if ($self->{_debug}) {
            print "\nThe points for cluster $i:\n";
            map { print "@$_ "; } @new_data;
            print "\n\n";
        }
        # Wrap-around logic: azimuths outside [0,360) wrap modulo 360; an
        # elevation beyond +/-90 is reflected back and its azimuth flipped by
        # 180 degrees (crossing over a pole lands on the opposite meridian).
        my @wrapped_data;
        foreach my $d (@new_data) {
            my $wrapped_d;
            if ($d->[0] >= 360.0) {
                $wrapped_d->[0] = $d->[0] - 360.0;
            } elsif ($d->[0] < 0) {
                $wrapped_d->[0] = 360.0 - abs($d->[0]);
            }
            if ($d->[1] >= 90.0) {
                $wrapped_d->[0] = POSIX::fmod($d->[0] + 180.0, 360);
                $wrapped_d->[1] = 180.0 - $d->[1];
            } elsif ($d->[1] < -90.0) {
                $wrapped_d->[0] = POSIX::fmod($d->[0] + 180, 360);
                $wrapped_d->[1] = -180.0 - $d->[1];
            }
            # Angles already in range pass through unchanged:
            $wrapped_d->[0] = $d->[0] unless defined $wrapped_d->[0];
            $wrapped_d->[1] = $d->[1] unless defined $wrapped_d->[1];
            push @wrapped_data, $wrapped_d;
        }
        if ($self->{_debug}) {
            print "\nThe unwrapped points for cluster $i:\n";
            map { print "@$_ "; } @wrapped_data;
            print "\n\n";
        }
        # Prefix each record with a tag like "a_0", "a_1", ... for cluster $i:
        my $label = $point_labels[$i];
        my $j = 0;
        @new_data = map {unshift @$_, $label."_".$j; $j++; $_} @wrapped_data;
        push @data_dump, @new_data;
    }
    if ($self->{_debug}) {
        print "\n\nThe labeled points for clusters:\n";
        map { print "@$_\n"; } @data_dump;
    }
    # Randomize record order so clusters are interleaved in the output file:
    fisher_yates_shuffle( \@data_dump );
    # NOTE(review): 2-arg bareword open with no error check --- a bad
    # $output_file fails silently; consider 3-arg lexical open with "or die".
    open OUTPUT, ">$output_file";
    my $total_num_of_points = $N * $K;
    print "Total number of data points that will be written out to the file: $total_num_of_points\n"
        if $self->{_debug};
    # Convert each (azimuth, elevation) pair to Cartesian (x,y,z) on the unit
    # sphere and write one CSV row per point.
    # NOTE(review): $azimuth and $elevation were generated in degrees above,
    # but Perl's cos()/sin() interpret their argument in radians --- confirm
    # whether a degrees-to-radians conversion was intended here.
    foreach my $ele (@data_dump) {
        my ($x,$y,$z);
        my $label = $ele->[0];
        my $azimuth = $ele->[1];
        my $elevation = $ele->[2];
        $x = cos($elevation) * cos($azimuth);
        $y = cos($elevation) * sin($azimuth);
        $z = sin($elevation);
        my $csv_str = join ",", ($label,$x,$y,$z);
        print OUTPUT "$csv_str\n";
    }
    print "\n\n";
    print "Data written out to file $output_file\n" if $self->{_debug};
    close OUTPUT;
}
1463              
# This version for the embedded class for data generation.
#
# Reads the CSV data file produced by gen_data_and_write_to_csv(), groups the
# records by the cluster letter embedded in each symbolic tag, writes the
# groups to a temp file (one blank-line-separated gnuplot "index" per cluster),
# and renders the clusters on a unit sphere with gnuplot.
#
# Fixes: (1) the slurp-read of the input handle was missing from the split
# (the filehandle read had been lost), leaving broken code --- restored;
# (2) the input was opened via its basename instead of $datafile, which fails
# whenever the caller passes a path --- now opens $datafile; (3) bareword
# 2-arg opens replaced with checked 3-arg lexical opens; (4) the trailing-comma
# strip no longer assigns the match-success flag to $arg_string first.
sub visualize_data_on_sphere {
    my $self = shift;
    my $datafile = shift;
    my $filename = File::Basename::basename($datafile);
    my $temp_file = "__temp_" . $filename;
    $temp_file =~ s/\.\w+$/\.txt/;
    unlink $temp_file if -e $temp_file;
    open my $out, '>', $temp_file
        or die "Unable to open a temp file in this directory: $!";
    open my $in, '<', $datafile or die "Unable to open $datafile: $!";
    local $/ = undef;                       # slurp mode
    my @all_records = split /\s+/, <$in>;
    close $in;
    my %clusters;
    foreach my $record (@all_records) {
        my @splits = split /,/, $record;
        my $record_name = shift @splits;
        # The cluster label is the letter prefix of a tag such as "a_17":
        $record_name =~ /(\w+?)_.*/;
        my $primary_cluster_label = $1;
        push @{$clusters{$primary_cluster_label}}, \@splits;
    }
    # One block per cluster, in case-insensitive label order:
    foreach my $key (sort {"\L$a" cmp "\L$b"} keys %clusters) {
        print {$out} "@$_\n" for @{$clusters{$key}};
        print {$out} "\n\n";
    }
    my @sorted_cluster_keys = sort {"\L$a" cmp "\L$b"} keys %clusters;
    close $out or die "Unable to close $temp_file: $!";
    my $plot = Graphics::GnuplotIF->new( persist => 1 );
    my $arg_string = "";
    $plot->gnuplot_cmd( "set noclip" );
    $plot->gnuplot_cmd( "set hidden3d" ) unless $self->{_show_hidden_in_3D_plots};
    $plot->gnuplot_cmd( "set pointsize 2" );
    $plot->gnuplot_cmd( "set parametric" );
    $plot->gnuplot_cmd( "set size ratio 1" );
    $plot->gnuplot_cmd( "set xlabel \"X\"" );
    $plot->gnuplot_cmd( "set ylabel \"Y\"" );
    $plot->gnuplot_cmd( "set zlabel \"Z\"" );
    # set the range for azimuth angles:
    $plot->gnuplot_cmd( "set urange [0:2*pi]" );
    # set the range for the elevation angles:
    $plot->gnuplot_cmd( "set vrange [-pi/2:pi/2]" );
    # Parametric functions for the sphere
    $plot->gnuplot_cmd( "r=1" );
    $plot->gnuplot_cmd( "fx(v,u) = r*cos(v)*cos(u)" );
    $plot->gnuplot_cmd( "fy(v,u) = r*cos(v)*sin(u)" );
    $plot->gnuplot_cmd( "fz(v) = r*sin(v)" );
    my $sphere_arg_str = "fx(v,u),fy(v,u),fz(v) notitle with lines lt 0,";
    foreach my $i (0..scalar(keys %clusters)-1) {
        my $j = $i + 1;
        # The following statement puts the titles on the data points
        $arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"$sorted_cluster_keys[$i] \" with points lt $j pt $j, ";
    }
    # Strip the trailing ", " before handing the plot list to gnuplot:
    ($arg_string) = $arg_string =~ /^(.*),[ ]+$/;
    $plot->gnuplot_cmd( "splot $sphere_arg_str $arg_string" );
}
1521              
# Destructor: removes the "__temp_*.txt" scratch file that
# visualize_data_on_sphere() derives from the configured output file name.
sub DESTROY {
    use File::Basename;
    my $self = shift;
    my $scratch = basename($self->{_output_file});
    $scratch =~ s/\.\w+$/\.txt/;
    unlink "__temp_" . $scratch;
}
1528              
# Classic Fisher-Yates in-place shuffle (from perl docs), duplicated here so
# the DataGenerator package is self-contained.  Walks the indices from the
# top down to 1, swapping each with a uniformly chosen earlier (or same) slot;
# the rand() call sequence matches the original exactly.
sub fisher_yates_shuffle {
    my $arr = shift;
    foreach my $i (reverse 1 .. $#$arr) {
        my $pick = int rand($i + 1);
        ($arr->[$i], $arr->[$pick]) = ($arr->[$pick], $arr->[$i]);
    }
}
1538              
1539             1;
1540              
1541             =pod
1542              
1543             =head1 NAME
1544              
1545             Algorithm::LinearManifoldDataClusterer --- for clustering data that resides on a
1546             low-dimensional manifold in a high-dimensional measurement space
1547              
1548             =head1 SYNOPSIS
1549              
1550             # You'd begin with:
1551              
1552             use Algorithm::LinearManifoldDataClusterer;
1553              
1554             # You'd next name the data file:
1555              
1556             my $datafile = "mydatafile.csv";
1557              
1558             # Your data must be in the CSV format, with one of the columns containing a unique
1559             # symbolic tag for each data record. You tell the module which column has the
1560             # symbolic tag and which columns to use for clustering through a mask such as
1561              
1562             my $mask = "N111";
1563              
1564             # which says that the symbolic tag is in the first column and that the numerical
1565             # data in the next three columns is to be used for clustering. If your data file
1566             # had, say, five columns and you wanted only the last three columns to be
1567             # clustered, the mask would become `N0111' assuming that that the symbolic tag is
1568             # still in the first column.
1569              
1570             # Now you must construct an instance of the clusterer through a call such as:
1571              
1572             my $clusterer = Algorithm::LinearManifoldDataClusterer->new(
1573             datafile => $datafile,
1574             mask => $mask,
1575             K => 3,
1576             P => 2,
1577             max_iterations => 15,
1578             cluster_search_multiplier => 2,
1579             delta_reconstruction_error => 0.001,
1580             terminal_output => 1,
1581             visualize_each_iteration => 1,
1582             show_hidden_in_3D_plots => 1,
1583             make_png_for_each_iteration => 1,
1584             );
1585              
1586             # where the parameter K specifies the number of clusters you expect to find in
1587             # your data and the parameter P is the dimensionality of the manifold on which the
1588             # data resides. The parameter cluster_search_multiplier is for increasing the
1589             # odds that the random seeds chosen initially for clustering will populate all the
1590             # clusters. Set this parameter to a low number like 2 or 3. The parameter
1591             # max_iterations places a hard limit on the number of iterations that the
1592             # algorithm is allowed. The actual number of iterations is controlled by the
1593             # parameter delta_reconstruction_error. The iterations stop when the change in
1594             # the total "reconstruction error" from one iteration to the next is smaller than
1595             # the value specified by delta_reconstruction_error.
1596              
1597             # Next, you must get the module to read the data for clustering:
1598              
1599             $clusterer->get_data_from_csv();
1600              
1601             # Finally, you invoke linear manifold clustering by:
1602              
1603             my $clusters = $clusterer->linear_manifold_clusterer();
1604              
1605             # The value returned by this call is a reference to an array of anonymous arrays,
1606             # with each anonymous array holding one cluster. If you wish, you can have the
1607             # module write the clusters to individual files by the following call:
1608              
1609             $clusterer->write_clusters_to_files($clusters);
1610              
1611             # If you want to see how the reconstruction error changes with the iterations, you
1612             # can make the call:
1613              
1614             $clusterer->display_reconstruction_errors_as_a_function_of_iterations();
1615              
1616             # When your data is 3-dimensional and when the clusters reside on a surface that
1617             # is more or less spherical, you can visualize the clusters by calling
1618              
1619             $clusterer->visualize_clusters_on_sphere("final clustering", $clusters);
1620              
1621             # where the first argument is a label to be displayed in the 3D plot and the
1622             # second argument the value returned by calling linear_manifold_clusterer().
1623              
1624             # SYNTHETIC DATA GENERATION:
1625              
1626             # The module includes an embedded class, DataGenerator, for generating synthetic
1627             # three-dimensional data that can be used to experiment with the clustering code.
1628             # The synthetic data, written out to a CSV file, consists of Gaussian clusters on
1629             # the surface of a sphere. You can control the number of clusters, the width of
1630             # each cluster, and the number of samples in the clusters by giving appropriate
1631             # values to the constructor parameters as shown below:
1632              
1633             use strict;
1634             use Algorithm::LinearManifoldDataClusterer;
1635              
1636             my $output_file = "4_clusters_on_a_sphere_1000_samples.csv";
1637              
1638             my $training_data_gen = DataGenerator->new(
1639             output_file => $output_file,
1640             cluster_width => 0.015,
1641             total_number_of_samples_needed => 1000,
1642             number_of_clusters_on_sphere => 4,
1643             show_hidden_in_3D_plots => 0,
1644             );
1645             $training_data_gen->gen_data_and_write_to_csv();
1646             $training_data_gen->visualize_data_on_sphere($output_file);
1647              
1648              
1649             =head1 DESCRIPTION
1650              
1651             If you are new to machine learning and data clustering on linear and nonlinear
1652             manifolds, your first question is likely to be: What is a manifold? A manifold is a
1653             space that is locally Euclidean. And a space is locally Euclidean if it allows for
1654             the points in a small neighborhood to be represented by Cartesian coordinates and if
1655             the distances between the points in the neighborhood are given by the Euclidean
1656             metric. For an example, the set of all points on the surface of a sphere does NOT
1657             constitute a Euclidean space. Nonetheless, if you confined your attention to a small
1658             enough neighborhood around a point, the space would seem to be locally Euclidean.
1659             The surface of a sphere is a 2-dimensional manifold embedded in a 3-dimensional
1660             space. A plane in a 3-dimensional space is also a 2-dimensional manifold. You would
1661             think of the surface of a sphere as a nonlinear manifold, whereas a plane would be a
1662             linear manifold. However, note that any nonlinear manifold is locally a linear
1663             manifold. That is, given a sufficiently small neighborhood on a nonlinear manifold,
1664             you can always think of it as a locally flat surface.
1665              
1666             As to why we need machine learning and data clustering on manifolds, there exist many
1667             important applications in which the measured data resides on a nonlinear manifold.
1668             For example, when you record images of a human face from different angles, all the
1669             image pixels taken together fall on a low-dimensional surface in a high-dimensional
1670             measurement space. The same is believed to be true for the satellite images of a land
1671             mass that are recorded with the sun at different angles with respect to the direction
1672             of the camera.
1673              
1674             Reducing the dimensionality of the sort of data mentioned above is critical to the
1675             proper functioning of downstream classification algorithms, and the most popular
1676             traditional method for dimensionality reduction is the Principal Components Analysis
1677             (PCA) algorithm. However, using PCA is tantamount to passing a linear least-squares
1678             hyperplane through the surface on which the data actually resides. As to why that
1679             might be a bad thing to do, just imagine the consequences of assuming that your data
1680             falls on a straight line when, in reality, it falls on a strongly curving arc. This
1681             is exactly what happens with PCA --- it gives you a linear manifold approximation to
1682             your data that may actually reside on a curved surface.
1683              
1684             That brings us to the purpose of this module, which is to cluster data that resides
1685             on a nonlinear manifold. Since a nonlinear manifold is locally linear, we can think
1686             of each data cluster on a nonlinear manifold as falling on a locally linear portion
1687             of the manifold, meaning on a hyperplane. The logic of the module is based on
1688             finding a set of hyperplanes that best describes the data, with each hyperplane
1689             derived from a local data cluster. This is like constructing a piecewise linear
1690             approximation to data that falls on a curve as opposed to constructing a single
1691             straight line approximation to all of the data. So whereas the frequently used PCA
1692             algorithm gives you a single hyperplane approximation to all your data, what this
1693             module returns is a set of hyperplane approximations, with each hyperplane derived by
1694             applying the PCA algorithm locally to a data cluster.
1695              
1696             That brings us to the problem of how to actually discover the best set of hyperplane
1697             approximations to the data. What is probably the most popular algorithm today for
1698             that purpose is based on the following key idea: Given a set of subspaces to which a
1699             data element can be assigned, you assign it to that subspace for which the
1700             B is the least. But what do we mean by a B and what
B<reconstruction error> is the least. But what do we mean by a B<subspace> and what
is B<reconstruction error>?
1703             To understand the notions of B and B, let's revisit
To understand the notions of B<subspace> and B<reconstruction error>, let's revisit
1705             algorithm consists of: (1) Subtracting from each data element the global mean of the
1706             data; (2) Calculating the covariance matrix of the data; (3) Carrying out an
1707             eigendecomposition of the covariance matrix and ordering the eigenvectors according
1708             to decreasing values of the corresponding eigenvalues; (4) Forming a B by
to decreasing values of the corresponding eigenvalues; (4) Forming a B<subspace> by
1710             small; and, finally, (5) projecting all the data elements into the subspace so
1711             formed. The error incurred in representing a data element by its projection into the
1712             subspace is known as the B. This error is the projection of
subspace is known as the B<reconstruction error>. This error is the projection of
1714              
I<In this module, instead of constructing a single global
subspace in the manner described above, we construct a set of subspaces, one for each
data cluster on the nonlinear manifold. After the subspaces have been constructed, a
data element is assigned to that subspace for which the reconstruction error is the
least.> On the face of it, this sounds like a chicken-and-egg sort of a problem. You
1720             need to have already clustered the data in order to construct the subspaces at
1721             different places on the manifold so that you can figure out which cluster to place a
1722             data element in.
1723              
1724             Such problems, when they do possess a solution, are best tackled through iterative
1725             algorithms in which you start with a guess for the final solution, you rearrange the
1726             measured data on the basis of the guess, and you then use the new arrangement of the
1727             data to refine the guess. Subsequently, you iterate through the second and the third
1728             steps until you do not see any discernible changes in the new arrangements of the
1729             data. This forms the basis of the clustering algorithm that is described under
1730             B in the section that follows. This algorithm was first proposed in the
1731             article "Dimension Reduction by Local Principal Component Analysis" by Kambhatla and
1732             Leen that appeared in the journal Neural Computation in 1997.
1733              
1734             Unfortunately, experiments show that the algorithm as proposed by Kambhatla and Leen
1735             is much too sensitive to how the clusters are seeded initially. To get around this
1736             limitation of the basic clustering-by-minimization-of-reconstruction-error, this
1737             module implements a two phased approach. In B, we introduce a multiplier
1738             effect in our search for clusters by looking for C clusters instead of the main
1739             C clusters. In this manner, we increase the odds that each original cluster will
1740             be visited by one or more of the C randomly selected seeds at the beginning,
1741             where C is the integer value given to the constructor parameter
1742             C. Subsequently, we merge the clusters that belong
1743             together in order to form the final C clusters. That work is done in B
1744             of the algorithm.
1745              
1746             For the cluster merging operation in Phase 2, we model the C clusters as the
1747             nodes of an attributed graph in which the weight given to an edge connecting a pair
1748             of nodes is a measure of the similarity between the two clusters corresponding to the
1749             two nodes. Subsequently, we use spectral clustering to merge the most similar nodes
1750             in our quest to partition the data into C clusters. For that purpose, we use the
1751             Shi-Malik normalized cuts algorithm. The pairwise node similarity required by this
1752             algorithm is measured by the C method of the
1753             C class. The smaller the overall reconstruction error
1754             when all of the data elements in one cluster are projected into the other's subspace
1755             and vice versa, the greater the similarity between two clusters. Additionally, the
1756             smaller the distance between the mean vectors of the clusters, the greater the
1757             similarity between two clusters. The overall similarity between a pair of clusters
1758             is a combination of these two similarity measures.
1759              
1760             =head1 SUMMARY OF THE ALGORITHM
1761              
1762             We now present a summary of the two phases of the algorithm implemented in this
1763             module. Note particularly the important role played by the constructor parameter
1764             C. It is only when the integer value given to this
1765             parameter is greater than 1 that Phase 2 of the algorithm kicks in.
1766              
1767             =over 4
1768              
1769             =item B
1770              
1771             Through iterative minimization of the total reconstruction error, this phase of the
1772             algorithm returns C clusters where C is the actual number of clusters you
1773             expect to find in your data and where C is the integer value given to the
1774             constructor parameter C. As previously mentioned, on
1775             account of the sensitivity of the reconstruction-error based clustering to how the
1776             clusters are initially seeded, our goal is to look for C clusters with the idea
1777             of increasing the odds that each of the C clusters will see at least one seed at
1778             the beginning of the algorithm.
1779              
1780             =over 4
1781              
1782             =item Step 1:
1783              
1784             Randomly choose C data elements to serve as the seeds for that many clusters.
1785              
1786             =item Step 2:
1787              
1788             Construct initial C clusters by assigning each data element to that cluster
1789             whose seed it is closest to.
1790              
1791             =item Step 3:
1792              
1793             Calculate the mean and the covariance matrix for each of the C clusters and
1794             carry out an eigendecomposition of the covariance matrix. Order the eigenvectors in
1795             decreasing order of the corresponding eigenvalues. The first C<P> eigenvectors
1796             define the subspace for that cluster. Use the space spanned by the remaining
1797             eigenvectors --- we refer to them as the trailing eigenvectors --- for calculating
1798             the reconstruction error.
1799              
1800             =item Step 4
1801              
1802             Taking into account the mean associated with each cluster, re-cluster the entire data
1803             set on the basis of the least reconstruction error. That is, assign each data
1804             element to that subspace for which it has the smallest reconstruction error.
1805             Calculate the total reconstruction error associated with all the data elements. (See
1806             the definition of the reconstruction error in the C section.)
1807              
1808             =item Step 5
1809              
1810             Stop iterating if the change in the total reconstruction error from the previous
1811             iteration to the current iteration is less than the value specified by the constructor
1812             parameter C. Otherwise, go back to Step 3.
1813              
1814             =back
1815              
1816             =item B
1817              
1818             This phase of the algorithm uses graph partitioning to merge the C clusters back
1819             into the C clusters you expect to see in your data. Since the algorithm whose
1820             steps are presented below is invoked recursively, let's assume that we have C
1821             clusters that need to be merged by invoking the Shi-Malik spectral clustering
1822             algorithm as described below:
1823              
1824             =over 4
1825              
1826             =item Step 1
1827              
1828             Form a graph whose C nodes represent the C clusters. (For the very first
1829             invocation of this step, we have C.)
1830              
1831             =item Step 2
1832              
1833             Construct an C similarity matrix for the nodes in the graph. The C<(i,j)>-th
1834             element of this matrix is the pairwise similarity between the clusters indexed C
1835             and C. Calculate this similarity on the basis of the following two criteria: (1)
1836             The total reconstruction error when the data elements in the cluster indexed C are
1837             projected into the subspace for the cluster indexed C and vice versa. And (2) The
1838             distance between the mean vectors associated with the two clusters.
1839              
1840             =item Step 3
1841              
1842             Calculate the symmetric normalized Laplacian of the similarity matrix. We use C
1843             to denote the symmetric normalized Laplacian.
1844              
1845             =item Step 4
1846              
1847             Carry out an eigendecomposition of the C matrix and choose the eigenvector
1848             corresponding to the second smallest eigenvalue for bipartitioning the graph on the
1849             basis of the sign of the values in the eigenvector.
1850              
1851             =item Step 5
1852              
1853             If the bipartition of the previous step yields one-versus-the-rest kind of a
1854             partition, add the `one' cluster to the output list of clusters and invoke graph
1855             partitioning recursively on the `rest' by going back to Step 1. On the other hand,
1856             if the cardinality of both the partitions returned by Step 4 exceeds 1, invoke graph
1857             partitioning recursively on both partitions. Stop when the list of clusters in the
1858             output list equals C.
1859              
1860             =item Step 6
1861              
1862             In general, the C clusters obtained by recursive graph partitioning will not cover
1863             all of the data. So, for the final step, assign each data element not covered by the
1864             C clusters to that cluster for which its reconstruction error is the least.
1865              
1866             =back
1867              
1868             =back
1869              
1870             =head1 FAIL-FIRST BIAS OF THE MODULE
1871              
1872             As you would expect from all such iterative algorithms, the module carries no
1873             theoretical guarantee that it will give you correct results. But what does that mean?
1874             Suppose you create synthetic data that consists of Gaussian looking disjoint clusters
1875             on the surface of a sphere, would the module always succeed in separating out the
1876             clusters? The module carries no guarantees to that effect --- especially considering
1877             that Phase 1 of the algorithm is sensitive to how the clusters are seeded at the
1878             beginning. Although this sensitivity is mitigated by the cluster merging step when
1879             greater-than-1 value is given to the constructor option C,
1880             a plain vanilla implementation of the steps in Phase 1 and Phase 2 would nonetheless
1881             carry significant risk that you'll end up with incorrect clustering results.
1882              
1883             To further reduce this risk, the module has been programmed so that it terminates
1884             immediately if it suspects that the cluster solution being developed is unlikely to
1885             be fruitful. The heuristics used for such termination are conservative --- since the
1886             cost of termination is only that the user will have to run the code again, which at
1887             worst only carries an element of annoyance with it. The three "Fail First"
1888             heuristics currently programmed into the module are based on simple "unimodality
1889             testing", testing for "congruent clusters," and testing for dominant cluster support
1890             in the final stage of the recursive invocation of the graph partitioning step. The
1891             unimodality testing is as elementary as it can be --- it only checks for the number
1892             of data samples within a certain radius of the mean in relation to the total number
1893             of data samples in the cluster.
1894              
1895             When the program terminates under such conditions, it prints out the following message
1896             in your terminal window:
1897              
1898             Bailing out!
1899              
1900             Given the very simple nature of testing that is carried for the "Fail First" bias, do
1901             not be surprised if the results you get for your data simply look wrong. If most
1902             runs of the module produce wrong results for your application, that means that the
1903             module logic needs to be strengthened further. The author of this module would love
1904             to hear from you if that is the case.
1905              
1906             =head1 METHODS
1907              
1908             The module provides the following methods for linear-manifold based clustering, for
1909             cluster visualization, and for the generation of data for testing the clustering algorithm:
1910              
1911             =over 4
1912              
1913             =item B
1914              
1915             my $clusterer = Algorithm::LinearManifoldDataClusterer->new(
1916             datafile => $datafile,
1917             mask => $mask,
1918             K => $K,
1919             P => $P,
1920             cluster_search_multiplier => $C,
1921             max_iterations => $max_iter,
1922             delta_reconstruction_error => 0.001,
1923             terminal_output => 1,
1924             write_clusters_to_files => 1,
1925             visualize_each_iteration => 1,
1926             show_hidden_in_3D_plots => 1,
1927             make_png_for_each_iteration => 1,
1928             );
1929              
1930             A call to C constructs a new instance of the
1931             C class.
1932              
1933             =back
1934              
1935             =head2 Constructor Parameters
1936              
1937             =over 8
1938              
1939             =item C:
1940              
1941             This parameter names the data file that contains the multidimensional data records
1942             you want the module to cluster. This file must be in CSV format and each record in
1943             the file must include a symbolic tag for the record. Here are first few rows of such
1944             a CSV file in the C directory:
1945              
1946             d_161,0.0739248630173239,0.231119293395665,-0.970112873251437
1947             a_59,0.459932215884786,0.0110216469739639,0.887885623314902
1948             a_225,0.440503220903039,-0.00543366086464691,0.897734586447273
1949             a_203,0.441656364946433,0.0437191337788422,0.896118459046532
1950             ...
1951             ...
1952              
1953             What you see in the first column --- C, C, C, C --- are
1954             the symbolic tags associated with four 3-dimensional data records.
1955              
1956             =item C:
1957              
1958             This parameter supplies the mask to be applied to the columns of your data file. For
1959             the data file whose first few records are shown above, the mask should be C
1960             since the symbolic tag is in the first column of the CSV file and since, presumably,
1961             you want to cluster the data in the next three columns.
1962              
1963             =item C:
1964              
1965             This parameter supplies the number of clusters you are looking for.
1966              
1967             =item C<P>:
1968              
1969             This parameter specifies the dimensionality of the manifold on which the data resides.
1970              
1971             =item C:
1972              
1973             As should be clear from the C section, this parameter plays
1974             a very important role in the successful clustering of your data. As explained in
1975             C, the basic algorithm used for clustering in Phase 1 --- clustering by
1976             the minimization of the reconstruction error --- is sensitive to the choice of the
1977             cluster seeds that are selected randomly at the beginning of the algorithm. Should
1978             it happen that the seeds miss one or more of the clusters, the clustering produced is
1979             likely to not be correct. By giving an integer value to C
1980             that is greater than 1, you'll increase the odds that the randomly selected seeds
1981             will see all clusters. When you set C to C, you ask
1982             Phase 1 of the algorithm to construct C clusters as opposed to just C
1983             clusters. Subsequently, in Phase 2, the module uses inter-cluster similarity based
1984             graph partitioning to merge the C clusters into C clusters.
1985              
1986             =item C:
1987              
1988             This hard limits the number of iterations in Phase 1 of the algorithm. Ordinarily,
1989             the iterations stop automatically when the change in the total reconstruction error
1990             from one iteration to the next is less than the value specified by the parameter
1991             C.
1992              
1993             =item C:
1994              
1995             It is this parameter that determines when the iterations will actually terminate in
1996             Phase 1 of the algorithm. When the difference in the total reconstruction error from
1997             one iteration to the next is less than the value given to this parameter, the
1998             iterations come to an end. B
1999             data samples that need to be clustered, the larger must be the value given to this
2000             parameter. That makes intuitive sense since the total reconstruction error is the
2001             sum of all such errors for all of the data elements.> Unfortunately, the best value
2002             for this parameter does NOT appear to depend linearly on the total number of data
2003             records to be clustered.
2004              
2005             =item C:
2006              
2007             When this parameter is set, you will see in your terminal window the different
2008             clusters as lists of the symbolic tags associated with the data records. You will
2009             also see in your terminal window the output produced by the different steps of the
2010             graph partitioning algorithm as smaller clusters are merged to form the final C
2011             clusters --- assuming that you set the parameter C to an
2012             integer value that is greater than 1.
2013              
2014             =item C:
2015              
2016             As its name implies, when this option is set to 1, you'll see 3D plots of the
2017             clustering results for each iteration --- but only if your data is 3-dimensional.
2018              
2019             =item C:
2020              
2021             This parameter is important for controlling the visualization of the clusters on the
2022             surface of a sphere. If the clusters are too spread out, seeing all of the clusters
2023             all at once can be visually confusing. When you set this parameter, the clusters on
2024             the back side of the sphere will not be visible. Note that no matter how you set
2025             this parameter, you can interact with the 3D plot of the data and rotate it with your
2026             mouse pointer to see all of the data that is output by the clustering code.
2027              
2028             =item C:
2029              
2030             If you set this option to 1, the module will output a Gnuplot in the form of a PNG
2031             image for each iteration in Phase 1 of the algorithm. In Phase 2, the module will
2032             output the clustering result produced by the graph partitioning algorithm.
2033              
2034             =back
2035              
2036             =over
2037              
2038             =item B:
2039              
2040             $clusterer->get_data_from_csv();
2041              
2042             As you can guess from its name, the method extracts the data from the CSV file named
2043             in the constructor.
2044              
2045             =item B:
2046              
2047             $clusterer->linear_manifold_clusterer();
2048              
2049             or
2050              
2051             my $clusters = $clusterer->linear_manifold_clusterer();
2052              
2053             This is the main call to the linear-manifold based clusterer. The first call works
2054             by side-effect, meaning that you will see the clusters in your terminal window and
2055             they would be written out to disk files (depending on the constructor options you
2056             have set). The second call also returns the clusters as a reference to an array of
2057             anonymous arrays, each holding the symbolic tags for a cluster.
2058              
2059             =item B:
2060              
2061             $clusterer->display_reconstruction_errors_as_a_function_of_iterations();
2062              
2063             This method would normally be called after the clustering is completed to see how the
2064             reconstruction errors decreased with the iterations in Phase 1 of the overall
2065             algorithm.
2066              
2067             =item B:
2068              
2069             $clusterer->write_clusters_to_files($clusters);
2070              
2071             As its name implies, when you call this method, the final clusters would be written
2072             out to disk files. The files have names like:
2073              
2074             cluster0.txt
2075             cluster1.txt
2076             cluster2.txt
2077             ...
2078             ...
2079              
2080             Before the clusters are written to these files, the module destroys all files with
2081             such names in the directory in which you call the module.
2082              
2083             =item B:
2084              
2085             $clusterer->visualize_clusters_on_sphere("final clustering", $clusters);
2086              
2087             or
2088              
2089             $clusterer->visualize_clusters_on_sphere("final_clustering", $clusters, "png");
2090              
2091             If your data is 3-dimensional and it resides on the surface of a sphere (or in the
2092             vicinity of such a surface), you may be able to use these methods for the
2093             visualization of the clusters produced by the algorithm. The first invocation
2094             produces a Gnuplot in a terminal window that you can rotate with your mouse pointer.
2095             The second invocation produces a `.png' image of the plot.
2096              
2097             =item B:
2098              
2099             $clusterer->auto_retry_clusterer();
2100              
2101             or
2102              
2103             my $clusters = $clusterer->auto_retry_clusterer();
2104              
2105             As mentioned earlier, the module is programmed in such a way that it is more likely
2106             to fail than to give you a wrong answer. If manually trying the clusterer repeatedly
2107             on a data file is frustrating, you can use C to automatically
2108             make repeated attempts for you. See the script C for how you can use
2109             C in your own code.
2110              
2111             =back
2112              
2113             =head1 GENERATING SYNTHETIC DATA FOR EXPERIMENTING WITH THE CLUSTERER
2114              
2115             The module file also contains a class named C for generating synthetic
2116             data for experimenting with linear-manifold based clustering. At this time, only
2117             3-dimensional data that resides in the form of Gaussian clusters on the surface of a
2118             sphere is generated. The generated data is placed in a CSV file. You construct an
2119             instance of the C class by a call like:
2120              
2121             =over 4
2122              
2123             =item B
2124              
2125             my $training_data_gen = DataGenerator->new(
2126             output_file => $output_file,
2127             cluster_width => 0.0005,
2128             total_number_of_samples_needed => 1000,
2129             number_of_clusters_on_sphere => 4,
2130             show_hidden_in_3D_plots => 0,
2131             );
2132              
2133             =back
2134              
2135             =head2 Parameters for the DataGenerator constructor:
2136              
2137             =over 8
2138              
2139             =item C:
2140              
2141             The numeric values are generated using a bivariate Gaussian distribution whose two
2142             independent variables are the azimuth and the elevation angles on the surface of a
2143             unit sphere. The mean of each cluster is chosen randomly and its covariance set in
2144             proportion to the value supplied for the C<cluster_width> parameter.
2145              
2146             =item C:
2147              
2148             This parameter controls the spread of each cluster on the surface of the unit sphere.
2149              
2150             =item C:
2151              
2152             As its name implies, this parameter specifies the total number of data samples that
2153             will be written out to the output file --- provided this number is divisible by the
2154             number of clusters you asked for. If the divisibility condition is not satisfied,
2155             the number of data samples actually written out will be the closest it can be to the
2156             number you specify for this parameter under the condition that equal number of
2157             samples will be created for each cluster.
2158              
2159             =item C:
2160              
2161             Again as its name implies, this parameter specifies the number of clusters that will
2162             be produced on the surface of a unit sphere.
2163              
2164             =item C:
2165              
2166             This parameter is important for the visualization of the clusters and it controls
2167             whether you will see the generated data on the back side of the sphere. If the
2168             clusters are not too spread out, you can set this parameter to 0 and see all the
2169             clusters all at once. However, when the clusters are spread out, it can be visually
2170             confusing to see the data on the back side of the sphere. Note that no matter how
2171             you set this parameter, you can interact with the 3D plot of the data and rotate it
2172             with your mouse pointer to see all of the data that is generated.
2173              
2174             =back
2175              
2176             =over 4
2177              
2178             =item B:
2179              
2180             $training_data_gen->gen_data_and_write_to_csv();
2181              
2182             As its name implies, this method generates the data according to the parameters
2183             specified in the DataGenerator constructor.
2184              
2185             =item B:
2186              
2187             $training_data_gen->visualize_data_on_sphere($output_file);
2188              
2189             You can use this method to visualize the clusters produced by the data generator.
2190             Since the clusters are located at randomly selected points on a unit sphere, by
2191             looking at the output visually, you can quickly reject what the data generator has
2192             produced and try again.
2193              
2194             =back
2195              
2196             =head1 HOW THE CLUSTERS ARE OUTPUT
2197              
2198             When the option C is set in the constructor of the
2199             C class, the clusters are displayed on the terminal
2200             screen.
2201              
2202             And, when the option C is set in the same constructor, the
2203             module dumps the clusters in files named
2204              
2205             cluster0.txt
2206             cluster1.txt
2207             cluster2.txt
2208             ...
2209             ...
2210              
2211             in the directory in which you execute the module. The number of such files will
2212             equal the number of clusters formed. All such existing files in the directory are
2213             destroyed before any fresh ones are created. Each cluster file contains the symbolic
2214             tags of the data samples in that cluster.
2215              
2216             Assuming that the data dimensionality is 3, if you have set the constructor parameter
2217             C, the module will deposit in your directory printable PNG
2218             files that are point plots of the different clusters in the different iterations of
2219             the algorithm. Such printable files are also generated for the initial clusters at
2220             the beginning of the iterations and for the final clusters in Phase 1 of the
2221             algorithm. You will also see in your directory a PNG file for the clustering result
2222             produced by graph partitioning in Phase 2.
2223              
2224             Also, as mentioned previously, a call to C in your own
2225             code will return the clusters to you directly.
2226              
2227             =head1 REQUIRED
2228              
2229             This module requires the following modules:
2230              
2231             List::Util
2232             File::Basename
2233             Math::Random
2234             Graphics::GnuplotIF
2235             Math::GSL::Matrix
2236             POSIX
2237              
2238             =head1 THE C DIRECTORY
2239              
2240             The C directory contains the following four scripts that you might want to
2241             play with in order to become familiar with the module:
2242              
2243             example1.pl
2244              
2245             example2.pl
2246              
2247             example3.pl
2248              
2249             example4.pl
2250              
2251             These scripts demonstrate linear-manifold based clustering on the 3-dimensional data
2252             in the following three CSV files:
2253              
2254             3_clusters_on_a_sphere_498_samples.csv (used in example1.pl and example4.pl)
2255              
2256             3_clusters_on_a_sphere_3000_samples.csv (used in example2.pl)
2257              
2258             4_clusters_on_a_sphere_1000_samples.csv (used in example3.pl)
2259              
2260             Note that even though the first two of these files both contain exactly three
2261             clusters, the clusters look very different in the two data files. The clusters are
2262             much more spread out in C<3_clusters_on_a_sphere_3000_samples.csv>.
2263              
2264             The code in C is special because it shows how you can call the
2265             C method of the module for automatic repeated invocations of
2266             the clustering program until success is achieved. The value of the constructor
2267             parameter C is set to 1 in C, implying that
2268             when you execute C you will not be invoking Phase 2 of the algorithm.
2269             You might wish to change the value given to the parameter
2270             C to see how it affects the number of attempts needed to
2271             achieve success.
2272              
2273             The C directory also includes PNG image files that show some intermediate
2274             and the best final results that can be achieved by the first three examples, that
2275             is, for the scripts C<example1.pl>, C<example2.pl>, and C<example3.pl>. If you are on
2276             a Linux machine and if you have the C library installed, you can use the
2277             C command to see the results in the PNG images. The results you get with
2278             your own runs of the three example scripts may or may not look like what you see in
2279             the outputs shown in the PNG files depending on how the module seeds the clusters.
2280             Your best results should look like what you see in the PNG images.
2281              
2282             The C<examples> directory also contains the following utility scripts:
2283              
2284             For generating the data for experiments with clustering:
2285              
2286             generate_data_on_a_sphere.pl
2287              
2288             For visualizing the data generated by the above script:
2289              
2290             data_visualizer.pl
2291              
2292             For cleaning up the examples directory:
2293              
2294             cleanup_directory.pl
2295              
2296             Invoking the C<cleanup_directory.pl> script will get rid of all the PNG image files
2297             that are generated by the module when you run it with the constructor option
2298             C<make_png_for_each_iteration> set to 1.
2299              
2300             =head1 EXPORT
2301              
2302             None by design.
2303              
2304             =head1 CAVEATS
2305              
2306             The performance of the algorithm depends much on the values you choose for the
2307             constructor parameters. And, even for the best choices for the parameters, the
2308             algorithm is not theoretically guaranteed to return the best results.
2309              
2310             Even after you have discovered the best choices for the constructor parameters, the
2311             best way to use this module is to try it multiple times on any given data and accept
2312             only those results that make the best intuitive sense.
2313              
2314             The module was designed with the hope that it would rather fail than give you wrong
2315             results. So if you see it failing a few times before it returns a good answer, that's
2316             a good thing.
2317              
2318             However, if the module fails too often or is too quick to give you wrong answers,
2319             that means the module is not working on your data. If that happens, I'd love to hear
2320             from you. That might indicate to me how I should enhance the power of this module
2321             for its future versions.
2322              
2323             =head1 BUGS
2324              
2325             Please notify the author if you encounter any bugs. When sending email, please place
2326             the string 'LinearManifoldDataClusterer' in the subject line.
2327              
2328             =head1 INSTALLATION
2329              
2330             Download the archive from CPAN in any directory of your choice. Unpack the archive
2331             with a command that on a Linux machine would look like:
2332              
2333             tar zxvf Algorithm-LinearManifoldDataClusterer-1.0.tar.gz
2334              
2335             This will create an installation directory for you whose name will be
2336             C<Algorithm-LinearManifoldDataClusterer-1.0>. Enter this directory and execute the following commands
2337             for a standard install of the module if you have root privileges:
2338              
2339             perl Makefile.PL
2340             make
2341             make test
2342             sudo make install
2343              
2344             If you do not have root privileges, you can carry out a non-standard install of the
2345             module in any directory of your choice by:
2346              
2347             perl Makefile.PL prefix=/some/other/directory/
2348             make
2349             make test
2350             make install
2351              
2352             With a non-standard install, you may also have to set your PERL5LIB environment
2353             variable so that this module can find the required other modules. How you do that
2354             would depend on what platform you are working on. In order to install this module in
2355             a Linux machine on which I use tcsh for the shell, I set the PERL5LIB environment
2356             variable by
2357              
2358             setenv PERL5LIB /some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
2359              
2360             If I used bash, I'd need to declare:
2361              
2362             export PERL5LIB=/some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
2363              
2364              
2365             =head1 THANKS
2366              
2367             I have learned much from my conversations with Donghun Kim whose research on face
2368             recognition in the wild involves clustering image data on manifolds. I have also had
2369             several fruitful conversations with Bharath Kumar Comandur and Tanmay Prakash with
2370             regard to the ideas that are incorporated in this module.
2371              
2372             =head1 AUTHOR
2373              
2374             Avinash Kak, kak@purdue.edu
2375              
2376             If you send email, please place the string "LinearManifoldDataClusterer" in your subject line to get past
2377             my spam filter.
2378              
2379             =head1 COPYRIGHT
2380              
2381             This library is free software; you can redistribute it and/or modify it under the
2382             same terms as Perl itself.
2383              
2384             Copyright 2015 Avinash Kak
2385              
2386             =cut
2387