File Coverage

blib/lib/Algorithm/LinearManifoldDataClusterer.pm
Criterion Covered Total %
statement 25 27 92.5
branch n/a
condition n/a
subroutine 9 9 100.0
pod n/a
total 34 36 94.4


line stmt bran cond sub pod time code
1             package Algorithm::LinearManifoldDataClusterer;
2              
3             #------------------------------------------------------------------------------------
4             # Copyright (c) 2015 Avinash Kak. All rights reserved. This program is free
5             # software. You may modify and/or distribute it under the same terms as Perl itself.
6             # This copyright notice must remain attached to the file.
7             #
8             # Algorithm::LinearManifoldDataClusterer is a Perl module for clustering data that
9             # resides on a low-dimensional manifold in a high-dimensional measurement space.
10             # -----------------------------------------------------------------------------------
11              
12 1     1   13073 use 5.10.0;
  1         3  
  1         45  
13 1     1   5 use strict;
  1         1  
  1         28  
14 1     1   3 use warnings;
  1         4  
  1         30  
15 1     1   4 use Carp;
  1         0  
  1         71  
16 1     1   4 use List::Util qw(reduce any);
  1         1  
  1         83  
17 1     1   4 use File::Basename;
  1         2  
  1         66  
18 1     1   594 use Math::Random;
  1         5336  
  1         86  
19 1     1   584 use Graphics::GnuplotIF;
  1         9525  
  1         57  
20 1     1   1048 use Math::GSL::Matrix;
  0            
  0            
21             use POSIX ();
22              
23             our $VERSION = '1.01';
24              
25             # Constructor:
26             sub new {
27             my ($class, %args) = @_;
28             my @params = keys %args;
29             croak "\nYou have used a wrong name for a keyword argument " .
30             "--- perhaps a misspelling\n"
31             if check_for_illegal_params(@params) == 0;
32             bless {
33             _datafile => $args{datafile} || croak("datafile required"),
34             _mask => $args{mask} || croak("mask required"),
35             _K => $args{K} || 0,
36             _P => $args{P} || 0,
37             _terminal_output => $args{terminal_output} || 0,
38             _max_iterations => $args{max_iterations} || 0,
39             _delta_reconstruction_error => $args{delta_reconstruction_error} || 0.001,
40             _delta_normalized_error => undef,
41             _cluster_search_multiplier => $args{cluster_search_multiplier} || 1,
42             _visualize_each_iteration => $args{visualize_each_iteration} == 0 ? 0 : 1,
43             _show_hidden_in_3D_plots => $args{show_hidden_in_3D_plots} == 0 ? 0 : 1,
44             _make_png_for_each_iteration => $args{make_png_for_each_iteration} == 0 ? 0 : 1,
45             _debug => $args{debug} || 0,
46             _N => 0,
47             _KM => $args{K} * $args{cluster_search_multiplier},
48             _data_hash => {},
49             _data_tags => [],
50             _data_dimensions => 0,
51             _final_clusters => [],
52             _auto_retry_flag => 0,
53             _num_iterations_actually_used => undef,
54             _scale_factor => undef,
55             _data_tags_to_cluster_label_hash => {},
56             _final_reference_vecs_for_all_subspaces => [],
57             _reconstruction_error_as_a_function_of_iteration => [],
58             _final_trailing_eigenvec_matrices_for_all_subspaces => [],
59             _subspace_construction_error_as_a_function_of_iteration => [],
60             }, $class;
61             }
62              
63             sub get_data_from_csv {
64             my $self = shift;
65             my $filename = $self->{_datafile} || die "you did not specify a file with the data to be clustered";
66             my $mask = $self->{_mask};
67             my @mask = split //, $mask;
68             $self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
69             print "data dimensionality: $self->{_data_dimensions} \n" if $self->{_terminal_output};
70             open FILEIN, $filename or die "Unable to open $filename: $!";
71             die("Aborted. get_training_data_csv() is only for CSV files") unless $filename =~ /\.csv$/;
72             local $/ = undef;
73             my @all_data = split /\s+/, <FILEIN>;
74             my %data_hash = ();
75             my @data_tags = ();
76             foreach my $record (@all_data) {
77             my @splits = split /,/, $record;
78             my $record_name = shift @splits;
79             $data_hash{$record_name} = \@splits;
80             push @data_tags, $record_name;
81             }
82             $self->{_data_hash} = \%data_hash;
83             $self->{_data_tags} = \@data_tags;
84             $self->{_N} = scalar @data_tags;
85             }
86              
87             sub estimate_mean_and_covariance {
88             my $self = shift;
89             my $tag_set = shift;
90             my $cluster_size = @$tag_set;
91             my @cluster_center = @{$self->add_point_coords($tag_set)};
92             @cluster_center = map {my $x = $_/$cluster_size; $x} @cluster_center;
93             # for covariance calculation:
94             my ($num_rows,$num_cols) = ($self->{_data_dimensions}, scalar(@$tag_set));
95             print "\nThe data will be stuffed into a matrix of $num_rows rows and $num_cols columns\n\n"
96             if $self->{_debug};
97             my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
98             my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
99             # All the record labels are stored in the array $self->{_data_tags}. The actual
100             # data for clustering is stored in a hash at $self->{_data_hash} whose keys are
101             # the record labels; the value associated with each key is the array holding the
102             # corresponding numerical multidimensional data.
103             $mean_vec->set_col(0, \@cluster_center);
104             if ($self->{_debug}) {
105             print "\nDisplaying the mean vector for the cluster:\n";
106             display_matrix( $mean_vec ) if $self->{_terminal_output};
107             }
108             foreach my $j (0..$num_cols-1) {
109             my $tag = $tag_set->[$j];
110             my $data = $self->{_data_hash}->{$tag};
111             my @diff_from_mean = vector_subtract($data, \@cluster_center);
112             $matrix->set_col($j, \@diff_from_mean);
113             }
114             my $transposed = transpose( $matrix );
115             my $covariance = $matrix * $transposed;
116             $covariance *= 1.0 / $num_cols;
117             if ($self->{_debug}) {
118             print "\nDisplaying the Covariance Matrix for cluster:";
119             display_matrix( $covariance ) if $self->{_terminal_output};
120             }
121             return ($mean_vec, $covariance);
122             }
123              
124             sub eigen_analysis_of_covariance {
125             my $self = shift;
126             my $covariance = shift;
127             my ($eigenvalues, $eigenvectors) = $covariance->eigenpair;
128             my $num_of_eigens = @$eigenvalues;
129             my $largest_eigen_index = 0;
130             my $smallest_eigen_index = 0;
131             print "Eigenvalue 0: $eigenvalues->[0]\n" if $self->{_debug};
132             foreach my $i (1..$num_of_eigens-1) {
133             $largest_eigen_index = $i if $eigenvalues->[$i] > $eigenvalues->[$largest_eigen_index];
134             $smallest_eigen_index = $i if $eigenvalues->[$i] < $eigenvalues->[$smallest_eigen_index];
135             print "Eigenvalue $i: $eigenvalues->[$i]\n" if $self->{_debug};
136             }
137             print "\nlargest eigen index: $largest_eigen_index\n" if $self->{_debug};
138             print "\nsmallest eigen index: $smallest_eigen_index\n\n" if $self->{_debug};
139             my @all_my_eigenvecs;
140             foreach my $i (0..$num_of_eigens-1) {
141             my @vec = $eigenvectors->[$i]->as_list;
142             my @eigenvec;
143             foreach my $ele (@vec) {
144             my ($mag, $theta) = $ele =~ /\[(\d*\.?\d*e?[+-]?\d*),(\S+)\]/;
145             if ($theta eq "0") {
146             push @eigenvec, $mag;
147             } elsif ($theta eq "pi") {
148             push @eigenvec, -1.0 * $mag;
149             } else {
150             die "Eigendecomposition produced a complex eigenvector -- " .
151             "which should not happen for a covariance matrix!";
152             }
153             }
154             print "Eigenvector $i: @eigenvec\n" if $self->{_debug};
155             push @all_my_eigenvecs, \@eigenvec;
156             }
157             my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list;
158             print "\nLargest eigenvector: @largest_eigen_vec\n" if $self->{_debug};
159             my @sorted_eigenvec_indexes = sort {$eigenvalues->[$b] <=> $eigenvalues->[$a]} 0..@all_my_eigenvecs-1;
160             my @sorted_eigenvecs;
161             my @sorted_eigenvals;
162             foreach my $i (0..@sorted_eigenvec_indexes-1) {
163             $sorted_eigenvecs[$i] = $all_my_eigenvecs[$sorted_eigenvec_indexes[$i]];
164             $sorted_eigenvals[$i] = $eigenvalues->[$sorted_eigenvec_indexes[$i]];
165             }
166             if ($self->{_debug}) {
167             print "\nHere come sorted eigenvectors --- from the largest to the smallest:\n";
168             foreach my $i (0..@sorted_eigenvecs-1) {
169             print "eigenvec: @{$sorted_eigenvecs[$i]} eigenvalue: $sorted_eigenvals[$i]\n";
170             }
171             }
172             return (\@sorted_eigenvecs, \@sorted_eigenvals);
173             }
174              
175             sub auto_retry_clusterer {
176             my $self = shift;
177             $self->{_auto_retry_flag} = 1;
178             my $clusters;
179             $@ = 1;
180             my $retry_attempts = 1;
181             while ($@) {
182             eval {
183             $clusters = $self->linear_manifold_clusterer();
184             };
185             if ($@) {
186             if ($self->{_terminal_output}) {
187             print "Clustering failed. Trying again. --- $@";
188             print "\n\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n";
189             print "VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV\n\n";
190             }
191             $retry_attempts++;
192             } else {
193             print "\n\nNumber of retry attempts: $retry_attempts\n\n" if $self->{_terminal_output};
194             return $clusters;
195             }
196             }
197             }
198              
199             sub linear_manifold_clusterer {
200             my $self = shift;
201             my $KM = $self->{_KM};
202             my @initial_cluster_center_tags;
203             my $visualization_msg;
204             my @initial_cluster_center_indexes = $self->initialize_cluster_centers($KM, $self->{_N});
205             print "Randomly selected indexes for cluster center tags: @initial_cluster_center_indexes\n"
206             if $self->{_debug};
207             @initial_cluster_center_tags = map {$self->{_data_tags}->[$_]} @initial_cluster_center_indexes;
208             my @initial_cluster_center_coords = map {$self->{_data_hash}->{$_}} @initial_cluster_center_tags;
209             if ($self->{_debug}) {
210             foreach my $centroid (@initial_cluster_center_coords) {
211             print "Initial cluster center coords: @{$centroid}\n";
212             }
213             }
214             my $initial_clusters = $self->assign_data_to_clusters_initial(\@initial_cluster_center_coords);
215             if ($self->{_data_dimensions} == 3) {
216             $visualization_msg = "initial_clusters";
217             $self->visualize_clusters_on_sphere($visualization_msg, $initial_clusters)
218             if $self->{_visualize_each_iteration};
219             $self->visualize_clusters_on_sphere($visualization_msg, $initial_clusters, "png")
220             if $self->{_make_png_for_each_iteration};
221             }
222             foreach my $cluster (@$initial_clusters) {
223             my ($mean, $covariance) = $self->estimate_mean_and_covariance($cluster);
224             display_mean_and_covariance($mean, $covariance) if $self->{_debug};
225             }
226             my @clusters = @$initial_clusters;
227             display_clusters(\@clusters) if $self->{_debug};
228             my $iteration_index = 0;
229             my $unimodal_correction_flag;
230             my $previous_min_value_for_unimodality_quotient;
231             while ($iteration_index < $self->{_max_iterations}) {
232             print "\n\n========================== STARTING ITERATION $iteration_index =====================\n\n"
233             if $self->{_terminal_output};
234             my $total_reconstruction_error_this_iteration = 0;
235             my @subspace_construction_errors_this_iteration;
236             my @trailing_eigenvec_matrices_for_all_subspaces;
237             my @reference_vecs_for_all_subspaces;
238             foreach my $cluster (@clusters) {
239             next if @$cluster == 0;
240             my ($mean, $covariance) = $self->estimate_mean_and_covariance($cluster);
241             display_mean_and_covariance($mean, $covariance) if $self->{_debug};
242             print "--------------end of displaying mean and covariance\n\n" if $self->{_debug};
243             my ($eigenvecs, $eigenvals) = $self->eigen_analysis_of_covariance($covariance);
244             my @trailing_eigenvecs = @{$eigenvecs}[$self->{_P} .. $self->{_data_dimensions}-1];
245             my @trailing_eigenvals = @{$eigenvals}[$self->{_P} .. $self->{_data_dimensions}-1];
246             my $subspace_construction_error = reduce {abs($a) + abs($b)} @trailing_eigenvals;
247             push @subspace_construction_errors_this_iteration, $subspace_construction_error;
248             my $trailing_eigenvec_matrix = Math::GSL::Matrix->new($self->{_data_dimensions},
249             scalar(@trailing_eigenvecs));
250             foreach my $j (0..@trailing_eigenvecs-1) {
251             print "trailing eigenvec column: @{$trailing_eigenvecs[$j]}\n" if $self->{_debug};
252             $trailing_eigenvec_matrix->set_col($j, $trailing_eigenvecs[$j]);
253             }
254             push @trailing_eigenvec_matrices_for_all_subspaces,$trailing_eigenvec_matrix;
255             push @reference_vecs_for_all_subspaces, $mean;
256             }
257             $self->set_termination_reconstruction_error_threshold(\@reference_vecs_for_all_subspaces);
258             my %best_subspace_based_partition_of_data;
259             foreach my $i (0..$self->{_KM}-1) {
260             $best_subspace_based_partition_of_data{$i} = [];
261             }
262             foreach my $data_tag (@{$self->{_data_tags}}) {
263             my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
264             $data_vec->set_col(0, $self->{_data_hash}->{$data_tag});
265             my @errors = map {$self->reconstruction_error($data_vec,
266             $trailing_eigenvec_matrices_for_all_subspaces[$_],
267             $reference_vecs_for_all_subspaces[$_])}
268             0 .. $self->{_KM}-1;
269             my ($minval, $index_for_closest_subspace) = minimum(\@errors);
270             $total_reconstruction_error_this_iteration += $minval;
271             push @{$best_subspace_based_partition_of_data{$index_for_closest_subspace}},
272             $data_tag;
273             }
274             print "Finished calculating the eigenvectors for the clusters produced by the previous\n" .
275             "iteration and re-assigning the data samples to the new subspaces on the basis of\n".
276             "the least reconstruction error.\n\n" .
277             "Total reconstruction error in this iteration: $total_reconstruction_error_this_iteration\n"
278             if $self->{_terminal_output};
279             foreach my $i (0..$self->{_KM}-1) {
280             $clusters[$i] = $best_subspace_based_partition_of_data{$i};
281             }
282             display_clusters(\@clusters) if $self->{_terminal_output};
283             # Check if any cluster has lost all its elements. If so, fragment the worst
284             # existing cluster to create the additional clusters needed:
285             if (any {@$_ == 0} @clusters) {
286             die "empty cluster found" if $self->{_auto_retry_flag};
287             print "\nOne or more clusters have become empty. Will carve out the needed clusters\n" .
288             "from the cluster with the largest subspace construction error.\n\n";
289             $total_reconstruction_error_this_iteration = 0;
290             @subspace_construction_errors_this_iteration = ();
291             my $how_many_extra_clusters_needed = $self->{_KM} - scalar(grep {@$_ != 0} @clusters);
292             print "number of extra clusters needed at iteration $iteration_index: $how_many_extra_clusters_needed\n";
293             my $max = List::Util::max @subspace_construction_errors_this_iteration;
294             my $maxindex = List::Util::first {$_ == $max} @subspace_construction_errors_this_iteration;
295             my @cluster_fragments = cluster_split($clusters[$maxindex],
296             $how_many_extra_clusters_needed + 1);
297             my @newclusters;
298             push @newclusters, @clusters[0 .. $maxindex-1];
299             push @newclusters, @clusters[$maxindex+1 .. $self->{_KM}-1];
300             push @newclusters, @cluster_fragments;
301             @newclusters = grep {@$_ != 0} @newclusters;
302             die "something went wrong with cluster fragmentation"
303             unless $self->{_KM} = @newclusters;
304             @trailing_eigenvec_matrices_for_all_subspaces = ();
305             @reference_vecs_for_all_subspaces = ();
306             foreach my $cluster (@newclusters) {
307             die "Linear Manifold Clustering did not work $!" if @$cluster == 0;
308             my ($mean, $covariance) = estimate_mean_and_covariance($cluster);
309             my ($eigenvecs, $eigenvals) = eigen_analysis_of_covariance($covariance);
310             my @trailing_eigenvecs = @{$eigenvecs}[$self->{_P} .. $self->{_data_dimensions}-1];
311             my @trailing_eigenvals = @{$eigenvals}[$self->{_P} .. $self->{_data_dimensions}-1];
312             my $subspace_construction_error = reduce {abs($a) + abs($b)} @trailing_eigenvals;
313             push @subspace_construction_errors_this_iteration, $subspace_construction_error;
314             my $trailing_eigenvec_matrix = Math::GSL::Matrix->new($self->{_data_dimensions},
315             scalar(@trailing_eigenvecs));
316             foreach my $j (0..@trailing_eigenvecs-1) {
317             $trailing_eigenvec_matrix->set_col($j, $trailing_eigenvecs[$j]);
318             }
319             push @trailing_eigenvec_matrices_for_all_subspaces,$trailing_eigenvec_matrix;
320             push @reference_vecs_for_all_subspaces, $mean;
321             }
322             my %best_subspace_based_partition_of_data;
323             foreach my $i (0..$self->{_KM}-1) {
324             $best_subspace_based_partition_of_data{$i} = [];
325             }
326             foreach my $data_tag (@{$self->{_data_tags}}) {
327             my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
328             $data_vec->set_col(0, $self->{_data_hash}->{$data_tag});
329             my @errors = map {reconstruction_error($data_vec,
330             $trailing_eigenvec_matrices_for_all_subspaces[$_],
331             $reference_vecs_for_all_subspaces[$_])}
332             0 .. $self->{_KM}-1;
333             my ($minval, $index_for_closest_subspace) = minimum(\@errors);
334             $total_reconstruction_error_this_iteration += $minval;
335             push @{$best_subspace_based_partition_of_data{$index_for_closest_subspace}},
336             $data_tag;
337             }
338             print "empty-cluster jag: total reconstruction error in this iteration: \n" .
339             "$total_reconstruction_error_this_iteration\n"
340             if $self->{_debug};
341             foreach my $i (0..$self->{_KM}-1) {
342             $clusters[$i] = $best_subspace_based_partition_of_data{$i};
343             }
344             display_clusters(\@newclusters) if $self->{_terminal_output};
345             @clusters = grep {@$_ != 0} @newclusters;
346             die "linear manifold based algorithm does not appear to work in this case $!"
347             unless @clusters == $self->{_KM};
348             }# end of foreach my $cluster (@clusters) ... loop followed by if clause for empty clusters
349             if ($self->{_data_dimensions} == 3) {
350             $visualization_msg = "clustering_at_iteration_$iteration_index";
351             $self->visualize_clusters_on_sphere($visualization_msg, \@clusters)
352             if $self->{_visualize_each_iteration};
353             $self->visualize_clusters_on_sphere($visualization_msg, \@clusters, "png")
354             if $self->{_make_png_for_each_iteration};
355             }
356             my @cluster_unimodality_quotients = map {$self->cluster_unimodality_quotient($clusters[$_],
357             $reference_vecs_for_all_subspaces[$_])} 0..@clusters-1;
358             my $min_value_for_unimodality_quotient = List::Util::min @cluster_unimodality_quotients;
359             print "\nCluster unimodality quotients: @cluster_unimodality_quotients\n" if $self->{_terminal_output};
360             die "\n\nBailing out!\n" .
361             "It does not look like these iterations will lead to a good clustering result.\n" .
362             "Program terminating. Try running again.\n"
363             if defined($previous_min_value_for_unimodality_quotient)
364             && ($min_value_for_unimodality_quotient < 0.4)
365             && ($min_value_for_unimodality_quotient < (0.5 * $previous_min_value_for_unimodality_quotient));
366             if ( $min_value_for_unimodality_quotient < 0.5 ) {
367             $unimodal_correction_flag = 1;
368             print "\nApplying unimodality correction:\n\n" if $self->{_terminal_output};
369             my @sorted_cluster_indexes =
370             sort {$cluster_unimodality_quotients[$b] <=> $cluster_unimodality_quotients[$a]} 0..@clusters-1;
371             my @newclusters;
372             foreach my $cluster_index (0..@clusters - 1) {
373             push @newclusters, $clusters[$sorted_cluster_indexes[$cluster_index]];
374             }
375             @clusters = @newclusters;
376             my $worst_cluster = pop @clusters;
377             print "\nthe worst cluster: @$worst_cluster\n" if $self->{_terminal_output};
378             my $second_worst_cluster = pop @clusters;
379             print "\nthe second worst cluster: @$second_worst_cluster\n" if $self->{_terminal_output};
380             push @$worst_cluster, @$second_worst_cluster;
381             fisher_yates_shuffle($worst_cluster);
382             my @first_half = @$worst_cluster[0 .. int(scalar(@$worst_cluster)/2) - 1];
383             my @second_half = @$worst_cluster[int(scalar(@$worst_cluster)/2) .. @$worst_cluster - 1];
384             push @clusters, \@first_half;
385             push @clusters, \@second_half;
386             if ($self->{_terminal_output}) {
387             print "\n\nShowing the clusters obtained after applying the unimodality correction:\n";
388             display_clusters(\@clusters);
389             }
390             }
391             if (@{$self->{_reconstruction_error_as_a_function_of_iteration}} > 0) {
392             my $last_recon_error = pop @{$self->{_reconstruction_error_as_a_function_of_iteration}};
393             push @{$self->{_reconstruction_error_as_a_function_of_iteration}}, $last_recon_error;
394             if (($last_recon_error - $total_reconstruction_error_this_iteration)
395             < $self->{_delta_normalized_error}) {
396             push @{$self->{_reconstruction_error_as_a_function_of_iteration}},
397             $total_reconstruction_error_this_iteration;
398             last;
399             }
400             }
401             push @{$self->{_reconstruction_error_as_a_function_of_iteration}},
402             $total_reconstruction_error_this_iteration;
403             $iteration_index++;
404             $previous_min_value_for_unimodality_quotient = $min_value_for_unimodality_quotient;
405             } # end of while loop on iteration_index
406             $self->{_num_iterations_actually_used} =
407             scalar @{$self->{_reconstruction_error_as_a_function_of_iteration}};
408             if ($self->{_terminal_output}) {
409             print "\nIterations of the main loop terminated at iteration number $iteration_index.\n";
410             print "Will now invoke graph partitioning to discover dominant clusters and to\n" .
411             "merge small clusters.\n\n" if $self->{_cluster_search_multiplier} > 1;
412             print "Total reconstruction error as a function of iterations: " .
413             "@{$self->{_reconstruction_error_as_a_function_of_iteration}}";
414             }
415             # now merge sub-clusters if cluster_search_multiplier > 1
416             my @final_clusters;
417             if ($self->{_cluster_search_multiplier} > 1) {
418             print "\n\nInvoking recursive graph partitioning to merge small clusters\n\n";
419             my @array_of_partitioned_cluster_groups = (\@clusters);
420             my @partitioned_cluster_groups;
421             my $how_many_clusters_looking_for = $self->{_K};
422             while (scalar(@final_clusters) < $self->{_K}) {
423             @partitioned_cluster_groups =
424             $self->graph_partition(shift @array_of_partitioned_cluster_groups,
425             $how_many_clusters_looking_for );
426             if (@{$partitioned_cluster_groups[0]} == 1) {
427             my $singular_cluster = shift @{$partitioned_cluster_groups[0]};
428             push @final_clusters, $singular_cluster;
429             $how_many_clusters_looking_for--;
430             push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[1];
431             } elsif (@{$partitioned_cluster_groups[1]} == 1) {
432             my $singular_cluster = shift @{$partitioned_cluster_groups[1]};
433             push @final_clusters, $singular_cluster;
434             $how_many_clusters_looking_for--;
435             push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[0];
436             } else {
437             push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[0];
438             push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[1];
439             }
440             }
441             my @data_clustered;
442             foreach my $cluster (@final_clusters) {
443             push @data_clustered, @$cluster;
444             }
445             unless (scalar(@data_clustered) == scalar(@{$self->{_data_tags}})) {
446             $self->{_final_clusters} = \@final_clusters;
447             my %data_clustered = map {$_ => 1} @data_clustered;
448             my @data_tags_not_clustered =
449             grep {$_} map {exists $data_clustered{$_} ? undef : $_} @{$self->{_data_tags}};
450             if ($self->{_terminal_output}) {
451             print "\n\nNot all data clustered. The most reliable clusters found by graph partitioning:\n";
452             display_clusters(\@final_clusters);
453             print "\n\nData not yet clustered:\n\n@data_tags_not_clustered\n";
454             }
455             if ($self->{_data_dimensions} == 3) {
456             $visualization_msg = "$self->{_K}_best_clusters_produced_by_graph_partitioning";
457             $self->visualize_clusters_on_sphere($visualization_msg, \@final_clusters)
458             if $self->{_visualize_each_iteration};
459             $self->visualize_clusters_on_sphere($visualization_msg, \@final_clusters, "png")
460             if $self->{_make_png_for_each_iteration};
461             }
462             my %data_tags_to_cluster_label_hash;
463             foreach my $i (0..@final_clusters-1) {
464             map {$data_tags_to_cluster_label_hash{$_} = $i} @{$final_clusters[$i]};
465             }
466             $self->{_data_tags_to_cluster_label_hash} = \%data_tags_to_cluster_label_hash;
467             foreach my $tag (@data_tags_not_clustered) {
468             my $which_cluster = $self->which_cluster_for_new_element($tag);
469             $self->{_data_tags_to_cluster_label_hash}->{$tag} = $which_cluster;
470             }
471             die "Some data elements are still missing from the final tally"
472             unless scalar(keys %{$self->{_data_tags_to_cluster_label_hash}}) ==
473             scalar(@{$self->{_data_tags}});
474             my @new_final_clusters;
475             map { foreach my $ele (keys %{$self->{_data_tags_to_cluster_label_hash}}) {
476             push @{$new_final_clusters[$_]}, $ele
477             if $self->{_data_tags_to_cluster_label_hash}->{$ele} == $_ }
478             } 0..$self->{_K}-1;
479             if ($self->{_debug}) {
480             print "\ndisplaying the final clusters after accounting for unclustered data:\n";
481             display_clusters(\@new_final_clusters);
482             }
483             $self->{_final_clusters} = \@new_final_clusters;
484             @final_clusters = @new_final_clusters;
485             }
486             } else {
487             @final_clusters = @clusters;
488             }
489             print "\n\nDisplaying final clustering results:\n\n" if $self->{_terminal_output};
490             display_clusters(\@final_clusters) if $self->{_terminal_output};
491             return \@final_clusters;
492             }
493              
494             sub display_reconstruction_errors_as_a_function_of_iterations {
495             my $self = shift;
496             print "\n\nNumber of iterations used in Phase 1: $self->{_num_iterations_actually_used}\n";
497             print "\nTotal reconstruction error as a function of iterations in Phase 1: " .
498             "@{$self->{_reconstruction_error_as_a_function_of_iteration}}\n";
499             }
500              
501             sub set_termination_reconstruction_error_threshold {
502             my $self = shift;
503             my $all_ref_vecs = shift;
504             my @mean_vecs = @$all_ref_vecs;
505             my $sum_of_mean_magnitudes = reduce {$a+$b} map { my $result = transpose($_) * $_;
506             my @result = $result->as_list;
507             sqrt($result[0])
508             } @mean_vecs;
509             $self->{_scale_factor} = $sum_of_mean_magnitudes / @mean_vecs;
510             $self->{_delta_normalized_error} = ($sum_of_mean_magnitudes / @mean_vecs ) *
511             $self->{_delta_reconstruction_error};
512             }
513              
# This method is called only in the `unless' clause at the end of the main
# linear_manifold_clusterer() method. It is called to find the cluster labels for
# those data elements that were left unclustered by the main part of the algorithm
# when graph partitioning is used to merge similar sub-clusters. The operating logic
# here is that graph partition yields K main clusters even though each main cluster
# may not yet be fully populated.
#
# Returns the index (0 .. K-1) of the final cluster whose manifold subspace
# reconstructs the data element named by $data_tag with the smallest error.
sub which_cluster_for_new_element {
    my $self = shift;
    my $data_tag = shift;   # symbolic name of the data element to be assigned
    # The following `unless' clause is called only the first time the current method
    # is called: it caches, for each final cluster, the trailing-eigenvector matrix
    # and the mean (reference) vector that together define the cluster's subspace.
    unless (@{$self->{_final_trailing_eigenvec_matrices_for_all_subspaces}} > 0) {
        my @trailing_eigenvec_matrices_for_all_subspaces;
        my @reference_vecs_for_all_subspaces;
        foreach my $cluster (@{$self->{_final_clusters}}) {
            my ($mean, $covariance) = $self->estimate_mean_and_covariance($cluster);
            my ($eigenvecs, $eigenvals) = $self->eigen_analysis_of_covariance($covariance);
            # Keep only the trailing eigenvectors (indices _P .. dim-1); these span
            # the directions perpendicular to the cluster's manifold subspace.
            my @trailing_eigenvecs = @{$eigenvecs}[$self->{_P} .. $self->{_data_dimensions}-1];
            my $trailing_eigenvec_matrix = Math::GSL::Matrix->new($self->{_data_dimensions},
                                                                  scalar(@trailing_eigenvecs));
            foreach my $j (0..@trailing_eigenvecs-1) {
                $trailing_eigenvec_matrix->set_col($j, $trailing_eigenvecs[$j]);
            }
            push @trailing_eigenvec_matrices_for_all_subspaces,$trailing_eigenvec_matrix;
            push @reference_vecs_for_all_subspaces, $mean;
        }
        $self->{_final_trailing_eigenvec_matrices_for_all_subspaces} =
                                     \@trailing_eigenvec_matrices_for_all_subspaces;
        $self->{_final_reference_vecs_for_all_subspaces} = \@reference_vecs_for_all_subspaces;
    }
    # Wrap the element's coordinates in a column matrix and score it against every
    # one of the K cached subspaces:
    my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
    $data_vec->set_col(0, $self->{_data_hash}->{$data_tag});
    my @errors = map {$self->reconstruction_error($data_vec,
                          $self->{_final_trailing_eigenvec_matrices_for_all_subspaces}->[$_],
                          $self->{_final_reference_vecs_for_all_subspaces}->[$_])}
                 0 .. $self->{_K}-1;
    # minimum() returns (value, index); the index of the smallest reconstruction
    # error identifies the best-fitting cluster.
    my ($minval, $index_for_closest_subspace) = minimum(\@errors);
    return $index_for_closest_subspace;
}
553              
# Partitions a set of sub-clusters into groups by spectral graph partitioning.
# Each sub-cluster is one node of the graph; the edge weight between two nodes
# is a similarity that combines (1) the mutual reconstruction error between the
# two sub-clusters' manifold subspaces and (2) the squared distance between
# their means, both converted to similarities with exp(-x / normalizer).  The
# cut is made with the eigenvector associated with the second-smallest
# eigenvalue of the symmetric normalized Laplacian (the Fiedler vector).
#
# Returns a pair of array refs; depending on the branch taken at the end this
# is either (finalized clusters, clusters still needing partitioning) or
# (positive-side clusters, negative-side clusters).  May die when the clusters
# are too defective for the normalizers to be meaningful, or when no consensus
# partition emerges.
sub graph_partition {
    my $self = shift;
    my $clusters = shift;                         # ref to a list of clusters (lists of data tags)
    my $how_many_clusters_looking_for = shift;    # target number K of main clusters
    print "\n\nGraph partitioning looking for $how_many_clusters_looking_for clusters\n\n";
    my $num_nodes = scalar @$clusters;
    # W: similarity (adjacency) matrix; D: degree matrix; neg_sqrt_of_D: D^(-1/2).
    my $W = Math::GSL::Matrix->new($num_nodes,$num_nodes);
    my $D = Math::GSL::Matrix->new($num_nodes,$num_nodes);
    $D->identity;
    my $neg_sqrt_of_D = Math::GSL::Matrix->new($num_nodes,$num_nodes);
    $neg_sqrt_of_D->identity;
    # NOTE(review): @subspace_construction_errors is collected below but not used
    # anywhere else in this subroutine.
    my @subspace_construction_errors;
    my @trailing_eigenvec_matrices_for_all_subspaces;
    my @reference_vecs_for_all_subspaces;
    # For every sub-cluster, compute the subspace description: the trailing
    # eigenvectors of its covariance (directions perpendicular to the manifold)
    # and its mean, which serves as the subspace's reference vector.
    foreach my $cluster (@$clusters) {
        my ($mean, $covariance) = $self->estimate_mean_and_covariance($cluster);
        my ($eigenvecs, $eigenvals) = $self->eigen_analysis_of_covariance($covariance);
        my @trailing_eigenvecs = @{$eigenvecs}[$self->{_P} .. $self->{_data_dimensions}-1];
        my @trailing_eigenvals = @{$eigenvals}[$self->{_P} .. $self->{_data_dimensions}-1];
        my $subspace_construction_error = reduce {abs($a) + abs($b)} @trailing_eigenvals;
        push @subspace_construction_errors, $subspace_construction_error;
        my $trailing_eigenvec_matrix = Math::GSL::Matrix->new($self->{_data_dimensions},
                                                              scalar(@trailing_eigenvecs));
        foreach my $j (0..@trailing_eigenvecs-1) {
            print "trailing eigenvec column: @{$trailing_eigenvecs[$j]}\n" if $self->{_debug};
            $trailing_eigenvec_matrix->set_col($j, $trailing_eigenvecs[$j]);
        }
        push @trailing_eigenvec_matrices_for_all_subspaces,$trailing_eigenvec_matrix;
        push @reference_vecs_for_all_subspaces, $mean;
    }
    # We consider the similarity matrix W to be a sum of two parts W_recon_based and
    # W_dist_bet_means based. IMPORTANT: For coding reasons, we first store the two
    # similarity measures separately in W_recon_based and W_dist_bet_means based. Our
    # goal is to fill up these component matrices with the raw values while at the
    # same time collecting information needed for normalizing these two separate
    # measures of similarity.
    my $W_reconstruction_error_based = Math::GSL::Matrix->new($num_nodes,$num_nodes);
    my $W_dist_between_means_based = Math::GSL::Matrix->new($num_nodes,$num_nodes);
    my @all_pairwise_reconstruction_errors;
    my @all_dist_between_means_errors;
    foreach my $i (0..$num_nodes-1) {
        foreach my $j (0..$num_nodes-1) {
            my ($recon_error_similarity, $dist_bet_means) = $self->pairwise_cluster_similarity(
                                              $clusters->[$i],
                                              $trailing_eigenvec_matrices_for_all_subspaces[$i],
                                              $reference_vecs_for_all_subspaces[$i],
                                              $clusters->[$j],
                                              $trailing_eigenvec_matrices_for_all_subspaces[$j],
                                              $reference_vecs_for_all_subspaces[$j]);
            $W_reconstruction_error_based->set_elem($i, $j, $recon_error_similarity);
            $W_dist_between_means_based->set_elem($i, $j, $dist_bet_means);
            push @all_pairwise_reconstruction_errors, $recon_error_similarity;
            push @all_dist_between_means_errors, $dist_bet_means;
        }
    }
    # Each raw measure is normalized by its own mean over all node pairs before
    # being exponentiated into a similarity:
    my $recon_error_normalizer = (reduce {$a + $b} @all_pairwise_reconstruction_errors) /
                                             (scalar @all_pairwise_reconstruction_errors);
    my $dist_bet_means_based_normalizer = (reduce {$a + $b} @all_dist_between_means_errors ) /
                                             (scalar @all_dist_between_means_errors );
    die "\n\nBailing out!\n" .
        "Dealing with highly defective clusters. Try again.\n"
        if ($recon_error_normalizer == 0) || ($dist_bet_means_based_normalizer == 0);
    # Convert the raw dissimilarity values into similarities in (0,1]:
    foreach my $i (0..$num_nodes-1) {
        foreach my $j (0..$num_nodes-1) {
            my $recon_val = $W_reconstruction_error_based->get_elem($i,$j);
            my $new_recon_val = exp( -1.0 * $recon_val / $recon_error_normalizer );
            $W_reconstruction_error_based->set_elem($i,$j,$new_recon_val);
            my $mean_dist_val = $W_dist_between_means_based->get_elem($i,$j);
            my $new_mean_dist_val = exp( -1.0 * $mean_dist_val / $dist_bet_means_based_normalizer );
            $W_dist_between_means_based->set_elem($i,$j,$new_mean_dist_val);
        }
    }
    $W = $W_reconstruction_error_based + $W_dist_between_means_based;
    if ($self->{_debug}) {
        print "\nDisplaying the similarity matrix W for the cluster graph:\n";
        # note: displayed only when BOTH _debug and _terminal_output are set
        display_matrix($W) if $self->{_terminal_output};
    }
    # Node degree = sum of that node's row of W (computed here by summing columns
    # into a single column vector):
    my $add_all_columns = Math::GSL::Matrix->new($num_nodes,1);
    foreach my $col (0..$num_nodes-1) {
        $add_all_columns += $W->col($col);
    }
    foreach my $i (0..$num_nodes-1) {
        $D->set_elem($i,$i, $add_all_columns->get_elem($i,0));
        $neg_sqrt_of_D->set_elem($i,$i, 1.0 / sqrt($add_all_columns->get_elem($i,0)));
    }
    # the Laplacian matrix:
    my $Laplacian = $D - $W;
    # the Symmetric Normalized Laplacian matrix A:  A = D^(-1/2) (D - W) D^(-1/2)
    my $A = $neg_sqrt_of_D * $Laplacian * $neg_sqrt_of_D;
    # Suppress small numerical noise so near-zero entries are exactly zero:
    foreach my $i (0..$num_nodes-1) {
        foreach my $j (0..$num_nodes-1) {
            $A->set_elem($i,$j,0) if abs($A->get_elem($i,$j)) < 0.01;
        }
    }
    if ($self->{_terminal_output}) {
        print "\nDisplaying the Symmetric Normalized Laplacian matrix A:\n" .
              "A = neg_sqrt(D) * Laplacian_matrix * neg_sqrt(D)\n";
        display_matrix( $A );
    }
    my ($eigenvalues, $eigenvectors) = $A->eigenpair;
    my $num_of_eigens = @$eigenvalues;
    my $largest_eigen_index = 0;
    my $smallest_eigen_index = 0;
    if ($self->{_debug2}) {
        print "Eigenvalue 0: $eigenvalues->[0]\n";
        foreach my $i (1..$num_of_eigens-1) {
            $largest_eigen_index = $i if $eigenvalues->[$i] > $eigenvalues->[$largest_eigen_index];
            $smallest_eigen_index = $i if $eigenvalues->[$i] < $eigenvalues->[$smallest_eigen_index];
            print "Eigenvalue $i: $eigenvalues->[$i]\n";
        }
        print "\nlargest eigen index: $largest_eigen_index\n";
        print "\nsmallest eigen index: $smallest_eigen_index\n\n";
    }
    # The eigenvector elements come back as strings of the form "[mag,theta]" —
    # presumably a polar rendering of complex values (TODO confirm against the
    # Math::GSL version in use).  theta "0" means +mag, theta "pi" means -mag;
    # anything else indicates a genuinely complex component, which is fatal here.
    my @all_my_eigenvecs;
    foreach my $i (0..$num_of_eigens-1) {
        my @vec = $eigenvectors->[$i]->as_list;
        my @eigenvec;
        foreach my $ele (@vec) {
            my ($mag, $theta) = $ele =~ /\[(\d*\.?\d*e?[+-]?\d*),(\S+)\]/;
            if ($theta eq "0") {
                push @eigenvec, $mag;
            } elsif ($theta eq "pi") {
                push @eigenvec, -1.0 * $mag;
            } else {
                die "Eigendecomposition produced a complex eigenvector!";
            }
        }
        print "Eigenvector $i: @eigenvec\n" if $self->{_debug2};
        push @all_my_eigenvecs, \@eigenvec;
    }
    if ($self->{_debug2}) {
        my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list;
        print "\nLargest eigenvector of A: @largest_eigen_vec\n";
    }
    # Sort the eigenvectors by eigenvalue, largest first:
    my @sorted_eigenvec_indexes = sort {$eigenvalues->[$b] <=> $eigenvalues->[$a]} 0..@all_my_eigenvecs-1;
    print "sorted eigenvec indexes for A: @sorted_eigenvec_indexes\n" if $self->{_debug2};
    my @sorted_eigenvecs;
    my @sorted_eigenvals;
    foreach my $i (0..@sorted_eigenvec_indexes-1) {
        $sorted_eigenvecs[$i] = $all_my_eigenvecs[$sorted_eigenvec_indexes[$i]];
        $sorted_eigenvals[$i] = $eigenvalues->[$sorted_eigenvec_indexes[$i]];
    }
    if ($self->{_debug2}) {
        print "\nHere come sorted eigenvectors for A --- from the largest to the smallest:\n";
        foreach my $i (0..@sorted_eigenvecs-1) {
            print "eigenvec: @{$sorted_eigenvecs[$i]}       eigenvalue: $sorted_eigenvals[$i]\n";
        }
    }
    # Second-to-last entry in a descending sort = eigenvector of the
    # second-smallest eigenvalue, i.e. the Fiedler vector used for the cut:
    my $best_partitioning_eigenvec = $sorted_eigenvecs[@sorted_eigenvec_indexes-2];
    print "\nBest graph partitioning eigenvector: @$best_partitioning_eigenvec\n" if $self->{_terminal_output};
    # The signs of the Fiedler vector's components define the two sides of the cut:
    my $how_many_positive = reduce {$a + $b} map {$_ > 0 ? 1 : 0 } @$best_partitioning_eigenvec;
    my $how_many_negative = scalar(@$best_partitioning_eigenvec) - $how_many_positive;
    print "Have $how_many_positive positive and $how_many_negative negative elements in the partitioning vec\n"
        if $self->{_terminal_output};
    if ($how_many_clusters_looking_for <= 3) {
        # Near the end of the search, peel off one finalized cluster per call:
        my @merged_cluster;
        my $final_cluster;
        my @newclusters;
        if ($how_many_positive == 1) {
            # Exactly one node on the positive side: it alone becomes a final cluster.
            foreach my $i (0..@$clusters-1) {
                if ($best_partitioning_eigenvec->[$i] > 0) {
                    $final_cluster = $clusters->[$i];
                } else {
                    push @newclusters, $clusters->[$i];
                }
            }
            return ([$final_cluster], \@newclusters);
        } elsif ($how_many_negative == 1) {
            # Symmetric case: lone node on the negative side becomes final.
            foreach my $i (0..@$clusters-1) {
                if ($best_partitioning_eigenvec->[$i] < 0) {
                    $final_cluster = $clusters->[$i];
                } else {
                    push @newclusters, $clusters->[$i];
                }
            }
            return ([$final_cluster], \@newclusters);
        } elsif ($how_many_positive <= $self->{_cluster_search_multiplier}) {
            # A small positive side: merge its sub-clusters into one final cluster.
            foreach my $i (0..@$clusters-1) {
                if ($best_partitioning_eigenvec->[$i] > 0) {
                    push @merged_cluster, @{$clusters->[$i]};
                } else {
                    push @newclusters, $clusters->[$i];
                }
            }
            return ([\@merged_cluster], \@newclusters);
        } elsif ($how_many_negative <= $self->{_cluster_search_multiplier}) {
            # A small negative side: merge its sub-clusters into one final cluster.
            foreach my $i (0..@$clusters-1) {
                if ($best_partitioning_eigenvec->[$i] < 0) {
                    push @merged_cluster, @{$clusters->[$i]};
                } else {
                    push @newclusters, $clusters->[$i];
                }
            }
            return ([\@merged_cluster], \@newclusters);
        } else {
            die "\n\nBailing out!\n\n" .
                "No consensus support for dominant clusters in the graph partitioning step\n" .
                "of the algorithm. This can be caused by bad random selection of initial\n" .
                "cluster centers.  Please run this program again.\n";
        }
    } else {
        # Early in the search: simply split the node set along the sign of the
        # Fiedler vector and let the caller recurse on both halves.
        my @positive_clusters;
        my @negative_clusters;
        foreach my $i (0..@$clusters-1) {
            if ($best_partitioning_eigenvec->[$i] > 0) {
                push @positive_clusters, $clusters->[$i];
            } else {
                push @negative_clusters, $clusters->[$i];
            }
        }
        return (\@positive_clusters, \@negative_clusters);
    }
}
767              
# Computes two raw dissimilarity measures for a pair of sub-clusters:
#   (1) the mutual reconstruction error: the total error incurred when the
#       data elements of cluster 1 are reconstructed with cluster 2's subspace
#       (its trailing-eigenvector matrix and reference vector) plus the error
#       of reconstructing cluster 2's elements with cluster 1's subspace; and
#   (2) the squared Euclidean distance between the two cluster reference
#       (mean) vectors.
# Both values are returned unnormalized; graph_partition() normalizes them and
# maps them into similarities with exp(-x / normalizer).
sub pairwise_cluster_similarity {
    my $self = shift;
    my $cluster1 = shift;                           # ref to list of data tags
    my $trailing_eigenvec_matrix_cluster1 = shift;  # Math::GSL::Matrix
    my $reference_vec_cluster1 = shift;             # cluster-1 mean (GSL column)
    my $cluster2 = shift;
    my $trailing_eigenvec_matrix_cluster2 = shift;
    my $reference_vec_cluster2 = shift;
    my $total_reconstruction_error_in_this_iteration = 0;
    # Error of explaining cluster 1's data with cluster 2's subspace:
    my @errors_for_1_on_2 = map {my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
                                 $data_vec->set_col(0, $self->{_data_hash}->{$_});
                                 $self->reconstruction_error($data_vec,
                                                             $trailing_eigenvec_matrix_cluster2,
                                                             $reference_vec_cluster2)}
                            @$cluster1;
    # And the symmetric counterpart — cluster 2's data against cluster 1's subspace:
    my @errors_for_2_on_1 = map {my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
                                 $data_vec->set_col(0, $self->{_data_hash}->{$_});
                                 $self->reconstruction_error($data_vec,
                                                             $trailing_eigenvec_matrix_cluster1,
                                                             $reference_vec_cluster1)}
                            @$cluster2;
    # NOTE(review): with a single-element list, reduce returns that element
    # without abs() being applied — harmless here because reconstruction_error()
    # returns a squared (non-negative) magnitude.
    my $type_1_error = reduce {abs($a) + abs($b)} @errors_for_1_on_2;
    my $type_2_error = reduce {abs($a) + abs($b)} @errors_for_2_on_1;
    my $total_reconstruction_error = $type_1_error + $type_2_error;
    # Squared distance between the means, via the 1x1 product d' * d:
    my $diff_between_the_means = $reference_vec_cluster1 - $reference_vec_cluster2;
    my $dist_squared = transpose($diff_between_the_means) * $diff_between_the_means;
    my @dist_squared_as_list = $dist_squared->as_list();
    my $dist_between_means_based_error = shift @dist_squared_as_list;
    return ($total_reconstruction_error, $dist_between_means_based_error);
}
798              
# delta ball
# Measures how unimodal (blob-like) a cluster is: the fraction of the cluster's
# elements whose coordinate along EVERY dimension falls inside a "delta ball"
# centered at the cluster mean, whose half-width along a dimension is
# 0.4 * _scale_factor * (per-dimension coordinate range of the cluster).
# Returns a value in [0,1]; 1 means every element is near the mean in all dims.
sub cluster_unimodality_quotient {
    my ($self, $cluster, $mean) = @_;
    my $delta = 0.4 * $self->{_scale_factor};   # delta-ball radius multiplier per dimension
    my @mean_coords = $mean->as_list;
    my @tags_in_range_per_dimension;
    foreach my $dimen (0 .. $self->{_data_dimensions} - 1) {
        my @coords = map { $self->{_data_hash}->{$_}->[$dimen] } @$cluster;
        my $lo = List::Util::min(@coords);
        my $hi = List::Util::max(@coords);
        my $range = $hi - $lo;
        my $center = $mean_coords[$dimen];
        # Keep the tags whose coordinate along this dimension lies strictly
        # inside (center - delta*range, center + delta*range):
        my @tags_in_range =
            grep { $_ }
            map  {
                my $coord = $self->{_data_hash}->{$_}->[$dimen];
                ($coord > $center - $delta * $range && $coord < $center + $delta * $range) ? $_ : undef
            } @$cluster;
        push @tags_in_range_per_dimension, \@tags_in_range;
    }
    # Intersect the per-dimension tag sets; a tag survives only if it was in
    # range along every dimension.
    my %common_tags = map { $_ => 1 } @{ $tags_in_range_per_dimension[0] };
    foreach my $dimen (1 .. $self->{_data_dimensions} - 1) {
        my %tags_this_dimen = map { $_ => 1 } @{ $tags_in_range_per_dimension[$dimen] };
        %common_tags = map { $_ => 1 } grep { $tags_this_dimen{$_} } keys %common_tags;
    }
    return scalar(keys %common_tags) / scalar(@$cluster);
}
835              
# Searches for the reference vector that, together with the supplied trailing
# eigenvectors, minimizes the total reconstruction error over all the data.
# Candidates are the cluster mean plus 100 points drawn uniformly at random
# from the cluster's per-dimension bounding box.  Returns a ref to the winning
# candidate's coordinate list.
sub find_best_ref_vector {
    my $self = shift;
    my $cluster = shift;                    # ref to a list of data tags
    my $trailing_eigenvec_matrix = shift;   # Math::GSL::Matrix of trailing eigenvectors
    my $mean = shift;                       # a GSL matrix ref (the cluster mean)
    # Establish the per-dimension bounding box of the cluster so candidates can
    # be drawn uniformly from inside it:
    my @min_bounds;
    my @max_bounds;
    my @ranges;
    foreach my $dimen (0..$self->{_data_dimensions}-1) {
        my @values = map {$_->[$dimen]} map {$self->{_data_hash}->{$_}} @$cluster;
        my ($min, $max) = (List::Util::min(@values), List::Util::max(@values));
        push @min_bounds, $min;
        push @max_bounds, $max;
        push @ranges, $max - $min;
    }
    print "min bounds are: @min_bounds\n";
    print "max bounds are: @max_bounds\n";
    my $max_iterations = 100;
    my @random_points;
    my $iteration = 0;
    while ($iteration++ < $max_iterations) {
        my @coordinate_vec;
        foreach my $dimen (0..$self->{_data_dimensions}-1) {
            push @coordinate_vec, $min_bounds[$dimen] + rand($ranges[$dimen]);
        }
        push @random_points, \@coordinate_vec;
    }
    if ($self->{_debug}) {
        print "\nrandom points\n";
        map {print "@$_\n"} @random_points;
    }
    # The cluster mean is candidate 0 and competes with the random draws:
    my @mean = $mean->as_list;
    unshift @random_points, \@mean;
    # Score every candidate by the total reconstruction error it induces over
    # all the data points:
    my @reconstruction_errors;
    foreach my $candidate_ref_vec (@random_points) {
        my $ref_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
        $ref_vec->set_col(0, $candidate_ref_vec);
        my $reconstruction_error_for_a_ref_vec = 0;
        foreach my $data_tag (@{$self->{_data_tags}}) {
            my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
            $data_vec->set_col(0, $self->{_data_hash}->{$data_tag});
            my $error = $self->reconstruction_error($data_vec,$trailing_eigenvec_matrix,$ref_vec);
            $reconstruction_error_for_a_ref_vec += $error;
        }
        push @reconstruction_errors, $reconstruction_error_for_a_ref_vec;
    }
    # BUG FIX: the previous version located the winner with List::Util::first(),
    # which returns the matching VALUE, not its positional index, so an error
    # magnitude was being used as an index into @random_points.  It had also
    # shifted the mean's error off @reconstruction_errors while the mean was
    # still present in @random_points, misaligning the two arrays by one.  We
    # now use the minimum() helper (which returns value AND index) over the
    # full, aligned candidate list — the mean included.
    my ($min_error, $min_index) = minimum(\@reconstruction_errors);
    my $refvec = $random_points[$min_index];
    return $refvec;
}
889              
## The reconstruction error relates to the size of the perpendicular from a data
## point X to the hyperplane that defines a given subspace on the manifold.
## Computed as the quadratic form (X - ref)' * E * E' * (X - ref), where E is
## the matrix of trailing eigenvectors; the result is a squared magnitude and
## therefore non-negative.
sub reconstruction_error {
    my ($self, $data_vec, $trailing_eigenvecs, $ref_vec) = @_;
    # Shift the data point to the subspace's reference frame, then project it
    # onto the trailing (out-of-subspace) directions:
    my $shifted = $data_vec - $ref_vec;
    my $error_matrix = transpose($shifted) * $trailing_eigenvecs *
                       transpose($trailing_eigenvecs) * $shifted;
    # The product is a 1x1 matrix; unwrap it into a plain scalar:
    my ($error_squared_as_scalar) = $error_matrix->as_list();
    return $error_squared_as_scalar;
}
903              
# Returns a set of KM random integers. These serve as indices to reach into the data
# array. A data element whose index is one of the random numbers returned by this
# routine serves as an initial cluster center. Note the quality check it runs on the
# list of the random integers constructed. We first make sure that all the random
# integers returned are different. Subsequently, we carry out a quality assessment
# of the random integers constructed. This quality measure consists of the ratio of
# the values spanned by the random integers to the value of N, the total number of
# data points to be clustered. Currently, if this ratio is less than 0.3, we discard
# the K integers and try again.
sub initialize_cluster_centers {
    my $self = shift;
    my $K = shift; # This value is set to the parameter KM in the call to this subroutine
    my $data_store_size = shift;
    # Guard against an impossible request, which previously would have looped forever:
    die "initialize_cluster_centers: cannot pick $K distinct indices " .
        "out of $data_store_size data elements\n" if $K > $data_store_size;
    my @cluster_center_indices;
    while (1) {
        # BUG FIX: the old duplicate-resolution logic re-drew index $i when it
        # collided with an earlier index $j, but never re-checked the fresh draw
        # against the indices at positions 0..$j-1, so duplicates could survive.
        # Drawing through a %seen hash guarantees all K indices are distinct.
        my %seen;
        @cluster_center_indices = ();
        while (@cluster_center_indices < $K) {
            my $candidate = int rand($data_store_size);
            next if $seen{$candidate}++;
            push @cluster_center_indices, $candidate;
        }
        # Quality check: only accept draws whose values span more than 30% of
        # the data store, so the initial centers are well spread out.
        my $min = List::Util::min(@cluster_center_indices);
        my $max = List::Util::max(@cluster_center_indices);
        my $quality = ($max - $min) / $data_store_size;
        last if $quality > 0.3;
    }
    return @cluster_center_indices;
}
935              
# The purpose of this routine is to form initial clusters by assigning the data
# samples to the initial clusters formed by the previous routine on the basis of the
# best proximity of the data samples to the different cluster centers.
# Returns a ref to a list of clusters, each cluster a list of data tags.
sub assign_data_to_clusters_initial {
    my ($self, $centers_ref) = @_;
    my @cluster_centers = @$centers_ref;
    my @clusters;
    foreach my $data_tag (@{$self->{_data_tags}}) {
        # Distance from this element to every center; minimum() returns
        # (value, index) and the index picks the winning cluster.
        my @distances_to_centers = map { $self->distance($data_tag, $_) } @cluster_centers;
        my (undef, $winning_center_index) = minimum(\@distances_to_centers);
        push @{ $clusters[$winning_center_index] }, $data_tag;
    }
    return \@clusters;
}
954              
# The following routine is for computing the distance between a data point specified
# by its symbolic name in the master datafile and a point (such as the center of a
# cluster) expressed as a vector of coordinates:
sub distance {
    my ($self, $ele1_id, $point_ref) = @_;   # symbolic name, then a coordinate arrayref
    my @ele1 = @{$self->{_data_hash}->{$ele1_id}};
    my @ele2 = @$point_ref;
    die "wrong data types for distance calculation\n" if @ele1 != @ele2;
    # Plain Euclidean distance between the two coordinate vectors:
    my $squared_sum = 0;
    $squared_sum += ($ele1[$_] - $ele2[$_]) ** 2 for 0 .. $#ele1;
    return sqrt $squared_sum;
}
972              
# Writes each cluster to its own file, cluster0.txt, cluster1.txt, ..., with
# the cluster's data tags sorted and space-separated on a single line.  Any
# cluster*.txt files from a previous run are removed first.
sub write_clusters_to_files {
    my $self = shift;
    my $clusters = shift;   # ref to a list of clusters, each a list of data tags
    my @clusters = @$clusters;
    unlink glob "cluster*.txt";
    foreach my $i (0..@clusters-1) {
        my $filename = "cluster" . $i . ".txt";
        print "\nWriting cluster $i to file $filename\n" if $self->{_terminal_output};
        # BUG FIX: the old code piped the output through the external `sort'
        # utility, but it printed every tag on one single line, so the line
        # sorter never actually reordered anything.  It also used a bareword
        # filehandle and an implicit shell.  We now sort the tags in Perl and
        # write them through a lexical filehandle with a 3-argument open.
        open my $fh, '>', $filename or die "Unable to open file: $!";
        print {$fh} join(' ', sort @{$clusters[$i]}), "\n";
        close $fh or die "Unable to close file: $!";
    }
}
988              
# Destructor: removes the "__temp_<datafile-basename>.txt" scratch file that
# the visualization code derives from the input datafile's name.
sub DESTROY {
    my $self = shift;
    my $temp_name = basename($self->{_datafile});
    $temp_name =~ s/\.\w+$/\.txt/;   # swap the datafile's extension for .txt
    unlink "__temp_" . $temp_name;
}
994              
995             ################################## Visualization Code ###################################
996              
# Sums, component by component, the coordinate vectors of the named data
# elements.  Returns a ref to the resulting coordinate list (all zeros when the
# id list is empty).
sub add_point_coords {
    my ($self, $ids_ref) = @_;
    my @ids = @$ids_ref;             # array of data element names
    my $dimensionality = $self->{_data_dimensions};
    my @component_sums = (0.0) x $dimensionality;
    foreach my $id (@ids) {
        my $coords = $self->{_data_hash}->{$id};
        my $axis = 0;
        $component_sums[$axis++] += $_ for @$coords;
    }
    return \@component_sums;
}
1015              
# This is the main module version:
# Renders the clusters as colored point sets on (and around) a reference sphere
# with gnuplot.  The cluster data is first written to a temp file in gnuplot's
# multi-block format (one "index" block per cluster), then plotted together
# with a parametric sphere of radius _scale_factor (or 1).  When
# $hardcopy_format is supplied, the plot is saved to an image file instead of
# (only) being shown; when $pause_time is supplied, the window is non-persistent
# and gnuplot pauses for that many seconds.
sub visualize_clusters_on_sphere {
    my $self = shift;
    my $visualization_msg = shift;          # plot title, also used for the hardcopy file name
    # deep copy because the cluster tags are rewritten in place below:
    my $clusters = deep_copy_AoA(shift);
    my $hardcopy_format = shift;            # e.g. "png"; false/undef means screen only
    my $pause_time = shift;                 # optional pause; also controls window persistence
    my $d = $self->{_data_dimensions};
    my $temp_file = "__temp_" . $self->{_datafile};
    $temp_file =~ s/\.\w+$/\.txt/;
    unlink $temp_file if -e $temp_file;
    open OUTPUT, ">$temp_file"
        or die "Unable to open a temp file in this directory: $!";
    # Relabel every cluster with a single letter A, B, C, ... and splice the
    # point's coordinates into its tag.  Note: inside the s///e replacement,
    # $_ still holds the ORIGINAL tag, so the data_hash lookup works.
    my @all_tags = "A".."Z";
    my @retagged_clusters;
    foreach my $cluster (@$clusters) {
        my $label = shift @all_tags;
        my @retagged_cluster =
            map {$_ =~ s/^(\w+?)_(\w+)/$label . "_$2 @{$self->{_data_hash}->{$_}}"/e;$_} @$cluster;
        push @retagged_clusters, \@retagged_cluster;
    }
    # Regroup the retagged records by their one-letter cluster label:
    my %clusters;
    foreach my $cluster (@retagged_clusters) {
        foreach my $record (@$cluster) {
            my @splits = grep $_, split /\s+/, $record;
            $splits[0] =~ /(\w+?)_.*/;
            my $primary_cluster_label = $1;
            my @coords = @splits[1..$d];
            push @{$clusters{$primary_cluster_label}}, \@coords;
        }
    }
    # Write one gnuplot data block per cluster; blocks are separated by two
    # blank lines so they can be addressed with "index $i" in the plot command:
    foreach my $key (sort {"\L$a" cmp "\L$b"} keys %clusters) {
        map {print OUTPUT "$_"} map {"@$_\n"} @{$clusters{$key}};
        print OUTPUT "\n\n";
    }
    my @sorted_cluster_keys = sort {"\L$a" cmp "\L$b"} keys %clusters;
    close OUTPUT;
    my $plot;
    # Without a pause time, keep the window alive after the script exits:
    unless (defined $pause_time) {
        $plot = Graphics::GnuplotIF->new( persist => 1 );
    } else {
        $plot = Graphics::GnuplotIF->new();
    }
    my $arg_string = "";
    $plot->gnuplot_cmd( "set hidden3d" ) unless $self->{_show_hidden_in_3D_plots};
    $plot->gnuplot_cmd( "set title \"$visualization_msg\"" );
    $plot->gnuplot_cmd( "set noclip" );
    $plot->gnuplot_cmd( "set pointsize 2" );
    $plot->gnuplot_cmd( "set parametric" );
    $plot->gnuplot_cmd( "set size ratio 1" );
    $plot->gnuplot_cmd( "set xlabel \"X\"" );
    $plot->gnuplot_cmd( "set ylabel \"Y\"" );
    $plot->gnuplot_cmd( "set zlabel \"Z\"" );
    if ($hardcopy_format) {
        $plot->gnuplot_cmd( "set terminal png" );
        my $image_file_name = "$visualization_msg\.$hardcopy_format";
        $plot->gnuplot_cmd( "set output \"$image_file_name\"" );
        $plot->gnuplot_cmd( "unset hidden3d" );
    }
    # set the range for azimuth angles:
    $plot->gnuplot_cmd( "set urange [0:2*pi]" );
    # set the range for the elevation angles:
    $plot->gnuplot_cmd( "set vrange [-pi/2:pi/2]" );
    # Parametric functions for the sphere
    # $plot->gnuplot_cmd( "r=1" );
    # Sphere radius: use the data's scale factor when it has been set:
    if ($self->{_scale_factor}) {
        $plot->gnuplot_cmd( "r=$self->{_scale_factor}" );
    } else {
        $plot->gnuplot_cmd( "r=1" );
    }
    $plot->gnuplot_cmd( "fx(v,u) = r*cos(v)*cos(u)" );
    $plot->gnuplot_cmd( "fy(v,u) = r*cos(v)*sin(u)" );
    $plot->gnuplot_cmd( "fz(v) = r*sin(v)" );
    my $sphere_arg_str = "fx(v,u),fy(v,u),fz(v) notitle with lines lt 0,";
    foreach my $i (0..scalar(keys %clusters)-1) {
        my $j = $i + 1;
        # The following statement puts the titles on the data points
        $arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"$sorted_cluster_keys[$i] \" with points lt $j pt $j, ";
    }
    # Strip the trailing ", " from the accumulated plot arguments.
    # NOTE(review): this first assigns the boolean result of the match to
    # $arg_string and then overwrites it with $1; it works because $1 is set by
    # the match, but a plain s/,[ ]+$// would be clearer.
    $arg_string = $arg_string =~ /^(.*),[ ]+$/;
    $arg_string = $1;
    $plot->gnuplot_cmd( "splot $sphere_arg_str $arg_string" );
    $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
}
1100              
1101              
1102             ################################### Support Routines ########################################
1103              
# Splits a cluster (a ref to a list of data tags) into $how_many contiguous
# fragments of roughly equal size and returns the list of fragment refs.
# Dies when the cluster is too small to be split that many ways, or when the
# fragmentation fails to account for every element.
sub cluster_split {
    my $cluster = shift;
    my $how_many = shift;
    # BUG FIX: with fewer elements than fragments the old code divided by
    # zero below; fail with a meaningful message instead.
    die "cluster_split: cannot split " . scalar(@$cluster) .
        " elements into $how_many fragments\n"
        if $how_many < 1 or @$cluster < $how_many;
    my @cluster_fragments = map { [] } 0 .. $how_many - 1;
    my $delta = int( scalar(@$cluster) / $how_many );
    foreach my $i (0..@$cluster-1) {
        my $slot = int($i / $delta);
        # BUG FIX: when the cluster size is not an exact multiple of $how_many,
        # int($i/$delta) could reach $how_many, silently growing an extra
        # fragment beyond the requested count.  Clamp the remainder elements
        # into the last requested fragment.
        $slot = $how_many - 1 if $slot > $how_many - 1;
        push @{$cluster_fragments[$slot]}, $cluster->[$i];
    }
    # Sanity check: every element must land in exactly one fragment.
    my $how_many_accounted_for = reduce {$a + $b} map {scalar(@$_)} @cluster_fragments;
    foreach my $frag (@cluster_fragments) {
        print "fragment: @$frag\n";
    }
    die "the fragmentation could not account for all the data"
        unless @$cluster == $how_many_accounted_for;
    return @cluster_fragments;
}
1124              
# Scans the array referenced by the argument and returns a two-element
# list: the smallest value found and the position at which it first occurs.
sub minimum {
    my $values = shift;
    my ($best, $best_pos);
    my $pos = 0;
    for my $val (@$values) {
        ($best, $best_pos) = ($val, $pos) if !defined($best) || $val < $best;
        $pos++;
    }
    return ($best, $best_pos);
}
1138              
# Returns the (smallest, largest) values in the array referenced by the
# argument, skipping any undef entries.  Returns (undef, undef) when no
# defined element exists.
sub minmax {
    my $values = shift;
    my ($lo, $hi);
    for my $val (@$values) {
        next unless defined $val;
        if (!defined $lo) {
            # First defined element seeds both bounds.
            ($lo, $hi) = ($val, $val);
        } elsif ($val < $lo) {
            $lo = $val;
        } elsif ($val > $hi) {
            $hi = $val;
        }
    }
    return ($lo, $hi);
}
1156              
# Meant only for constructing a deep copy of an array of arrays: the copy
# shares no element references with the original, so mutating one never
# affects the other.
sub deep_copy_AoA {
    my $src = shift;
    my $dst;
    my $row_idx = 0;
    for my $row (@$src) {
        my $col_idx = 0;
        for my $cell (@$row) {
            $dst->[$row_idx][$col_idx] = $cell;
            $col_idx++;
        }
        $row_idx++;
    }
    return $dst;
}
1168              
# For displaying the individual clusters on a terminal screen.  Each cluster
# is shown through the symbolic names of its data points.  Note: each
# cluster array is sorted IN PLACE (string sort) as a side effect.
sub display_clusters {
    my $clusters_ref = shift;
    my $cluster_index = 0;
    for my $cluster (@$clusters_ref) {
        @$cluster = sort @$cluster;
        printf "\n\nCluster %d (%d records):\n", $cluster_index, scalar @$cluster;
        print " $_" for @$cluster;
        $cluster_index++;
    }
    print "\n\n";
}
1185              
# Convenience wrapper: pretty-prints a mean vector followed by its
# covariance matrix, delegating the formatting to display_matrix().
sub display_mean_and_covariance {
    my ($mean, $covariance) = @_;
    print "\nDisplay the mean:\n";
    display_matrix($mean);
    print "\nDisplay the covariance:\n";
    display_matrix($covariance);
}
1194              
#  Verifies that every keyword argument handed to the constructor is one of
#  the legal parameter names.  Returns 1 when all supplied parameters are
#  legal --- including the no-arguments case, which the previous version
#  mishandled by returning undef (making the caller's `== 0` test croak on a
#  perfectly empty argument list) --- and 0 at the first unknown parameter.
sub check_for_illegal_params {
    my @params = @_;
    my @legal_params = qw / datafile
                            mask
                            K
                            P
                            terminal_output
                            cluster_search_multiplier
                            max_iterations
                            delta_reconstruction_error
                            visualize_each_iteration
                            show_hidden_in_3D_plots
                            make_png_for_each_iteration
                            debug
                          /;
    foreach my $param (@params) {
        # Reject as soon as a parameter matches no legal name.
        return 0 unless any { $param eq $_ } @legal_params;
    }
    return 1;
}
1223              
# Pretty-prints a Math::GSL::Matrix, one row per line, each entry formatted
# to four decimal places.
sub display_matrix {
    my $matrix = shift;
    my $nrows = $matrix->rows();
    my $ncols = $matrix->cols();
    print "\nDisplaying a matrix of size $nrows rows and $ncols columns:\n";
    for my $row_index (0 .. $nrows - 1) {
        my @entries = $matrix->row($row_index)->as_list;
        printf("%.4f ", $_) for @entries;
        print "\n";
    }
    print "\n\n";
}
1238              
# Returns a new Math::GSL::Matrix that is the transpose of the argument:
# row i of the input becomes column i of the result.
sub transpose {
    my $matrix = shift;
    my $nrows = $matrix->rows();
    my $ncols = $matrix->cols();
    my $result = Math::GSL::Matrix->new($ncols, $nrows);
    for my $row_index (0 .. $nrows - 1) {
        my @row_values = $matrix->row($row_index)->as_list;
        $result->set_col($row_index, \@row_values);
    }
    return $result;
}
1250              
# Element-wise difference of two equal-length vectors (arrayrefs); the
# result is returned as a plain list.  Dies on a length mismatch.
sub vector_subtract {
    my ($vec1, $vec2) = @_;
    die "wrong data types for vector subtract calculation\n" if @$vec1 != @$vec2;
    return map { $vec1->[$_] - $vec2->[$_] } 0 .. $#$vec1;
}
1261              
# In-place random permutation of the array referenced by $arr, using the
# classic Fisher-Yates algorithm (from the Perl documentation).
sub fisher_yates_shuffle {
    my $arr = shift;
    for (my $i = $#$arr; $i > 0; $i--) {
        my $swap_with = int rand($i + 1);
        ($arr->[$i], $arr->[$swap_with]) = ($arr->[$swap_with], $arr->[$i]);
    }
}
1271              
1272             ######################### Generating Synthetic Data for Manifold Clustering ##########################
1273              
1274             ################################## Class DataGenerator ########################################
1275              
1276             ## The embedded class defined below is for generating synthetic data for
1277             ## experimenting with linear manifold clustering when the data resides on the
1278             ## surface of a sphere. See the script generate_data_on_a_sphere.pl in the
1279             ## `examples' directory for how to specify the number of clusters and the spread of
1280             ## each cluster in the data that is generated.
1281              
1282             package DataGenerator;
1283              
1284             use strict;
1285             use Carp;
1286              
#  Constructor for the synthetic-data generator, which writes Gaussian
#  clusters on the surface of the unit sphere to a CSV file.
#
#  Recognized keyword parameters:
#    output_file                     (required) name of the CSV output file
#    total_number_of_samples_needed  (required) total points over all clusters
#    number_of_clusters_on_sphere    default 3
#    cluster_width                   default 0.1
#    show_hidden_in_3D_plots         default 1; pass 0 to enable hidden-line removal
#    debug                           default 0
sub new {
    my ($class, %args) = @_;
    my @params = keys %args;
    croak "\nYou have used a wrong name for a keyword argument " .
          "--- perhaps a misspelling\n"
          if _check_for_illegal_params3(@params) == 0;
    bless {
        _output_file                    =>  $args{output_file}
                       || croak("name for output_file required"),
        _total_number_of_samples_needed =>  $args{total_number_of_samples_needed}
                       || croak("total_number_of_samples_needed required"),
        _number_of_clusters_on_sphere   =>  $args{number_of_clusters_on_sphere} || 3,
        _cluster_width                  =>  $args{cluster_width} || 0.1,
        # defined-or (not ||) so a caller-supplied 0 is honored; with the old
        # `|| 1` it was impossible to turn this flag off, even though the POD
        # synopsis shows show_hidden_in_3D_plots => 0.
        _show_hidden_in_3D_plots        =>  $args{show_hidden_in_3D_plots} // 1,
        _debug                          =>  $args{debug} || 0,
    }, $class;
}
1304              
#  Verifies the keyword arguments for the DataGenerator constructor.
#  Returns 1 when every supplied parameter is a legal name (vacuously true
#  for an empty list --- the previous flag-juggling version returned undef
#  in that case) and 0 at the first unknown parameter.  `debug' is included
#  in the legal list because new() reads $args{debug}; previously passing
#  debug => 1 caused a spurious croak.
sub _check_for_illegal_params3 {
    my @params = @_;
    my @legal_params = qw / output_file
                            total_number_of_samples_needed
                            number_of_clusters_on_sphere
                            cluster_width
                            show_hidden_in_3D_plots
                            debug
                          /;
    foreach my $param (@params) {
        # Reject as soon as a parameter matches no legal name.
        return 0 unless grep { $param eq $_ } @legal_params;
    }
    return 1;
}
1326              
1327             ## We first generate a set of points randomly on the unit sphere --- the number of
1328             ## points being equal to the number of clusters desired. These points will serve as
1329             ## cluster means (or, as cluster centroids) subsequently when we ask
1330             ## Math::Random::random_multivariate_normal($N, @m, @covar) to return $N number of
1331             ## points on the sphere. The second argument is the cluster mean and the third
1332             ## argument the cluster covariance. For the synthetic data, we set the cluster
1333             ## covariance to a 2x2 diagonal matrix, with the (0,0) element corresponding to the
1334             ## variance along the azimuth direction and the (1,1) element corresponding to the
1335             ## variance along the elevation direction.
1336             ##
1337             ## When you generate the points in the 2D spherical coordinates of
1338             ## (azimuth,elevation), you also need `wrap-around' logic for those points yielded by
1339             ## the multivariate-normal function whose azimuth angle is outside the interval
1340             ## (0,360) and/or whose elevation angle is outside the interval (-90,90).
1341             ##
1342             ## Note that the first of the two dimensions for which the multivariate-normal
1343             ## function returns the points is for the azimuth angle and the second for the
1344             ## elevation angle.
1345             ##
1346             ## With regard to the relationship of the Cartesian coordinates to the spherical
1347             ## (azimuth, elevation) coordinates, we assume that (x,y) is the horizontal plane
## and z the vertical axis. The elevation angle theta is measured with respect to
1349             ## the XY-plane. The highest point on the sphere (the Zenith) corresponds to the
1350             ## elevation angle of +90 and the lowest points on the sphere (the Nadir)
## corresponds to the elevation angle of -90. The azimuth is measured with respect
## to the X-axis. The range of the azimuth is from 0 to 360 degrees. The elevation is
1353             ## measured from the XY plane and its range is (-90,90) degrees.
## Generates $K Gaussian clusters in (azimuth, elevation) coordinates on the
## unit sphere, wraps out-of-range angles back onto the sphere, attaches a
## per-cluster letter label to every point, shuffles the points, and writes
## them out as CSV records of the form "label,x,y,z" to the file named by
## the constructor's output_file parameter.
sub gen_data_and_write_to_csv {
    my $self = shift;
    my $K = $self->{_number_of_clusters_on_sphere};
    # $N is the number of samples to be generated for each cluster:
    my $N = int($self->{_total_number_of_samples_needed} / $K);
    my $output_file = $self->{_output_file};
    # Designate all of the data elements in a cluster by a letter that is followed by
    # an integer that identifies a specific data element.
    my @point_labels = ('a'..'z');
    # Our first job is to define $K random points in the 2D space (azimuth,
    # elevation) to serve as cluster centers on the surface of the sphere.  This we
    # do by calling a uniformly distributed 1-D random number generator, first for
    # the azimuth and then for the elevation in the loop shown below:
    my @cluster_centers;
    my @covariances;
    foreach my $i (0..$K-1) {
        my $azimuth = rand(360);
        # NOTE(review): rand(90) - 90 yields elevations only in (-90, 0) --- the
        # southern hemisphere --- although the comment block above describes the
        # full (-90, 90) range.  Confirm whether this restriction is intentional.
        my $elevation = rand(90) - 90;
        my @mean = ($azimuth, $elevation);
        push @cluster_centers, \@mean;
        my $cluster_covariance;
        # The j-th dimension is for azimuth and k-th for elevation for the directions
        # to surface of the sphere:
        foreach my $j (0..1) {
            foreach my $k (0..1) {
                # Diagonal 2x2 covariance: the standard deviation along each angular
                # direction is _cluster_width times that direction's full range
                # (360 degrees for azimuth, 180 degrees for elevation).
                $cluster_covariance->[$j]->[$k] = ($self->{_cluster_width} * 360.0) ** 2
                    if $j == 0 && $k == 0;
                $cluster_covariance->[$j]->[$k] = ($self->{_cluster_width} * 180.0) ** 2
                    if $j == 1 && $k == 1;
                $cluster_covariance->[$j]->[$k] = 0.0 if $j != $k;
            }
        }
        push @covariances, $cluster_covariance;
    }
    if ($self->{_debug}) {
        foreach my $i (0..$K-1) {
            print "\n\nCluster center: @{$cluster_centers[$i]}\n";
            print "\nCovariance:\n";
            foreach my $j (0..1) {
                foreach my $k (0..1) {
                    print "$covariances[$i]->[$j]->[$k] ";
                }
                print "\n";
            }
        }
    }
    my @data_dump;
    foreach my $i (0..$K-1) {
        # Draw $N (azimuth, elevation) samples for cluster $i from a 2-D Gaussian
        # centered at the cluster's mean with the diagonal covariance built above:
        my @m = @{shift @cluster_centers};
        my @covar = @{shift @covariances};
        my @new_data = Math::Random::random_multivariate_normal($N, @m, @covar);
        if ($self->{_debug}) {
            print "\nThe points for cluster $i:\n";
            map { print "@$_ "; } @new_data;
            print "\n\n";
        }
        # `Wrap-around' logic: samples outside azimuth (0,360) or elevation
        # (-90,90) are reflected back onto the sphere.
        my @wrapped_data;
        foreach my $d (@new_data) {
            my $wrapped_d;
            if ($d->[0] >= 360.0) {
                $wrapped_d->[0] = $d->[0] - 360.0;
            } elsif ($d->[0] < 0) {
                $wrapped_d->[0] = 360.0 - abs($d->[0]);
            }
            # An elevation past a pole flips to the far side of the sphere: the
            # azimuth is rotated by 180 degrees and the elevation is reflected.
            if ($d->[1] >= 90.0) {
                $wrapped_d->[0] = POSIX::fmod($d->[0] + 180.0, 360);
                $wrapped_d->[1] = 180.0 - $d->[1];
            } elsif ($d->[1] < -90.0) {
                $wrapped_d->[0] = POSIX::fmod($d->[0] + 180, 360);
                $wrapped_d->[1] = -180.0 - $d->[1];
            }
            # Coordinates that needed no wrapping are passed through unchanged:
            $wrapped_d->[0] = $d->[0] unless defined $wrapped_d->[0];
            $wrapped_d->[1] = $d->[1] unless defined $wrapped_d->[1];
            push @wrapped_data, $wrapped_d;
        }
        if ($self->{_debug}) {
            print "\nThe unwrapped points for cluster $i:\n";
            map { print "@$_ "; } @wrapped_data;
            print "\n\n";
        }
        # Prefix every point of this cluster with a label of the form "a_0",
        # "a_1", ..., where the letter identifies the cluster:
        my $label = $point_labels[$i];
        my $j = 0;
        @new_data = map {unshift @$_, $label."_".$j; $j++; $_} @wrapped_data;
        push @data_dump, @new_data;
    }
    if ($self->{_debug}) {
        print "\n\nThe labeled points for clusters:\n";
        map { print "@$_\n"; } @data_dump;
    }
    # Randomize the record order so the output file is not grouped by cluster:
    fisher_yates_shuffle( \@data_dump );
    # NOTE(review): 2-arg open on a bareword handle with no error check ---
    # a silent failure here loses all the generated data.
    open OUTPUT, ">$output_file";
    my $total_num_of_points = $N * $K;
    print "Total number of data points that will be written out to the file: $total_num_of_points\n"
        if $self->{_debug};
    foreach my $ele (@data_dump) {
        my ($x,$y,$z);
        my $label = $ele->[0];
        my $azimuth = $ele->[1];
        my $elevation = $ele->[2];
        # NOTE(review): $azimuth and $elevation are in DEGREES here, but Perl's
        # cos()/sin() expect radians.  The points still land on the unit sphere
        # (cos^2 + sin^2 = 1), but their positions are not the degree angles
        # generated above --- confirm whether a pi/180 conversion is missing.
        $x = cos($elevation) * cos($azimuth);
        $y = cos($elevation) * sin($azimuth);
        $z = sin($elevation);
        my $csv_str = join ",", ($label,$x,$y,$z);
        print OUTPUT "$csv_str\n";
    }
    print "\n\n";
    print "Data written out to file $output_file\n" if $self->{_debug};
    close OUTPUT;
}
1463              
# This version for the embedded class for data generation.
#
#  Reads the CSV datafile written by gen_data_and_write_to_csv(), groups the
#  records by their cluster letter, writes the groups to a temporary file in
#  gnuplot's blank-line-separated "index" format, and displays the clusters
#  on a wireframe unit sphere with gnuplot.
#
#  Fixes relative to the previous version: the datafile is opened via the
#  caller-supplied path rather than its basename (which failed for files in
#  other directories), the slurped input is actually read from the INPUT
#  filehandle, and the trailing ", " is stripped with a substitution instead
#  of assigning the boolean result of a match to $arg_string.
sub visualize_data_on_sphere {
    my $self = shift;
    my $datafile = shift;
    my $filename = File::Basename::basename($datafile);
    my $temp_file = "__temp_" . $filename;
    $temp_file =~ s/\.\w+$/\.txt/;
    unlink $temp_file if -e $temp_file;
    open OUTPUT, ">$temp_file"
        or die "Unable to open a temp file in this directory: $!";
    open INPUT, "< $datafile" or die "Unable to open $datafile: $!";
    # Slurp the whole file and split it into one record per whitespace run:
    local $/ = undef;
    my @all_records = split /\s+/, <INPUT>;
    close INPUT;
    my %clusters;
    foreach my $record (@all_records) {
        my @splits = split /,/, $record;
        my $record_name = shift @splits;
        # The cluster label is the part of the record name before the underscore:
        $record_name =~ /(\w+?)_.*/;
        my $primary_cluster_label = $1;
        push @{$clusters{$primary_cluster_label}}, \@splits;
    }
    # Each cluster becomes one gnuplot data block, separated by blank lines:
    foreach my $key (sort {"\L$a" cmp "\L$b"} keys %clusters) {
        map {print OUTPUT "$_"} map {"@$_\n"} @{$clusters{$key}};
        print OUTPUT "\n\n";
    }
    my @sorted_cluster_keys = sort {"\L$a" cmp "\L$b"} keys %clusters;
    close OUTPUT;
    my $plot = Graphics::GnuplotIF->new( persist => 1 );
    my $arg_string = "";
    $plot->gnuplot_cmd( "set noclip" );
    $plot->gnuplot_cmd( "set hidden3d" ) unless $self->{_show_hidden_in_3D_plots};
    $plot->gnuplot_cmd( "set pointsize 2" );
    $plot->gnuplot_cmd( "set parametric" );
    $plot->gnuplot_cmd( "set size ratio 1" );
    $plot->gnuplot_cmd( "set xlabel \"X\"" );
    $plot->gnuplot_cmd( "set ylabel \"Y\"" );
    $plot->gnuplot_cmd( "set zlabel \"Z\"" );
    # set the range for azimuth angles:
    $plot->gnuplot_cmd( "set urange [0:2*pi]" );
    # set the range for the elevation angles:
    $plot->gnuplot_cmd( "set vrange [-pi/2:pi/2]" );
    # Parametric functions for the unit sphere:
    $plot->gnuplot_cmd( "r=1" );
    $plot->gnuplot_cmd( "fx(v,u) = r*cos(v)*cos(u)" );
    $plot->gnuplot_cmd( "fy(v,u) = r*cos(v)*sin(u)" );
    $plot->gnuplot_cmd( "fz(v) = r*sin(v)" );
    my $sphere_arg_str = "fx(v,u),fy(v,u),fz(v) notitle with lines lt 0,";
    foreach my $i (0..scalar(keys %clusters)-1) {
        my $j = $i + 1;
        # The following statement puts the titles on the data points
        $arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"$sorted_cluster_keys[$i] \" with points lt $j pt $j, ";
    }
    # Strip the trailing ", " left over from the loop above:
    $arg_string =~ s/,[ ]+$//;
    $plot->gnuplot_cmd( "splot $sphere_arg_str $arg_string" );
}
1521              
# Destructor: removes the temporary plot file ("__temp_<basename>.txt")
# that visualize_data_on_sphere() creates for this generator's output file.
sub DESTROY {
    use File::Basename;
    my $self = shift;
    my $tempname = basename($self->{_output_file});
    $tempname =~ s/\.\w+$/\.txt/;
    unlink "__temp_" . $tempname;
}
1528              
# In-place Fisher-Yates shuffle (from the Perl documentation): randomly
# permutes the elements of the array referenced by the argument.
sub fisher_yates_shuffle {
    my $arr = shift;
    my $remaining = scalar @$arr;
    while ($remaining > 1) {
        $remaining--;
        my $pick = int rand($remaining + 1);
        @$arr[$remaining, $pick] = @$arr[$pick, $remaining];
    }
}
1538              
1539             1;
1540              
1541             =pod
1542              
1543             =head1 NAME
1544              
1545             Algorithm::LinearManifoldDataClusterer --- for clustering data that resides on a
1546             low-dimensional manifold in a high-dimensional measurement space
1547              
1548             =head1 SYNOPSIS
1549              
1550             # You'd begin with:
1551              
1552             use Algorithm::LinearManifoldDataClusterer;
1553              
1554             # You'd next name the data file:
1555              
1556             my $datafile = "mydatafile.csv";
1557              
1558             # Your data must be in the CSV format, with one of the columns containing a unique
1559             # symbolic tag for each data record. You tell the module which column has the
1560             # symbolic tag and which columns to use for clustering through a mask such as
1561              
1562             my $mask = "N111";
1563              
1564             # which says that the symbolic tag is in the first column and that the numerical
1565             # data in the next three columns is to be used for clustering. If your data file
1566             # had, say, five columns and you wanted only the last three columns to be
1567             # clustered, the mask would become `N0111' assuming that that the symbolic tag is
1568             # still in the first column.
1569              
1570             # Now you must construct an instance of the clusterer through a call such as:
1571              
1572             my $clusterer = Algorithm::LinearManifoldDataClusterer->new(
1573             datafile => $datafile,
1574             mask => $mask,
1575             K => 3,
1576             P => 2,
1577             max_iterations => 15,
1578             cluster_search_multiplier => 2,
1579             delta_reconstruction_error => 0.001,
1580             terminal_output => 1,
1581             visualize_each_iteration => 1,
1582             show_hidden_in_3D_plots => 1,
1583             make_png_for_each_iteration => 1,
1584             );
1585              
1586             # where the parameter K specifies the number of clusters you expect to find in
1587             # your data and the parameter P is the dimensionality of the manifold on which the
1588             # data resides. The parameter cluster_search_multiplier is for increasing the
1589             # odds that the random seeds chosen initially for clustering will populate all the
1590             # clusters. Set this parameter to a low number like 2 or 3. The parameter
1591             # max_iterations places a hard limit on the number of iterations that the
1592             # algorithm is allowed. The actual number of iterations is controlled by the
1593             # parameter delta_reconstruction_error. The iterations stop when the change in
1594             # the total "reconstruction error" from one iteration to the next is smaller than
1595             # the value specified by delta_reconstruction_error.
1596              
1597             # Next, you must get the module to read the data for clustering:
1598              
1599             $clusterer->get_data_from_csv();
1600              
1601             # Finally, you invoke linear manifold clustering by:
1602              
1603             my $clusters = $clusterer->linear_manifold_clusterer();
1604              
1605             # The value returned by this call is a reference to an array of anonymous arrays,
1606             # with each anonymous array holding one cluster. If you wish, you can have the
1607             # module write the clusters to individual files by the following call:
1608              
1609             $clusterer->write_clusters_to_files($clusters);
1610              
1611             # If you want to see how the reconstruction error changes with the iterations, you
1612             # can make the call:
1613              
1614             $clusterer->display_reconstruction_errors_as_a_function_of_iterations();
1615              
1616             # When your data is 3-dimensional and when the clusters reside on a surface that
1617             # is more or less spherical, you can visualize the clusters by calling
1618              
1619             $clusterer->visualize_clusters_on_sphere("final clustering", $clusters);
1620              
1621             # where the first argument is a label to be displayed in the 3D plot and the
1622             # second argument the value returned by calling linear_manifold_clusterer().
1623              
1624             # SYNTHETIC DATA GENERATION:
1625              
1626             # The module includes an embedded class, DataGenerator, for generating synthetic
1627             # three-dimensional data that can be used to experiment with the clustering code.
1628             # The synthetic data, written out to a CSV file, consists of Gaussian clusters on
1629             # the surface of a sphere. You can control the number of clusters, the width of
1630             # each cluster, and the number of samples in the clusters by giving appropriate
1631             # values to the constructor parameters as shown below:
1632              
1633             use strict;
1634             use Algorithm::LinearManifoldDataClusterer;
1635              
1636             my $output_file = "4_clusters_on_a_sphere_1000_samples.csv";
1637              
1638             my $training_data_gen = DataGenerator->new(
1639             output_file => $output_file,
1640             cluster_width => 0.015,
1641             total_number_of_samples_needed => 1000,
1642             number_of_clusters_on_sphere => 4,
1643             show_hidden_in_3D_plots => 0,
1644             );
1645             $training_data_gen->gen_data_and_write_to_csv();
1646             $training_data_gen->visualize_data_on_sphere($output_file);
1647              
1648              
1649             =head1 CHANGES
1650              
1651             Version 1.01: Typos and other errors removed in the documentation. Also included in
1652             the documentation a link to a tutorial on data processing on manifolds.
1653              
1654              
1655             =head1 DESCRIPTION
1656              
1657             If you are new to machine learning and data clustering on linear and nonlinear
1658             manifolds, your first question is likely to be: What is a manifold? A manifold is a
1659             space that is locally Euclidean. And a space is locally Euclidean if it allows for
1660             the points in a small neighborhood to be represented by, say, the Cartesian
1661             coordinates and if the distances between the points in the neighborhood are given by
1662             the Euclidean metric. For an example, the set of all points on the surface of a
1663             sphere does NOT constitute a Euclidean space. Nonetheless, if you confined your
1664             attention to a small enough neighborhood around a point, the space would seem to be
1665             locally Euclidean. The surface of a sphere is a 2-dimensional manifold embedded in a
1666             3-dimensional space. A plane in a 3-dimensional space is also a 2-dimensional
1667             manifold. You would think of the surface of a sphere as a nonlinear manifold, whereas
1668             a plane would be a linear manifold. However, note that any nonlinear manifold is
1669             locally a linear manifold. That is, given a sufficiently small neighborhood on a
1670             nonlinear manifold, you can always think of it as a locally flat surface.
1671              
1672             As to why we need machine learning and data clustering on manifolds, there exist many
1673             important applications in which the measured data resides on a nonlinear manifold.
1674             For example, when you record images of a human face from different angles, all the
1675             image pixels taken together fall on a low-dimensional surface in a high-dimensional
1676             measurement space. The same is believed to be true for the satellite images of a land
1677             mass that are recorded with the sun at different angles with respect to the direction
1678             of the camera.
1679              
1680             Reducing the dimensionality of the sort of data mentioned above is critical to the
1681             proper functioning of downstream classification algorithms, and the most popular
1682             traditional method for dimensionality reduction is the Principal Components Analysis
1683             (PCA) algorithm. However, using PCA is tantamount to passing a linear least-squares
1684             hyperplane through the surface on which the data actually resides. As to why that
1685             might be a bad thing to do, just imagine the consequences of assuming that your data
1686             falls on a straight line when, in reality, it falls on a strongly curving arc. This
1687             is exactly what happens with PCA --- it gives you a linear manifold approximation to
1688             your data that may actually reside on a curved surface.
1689              
1690             That brings us to the purpose of this module, which is to cluster data that resides
1691             on a nonlinear manifold. Since a nonlinear manifold is locally linear, we can think
1692             of each data cluster on a nonlinear manifold as falling on a locally linear portion
1693             of the manifold, meaning on a hyperplane. The logic of the module is based on
1694             finding a set of hyperplanes that best describes the data, with each hyperplane
1695             derived from a local data cluster. This is like constructing a piecewise linear
1696             approximation to data that falls on a curve as opposed to constructing a single
1697             straight line approximation to all of the data. So whereas the frequently used PCA
1698             algorithm gives you a single hyperplane approximation to all your data, what this
1699             module returns is a set of hyperplane approximations, with each hyperplane derived by
1700             applying the PCA algorithm locally to a data cluster.
1701              
1702             That brings us to the problem of how to actually discover the best set of hyperplane
1703             approximations to the data. What is probably the most popular algorithm today for
1704             that purpose is based on the following key idea: Given a set of subspaces to which a
1705             data element can be assigned, you assign it to that subspace for which the
B<reconstruction error> is the least. But what do we mean by a B<subspace> and what
is B<reconstruction error>?
1708              
To understand the notions of B<subspace> and B<reconstruction error>, let's revisit
1710             the traditional approach of dimensionality reduction by the PCA algorithm. The PCA
1711             algorithm consists of: (1) Subtracting from each data element the global mean of the
1712             data; (2) Calculating the covariance matrix of the data; (3) Carrying out an
1713             eigendecomposition of the covariance matrix and ordering the eigenvectors according
to decreasing values of the corresponding eigenvalues; (4) Forming a B<subspace> by
1715             discarding the trailing eigenvectors whose corresponding eigenvalues are relatively
1716             small; and, finally, (5) projecting all the data elements into the subspace so
1717             formed. The error incurred in representing a data element by its projection into the
subspace is known as the B<reconstruction error>. This error is the projection of
1719             the data element into the space spanned by the discarded trailing eigenvectors.
1720              
I<The key idea here is that, instead of constructing a single global
1722             subspace in the manner described above, we construct a set of subspaces, one for each
1723             data cluster on the nonlinear manifold. After the subspaces have been constructed, a
1724             data element is assigned to that subspace for which the reconstruction error is the
1725             least.> On the face of it, this sounds like a chicken-and-egg sort of a problem. You
1726             need to have already clustered the data in order to construct the subspaces at
1727             different places on the manifold so that you can figure out which cluster to place a
1728             data element in.
1729              
1730             Such problems, when they do possess a solution, are best tackled through iterative
1731             algorithms in which you start with a guess for the final solution, you rearrange the
1732             measured data on the basis of the guess, and you then use the new arrangement of the
1733             data to refine the guess. Subsequently, you iterate through the second and the third
1734             steps until you do not see any discernible changes in the new arrangements of the
1735             data. This forms the basis of the clustering algorithm that is described under
1736             B<Summary of the Algorithm> in the section that follows. This algorithm was first proposed in the
1737             article "Dimension Reduction by Local Principal Component Analysis" by Kambhatla and
1738             Leen that appeared in the journal Neural Computation in 1997.
1739              
1740             Unfortunately, experiments show that the algorithm as proposed by Kambhatla and Leen
1741             is much too sensitive to how the clusters are seeded initially. To get around this
1742             limitation of the basic clustering-by-minimization-of-reconstruction-error, this
1743             module implements a two phased approach. In B<Phase 1>, we introduce a multiplier
1744             effect in our search for clusters by looking for C<M*K> clusters instead of the main
1745             C<K> clusters. In this manner, we increase the odds that each original cluster will
1746             be visited by one or more of the C<M*K> randomly selected seeds at the beginning,
1747             where C<M> is the integer value given to the constructor parameter
1748             C<cluster_search_multiplier>. Subsequently, we merge the clusters that belong
1749             together in order to form the final C<K> clusters. That work is done in B<Phase 2>
1750             of the algorithm.
1751              
1752             For the cluster merging operation in Phase 2, we model the C<M*K> clusters as the
1753             nodes of an attributed graph in which the weight given to an edge connecting a pair
1754             of nodes is a measure of the similarity between the two clusters corresponding to the
1755             two nodes. Subsequently, we use spectral clustering to merge the most similar nodes
1756             in our quest to partition the data into C<K> clusters. For that purpose, we use the
1757             Shi-Malik normalized cuts algorithm. The pairwise node similarity required by this
1758             algorithm is measured by the C method of the
1759             C class. The smaller the overall reconstruction error
1760             when all of the data elements in one cluster are projected into the other's subspace
1761             and vice versa, the greater the similarity between two clusters. Additionally, the
1762             smaller the distance between the mean vectors of the clusters, the greater the
1763             similarity between two clusters. The overall similarity between a pair of clusters
1764             is a combination of these two similarity measures.
1765              
1766             For additional information regarding the theoretical underpinnings of the algorithm
1767             implemented in this module, visit
1768             L
1769              
1770              
1771             =head1 SUMMARY OF THE ALGORITHM
1772              
1773             We now present a summary of the two phases of the algorithm implemented in this
1774             module. Note particularly the important role played by the constructor parameter
1775             C<cluster_search_multiplier>. It is only when the integer value given to this
1776             parameter is greater than 1 that Phase 2 of the algorithm kicks in.
1777              
1778             =over 4
1779              
1780             =item B<Phase 1:>
1781              
1782             Through iterative minimization of the total reconstruction error, this phase of the
1783             algorithm returns C<M*K> clusters where C<K> is the actual number of clusters you
1784             expect to find in your data and where C<M> is the integer value given to the
1785             constructor parameter C<cluster_search_multiplier>. As previously mentioned, on
1786             account of the sensitivity of the reconstruction-error based clustering to how the
1787             clusters are initially seeded, our goal is to look for C<M*K> clusters with the idea
1788             of increasing the odds that each of the C<K> clusters will see at least one seed at
1789             the beginning of the algorithm.
1790              
1791             =over 4
1792              
1793             =item Step 1:
1794              
1795             Randomly choose C<M*K> data elements to serve as the seeds for that many clusters.
1796              
1797             =item Step 2:
1798              
1799             Construct initial C<M*K> clusters by assigning each data element to that cluster
1800             whose seed it is closest to.
1801              
1802             =item Step 3:
1803              
1804             Calculate the mean and the covariance matrix for each of the C<M*K> clusters and
1805             carry out an eigendecomposition of the covariance matrix. Order the eigenvectors in
1806             decreasing order of the corresponding eigenvalues. The first C<P> eigenvectors
1807             define the subspace for that cluster. Use the space spanned by the remaining
1808             eigenvectors --- we refer to them as the trailing eigenvectors --- for calculating
1809             the reconstruction error.
1810              
1811             =item Step 4:
1812              
1813             Taking into account the mean associated with each cluster, re-cluster the entire data
1814             set on the basis of the least reconstruction error. That is, assign each data
1815             element to that subspace for which it has the smallest reconstruction error.
1816             Calculate the total reconstruction error associated with all the data elements. (See
1817             the definition of the reconstruction error in the C section.)
1818              
1819             =item Step 5:
1820              
1821             Stop iterating if the change in the total reconstruction error from the previous
1822             iteration to the current iteration is less than the value specified by the constructor
1823             parameter C<delta_reconstruction_error>. Otherwise, go back to Step 3.
1824              
1825             =back
1826              
1827             =item B<Phase 2:>
1828              
1829             This phase of the algorithm uses graph partitioning to merge the C<M*K> clusters back
1830             into the C<K> clusters you expect to see in your data. Since the algorithm whose
1831             steps are presented below is invoked recursively, let's assume that we have C<N>
1832             clusters that need to be merged by invoking the Shi-Malik spectral clustering
1833             algorithm as described below:
1834              
1835             =over 4
1836              
1837             =item Step 1:
1838              
1839             Form a graph whose C<N> nodes represent the C<N> clusters. (For the very first
1840             invocation of this step, we have C<N = M*K>.)
1841              
1842             =item Step 2:
1843              
1844             Construct an C<NxN> similarity matrix for the nodes in the graph. The C<(i,j)>-th
1845             element of this matrix is the pairwise similarity between the clusters indexed C<i>
1846             and C<j>. Calculate this similarity on the basis of the following two criteria: (1)
1847             The total reconstruction error when the data elements in the cluster indexed C<i> are
1848             projected into the subspace for the cluster indexed C<j> and vice versa. And (2) The
1849             distance between the mean vectors associated with the two clusters.
1850              
1851             =item Step 3:
1852              
1853             Calculate the symmetric normalized Laplacian of the similarity matrix. We use C
1854             to denote the symmetric normalized Laplacian.
1855              
1856             =item Step 4:
1857              
1858             Carry out an eigendecomposition of the C matrix and choose the eigenvector
1859             corresponding to the second smallest eigenvalue for bipartitioning the graph on the
1860             basis of the sign of the values in the eigenvector.
1861              
1862             =item Step 5:
1863              
1864             If the bipartition of the previous step yields one-versus-the-rest kind of a
1865             partition, add the `one' cluster to the output list of clusters and invoke graph
1866             partitioning recursively on the `rest' by going back to Step 1. On the other hand,
1867             if the cardinality of both the partitions returned by Step 4 exceeds 1, invoke graph
1868             partitioning recursively on both partitions. Stop when the list of clusters in the
1869             output list equals C<K>.
1870              
1871             =item Step 6:
1872              
1873             In general, the C<K> clusters obtained by recursive graph partitioning will not cover
1874             all of the data. So, for the final step, assign each data element not covered by the
1875             C<K> clusters to that cluster for which its reconstruction error is the least.
1876              
1877             =back
1878              
1879             =back
1880              
1881             =head1 FAIL-FIRST BIAS OF THE MODULE
1882              
1883             As you would expect for all such iterative algorithms, the module carries no
1884             theoretical guarantee that it will give you correct results. But what does that mean?
1885             Suppose you create synthetic data that consists of Gaussian looking disjoint clusters
1886             on the surface of a sphere, would the module always succeed in separating out the
1887             clusters? The module carries no guarantees to that effect --- especially considering
1888             that Phase 1 of the algorithm is sensitive to how the clusters are seeded at the
1889             beginning. Although this sensitivity is mitigated by the cluster merging step when
1890             greater-than-1 value is given to the constructor option C<cluster_search_multiplier>,
1891             a plain vanilla implementation of the steps in Phase 1 and Phase 2 would nonetheless
1892             carry significant risk that you'll end up with incorrect clustering results.
1893              
1894             To further reduce this risk, the module has been programmed so that it terminates
1895             immediately if it suspects that the cluster solution being developed is unlikely to
1896             be fruitful. The heuristics used for such terminations are conservative --- since
1897             the cost of termination is only that the user will have to run the code again, which
1898             at worst only carries an element of annoyance with it. The three "Fail First"
1899             heuristics currently programmed into the module are based on simple "unimodality
1900             testing", testing for "congruent clusters," and testing for dominant cluster support
1901             in the final stage of the recursive invocation of the graph partitioning step. The
1902             unimodality testing is as elementary as it can be --- it only checks for the number
1903             of data samples within a certain radius of the mean in relation to the total number
1904             of data samples in the cluster.
1905              
1906             When the program terminates under such conditions, it prints out the following message
1907             in your terminal window:
1908              
1909             Bailing out!
1910              
1911             Given the very simple nature of testing that is carried for the "Fail First" bias, do
1912             not be surprised if the results you get for your data simply look wrong. If most
1913             runs of the module produce wrong results for your application, that means that the
1914             module logic needs to be strengthened further. The author of this module would love
1915             to hear from you if that is the case.
1916              
1917             =head1 METHODS
1918              
1919             The module provides the following methods for linear-manifold based clustering, for
1920             cluster visualization, and for the generation of data for testing the clustering algorithm:
1921              
1922             =over 4
1923              
1924             =item B<new():>
1925              
1926             my $clusterer = Algorithm::LinearManifoldDataClusterer->new(
1927             datafile => $datafile,
1928             mask => $mask,
1929             K => $K,
1930             P => $P,
1931             cluster_search_multiplier => $C,
1932             max_iterations => $max_iter,
1933             delta_reconstruction_error => 0.001,
1934             terminal_output => 1,
1935             write_clusters_to_files => 1,
1936             visualize_each_iteration => 1,
1937             show_hidden_in_3D_plots => 1,
1938             make_png_for_each_iteration => 1,
1939             );
1940              
1941             A call to C constructs a new instance of the
1942             C class.
1943              
1944             =back
1945              
1946             =head2 Constructor Parameters
1947              
1948             =over 8
1949              
1950             =item C<datafile>:
1951              
1952             This parameter names the data file that contains the multidimensional data records
1953             you want the module to cluster. This file must be in CSV format and each record in
1954             the file must include a symbolic tag for the record. Here are first few rows of such
1955             a CSV file in the C<examples> directory:
1956              
1957             d_161,0.0739248630173239,0.231119293395665,-0.970112873251437
1958             a_59,0.459932215884786,0.0110216469739639,0.887885623314902
1959             a_225,0.440503220903039,-0.00543366086464691,0.897734586447273
1960             a_203,0.441656364946433,0.0437191337788422,0.896118459046532
1961             ...
1962             ...
1963              
1964             What you see in the first column --- C<d_161>, C<a_59>, C<a_225>, C<a_203> --- are
1965             the symbolic tags associated with four 3-dimensional data records.
1966              
1967             =item C<mask>:
1968              
1969             This parameter supplies the mask to be applied to the columns of your data file. For
1970             the data file whose first few records are shown above, the mask should be C<N111>
1971             since the symbolic tag is in the first column of the CSV file and since, presumably,
1972             you want to cluster the data in the next three columns.
1973              
1974             =item C<K>:
1975              
1976             This parameter supplies the number of clusters you are looking for.
1977              
1978             =item C<P>:
1979              
1980             This parameter specifies the dimensionality of the manifold on which the data resides.
1981              
1982             =item C<cluster_search_multiplier>:
1983              
1984             As should be clear from the C section, this parameter plays
1985             a very important role in the successful clustering of your data. As explained in
1986             C, the basic algorithm used for clustering in Phase 1 --- clustering by
1987             the minimization of the reconstruction error --- is sensitive to the choice of the
1988             cluster seeds that are selected randomly at the beginning of the algorithm. Should
1989             it happen that the seeds miss one or more of the clusters, the clustering produced is
1990             likely to not be correct. By giving an integer value to C<cluster_search_multiplier>
1991             that is greater than 1, you'll increase the odds that the randomly selected seeds
1992             will see all clusters. When you set C<cluster_search_multiplier> to C<M>, you ask
1993             Phase 1 of the algorithm to construct C<M*K> clusters as opposed to just C<K>
1994             clusters. Subsequently, in Phase 2, the module uses inter-cluster similarity based
1995             graph partitioning to merge the C<M*K> clusters into C<K> clusters.
1996              
1997             =item C<max_iterations>:
1998              
1999             This hard limits the number of iterations in Phase 1 of the algorithm. Ordinarily,
2000             the iterations stop automatically when the change in the total reconstruction error
2001             from one iteration to the next is less than the value specified by the parameter
2002             C.
2003              
2004             =item C<delta_reconstruction_error>:
2005              
2006             It is this parameter that determines when the iterations will actually terminate in
2007             Phase 1 of the algorithm. When the difference in the total reconstruction error from
2008             one iteration to the next is less than the value given to this parameter, the
2009             iterations come to an end. B<The larger the number of
2010             data samples that need to be clustered, the larger must be the value given to this
2011             parameter. That makes intuitive sense since the total reconstruction error is the
2012             sum of all such errors for all of the data elements.> Unfortunately, the best value
2013             for this parameter does NOT appear to depend linearly on the total number of data
2014             records to be clustered.
2015              
2016             =item C<terminal_output>:
2017              
2018             When this parameter is set, you will see in your terminal window the different
2019             clusters as lists of the symbolic tags associated with the data records. You will
2020             also see in your terminal window the output produced by the different steps of the
2021             graph partitioning algorithm as smaller clusters are merged to form the final C<K>
2022             clusters --- assuming that you set the parameter C<cluster_search_multiplier> to an
2023             integer value that is greater than 1.
2024              
2025             =item C<visualize_each_iteration>:
2026              
2027             As its name implies, when this option is set to 1, you'll see 3D plots of the
2028             clustering results for each iteration --- but only if your data is 3-dimensional.
2029              
2030             =item C<show_hidden_in_3D_plots>:
2031              
2032             This parameter is important for controlling the visualization of the clusters on the
2033             surface of a sphere. If the clusters are too spread out, seeing all of the clusters
2034             all at once can be visually confusing. When you set this parameter, the clusters on
2035             the back side of the sphere will not be visible. Note that no matter how you set
2036             this parameter, you can interact with the 3D plot of the data and rotate it with your
2037             mouse pointer to see all of the data that is output by the clustering code.
2038              
2039             =item C<make_png_for_each_iteration>:
2040              
2041             If you set this option to 1, the module will output a Gnuplot in the form of a PNG
2042             image for each iteration in Phase 1 of the algorithm. In Phase 2, the module will
2043             output the clustering result produced by the graph partitioning algorithm.
2044              
2045             =back
2046              
2047             =over
2048              
2049             =item B<get_data_from_csv()>:
2050              
2051             $clusterer->get_data_from_csv();
2052              
2053             As you can guess from its name, the method extracts the data from the CSV file named
2054             in the constructor.
2055              
2056             =item B<linear_manifold_clusterer()>:
2057              
2058             $clusterer->linear_manifold_clusterer();
2059              
2060             or
2061              
2062             my $clusters = $clusterer->linear_manifold_clusterer();
2063              
2064             This is the main call to the linear-manifold based clusterer. The first call works
2065             by side-effect, meaning that you will see the clusters in your terminal window and
2066             they would be written out to disk files (depending on the constructor options you
2067             have set). The second call also returns the clusters as a reference to an array of
2068             anonymous arrays, each holding the symbolic tags for a cluster.
2069              
2070             =item B<display_reconstruction_errors_as_a_function_of_iterations()>:
2071              
2072             $clusterer->display_reconstruction_errors_as_a_function_of_iterations();
2073              
2074             This method would normally be called after the clustering is completed to see how the
2075             reconstruction errors decreased with the iterations in Phase 1 of the overall
2076             algorithm.
2077              
2078             =item B<write_clusters_to_files()>:
2079              
2080             $clusterer->write_clusters_to_files($clusters);
2081              
2082             As its name implies, when you call this method, the final clusters would be written
2083             out to disk files. The files have names like:
2084              
2085             cluster0.txt
2086             cluster1.txt
2087             cluster2.txt
2088             ...
2089             ...
2090              
2091             Before the clusters are written to these files, the module destroys all files with
2092             such names in the directory in which you call the module.
2093              
2094             =item B<visualize_clusters_on_sphere()>:
2095              
2096             $clusterer->visualize_clusters_on_sphere("final clustering", $clusters);
2097              
2098             or
2099              
2100             $clusterer->visualize_clusters_on_sphere("final_clustering", $clusters, "png");
2101              
2102             If your data is 3-dimensional and it resides on the surface of a sphere (or in the
2103             vicinity of such a surface), you may be able to use these methods for the
2104             visualization of the clusters produced by the algorithm. The first invocation
2105             produces a Gnuplot in a terminal window that you can rotate with your mouse pointer.
2106             The second invocation produces a `.png' image of the plot.
2107              
2108             =item B<auto_retry_clusterer()>:
2109              
2110             $clusterer->auto_retry_clusterer();
2111              
2112             or
2113              
2114             my $clusters = $clusterer->auto_retry_clusterer();
2115              
2116             As mentioned earlier, the module is programmed in such a way that it is more likely
2117             to fail than to give you a wrong answer. If manually trying the clusterer repeatedly
2118             on a data file is frustrating, you can use C<auto_retry_clusterer()> to automatically
2119             make repeated attempts for you. See the script C<example4.pl> for how you can use
2120             C<auto_retry_clusterer()> in your own code.
2121              
2122             =back
2123              
2124             =head1 GENERATING SYNTHETIC DATA FOR EXPERIMENTING WITH THE CLUSTERER
2125              
2126             The module file also contains a class named C for generating synthetic
2127             data for experimenting with linear-manifold based clustering. At this time, only
2128             3-dimensional data that resides in the form of Gaussian clusters on the surface of a
2129             sphere is generated. The generated data is placed in a CSV file. You construct an
2130             instance of the C class by a call like:
2131              
2132             =over 4
2133              
2134             =item B<new():>
2135              
2136             my $training_data_gen = DataGenerator->new(
2137             output_file => $output_file,
2138             cluster_width => 0.0005,
2139             total_number_of_samples_needed => 1000,
2140             number_of_clusters_on_sphere => 4,
2141             show_hidden_in_3D_plots => 0,
2142             );
2143              
2144             =back
2145              
2146             =head2 Parameters for the DataGenerator constructor:
2147              
2148             =over 8
2149              
2150             =item C<output_file>:
2151              
2152             The numeric values are generated using a bivariate Gaussian distribution whose two
2153             independent variables are the azimuth and the elevation angles on the surface of a
2154             unit sphere. The mean of each cluster is chosen randomly and its covariance set in
2155             proportion to the value supplied for the C< cluster_width> parameter.
2156              
2157             =item C<cluster_width>:
2158              
2159             This parameter controls the spread of each cluster on the surface of the unit sphere.
2160              
2161             =item C<total_number_of_samples_needed>:
2162              
2163             As its name implies, this parameter specifies the total number of data samples that
2164             will be written out to the output file --- provided this number is divisible by the
2165             number of clusters you asked for. If the divisibility condition is not satisfied,
2166             the number of data samples actually written out will be the closest it can be to the
2167             number you specify for this parameter under the condition that equal number of
2168             samples will be created for each cluster.
2169              
2170             =item C<number_of_clusters_on_sphere>:
2171              
2172             Again as its name implies, this parameter specifies the number of clusters that will
2173             be produced on the surface of a unit sphere.
2174              
2175             =item C<show_hidden_in_3D_plots>:
2176              
2177             This parameter is important for the visualization of the clusters and it controls
2178             whether you will see the generated data on the back side of the sphere. If the
2179             clusters are not too spread out, you can set this parameter to 0 and see all the
2180             clusters all at once. However, when the clusters are spread out, it can be visually
2181             confusing to see the data on the back side of the sphere. Note that no matter how
2182             you set this parameter, you can interact with the 3D plot of the data and rotate it
2183             with your mouse pointer to see all of the data that is generated.
2184              
2185             =back
2186              
2187             =over 4
2188              
2189             =item B<gen_data_and_write_to_csv()>:
2190              
2191             $training_data_gen->gen_data_and_write_to_csv();
2192              
2193             As its name implies, this method generates the data according to the parameters
2194             specified in the DataGenerator constructor.
2195              
2196             =item B<visualize_data_on_sphere()>:
2197              
2198             $training_data_gen->visualize_data_on_sphere($output_file);
2199              
2200             You can use this method to visualize the clusters produced by the data generator.
2201             Since the clusters are located at randomly selected points on a unit sphere, by
2202             looking at the output visually, you can quickly reject what the data generator has
2203             produced and try again.
2204              
2205             =back
2206              
2207             =head1 HOW THE CLUSTERS ARE OUTPUT
2208              
2209             When the option C<terminal_output> is set in the constructor of the
2210             C class, the clusters are displayed on the terminal
2211             screen.
2212              
2213             And, when the option C<write_clusters_to_files> is set in the same constructor, the
2214             module dumps the clusters in files named
2215              
2216             cluster0.txt
2217             cluster1.txt
2218             cluster2.txt
2219             ...
2220             ...
2221              
2222             in the directory in which you execute the module. The number of such files will
2223             equal the number of clusters formed. All such existing files in the directory are
2224             destroyed before any fresh ones are created. Each cluster file contains the symbolic
2225             tags of the data samples in that cluster.
2226              
2227             Assuming that the data dimensionality is 3, if you have set the constructor parameter
2228             C<make_png_for_each_iteration>, the module will deposit in your directory printable PNG
2229             files that are point plots of the different clusters in the different iterations of
2230             the algorithm. Such printable files are also generated for the initial clusters at
2231             the beginning of the iterations and for the final clusters in Phase 1 of the
2232             algorithm. You will also see in your directory a PNG file for the clustering result
2233             produced by graph partitioning in Phase 2.
2234              
2235             Also, as mentioned previously, a call to C<linear_manifold_clusterer()> in your own
2236             code will return the clusters to you directly.
2237              
2238             =head1 REQUIRED
2239              
2240             This module requires the following modules:
2241              
2242             List::Util
2243             File::Basename
2244             Math::Random
2245             Graphics::GnuplotIF
2246             Math::GSL::Matrix
2247             POSIX
2248              
2249             =head1 THE C<examples> DIRECTORY
2250              
2251             The C<examples> directory contains the following four scripts that you might want to
2252             play with in order to become familiar with the module:
2253              
2254             example1.pl
2255              
2256             example2.pl
2257              
2258             example3.pl
2259              
2260             example4.pl
2261              
2262             These scripts demonstrate linear-manifold based clustering on the 3-dimensional data
2263             in the following three CSV files:
2264              
2265             3_clusters_on_a_sphere_498_samples.csv (used in example1.pl and example4.pl)
2266              
2267             3_clusters_on_a_sphere_3000_samples.csv (used in example2.pl)
2268              
2269             4_clusters_on_a_sphere_1000_samples.csv (used in example3.pl)
2270              
2271             Note that even though the first two of these files both contain exactly three
2272             clusters, the clusters look very different in the two data files. The clusters are
2273             much more spread out in C<3_clusters_on_a_sphere_3000_samples.csv>.
2274              
2275             The code in C<example4.pl> is special because it shows how you can call the
2276             C<auto_retry_clusterer()> method of the module for automatic repeated invocations of
2277             the clustering program until success is achieved. The value of the constructor
2278             parameter C<cluster_search_multiplier> is set to 1 in C<example4.pl>, implying that
2279             when you execute C<example4.pl> you will not be invoking Phase 2 of the algorithm.
2280             You might wish to change the value given to the parameter
2281             C<cluster_search_multiplier> to see how it affects the number of attempts needed to
2282             achieve success.
2283              
2284             The C<examples> directory also includes PNG image files that show some intermediate
2285             and the best final results that can be achieved by the first three examples, that
2286             is, for the scripts C<example1.pl>, C<example2.pl>, and C<example3.pl>. If you are on
2287             a Linux machine and if you have the C<ImageMagick> library installed, you can use the
2288             C<display> command to see the results in the PNG images. The results you get with
2289             your own runs of the three example scripts may or may not look like what you see in
2290             the outputs shown in the PNG files depending on how the module seeds the clusters.
2291             Your best results should look like what you see in the PNG images.
2292              
2293             The C<examples> directory also contains the following utility scripts:
2294              
2295             For generating the data for experiments with clustering:
2296              
2297             generate_data_on_a_sphere.pl
2298              
2299             For visualizing the data generated by the above script:
2300              
2301             data_visualizer.pl
2302              
2303             For cleaning up the examples directory:
2304              
2305             cleanup_directory.pl
2306              
2307             Invoking the C<cleanup_directory.pl> script will get rid of all the PNG image files
2308             that are generated by the module when you run it with the constructor option
2309             C<make_png_for_each_iteration> set to 1.
2310              
2311             =head1 EXPORT
2312              
2313             None by design.
2314              
2315             =head1 CAVEATS
2316              
2317             The performance of the algorithm depends much on the values you choose for the
2318             constructor parameters. And, even for the best choices for the parameters, the
2319             algorithm is not theoretically guaranteed to return the best results.
2320              
2321             Even after you have discovered the best choices for the constructor parameters, the
2322             best way to use this module is to try it multiple times on any given data and accept
2323             only those results that make the best intuitive sense.
2324              
2325             The module was designed with the hope that it would rather fail than give you wrong
2326             results. So if you see it failing a few times before it returns a good answer, that's
2327             a good thing.
2328              
2329             However, if the module fails too often or is too quick to give you wrong answers,
2330             that means the module is not working on your data. If that happens, I'd love to hear
2331             from you. That might indicate to me how I should enhance the power of this module
2332             for its future versions.
2333              
2334             =head1 BUGS
2335              
2336             Please notify the author if you encounter any bugs. When sending email, please place
2337             the string 'LinearManifoldDataClusterer' in the subject line.
2338              
2339             =head1 INSTALLATION
2340              
2341             Download the archive from CPAN in any directory of your choice. Unpack the archive
2342             with a command that on a Linux machine would look like:
2343              
2344             tar zxvf Algorithm-LinearManifoldDataClusterer-1.01.tar.gz
2345              
2346             This will create an installation directory for you whose name will be
2347             C<Algorithm-LinearManifoldDataClusterer-1.01>. Enter this directory and execute the
2348             following commands for a standard install of the module if you have root privileges:
2349              
2350             perl Makefile.PL
2351             make
2352             make test
2353             sudo make install
2354              
2355             If you do not have root privileges, you can carry out a non-standard install of the
2356             module in any directory of your choice by:
2357              
2358             perl Makefile.PL prefix=/some/other/directory/
2359             make
2360             make test
2361             make install
2362              
2363             With a non-standard install, you may also have to set your PERL5LIB environment
2364             variable so that this module can find the required other modules. How you do that
2365             would depend on what platform you are working on. In order to install this module in
2366             a Linux machine on which I use tcsh for the shell, I set the PERL5LIB environment
2367             variable by
2368              
2369             setenv PERL5LIB /some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
2370              
2371             If I used bash, I'd need to declare:
2372              
2373             export PERL5LIB=/some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
2374              
2375              
2376             =head1 THANKS
2377              
2378             I have learned much from my conversations with Donghun Kim whose research on face
2379             recognition in the wild involves clustering image data on manifolds. I have also had
2380             several fruitful conversations with Bharath Kumar Comandur and Tanmay Prakash with
2381             regard to the ideas that are incorporated in this module.
2382              
2383             =head1 AUTHOR
2384              
2385             Avinash Kak, kak@purdue.edu
2386              
2387             If you send email, please place the string "LinearManifoldDataClusterer" in your subject line to get past
2388             my spam filter.
2389              
2390             =head1 COPYRIGHT
2391              
2392             This library is free software; you can redistribute it and/or modify it under the
2393             same terms as Perl itself.
2394              
2395             Copyright 2015 Avinash Kak
2396              
2397             =cut
2398