File Coverage

blib/lib/Algorithm/LinearManifoldDataClusterer.pm
Criterion Covered Total %
statement 25 27 92.5
branch n/a
condition n/a
subroutine 9 9 100.0
pod n/a
total 34 36 94.4


line stmt bran cond sub pod time code
1             package Algorithm::LinearManifoldDataClusterer;
2              
3             #------------------------------------------------------------------------------------
4             # Copyright (c) 2015 Avinash Kak. All rights reserved. This program is free
5             # software. You may modify and/or distribute it under the same terms as Perl itself.
6             # This copyright notice must remain attached to the file.
7             #
8             # Algorithm::LinearManifoldDataClusterer is a Perl module for clustering data that
9             # resides on a low-dimensional manifold in a high-dimensional measurement space.
10             # -----------------------------------------------------------------------------------
11              
12 1     1   13460 use 5.10.0;
  1         3  
  1         48  
13 1     1   5 use strict;
  1         1  
  1         33  
14 1     1   5 use warnings;
  1         5  
  1         30  
15 1     1   4 use Carp;
  1         3  
  1         81  
16 1     1   7 use List::Util qw(reduce any);
  1         2  
  1         123  
17 1     1   6 use File::Basename;
  1         1  
  1         79  
18 1     1   1009 use Math::Random;
  1         5355  
  1         98  
19 1     1   579 use Graphics::GnuplotIF;
  1         10104  
  1         66  
20 1     1   1020 use Math::GSL::Matrix;
  0            
  0            
21             use POSIX ();
22              
23             our $VERSION = '1.0';
24              
25             # Constructor:
26             sub new {
27             my ($class, %args) = @_;
28             my @params = keys %args;
29             croak "\nYou have used a wrong name for a keyword argument " .
30             "--- perhaps a misspelling\n"
31             if check_for_illegal_params(@params) == 0;
32             bless {
33             _datafile => $args{datafile} || croak("datafile required"),
34             _mask => $args{mask} || croak("mask required"),
35             _K => $args{K} || 0,
36             _P => $args{P} || 0,
37             _terminal_output => $args{terminal_output} || 0,
38             _max_iterations => $args{max_iterations} || 0,
39             _delta_reconstruction_error => $args{delta_reconstruction_error} || 0.001,
40             _delta_normalized_error => undef,
41             _cluster_search_multiplier => $args{cluster_search_multiplier} || 1,
42             _visualize_each_iteration => $args{visualize_each_iteration} == 0 ? 0 : 1,
43             _show_hidden_in_3D_plots => $args{show_hidden_in_3D_plots} == 0 ? 0 : 1,
44             _make_png_for_each_iteration => $args{make_png_for_each_iteration} == 0 ? 0 : 1,
45             _debug => $args{debug} || 0,
46             _N => 0,
47             _KM => $args{K} * $args{cluster_search_multiplier},
48             _data_hash => {},
49             _data_tags => [],
50             _data_dimensions => 0,
51             _final_clusters => [],
52             _auto_retry_flag => 0,
53             _num_iterations_actually_used => undef,
54             _scale_factor => undef,
55             _data_tags_to_cluster_label_hash => {},
56             _final_reference_vecs_for_all_subspaces => [],
57             _reconstruction_error_as_a_function_of_iteration => [],
58             _final_trailing_eigenvec_matrices_for_all_subspaces => [],
59             _subspace_construction_error_as_a_function_of_iteration => [],
60             }, $class;
61             }
62              
63             sub get_data_from_csv {
64             my $self = shift;
65             my $filename = $self->{_datafile} || die "you did not specify a file with the data to be clustered";
66             my $mask = $self->{_mask};
67             my @mask = split //, $mask;
68             $self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
69             print "data dimensionality: $self->{_data_dimensions} \n" if $self->{_terminal_output};
70             open FILEIN, $filename or die "Unable to open $filename: $!";
71             die("Aborted. get_training_data_csv() is only for CSV files") unless $filename =~ /\.csv$/;
72             local $/ = undef;
73             my @all_data = split /\s+/, <FILEIN>;
74             my %data_hash = ();
75             my @data_tags = ();
76             foreach my $record (@all_data) {
77             my @splits = split /,/, $record;
78             my $record_name = shift @splits;
79             $data_hash{$record_name} = \@splits;
80             push @data_tags, $record_name;
81             }
82             $self->{_data_hash} = \%data_hash;
83             $self->{_data_tags} = \@data_tags;
84             $self->{_N} = scalar @data_tags;
85             }
86              
87             sub estimate_mean_and_covariance {
88             my $self = shift;
89             my $tag_set = shift;
90             my $cluster_size = @$tag_set;
91             my @cluster_center = @{$self->add_point_coords($tag_set)};
92             @cluster_center = map {my $x = $_/$cluster_size; $x} @cluster_center;
93             # for covariance calculation:
94             my ($num_rows,$num_cols) = ($self->{_data_dimensions}, scalar(@$tag_set));
95             print "\nThe data will be stuffed into a matrix of $num_rows rows and $num_cols columns\n\n"
96             if $self->{_debug};
97             my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
98             my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
99             # All the record labels are stored in the array $self->{_data_tags}. The actual
100             # data for clustering is stored in a hash at $self->{_data_hash} whose keys are
101             # the record labels; the value associated with each key is the array holding the
102             # corresponding numerical multidimensional data.
103             $mean_vec->set_col(0, \@cluster_center);
104             if ($self->{_debug}) {
105             print "\nDisplaying the mean vector for the cluster:\n";
106             display_matrix( $mean_vec ) if $self->{_terminal_output};
107             }
108             foreach my $j (0..$num_cols-1) {
109             my $tag = $tag_set->[$j];
110             my $data = $self->{_data_hash}->{$tag};
111             my @diff_from_mean = vector_subtract($data, \@cluster_center);
112             $matrix->set_col($j, \@diff_from_mean);
113             }
114             my $transposed = transpose( $matrix );
115             my $covariance = $matrix * $transposed;
116             $covariance *= 1.0 / $num_cols;
117             if ($self->{_debug}) {
118             print "\nDisplaying the Covariance Matrix for cluster:";
119             display_matrix( $covariance ) if $self->{_terminal_output};
120             }
121             return ($mean_vec, $covariance);
122             }
123              
124             sub eigen_analysis_of_covariance {
125             my $self = shift;
126             my $covariance = shift;
127             my ($eigenvalues, $eigenvectors) = $covariance->eigenpair;
128             my $num_of_eigens = @$eigenvalues;
129             my $largest_eigen_index = 0;
130             my $smallest_eigen_index = 0;
131             print "Eigenvalue 0: $eigenvalues->[0]\n" if $self->{_debug};
132             foreach my $i (1..$num_of_eigens-1) {
133             $largest_eigen_index = $i if $eigenvalues->[$i] > $eigenvalues->[$largest_eigen_index];
134             $smallest_eigen_index = $i if $eigenvalues->[$i] < $eigenvalues->[$smallest_eigen_index];
135             print "Eigenvalue $i: $eigenvalues->[$i]\n" if $self->{_debug};
136             }
137             print "\nlargest eigen index: $largest_eigen_index\n" if $self->{_debug};
138             print "\nsmallest eigen index: $smallest_eigen_index\n\n" if $self->{_debug};
139             my @all_my_eigenvecs;
140             foreach my $i (0..$num_of_eigens-1) {
141             my @vec = $eigenvectors->[$i]->as_list;
142             my @eigenvec;
143             foreach my $ele (@vec) {
144             my ($mag, $theta) = $ele =~ /\[(\d*\.?\d*e?[+-]?\d*),(\S+)\]/;
145             if ($theta eq "0") {
146             push @eigenvec, $mag;
147             } elsif ($theta eq "pi") {
148             push @eigenvec, -1.0 * $mag;
149             } else {
150             die "Eigendecomposition produced a complex eigenvector -- " .
151             "which should not happen for a covariance matrix!";
152             }
153             }
154             print "Eigenvector $i: @eigenvec\n" if $self->{_debug};
155             push @all_my_eigenvecs, \@eigenvec;
156             }
157             my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list;
158             print "\nLargest eigenvector: @largest_eigen_vec\n" if $self->{_debug};
159             my @sorted_eigenvec_indexes = sort {$eigenvalues->[$b] <=> $eigenvalues->[$a]} 0..@all_my_eigenvecs-1;
160             my @sorted_eigenvecs;
161             my @sorted_eigenvals;
162             foreach my $i (0..@sorted_eigenvec_indexes-1) {
163             $sorted_eigenvecs[$i] = $all_my_eigenvecs[$sorted_eigenvec_indexes[$i]];
164             $sorted_eigenvals[$i] = $eigenvalues->[$sorted_eigenvec_indexes[$i]];
165             }
166             if ($self->{_debug}) {
167             print "\nHere come sorted eigenvectors --- from the largest to the smallest:\n";
168             foreach my $i (0..@sorted_eigenvecs-1) {
169             print "eigenvec: @{$sorted_eigenvecs[$i]} eigenvalue: $sorted_eigenvals[$i]\n";
170             }
171             }
172             return (\@sorted_eigenvecs, \@sorted_eigenvals);
173             }
174              
175             sub auto_retry_clusterer {
176             my $self = shift;
177             $self->{_auto_retry_flag} = 1;
178             my $clusters;
179             $@ = 1;
180             my $retry_attempts = 1;
181             while ($@) {
182             eval {
183             $clusters = $self->linear_manifold_clusterer();
184             };
185             if ($@) {
186             if ($self->{_terminal_output}) {
187             print "Clustering failed. Trying again. --- $@";
188             print "\n\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n";
189             print "VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV\n\n";
190             }
191             $retry_attempts++;
192             } else {
193             print "\n\nNumber of retry attempts: $retry_attempts\n\n" if $self->{_terminal_output};
194             return $clusters;
195             }
196             }
197             }
198              
199             sub linear_manifold_clusterer {
200             my $self = shift;
201             my $KM = $self->{_KM};
202             my @initial_cluster_center_tags;
203             my $visualization_msg;
204             my @initial_cluster_center_indexes = $self->initialize_cluster_centers($KM, $self->{_N});
205             print "Randomly selected indexes for cluster center tags: @initial_cluster_center_indexes\n"
206             if $self->{_debug};
207             @initial_cluster_center_tags = map {$self->{_data_tags}->[$_]} @initial_cluster_center_indexes;
208             my @initial_cluster_center_coords = map {$self->{_data_hash}->{$_}} @initial_cluster_center_tags;
209             if ($self->{_debug}) {
210             foreach my $centroid (@initial_cluster_center_coords) {
211             print "Initial cluster center coords: @{$centroid}\n";
212             }
213             }
214             my $initial_clusters = $self->assign_data_to_clusters_initial(\@initial_cluster_center_coords);
215             if ($self->{_data_dimensions} == 3) {
216             $visualization_msg = "initial_clusters";
217             $self->visualize_clusters_on_sphere($visualization_msg, $initial_clusters)
218             if $self->{_visualize_each_iteration};
219             $self->visualize_clusters_on_sphere($visualization_msg, $initial_clusters, "png")
220             if $self->{_make_png_for_each_iteration};
221             }
222             foreach my $cluster (@$initial_clusters) {
223             my ($mean, $covariance) = $self->estimate_mean_and_covariance($cluster);
224             display_mean_and_covariance($mean, $covariance) if $self->{_debug};
225             }
226             my @clusters = @$initial_clusters;
227             display_clusters(\@clusters) if $self->{_debug};
228             my $iteration_index = 0;
229             my $unimodal_correction_flag;
230             my $previous_min_value_for_unimodality_quotient;
231             while ($iteration_index < $self->{_max_iterations}) {
232             print "\n\n========================== STARTING ITERATION $iteration_index =====================\n\n"
233             if $self->{_terminal_output};
234             my $total_reconstruction_error_this_iteration = 0;
235             my @subspace_construction_errors_this_iteration;
236             my @trailing_eigenvec_matrices_for_all_subspaces;
237             my @reference_vecs_for_all_subspaces;
238             foreach my $cluster (@clusters) {
239             next if @$cluster == 0;
240             my ($mean, $covariance) = $self->estimate_mean_and_covariance($cluster);
241             display_mean_and_covariance($mean, $covariance) if $self->{_debug};
242             print "--------------end of displaying mean and covariance\n\n" if $self->{_debug};
243             my ($eigenvecs, $eigenvals) = $self->eigen_analysis_of_covariance($covariance);
244             my @trailing_eigenvecs = @{$eigenvecs}[$self->{_P} .. $self->{_data_dimensions}-1];
245             my @trailing_eigenvals = @{$eigenvals}[$self->{_P} .. $self->{_data_dimensions}-1];
246             my $subspace_construction_error = reduce {abs($a) + abs($b)} @trailing_eigenvals;
247             push @subspace_construction_errors_this_iteration, $subspace_construction_error;
248             my $trailing_eigenvec_matrix = Math::GSL::Matrix->new($self->{_data_dimensions},
249             scalar(@trailing_eigenvecs));
250             foreach my $j (0..@trailing_eigenvecs-1) {
251             print "trailing eigenvec column: @{$trailing_eigenvecs[$j]}\n" if $self->{_debug};
252             $trailing_eigenvec_matrix->set_col($j, $trailing_eigenvecs[$j]);
253             }
254             push @trailing_eigenvec_matrices_for_all_subspaces,$trailing_eigenvec_matrix;
255             push @reference_vecs_for_all_subspaces, $mean;
256             }
257             $self->set_termination_reconstruction_error_threshold(\@reference_vecs_for_all_subspaces);
258             my %best_subspace_based_partition_of_data;
259             foreach my $i (0..$self->{_KM}-1) {
260             $best_subspace_based_partition_of_data{$i} = [];
261             }
262             foreach my $data_tag (@{$self->{_data_tags}}) {
263             my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
264             $data_vec->set_col(0, $self->{_data_hash}->{$data_tag});
265             my @errors = map {$self->reconstruction_error($data_vec,
266             $trailing_eigenvec_matrices_for_all_subspaces[$_],
267             $reference_vecs_for_all_subspaces[$_])}
268             0 .. $self->{_KM}-1;
269             my ($minval, $index_for_closest_subspace) = minimum(\@errors);
270             $total_reconstruction_error_this_iteration += $minval;
271             push @{$best_subspace_based_partition_of_data{$index_for_closest_subspace}},
272             $data_tag;
273             }
274             print "Finished calculating the eigenvectors for the clusters produced by the previous\n" .
275             "iteration and re-assigning the data samples to the new subspaces on the basis of\n".
276             "the least reconstruction error.\n\n" .
277             "Total reconstruction error in this iteration: $total_reconstruction_error_this_iteration\n"
278             if $self->{_terminal_output};
279             foreach my $i (0..$self->{_KM}-1) {
280             $clusters[$i] = $best_subspace_based_partition_of_data{$i};
281             }
282             display_clusters(\@clusters) if $self->{_terminal_output};
283             # Check if any cluster has lost all its elements. If so, fragment the worst
284             # existing cluster to create the additional clusters needed:
285             if (any {@$_ == 0} @clusters) {
286             die "empty cluster found" if $self->{_auto_retry_flag};
287             print "\nOne or more clusters have become empty. Will carve out the needed clusters\n" .
288             "from the cluster with the largest subspace construction error.\n\n";
289             $total_reconstruction_error_this_iteration = 0;
290             @subspace_construction_errors_this_iteration = ();
291             my $how_many_extra_clusters_needed = $self->{_KM} - scalar(grep {@$_ != 0} @clusters);
292             print "number of extra clusters needed at iteration $iteration_index: $how_many_extra_clusters_needed\n";
293             my $max = List::Util::max @subspace_construction_errors_this_iteration;
294             my $maxindex = List::Util::first {$_ == $max} @subspace_construction_errors_this_iteration;
295             my @cluster_fragments = cluster_split($clusters[$maxindex],
296             $how_many_extra_clusters_needed + 1);
297             my @newclusters;
298             push @newclusters, @clusters[0 .. $maxindex-1];
299             push @newclusters, @clusters[$maxindex+1 .. $self->{_KM}-1];
300             push @newclusters, @cluster_fragments;
301             @newclusters = grep {@$_ != 0} @newclusters;
302             die "something went wrong with cluster fragmentation"
303             unless $self->{_KM} = @newclusters;
304             @trailing_eigenvec_matrices_for_all_subspaces = ();
305             @reference_vecs_for_all_subspaces = ();
306             foreach my $cluster (@newclusters) {
307             die "Linear Manifold Clustering did not work $!" if @$cluster == 0;
308             my ($mean, $covariance) = estimate_mean_and_covariance($cluster);
309             my ($eigenvecs, $eigenvals) = eigen_analysis_of_covariance($covariance);
310             my @trailing_eigenvecs = @{$eigenvecs}[$self->{_P} .. $self->{_data_dimensions}-1];
311             my @trailing_eigenvals = @{$eigenvals}[$self->{_P} .. $self->{_data_dimensions}-1];
312             my $subspace_construction_error = reduce {abs($a) + abs($b)} @trailing_eigenvals;
313             push @subspace_construction_errors_this_iteration, $subspace_construction_error;
314             my $trailing_eigenvec_matrix = Math::GSL::Matrix->new($self->{_data_dimensions},
315             scalar(@trailing_eigenvecs));
316             foreach my $j (0..@trailing_eigenvecs-1) {
317             $trailing_eigenvec_matrix->set_col($j, $trailing_eigenvecs[$j]);
318             }
319             push @trailing_eigenvec_matrices_for_all_subspaces,$trailing_eigenvec_matrix;
320             push @reference_vecs_for_all_subspaces, $mean;
321             }
322             my %best_subspace_based_partition_of_data;
323             foreach my $i (0..$self->{_KM}-1) {
324             $best_subspace_based_partition_of_data{$i} = [];
325             }
326             foreach my $data_tag (@{$self->{_data_tags}}) {
327             my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
328             $data_vec->set_col(0, $self->{_data_hash}->{$data_tag});
329             my @errors = map {reconstruction_error($data_vec,
330             $trailing_eigenvec_matrices_for_all_subspaces[$_],
331             $reference_vecs_for_all_subspaces[$_])}
332             0 .. $self->{_KM}-1;
333             my ($minval, $index_for_closest_subspace) = minimum(\@errors);
334             $total_reconstruction_error_this_iteration += $minval;
335             push @{$best_subspace_based_partition_of_data{$index_for_closest_subspace}},
336             $data_tag;
337             }
338             print "empty-cluster jag: total reconstruction error in this iteration: \n" .
339             "$total_reconstruction_error_this_iteration\n"
340             if $self->{_debug};
341             foreach my $i (0..$self->{_KM}-1) {
342             $clusters[$i] = $best_subspace_based_partition_of_data{$i};
343             }
344             display_clusters(\@newclusters) if $self->{_terminal_output};
345             @clusters = grep {@$_ != 0} @newclusters;
346             die "linear manifold based algorithm does not appear to work in this case $!"
347             unless @clusters == $self->{_KM};
348             }# end of foreach my $cluster (@clusters) ... loop followed by if clause for empty clusters
349             if ($self->{_data_dimensions} == 3) {
350             $visualization_msg = "clustering_at_iteration_$iteration_index";
351             $self->visualize_clusters_on_sphere($visualization_msg, \@clusters)
352             if $self->{_visualize_each_iteration};
353             $self->visualize_clusters_on_sphere($visualization_msg, \@clusters, "png")
354             if $self->{_make_png_for_each_iteration};
355             }
356             my @cluster_unimodality_quotients = map {$self->cluster_unimodality_quotient($clusters[$_],
357             $reference_vecs_for_all_subspaces[$_])} 0..@clusters-1;
358             my $min_value_for_unimodality_quotient = List::Util::min @cluster_unimodality_quotients;
359             print "\nCluster unimodality quotients: @cluster_unimodality_quotients\n" if $self->{_terminal_output};
360             die "\n\nBailing out!\n" .
361             "It does not look like these iterations will lead to a good clustering result.\n" .
362             "Program terminating. Try running again.\n"
363             if defined($previous_min_value_for_unimodality_quotient)
364             && ($min_value_for_unimodality_quotient < 0.4)
365             && ($min_value_for_unimodality_quotient < (0.5 * $previous_min_value_for_unimodality_quotient));
366             if ( $min_value_for_unimodality_quotient < 0.5 ) {
367             $unimodal_correction_flag = 1;
368             print "\nApplying unimodality correction:\n\n" if $self->{_terminal_output};
369             my @sorted_cluster_indexes =
370             sort {$cluster_unimodality_quotients[$b] <=> $cluster_unimodality_quotients[$a]} 0..@clusters-1;
371             my @newclusters;
372             foreach my $cluster_index (0..@clusters - 1) {
373             push @newclusters, $clusters[$sorted_cluster_indexes[$cluster_index]];
374             }
375             @clusters = @newclusters;
376             my $worst_cluster = pop @clusters;
377             print "\nthe worst cluster: @$worst_cluster\n" if $self->{_terminal_output};
378             my $second_worst_cluster = pop @clusters;
379             print "\nthe second worst cluster: @$second_worst_cluster\n" if $self->{_terminal_output};
380             push @$worst_cluster, @$second_worst_cluster;
381             fisher_yates_shuffle($worst_cluster);
382             my @first_half = @$worst_cluster[0 .. int(scalar(@$worst_cluster)/2) - 1];
383             my @second_half = @$worst_cluster[int(scalar(@$worst_cluster)/2) .. @$worst_cluster - 1];
384             push @clusters, \@first_half;
385             push @clusters, \@second_half;
386             if ($self->{_terminal_output}) {
387             print "\n\nShowing the clusters obtained after applying the unimodality correction:\n";
388             display_clusters(\@clusters);
389             }
390             }
391             if (@{$self->{_reconstruction_error_as_a_function_of_iteration}} > 0) {
392             my $last_recon_error = pop @{$self->{_reconstruction_error_as_a_function_of_iteration}};
393             push @{$self->{_reconstruction_error_as_a_function_of_iteration}}, $last_recon_error;
394             if (($last_recon_error - $total_reconstruction_error_this_iteration)
395             < $self->{_delta_normalized_error}) {
396             push @{$self->{_reconstruction_error_as_a_function_of_iteration}},
397             $total_reconstruction_error_this_iteration;
398             last;
399             }
400             }
401             push @{$self->{_reconstruction_error_as_a_function_of_iteration}},
402             $total_reconstruction_error_this_iteration;
403             $iteration_index++;
404             $previous_min_value_for_unimodality_quotient = $min_value_for_unimodality_quotient;
405             } # end of while loop on iteration_index
406             $self->{_num_iterations_actually_used} =
407             scalar @{$self->{_reconstruction_error_as_a_function_of_iteration}};
408             if ($self->{_terminal_output}) {
409             print "\nIterations of the main loop terminated at iteration number $iteration_index.\n";
410             print "Will now invoke graph partitioning to discover dominant clusters and to\n" .
411             "merge small clusters.\n\n" if $self->{_cluster_search_multiplier} > 1;
412             print "Total reconstruction error as a function of iterations: " .
413             "@{$self->{_reconstruction_error_as_a_function_of_iteration}}";
414             }
415             # now merge sub-clusters if cluster_search_multiplier > 1
416             my @final_clusters;
417             if ($self->{_cluster_search_multiplier} > 1) {
418             print "\n\nInvoking recursive graph partitioning to merge small clusters\n\n";
419             my @array_of_partitioned_cluster_groups = (\@clusters);
420             my @partitioned_cluster_groups;
421             my $how_many_clusters_looking_for = $self->{_K};
422             while (scalar(@final_clusters) < $self->{_K}) {
423             @partitioned_cluster_groups =
424             $self->graph_partition(shift @array_of_partitioned_cluster_groups,
425             $how_many_clusters_looking_for );
426             if (@{$partitioned_cluster_groups[0]} == 1) {
427             my $singular_cluster = shift @{$partitioned_cluster_groups[0]};
428             push @final_clusters, $singular_cluster;
429             $how_many_clusters_looking_for--;
430             push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[1];
431             } elsif (@{$partitioned_cluster_groups[1]} == 1) {
432             my $singular_cluster = shift @{$partitioned_cluster_groups[1]};
433             push @final_clusters, $singular_cluster;
434             $how_many_clusters_looking_for--;
435             push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[0];
436             } else {
437             push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[0];
438             push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[1];
439             }
440             }
441             my @data_clustered;
442             foreach my $cluster (@final_clusters) {
443             push @data_clustered, @$cluster;
444             }
445             unless (scalar(@data_clustered) == scalar(@{$self->{_data_tags}})) {
446             $self->{_final_clusters} = \@final_clusters;
447             my %data_clustered = map {$_ => 1} @data_clustered;
448             my @data_tags_not_clustered =
449             grep {$_} map {exists $data_clustered{$_} ? undef : $_} @{$self->{_data_tags}};
450             if ($self->{_terminal_output}) {
451             print "\n\nNot all data clustered. The most reliable clusters found by graph partitioning:\n";
452             display_clusters(\@final_clusters);
453             print "\n\nData not yet clustered:\n\n@data_tags_not_clustered\n";
454             }
455             if ($self->{_data_dimensions} == 3) {
456             $visualization_msg = "$self->{_K}_best_clusters_produced_by_graph_partitioning";
457             $self->visualize_clusters_on_sphere($visualization_msg, \@final_clusters)
458             if $self->{_visualize_each_iteration};
459             $self->visualize_clusters_on_sphere($visualization_msg, \@final_clusters, "png")
460             if $self->{_make_png_for_each_iteration};
461             }
462             my %data_tags_to_cluster_label_hash;
463             foreach my $i (0..@final_clusters-1) {
464             map {$data_tags_to_cluster_label_hash{$_} = $i} @{$final_clusters[$i]};
465             }
466             $self->{_data_tags_to_cluster_label_hash} = \%data_tags_to_cluster_label_hash;
467             foreach my $tag (@data_tags_not_clustered) {
468             my $which_cluster = $self->which_cluster_for_new_element($tag);
469             $self->{_data_tags_to_cluster_label_hash}->{$tag} = $which_cluster;
470             }
471             die "Some data elements are still missing from the final tally"
472             unless scalar(keys %{$self->{_data_tags_to_cluster_label_hash}}) ==
473             scalar(@{$self->{_data_tags}});
474             my @new_final_clusters;
475             map { foreach my $ele (keys %{$self->{_data_tags_to_cluster_label_hash}}) {
476             push @{$new_final_clusters[$_]}, $ele
477             if $self->{_data_tags_to_cluster_label_hash}->{$ele} == $_ }
478             } 0..$self->{_K}-1;
479             if ($self->{_debug}) {
480             print "\ndisplaying the final clusters after accounting for unclustered data:\n";
481             display_clusters(\@new_final_clusters);
482             }
483             $self->{_final_clusters} = \@new_final_clusters;
484             @final_clusters = @new_final_clusters;
485             }
486             } else {
487             @final_clusters = @clusters;
488             }
489             print "\n\nDisplaying final clustering results:\n\n" if $self->{_terminal_output};
490             display_clusters(\@final_clusters) if $self->{_terminal_output};
491             return \@final_clusters;
492             }
493              
494             sub display_reconstruction_errors_as_a_function_of_iterations {
495             my $self = shift;
496             print "\n\nNumber of iterations used in Phase 1: $self->{_num_iterations_actually_used}\n";
497             print "\nTotal reconstruction error as a function of iterations in Phase 1: " .
498             "@{$self->{_reconstruction_error_as_a_function_of_iteration}}\n";
499             }
500              
501             sub set_termination_reconstruction_error_threshold {
502             my $self = shift;
503             my $all_ref_vecs = shift;
504             my @mean_vecs = @$all_ref_vecs;
505             my $sum_of_mean_magnitudes = reduce {$a+$b} map { my $result = transpose($_) * $_;
506             my @result = $result->as_list;
507             sqrt($result[0])
508             } @mean_vecs;
509             $self->{_scale_factor} = $sum_of_mean_magnitudes / @mean_vecs;
510             $self->{_delta_normalized_error} = ($sum_of_mean_magnitudes / @mean_vecs ) *
511             $self->{_delta_reconstruction_error};
512             }
513              
# This method is called only in the `unless' clause at the end of the main
# linear_manifold_clusterer() method. It is called to find the cluster labels for
# those data elements that were left unclustered by the main part of the algorithm
# when graph partitioning is used to merge similar sub-clusters. The operating logic
# here is that graph partition yields K main clusters even though each main cluster
# may not yet be fully populated.
#
# Arguments: $data_tag, the symbolic name of the data element to be assigned.
# Returns: the index (0 .. K-1) of the final cluster whose subspace gives the
# smallest reconstruction error for the element.
sub which_cluster_for_new_element {
    my $self = shift;
    my $data_tag = shift;
    # The following `unless' clause is called only the first time the current method
    # is called: it caches, for each final cluster, the matrix of trailing
    # eigenvectors (the directions indexed _P .. D-1, i.e. those orthogonal to
    # the cluster's manifold subspace) and the cluster mean.
    unless (@{$self->{_final_trailing_eigenvec_matrices_for_all_subspaces}} > 0) {
        my @trailing_eigenvec_matrices_for_all_subspaces;
        my @reference_vecs_for_all_subspaces;
        foreach my $cluster (@{$self->{_final_clusters}}) {
            my ($mean, $covariance) = $self->estimate_mean_and_covariance($cluster);
            my ($eigenvecs, $eigenvals) = $self->eigen_analysis_of_covariance($covariance);
            my @trailing_eigenvecs = @{$eigenvecs}[$self->{_P} .. $self->{_data_dimensions}-1];
            # Pack the trailing eigenvectors as the columns of a D x (D - P) matrix:
            my $trailing_eigenvec_matrix = Math::GSL::Matrix->new($self->{_data_dimensions},
                                                                  scalar(@trailing_eigenvecs));
            foreach my $j (0..@trailing_eigenvecs-1) {
                $trailing_eigenvec_matrix->set_col($j, $trailing_eigenvecs[$j]);
            }
            push @trailing_eigenvec_matrices_for_all_subspaces,$trailing_eigenvec_matrix;
            push @reference_vecs_for_all_subspaces, $mean;
        }
        $self->{_final_trailing_eigenvec_matrices_for_all_subspaces} =
                                           \@trailing_eigenvec_matrices_for_all_subspaces;
        $self->{_final_reference_vecs_for_all_subspaces} = \@reference_vecs_for_all_subspaces;
    }
    # Package the data element as a D x 1 column vector:
    my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
    $data_vec->set_col(0, $self->{_data_hash}->{$data_tag});
    # Reconstruction error of the element against each of the K cached subspaces;
    # the element is assigned to the subspace it is closest to.
    my @errors = map {$self->reconstruction_error($data_vec,
                             $self->{_final_trailing_eigenvec_matrices_for_all_subspaces}->[$_],
                             $self->{_final_reference_vecs_for_all_subspaces}->[$_])}
                 0 .. $self->{_K}-1;
    my ($minval, $index_for_closest_subspace) = minimum(\@errors);
    return $index_for_closest_subspace;
}
553              
# Spectral partitioning of the "cluster graph": each node is one of the current
# sub-clusters and edge weights express pairwise cluster similarity.  Builds a
# similarity matrix W, forms the symmetric normalized Laplacian
# A = D^{-1/2} (D - W) D^{-1/2}, and splits the node set by the signs of the
# eigenvector associated with the second-smallest eigenvalue of A.
#
# Arguments: $clusters (ref to array of clusters, each a ref to an array of
# data tags) and $how_many_clusters_looking_for.
# Returns a pair of array refs whose interpretation depends on the branch taken
# at the bottom (finalized-cluster list + remaining clusters, or the
# positive-side and negative-side cluster lists).  May die when the clusters
# are too degenerate to partition.
sub graph_partition {
    my $self = shift;
    my $clusters = shift;
    my $how_many_clusters_looking_for = shift;
    print "\n\nGraph partitioning looking for $how_many_clusters_looking_for clusters\n\n";
    my $num_nodes = scalar @$clusters;
    my $W = Math::GSL::Matrix->new($num_nodes,$num_nodes);
    my $D = Math::GSL::Matrix->new($num_nodes,$num_nodes);
    $D->identity;
    my $neg_sqrt_of_D = Math::GSL::Matrix->new($num_nodes,$num_nodes);
    $neg_sqrt_of_D->identity;
    my @subspace_construction_errors;
    my @trailing_eigenvec_matrices_for_all_subspaces;
    my @reference_vecs_for_all_subspaces;
    # For each sub-cluster, characterize its manifold subspace by the cluster
    # mean and the matrix of trailing eigenvectors (indexes _P .. D-1) of the
    # cluster covariance:
    foreach my $cluster (@$clusters) {
        my ($mean, $covariance) = $self->estimate_mean_and_covariance($cluster);
        my ($eigenvecs, $eigenvals) = $self->eigen_analysis_of_covariance($covariance);
        my @trailing_eigenvecs = @{$eigenvecs}[$self->{_P} .. $self->{_data_dimensions}-1];
        my @trailing_eigenvals = @{$eigenvals}[$self->{_P} .. $self->{_data_dimensions}-1];
        # Sum of absolute trailing eigenvalues = energy not captured by the subspace:
        my $subspace_construction_error = reduce {abs($a) + abs($b)} @trailing_eigenvals;
        push @subspace_construction_errors, $subspace_construction_error;
        my $trailing_eigenvec_matrix = Math::GSL::Matrix->new($self->{_data_dimensions},
                                                              scalar(@trailing_eigenvecs));
        foreach my $j (0..@trailing_eigenvecs-1) {
            print "trailing eigenvec column: @{$trailing_eigenvecs[$j]}\n" if $self->{_debug};
            $trailing_eigenvec_matrix->set_col($j, $trailing_eigenvecs[$j]);
        }
        push @trailing_eigenvec_matrices_for_all_subspaces,$trailing_eigenvec_matrix;
        push @reference_vecs_for_all_subspaces, $mean;
    }
    # We consider the similarity matrix W to be a sum of two parts W_recon_based and
    # W_dist_bet_means based. IMPORTANT: For coding reasons, we first store the two
    # similarity measures separately in W_recon_based and W_dist_bet_means based. Our
    # goal is to fill up these component matrices with the raw values while at the
    # same time collecting information needed for normalizing these two separate
    # measures of similarity.
    my $W_reconstruction_error_based = Math::GSL::Matrix->new($num_nodes,$num_nodes);
    my $W_dist_between_means_based = Math::GSL::Matrix->new($num_nodes,$num_nodes);
    my @all_pairwise_reconstruction_errors;
    my @all_dist_between_means_errors;
    foreach my $i (0..$num_nodes-1) {
        foreach my $j (0..$num_nodes-1) {
            my ($recon_error_similarity, $dist_bet_means) = $self->pairwise_cluster_similarity(
                                              $clusters->[$i],
                                              $trailing_eigenvec_matrices_for_all_subspaces[$i],
                                              $reference_vecs_for_all_subspaces[$i],
                                              $clusters->[$j],
                                              $trailing_eigenvec_matrices_for_all_subspaces[$j],
                                              $reference_vecs_for_all_subspaces[$j]);
            $W_reconstruction_error_based->set_elem($i, $j, $recon_error_similarity);
            $W_dist_between_means_based->set_elem($i, $j, $dist_bet_means);
            push @all_pairwise_reconstruction_errors, $recon_error_similarity;
            push @all_dist_between_means_errors, $dist_bet_means;
        }
    }
    # Normalizers are the mean of each raw measure over all node pairs:
    my $recon_error_normalizer = (reduce {$a + $b} @all_pairwise_reconstruction_errors) /
                                 (scalar @all_pairwise_reconstruction_errors);
    my $dist_bet_means_based_normalizer = (reduce {$a + $b} @all_dist_between_means_errors ) /
                                          (scalar @all_dist_between_means_errors );
    die "\n\nBailing out!\n" .
        "Dealing with highly defective clusters. Try again.\n"
        if ($recon_error_normalizer == 0) || ($dist_bet_means_based_normalizer == 0);
    # Convert each raw dissimilarity x into a similarity exp(-x / normalizer):
    foreach my $i (0..$num_nodes-1) {
        foreach my $j (0..$num_nodes-1) {
            my $recon_val = $W_reconstruction_error_based->get_elem($i,$j);
            my $new_recon_val = exp( -1.0 * $recon_val / $recon_error_normalizer );
            $W_reconstruction_error_based->set_elem($i,$j,$new_recon_val);
            my $mean_dist_val = $W_dist_between_means_based->get_elem($i,$j);
            my $new_mean_dist_val = exp( -1.0 * $mean_dist_val / $dist_bet_means_based_normalizer );
            $W_dist_between_means_based->set_elem($i,$j,$new_mean_dist_val);
        }
    }
    $W = $W_reconstruction_error_based + $W_dist_between_means_based;
    if ($self->{_debug}) {
        print "\nDisplaying the similarity matrix W for the cluster graph:\n";
        display_matrix($W) if $self->{_terminal_output};
    }
    # Degree matrix D: each diagonal entry is the sum of the corresponding row
    # of W (accumulated here column by column); neg_sqrt_of_D holds D^{-1/2}.
    my $add_all_columns = Math::GSL::Matrix->new($num_nodes,1);
    foreach my $col (0..$num_nodes-1) {
        $add_all_columns += $W->col($col);
    }
    foreach my $i (0..$num_nodes-1) {
        $D->set_elem($i,$i, $add_all_columns->get_elem($i,0));
        $neg_sqrt_of_D->set_elem($i,$i, 1.0 / sqrt($add_all_columns->get_elem($i,0)));
    }
    # the Laplacian matrix:
    my $Laplacian = $D - $W;
    # the Symmetric Normalized Laplacian matrix A:
    my $A = $neg_sqrt_of_D * $Laplacian * $neg_sqrt_of_D;
    # Zero out numerically negligible entries before the eigendecomposition:
    foreach my $i (0..$num_nodes-1) {
        foreach my $j (0..$num_nodes-1) {
            $A->set_elem($i,$j,0) if abs($A->get_elem($i,$j)) < 0.01;
        }
    }
    if ($self->{_terminal_output}) {
        print "\nDisplaying the Symmetric Normalized Laplacian matrix A:\n" .
              "A = neg_sqrt(D) * Laplacian_matrix * neg_sqrt(D)\n";
        display_matrix( $A );
    }
    my ($eigenvalues, $eigenvectors) = $A->eigenpair;
    my $num_of_eigens = @$eigenvalues;
    my $largest_eigen_index = 0;
    my $smallest_eigen_index = 0;
    if ($self->{_debug2}) {
        print "Eigenvalue 0: $eigenvalues->[0]\n";
        foreach my $i (1..$num_of_eigens-1) {
            $largest_eigen_index = $i if $eigenvalues->[$i] > $eigenvalues->[$largest_eigen_index];
            $smallest_eigen_index = $i if $eigenvalues->[$i] < $eigenvalues->[$smallest_eigen_index];
            print "Eigenvalue $i: $eigenvalues->[$i]\n";
        }
        print "\nlargest eigen index: $largest_eigen_index\n";
        print "\nsmallest eigen index: $smallest_eigen_index\n\n";
    }
    # GSL returns eigenvector components in complex polar form "[mag,theta]";
    # a theta of "0" means +mag and "pi" means -mag.  Anything else would mean
    # a genuinely complex eigenvector, which is fatal here.
    my @all_my_eigenvecs;
    foreach my $i (0..$num_of_eigens-1) {
        my @vec = $eigenvectors->[$i]->as_list;
        my @eigenvec;
        foreach my $ele (@vec) {
            my ($mag, $theta) = $ele =~ /\[(\d*\.?\d*e?[+-]?\d*),(\S+)\]/;
            if ($theta eq "0") {
                push @eigenvec, $mag;
            } elsif ($theta eq "pi") {
                push @eigenvec, -1.0 * $mag;
            } else {
                die "Eigendecomposition produced a complex eigenvector!";
            }
        }
        print "Eigenvector $i: @eigenvec\n" if $self->{_debug2};
        push @all_my_eigenvecs, \@eigenvec;
    }
    if ($self->{_debug2}) {
        my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list;
        print "\nLargest eigenvector of A: @largest_eigen_vec\n";
    }
    # Sort eigenvectors by eigenvalue, largest first:
    my @sorted_eigenvec_indexes = sort {$eigenvalues->[$b] <=> $eigenvalues->[$a]} 0..@all_my_eigenvecs-1;
    print "sorted eigenvec indexes for A: @sorted_eigenvec_indexes\n" if $self->{_debug2};
    my @sorted_eigenvecs;
    my @sorted_eigenvals;
    foreach my $i (0..@sorted_eigenvec_indexes-1) {
        $sorted_eigenvecs[$i] = $all_my_eigenvecs[$sorted_eigenvec_indexes[$i]];
        $sorted_eigenvals[$i] = $eigenvalues->[$sorted_eigenvec_indexes[$i]];
    }
    if ($self->{_debug2}) {
        print "\nHere come sorted eigenvectors for A --- from the largest to the smallest:\n";
        foreach my $i (0..@sorted_eigenvecs-1) {
            print "eigenvec: @{$sorted_eigenvecs[$i]} eigenvalue: $sorted_eigenvals[$i]\n";
        }
    }
    # The partitioning vector is the eigenvector for the SECOND-SMALLEST
    # eigenvalue (second from the end of the descending-sorted list); its signs
    # split the graph nodes into two groups.
    my $best_partitioning_eigenvec = $sorted_eigenvecs[@sorted_eigenvec_indexes-2];
    print "\nBest graph partitioning eigenvector: @$best_partitioning_eigenvec\n" if $self->{_terminal_output};
    my $how_many_positive = reduce {$a + $b} map {$_ > 0 ? 1 : 0 } @$best_partitioning_eigenvec;
    my $how_many_negative = scalar(@$best_partitioning_eigenvec) - $how_many_positive;
    print "Have $how_many_positive positive and $how_many_negative negative elements in the partitioning vec\n"
        if $self->{_terminal_output};
    if ($how_many_clusters_looking_for <= 3) {
        # Near the end of the search: if one side of the partition is a single
        # node, that node becomes a finalized cluster; if one side is small
        # enough (at most _cluster_search_multiplier nodes), its members are
        # merged into one finalized cluster.  Otherwise neither side is
        # dominant and we give up.
        my @merged_cluster;
        my $final_cluster;
        my @newclusters;
        if ($how_many_positive == 1) {
            foreach my $i (0..@$clusters-1) {
                if ($best_partitioning_eigenvec->[$i] > 0) {
                    $final_cluster = $clusters->[$i];
                } else {
                    push @newclusters, $clusters->[$i];
                }
            }
            return ([$final_cluster], \@newclusters);
        } elsif ($how_many_negative == 1) {
            foreach my $i (0..@$clusters-1) {
                if ($best_partitioning_eigenvec->[$i] < 0) {
                    $final_cluster = $clusters->[$i];
                } else {
                    push @newclusters, $clusters->[$i];
                }
            }
            return ([$final_cluster], \@newclusters);
        } elsif ($how_many_positive <= $self->{_cluster_search_multiplier}) {
            foreach my $i (0..@$clusters-1) {
                if ($best_partitioning_eigenvec->[$i] > 0) {
                    push @merged_cluster, @{$clusters->[$i]};
                } else {
                    push @newclusters, $clusters->[$i];
                }
            }
            return ([\@merged_cluster], \@newclusters);
        } elsif ($how_many_negative <= $self->{_cluster_search_multiplier}) {
            foreach my $i (0..@$clusters-1) {
                if ($best_partitioning_eigenvec->[$i] < 0) {
                    push @merged_cluster, @{$clusters->[$i]};
                } else {
                    push @newclusters, $clusters->[$i];
                }
            }
            return ([\@merged_cluster], \@newclusters);
        } else {
            die "\n\nBailing out!\n\n" .
                "No consensus support for dominant clusters in the graph partitioning step\n" .
                "of the algorithm. This can be caused by bad random selection of initial\n" .
                "cluster centers. Please run this program again.\n";
        }
    } else {
        # Still early in the search: simply split the clusters into the
        # positive-sign and negative-sign groups and return both.
        my @positive_clusters;
        my @negative_clusters;
        foreach my $i (0..@$clusters-1) {
            if ($best_partitioning_eigenvec->[$i] > 0) {
                push @positive_clusters, $clusters->[$i];
            } else {
                push @negative_clusters, $clusters->[$i];
            }
        }
        return (\@positive_clusters, \@negative_clusters);
    }
}
767              
# Computes two raw dissimilarity measures between a pair of clusters:
# (1) the total cross-reconstruction error -- the summed error of projecting
# every point of cluster 1 onto cluster 2's subspace plus every point of
# cluster 2 onto cluster 1's subspace -- and (2) the squared Euclidean distance
# between the two cluster means.  Both are returned raw; the caller
# (graph_partition) normalizes and exponentiates them into similarities.
sub pairwise_cluster_similarity {
    my $self = shift;
    my $cluster1 = shift;                           # ref to array of data tags
    my $trailing_eigenvec_matrix_cluster1 = shift;  # trailing eigenvectors of cluster 1's subspace
    my $reference_vec_cluster1 = shift;             # mean of cluster 1 (column matrix)
    my $cluster2 = shift;
    my $trailing_eigenvec_matrix_cluster2 = shift;
    my $reference_vec_cluster2 = shift;
    # NOTE(review): this accumulator is never updated or read -- dead variable.
    my $total_reconstruction_error_in_this_iteration = 0;
    # Error of each cluster-1 point against cluster 2's subspace:
    my @errors_for_1_on_2 = map {my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
                                 $data_vec->set_col(0, $self->{_data_hash}->{$_});
                                 $self->reconstruction_error($data_vec,
                                                             $trailing_eigenvec_matrix_cluster2,
                                                             $reference_vec_cluster2)}
                            @$cluster1;
    # Error of each cluster-2 point against cluster 1's subspace:
    my @errors_for_2_on_1 = map {my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
                                 $data_vec->set_col(0, $self->{_data_hash}->{$_});
                                 $self->reconstruction_error($data_vec,
                                                             $trailing_eigenvec_matrix_cluster1,
                                                             $reference_vec_cluster1)}
                            @$cluster2;
    # Sum of absolute errors in each direction:
    my $type_1_error = reduce {abs($a) + abs($b)} @errors_for_1_on_2;
    my $type_2_error = reduce {abs($a) + abs($b)} @errors_for_2_on_1;
    my $total_reconstruction_error = $type_1_error + $type_2_error;
    # Squared distance between the two means: (m1 - m2)' * (m1 - m2) is a 1x1 matrix.
    my $diff_between_the_means = $reference_vec_cluster1 - $reference_vec_cluster2;
    my $dist_squared = transpose($diff_between_the_means) * $diff_between_the_means;
    my @dist_squared_as_list = $dist_squared->as_list();
    my $dist_between_means_based_error = shift @dist_squared_as_list;
    return ($total_reconstruction_error, $dist_between_means_based_error);
}
798              
# Measures how concentrated a cluster is around its mean: for each dimension,
# collect the tags whose coordinate lies within a "delta ball" window of
# +/- ($delta * per-dimension range) around the mean, intersect those tag sets
# across all dimensions, and return |intersection| / |cluster|.
#
# Arguments: $cluster (ref to array of data tags) and $mean (a column matrix
# supporting ->as_list).  Returns a quotient in [0, 1].
sub cluster_unimodality_quotient {
    my $self = shift;
    my $cluster = shift;
    my $mean = shift;
    my $delta = 0.4 * $self->{_scale_factor}; # Radius of the delta ball along each dimension
    my @mean = $mean->as_list;
    my @data_tags_for_range_tests;
    foreach my $dimen (0..$self->{_data_dimensions}-1) {
        my @values = map {$_->[$dimen]} map {$self->{_data_hash}->{$_}} @$cluster;
        my ($min, $max) = (List::Util::min(@values), List::Util::max(@values));
        my $range = $max - $min;
        my $mean_along_this_dimen = $mean[$dimen];
        # BUGFIX: filter with `defined' rather than plain truth so that a data
        # tag named "0" (a false but defined value) is not silently dropped.
        my @tags = grep {defined $_}
                   map { ( ($self->{_data_hash}->{$_}->[$dimen] > $mean_along_this_dimen - $delta * $range)
                           &&
                           ($self->{_data_hash}->{$_}->[$dimen] < $mean_along_this_dimen + $delta * $range) )
                         ? $_ : undef }
                   @$cluster;
        push @data_tags_for_range_tests, \@tags;
    }
    # Now find the intersection of the tag sets for each of the dimensions
    my %intersection_hash;
    foreach my $dimen (0..$self->{_data_dimensions}-1) {
        my %tag_hash_for_this_dimen = map {$_ => 1} @{$data_tags_for_range_tests[$dimen]};
        if ($dimen == 0) {
            %intersection_hash = %tag_hash_for_this_dimen;
        } else {
            %intersection_hash = map {$_ => 1} grep {$tag_hash_for_this_dimen{$_}}
                                 keys %intersection_hash;
        }
    }
    my @intersection_set = keys %intersection_hash;
    my $cluster_unimodality_index = scalar(@intersection_set) / scalar(@$cluster);
    return $cluster_unimodality_index;
}
835              
# Searches for the reference vector that minimizes the total reconstruction
# error with respect to the subspace defined by $trailing_eigenvec_matrix.
# Candidates are 100 points drawn uniformly from the cluster's per-dimension
# bounding box, plus the cluster mean itself (candidate 0).  Note that the
# error for each candidate is summed over ALL data tags in $self->{_data_tags},
# not just over the given cluster -- preserved from the original logic.
#
# Returns a ref to the coordinate array of the best candidate.
sub find_best_ref_vector {
    my $self = shift;
    my $cluster = shift;                  # ref to array of data tags
    my $trailing_eigenvec_matrix = shift;
    my $mean = shift;                     # a GSL matrix ref (column vector)
    my @min_bounds;
    my @max_bounds;
    my @ranges;
    # Per-dimension bounding box of the cluster's data points:
    foreach my $dimen (0..$self->{_data_dimensions}-1) {
        my @values = map {$_->[$dimen]} map {$self->{_data_hash}->{$_}} @$cluster;
        my ($min, $max) = (List::Util::min(@values), List::Util::max(@values));
        push @min_bounds, $min;
        push @max_bounds, $max;
        push @ranges, $max - $min;
    }
    print "min bounds are: @min_bounds\n";
    print "max bounds are: @max_bounds\n";
    my $max_iterations = 100;
    my @random_points;
    my $iteration = 0;
    while ($iteration++ < $max_iterations) {
        my @coordinate_vec;
        foreach my $dimen (0..$self->{_data_dimensions}-1) {
            push @coordinate_vec, $min_bounds[$dimen] + rand($ranges[$dimen]);
        }
        push @random_points, \@coordinate_vec;
    }
    if ($self->{_debug}) {
        print "\nrandom points\n";
        map {print "@$_\n"} @random_points;
    }
    # The cluster mean is candidate 0, so it competes with the random points:
    my @mean = $mean->as_list;
    unshift @random_points, \@mean;
    my @reconstruction_errors;
    foreach my $candidate_ref_vec (@random_points) {
        my $ref_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
        $ref_vec->set_col(0, $candidate_ref_vec);
        my $reconstruction_error_for_a_ref_vec = 0;
        foreach my $data_tag (@{$self->{_data_tags}}) {
            my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
            $data_vec->set_col(0, $self->{_data_hash}->{$data_tag});
            my $error = $self->reconstruction_error($data_vec,$trailing_eigenvec_matrix,$ref_vec);
            $reconstruction_error_for_a_ref_vec += $error;
        }
        push @reconstruction_errors, $reconstruction_error_for_a_ref_vec;
    }
    # BUGFIX: the original used List::Util::first(), which returns the minimum
    # *value* (a float) rather than its index, and its bookkeeping was off by
    # one after shifting the mean's error off the list; it also never actually
    # compared the mean against the random candidates.  Here minimum() gives
    # the positional index of the smallest total error over ALL candidates
    # (index 0 being the mean), kept aligned with @random_points.
    my ($smallest_error, $minindex) = minimum(\@reconstruction_errors);
    return $random_points[$minindex];
}
889              
## The reconstruction error relates to the size of the perpendicular from a data
## point X to the hyperplane that defines a given subspace on the manifold.
sub reconstruction_error {
    my ($self, $data_vec, $trailing_eigenvecs, $ref_vec) = @_;
    # Project the mean-shifted data vector onto the trailing eigenvectors (the
    # directions orthogonal to the subspace); the quadratic form below is a
    # 1x1 matrix holding the squared magnitude of that projection.
    my $shifted = $data_vec - $ref_vec;
    my $error_squared = transpose($shifted) * $trailing_eigenvecs *
                        transpose($trailing_eigenvecs) * $shifted;
    return ($error_squared->as_list())[0];
}
903              
# Returns a set of K random integers that serve as indices into the data array;
# the data elements at those indices become the initial cluster centers.  The
# returned indices are guaranteed distinct.  As a quality check, the span of
# the chosen indices (max - min) must exceed 30% of the data store size;
# otherwise the whole set is discarded and drawn afresh.
sub initialize_cluster_centers {
    my $self = shift;
    my $K = shift;                 # set to the parameter KM by the caller
    my $data_store_size = shift;
    my @cluster_center_indices;
    while (1) {
        # Draw K *distinct* random indices.  BUGFIX: the original re-drew a
        # colliding index only against the one collision it had just found,
        # which could re-introduce a collision with an earlier index; here
        # every accepted draw is checked against all previous ones.
        my %already_chosen;
        @cluster_center_indices = ();
        while (@cluster_center_indices < $K) {
            my $candidate = int rand($data_store_size);
            next if $already_chosen{$candidate}++;
            push @cluster_center_indices, $candidate;
        }
        # Quality check: require the indices to span a reasonable fraction of
        # the data array; otherwise try a completely fresh draw.
        my $min = List::Util::min(@cluster_center_indices);
        my $max = List::Util::max(@cluster_center_indices);
        my $quality = ($max - $min) / $data_store_size;
        last if $quality > 0.3;
    }
    return @cluster_center_indices;
}
935              
# The purpose of this routine is to form initial clusters by assigning every
# data sample to whichever of the given cluster centers it is closest to.
# Argument: ref to an array of center coordinate vectors.  Returns a ref to an
# array of clusters (each a ref to an array of data tags).
sub assign_data_to_clusters_initial {
    my ($self, $centers_ref) = @_;
    my @cluster_centers = @$centers_ref;
    my @clusters;
    foreach my $tag (@{$self->{_data_tags}}) {
        # Distance from this sample to every center, in center order:
        my @distances = map { $self->distance($tag, $_) } @cluster_centers;
        my ($closest_distance, $winning_index) = minimum(\@distances);
        push @{$clusters[$winning_index]}, $tag;
    }
    return \@clusters;
}
954              
# The following routine is for computing the distance between a data point specified
# by its symbolic name in the master datafile and a point (such as the center of a
# cluster) expressed as a vector of coordinates:
sub distance {
    my ($self, $ele1_id, $ele2_ref) = @_;
    my @point1 = @{$self->{_data_hash}->{$ele1_id}};
    my @point2 = @$ele2_ref;
    die "wrong data types for distance calculation\n" if @point1 != @point2;
    # Euclidean distance between the two coordinate vectors:
    my $squared_sum = 0;
    foreach my $i (0 .. $#point1) {
        $squared_sum += ($point1[$i] - $point2[$i]) ** 2;
    }
    return sqrt $squared_sum;
}
972              
# Writes each cluster's data tags, sorted and space-separated, to its own file
# cluster0.txt, cluster1.txt, ...  Any cluster*.txt files left over from a
# previous run are removed first.
sub write_clusters_to_files {
    my $self = shift;
    my $clusters = shift;
    my @clusters = @$clusters;
    unlink glob "cluster*.txt";    # remove stale output from earlier runs
    foreach my $i (0..@clusters-1) {
        my $filename = "cluster" . $i . ".txt";
        print "\nWriting cluster $i to file $filename\n" if $self->{_terminal_output};
        # BUGFIX/SECURITY: the original used a bareword filehandle and a 2-arg
        # open that piped through the external `sort' command -- which, since
        # the tags were printed without newlines, sorted nothing.  Sort the
        # tags in Perl and write with a lexical 3-arg open instead.
        open my $fh, '>', $filename or die "Unable to open file: $!";
        foreach my $ele (sort @{$clusters[$i]}) {
            print {$fh} "$ele ";
        }
        print {$fh} "\n";
        close $fh or die "Unable to close file: $!";
    }
}
988              
# Destructor: removes the "__temp_<datafile>.txt" scratch file created for the
# gnuplot visualization step.
sub DESTROY {
    my $self = shift;
    my $filename = basename($self->{_datafile});
    $filename =~ s/\.\w+$/\.txt/;
    unlink "__temp_" . $filename;
}
994              
995             ################################## Visualization Code ###################################
996              
# Component-wise sum of the data vectors named by the given list of data tags.
# Argument: ref to an array of data element names.  Returns a ref to the sum
# vector (initialized to 0.0 in each of the _data_dimensions components).
sub add_point_coords {
    my ($self, $ids_ref) = @_;
    my $dims = $self->{_data_dimensions};
    my @sum = (0.0) x $dims;
    foreach my $id (@$ids_ref) {
        my $point = $self->{_data_hash}->{$id};
        $sum[$_] += $point->[$_] for 0 .. $#{$point};
    }
    return \@sum;
}
1015              
# This is the main module version: renders the clusters as colored point sets
# on a wireframe sphere via gnuplot.  Writes the (relabeled) cluster points to
# a temp file, one gnuplot "index" block per cluster, then issues the plot
# commands.  Arguments: a title string, the clusters (array of arrays of data
# tags; deep-copied so the caller's structure is untouched), an optional
# hardcopy format (e.g. "png") and an optional pause time in seconds.
sub visualize_clusters_on_sphere {
    my $self = shift;
    my $visualization_msg = shift;
    my $clusters = deep_copy_AoA(shift);
    my $hardcopy_format = shift;
    my $pause_time = shift;
    my $d = $self->{_data_dimensions};
    my $temp_file = "__temp_" . $self->{_datafile};
    $temp_file =~ s/\.\w+$/\.txt/;
    unlink $temp_file if -e $temp_file;
    open OUTPUT, ">$temp_file"
        or die "Unable to open a temp file in this directory: $!";
    # Re-tag each cluster's members with a single-letter cluster label (A, B,
    # ...) and append the point's coordinates to the tag via the /e substitution:
    my @all_tags = "A".."Z";
    my @retagged_clusters;
    foreach my $cluster (@$clusters) {
        my $label = shift @all_tags;
        my @retagged_cluster =
            map {$_ =~ s/^(\w+?)_(\w+)/$label . "_$2 @{$self->{_data_hash}->{$_}}"/e;$_} @$cluster;
        push @retagged_clusters, \@retagged_cluster;
    }
    # Group coordinate triples by their single-letter cluster label:
    my %clusters;
    foreach my $cluster (@retagged_clusters) {
        foreach my $record (@$cluster) {
            my @splits = grep $_, split /\s+/, $record;
            $splits[0] =~ /(\w+?)_.*/;
            my $primary_cluster_label = $1;
            my @coords = @splits[1..$d];
            push @{$clusters{$primary_cluster_label}}, \@coords;
        }
    }
    # One blank-line-separated block per cluster, in case-insensitive label
    # order, so gnuplot can address each cluster by its "index":
    foreach my $key (sort {"\L$a" cmp "\L$b"} keys %clusters) {
        map {print OUTPUT "$_"} map {"@$_\n"} @{$clusters{$key}};
        print OUTPUT "\n\n";
    }
    my @sorted_cluster_keys = sort {"\L$a" cmp "\L$b"} keys %clusters;
    close OUTPUT;
    # Without a pause time the plot window must persist after the script exits:
    my $plot;
    unless (defined $pause_time) {
        $plot = Graphics::GnuplotIF->new( persist => 1 );
    } else {
        $plot = Graphics::GnuplotIF->new();
    }
    my $arg_string = "";
    $plot->gnuplot_cmd( "set hidden3d" ) unless $self->{_show_hidden_in_3D_plots};
    $plot->gnuplot_cmd( "set title \"$visualization_msg\"" );
    $plot->gnuplot_cmd( "set noclip" );
    $plot->gnuplot_cmd( "set pointsize 2" );
    $plot->gnuplot_cmd( "set parametric" );
    $plot->gnuplot_cmd( "set size ratio 1" );
    $plot->gnuplot_cmd( "set xlabel \"X\"" );
    $plot->gnuplot_cmd( "set ylabel \"Y\"" );
    $plot->gnuplot_cmd( "set zlabel \"Z\"" );
    if ($hardcopy_format) {
        $plot->gnuplot_cmd( "set terminal png" );
        my $image_file_name = "$visualization_msg\.$hardcopy_format";
        $plot->gnuplot_cmd( "set output \"$image_file_name\"" );
        $plot->gnuplot_cmd( "unset hidden3d" );
    }
    # set the range for azimuth angles:
    $plot->gnuplot_cmd( "set urange [0:2*pi]" );
    # set the range for the elevation angles:
    $plot->gnuplot_cmd( "set vrange [-pi/2:pi/2]" );
    # Parametric functions for the sphere; its radius is the data scale factor
    # when one has been computed, and 1 otherwise:
    # $plot->gnuplot_cmd( "r=1" );
    if ($self->{_scale_factor}) {
        $plot->gnuplot_cmd( "r=$self->{_scale_factor}" );
    } else {
        $plot->gnuplot_cmd( "r=1" );
    }
    $plot->gnuplot_cmd( "fx(v,u) = r*cos(v)*cos(u)" );
    $plot->gnuplot_cmd( "fy(v,u) = r*cos(v)*sin(u)" );
    $plot->gnuplot_cmd( "fz(v) = r*sin(v)" );
    my $sphere_arg_str = "fx(v,u),fy(v,u),fz(v) notitle with lines lt 0,";
    foreach my $i (0..scalar(keys %clusters)-1) {
        my $j = $i + 1;
        # The following statement puts the titles on the data points
        $arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"$sorted_cluster_keys[$i] \" with points lt $j pt $j, ";
    }
    # NOTE(review): the next statement assigns the boolean *result* of the
    # match to $arg_string; it only works because $1 (the string minus its
    # trailing ", ") is read back on the following line.
    $arg_string = $arg_string =~ /^(.*),[ ]+$/;
    $arg_string = $1;
    $plot->gnuplot_cmd( "splot $sphere_arg_str $arg_string" );
    $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
}
1100              
1101              
1102             ################################### Support Routines ########################################
1103              
# Splits a cluster (ref to array of data tags) into $how_many contiguous
# fragments of roughly equal size and returns them as a list of array refs.
# When the cluster size is not a multiple of $how_many, the leftover elements
# go into the last fragment.  Dies if the split is impossible or if any element
# would be lost.
sub cluster_split {
    my $cluster = shift;
    my $how_many = shift;
    # Guard against a zero $delta below (which was a division-by-zero crash in
    # the original when $how_many exceeded the cluster size):
    die "cannot split a cluster of size " . scalar(@$cluster) .
        " into $how_many fragments\n" if $how_many < 1 || $how_many > @$cluster;
    my @cluster_fragments = map { [] } 0 .. $how_many - 1;
    my $delta = int( scalar(@$cluster) / $how_many );
    foreach my $i (0..@$cluster-1) {
        my $slot = int($i / $delta);
        # BUGFIX: the original could index past the last fragment (returning
        # more fragments than requested); clamp so leftovers land in the final
        # fragment.
        $slot = $how_many - 1 if $slot > $how_many - 1;
        push @{$cluster_fragments[$slot]}, $cluster->[$i];
    }
    my $how_many_accounted_for = reduce {$a + $b} map {scalar(@$_)} @cluster_fragments;
    foreach my $frag (@cluster_fragments) {
        print "fragment: @$frag\n";
    }
    die "the fragmentation could not account for all the data"
        unless @$cluster == $how_many_accounted_for;
    return @cluster_fragments;
}
1124              
# Returns a two-element list (value, index) for the smallest element of the
# given arrayref.  Ties keep the earliest occurrence; an empty array yields
# (undef, undef).
sub minimum {
    my $arr = shift;
    my ($best_val, $best_pos);
    my $pos = 0;
    for my $val (@$arr) {
        if (!defined($best_val) or $val < $best_val) {
            $best_val = $val;
            $best_pos = $pos;
        }
        $pos++;
    }
    return ($best_val, $best_pos);
}
1138              
# Returns a two-element list (min, max) over the defined elements of the given
# arrayref.  Undefined entries are skipped; if no element is defined, both
# returned values are undef.
sub minmax {
    my $arr = shift;
    my ($low, $high);
    for my $val (@$arr) {
        next unless defined $val;
        if (!defined $low) {
            # first defined element seeds both extremes:
            ($low, $high) = ($val, $val);
            next;
        }
        $low  = $val if $val < $low;
        $high = $val if $val > $high;
    }
    return ($low, $high);
}
1156              
# Meant only for constructing a deep copy of an array of arrays: returns a new
# arrayref whose rows are fresh arrayrefs holding copies of the input's scalars,
# so mutating the copy never touches the original.
#
# Fix: the previous element-by-element loop left empty inner rows (and an empty
# outer array) as undef rather than as empty arrayrefs, which broke later
# dereferencing; every row is now copied structurally.
sub deep_copy_AoA {
    my $ref_in = shift;
    return [ map { [ @$_ ] } @$ref_in ];
}
1168              
# For displaying the individual clusters on a terminal screen.  Each cluster is
# displayed through the symbolic names associated with the data points.  Note
# that each cluster is sorted in place, so the caller's arrays are mutated.
sub display_clusters {
    my $clusters = shift;
    for my $idx (0 .. $#{$clusters}) {
        my $cluster = $clusters->[$idx];
        @$cluster = sort @$cluster;          # in-place sort, visible to caller
        my $num_records = scalar @$cluster;
        print "\n\nCluster $idx ($num_records records):\n";
        print " $_" for @$cluster;
    }
    print "\n\n";
}
1185              
# Convenience wrapper: prints a labeled dump of a cluster's mean vector followed
# by its covariance matrix, delegating the actual formatting to display_matrix().
sub display_mean_and_covariance {
    my ($mean, $covariance) = @_;
    print "\nDisplay the mean:\n";
    display_matrix($mean);
    print "\nDisplay the covariance:\n";
    display_matrix($covariance);
}
1194              
# Validates the constructor's keyword-argument names.  Returns 1 when every
# name in @params is on the legal list, 0 as soon as one is not.
#
# Fixes: (1) an empty @params used to return undef, which the caller's
# "== 0" test treated as illegal and croaked spuriously; it now returns 1.
# (2) The hand-rolled double loop with a reset flag is replaced by List::Util's
# any(), which the file already imports.
sub check_for_illegal_params {
    my @params = @_;
    my @legal_params = qw / datafile
                            mask
                            K
                            P
                            terminal_output
                            cluster_search_multiplier
                            max_iterations
                            delta_reconstruction_error
                            visualize_each_iteration
                            show_hidden_in_3D_plots
                            make_png_for_each_iteration
                            debug
                          /;
    foreach my $param (@params) {
        return 0 unless any { $param eq $_ } @legal_params;
    }
    return 1;
}
1223              
# Pretty-prints a Math::GSL::Matrix to STDOUT, four decimal places per entry,
# one matrix row per output line, with a size banner first.
#
# Fixes: map in void context (used purely for its printf side effect) replaced
# by a statement-modifier for loop; stale commented-out debug line removed.
sub display_matrix {
    my $matrix = shift;
    my $nrows = $matrix->rows();
    my $ncols = $matrix->cols();
    print "\nDisplaying a matrix of size $nrows rows and $ncols columns:\n";
    foreach my $i (0..$nrows-1) {
        printf("%.4f ", $_) for $matrix->row($i)->as_list;
        print "\n";
    }
    print "\n\n";
}
1238              
# Returns the transpose of a Math::GSL::Matrix as a new matrix: row i of the
# input becomes column i of the result.  The input matrix is not modified.
sub transpose {
    my $matrix = shift;
    my $nrows = $matrix->rows();
    my $ncols = $matrix->cols();
    # dimensions swap: an (nrows x ncols) input yields an (ncols x nrows) result
    my $result = Math::GSL::Matrix->new($ncols, $nrows);
    for my $row_index (0 .. $nrows - 1) {
        my @row_values = $matrix->row($row_index)->as_list;
        $result->set_col($row_index, \@row_values);
    }
    return $result;
}
1250              
# Element-wise difference of two equal-length arrayrefs, returned as a flat
# list ($vec1 minus $vec2).  Dies when the lengths disagree.
sub vector_subtract {
    my ($vec1, $vec2) = @_;
    die "wrong data types for vector subtract calculation\n" if @$vec1 != @$vec2;
    return map { $vec1->[$_] - $vec2->[$_] } 0 .. $#$vec1;
}
1261              
# Classic Fisher-Yates in-place shuffle (from perl docs): walks the array from
# the last index down to 1, swapping each slot with a uniformly chosen earlier
# (or same) slot.  The rand() call sequence matches the original exactly.
sub fisher_yates_shuffle {
    my $arr = shift;
    for (my $i = $#$arr; $i > 0; $i--) {
        my $swap_with = int rand($i + 1);
        ($arr->[$i], $arr->[$swap_with]) = ($arr->[$swap_with], $arr->[$i]);
    }
}
1271              
1272             ######################### Generating Synthetic Data for Manifold Clustering ##########################
1273              
1274             ################################## Class DataGenerator ########################################
1275              
1276             ## The embedded class defined below is for generating synthetic data for
1277             ## experimenting with linear manifold clustering when the data resides on the
1278             ## surface of a sphere. See the script generate_data_on_a_sphere.pl in the
1279             ## `examples' directory for how to specify the number of clusters and the spread of
1280             ## each cluster in the data that is generated.
1281              
1282             package DataGenerator;
1283              
1284             use strict;
1285             use Carp;
1286              
# Constructor for the synthetic-data generator.  Required: output_file and
# total_number_of_samples_needed (croaks if missing).  Optional with defaults:
# number_of_clusters_on_sphere (3), cluster_width (0.1),
# show_hidden_in_3D_plots (1), debug (0).
#
# Fix: show_hidden_in_3D_plots and debug now default with the defined-or
# operator.  The previous "|| 1" silently turned an explicit
# show_hidden_in_3D_plots => 0 (the documented usage in the SYNOPSIS) back
# into 1, making it impossible to disable.  The file already requires 5.10,
# so // is available.
sub new {
    my ($class, %args) = @_;
    my @params = keys %args;
    croak "\nYou have used a wrong name for a keyword argument " .
          "--- perhaps a misspelling\n"
          if _check_for_illegal_params3(@params) == 0;
    bless {
        _output_file                    => $args{output_file}
                                           || croak("name for output_file required"),
        _total_number_of_samples_needed => $args{total_number_of_samples_needed}
                                           || croak("total_number_of_samples_needed required"),
        _number_of_clusters_on_sphere   => $args{number_of_clusters_on_sphere} || 3,
        _cluster_width                  => $args{cluster_width} || 0.1,
        _show_hidden_in_3D_plots        => $args{show_hidden_in_3D_plots} // 1,
        _debug                          => $args{debug} // 0,
    }, $class;
}
1304              
# Validates the DataGenerator constructor's keyword-argument names.  Returns 1
# when every name in @params is legal, 0 otherwise.
#
# Fixes: (1) 'debug' was missing from the legal list even though new() accepts
# and stores $args{debug}, so any caller passing debug => ... croaked; it is
# now legal.  (2) An empty @params used to return undef, which the caller's
# "== 0" test treated as illegal; it now returns 1.
sub _check_for_illegal_params3 {
    my @params = @_;
    my @legal_params = qw / output_file
                            total_number_of_samples_needed
                            number_of_clusters_on_sphere
                            cluster_width
                            show_hidden_in_3D_plots
                            debug
                          /;
    foreach my $param (@params) {
        my $is_legal = 0;
        foreach my $legal (@legal_params) {
            if ($param eq $legal) {
                $is_legal = 1;
                last;
            }
        }
        return 0 unless $is_legal;
    }
    return 1;
}
1326              
1327             ## We first generate a set of points randomly on the unit sphere --- the number of
1328             ## points being equal to the number of clusters desired. These points will serve as
1329             ## cluster means (or, as cluster centroids) subsequently when we ask
1330             ## Math::Random::random_multivariate_normal($N, @m, @covar) to return $N number of
1331             ## points on the sphere. The second argument is the cluster mean and the third
1332             ## argument the cluster covariance. For the synthetic data, we set the cluster
1333             ## covariance to a 2x2 diagonal matrix, with the (0,0) element corresponding to the
1334             ## variance along the azimuth direction and the (1,1) element corresponding to the
1335             ## variance along the elevation direction.
1336             ##
1337             ## When you generate the points in the 2D spherical coordinates of
1338             ## (azimuth,elevation), you also need `wrap-around' logic for those points yielded by
1339             ## the multivariate-normal function whose azimuth angle is outside the interval
1340             ## (0,360) and/or whose elevation angle is outside the interval (-90,90).
1341             ##
1342             ## Note that the first of the two dimensions for which the multivariate-normal
1343             ## function returns the points is for the azimuth angle and the second for the
1344             ## elevation angle.
1345             ##
1346             ## With regard to the relationship of the Cartesian coordinates to the spherical
1347             ## (azimuth, elevation) coordinates, we assume that (x,y) is the horizontal plane
## and z the vertical axis. The elevation angle theta is measured with respect to
1349             ## the XY-plane. The highest point on the sphere (the Zenith) corresponds to the
1350             ## elevation angle of +90 and the lowest points on the sphere (the Nadir)
## corresponds to the elevation angle of -90. The azimuth is measured with respect
## to the X-axis. The range of the azimuth is from 0 to 360 degrees. The elevation is
1353             ## measured from the XY plane and its range is (-90,90) degrees.
# Generates the synthetic data set: $K Gaussian clusters in (azimuth,
# elevation) space, wraps out-of-range angles back onto the sphere, labels and
# shuffles the points, and writes them as "label,x,y,z" CSV rows to the
# configured output file.
sub gen_data_and_write_to_csv {
    my $self = shift;
    my $K = $self->{_number_of_clusters_on_sphere};
    # $N is the number of samples to be generated for each cluster:
    my $N = int($self->{_total_number_of_samples_needed} / $K);
    my $output_file = $self->{_output_file};
    # Designate all of the data elements in a cluster by a letter that is followed by
    # an integer that identifies a specific data element.
    my @point_labels = ('a'..'z');
    # Our first job is to define $K random points in the 2D space (azimuth,
    # elevation) to serve as cluster centers on the surface of the sphere. This we
    # do by calling a uniformly distributed 1-D random number generator, first for
    # the azimuth and then for the elevation in the loop shown below:
    my @cluster_centers;
    my @covariances;
    foreach my $i (0..$K-1) {
        my $azimuth = rand(360);
        # NOTE(review): rand(90) - 90 yields elevations in (-90,0) only, i.e.
        # every cluster center lands in the lower hemisphere, although the
        # header comment above states the elevation range is (-90,90) ---
        # confirm whether rand(180) - 90 was intended.
        my $elevation = rand(90) - 90;
        my @mean = ($azimuth, $elevation);
        push @cluster_centers, \@mean;
        my $cluster_covariance;
        # The j-th dimension is for azimuth and k-th for elevation for the directions
        # to surface of the sphere: a diagonal 2x2 covariance whose standard
        # deviations are _cluster_width scaled by the full angular ranges
        # (360 for azimuth, 180 for elevation).
        foreach my $j (0..1) {
            foreach my $k (0..1) {
                $cluster_covariance->[$j]->[$k] = ($self->{_cluster_width} * 360.0) ** 2
                    if $j == 0 && $k == 0;
                $cluster_covariance->[$j]->[$k] = ($self->{_cluster_width} * 180.0) ** 2
                    if $j == 1 && $k == 1;
                $cluster_covariance->[$j]->[$k] = 0.0 if $j != $k;
            }
        }
        push @covariances, $cluster_covariance;
    }
    if ($self->{_debug}) {
        foreach my $i (0..$K-1) {
            print "\n\nCluster center: @{$cluster_centers[$i]}\n";
            print "\nCovariance:\n";
            foreach my $j (0..1) {
                foreach my $k (0..1) {
                    print "$covariances[$i]->[$j]->[$k] ";
                }
                print "\n";
            }
        }
    }
    my @data_dump;
    foreach my $i (0..$K-1) {
        my @m = @{shift @cluster_centers};
        my @covar = @{shift @covariances};
        # $N samples drawn around this cluster's (azimuth, elevation) mean:
        my @new_data = Math::Random::random_multivariate_normal($N, @m, @covar);
        if ($self->{_debug}) {
            print "\nThe points for cluster $i:\n";
            map { print "@$_ "; } @new_data;
            print "\n\n";
        }
        # Wrap-around logic: azimuths outside [0,360) wrap modulo 360; an
        # elevation beyond +/-90 is reflected back and its azimuth flipped by
        # 180 degrees (crossing over a pole lands on the opposite meridian).
        my @wrapped_data;
        foreach my $d (@new_data) {
            my $wrapped_d;
            if ($d->[0] >= 360.0) {
                $wrapped_d->[0] = $d->[0] - 360.0;
            } elsif ($d->[0] < 0) {
                $wrapped_d->[0] = 360.0 - abs($d->[0]);
            }
            if ($d->[1] >= 90.0) {
                $wrapped_d->[0] = POSIX::fmod($d->[0] + 180.0, 360);
                $wrapped_d->[1] = 180.0 - $d->[1];
            } elsif ($d->[1] < -90.0) {
                $wrapped_d->[0] = POSIX::fmod($d->[0] + 180, 360);
                $wrapped_d->[1] = -180.0 - $d->[1];
            }
            # Angles already in range pass through unchanged:
            $wrapped_d->[0] = $d->[0] unless defined $wrapped_d->[0];
            $wrapped_d->[1] = $d->[1] unless defined $wrapped_d->[1];
            push @wrapped_data, $wrapped_d;
        }
        if ($self->{_debug}) {
            print "\nThe unwrapped points for cluster $i:\n";
            map { print "@$_ "; } @wrapped_data;
            print "\n\n";
        }
        # Prefix each record with a tag like "a_0", "a_1", ... for cluster $i:
        my $label = $point_labels[$i];
        my $j = 0;
        @new_data = map {unshift @$_, $label."_".$j; $j++; $_} @wrapped_data;
        push @data_dump, @new_data;
    }
    if ($self->{_debug}) {
        print "\n\nThe labeled points for clusters:\n";
        map { print "@$_\n"; } @data_dump;
    }
    # Randomize record order so clusters are interleaved in the output file:
    fisher_yates_shuffle( \@data_dump );
    # NOTE(review): 2-arg bareword open with no error check --- a bad
    # $output_file fails silently; consider 3-arg lexical open with "or die".
    open OUTPUT, ">$output_file";
    my $total_num_of_points = $N * $K;
    print "Total number of data points that will be written out to the file: $total_num_of_points\n"
        if $self->{_debug};
    # Convert each (azimuth, elevation) pair to Cartesian (x,y,z) on the unit
    # sphere and write one CSV row per point.
    # NOTE(review): $azimuth and $elevation were generated in degrees above,
    # but Perl's cos()/sin() interpret their argument in radians --- confirm
    # whether a degrees-to-radians conversion was intended here.
    foreach my $ele (@data_dump) {
        my ($x,$y,$z);
        my $label = $ele->[0];
        my $azimuth = $ele->[1];
        my $elevation = $ele->[2];
        $x = cos($elevation) * cos($azimuth);
        $y = cos($elevation) * sin($azimuth);
        $z = sin($elevation);
        my $csv_str = join ",", ($label,$x,$y,$z);
        print OUTPUT "$csv_str\n";
    }
    print "\n\n";
    print "Data written out to file $output_file\n" if $self->{_debug};
    close OUTPUT;
}
1463              
# This version for the embedded class for data generation.
#
# Reads the CSV data file produced by gen_data_and_write_to_csv(), groups the
# records by the cluster letter embedded in each symbolic tag, writes the
# groups to a temp file (one blank-line-separated gnuplot "index" per cluster),
# and renders the clusters on a unit sphere with gnuplot.
#
# Fixes: (1) the slurp-read of the input handle was missing from the split
# (the filehandle read had been lost), leaving broken code --- restored;
# (2) the input was opened via its basename instead of $datafile, which fails
# whenever the caller passes a path --- now opens $datafile; (3) bareword
# 2-arg opens replaced with checked 3-arg lexical opens; (4) the trailing-comma
# strip no longer assigns the match-success flag to $arg_string first.
sub visualize_data_on_sphere {
    my $self = shift;
    my $datafile = shift;
    my $filename = File::Basename::basename($datafile);
    my $temp_file = "__temp_" . $filename;
    $temp_file =~ s/\.\w+$/\.txt/;
    unlink $temp_file if -e $temp_file;
    open my $out, '>', $temp_file
        or die "Unable to open a temp file in this directory: $!";
    open my $in, '<', $datafile or die "Unable to open $datafile: $!";
    local $/ = undef;                       # slurp mode
    my @all_records = split /\s+/, <$in>;
    close $in;
    my %clusters;
    foreach my $record (@all_records) {
        my @splits = split /,/, $record;
        my $record_name = shift @splits;
        # The cluster label is the letter prefix of a tag such as "a_17":
        $record_name =~ /(\w+?)_.*/;
        my $primary_cluster_label = $1;
        push @{$clusters{$primary_cluster_label}}, \@splits;
    }
    # One block per cluster, in case-insensitive label order:
    foreach my $key (sort {"\L$a" cmp "\L$b"} keys %clusters) {
        print {$out} "@$_\n" for @{$clusters{$key}};
        print {$out} "\n\n";
    }
    my @sorted_cluster_keys = sort {"\L$a" cmp "\L$b"} keys %clusters;
    close $out or die "Unable to close $temp_file: $!";
    my $plot = Graphics::GnuplotIF->new( persist => 1 );
    my $arg_string = "";
    $plot->gnuplot_cmd( "set noclip" );
    $plot->gnuplot_cmd( "set hidden3d" ) unless $self->{_show_hidden_in_3D_plots};
    $plot->gnuplot_cmd( "set pointsize 2" );
    $plot->gnuplot_cmd( "set parametric" );
    $plot->gnuplot_cmd( "set size ratio 1" );
    $plot->gnuplot_cmd( "set xlabel \"X\"" );
    $plot->gnuplot_cmd( "set ylabel \"Y\"" );
    $plot->gnuplot_cmd( "set zlabel \"Z\"" );
    # set the range for azimuth angles:
    $plot->gnuplot_cmd( "set urange [0:2*pi]" );
    # set the range for the elevation angles:
    $plot->gnuplot_cmd( "set vrange [-pi/2:pi/2]" );
    # Parametric functions for the sphere
    $plot->gnuplot_cmd( "r=1" );
    $plot->gnuplot_cmd( "fx(v,u) = r*cos(v)*cos(u)" );
    $plot->gnuplot_cmd( "fy(v,u) = r*cos(v)*sin(u)" );
    $plot->gnuplot_cmd( "fz(v) = r*sin(v)" );
    my $sphere_arg_str = "fx(v,u),fy(v,u),fz(v) notitle with lines lt 0,";
    foreach my $i (0..scalar(keys %clusters)-1) {
        my $j = $i + 1;
        # The following statement puts the titles on the data points
        $arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"$sorted_cluster_keys[$i] \" with points lt $j pt $j, ";
    }
    # Strip the trailing ", " before handing the plot list to gnuplot:
    ($arg_string) = $arg_string =~ /^(.*),[ ]+$/;
    $plot->gnuplot_cmd( "splot $sphere_arg_str $arg_string" );
}
1521              
# Destructor: removes the "__temp_*.txt" scratch file that
# visualize_data_on_sphere() derives from the configured output file name.
sub DESTROY {
    use File::Basename;
    my $self = shift;
    my $scratch = basename($self->{_output_file});
    $scratch =~ s/\.\w+$/\.txt/;
    unlink "__temp_" . $scratch;
}
1528              
# Classic Fisher-Yates in-place shuffle (from perl docs), duplicated here so
# the DataGenerator package is self-contained.  Walks the indices from the
# top down to 1, swapping each with a uniformly chosen earlier (or same) slot;
# the rand() call sequence matches the original exactly.
sub fisher_yates_shuffle {
    my $arr = shift;
    foreach my $i (reverse 1 .. $#$arr) {
        my $pick = int rand($i + 1);
        ($arr->[$i], $arr->[$pick]) = ($arr->[$pick], $arr->[$i]);
    }
}
1538              
1539             1;
1540              
1541             =pod
1542              
1543             =head1 NAME
1544              
1545             Algorithm::LinearManifoldDataClusterer --- for clustering data that resides on a
1546             low-dimensional manifold in a high-dimensional measurement space
1547              
1548             =head1 SYNOPSIS
1549              
1550             # You'd begin with:
1551              
1552             use Algorithm::LinearManifoldDataClusterer;
1553              
1554             # You'd next name the data file:
1555              
1556             my $datafile = "mydatafile.csv";
1557              
1558             # Your data must be in the CSV format, with one of the columns containing a unique
1559             # symbolic tag for each data record. You tell the module which column has the
1560             # symbolic tag and which columns to use for clustering through a mask such as
1561              
1562             my $mask = "N111";
1563              
1564             # which says that the symbolic tag is in the first column and that the numerical
1565             # data in the next three columns is to be used for clustering. If your data file
1566             # had, say, five columns and you wanted only the last three columns to be
1567             # clustered, the mask would become `N0111' assuming that that the symbolic tag is
1568             # still in the first column.
1569              
1570             # Now you must construct an instance of the clusterer through a call such as:
1571              
1572             my $clusterer = Algorithm::LinearManifoldDataClusterer->new(
1573             datafile => $datafile,
1574             mask => $mask,
1575             K => 3,
1576             P => 2,
1577             max_iterations => 15,
1578             cluster_search_multiplier => 2,
1579             delta_reconstruction_error => 0.001,
1580             terminal_output => 1,
1581             visualize_each_iteration => 1,
1582             show_hidden_in_3D_plots => 1,
1583             make_png_for_each_iteration => 1,
1584             );
1585              
1586             # where the parameter K specifies the number of clusters you expect to find in
1587             # your data and the parameter P is the dimensionality of the manifold on which the
1588             # data resides. The parameter cluster_search_multiplier is for increasing the
1589             # odds that the random seeds chosen initially for clustering will populate all the
1590             # clusters. Set this parameter to a low number like 2 or 3. The parameter
1591             # max_iterations places a hard limit on the number of iterations that the
1592             # algorithm is allowed. The actual number of iterations is controlled by the
1593             # parameter delta_reconstruction_error. The iterations stop when the change in
1594             # the total "reconstruction error" from one iteration to the next is smaller than
1595             # the value specified by delta_reconstruction_error.
1596              
1597             # Next, you must get the module to read the data for clustering:
1598              
1599             $clusterer->get_data_from_csv();
1600              
1601             # Finally, you invoke linear manifold clustering by:
1602              
1603             my $clusters = $clusterer->linear_manifold_clusterer();
1604              
1605             # The value returned by this call is a reference to an array of anonymous arrays,
1606             # with each anonymous array holding one cluster. If you wish, you can have the
1607             # module write the clusters to individual files by the following call:
1608              
1609             $clusterer->write_clusters_to_files($clusters);
1610              
1611             # If you want to see how the reconstruction error changes with the iterations, you
1612             # can make the call:
1613              
1614             $clusterer->display_reconstruction_errors_as_a_function_of_iterations();
1615              
1616             # When your data is 3-dimensional and when the clusters reside on a surface that
1617             # is more or less spherical, you can visualize the clusters by calling
1618              
1619             $clusterer->visualize_clusters_on_sphere("final clustering", $clusters);
1620              
1621             # where the first argument is a label to be displayed in the 3D plot and the
1622             # second argument the value returned by calling linear_manifold_clusterer().
1623              
1624             # SYNTHETIC DATA GENERATION:
1625              
1626             # The module includes an embedded class, DataGenerator, for generating synthetic
1627             # three-dimensional data that can be used to experiment with the clustering code.
1628             # The synthetic data, written out to a CSV file, consists of Gaussian clusters on
1629             # the surface of a sphere. You can control the number of clusters, the width of
1630             # each cluster, and the number of samples in the clusters by giving appropriate
1631             # values to the constructor parameters as shown below:
1632              
1633             use strict;
1634             use Algorithm::LinearManifoldDataClusterer;
1635              
1636             my $output_file = "4_clusters_on_a_sphere_1000_samples.csv";
1637              
1638             my $training_data_gen = DataGenerator->new(
1639             output_file => $output_file,
1640             cluster_width => 0.015,
1641             total_number_of_samples_needed => 1000,
1642             number_of_clusters_on_sphere => 4,
1643             show_hidden_in_3D_plots => 0,
1644             );
1645             $training_data_gen->gen_data_and_write_to_csv();
1646             $training_data_gen->visualize_data_on_sphere($output_file);
1647              
1648              
1649             =head1 DESCRIPTION
1650              
1651             If you are new to machine learning and data clustering on linear and nonlinear
1652             manifolds, your first question is likely to be: What is a manifold? A manifold is a
1653             space that is locally Euclidean. And a space is locally Euclidean if it allows for
1654             the points in a small neighborhood to be represented by Cartesian coordinates and if
1655             the distances between the points in the neighborhood are given by the Euclidean
1656             metric. For an example, the set of all points on the surface of a sphere does NOT
1657             constitute a Euclidean space. Nonetheless, if you confined your attention to a small
1658             enough neighborhood around a point, the space would seem to be locally Euclidean.
1659             The surface of a sphere is a 2-dimensional manifold embedded in a 3-dimensional
1660             space. A plane in a 3-dimensional space is also a 2-dimensional manifold. You would
1661             think of the surface of a sphere as a nonlinear manifold, whereas a plane would be a
1662             linear manifold. However, note that any nonlinear manifold is locally a linear
1663             manifold. That is, given a sufficiently small neighborhood on a nonlinear manifold,
1664             you can always think of it as a locally flat surface.
1665              
1666             As to why we need machine learning and data clustering on manifolds, there exist many
1667             important applications in which the measured data resides on a nonlinear manifold.
1668             For example, when you record images of a human face from different angles, all the
1669             image pixels taken together fall on a low-dimensional surface in a high-dimensional
1670             measurement space. The same is believed to be true for the satellite images of a land
1671             mass that are recorded with the sun at different angles with respect to the direction
1672             of the camera.
1673              
1674             Reducing the dimensionality of the sort of data mentioned above is critical to the
1675             proper functioning of downstream classification algorithms, and the most popular
1676             traditional method for dimensionality reduction is the Principal Components Analysis
1677             (PCA) algorithm. However, using PCA is tantamount to passing a linear least-squares
1678             hyperplane through the surface on which the data actually resides. As to why that
1679             might be a bad thing to do, just imagine the consequences of assuming that your data
1680             falls on a straight line when, in reality, it falls on a strongly curving arc. This
1681             is exactly what happens with PCA --- it gives you a linear manifold approximation to
1682             your data that may actually reside on a curved surface.
1683              
1684             That brings us to the purpose of this module, which is to cluster data that resides
1685             on a nonlinear manifold. Since a nonlinear manifold is locally linear, we can think
1686             of each data cluster on a nonlinear manifold as falling on a locally linear portion
1687             of the manifold, meaning on a hyperplane. The logic of the module is based on
1688             finding a set of hyperplanes that best describes the data, with each hyperplane
1689             derived from a local data cluster. This is like constructing a piecewise linear
1690             approximation to data that falls on a curve as opposed to constructing a single
1691             straight line approximation to all of the data. So whereas the frequently used PCA
1692             algorithm gives you a single hyperplane approximation to all your data, what this
1693             module returns is a set of hyperplane approximations, with each hyperplane derived by
1694             applying the PCA algorithm locally to a data cluster.
1695              
1696             That brings us to the problem of how to actually discover the best set of hyperplane
1697             approximations to the data. What is probably the most popular algorithm today for
1698             that purpose is based on the following key idea: Given a set of subspaces to which a
1699             data element can be assigned, you assign it to that subspace for which the
1700             B is the least. But what do we mean by a B and what
B<reconstruction error> is the least. But what do we mean by a B<subspace> and what
is B<reconstruction error>?
1703             To understand the notions of B and B, let's revisit
To understand the notions of B<subspace> and B<reconstruction error>, let's revisit
1705             algorithm consists of: (1) Subtracting from each data element the global mean of the
1706             data; (2) Calculating the covariance matrix of the data; (3) Carrying out an
1707             eigendecomposition of the covariance matrix and ordering the eigenvectors according
1708             to decreasing values of the corresponding eigenvalues; (4) Forming a B by
to decreasing values of the corresponding eigenvalues; (4) Forming a B<subspace> by
1710             small; and, finally, (5) projecting all the data elements into the subspace so
1711             formed. The error incurred in representing a data element by its projection into the
1712             subspace is known as the B. This error is the projection of
subspace is known as the B<reconstruction error>. This error is the projection of
1714              
I<In this module, instead of constructing a single global
subspace in the manner described above, we construct a set of subspaces, one for each
data cluster on the nonlinear manifold. After the subspaces have been constructed, a
data element is assigned to that subspace for which the reconstruction error is the
least.> On the face of it, this sounds like a chicken-and-egg sort of a problem. You
1720             need to have already clustered the data in order to construct the subspaces at
1721             different places on the manifold so that you can figure out which cluster to place a
1722             data element in.
1723              
1724             Such problems, when they do possess a solution, are best tackled through iterative
1725             algorithms in which you start with a guess for the final solution, you rearrange the
1726             measured data on the basis of the guess, and you then use the new arrangement of the
1727             data to refine the guess. Subsequently, you iterate through the second and the third
1728             steps until you do not see any discernible changes in the new arrangements of the
1729             data. This forms the basis of the clustering algorithm that is described under
1730             B in the section that follows. This algorithm was first proposed in the
1731             article "Dimension Reduction by Local Principal Component Analysis" by Kambhatla and
1732             Leen that appeared in the journal Neural Computation in 1997.
1733              
1734             Unfortunately, experiments show that the algorithm as proposed by Kambhatla and Leen
1735             is much too sensitive to how the clusters are seeded initially. To get around this
1736             limitation of the basic clustering-by-minimization-of-reconstruction-error, this
1737             module implements a two phased approach. In B, we introduce a multiplier
1738             effect in our search for clusters by looking for C clusters instead of the main
1739             C clusters. In this manner, we increase the odds that each original cluster will
1740             be visited by one or more of the C randomly selected seeds at the beginning,
1741             where C is the integer value given to the constructor parameter
1742             C. Subsequently, we merge the clusters that belong
1743             together in order to form the final C clusters. That work is done in B
1744             of the algorithm.
1745              
1746             For the cluster merging operation in Phase 2, we model the C clusters as the
1747             nodes of an attributed graph in which the weight given to an edge connecting a pair
1748             of nodes is a measure of the similarity between the two clusters corresponding to the
1749             two nodes. Subsequently, we use spectral clustering to merge the most similar nodes
1750             in our quest to partition the data into C clusters. For that purpose, we use the
1751             Shi-Malik normalized cuts algorithm. The pairwise node similarity required by this
1752             algorithm is measured by the C method of the
1753             C class. The smaller the overall reconstruction error
1754             when all of the data elements in one cluster are projected into the other's subspace
1755             and vice versa, the greater the similarity between two clusters. Additionally, the
1756             smaller the distance between the mean vectors of the clusters, the greater the
1757             similarity between two clusters. The overall similarity between a pair of clusters
1758             is a combination of these two similarity measures.
1759              
1760             =head1 SUMMARY OF THE ALGORITHM
1761              
1762             We now present a summary of the two phases of the algorithm implemented in this
1763             module. Note particularly the important role played by the constructor parameter
1764             C. It is only when the integer value given to this
1765             parameter is greater than 1 that Phase 2 of the algorithm kicks in.
1766              
1767             =over 4
1768              
1769             =item B
1770              
1771             Through iterative minimization of the total reconstruction error, this phase of the
1772             algorithm returns C clusters where C is the actual number of clusters you
1773             expect to find in your data and where C is the integer value given to the
1774             constructor parameter C. As previously mentioned, on
1775             account of the sensitivity of the reconstruction-error based clustering to how the
1776             clusters are initially seeded, our goal is to look for C clusters with the idea
1777             of increasing the odds that each of the C clusters will see at least one seed at
1778             the beginning of the algorithm.
1779              
1780             =over 4
1781              
1782             =item Step 1:
1783              
1784             Randomly choose C data elements to serve as the seeds for that many clusters.
1785              
1786             =item Step 2:
1787              
1788             Construct initial C clusters by assigning each data element to that cluster
1789             whose seed it is closest to.
1790              
1791             =item Step 3:
1792              
1793             Calculate the mean and the covariance matrix for each of the C clusters and
1794             carry out an eigendecomposition of the covariance matrix. Order the eigenvectors in
1795             decreasing order of the corresponding eigenvalues. The first C<P> eigenvectors
1796             define the subspace for that cluster. Use the space spanned by the remaining
1797             eigenvectors --- we refer to them as the trailing eigenvectors --- for calculating
1798             the reconstruction error.
1799              
1800             =item Step 4
1801              
1802             Taking into account the mean associated with each cluster, re-cluster the entire data
1803             set on the basis of the least reconstruction error. That is, assign each data
1804             element to that subspace for which it has the smallest reconstruction error.
1805             Calculate the total reconstruction error associated with all the data elements. (See
1806             the definition of the reconstruction error in the C section.)
1807              
1808             =item Step 5
1809              
1810             Stop iterating if the change in the total reconstruction error from the previous
1811             iteration to the current iteration is less than the value specified by the constructor
1812             parameter C. Otherwise, go back to Step 3.
1813              
1814             =back
1815              
1816             =item B
1817              
1818             This phase of the algorithm uses graph partitioning to merge the C clusters back
1819             into the C clusters you expect to see in your data. Since the algorithm whose
1820             steps are presented below is invoked recursively, let's assume that we have C
1821             clusters that need to be merged by invoking the Shi-Malik spectral clustering
1822             algorithm as described below:
1823              
1824             =over 4
1825              
1826             =item Step 1
1827              
1828             Form a graph whose C nodes represent the C clusters. (For the very first
1829             invocation of this step, we have C.)
1830              
1831             =item Step 2
1832              
1833             Construct an C similarity matrix for the nodes in the graph. The C<(i,j)>-th
1834             element of this matrix is the pairwise similarity between the clusters indexed C
1835             and C. Calculate this similarity on the basis of the following two criteria: (1)
1836             The total reconstruction error when the data elements in the cluster indexed C are
1837             projected into the subspace for the cluster indexed C and vice versa. And (2) The
1838             distance between the mean vectors associated with the two clusters.
1839              
1840             =item Step 3
1841              
1842             Calculate the symmetric normalized Laplacian of the similarity matrix. We use C
1843             to denote the symmetric normalized Laplacian.
1844              
1845             =item Step 4
1846              
1847             Carry out an eigendecomposition of the C matrix and choose the eigenvector
1848             corresponding to the second smallest eigenvalue for bipartitioning the graph on the
1849             basis of the sign of the values in the eigenvector.
1850              
1851             =item Step 5
1852              
1853             If the bipartition of the previous step yields one-versus-the-rest kind of a
1854             partition, add the `one' cluster to the output list of clusters and invoke graph
1855             partitioning recursively on the `rest' by going back to Step 1. On the other hand,
1856             if the cardinality of both the partitions returned by Step 4 exceeds 1, invoke graph
1857             partitioning recursively on both partitions. Stop when the list of clusters in the
1858             output list equals C.
1859              
1860             =item Step 6
1861              
1862             In general, the C clusters obtained by recursive graph partitioning will not cover
1863             all of the data. So, for the final step, assign each data element not covered by the
1864             C clusters to that cluster for which its reconstruction error is the least.
1865              
1866             =back
1867              
1868             =back
1869              
1870             =head1 FAIL-FIRST BIAS OF THE MODULE
1871              
1872             As you would expect from all such iterative algorithms, the module carries no
1873             theoretical guarantee that it will give you correct results. But what does that mean?
1874             Suppose you create synthetic data that consists of Gaussian looking disjoint clusters
1875             on the surface of a sphere, would the module always succeed in separating out the
1876             clusters? The module carries no guarantees to that effect --- especially considering
1877             that Phase 1 of the algorithm is sensitive to how the clusters are seeded at the
1878             beginning. Although this sensitivity is mitigated by the cluster merging step when
1879             greater-than-1 value is given to the constructor option C,
1880             a plain vanilla implementation of the steps in Phase 1 and Phase 2 would nonetheless
1881             carry significant risk that you'll end up with incorrect clustering results.
1882              
1883             To further reduce this risk, the module has been programmed so that it terminates
1884             immediately if it suspects that the cluster solution being developed is unlikely to
1885             be fruitful. The heuristics used for such termination are conservative --- since the
1886             cost of termination is only that the user will have to run the code again, which at
1887             worst only carries an element of annoyance with it. The three "Fail First"
1888             heuristics currently programmed into the module are based on simple "unimodality
1889             testing", testing for "congruent clusters," and testing for dominant cluster support
1890             in the final stage of the recursive invocation of the graph partitioning step. The
1891             unimodality testing is as elementary as it can be --- it only checks for the number
1892             of data samples within a certain radius of the mean in relation to the total number
1893             of data samples in the cluster.
1894              
1895             When the program terminates under such conditions, it prints out the following message
1896             in your terminal window:
1897              
1898             Bailing out!
1899              
1900             Given the very simple nature of testing that is carried for the "Fail First" bias, do
1901             not be surprised if the results you get for your data simply look wrong. If most
1902             runs of the module produce wrong results for your application, that means that the
1903             module logic needs to be strengthened further. The author of this module would love
1904             to hear from you if that is the case.
1905              
1906             =head1 METHODS
1907              
1908             The module provides the following methods for linear-manifold based clustering, for
1909             cluster visualization, and for the generation of data for testing the clustering algorithm:
1910              
1911             =over 4
1912              
1913             =item B
1914              
1915             my $clusterer = Algorithm::LinearManifoldDataClusterer->new(
1916             datafile => $datafile,
1917             mask => $mask,
1918             K => $K,
1919             P => $P,
1920             cluster_search_multiplier => $C,
1921             max_iterations => $max_iter,
1922             delta_reconstruction_error => 0.001,
1923             terminal_output => 1,
1924             write_clusters_to_files => 1,
1925             visualize_each_iteration => 1,
1926             show_hidden_in_3D_plots => 1,
1927             make_png_for_each_iteration => 1,
1928             );
1929              
1930             A call to C constructs a new instance of the
1931             C class.
1932              
1933             =back
1934              
1935             =head2 Constructor Parameters
1936              
1937             =over 8
1938              
1939             =item C:
1940              
1941             This parameter names the data file that contains the multidimensional data records
1942             you want the module to cluster. This file must be in CSV format and each record in
1943             the file must include a symbolic tag for the record. Here are first few rows of such
1944             a CSV file in the C directory:
1945              
1946             d_161,0.0739248630173239,0.231119293395665,-0.970112873251437
1947             a_59,0.459932215884786,0.0110216469739639,0.887885623314902
1948             a_225,0.440503220903039,-0.00543366086464691,0.897734586447273
1949             a_203,0.441656364946433,0.0437191337788422,0.896118459046532
1950             ...
1951             ...
1952              
1953             What you see in the first column --- C, C, C, C --- are
1954             the symbolic tags associated with four 3-dimensional data records.
1955              
1956             =item C:
1957              
1958             This parameter supplies the mask to be applied to the columns of your data file. For
1959             the data file whose first few records are shown above, the mask should be C
1960             since the symbolic tag is in the first column of the CSV file and since, presumably,
1961             you want to cluster the data in the next three columns.
1962              
1963             =item C:
1964              
1965             This parameter supplies the number of clusters you are looking for.
1966              
1967             =item C<P>:
1968              
1969             This parameter specifies the dimensionality of the manifold on which the data resides.
1970              
1971             =item C:
1972              
1973             As should be clear from the C section, this parameter plays
1974             a very important role in the successful clustering of your data. As explained in
1975             C, the basic algorithm used for clustering in Phase 1 --- clustering by
1976             the minimization of the reconstruction error --- is sensitive to the choice of the
1977             cluster seeds that are selected randomly at the beginning of the algorithm. Should
1978             it happen that the seeds miss one or more of the clusters, the clustering produced is
1979             likely to not be correct. By giving an integer value to C
1980             that is greater than 1, you'll increase the odds that the randomly selected seeds
1981             will see all clusters. When you set C to C, you ask
1982             Phase 1 of the algorithm to construct C clusters as opposed to just C
1983             clusters. Subsequently, in Phase 2, the module uses inter-cluster similarity based
1984             graph partitioning to merge the C clusters into C clusters.
1985              
1986             =item C:
1987              
1988             This hard limits the number of iterations in Phase 1 of the algorithm. Ordinarily,
1989             the iterations stop automatically when the change in the total reconstruction error
1990             from one iteration to the next is less than the value specified by the parameter
1991             C.
1992              
1993             =item C:
1994              
1995             It is this parameter that determines when the iterations will actually terminate in
1996             Phase 1 of the algorithm. When the difference in the total reconstruction error from
1997             one iteration to the next is less than the value given to this parameter, the
1998             iterations come to an end. B
1999             data samples that need to be clustered, the larger must be the value given to this
2000             parameter. That makes intuitive sense since the total reconstruction error is the
2001             sum of all such errors for all of the data elements.> Unfortunately, the best value
2002             for this parameter does NOT appear to depend linearly on the total number of data
2003             records to be clustered.
2004              
2005             =item C:
2006              
2007             When this parameter is set, you will see in your terminal window the different
2008             clusters as lists of the symbolic tags associated with the data records. You will
2009             also see in your terminal window the output produced by the different steps of the
2010             graph partitioning algorithm as smaller clusters are merged to form the final C
2011             clusters --- assuming that you set the parameter C to an
2012             integer value that is greater than 1.
2013              
2014             =item C:
2015              
2016             As its name implies, when this option is set to 1, you'll see 3D plots of the
2017             clustering results for each iteration --- but only if your data is 3-dimensional.
2018              
2019             =item C:
2020              
2021             This parameter is important for controlling the visualization of the clusters on the
2022             surface of a sphere. If the clusters are too spread out, seeing all of the clusters
2023             all at once can be visually confusing. When you set this parameter, the clusters on
2024             the back side of the sphere will not be visible. Note that no matter how you set
2025             this parameter, you can interact with the 3D plot of the data and rotate it with your
2026             mouse pointer to see all of the data that is output by the clustering code.
2027              
2028             =item C:
2029              
2030             If you set this option to 1, the module will output a Gnuplot in the form of a PNG
2031             image for each iteration in Phase 1 of the algorithm. In Phase 2, the module will
2032             output the clustering result produced by the graph partitioning algorithm.
2033              
2034             =back
2035              
2036             =over
2037              
2038             =item B:
2039              
2040             $clusterer->get_data_from_csv();
2041              
2042             As you can guess from its name, the method extracts the data from the CSV file named
2043             in the constructor.
2044              
2045             =item B:
2046              
2047             $clusterer->linear_manifold_clusterer();
2048              
2049             or
2050              
2051             my $clusters = $clusterer->linear_manifold_clusterer();
2052              
2053             This is the main call to the linear-manifold based clusterer. The first call works
2054             by side-effect, meaning that you will see the clusters in your terminal window and
2055             they would be written out to disk files (depending on the constructor options you
2056             have set). The second call also returns the clusters as a reference to an array of
2057             anonymous arrays, each holding the symbolic tags for a cluster.
2058              
2059             =item B:
2060              
2061             $clusterer->display_reconstruction_errors_as_a_function_of_iterations();
2062              
2063             This method would normally be called after the clustering is completed to see how the
2064             reconstruction errors decreased with the iterations in Phase 1 of the overall
2065             algorithm.
2066              
2067             =item B:
2068              
2069             $clusterer->write_clusters_to_files($clusters);
2070              
2071             As its name implies, when you call this method, the final clusters would be written
2072             out to disk files. The files have names like:
2073              
2074             cluster0.txt
2075             cluster1.txt
2076             cluster2.txt
2077             ...
2078             ...
2079              
2080             Before the clusters are written to these files, the module destroys all files with
2081             such names in the directory in which you call the module.
2082              
2083             =item B:
2084              
2085             $clusterer->visualize_clusters_on_sphere("final clustering", $clusters);
2086              
2087             or
2088              
2089             $clusterer->visualize_clusters_on_sphere("final_clustering", $clusters, "png");
2090              
2091             If your data is 3-dimensional and it resides on the surface of a sphere (or in the
2092             vicinity of such a surface), you may be able to use these methods for the
2093             visualization of the clusters produced by the algorithm. The first invocation
2094             produces a Gnuplot in a terminal window that you can rotate with your mouse pointer.
2095             The second invocation produces a `.png' image of the plot.
2096              
2097             =item B:
2098              
2099             $clusterer->auto_retry_clusterer();
2100              
2101             or
2102              
2103             my $clusters = $clusterer->auto_retry_clusterer();
2104              
2105             As mentioned earlier, the module is programmed in such a way that it is more likely
2106             to fail than to give you a wrong answer. If manually trying the clusterer repeatedly
2107             on a data file is frustrating, you can use C to automatically
2108             make repeated attempts for you. See the script C for how you can use
2109             C in your own code.
2110              
2111             =back
2112              
2113             =head1 GENERATING SYNTHETIC DATA FOR EXPERIMENTING WITH THE CLUSTERER
2114              
2115             The module file also contains a class named C for generating synthetic
2116             data for experimenting with linear-manifold based clustering. At this time, only
2117             3-dimensional data that resides in the form of Gaussian clusters on the surface of a
2118             sphere is generated. The generated data is placed in a CSV file. You construct an
2119             instance of the C class by a call like:
2120              
2121             =over 4
2122              
2123             =item B
2124              
2125             my $training_data_gen = DataGenerator->new(
2126             output_file => $output_file,
2127             cluster_width => 0.0005,
2128             total_number_of_samples_needed => 1000,
2129             number_of_clusters_on_sphere => 4,
2130             show_hidden_in_3D_plots => 0,
2131             );
2132              
2133             =back
2134              
2135             =head2 Parameters for the DataGenerator constructor:
2136              
2137             =over 8
2138              
2139             =item C:
2140              
2141             The numeric values are generated using a bivariate Gaussian distribution whose two
2142             independent variables are the azimuth and the elevation angles on the surface of a
2143             unit sphere. The mean of each cluster is chosen randomly and its covariance set in
2144             proportion to the value supplied for the C<cluster_width> parameter.
2145              
2146             =item C:
2147              
2148             This parameter controls the spread of each cluster on the surface of the unit sphere.
2149              
2150             =item C:
2151              
2152             As its name implies, this parameter specifies the total number of data samples that
2153             will be written out to the output file --- provided this number is divisible by the
2154             number of clusters you asked for. If the divisibility condition is not satisfied,
2155             the number of data samples actually written out will be the closest it can be to the
2156             number you specify for this parameter under the condition that equal number of
2157             samples will be created for each cluster.
2158              
2159             =item C:
2160              
2161             Again as its name implies, this parameter specifies the number of clusters that will
2162             be produced on the surface of a unit sphere.
2163              
2164             =item C:
2165              
2166             This parameter is important for the visualization of the clusters and it controls
2167             whether you will see the generated data on the back side of the sphere. If the
2168             clusters are not too spread out, you can set this parameter to 0 and see all the
2169             clusters all at once. However, when the clusters are spread out, it can be visually
2170             confusing to see the data on the back side of the sphere. Note that no matter how
2171             you set this parameter, you can interact with the 3D plot of the data and rotate it
2172             with your mouse pointer to see all of the data that is generated.
2173              
2174             =back
2175              
2176             =over 4
2177              
2178             =item B:
2179              
2180             $training_data_gen->gen_data_and_write_to_csv();
2181              
2182             As its name implies, this method generates the data according to the parameters
2183             specified in the DataGenerator constructor.
2184              
2185             =item B:
2186              
2187             $training_data_gen->visualize_data_on_sphere($output_file);
2188              
2189             You can use this method to visualize the clusters produced by the data generator.
2190             Since the clusters are located at randomly selected points on a unit sphere, by
2191             looking at the output visually, you can quickly reject what the data generator has
2192             produced and try again.
2193              
2194             =back
2195              
2196             =head1 HOW THE CLUSTERS ARE OUTPUT
2197              
2198             When the option C is set in the constructor of the
2199             C class, the clusters are displayed on the terminal
2200             screen.
2201              
2202             And, when the option C is set in the same constructor, the
2203             module dumps the clusters in files named
2204              
2205             cluster0.txt
2206             cluster1.txt
2207             cluster2.txt
2208             ...
2209             ...
2210              
2211             in the directory in which you execute the module. The number of such files will
2212             equal the number of clusters formed. All such existing files in the directory are
2213             destroyed before any fresh ones are created. Each cluster file contains the symbolic
2214             tags of the data samples in that cluster.
2215              
2216             Assuming that the data dimensionality is 3, if you have set the constructor parameter
2217             C, the module will deposit in your directory printable PNG
2218             files that are point plots of the different clusters in the different iterations of
2219             the algorithm. Such printable files are also generated for the initial clusters at
2220             the beginning of the iterations and for the final clusters in Phase 1 of the
2221             algorithm. You will also see in your directory a PNG file for the clustering result
2222             produced by graph partitioning in Phase 2.
2223              
2224             Also, as mentioned previously, a call to C in your own
2225             code will return the clusters to you directly.
2226              
2227             =head1 REQUIRED
2228              
2229             This module requires the following modules:
2230              
2231             List::Util
2232             File::Basename
2233             Math::Random
2234             Graphics::GnuplotIF
2235             Math::GSL::Matrix
2236             POSIX
2237              
2238             =head1 THE C DIRECTORY
2239              
2240             The C directory contains the following four scripts that you might want to
2241             play with in order to become familiar with the module:
2242              
2243             example1.pl
2244              
2245             example2.pl
2246              
2247             example3.pl
2248              
2249             example4.pl
2250              
2251             These scripts demonstrate linear-manifold based clustering on the 3-dimensional data
2252             in the following three CSV files:
2253              
2254             3_clusters_on_a_sphere_498_samples.csv (used in example1.pl and example4.pl)
2255              
2256             3_clusters_on_a_sphere_3000_samples.csv (used in example2.pl)
2257              
2258             4_clusters_on_a_sphere_1000_samples.csv (used in example3.pl)
2259              
2260             Note that even though the first two of these files both contain exactly three
2261             clusters, the clusters look very different in the two data files. The clusters are
2262             much more spread out in C<3_clusters_on_a_sphere_3000_samples.csv>.
2263              
2264             The code in C is special because it shows how you can call the
2265             C method of the module for automatic repeated invocations of
2266             the clustering program until success is achieved. The value of the constructor
2267             parameter C is set to 1 in C, implying that
2268             when you execute C you will not be invoking Phase 2 of the algorithm.
2269             You might wish to change the value given to the parameter
2270             C to see how it affects the number of attempts needed to
2271             achieve success.
2272              
2273             The C directory also includes PNG image files that show some intermediate
2274             and the best final results that can be achieved by the first three examples, that
2275             is, for the scripts C<example1.pl>, C<example2.pl>, and C<example3.pl>. If you are on
2276             a Linux machine and if you have the C library installed, you can use the
2277             C command to see the results in the PNG images. The results you get with
2278             your own runs of the three example scripts may or may not look like what you see in
2279             the outputs shown in the PNG files depending on how the module seeds the clusters.
2280             Your best results should look like what you see in the PNG images.
2281              
2282             The C<examples> directory also contains the following utility scripts:
2283              
2284             For generating the data for experiments with clustering:
2285              
2286             generate_data_on_a_sphere.pl
2287              
2288             For visualizing the data generated by the above script:
2289              
2290             data_visualizer.pl
2291              
2292             For cleaning up the examples directory:
2293              
2294             cleanup_directory.pl
2295              
2296             Invoking the C<cleanup_directory.pl> script will get rid of all the PNG image files
2297             that are generated by the module when you run it with the constructor option
2298             C<make_png_for_each_iteration> set to 1.
2299              
2300             =head1 EXPORT
2301              
2302             None by design.
2303              
2304             =head1 CAVEATS
2305              
2306             The performance of the algorithm depends much on the values you choose for the
2307             constructor parameters. And, even for the best choices for the parameters, the
2308             algorithm is not theoretically guaranteed to return the best results.
2309              
2310             Even after you have discovered the best choices for the constructor parameters, the
2311             best way to use this module is to try it multiple times on any given data and accept
2312             only those results that make the best intuitive sense.
2313              
2314             The module was designed with the hope that it would rather fail than give you wrong
2315             results. So if you see it failing a few times before it returns a good answer, that's
2316             a good thing.
2317              
2318             However, if the module fails too often or is too quick to give you wrong answers,
2319             that means the module is not working on your data. If that happens, I'd love to hear
2320             from you. That might indicate to me how I should enhance the power of this module
2321             for its future versions.
2322              
2323             =head1 BUGS
2324              
2325             Please notify the author if you encounter any bugs. When sending email, please place
2326             the string 'LinearManifoldDataClusterer' in the subject line.
2327              
2328             =head1 INSTALLATION
2329              
2330             Download the archive from CPAN in any directory of your choice. Unpack the archive
2331             with a command that on a Linux machine would look like:
2332              
2333             tar zxvf Algorithm-LinearManifoldDataClusterer-1.0.tar.gz
2334              
2335             This will create an installation directory for you whose name will be
2336             C<Algorithm-LinearManifoldDataClusterer-1.0>. Enter this directory and execute the following commands
2337             for a standard install of the module if you have root privileges:
2338              
2339             perl Makefile.PL
2340             make
2341             make test
2342             sudo make install
2343              
2344             If you do not have root privileges, you can carry out a non-standard install of the
2345             module in any directory of your choice by:
2346              
2347             perl Makefile.PL prefix=/some/other/directory/
2348             make
2349             make test
2350             make install
2351              
2352             With a non-standard install, you may also have to set your PERL5LIB environment
2353             variable so that this module can find the required other modules. How you do that
2354             would depend on what platform you are working on. In order to install this module in
2355             a Linux machine on which I use tcsh for the shell, I set the PERL5LIB environment
2356             variable by
2357              
2358             setenv PERL5LIB /some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
2359              
2360             If I used bash, I'd need to declare:
2361              
2362             export PERL5LIB=/some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
2363              
2364              
2365             =head1 THANKS
2366              
2367             I have learned much from my conversations with Donghun Kim whose research on face
2368             recognition in the wild involves clustering image data on manifolds. I have also had
2369             several fruitful conversations with Bharath Kumar Comandur and Tanmay Prakash with
2370             regard to the ideas that are incorporated in this module.
2371              
2372             =head1 AUTHOR
2373              
2374             Avinash Kak, kak@purdue.edu
2375              
2376             If you send email, please place the string "LinearManifoldDataClusterer" in your subject line to get past
2377             my spam filter.
2378              
2379             =head1 COPYRIGHT
2380              
2381             This library is free software; you can redistribute it and/or modify it under the
2382             same terms as Perl itself.
2383              
2384             Copyright 2015 Avinash Kak
2385              
2386             =cut
2387