File Coverage

blib/lib/Algorithm/LinearManifoldDataClusterer.pm
Criterion Covered Total %
statement 25 27 92.5
branch n/a
condition n/a
subroutine 9 9 100.0
pod n/a
total 34 36 94.4


line stmt bran cond sub pod time code
1             package Algorithm::LinearManifoldDataClusterer;
2              
3             #------------------------------------------------------------------------------------
4             # Copyright (c) 2015 Avinash Kak. All rights reserved. This program is free
5             # software. You may modify and/or distribute it under the same terms as Perl itself.
6             # This copyright notice must remain attached to the file.
7             #
8             # Algorithm::LinearManifoldDataClusterer is a Perl module for clustering data that
9             # resides on a low-dimensional manifold in a high-dimensional measurement space.
10             # -----------------------------------------------------------------------------------
11              
12 1     1   13073 use 5.10.0;
  1         3  
  1         45  
13 1     1   5 use strict;
  1         1  
  1         28  
14 1     1   3 use warnings;
  1         4  
  1         30  
15 1     1   4 use Carp;
  1         0  
  1         71  
16 1     1   4 use List::Util qw(reduce any);
  1         1  
  1         83  
17 1     1   4 use File::Basename;
  1         2  
  1         66  
18 1     1   594 use Math::Random;
  1         5336  
  1         86  
19 1     1   584 use Graphics::GnuplotIF;
  1         9525  
  1         57  
20 1     1   1048 use Math::GSL::Matrix;
  0            
  0            
21             use POSIX ();
22              
23             our $VERSION = '1.01';
24              
25             # Constructor:
26             sub new {
27             my ($class, %args) = @_;
28             my @params = keys %args;
29             croak "\nYou have used a wrong name for a keyword argument " .
30             "--- perhaps a misspelling\n"
31             if check_for_illegal_params(@params) == 0;
32             bless {
33             _datafile => $args{datafile} || croak("datafile required"),
34             _mask => $args{mask} || croak("mask required"),
35             _K => $args{K} || 0,
36             _P => $args{P} || 0,
37             _terminal_output => $args{terminal_output} || 0,
38             _max_iterations => $args{max_iterations} || 0,
39             _delta_reconstruction_error => $args{delta_reconstruction_error} || 0.001,
40             _delta_normalized_error => undef,
41             _cluster_search_multiplier => $args{cluster_search_multiplier} || 1,
42             _visualize_each_iteration => $args{visualize_each_iteration} == 0 ? 0 : 1,
43             _show_hidden_in_3D_plots => $args{show_hidden_in_3D_plots} == 0 ? 0 : 1,
44             _make_png_for_each_iteration => $args{make_png_for_each_iteration} == 0 ? 0 : 1,
45             _debug => $args{debug} || 0,
46             _N => 0,
47             _KM => $args{K} * $args{cluster_search_multiplier},
48             _data_hash => {},
49             _data_tags => [],
50             _data_dimensions => 0,
51             _final_clusters => [],
52             _auto_retry_flag => 0,
53             _num_iterations_actually_used => undef,
54             _scale_factor => undef,
55             _data_tags_to_cluster_label_hash => {},
56             _final_reference_vecs_for_all_subspaces => [],
57             _reconstruction_error_as_a_function_of_iteration => [],
58             _final_trailing_eigenvec_matrices_for_all_subspaces => [],
59             _subspace_construction_error_as_a_function_of_iteration => [],
60             }, $class;
61             }
62              
63             sub get_data_from_csv {
64             my $self = shift;
65             my $filename = $self->{_datafile} || die "you did not specify a file with the data to be clustered";
66             my $mask = $self->{_mask};
67             my @mask = split //, $mask;
68             $self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
69             print "data dimensionality: $self->{_data_dimensions} \n" if $self->{_terminal_output};
70             open FILEIN, $filename or die "Unable to open $filename: $!";
71             die("Aborted. get_training_data_csv() is only for CSV files") unless $filename =~ /\.csv$/;
72             local $/ = undef;
73             my @all_data = split /\s+/, <FILEIN>;
74             my %data_hash = ();
75             my @data_tags = ();
76             foreach my $record (@all_data) {
77             my @splits = split /,/, $record;
78             my $record_name = shift @splits;
79             $data_hash{$record_name} = \@splits;
80             push @data_tags, $record_name;
81             }
82             $self->{_data_hash} = \%data_hash;
83             $self->{_data_tags} = \@data_tags;
84             $self->{_N} = scalar @data_tags;
85             }
86              
87             sub estimate_mean_and_covariance {
88             my $self = shift;
89             my $tag_set = shift;
90             my $cluster_size = @$tag_set;
91             my @cluster_center = @{$self->add_point_coords($tag_set)};
92             @cluster_center = map {my $x = $_/$cluster_size; $x} @cluster_center;
93             # for covariance calculation:
94             my ($num_rows,$num_cols) = ($self->{_data_dimensions}, scalar(@$tag_set));
95             print "\nThe data will be stuffed into a matrix of $num_rows rows and $num_cols columns\n\n"
96             if $self->{_debug};
97             my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
98             my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
99             # All the record labels are stored in the array $self->{_data_tags}. The actual
100             # data for clustering is stored in a hash at $self->{_data_hash} whose keys are
101             # the record labels; the value associated with each key is the array holding the
102             # corresponding numerical multidimensional data.
103             $mean_vec->set_col(0, \@cluster_center);
104             if ($self->{_debug}) {
105             print "\nDisplaying the mean vector for the cluster:\n";
106             display_matrix( $mean_vec ) if $self->{_terminal_output};
107             }
108             foreach my $j (0..$num_cols-1) {
109             my $tag = $tag_set->[$j];
110             my $data = $self->{_data_hash}->{$tag};
111             my @diff_from_mean = vector_subtract($data, \@cluster_center);
112             $matrix->set_col($j, \@diff_from_mean);
113             }
114             my $transposed = transpose( $matrix );
115             my $covariance = $matrix * $transposed;
116             $covariance *= 1.0 / $num_cols;
117             if ($self->{_debug}) {
118             print "\nDisplaying the Covariance Matrix for cluster:";
119             display_matrix( $covariance ) if $self->{_terminal_output};
120             }
121             return ($mean_vec, $covariance);
122             }
123              
124             sub eigen_analysis_of_covariance {
125             my $self = shift;
126             my $covariance = shift;
127             my ($eigenvalues, $eigenvectors) = $covariance->eigenpair;
128             my $num_of_eigens = @$eigenvalues;
129             my $largest_eigen_index = 0;
130             my $smallest_eigen_index = 0;
131             print "Eigenvalue 0: $eigenvalues->[0]\n" if $self->{_debug};
132             foreach my $i (1..$num_of_eigens-1) {
133             $largest_eigen_index = $i if $eigenvalues->[$i] > $eigenvalues->[$largest_eigen_index];
134             $smallest_eigen_index = $i if $eigenvalues->[$i] < $eigenvalues->[$smallest_eigen_index];
135             print "Eigenvalue $i: $eigenvalues->[$i]\n" if $self->{_debug};
136             }
137             print "\nlargest eigen index: $largest_eigen_index\n" if $self->{_debug};
138             print "\nsmallest eigen index: $smallest_eigen_index\n\n" if $self->{_debug};
139             my @all_my_eigenvecs;
140             foreach my $i (0..$num_of_eigens-1) {
141             my @vec = $eigenvectors->[$i]->as_list;
142             my @eigenvec;
143             foreach my $ele (@vec) {
144             my ($mag, $theta) = $ele =~ /\[(\d*\.?\d*e?[+-]?\d*),(\S+)\]/;
145             if ($theta eq "0") {
146             push @eigenvec, $mag;
147             } elsif ($theta eq "pi") {
148             push @eigenvec, -1.0 * $mag;
149             } else {
150             die "Eigendecomposition produced a complex eigenvector -- " .
151             "which should not happen for a covariance matrix!";
152             }
153             }
154             print "Eigenvector $i: @eigenvec\n" if $self->{_debug};
155             push @all_my_eigenvecs, \@eigenvec;
156             }
157             my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list;
158             print "\nLargest eigenvector: @largest_eigen_vec\n" if $self->{_debug};
159             my @sorted_eigenvec_indexes = sort {$eigenvalues->[$b] <=> $eigenvalues->[$a]} 0..@all_my_eigenvecs-1;
160             my @sorted_eigenvecs;
161             my @sorted_eigenvals;
162             foreach my $i (0..@sorted_eigenvec_indexes-1) {
163             $sorted_eigenvecs[$i] = $all_my_eigenvecs[$sorted_eigenvec_indexes[$i]];
164             $sorted_eigenvals[$i] = $eigenvalues->[$sorted_eigenvec_indexes[$i]];
165             }
166             if ($self->{_debug}) {
167             print "\nHere come sorted eigenvectors --- from the largest to the smallest:\n";
168             foreach my $i (0..@sorted_eigenvecs-1) {
169             print "eigenvec: @{$sorted_eigenvecs[$i]} eigenvalue: $sorted_eigenvals[$i]\n";
170             }
171             }
172             return (\@sorted_eigenvecs, \@sorted_eigenvals);
173             }
174              
175             sub auto_retry_clusterer {
176             my $self = shift;
177             $self->{_auto_retry_flag} = 1;
178             my $clusters;
179             $@ = 1;
180             my $retry_attempts = 1;
181             while ($@) {
182             eval {
183             $clusters = $self->linear_manifold_clusterer();
184             };
185             if ($@) {
186             if ($self->{_terminal_output}) {
187             print "Clustering failed. Trying again. --- $@";
188             print "\n\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n";
189             print "VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV\n\n";
190             }
191             $retry_attempts++;
192             } else {
193             print "\n\nNumber of retry attempts: $retry_attempts\n\n" if $self->{_terminal_output};
194             return $clusters;
195             }
196             }
197             }
198              
199             sub linear_manifold_clusterer {
200             my $self = shift;
201             my $KM = $self->{_KM};
202             my @initial_cluster_center_tags;
203             my $visualization_msg;
204             my @initial_cluster_center_indexes = $self->initialize_cluster_centers($KM, $self->{_N});
205             print "Randomly selected indexes for cluster center tags: @initial_cluster_center_indexes\n"
206             if $self->{_debug};
207             @initial_cluster_center_tags = map {$self->{_data_tags}->[$_]} @initial_cluster_center_indexes;
208             my @initial_cluster_center_coords = map {$self->{_data_hash}->{$_}} @initial_cluster_center_tags;
209             if ($self->{_debug}) {
210             foreach my $centroid (@initial_cluster_center_coords) {
211             print "Initial cluster center coords: @{$centroid}\n";
212             }
213             }
214             my $initial_clusters = $self->assign_data_to_clusters_initial(\@initial_cluster_center_coords);
215             if ($self->{_data_dimensions} == 3) {
216             $visualization_msg = "initial_clusters";
217             $self->visualize_clusters_on_sphere($visualization_msg, $initial_clusters)
218             if $self->{_visualize_each_iteration};
219             $self->visualize_clusters_on_sphere($visualization_msg, $initial_clusters, "png")
220             if $self->{_make_png_for_each_iteration};
221             }
222             foreach my $cluster (@$initial_clusters) {
223             my ($mean, $covariance) = $self->estimate_mean_and_covariance($cluster);
224             display_mean_and_covariance($mean, $covariance) if $self->{_debug};
225             }
226             my @clusters = @$initial_clusters;
227             display_clusters(\@clusters) if $self->{_debug};
228             my $iteration_index = 0;
229             my $unimodal_correction_flag;
230             my $previous_min_value_for_unimodality_quotient;
231             while ($iteration_index < $self->{_max_iterations}) {
232             print "\n\n========================== STARTING ITERATION $iteration_index =====================\n\n"
233             if $self->{_terminal_output};
234             my $total_reconstruction_error_this_iteration = 0;
235             my @subspace_construction_errors_this_iteration;
236             my @trailing_eigenvec_matrices_for_all_subspaces;
237             my @reference_vecs_for_all_subspaces;
238             foreach my $cluster (@clusters) {
239             next if @$cluster == 0;
240             my ($mean, $covariance) = $self->estimate_mean_and_covariance($cluster);
241             display_mean_and_covariance($mean, $covariance) if $self->{_debug};
242             print "--------------end of displaying mean and covariance\n\n" if $self->{_debug};
243             my ($eigenvecs, $eigenvals) = $self->eigen_analysis_of_covariance($covariance);
244             my @trailing_eigenvecs = @{$eigenvecs}[$self->{_P} .. $self->{_data_dimensions}-1];
245             my @trailing_eigenvals = @{$eigenvals}[$self->{_P} .. $self->{_data_dimensions}-1];
246             my $subspace_construction_error = reduce {abs($a) + abs($b)} @trailing_eigenvals;
247             push @subspace_construction_errors_this_iteration, $subspace_construction_error;
248             my $trailing_eigenvec_matrix = Math::GSL::Matrix->new($self->{_data_dimensions},
249             scalar(@trailing_eigenvecs));
250             foreach my $j (0..@trailing_eigenvecs-1) {
251             print "trailing eigenvec column: @{$trailing_eigenvecs[$j]}\n" if $self->{_debug};
252             $trailing_eigenvec_matrix->set_col($j, $trailing_eigenvecs[$j]);
253             }
254             push @trailing_eigenvec_matrices_for_all_subspaces,$trailing_eigenvec_matrix;
255             push @reference_vecs_for_all_subspaces, $mean;
256             }
257             $self->set_termination_reconstruction_error_threshold(\@reference_vecs_for_all_subspaces);
258             my %best_subspace_based_partition_of_data;
259             foreach my $i (0..$self->{_KM}-1) {
260             $best_subspace_based_partition_of_data{$i} = [];
261             }
262             foreach my $data_tag (@{$self->{_data_tags}}) {
263             my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
264             $data_vec->set_col(0, $self->{_data_hash}->{$data_tag});
265             my @errors = map {$self->reconstruction_error($data_vec,
266             $trailing_eigenvec_matrices_for_all_subspaces[$_],
267             $reference_vecs_for_all_subspaces[$_])}
268             0 .. $self->{_KM}-1;
269             my ($minval, $index_for_closest_subspace) = minimum(\@errors);
270             $total_reconstruction_error_this_iteration += $minval;
271             push @{$best_subspace_based_partition_of_data{$index_for_closest_subspace}},
272             $data_tag;
273             }
274             print "Finished calculating the eigenvectors for the clusters produced by the previous\n" .
275             "iteration and re-assigning the data samples to the new subspaces on the basis of\n".
276             "the least reconstruction error.\n\n" .
277             "Total reconstruction error in this iteration: $total_reconstruction_error_this_iteration\n"
278             if $self->{_terminal_output};
279             foreach my $i (0..$self->{_KM}-1) {
280             $clusters[$i] = $best_subspace_based_partition_of_data{$i};
281             }
282             display_clusters(\@clusters) if $self->{_terminal_output};
283             # Check if any cluster has lost all its elements. If so, fragment the worst
284             # existing cluster to create the additional clusters needed:
285             if (any {@$_ == 0} @clusters) {
286             die "empty cluster found" if $self->{_auto_retry_flag};
287             print "\nOne or more clusters have become empty. Will carve out the needed clusters\n" .
288             "from the cluster with the largest subspace construction error.\n\n";
289             $total_reconstruction_error_this_iteration = 0;
290             @subspace_construction_errors_this_iteration = ();
291             my $how_many_extra_clusters_needed = $self->{_KM} - scalar(grep {@$_ != 0} @clusters);
292             print "number of extra clusters needed at iteration $iteration_index: $how_many_extra_clusters_needed\n";
293             my $max = List::Util::max @subspace_construction_errors_this_iteration;
294             my $maxindex = List::Util::first {$_ == $max} @subspace_construction_errors_this_iteration;
295             my @cluster_fragments = cluster_split($clusters[$maxindex],
296             $how_many_extra_clusters_needed + 1);
297             my @newclusters;
298             push @newclusters, @clusters[0 .. $maxindex-1];
299             push @newclusters, @clusters[$maxindex+1 .. $self->{_KM}-1];
300             push @newclusters, @cluster_fragments;
301             @newclusters = grep {@$_ != 0} @newclusters;
302             die "something went wrong with cluster fragmentation"
303             unless $self->{_KM} = @newclusters;
304             @trailing_eigenvec_matrices_for_all_subspaces = ();
305             @reference_vecs_for_all_subspaces = ();
306             foreach my $cluster (@newclusters) {
307             die "Linear Manifold Clustering did not work $!" if @$cluster == 0;
308             my ($mean, $covariance) = estimate_mean_and_covariance($cluster);
309             my ($eigenvecs, $eigenvals) = eigen_analysis_of_covariance($covariance);
310             my @trailing_eigenvecs = @{$eigenvecs}[$self->{_P} .. $self->{_data_dimensions}-1];
311             my @trailing_eigenvals = @{$eigenvals}[$self->{_P} .. $self->{_data_dimensions}-1];
312             my $subspace_construction_error = reduce {abs($a) + abs($b)} @trailing_eigenvals;
313             push @subspace_construction_errors_this_iteration, $subspace_construction_error;
314             my $trailing_eigenvec_matrix = Math::GSL::Matrix->new($self->{_data_dimensions},
315             scalar(@trailing_eigenvecs));
316             foreach my $j (0..@trailing_eigenvecs-1) {
317             $trailing_eigenvec_matrix->set_col($j, $trailing_eigenvecs[$j]);
318             }
319             push @trailing_eigenvec_matrices_for_all_subspaces,$trailing_eigenvec_matrix;
320             push @reference_vecs_for_all_subspaces, $mean;
321             }
322             my %best_subspace_based_partition_of_data;
323             foreach my $i (0..$self->{_KM}-1) {
324             $best_subspace_based_partition_of_data{$i} = [];
325             }
326             foreach my $data_tag (@{$self->{_data_tags}}) {
327             my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
328             $data_vec->set_col(0, $self->{_data_hash}->{$data_tag});
329             my @errors = map {reconstruction_error($data_vec,
330             $trailing_eigenvec_matrices_for_all_subspaces[$_],
331             $reference_vecs_for_all_subspaces[$_])}
332             0 .. $self->{_KM}-1;
333             my ($minval, $index_for_closest_subspace) = minimum(\@errors);
334             $total_reconstruction_error_this_iteration += $minval;
335             push @{$best_subspace_based_partition_of_data{$index_for_closest_subspace}},
336             $data_tag;
337             }
338             print "empty-cluster jag: total reconstruction error in this iteration: \n" .
339             "$total_reconstruction_error_this_iteration\n"
340             if $self->{_debug};
341             foreach my $i (0..$self->{_KM}-1) {
342             $clusters[$i] = $best_subspace_based_partition_of_data{$i};
343             }
344             display_clusters(\@newclusters) if $self->{_terminal_output};
345             @clusters = grep {@$_ != 0} @newclusters;
346             die "linear manifold based algorithm does not appear to work in this case $!"
347             unless @clusters == $self->{_KM};
348             }# end of foreach my $cluster (@clusters) ... loop followed by if clause for empty clusters
349             if ($self->{_data_dimensions} == 3) {
350             $visualization_msg = "clustering_at_iteration_$iteration_index";
351             $self->visualize_clusters_on_sphere($visualization_msg, \@clusters)
352             if $self->{_visualize_each_iteration};
353             $self->visualize_clusters_on_sphere($visualization_msg, \@clusters, "png")
354             if $self->{_make_png_for_each_iteration};
355             }
356             my @cluster_unimodality_quotients = map {$self->cluster_unimodality_quotient($clusters[$_],
357             $reference_vecs_for_all_subspaces[$_])} 0..@clusters-1;
358             my $min_value_for_unimodality_quotient = List::Util::min @cluster_unimodality_quotients;
359             print "\nCluster unimodality quotients: @cluster_unimodality_quotients\n" if $self->{_terminal_output};
360             die "\n\nBailing out!\n" .
361             "It does not look like these iterations will lead to a good clustering result.\n" .
362             "Program terminating. Try running again.\n"
363             if defined($previous_min_value_for_unimodality_quotient)
364             && ($min_value_for_unimodality_quotient < 0.4)
365             && ($min_value_for_unimodality_quotient < (0.5 * $previous_min_value_for_unimodality_quotient));
366             if ( $min_value_for_unimodality_quotient < 0.5 ) {
367             $unimodal_correction_flag = 1;
368             print "\nApplying unimodality correction:\n\n" if $self->{_terminal_output};
369             my @sorted_cluster_indexes =
370             sort {$cluster_unimodality_quotients[$b] <=> $cluster_unimodality_quotients[$a]} 0..@clusters-1;
371             my @newclusters;
372             foreach my $cluster_index (0..@clusters - 1) {
373             push @newclusters, $clusters[$sorted_cluster_indexes[$cluster_index]];
374             }
375             @clusters = @newclusters;
376             my $worst_cluster = pop @clusters;
377             print "\nthe worst cluster: @$worst_cluster\n" if $self->{_terminal_output};
378             my $second_worst_cluster = pop @clusters;
379             print "\nthe second worst cluster: @$second_worst_cluster\n" if $self->{_terminal_output};
380             push @$worst_cluster, @$second_worst_cluster;
381             fisher_yates_shuffle($worst_cluster);
382             my @first_half = @$worst_cluster[0 .. int(scalar(@$worst_cluster)/2) - 1];
383             my @second_half = @$worst_cluster[int(scalar(@$worst_cluster)/2) .. @$worst_cluster - 1];
384             push @clusters, \@first_half;
385             push @clusters, \@second_half;
386             if ($self->{_terminal_output}) {
387             print "\n\nShowing the clusters obtained after applying the unimodality correction:\n";
388             display_clusters(\@clusters);
389             }
390             }
391             if (@{$self->{_reconstruction_error_as_a_function_of_iteration}} > 0) {
392             my $last_recon_error = pop @{$self->{_reconstruction_error_as_a_function_of_iteration}};
393             push @{$self->{_reconstruction_error_as_a_function_of_iteration}}, $last_recon_error;
394             if (($last_recon_error - $total_reconstruction_error_this_iteration)
395             < $self->{_delta_normalized_error}) {
396             push @{$self->{_reconstruction_error_as_a_function_of_iteration}},
397             $total_reconstruction_error_this_iteration;
398             last;
399             }
400             }
401             push @{$self->{_reconstruction_error_as_a_function_of_iteration}},
402             $total_reconstruction_error_this_iteration;
403             $iteration_index++;
404             $previous_min_value_for_unimodality_quotient = $min_value_for_unimodality_quotient;
405             } # end of while loop on iteration_index
406             $self->{_num_iterations_actually_used} =
407             scalar @{$self->{_reconstruction_error_as_a_function_of_iteration}};
408             if ($self->{_terminal_output}) {
409             print "\nIterations of the main loop terminated at iteration number $iteration_index.\n";
410             print "Will now invoke graph partitioning to discover dominant clusters and to\n" .
411             "merge small clusters.\n\n" if $self->{_cluster_search_multiplier} > 1;
412             print "Total reconstruction error as a function of iterations: " .
413             "@{$self->{_reconstruction_error_as_a_function_of_iteration}}";
414             }
415             # now merge sub-clusters if cluster_search_multiplier > 1
416             my @final_clusters;
417             if ($self->{_cluster_search_multiplier} > 1) {
418             print "\n\nInvoking recursive graph partitioning to merge small clusters\n\n";
419             my @array_of_partitioned_cluster_groups = (\@clusters);
420             my @partitioned_cluster_groups;
421             my $how_many_clusters_looking_for = $self->{_K};
422             while (scalar(@final_clusters) < $self->{_K}) {
423             @partitioned_cluster_groups =
424             $self->graph_partition(shift @array_of_partitioned_cluster_groups,
425             $how_many_clusters_looking_for );
426             if (@{$partitioned_cluster_groups[0]} == 1) {
427             my $singular_cluster = shift @{$partitioned_cluster_groups[0]};
428             push @final_clusters, $singular_cluster;
429             $how_many_clusters_looking_for--;
430             push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[1];
431             } elsif (@{$partitioned_cluster_groups[1]} == 1) {
432             my $singular_cluster = shift @{$partitioned_cluster_groups[1]};
433             push @final_clusters, $singular_cluster;
434             $how_many_clusters_looking_for--;
435             push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[0];
436             } else {
437             push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[0];
438             push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[1];
439             }
440             }
441             my @data_clustered;
442             foreach my $cluster (@final_clusters) {
443             push @data_clustered, @$cluster;
444             }
445             unless (scalar(@data_clustered) == scalar(@{$self->{_data_tags}})) {
446             $self->{_final_clusters} = \@final_clusters;
447             my %data_clustered = map {$_ => 1} @data_clustered;
448             my @data_tags_not_clustered =
449             grep {$_} map {exists $data_clustered{$_} ? undef : $_} @{$self->{_data_tags}};
450             if ($self->{_terminal_output}) {
451             print "\n\nNot all data clustered. The most reliable clusters found by graph partitioning:\n";
452             display_clusters(\@final_clusters);
453             print "\n\nData not yet clustered:\n\n@data_tags_not_clustered\n";
454             }
455             if ($self->{_data_dimensions} == 3) {
456             $visualization_msg = "$self->{_K}_best_clusters_produced_by_graph_partitioning";
457             $self->visualize_clusters_on_sphere($visualization_msg, \@final_clusters)
458             if $self->{_visualize_each_iteration};
459             $self->visualize_clusters_on_sphere($visualization_msg, \@final_clusters, "png")
460             if $self->{_make_png_for_each_iteration};
461             }
462             my %data_tags_to_cluster_label_hash;
463             foreach my $i (0..@final_clusters-1) {
464             map {$data_tags_to_cluster_label_hash{$_} = $i} @{$final_clusters[$i]};
465             }
466             $self->{_data_tags_to_cluster_label_hash} = \%data_tags_to_cluster_label_hash;
467             foreach my $tag (@data_tags_not_clustered) {
468             my $which_cluster = $self->which_cluster_for_new_element($tag);
469             $self->{_data_tags_to_cluster_label_hash}->{$tag} = $which_cluster;
470             }
471             die "Some data elements are still missing from the final tally"
472             unless scalar(keys %{$self->{_data_tags_to_cluster_label_hash}}) ==
473             scalar(@{$self->{_data_tags}});
474             my @new_final_clusters;
475             map { foreach my $ele (keys %{$self->{_data_tags_to_cluster_label_hash}}) {
476             push @{$new_final_clusters[$_]}, $ele
477             if $self->{_data_tags_to_cluster_label_hash}->{$ele} == $_ }
478             } 0..$self->{_K}-1;
479             if ($self->{_debug}) {
480             print "\ndisplaying the final clusters after accounting for unclustered data:\n";
481             display_clusters(\@new_final_clusters);
482             }
483             $self->{_final_clusters} = \@new_final_clusters;
484             @final_clusters = @new_final_clusters;
485             }
486             } else {
487             @final_clusters = @clusters;
488             }
489             print "\n\nDisplaying final clustering results:\n\n" if $self->{_terminal_output};
490             display_clusters(\@final_clusters) if $self->{_terminal_output};
491             return \@final_clusters;
492             }
493              
494             sub display_reconstruction_errors_as_a_function_of_iterations {
495             my $self = shift;
496             print "\n\nNumber of iterations used in Phase 1: $self->{_num_iterations_actually_used}\n";
497             print "\nTotal reconstruction error as a function of iterations in Phase 1: " .
498             "@{$self->{_reconstruction_error_as_a_function_of_iteration}}\n";
499             }
500              
501             sub set_termination_reconstruction_error_threshold {
502             my $self = shift;
503             my $all_ref_vecs = shift;
504             my @mean_vecs = @$all_ref_vecs;
505             my $sum_of_mean_magnitudes = reduce {$a+$b} map { my $result = transpose($_) * $_;
506             my @result = $result->as_list;
507             sqrt($result[0])
508             } @mean_vecs;
509             $self->{_scale_factor} = $sum_of_mean_magnitudes / @mean_vecs;
510             $self->{_delta_normalized_error} = ($sum_of_mean_magnitudes / @mean_vecs ) *
511             $self->{_delta_reconstruction_error};
512             }
513              
# This method is called only in the `unless' clause at the end of the main
# linear_manifold_clusterer() method. It is called to find the cluster labels for
# those data elements that were left unclustered by the main part of the algorithm
# when graph partitioning is used to merge similar sub-clusters. The operating logic
# here is that graph partition yields K main clusters even though each main cluster
# may not yet be fully populated.
#
# Returns the index (0 .. K-1) of the final cluster whose manifold subspace
# reconstructs the data element named by $data_tag with the smallest error.
sub which_cluster_for_new_element {
    my $self = shift;
    my $data_tag = shift;   # symbolic name of the data element to be assigned
    # The following `unless' clause is called only the first time the current method
    # is called: it caches, for each final cluster, the trailing-eigenvector matrix
    # and the mean (reference) vector that together define the cluster's subspace.
    unless (@{$self->{_final_trailing_eigenvec_matrices_for_all_subspaces}} > 0) {
        my @trailing_eigenvec_matrices_for_all_subspaces;
        my @reference_vecs_for_all_subspaces;
        foreach my $cluster (@{$self->{_final_clusters}}) {
            my ($mean, $covariance) = $self->estimate_mean_and_covariance($cluster);
            my ($eigenvecs, $eigenvals) = $self->eigen_analysis_of_covariance($covariance);
            # Keep only the trailing eigenvectors (indices _P .. dim-1); these span
            # the directions perpendicular to the cluster's manifold subspace.
            my @trailing_eigenvecs = @{$eigenvecs}[$self->{_P} .. $self->{_data_dimensions}-1];
            my $trailing_eigenvec_matrix = Math::GSL::Matrix->new($self->{_data_dimensions},
                                                                  scalar(@trailing_eigenvecs));
            foreach my $j (0..@trailing_eigenvecs-1) {
                $trailing_eigenvec_matrix->set_col($j, $trailing_eigenvecs[$j]);
            }
            push @trailing_eigenvec_matrices_for_all_subspaces,$trailing_eigenvec_matrix;
            push @reference_vecs_for_all_subspaces, $mean;
        }
        $self->{_final_trailing_eigenvec_matrices_for_all_subspaces} =
                                     \@trailing_eigenvec_matrices_for_all_subspaces;
        $self->{_final_reference_vecs_for_all_subspaces} = \@reference_vecs_for_all_subspaces;
    }
    # Wrap the element's coordinates in a column matrix and score it against every
    # one of the K cached subspaces:
    my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
    $data_vec->set_col(0, $self->{_data_hash}->{$data_tag});
    my @errors = map {$self->reconstruction_error($data_vec,
                          $self->{_final_trailing_eigenvec_matrices_for_all_subspaces}->[$_],
                          $self->{_final_reference_vecs_for_all_subspaces}->[$_])}
                 0 .. $self->{_K}-1;
    # minimum() returns (value, index); the index of the smallest reconstruction
    # error identifies the best-fitting cluster.
    my ($minval, $index_for_closest_subspace) = minimum(\@errors);
    return $index_for_closest_subspace;
}
553              
# Partitions a set of sub-clusters into groups by spectral graph partitioning.
# Each sub-cluster is one node of the graph; the edge weight between two nodes
# is a similarity that combines (1) the mutual reconstruction error between the
# two sub-clusters' manifold subspaces and (2) the squared distance between
# their means, both converted to similarities with exp(-x / normalizer).  The
# cut is made with the eigenvector associated with the second-smallest
# eigenvalue of the symmetric normalized Laplacian (the Fiedler vector).
#
# Returns a pair of array refs; depending on the branch taken at the end this
# is either (finalized clusters, clusters still needing partitioning) or
# (positive-side clusters, negative-side clusters).  May die when the clusters
# are too defective for the normalizers to be meaningful, or when no consensus
# partition emerges.
sub graph_partition {
    my $self = shift;
    my $clusters = shift;                         # ref to a list of clusters (lists of data tags)
    my $how_many_clusters_looking_for = shift;    # target number K of main clusters
    print "\n\nGraph partitioning looking for $how_many_clusters_looking_for clusters\n\n";
    my $num_nodes = scalar @$clusters;
    # W: similarity (adjacency) matrix; D: degree matrix; neg_sqrt_of_D: D^(-1/2).
    my $W = Math::GSL::Matrix->new($num_nodes,$num_nodes);
    my $D = Math::GSL::Matrix->new($num_nodes,$num_nodes);
    $D->identity;
    my $neg_sqrt_of_D = Math::GSL::Matrix->new($num_nodes,$num_nodes);
    $neg_sqrt_of_D->identity;
    # NOTE(review): @subspace_construction_errors is collected below but not used
    # anywhere else in this subroutine.
    my @subspace_construction_errors;
    my @trailing_eigenvec_matrices_for_all_subspaces;
    my @reference_vecs_for_all_subspaces;
    # For every sub-cluster, compute the subspace description: the trailing
    # eigenvectors of its covariance (directions perpendicular to the manifold)
    # and its mean, which serves as the subspace's reference vector.
    foreach my $cluster (@$clusters) {
        my ($mean, $covariance) = $self->estimate_mean_and_covariance($cluster);
        my ($eigenvecs, $eigenvals) = $self->eigen_analysis_of_covariance($covariance);
        my @trailing_eigenvecs = @{$eigenvecs}[$self->{_P} .. $self->{_data_dimensions}-1];
        my @trailing_eigenvals = @{$eigenvals}[$self->{_P} .. $self->{_data_dimensions}-1];
        my $subspace_construction_error = reduce {abs($a) + abs($b)} @trailing_eigenvals;
        push @subspace_construction_errors, $subspace_construction_error;
        my $trailing_eigenvec_matrix = Math::GSL::Matrix->new($self->{_data_dimensions},
                                                              scalar(@trailing_eigenvecs));
        foreach my $j (0..@trailing_eigenvecs-1) {
            print "trailing eigenvec column: @{$trailing_eigenvecs[$j]}\n" if $self->{_debug};
            $trailing_eigenvec_matrix->set_col($j, $trailing_eigenvecs[$j]);
        }
        push @trailing_eigenvec_matrices_for_all_subspaces,$trailing_eigenvec_matrix;
        push @reference_vecs_for_all_subspaces, $mean;
    }
    # We consider the similarity matrix W to be a sum of two parts W_recon_based and
    # W_dist_bet_means based. IMPORTANT: For coding reasons, we first store the two
    # similarity measures separately in W_recon_based and W_dist_bet_means based. Our
    # goal is to fill up these component matrices with the raw values while at the
    # same time collecting information needed for normalizing these two separate
    # measures of similarity.
    my $W_reconstruction_error_based = Math::GSL::Matrix->new($num_nodes,$num_nodes);
    my $W_dist_between_means_based = Math::GSL::Matrix->new($num_nodes,$num_nodes);
    my @all_pairwise_reconstruction_errors;
    my @all_dist_between_means_errors;
    foreach my $i (0..$num_nodes-1) {
        foreach my $j (0..$num_nodes-1) {
            my ($recon_error_similarity, $dist_bet_means) = $self->pairwise_cluster_similarity(
                                              $clusters->[$i],
                                              $trailing_eigenvec_matrices_for_all_subspaces[$i],
                                              $reference_vecs_for_all_subspaces[$i],
                                              $clusters->[$j],
                                              $trailing_eigenvec_matrices_for_all_subspaces[$j],
                                              $reference_vecs_for_all_subspaces[$j]);
            $W_reconstruction_error_based->set_elem($i, $j, $recon_error_similarity);
            $W_dist_between_means_based->set_elem($i, $j, $dist_bet_means);
            push @all_pairwise_reconstruction_errors, $recon_error_similarity;
            push @all_dist_between_means_errors, $dist_bet_means;
        }
    }
    # Each raw measure is normalized by its own mean over all node pairs before
    # being exponentiated into a similarity:
    my $recon_error_normalizer = (reduce {$a + $b} @all_pairwise_reconstruction_errors) /
                                             (scalar @all_pairwise_reconstruction_errors);
    my $dist_bet_means_based_normalizer = (reduce {$a + $b} @all_dist_between_means_errors ) /
                                             (scalar @all_dist_between_means_errors );
    die "\n\nBailing out!\n" .
        "Dealing with highly defective clusters. Try again.\n"
        if ($recon_error_normalizer == 0) || ($dist_bet_means_based_normalizer == 0);
    # Convert the raw dissimilarity values into similarities in (0,1]:
    foreach my $i (0..$num_nodes-1) {
        foreach my $j (0..$num_nodes-1) {
            my $recon_val = $W_reconstruction_error_based->get_elem($i,$j);
            my $new_recon_val = exp( -1.0 * $recon_val / $recon_error_normalizer );
            $W_reconstruction_error_based->set_elem($i,$j,$new_recon_val);
            my $mean_dist_val = $W_dist_between_means_based->get_elem($i,$j);
            my $new_mean_dist_val = exp( -1.0 * $mean_dist_val / $dist_bet_means_based_normalizer );
            $W_dist_between_means_based->set_elem($i,$j,$new_mean_dist_val);
        }
    }
    $W = $W_reconstruction_error_based + $W_dist_between_means_based;
    if ($self->{_debug}) {
        print "\nDisplaying the similarity matrix W for the cluster graph:\n";
        # note: displayed only when BOTH _debug and _terminal_output are set
        display_matrix($W) if $self->{_terminal_output};
    }
    # Node degree = sum of that node's row of W (computed here by summing columns
    # into a single column vector):
    my $add_all_columns = Math::GSL::Matrix->new($num_nodes,1);
    foreach my $col (0..$num_nodes-1) {
        $add_all_columns += $W->col($col);
    }
    foreach my $i (0..$num_nodes-1) {
        $D->set_elem($i,$i, $add_all_columns->get_elem($i,0));
        $neg_sqrt_of_D->set_elem($i,$i, 1.0 / sqrt($add_all_columns->get_elem($i,0)));
    }
    # the Laplacian matrix:
    my $Laplacian = $D - $W;
    # the Symmetric Normalized Laplacian matrix A:  A = D^(-1/2) (D - W) D^(-1/2)
    my $A = $neg_sqrt_of_D * $Laplacian * $neg_sqrt_of_D;
    # Suppress small numerical noise so near-zero entries are exactly zero:
    foreach my $i (0..$num_nodes-1) {
        foreach my $j (0..$num_nodes-1) {
            $A->set_elem($i,$j,0) if abs($A->get_elem($i,$j)) < 0.01;
        }
    }
    if ($self->{_terminal_output}) {
        print "\nDisplaying the Symmetric Normalized Laplacian matrix A:\n" .
              "A = neg_sqrt(D) * Laplacian_matrix * neg_sqrt(D)\n";
        display_matrix( $A );
    }
    my ($eigenvalues, $eigenvectors) = $A->eigenpair;
    my $num_of_eigens = @$eigenvalues;
    my $largest_eigen_index = 0;
    my $smallest_eigen_index = 0;
    if ($self->{_debug2}) {
        print "Eigenvalue 0: $eigenvalues->[0]\n";
        foreach my $i (1..$num_of_eigens-1) {
            $largest_eigen_index = $i if $eigenvalues->[$i] > $eigenvalues->[$largest_eigen_index];
            $smallest_eigen_index = $i if $eigenvalues->[$i] < $eigenvalues->[$smallest_eigen_index];
            print "Eigenvalue $i: $eigenvalues->[$i]\n";
        }
        print "\nlargest eigen index: $largest_eigen_index\n";
        print "\nsmallest eigen index: $smallest_eigen_index\n\n";
    }
    # The eigenvector elements come back as strings of the form "[mag,theta]" —
    # presumably a polar rendering of complex values (TODO confirm against the
    # Math::GSL version in use).  theta "0" means +mag, theta "pi" means -mag;
    # anything else indicates a genuinely complex component, which is fatal here.
    my @all_my_eigenvecs;
    foreach my $i (0..$num_of_eigens-1) {
        my @vec = $eigenvectors->[$i]->as_list;
        my @eigenvec;
        foreach my $ele (@vec) {
            my ($mag, $theta) = $ele =~ /\[(\d*\.?\d*e?[+-]?\d*),(\S+)\]/;
            if ($theta eq "0") {
                push @eigenvec, $mag;
            } elsif ($theta eq "pi") {
                push @eigenvec, -1.0 * $mag;
            } else {
                die "Eigendecomposition produced a complex eigenvector!";
            }
        }
        print "Eigenvector $i: @eigenvec\n" if $self->{_debug2};
        push @all_my_eigenvecs, \@eigenvec;
    }
    if ($self->{_debug2}) {
        my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list;
        print "\nLargest eigenvector of A: @largest_eigen_vec\n";
    }
    # Sort the eigenvectors by eigenvalue, largest first:
    my @sorted_eigenvec_indexes = sort {$eigenvalues->[$b] <=> $eigenvalues->[$a]} 0..@all_my_eigenvecs-1;
    print "sorted eigenvec indexes for A: @sorted_eigenvec_indexes\n" if $self->{_debug2};
    my @sorted_eigenvecs;
    my @sorted_eigenvals;
    foreach my $i (0..@sorted_eigenvec_indexes-1) {
        $sorted_eigenvecs[$i] = $all_my_eigenvecs[$sorted_eigenvec_indexes[$i]];
        $sorted_eigenvals[$i] = $eigenvalues->[$sorted_eigenvec_indexes[$i]];
    }
    if ($self->{_debug2}) {
        print "\nHere come sorted eigenvectors for A --- from the largest to the smallest:\n";
        foreach my $i (0..@sorted_eigenvecs-1) {
            print "eigenvec: @{$sorted_eigenvecs[$i]}       eigenvalue: $sorted_eigenvals[$i]\n";
        }
    }
    # Second-to-last entry in a descending sort = eigenvector of the
    # second-smallest eigenvalue, i.e. the Fiedler vector used for the cut:
    my $best_partitioning_eigenvec = $sorted_eigenvecs[@sorted_eigenvec_indexes-2];
    print "\nBest graph partitioning eigenvector: @$best_partitioning_eigenvec\n" if $self->{_terminal_output};
    # The signs of the Fiedler vector's components define the two sides of the cut:
    my $how_many_positive = reduce {$a + $b} map {$_ > 0 ? 1 : 0 } @$best_partitioning_eigenvec;
    my $how_many_negative = scalar(@$best_partitioning_eigenvec) - $how_many_positive;
    print "Have $how_many_positive positive and $how_many_negative negative elements in the partitioning vec\n"
        if $self->{_terminal_output};
    if ($how_many_clusters_looking_for <= 3) {
        # Near the end of the search, peel off one finalized cluster per call:
        my @merged_cluster;
        my $final_cluster;
        my @newclusters;
        if ($how_many_positive == 1) {
            # Exactly one node on the positive side: it alone becomes a final cluster.
            foreach my $i (0..@$clusters-1) {
                if ($best_partitioning_eigenvec->[$i] > 0) {
                    $final_cluster = $clusters->[$i];
                } else {
                    push @newclusters, $clusters->[$i];
                }
            }
            return ([$final_cluster], \@newclusters);
        } elsif ($how_many_negative == 1) {
            # Symmetric case: lone node on the negative side becomes final.
            foreach my $i (0..@$clusters-1) {
                if ($best_partitioning_eigenvec->[$i] < 0) {
                    $final_cluster = $clusters->[$i];
                } else {
                    push @newclusters, $clusters->[$i];
                }
            }
            return ([$final_cluster], \@newclusters);
        } elsif ($how_many_positive <= $self->{_cluster_search_multiplier}) {
            # A small positive side: merge its sub-clusters into one final cluster.
            foreach my $i (0..@$clusters-1) {
                if ($best_partitioning_eigenvec->[$i] > 0) {
                    push @merged_cluster, @{$clusters->[$i]};
                } else {
                    push @newclusters, $clusters->[$i];
                }
            }
            return ([\@merged_cluster], \@newclusters);
        } elsif ($how_many_negative <= $self->{_cluster_search_multiplier}) {
            # A small negative side: merge its sub-clusters into one final cluster.
            foreach my $i (0..@$clusters-1) {
                if ($best_partitioning_eigenvec->[$i] < 0) {
                    push @merged_cluster, @{$clusters->[$i]};
                } else {
                    push @newclusters, $clusters->[$i];
                }
            }
            return ([\@merged_cluster], \@newclusters);
        } else {
            die "\n\nBailing out!\n\n" .
                "No consensus support for dominant clusters in the graph partitioning step\n" .
                "of the algorithm. This can be caused by bad random selection of initial\n" .
                "cluster centers.  Please run this program again.\n";
        }
    } else {
        # Early in the search: simply split the node set along the sign of the
        # Fiedler vector and let the caller recurse on both halves.
        my @positive_clusters;
        my @negative_clusters;
        foreach my $i (0..@$clusters-1) {
            if ($best_partitioning_eigenvec->[$i] > 0) {
                push @positive_clusters, $clusters->[$i];
            } else {
                push @negative_clusters, $clusters->[$i];
            }
        }
        return (\@positive_clusters, \@negative_clusters);
    }
}
767              
# Computes two raw dissimilarity measures for a pair of sub-clusters:
#   (1) the mutual reconstruction error: the total error incurred when the
#       data elements of cluster 1 are reconstructed with cluster 2's subspace
#       (its trailing-eigenvector matrix and reference vector) plus the error
#       of reconstructing cluster 2's elements with cluster 1's subspace; and
#   (2) the squared Euclidean distance between the two cluster reference
#       (mean) vectors.
# Both values are returned unnormalized; graph_partition() normalizes them and
# maps them into similarities with exp(-x / normalizer).
sub pairwise_cluster_similarity {
    my $self = shift;
    my $cluster1 = shift;                           # ref to list of data tags
    my $trailing_eigenvec_matrix_cluster1 = shift;  # Math::GSL::Matrix
    my $reference_vec_cluster1 = shift;             # cluster-1 mean (GSL column)
    my $cluster2 = shift;
    my $trailing_eigenvec_matrix_cluster2 = shift;
    my $reference_vec_cluster2 = shift;
    my $total_reconstruction_error_in_this_iteration = 0;
    # Error of explaining cluster 1's data with cluster 2's subspace:
    my @errors_for_1_on_2 = map {my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
                                 $data_vec->set_col(0, $self->{_data_hash}->{$_});
                                 $self->reconstruction_error($data_vec,
                                                             $trailing_eigenvec_matrix_cluster2,
                                                             $reference_vec_cluster2)}
                            @$cluster1;
    # And the symmetric counterpart — cluster 2's data against cluster 1's subspace:
    my @errors_for_2_on_1 = map {my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
                                 $data_vec->set_col(0, $self->{_data_hash}->{$_});
                                 $self->reconstruction_error($data_vec,
                                                             $trailing_eigenvec_matrix_cluster1,
                                                             $reference_vec_cluster1)}
                            @$cluster2;
    # NOTE(review): with a single-element list, reduce returns that element
    # without abs() being applied — harmless here because reconstruction_error()
    # returns a squared (non-negative) magnitude.
    my $type_1_error = reduce {abs($a) + abs($b)} @errors_for_1_on_2;
    my $type_2_error = reduce {abs($a) + abs($b)} @errors_for_2_on_1;
    my $total_reconstruction_error = $type_1_error + $type_2_error;
    # Squared distance between the means, via the 1x1 product d' * d:
    my $diff_between_the_means = $reference_vec_cluster1 - $reference_vec_cluster2;
    my $dist_squared = transpose($diff_between_the_means) * $diff_between_the_means;
    my @dist_squared_as_list = $dist_squared->as_list();
    my $dist_between_means_based_error = shift @dist_squared_as_list;
    return ($total_reconstruction_error, $dist_between_means_based_error);
}
798              
# delta ball
# Measures how unimodal (blob-like) a cluster is: the fraction of the cluster's
# elements whose coordinate along EVERY dimension falls inside a "delta ball"
# centered at the cluster mean, whose half-width along a dimension is
# 0.4 * _scale_factor * (per-dimension coordinate range of the cluster).
# Returns a value in [0,1]; 1 means every element is near the mean in all dims.
sub cluster_unimodality_quotient {
    my ($self, $cluster, $mean) = @_;
    my $delta = 0.4 * $self->{_scale_factor};   # delta-ball radius multiplier per dimension
    my @mean_coords = $mean->as_list;
    my @tags_in_range_per_dimension;
    foreach my $dimen (0 .. $self->{_data_dimensions} - 1) {
        my @coords = map { $self->{_data_hash}->{$_}->[$dimen] } @$cluster;
        my $lo = List::Util::min(@coords);
        my $hi = List::Util::max(@coords);
        my $range = $hi - $lo;
        my $center = $mean_coords[$dimen];
        # Keep the tags whose coordinate along this dimension lies strictly
        # inside (center - delta*range, center + delta*range):
        my @tags_in_range =
            grep { $_ }
            map  {
                my $coord = $self->{_data_hash}->{$_}->[$dimen];
                ($coord > $center - $delta * $range && $coord < $center + $delta * $range) ? $_ : undef
            } @$cluster;
        push @tags_in_range_per_dimension, \@tags_in_range;
    }
    # Intersect the per-dimension tag sets; a tag survives only if it was in
    # range along every dimension.
    my %common_tags = map { $_ => 1 } @{ $tags_in_range_per_dimension[0] };
    foreach my $dimen (1 .. $self->{_data_dimensions} - 1) {
        my %tags_this_dimen = map { $_ => 1 } @{ $tags_in_range_per_dimension[$dimen] };
        %common_tags = map { $_ => 1 } grep { $tags_this_dimen{$_} } keys %common_tags;
    }
    return scalar(keys %common_tags) / scalar(@$cluster);
}
835              
# Searches for the reference vector that, together with the supplied trailing
# eigenvectors, minimizes the total reconstruction error over all the data.
# Candidates are the cluster mean plus 100 points drawn uniformly at random
# from the cluster's per-dimension bounding box.  Returns a ref to the winning
# candidate's coordinate list.
sub find_best_ref_vector {
    my $self = shift;
    my $cluster = shift;                    # ref to a list of data tags
    my $trailing_eigenvec_matrix = shift;   # Math::GSL::Matrix of trailing eigenvectors
    my $mean = shift;                       # a GSL matrix ref (the cluster mean)
    # Establish the per-dimension bounding box of the cluster so candidates can
    # be drawn uniformly from inside it:
    my @min_bounds;
    my @max_bounds;
    my @ranges;
    foreach my $dimen (0..$self->{_data_dimensions}-1) {
        my @values = map {$_->[$dimen]} map {$self->{_data_hash}->{$_}} @$cluster;
        my ($min, $max) = (List::Util::min(@values), List::Util::max(@values));
        push @min_bounds, $min;
        push @max_bounds, $max;
        push @ranges, $max - $min;
    }
    print "min bounds are: @min_bounds\n";
    print "max bounds are: @max_bounds\n";
    my $max_iterations = 100;
    my @random_points;
    my $iteration = 0;
    while ($iteration++ < $max_iterations) {
        my @coordinate_vec;
        foreach my $dimen (0..$self->{_data_dimensions}-1) {
            push @coordinate_vec, $min_bounds[$dimen] + rand($ranges[$dimen]);
        }
        push @random_points, \@coordinate_vec;
    }
    if ($self->{_debug}) {
        print "\nrandom points\n";
        map {print "@$_\n"} @random_points;
    }
    # The cluster mean is candidate 0 and competes with the random draws:
    my @mean = $mean->as_list;
    unshift @random_points, \@mean;
    # Score every candidate by the total reconstruction error it induces over
    # all the data points:
    my @reconstruction_errors;
    foreach my $candidate_ref_vec (@random_points) {
        my $ref_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
        $ref_vec->set_col(0, $candidate_ref_vec);
        my $reconstruction_error_for_a_ref_vec = 0;
        foreach my $data_tag (@{$self->{_data_tags}}) {
            my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
            $data_vec->set_col(0, $self->{_data_hash}->{$data_tag});
            my $error = $self->reconstruction_error($data_vec,$trailing_eigenvec_matrix,$ref_vec);
            $reconstruction_error_for_a_ref_vec += $error;
        }
        push @reconstruction_errors, $reconstruction_error_for_a_ref_vec;
    }
    # BUG FIX: the previous version located the winner with List::Util::first(),
    # which returns the matching VALUE, not its positional index, so an error
    # magnitude was being used as an index into @random_points.  It had also
    # shifted the mean's error off @reconstruction_errors while the mean was
    # still present in @random_points, misaligning the two arrays by one.  We
    # now use the minimum() helper (which returns value AND index) over the
    # full, aligned candidate list — the mean included.
    my ($min_error, $min_index) = minimum(\@reconstruction_errors);
    my $refvec = $random_points[$min_index];
    return $refvec;
}
889              
## The reconstruction error relates to the size of the perpendicular from a data
## point X to the hyperplane that defines a given subspace on the manifold.
## Computed as the quadratic form (X - ref)' * E * E' * (X - ref), where E is
## the matrix of trailing eigenvectors; the result is a squared magnitude and
## therefore non-negative.
sub reconstruction_error {
    my ($self, $data_vec, $trailing_eigenvecs, $ref_vec) = @_;
    # Shift the data point to the subspace's reference frame, then project it
    # onto the trailing (out-of-subspace) directions:
    my $shifted = $data_vec - $ref_vec;
    my $error_matrix = transpose($shifted) * $trailing_eigenvecs *
                       transpose($trailing_eigenvecs) * $shifted;
    # The product is a 1x1 matrix; unwrap it into a plain scalar:
    my ($error_squared_as_scalar) = $error_matrix->as_list();
    return $error_squared_as_scalar;
}
903              
# Returns a set of KM random integers. These serve as indices to reach into the data
# array. A data element whose index is one of the random numbers returned by this
# routine serves as an initial cluster center. Note the quality check it runs on the
# list of the random integers constructed. We first make sure that all the random
# integers returned are different. Subsequently, we carry out a quality assessment
# of the random integers constructed. This quality measure consists of the ratio of
# the values spanned by the random integers to the value of N, the total number of
# data points to be clustered. Currently, if this ratio is less than 0.3, we discard
# the K integers and try again.
sub initialize_cluster_centers {
    my $self = shift;
    my $K = shift; # This value is set to the parameter KM in the call to this subroutine
    my $data_store_size = shift;
    # Guard against an impossible request, which previously would have looped forever:
    die "initialize_cluster_centers: cannot pick $K distinct indices " .
        "out of $data_store_size data elements\n" if $K > $data_store_size;
    my @cluster_center_indices;
    while (1) {
        # BUG FIX: the old duplicate-resolution logic re-drew index $i when it
        # collided with an earlier index $j, but never re-checked the fresh draw
        # against the indices at positions 0..$j-1, so duplicates could survive.
        # Drawing through a %seen hash guarantees all K indices are distinct.
        my %seen;
        @cluster_center_indices = ();
        while (@cluster_center_indices < $K) {
            my $candidate = int rand($data_store_size);
            next if $seen{$candidate}++;
            push @cluster_center_indices, $candidate;
        }
        # Quality check: only accept draws whose values span more than 30% of
        # the data store, so the initial centers are well spread out.
        my $min = List::Util::min(@cluster_center_indices);
        my $max = List::Util::max(@cluster_center_indices);
        my $quality = ($max - $min) / $data_store_size;
        last if $quality > 0.3;
    }
    return @cluster_center_indices;
}
935              
# The purpose of this routine is to form initial clusters by assigning the data
# samples to the initial clusters formed by the previous routine on the basis of the
# best proximity of the data samples to the different cluster centers.
# Returns a ref to a list of clusters, each cluster a list of data tags.
sub assign_data_to_clusters_initial {
    my ($self, $centers_ref) = @_;
    my @cluster_centers = @$centers_ref;
    my @clusters;
    foreach my $data_tag (@{$self->{_data_tags}}) {
        # Distance from this element to every center; minimum() returns
        # (value, index) and the index picks the winning cluster.
        my @distances_to_centers = map { $self->distance($data_tag, $_) } @cluster_centers;
        my (undef, $winning_center_index) = minimum(\@distances_to_centers);
        push @{ $clusters[$winning_center_index] }, $data_tag;
    }
    return \@clusters;
}
954              
# The following routine is for computing the distance between a data point specified
# by its symbolic name in the master datafile and a point (such as the center of a
# cluster) expressed as a vector of coordinates:
sub distance {
    my ($self, $ele1_id, $point_ref) = @_;   # symbolic name, then a coordinate arrayref
    my @ele1 = @{$self->{_data_hash}->{$ele1_id}};
    my @ele2 = @$point_ref;
    die "wrong data types for distance calculation\n" if @ele1 != @ele2;
    # Plain Euclidean distance between the two coordinate vectors:
    my $squared_sum = 0;
    $squared_sum += ($ele1[$_] - $ele2[$_]) ** 2 for 0 .. $#ele1;
    return sqrt $squared_sum;
}
972              
# Writes each cluster to its own file, cluster0.txt, cluster1.txt, ..., with
# the cluster's data tags sorted and space-separated on a single line.  Any
# cluster*.txt files from a previous run are removed first.
sub write_clusters_to_files {
    my $self = shift;
    my $clusters = shift;   # ref to a list of clusters, each a list of data tags
    my @clusters = @$clusters;
    unlink glob "cluster*.txt";
    foreach my $i (0..@clusters-1) {
        my $filename = "cluster" . $i . ".txt";
        print "\nWriting cluster $i to file $filename\n" if $self->{_terminal_output};
        # BUG FIX: the old code piped the output through the external `sort'
        # utility, but it printed every tag on one single line, so the line
        # sorter never actually reordered anything.  It also used a bareword
        # filehandle and an implicit shell.  We now sort the tags in Perl and
        # write them through a lexical filehandle with a 3-argument open.
        open my $fh, '>', $filename or die "Unable to open file: $!";
        print {$fh} join(' ', sort @{$clusters[$i]}), "\n";
        close $fh or die "Unable to close file: $!";
    }
}
988              
# Destructor: removes the "__temp_<datafile-basename>.txt" scratch file that
# the visualization code derives from the input datafile's name.
sub DESTROY {
    my $self = shift;
    my $temp_name = basename($self->{_datafile});
    $temp_name =~ s/\.\w+$/\.txt/;   # swap the datafile's extension for .txt
    unlink "__temp_" . $temp_name;
}
994              
995             ################################## Visualization Code ###################################
996              
# Sums, component by component, the coordinate vectors of the named data
# elements.  Returns a ref to the resulting coordinate list (all zeros when the
# id list is empty).
sub add_point_coords {
    my ($self, $ids_ref) = @_;
    my @ids = @$ids_ref;             # array of data element names
    my $dimensionality = $self->{_data_dimensions};
    my @component_sums = (0.0) x $dimensionality;
    foreach my $id (@ids) {
        my $coords = $self->{_data_hash}->{$id};
        my $axis = 0;
        $component_sums[$axis++] += $_ for @$coords;
    }
    return \@component_sums;
}
1015              
# This is the main module version:
# Renders the clusters as colored point sets on (and around) a reference sphere
# with gnuplot.  The cluster data is first written to a temp file in gnuplot's
# multi-block format (one "index" block per cluster), then plotted together
# with a parametric sphere of radius _scale_factor (or 1).  When
# $hardcopy_format is supplied, the plot is saved to an image file instead of
# (only) being shown; when $pause_time is supplied, the window is non-persistent
# and gnuplot pauses for that many seconds.
sub visualize_clusters_on_sphere {
    my $self = shift;
    my $visualization_msg = shift;          # plot title, also used for the hardcopy file name
    # deep copy because the cluster tags are rewritten in place below:
    my $clusters = deep_copy_AoA(shift);
    my $hardcopy_format = shift;            # e.g. "png"; false/undef means screen only
    my $pause_time = shift;                 # optional pause; also controls window persistence
    my $d = $self->{_data_dimensions};
    my $temp_file = "__temp_" . $self->{_datafile};
    $temp_file =~ s/\.\w+$/\.txt/;
    unlink $temp_file if -e $temp_file;
    open OUTPUT, ">$temp_file"
        or die "Unable to open a temp file in this directory: $!";
    # Relabel every cluster with a single letter A, B, C, ... and splice the
    # point's coordinates into its tag.  Note: inside the s///e replacement,
    # $_ still holds the ORIGINAL tag, so the data_hash lookup works.
    my @all_tags = "A".."Z";
    my @retagged_clusters;
    foreach my $cluster (@$clusters) {
        my $label = shift @all_tags;
        my @retagged_cluster =
            map {$_ =~ s/^(\w+?)_(\w+)/$label . "_$2 @{$self->{_data_hash}->{$_}}"/e;$_} @$cluster;
        push @retagged_clusters, \@retagged_cluster;
    }
    # Regroup the retagged records by their one-letter cluster label:
    my %clusters;
    foreach my $cluster (@retagged_clusters) {
        foreach my $record (@$cluster) {
            my @splits = grep $_, split /\s+/, $record;
            $splits[0] =~ /(\w+?)_.*/;
            my $primary_cluster_label = $1;
            my @coords = @splits[1..$d];
            push @{$clusters{$primary_cluster_label}}, \@coords;
        }
    }
    # Write one gnuplot data block per cluster; blocks are separated by two
    # blank lines so they can be addressed with "index $i" in the plot command:
    foreach my $key (sort {"\L$a" cmp "\L$b"} keys %clusters) {
        map {print OUTPUT "$_"} map {"@$_\n"} @{$clusters{$key}};
        print OUTPUT "\n\n";
    }
    my @sorted_cluster_keys = sort {"\L$a" cmp "\L$b"} keys %clusters;
    close OUTPUT;
    my $plot;
    # Without a pause time, keep the window alive after the script exits:
    unless (defined $pause_time) {
        $plot = Graphics::GnuplotIF->new( persist => 1 );
    } else {
        $plot = Graphics::GnuplotIF->new();
    }
    my $arg_string = "";
    $plot->gnuplot_cmd( "set hidden3d" ) unless $self->{_show_hidden_in_3D_plots};
    $plot->gnuplot_cmd( "set title \"$visualization_msg\"" );
    $plot->gnuplot_cmd( "set noclip" );
    $plot->gnuplot_cmd( "set pointsize 2" );
    $plot->gnuplot_cmd( "set parametric" );
    $plot->gnuplot_cmd( "set size ratio 1" );
    $plot->gnuplot_cmd( "set xlabel \"X\"" );
    $plot->gnuplot_cmd( "set ylabel \"Y\"" );
    $plot->gnuplot_cmd( "set zlabel \"Z\"" );
    if ($hardcopy_format) {
        $plot->gnuplot_cmd( "set terminal png" );
        my $image_file_name = "$visualization_msg\.$hardcopy_format";
        $plot->gnuplot_cmd( "set output \"$image_file_name\"" );
        $plot->gnuplot_cmd( "unset hidden3d" );
    }
    # set the range for azimuth angles:
    $plot->gnuplot_cmd( "set urange [0:2*pi]" );
    # set the range for the elevation angles:
    $plot->gnuplot_cmd( "set vrange [-pi/2:pi/2]" );
    # Parametric functions for the sphere
    # $plot->gnuplot_cmd( "r=1" );
    # Sphere radius: use the data's scale factor when it has been set:
    if ($self->{_scale_factor}) {
        $plot->gnuplot_cmd( "r=$self->{_scale_factor}" );
    } else {
        $plot->gnuplot_cmd( "r=1" );
    }
    $plot->gnuplot_cmd( "fx(v,u) = r*cos(v)*cos(u)" );
    $plot->gnuplot_cmd( "fy(v,u) = r*cos(v)*sin(u)" );
    $plot->gnuplot_cmd( "fz(v) = r*sin(v)" );
    my $sphere_arg_str = "fx(v,u),fy(v,u),fz(v) notitle with lines lt 0,";
    foreach my $i (0..scalar(keys %clusters)-1) {
        my $j = $i + 1;
        # The following statement puts the titles on the data points
        $arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"$sorted_cluster_keys[$i] \" with points lt $j pt $j, ";
    }
    # Strip the trailing ", " from the accumulated plot arguments.
    # NOTE(review): this first assigns the boolean result of the match to
    # $arg_string and then overwrites it with $1; it works because $1 is set by
    # the match, but a plain s/,[ ]+$// would be clearer.
    $arg_string = $arg_string =~ /^(.*),[ ]+$/;
    $arg_string = $1;
    $plot->gnuplot_cmd( "splot $sphere_arg_str $arg_string" );
    $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
}
1100              
1101              
1102             ################################### Support Routines ########################################
1103              
# Splits a cluster (a ref to a list of data tags) into $how_many contiguous
# fragments of roughly equal size and returns the list of fragment refs.
# Dies when the cluster is too small to be split that many ways, or when the
# fragmentation fails to account for every element.
sub cluster_split {
    my $cluster = shift;
    my $how_many = shift;
    # BUG FIX: with fewer elements than fragments the old code divided by
    # zero below; fail with a meaningful message instead.
    die "cluster_split: cannot split " . scalar(@$cluster) .
        " elements into $how_many fragments\n"
        if $how_many < 1 or @$cluster < $how_many;
    my @cluster_fragments = map { [] } 0 .. $how_many - 1;
    my $delta = int( scalar(@$cluster) / $how_many );
    foreach my $i (0..@$cluster-1) {
        my $slot = int($i / $delta);
        # BUG FIX: when the cluster size is not an exact multiple of $how_many,
        # int($i/$delta) could reach $how_many, silently growing an extra
        # fragment beyond the requested count.  Clamp the remainder elements
        # into the last requested fragment.
        $slot = $how_many - 1 if $slot > $how_many - 1;
        push @{$cluster_fragments[$slot]}, $cluster->[$i];
    }
    # Sanity check: every element must land in exactly one fragment.
    my $how_many_accounted_for = reduce {$a + $b} map {scalar(@$_)} @cluster_fragments;
    foreach my $frag (@cluster_fragments) {
        print "fragment: @$frag\n";
    }
    die "the fragmentation could not account for all the data"
        unless @$cluster == $how_many_accounted_for;
    return @cluster_fragments;
}
1124              
# Scans the array referenced by the argument and returns a two-element
# list: the smallest value found and the position at which it first occurs.
sub minimum {
    my $values = shift;
    my ($best, $best_pos);
    my $pos = 0;
    for my $val (@$values) {
        ($best, $best_pos) = ($val, $pos) if !defined($best) || $val < $best;
        $pos++;
    }
    return ($best, $best_pos);
}
1138              
# Returns the (smallest, largest) values in the array referenced by the
# argument, skipping any undef entries.  Returns (undef, undef) when no
# defined element exists.
sub minmax {
    my $values = shift;
    my ($lo, $hi);
    for my $val (@$values) {
        next unless defined $val;
        if (!defined $lo) {
            # First defined element seeds both bounds.
            ($lo, $hi) = ($val, $val);
        } elsif ($val < $lo) {
            $lo = $val;
        } elsif ($val > $hi) {
            $hi = $val;
        }
    }
    return ($lo, $hi);
}
1156              
# Meant only for constructing a deep copy of an array of arrays: the copy
# shares no element references with the original, so mutating one never
# affects the other.
sub deep_copy_AoA {
    my $src = shift;
    my $dst;
    my $row_idx = 0;
    for my $row (@$src) {
        my $col_idx = 0;
        for my $cell (@$row) {
            $dst->[$row_idx][$col_idx] = $cell;
            $col_idx++;
        }
        $row_idx++;
    }
    return $dst;
}
1168              
# For displaying the individual clusters on a terminal screen.  Each cluster
# is shown through the symbolic names of its data points.  Note: each
# cluster array is sorted IN PLACE (string sort) as a side effect.
sub display_clusters {
    my $clusters_ref = shift;
    my $cluster_index = 0;
    for my $cluster (@$clusters_ref) {
        @$cluster = sort @$cluster;
        printf "\n\nCluster %d (%d records):\n", $cluster_index, scalar @$cluster;
        print " $_" for @$cluster;
        $cluster_index++;
    }
    print "\n\n";
}
1185              
# Convenience wrapper: pretty-prints a mean vector followed by its
# covariance matrix, delegating the formatting to display_matrix().
sub display_mean_and_covariance {
    my ($mean, $covariance) = @_;
    print "\nDisplay the mean:\n";
    display_matrix($mean);
    print "\nDisplay the covariance:\n";
    display_matrix($covariance);
}
1194              
#  Verifies that every keyword argument handed to the constructor is one of
#  the legal parameter names.  Returns 1 when all supplied parameters are
#  legal --- including the no-arguments case, which the previous version
#  mishandled by returning undef (making the caller's `== 0` test croak on a
#  perfectly empty argument list) --- and 0 at the first unknown parameter.
sub check_for_illegal_params {
    my @params = @_;
    my @legal_params = qw / datafile
                            mask
                            K
                            P
                            terminal_output
                            cluster_search_multiplier
                            max_iterations
                            delta_reconstruction_error
                            visualize_each_iteration
                            show_hidden_in_3D_plots
                            make_png_for_each_iteration
                            debug
                          /;
    foreach my $param (@params) {
        # Reject as soon as a parameter matches no legal name.
        return 0 unless any { $param eq $_ } @legal_params;
    }
    return 1;
}
1223              
# Pretty-prints a Math::GSL::Matrix, one row per line, each entry formatted
# to four decimal places.
sub display_matrix {
    my $matrix = shift;
    my $nrows = $matrix->rows();
    my $ncols = $matrix->cols();
    print "\nDisplaying a matrix of size $nrows rows and $ncols columns:\n";
    for my $row_index (0 .. $nrows - 1) {
        my @entries = $matrix->row($row_index)->as_list;
        printf("%.4f ", $_) for @entries;
        print "\n";
    }
    print "\n\n";
}
1238              
# Returns a new Math::GSL::Matrix that is the transpose of the argument:
# row i of the input becomes column i of the result.
sub transpose {
    my $matrix = shift;
    my $nrows = $matrix->rows();
    my $ncols = $matrix->cols();
    my $result = Math::GSL::Matrix->new($ncols, $nrows);
    for my $row_index (0 .. $nrows - 1) {
        my @row_values = $matrix->row($row_index)->as_list;
        $result->set_col($row_index, \@row_values);
    }
    return $result;
}
1250              
# Element-wise difference of two equal-length vectors (arrayrefs); the
# result is returned as a plain list.  Dies on a length mismatch.
sub vector_subtract {
    my ($vec1, $vec2) = @_;
    die "wrong data types for vector subtract calculation\n" if @$vec1 != @$vec2;
    return map { $vec1->[$_] - $vec2->[$_] } 0 .. $#$vec1;
}
1261              
# In-place random permutation of the array referenced by $arr, using the
# classic Fisher-Yates algorithm (from the Perl documentation).
sub fisher_yates_shuffle {
    my $arr = shift;
    for (my $i = $#$arr; $i > 0; $i--) {
        my $swap_with = int rand($i + 1);
        ($arr->[$i], $arr->[$swap_with]) = ($arr->[$swap_with], $arr->[$i]);
    }
}
1271              
1272             ######################### Generating Synthetic Data for Manifold Clustering ##########################
1273              
1274             ################################## Class DataGenerator ########################################
1275              
1276             ## The embedded class defined below is for generating synthetic data for
1277             ## experimenting with linear manifold clustering when the data resides on the
1278             ## surface of a sphere. See the script generate_data_on_a_sphere.pl in the
1279             ## `examples' directory for how to specify the number of clusters and the spread of
1280             ## each cluster in the data that is generated.
1281              
1282             package DataGenerator;
1283              
1284             use strict;
1285             use Carp;
1286              
#  Constructor for the synthetic-data generator, which writes Gaussian
#  clusters on the surface of the unit sphere to a CSV file.
#
#  Recognized keyword parameters:
#    output_file                     (required) name of the CSV output file
#    total_number_of_samples_needed  (required) total points over all clusters
#    number_of_clusters_on_sphere    default 3
#    cluster_width                   default 0.1
#    show_hidden_in_3D_plots         default 1; pass 0 to enable hidden-line removal
#    debug                           default 0
sub new {
    my ($class, %args) = @_;
    my @params = keys %args;
    croak "\nYou have used a wrong name for a keyword argument " .
          "--- perhaps a misspelling\n"
          if _check_for_illegal_params3(@params) == 0;
    bless {
        _output_file                    =>  $args{output_file}
                       || croak("name for output_file required"),
        _total_number_of_samples_needed =>  $args{total_number_of_samples_needed}
                       || croak("total_number_of_samples_needed required"),
        _number_of_clusters_on_sphere   =>  $args{number_of_clusters_on_sphere} || 3,
        _cluster_width                  =>  $args{cluster_width} || 0.1,
        # defined-or (not ||) so a caller-supplied 0 is honored; with the old
        # `|| 1` it was impossible to turn this flag off, even though the POD
        # synopsis shows show_hidden_in_3D_plots => 0.
        _show_hidden_in_3D_plots        =>  $args{show_hidden_in_3D_plots} // 1,
        _debug                          =>  $args{debug} || 0,
    }, $class;
}
1304              
#  Verifies the keyword arguments for the DataGenerator constructor.
#  Returns 1 when every supplied parameter is a legal name (vacuously true
#  for an empty list --- the previous flag-juggling version returned undef
#  in that case) and 0 at the first unknown parameter.  `debug' is included
#  in the legal list because new() reads $args{debug}; previously passing
#  debug => 1 caused a spurious croak.
sub _check_for_illegal_params3 {
    my @params = @_;
    my @legal_params = qw / output_file
                            total_number_of_samples_needed
                            number_of_clusters_on_sphere
                            cluster_width
                            show_hidden_in_3D_plots
                            debug
                          /;
    foreach my $param (@params) {
        # Reject as soon as a parameter matches no legal name.
        return 0 unless grep { $param eq $_ } @legal_params;
    }
    return 1;
}
1326              
1327             ## We first generate a set of points randomly on the unit sphere --- the number of
1328             ## points being equal to the number of clusters desired. These points will serve as
1329             ## cluster means (or, as cluster centroids) subsequently when we ask
1330             ## Math::Random::random_multivariate_normal($N, @m, @covar) to return $N number of
1331             ## points on the sphere. The second argument is the cluster mean and the third
1332             ## argument the cluster covariance. For the synthetic data, we set the cluster
1333             ## covariance to a 2x2 diagonal matrix, with the (0,0) element corresponding to the
1334             ## variance along the azimuth direction and the (1,1) element corresponding to the
1335             ## variance along the elevation direction.
1336             ##
1337             ## When you generate the points in the 2D spherical coordinates of
1338             ## (azimuth,elevation), you also need `wrap-around' logic for those points yielded by
1339             ## the multivariate-normal function whose azimuth angle is outside the interval
1340             ## (0,360) and/or whose elevation angle is outside the interval (-90,90).
1341             ##
1342             ## Note that the first of the two dimensions for which the multivariate-normal
1343             ## function returns the points is for the azimuth angle and the second for the
1344             ## elevation angle.
1345             ##
1346             ## With regard to the relationship of the Cartesian coordinates to the spherical
1347             ## (azimuth, elevation) coordinates, we assume that (x,y) is the horizontal plane
## and z the vertical axis. The elevation angle theta is measured with respect to
1349             ## the XY-plane. The highest point on the sphere (the Zenith) corresponds to the
1350             ## elevation angle of +90 and the lowest points on the sphere (the Nadir)
## corresponds to the elevation angle of -90. The azimuth is measured with respect
## to the X-axis. The range of the azimuth is from 0 to 360 degrees. The elevation is
1353             ## measured from the XY plane and its range is (-90,90) degrees.
## Generates $K Gaussian clusters in (azimuth, elevation) coordinates on the
## unit sphere, wraps out-of-range angles back onto the sphere, attaches a
## per-cluster letter label to every point, shuffles the points, and writes
## them out as CSV records of the form "label,x,y,z" to the file named by
## the constructor's output_file parameter.
sub gen_data_and_write_to_csv {
    my $self = shift;
    my $K = $self->{_number_of_clusters_on_sphere};
    # $N is the number of samples to be generated for each cluster:
    my $N = int($self->{_total_number_of_samples_needed} / $K);
    my $output_file = $self->{_output_file};
    # Designate all of the data elements in a cluster by a letter that is followed by
    # an integer that identifies a specific data element.
    my @point_labels = ('a'..'z');
    # Our first job is to define $K random points in the 2D space (azimuth,
    # elevation) to serve as cluster centers on the surface of the sphere.  This we
    # do by calling a uniformly distributed 1-D random number generator, first for
    # the azimuth and then for the elevation in the loop shown below:
    my @cluster_centers;
    my @covariances;
    foreach my $i (0..$K-1) {
        my $azimuth = rand(360);
        # NOTE(review): rand(90) - 90 yields elevations only in (-90, 0) --- the
        # southern hemisphere --- although the comment block above describes the
        # full (-90, 90) range.  Confirm whether this restriction is intentional.
        my $elevation = rand(90) - 90;
        my @mean = ($azimuth, $elevation);
        push @cluster_centers, \@mean;
        my $cluster_covariance;
        # The j-th dimension is for azimuth and k-th for elevation for the directions
        # to surface of the sphere:
        foreach my $j (0..1) {
            foreach my $k (0..1) {
                # Diagonal 2x2 covariance: the standard deviation along each angular
                # direction is _cluster_width times that direction's full range
                # (360 degrees for azimuth, 180 degrees for elevation).
                $cluster_covariance->[$j]->[$k] = ($self->{_cluster_width} * 360.0) ** 2
                    if $j == 0 && $k == 0;
                $cluster_covariance->[$j]->[$k] = ($self->{_cluster_width} * 180.0) ** 2
                    if $j == 1 && $k == 1;
                $cluster_covariance->[$j]->[$k] = 0.0 if $j != $k;
            }
        }
        push @covariances, $cluster_covariance;
    }
    if ($self->{_debug}) {
        foreach my $i (0..$K-1) {
            print "\n\nCluster center: @{$cluster_centers[$i]}\n";
            print "\nCovariance:\n";
            foreach my $j (0..1) {
                foreach my $k (0..1) {
                    print "$covariances[$i]->[$j]->[$k] ";
                }
                print "\n";
            }
        }
    }
    my @data_dump;
    foreach my $i (0..$K-1) {
        # Draw $N (azimuth, elevation) samples for cluster $i from a 2-D Gaussian
        # centered at the cluster's mean with the diagonal covariance built above:
        my @m = @{shift @cluster_centers};
        my @covar = @{shift @covariances};
        my @new_data = Math::Random::random_multivariate_normal($N, @m, @covar);
        if ($self->{_debug}) {
            print "\nThe points for cluster $i:\n";
            map { print "@$_ "; } @new_data;
            print "\n\n";
        }
        # `Wrap-around' logic: samples outside azimuth (0,360) or elevation
        # (-90,90) are reflected back onto the sphere.
        my @wrapped_data;
        foreach my $d (@new_data) {
            my $wrapped_d;
            if ($d->[0] >= 360.0) {
                $wrapped_d->[0] = $d->[0] - 360.0;
            } elsif ($d->[0] < 0) {
                $wrapped_d->[0] = 360.0 - abs($d->[0]);
            }
            # An elevation past a pole flips to the far side of the sphere: the
            # azimuth is rotated by 180 degrees and the elevation is reflected.
            if ($d->[1] >= 90.0) {
                $wrapped_d->[0] = POSIX::fmod($d->[0] + 180.0, 360);
                $wrapped_d->[1] = 180.0 - $d->[1];
            } elsif ($d->[1] < -90.0) {
                $wrapped_d->[0] = POSIX::fmod($d->[0] + 180, 360);
                $wrapped_d->[1] = -180.0 - $d->[1];
            }
            # Coordinates that needed no wrapping are passed through unchanged:
            $wrapped_d->[0] = $d->[0] unless defined $wrapped_d->[0];
            $wrapped_d->[1] = $d->[1] unless defined $wrapped_d->[1];
            push @wrapped_data, $wrapped_d;
        }
        if ($self->{_debug}) {
            print "\nThe unwrapped points for cluster $i:\n";
            map { print "@$_ "; } @wrapped_data;
            print "\n\n";
        }
        # Prefix every point of this cluster with a label of the form "a_0",
        # "a_1", ..., where the letter identifies the cluster:
        my $label = $point_labels[$i];
        my $j = 0;
        @new_data = map {unshift @$_, $label."_".$j; $j++; $_} @wrapped_data;
        push @data_dump, @new_data;
    }
    if ($self->{_debug}) {
        print "\n\nThe labeled points for clusters:\n";
        map { print "@$_\n"; } @data_dump;
    }
    # Randomize the record order so the output file is not grouped by cluster:
    fisher_yates_shuffle( \@data_dump );
    # NOTE(review): 2-arg open on a bareword handle with no error check ---
    # a silent failure here loses all the generated data.
    open OUTPUT, ">$output_file";
    my $total_num_of_points = $N * $K;
    print "Total number of data points that will be written out to the file: $total_num_of_points\n"
        if $self->{_debug};
    foreach my $ele (@data_dump) {
        my ($x,$y,$z);
        my $label = $ele->[0];
        my $azimuth = $ele->[1];
        my $elevation = $ele->[2];
        # NOTE(review): $azimuth and $elevation are in DEGREES here, but Perl's
        # cos()/sin() expect radians.  The points still land on the unit sphere
        # (cos^2 + sin^2 = 1), but their positions are not the degree angles
        # generated above --- confirm whether a pi/180 conversion is missing.
        $x = cos($elevation) * cos($azimuth);
        $y = cos($elevation) * sin($azimuth);
        $z = sin($elevation);
        my $csv_str = join ",", ($label,$x,$y,$z);
        print OUTPUT "$csv_str\n";
    }
    print "\n\n";
    print "Data written out to file $output_file\n" if $self->{_debug};
    close OUTPUT;
}
1463              
# This version for the embedded class for data generation.
#
#  Reads the CSV datafile written by gen_data_and_write_to_csv(), groups the
#  records by their cluster letter, writes the groups to a temporary file in
#  gnuplot's blank-line-separated "index" format, and displays the clusters
#  on a wireframe unit sphere with gnuplot.
#
#  Fixes relative to the previous version: the datafile is opened via the
#  caller-supplied path rather than its basename (which failed for files in
#  other directories), the slurped input is actually read from the INPUT
#  filehandle, and the trailing ", " is stripped with a substitution instead
#  of assigning the boolean result of a match to $arg_string.
sub visualize_data_on_sphere {
    my $self = shift;
    my $datafile = shift;
    my $filename = File::Basename::basename($datafile);
    my $temp_file = "__temp_" . $filename;
    $temp_file =~ s/\.\w+$/\.txt/;
    unlink $temp_file if -e $temp_file;
    open OUTPUT, ">$temp_file"
        or die "Unable to open a temp file in this directory: $!";
    open INPUT, "< $datafile" or die "Unable to open $datafile: $!";
    # Slurp the whole file and split it into one record per whitespace run:
    local $/ = undef;
    my @all_records = split /\s+/, <INPUT>;
    close INPUT;
    my %clusters;
    foreach my $record (@all_records) {
        my @splits = split /,/, $record;
        my $record_name = shift @splits;
        # The cluster label is the part of the record name before the underscore:
        $record_name =~ /(\w+?)_.*/;
        my $primary_cluster_label = $1;
        push @{$clusters{$primary_cluster_label}}, \@splits;
    }
    # Each cluster becomes one gnuplot data block, separated by blank lines:
    foreach my $key (sort {"\L$a" cmp "\L$b"} keys %clusters) {
        map {print OUTPUT "$_"} map {"@$_\n"} @{$clusters{$key}};
        print OUTPUT "\n\n";
    }
    my @sorted_cluster_keys = sort {"\L$a" cmp "\L$b"} keys %clusters;
    close OUTPUT;
    my $plot = Graphics::GnuplotIF->new( persist => 1 );
    my $arg_string = "";
    $plot->gnuplot_cmd( "set noclip" );
    $plot->gnuplot_cmd( "set hidden3d" ) unless $self->{_show_hidden_in_3D_plots};
    $plot->gnuplot_cmd( "set pointsize 2" );
    $plot->gnuplot_cmd( "set parametric" );
    $plot->gnuplot_cmd( "set size ratio 1" );
    $plot->gnuplot_cmd( "set xlabel \"X\"" );
    $plot->gnuplot_cmd( "set ylabel \"Y\"" );
    $plot->gnuplot_cmd( "set zlabel \"Z\"" );
    # set the range for azimuth angles:
    $plot->gnuplot_cmd( "set urange [0:2*pi]" );
    # set the range for the elevation angles:
    $plot->gnuplot_cmd( "set vrange [-pi/2:pi/2]" );
    # Parametric functions for the unit sphere:
    $plot->gnuplot_cmd( "r=1" );
    $plot->gnuplot_cmd( "fx(v,u) = r*cos(v)*cos(u)" );
    $plot->gnuplot_cmd( "fy(v,u) = r*cos(v)*sin(u)" );
    $plot->gnuplot_cmd( "fz(v) = r*sin(v)" );
    my $sphere_arg_str = "fx(v,u),fy(v,u),fz(v) notitle with lines lt 0,";
    foreach my $i (0..scalar(keys %clusters)-1) {
        my $j = $i + 1;
        # The following statement puts the titles on the data points
        $arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"$sorted_cluster_keys[$i] \" with points lt $j pt $j, ";
    }
    # Strip the trailing ", " left over from the loop above:
    $arg_string =~ s/,[ ]+$//;
    $plot->gnuplot_cmd( "splot $sphere_arg_str $arg_string" );
}
1521              
# Destructor: removes the temporary plot file ("__temp_<basename>.txt")
# that visualize_data_on_sphere() creates for this generator's output file.
sub DESTROY {
    use File::Basename;
    my $self = shift;
    my $tempname = basename($self->{_output_file});
    $tempname =~ s/\.\w+$/\.txt/;
    unlink "__temp_" . $tempname;
}
1528              
# In-place Fisher-Yates shuffle (from the Perl documentation): randomly
# permutes the elements of the array referenced by the argument.
sub fisher_yates_shuffle {
    my $arr = shift;
    my $remaining = scalar @$arr;
    while ($remaining > 1) {
        $remaining--;
        my $pick = int rand($remaining + 1);
        @$arr[$remaining, $pick] = @$arr[$pick, $remaining];
    }
}
1538              
1539             1;
1540              
1541             =pod
1542              
1543             =head1 NAME
1544              
1545             Algorithm::LinearManifoldDataClusterer --- for clustering data that resides on a
1546             low-dimensional manifold in a high-dimensional measurement space
1547              
1548             =head1 SYNOPSIS
1549              
1550             # You'd begin with:
1551              
1552             use Algorithm::LinearManifoldDataClusterer;
1553              
1554             # You'd next name the data file:
1555              
1556             my $datafile = "mydatafile.csv";
1557              
1558             # Your data must be in the CSV format, with one of the columns containing a unique
1559             # symbolic tag for each data record. You tell the module which column has the
1560             # symbolic tag and which columns to use for clustering through a mask such as
1561              
1562             my $mask = "N111";
1563              
1564             # which says that the symbolic tag is in the first column and that the numerical
1565             # data in the next three columns is to be used for clustering. If your data file
1566             # had, say, five columns and you wanted only the last three columns to be
1567             # clustered, the mask would become `N0111' assuming that that the symbolic tag is
1568             # still in the first column.
1569              
1570             # Now you must construct an instance of the clusterer through a call such as:
1571              
1572             my $clusterer = Algorithm::LinearManifoldDataClusterer->new(
1573             datafile => $datafile,
1574             mask => $mask,
1575             K => 3,
1576             P => 2,
1577             max_iterations => 15,
1578             cluster_search_multiplier => 2,
1579             delta_reconstruction_error => 0.001,
1580             terminal_output => 1,
1581             visualize_each_iteration => 1,
1582             show_hidden_in_3D_plots => 1,
1583             make_png_for_each_iteration => 1,
1584             );
1585              
1586             # where the parameter K specifies the number of clusters you expect to find in
1587             # your data and the parameter P is the dimensionality of the manifold on which the
1588             # data resides. The parameter cluster_search_multiplier is for increasing the
1589             # odds that the random seeds chosen initially for clustering will populate all the
1590             # clusters. Set this parameter to a low number like 2 or 3. The parameter
1591             # max_iterations places a hard limit on the number of iterations that the
1592             # algorithm is allowed. The actual number of iterations is controlled by the
1593             # parameter delta_reconstruction_error. The iterations stop when the change in
1594             # the total "reconstruction error" from one iteration to the next is smaller than
1595             # the value specified by delta_reconstruction_error.
1596              
1597             # Next, you must get the module to read the data for clustering:
1598              
1599             $clusterer->get_data_from_csv();
1600              
1601             # Finally, you invoke linear manifold clustering by:
1602              
1603             my $clusters = $clusterer->linear_manifold_clusterer();
1604              
1605             # The value returned by this call is a reference to an array of anonymous arrays,
1606             # with each anonymous array holding one cluster. If you wish, you can have the
1607             # module write the clusters to individual files by the following call:
1608              
1609             $clusterer->write_clusters_to_files($clusters);
1610              
1611             # If you want to see how the reconstruction error changes with the iterations, you
1612             # can make the call:
1613              
1614             $clusterer->display_reconstruction_errors_as_a_function_of_iterations();
1615              
1616             # When your data is 3-dimensional and when the clusters reside on a surface that
1617             # is more or less spherical, you can visualize the clusters by calling
1618              
1619             $clusterer->visualize_clusters_on_sphere("final clustering", $clusters);
1620              
1621             # where the first argument is a label to be displayed in the 3D plot and the
1622             # second argument the value returned by calling linear_manifold_clusterer().
1623              
1624             # SYNTHETIC DATA GENERATION:
1625              
1626             # The module includes an embedded class, DataGenerator, for generating synthetic
1627             # three-dimensional data that can be used to experiment with the clustering code.
1628             # The synthetic data, written out to a CSV file, consists of Gaussian clusters on
1629             # the surface of a sphere. You can control the number of clusters, the width of
1630             # each cluster, and the number of samples in the clusters by giving appropriate
1631             # values to the constructor parameters as shown below:
1632              
1633             use strict;
1634             use Algorithm::LinearManifoldDataClusterer;
1635              
1636             my $output_file = "4_clusters_on_a_sphere_1000_samples.csv";
1637              
1638             my $training_data_gen = DataGenerator->new(
1639             output_file => $output_file,
1640             cluster_width => 0.015,
1641             total_number_of_samples_needed => 1000,
1642             number_of_clusters_on_sphere => 4,
1643             show_hidden_in_3D_plots => 0,
1644             );
1645             $training_data_gen->gen_data_and_write_to_csv();
1646             $training_data_gen->visualize_data_on_sphere($output_file);
1647              
1648              
1649             =head1 CHANGES
1650              
1651             Version 1.01: Typos and other errors removed in the documentation. Also included in
1652             the documentation a link to a tutorial on data processing on manifolds.
1653              
1654              
1655             =head1 DESCRIPTION
1656              
1657             If you are new to machine learning and data clustering on linear and nonlinear
1658             manifolds, your first question is likely to be: What is a manifold? A manifold is a
1659             space that is locally Euclidean. And a space is locally Euclidean if it allows for
1660             the points in a small neighborhood to be represented by, say, the Cartesian
1661             coordinates and if the distances between the points in the neighborhood are given by
1662             the Euclidean metric. For an example, the set of all points on the surface of a
1663             sphere does NOT constitute a Euclidean space. Nonetheless, if you confined your
1664             attention to a small enough neighborhood around a point, the space would seem to be
1665             locally Euclidean. The surface of a sphere is a 2-dimensional manifold embedded in a
1666             3-dimensional space. A plane in a 3-dimensional space is also a 2-dimensional
1667             manifold. You would think of the surface of a sphere as a nonlinear manifold, whereas
1668             a plane would be a linear manifold. However, note that any nonlinear manifold is
1669             locally a linear manifold. That is, given a sufficiently small neighborhood on a
1670             nonlinear manifold, you can always think of it as a locally flat surface.
1671              
1672             As to why we need machine learning and data clustering on manifolds, there exist many
1673             important applications in which the measured data resides on a nonlinear manifold.
1674             For example, when you record images of a human face from different angles, all the
1675             image pixels taken together fall on a low-dimensional surface in a high-dimensional
1676             measurement space. The same is believed to be true for the satellite images of a land
1677             mass that are recorded with the sun at different angles with respect to the direction
1678             of the camera.
1679              
1680             Reducing the dimensionality of the sort of data mentioned above is critical to the
1681             proper functioning of downstream classification algorithms, and the most popular
1682             traditional method for dimensionality reduction is the Principal Components Analysis
1683             (PCA) algorithm. However, using PCA is tantamount to passing a linear least-squares
1684             hyperplane through the surface on which the data actually resides. As to why that
1685             might be a bad thing to do, just imagine the consequences of assuming that your data
1686             falls on a straight line when, in reality, it falls on a strongly curving arc. This
1687             is exactly what happens with PCA --- it gives you a linear manifold approximation to
1688             your data that may actually reside on a curved surface.
1689              
1690             That brings us to the purpose of this module, which is to cluster data that resides
1691             on a nonlinear manifold. Since a nonlinear manifold is locally linear, we can think
1692             of each data cluster on a nonlinear manifold as falling on a locally linear portion
1693             of the manifold, meaning on a hyperplane. The logic of the module is based on
1694             finding a set of hyperplanes that best describes the data, with each hyperplane
1695             derived from a local data cluster. This is like constructing a piecewise linear
1696             approximation to data that falls on a curve as opposed to constructing a single
1697             straight line approximation to all of the data. So whereas the frequently used PCA
1698             algorithm gives you a single hyperplane approximation to all your data, what this
1699             module returns is a set of hyperplane approximations, with each hyperplane derived by
1700             applying the PCA algorithm locally to a data cluster.
1701              
1702             That brings us to the problem of how to actually discover the best set of hyperplane
1703             approximations to the data. What is probably the most popular algorithm today for
1704             that purpose is based on the following key idea: Given a set of subspaces to which a
1705             data element can be assigned, you assign it to that subspace for which the
B<reconstruction error> is the least. But what do we mean by a B<subspace> and what
is B<reconstruction error>?
1708              
To understand the notions of B<subspace> and B<reconstruction error>, let's revisit
1710             the traditional approach of dimensionality reduction by the PCA algorithm. The PCA
1711             algorithm consists of: (1) Subtracting from each data element the global mean of the
1712             data; (2) Calculating the covariance matrix of the data; (3) Carrying out an
1713             eigendecomposition of the covariance matrix and ordering the eigenvectors according
to decreasing values of the corresponding eigenvalues; (4) Forming a B<subspace> by
1715             discarding the trailing eigenvectors whose corresponding eigenvalues are relatively
1716             small; and, finally, (5) projecting all the data elements into the subspace so
1717             formed. The error incurred in representing a data element by its projection into the
subspace is known as the B<reconstruction error>. This error is the projection of
1719             the data element into the space spanned by the discarded trailing eigenvectors.
1720              
I<The key idea here is that, instead of constructing a single global
1722             subspace in the manner described above, we construct a set of subspaces, one for each
1723             data cluster on the nonlinear manifold. After the subspaces have been constructed, a
1724             data element is assigned to that subspace for which the reconstruction error is the
1725             least.> On the face of it, this sounds like a chicken-and-egg sort of a problem. You
1726             need to have already clustered the data in order to construct the subspaces at
1727             different places on the manifold so that you can figure out which cluster to place a
1728             data element in.
1729              
1730             Such problems, when they do possess a solution, are best tackled through iterative
1731             algorithms in which you start with a guess for the final solution, you rearrange the
1732             measured data on the basis of the guess, and you then use the new arrangement of the
1733             data to refine the guess. Subsequently, you iterate through the second and the third
1734             steps until you do not see any discernible changes in the new arrangements of the
1735             data. This forms the basis of the clustering algorithm that is described under
1736             B<Summary of the Algorithm> in the section that follows. This algorithm was first proposed in the
1737             article "Dimension Reduction by Local Principal Component Analysis" by Kambhatla and
1738             Leen that appeared in the journal Neural Computation in 1997.
1739              
1740             Unfortunately, experiments show that the algorithm as proposed by Kambhatla and Leen
1741             is much too sensitive to how the clusters are seeded initially. To get around this
1742             limitation of the basic clustering-by-minimization-of-reconstruction-error, this
1743             module implements a two phased approach. In B<Phase 1>, we introduce a multiplier
1744             effect in our search for clusters by looking for C<M*K> clusters instead of the main
1745             C<K> clusters. In this manner, we increase the odds that each original cluster will
1746             be visited by one or more of the C<M*K> randomly selected seeds at the beginning,
1747             where C<M> is the integer value given to the constructor parameter
1748             C<cluster_search_multiplier>. Subsequently, we merge the clusters that belong
1749             together in order to form the final C<K> clusters. That work is done in B<Phase 2>
1750             of the algorithm.
1751              
1752             For the cluster merging operation in Phase 2, we model the C<M*K> clusters as the
1753             nodes of an attributed graph in which the weight given to an edge connecting a pair
1754             of nodes is a measure of the similarity between the two clusters corresponding to the
1755             two nodes. Subsequently, we use spectral clustering to merge the most similar nodes
1756             in our quest to partition the data into C<K> clusters. For that purpose, we use the
1757             Shi-Malik normalized cuts algorithm. The pairwise node similarity required by this
1758             algorithm is measured by the C method of the
1759             C class. The smaller the overall reconstruction error
1760             when all of the data elements in one cluster are projected into the other's subspace
1761             and vice versa, the greater the similarity between two clusters. Additionally, the
1762             smaller the distance between the mean vectors of the clusters, the greater the
1763             similarity between two clusters. The overall similarity between a pair of clusters
1764             is a combination of these two similarity measures.
1765              
1766             For additional information regarding the theoretical underpinnings of the algorithm
1767             implemented in this module, visit
1768             L
1769              
1770              
1771             =head1 SUMMARY OF THE ALGORITHM
1772              
1773             We now present a summary of the two phases of the algorithm implemented in this
1774             module. Note particularly the important role played by the constructor parameter
1775             C<cluster_search_multiplier>. It is only when the integer value given to this
1776             parameter is greater than 1 that Phase 2 of the algorithm kicks in.
1777              
1778             =over 4
1779              
1780             =item B<Phase 1:>
1781              
1782             Through iterative minimization of the total reconstruction error, this phase of the
1783             algorithm returns C<M*K> clusters where C<K> is the actual number of clusters you
1784             expect to find in your data and where C<M> is the integer value given to the
1785             constructor parameter C<cluster_search_multiplier>. As previously mentioned, on
1786             account of the sensitivity of the reconstruction-error based clustering to how the
1787             clusters are initially seeded, our goal is to look for C<M*K> clusters with the idea
1788             of increasing the odds that each of the C<K> clusters will see at least one seed at
1789             the beginning of the algorithm.
1790              
1791             =over 4
1792              
1793             =item Step 1:
1794              
1795             Randomly choose C<M*K> data elements to serve as the seeds for that many clusters.
1796              
1797             =item Step 2:
1798              
1799             Construct initial C<M*K> clusters by assigning each data element to that cluster
1800             whose seed it is closest to.
1801              
1802             =item Step 3:
1803              
1804             Calculate the mean and the covariance matrix for each of the C<M*K> clusters and
1805             carry out an eigendecomposition of the covariance matrix. Order the eigenvectors in
1806             decreasing order of the corresponding eigenvalues. The first C<P> eigenvectors
1807             define the subspace for that cluster. Use the space spanned by the remaining
1808             eigenvectors --- we refer to them as the trailing eigenvectors --- for calculating
1809             the reconstruction error.
1810              
1811             =item Step 4:
1812              
1813             Taking into account the mean associated with each cluster, re-cluster the entire data
1814             set on the basis of the least reconstruction error. That is, assign each data
1815             element to that subspace for which it has the smallest reconstruction error.
1816             Calculate the total reconstruction error associated with all the data elements. (See
1817             the definition of the reconstruction error in the C section.)
1818              
1819             =item Step 5:
1820              
1821             Stop iterating if the change in the total reconstruction error from the previous
1822             iteration to the current iteration is less than the value specified by the constructor
1823             parameter C<delta_reconstruction_error>. Otherwise, go back to Step 3.
1824              
1825             =back
1826              
1827             =item B<Phase 2:>
1828              
1829             This phase of the algorithm uses graph partitioning to merge the C<M*K> clusters back
1830             into the C<K> clusters you expect to see in your data. Since the algorithm whose
1831             steps are presented below is invoked recursively, let's assume that we have C<N>
1832             clusters that need to be merged by invoking the Shi-Malik spectral clustering
1833             algorithm as described below:
1834              
1835             =over 4
1836              
1837             =item Step 1:
1838              
1839             Form a graph whose C<N> nodes represent the C<N> clusters. (For the very first
1840             invocation of this step, we have C<N = M*K>.)
1841              
1842             =item Step 2:
1843              
1844             Construct an C<NxN> similarity matrix for the nodes in the graph. The C<(i,j)>-th
1845             element of this matrix is the pairwise similarity between the clusters indexed C<i>
1846             and C<j>. Calculate this similarity on the basis of the following two criteria: (1)
1847             The total reconstruction error when the data elements in the cluster indexed C<i> are
1848             projected into the subspace for the cluster indexed C<j> and vice versa. And (2) The
1849             distance between the mean vectors associated with the two clusters.
1850              
1851             =item Step 3:
1852              
1853             Calculate the symmetric normalized Laplacian of the similarity matrix. We use C
1854             to denote the symmetric normalized Laplacian.
1855              
1856             =item Step 4:
1857              
1858             Carry out an eigendecomposition of the C matrix and choose the eigenvector
1859             corresponding to the second smallest eigenvalue for bipartitioning the graph on the
1860             basis of the sign of the values in the eigenvector.
1861              
1862             =item Step 5:
1863              
1864             If the bipartition of the previous step yields one-versus-the-rest kind of a
1865             partition, add the `one' cluster to the output list of clusters and invoke graph
1866             partitioning recursively on the `rest' by going back to Step 1. On the other hand,
1867             if the cardinality of both the partitions returned by Step 4 exceeds 1, invoke graph
1868             partitioning recursively on both partitions. Stop when the list of clusters in the
1869             output list equals C<K>.
1870              
1871             =item Step 6:
1872              
1873             In general, the C<K> clusters obtained by recursive graph partitioning will not cover
1874             all of the data. So, for the final step, assign each data element not covered by the
1875             C<K> clusters to that cluster for which its reconstruction error is the least.
1876              
1877             =back
1878              
1879             =back
1880              
1881             =head1 FAIL-FIRST BIAS OF THE MODULE
1882              
1883             As you would expect for all such iterative algorithms, the module carries no
1884             theoretical guarantee that it will give you correct results. But what does that mean?
1885             Suppose you create synthetic data that consists of Gaussian looking disjoint clusters
1886             on the surface of a sphere, would the module always succeed in separating out the
1887             clusters? The module carries no guarantees to that effect --- especially considering
1888             that Phase 1 of the algorithm is sensitive to how the clusters are seeded at the
1889             beginning. Although this sensitivity is mitigated by the cluster merging step when
1890             greater-than-1 value is given to the constructor option C<cluster_search_multiplier>,
1891             a plain vanilla implementation of the steps in Phase 1 and Phase 2 would nonetheless
1892             carry significant risk that you'll end up with incorrect clustering results.
1893              
1894             To further reduce this risk, the module has been programmed so that it terminates
1895             immediately if it suspects that the cluster solution being developed is unlikely to
1896             be fruitful. The heuristics used for such terminations are conservative --- since
1897             the cost of termination is only that the user will have to run the code again, which
1898             at worst only carries an element of annoyance with it. The three "Fail First"
1899             heuristics currently programmed into the module are based on simple "unimodality
1900             testing", testing for "congruent clusters," and testing for dominant cluster support
1901             in the final stage of the recursive invocation of the graph partitioning step. The
1902             unimodality testing is as elementary as it can be --- it only checks for the number
1903             of data samples within a certain radius of the mean in relation to the total number
1904             of data samples in the cluster.
1905              
1906             When the program terminates under such conditions, it prints out the following message
1907             in your terminal window:
1908              
1909             Bailing out!
1910              
1911             Given the very simple nature of testing that is carried for the "Fail First" bias, do
1912             not be surprised if the results you get for your data simply look wrong. If most
1913             runs of the module produce wrong results for your application, that means that the
1914             module logic needs to be strengthened further. The author of this module would love
1915             to hear from you if that is the case.
1916              
1917             =head1 METHODS
1918              
1919             The module provides the following methods for linear-manifold based clustering, for
1920             cluster visualization, and for the generation of data for testing the clustering algorithm:
1921              
1922             =over 4
1923              
1924             =item B<new():>
1925              
1926             my $clusterer = Algorithm::LinearManifoldDataClusterer->new(
1927             datafile => $datafile,
1928             mask => $mask,
1929             K => $K,
1930             P => $P,
1931             cluster_search_multiplier => $C,
1932             max_iterations => $max_iter,
1933             delta_reconstruction_error => 0.001,
1934             terminal_output => 1,
1935             write_clusters_to_files => 1,
1936             visualize_each_iteration => 1,
1937             show_hidden_in_3D_plots => 1,
1938             make_png_for_each_iteration => 1,
1939             );
1940              
1941             A call to C constructs a new instance of the
1942             C class.
1943              
1944             =back
1945              
1946             =head2 Constructor Parameters
1947              
1948             =over 8
1949              
1950             =item C<datafile>:
1951              
1952             This parameter names the data file that contains the multidimensional data records
1953             you want the module to cluster. This file must be in CSV format and each record in
1954             the file must include a symbolic tag for the record. Here are first few rows of such
1955             a CSV file in the C<examples> directory:
1956              
1957             d_161,0.0739248630173239,0.231119293395665,-0.970112873251437
1958             a_59,0.459932215884786,0.0110216469739639,0.887885623314902
1959             a_225,0.440503220903039,-0.00543366086464691,0.897734586447273
1960             a_203,0.441656364946433,0.0437191337788422,0.896118459046532
1961             ...
1962             ...
1963              
1964             What you see in the first column --- C<d_161>, C<a_59>, C<a_225>, C<a_203> --- are
1965             the symbolic tags associated with four 3-dimensional data records.
1966              
1967             =item C<mask>:
1968              
1969             This parameter supplies the mask to be applied to the columns of your data file. For
1970             the data file whose first few records are shown above, the mask should be C<N111>
1971             since the symbolic tag is in the first column of the CSV file and since, presumably,
1972             you want to cluster the data in the next three columns.
1973              
1974             =item C<K>:
1975              
1976             This parameter supplies the number of clusters you are looking for.
1977              
1978             =item C<P>:
1979              
1980             This parameter specifies the dimensionality of the manifold on which the data resides.
1981              
1982             =item C<cluster_search_multiplier>:
1983              
1984             As should be clear from the C section, this parameter plays
1985             a very important role in the successful clustering of your data. As explained in
1986             C, the basic algorithm used for clustering in Phase 1 --- clustering by
1987             the minimization of the reconstruction error --- is sensitive to the choice of the
1988             cluster seeds that are selected randomly at the beginning of the algorithm. Should
1989             it happen that the seeds miss one or more of the clusters, the clustering produced is
1990             likely to not be correct. By giving an integer value to C<cluster_search_multiplier>
1991             that is greater than 1, you'll increase the odds that the randomly selected seeds
1992             will see all clusters. When you set C<cluster_search_multiplier> to C<M>, you ask
1993             Phase 1 of the algorithm to construct C<M*K> clusters as opposed to just C<K>
1994             clusters. Subsequently, in Phase 2, the module uses inter-cluster similarity based
1995             graph partitioning to merge the C<M*K> clusters into C<K> clusters.
1996              
1997             =item C<max_iterations>:
1998              
1999             This hard limits the number of iterations in Phase 1 of the algorithm. Ordinarily,
2000             the iterations stop automatically when the change in the total reconstruction error
2001             from one iteration to the next is less than the value specified by the parameter
2002             C.
2003              
2004             =item C<delta_reconstruction_error>:
2005              
2006             It is this parameter that determines when the iterations will actually terminate in
2007             Phase 1 of the algorithm. When the difference in the total reconstruction error from
2008             one iteration to the next is less than the value given to this parameter, the
2009             iterations come to an end. B<The larger the number of
2010             data samples that need to be clustered, the larger must be the value given to this
2011             parameter. That makes intuitive sense since the total reconstruction error is the
2012             sum of all such errors for all of the data elements.> Unfortunately, the best value
2013             for this parameter does NOT appear to depend linearly on the total number of data
2014             records to be clustered.
2015              
2016             =item C<terminal_output>:
2017              
2018             When this parameter is set, you will see in your terminal window the different
2019             clusters as lists of the symbolic tags associated with the data records. You will
2020             also see in your terminal window the output produced by the different steps of the
2021             graph partitioning algorithm as smaller clusters are merged to form the final C<K>
2022             clusters --- assuming that you set the parameter C<cluster_search_multiplier> to an
2023             integer value that is greater than 1.
2024              
2025             =item C<visualize_each_iteration>:
2026              
2027             As its name implies, when this option is set to 1, you'll see 3D plots of the
2028             clustering results for each iteration --- but only if your data is 3-dimensional.
2029              
2030             =item C<show_hidden_in_3D_plots>:
2031              
2032             This parameter is important for controlling the visualization of the clusters on the
2033             surface of a sphere. If the clusters are too spread out, seeing all of the clusters
2034             all at once can be visually confusing. When you set this parameter, the clusters on
2035             the back side of the sphere will not be visible. Note that no matter how you set
2036             this parameter, you can interact with the 3D plot of the data and rotate it with your
2037             mouse pointer to see all of the data that is output by the clustering code.
2038              
2039             =item C<make_png_for_each_iteration>:
2040              
2041             If you set this option to 1, the module will output a Gnuplot in the form of a PNG
2042             image for each iteration in Phase 1 of the algorithm. In Phase 2, the module will
2043             output the clustering result produced by the graph partitioning algorithm.
2044              
2045             =back
2046              
2047             =over
2048              
2049             =item B<get_data_from_csv()>:
2050              
2051             $clusterer->get_data_from_csv();
2052              
2053             As you can guess from its name, the method extracts the data from the CSV file named
2054             in the constructor.
2055              
2056             =item B<linear_manifold_clusterer()>:
2057              
2058             $clusterer->linear_manifold_clusterer();
2059              
2060             or
2061              
2062             my $clusters = $clusterer->linear_manifold_clusterer();
2063              
2064             This is the main call to the linear-manifold based clusterer. The first call works
2065             by side-effect, meaning that you will see the clusters in your terminal window and
2066             they would be written out to disk files (depending on the constructor options you
2067             have set). The second call also returns the clusters as a reference to an array of
2068             anonymous arrays, each holding the symbolic tags for a cluster.
2069              
2070             =item B<display_reconstruction_errors_as_a_function_of_iterations()>:
2071              
2072             $clusterer->display_reconstruction_errors_as_a_function_of_iterations();
2073              
2074             This method would normally be called after the clustering is completed to see how the
2075             reconstruction errors decreased with the iterations in Phase 1 of the overall
2076             algorithm.
2077              
2078             =item B<write_clusters_to_files()>:
2079              
2080             $clusterer->write_clusters_to_files($clusters);
2081              
2082             As its name implies, when you call this method, the final clusters would be written
2083             out to disk files. The files have names like:
2084              
2085             cluster0.txt
2086             cluster1.txt
2087             cluster2.txt
2088             ...
2089             ...
2090              
2091             Before the clusters are written to these files, the module destroys all files with
2092             such names in the directory in which you call the module.
2093              
2094             =item B<visualize_clusters_on_sphere()>:
2095              
2096             $clusterer->visualize_clusters_on_sphere("final clustering", $clusters);
2097              
2098             or
2099              
2100             $clusterer->visualize_clusters_on_sphere("final_clustering", $clusters, "png");
2101              
2102             If your data is 3-dimensional and it resides on the surface of a sphere (or in the
2103             vicinity of such a surface), you may be able to use these methods for the
2104             visualization of the clusters produced by the algorithm. The first invocation
2105             produces a Gnuplot in a terminal window that you can rotate with your mouse pointer.
2106             The second invocation produces a `.png' image of the plot.
2107              
2108             =item B<auto_retry_clusterer()>:
2109              
2110             $clusterer->auto_retry_clusterer();
2111              
2112             or
2113              
2114             my $clusters = $clusterer->auto_retry_clusterer();
2115              
2116             As mentioned earlier, the module is programmed in such a way that it is more likely
2117             to fail than to give you a wrong answer. If manually trying the clusterer repeatedly
2118             on a data file is frustrating, you can use C<auto_retry_clusterer()> to automatically
2119             make repeated attempts for you. See the script C<example4.pl> for how you can use
2120             C<auto_retry_clusterer()> in your own code.
2121              
2122             =back
2123              
2124             =head1 GENERATING SYNTHETIC DATA FOR EXPERIMENTING WITH THE CLUSTERER
2125              
2126             The module file also contains a class named C for generating synthetic
2127             data for experimenting with linear-manifold based clustering. At this time, only
2128             3-dimensional data that resides in the form of Gaussian clusters on the surface of a
2129             sphere is generated. The generated data is placed in a CSV file. You construct an
2130             instance of the C class by a call like:
2131              
2132             =over 4
2133              
2134             =item B<new():>
2135              
2136             my $training_data_gen = DataGenerator->new(
2137             output_file => $output_file,
2138             cluster_width => 0.0005,
2139             total_number_of_samples_needed => 1000,
2140             number_of_clusters_on_sphere => 4,
2141             show_hidden_in_3D_plots => 0,
2142             );
2143              
2144             =back
2145              
2146             =head2 Parameters for the DataGenerator constructor:
2147              
2148             =over 8
2149              
2150             =item C<output_file>:
2151              
2152             The numeric values are generated using a bivariate Gaussian distribution whose two
2153             independent variables are the azimuth and the elevation angles on the surface of a
2154             unit sphere. The mean of each cluster is chosen randomly and its covariance set in
2155             proportion to the value supplied for the C< cluster_width> parameter.
2156              
2157             =item C<cluster_width>:
2158              
2159             This parameter controls the spread of each cluster on the surface of the unit sphere.
2160              
2161             =item C<total_number_of_samples_needed>:
2162              
2163             As its name implies, this parameter specifies the total number of data samples that
2164             will be written out to the output file --- provided this number is divisible by the
2165             number of clusters you asked for. If the divisibility condition is not satisfied,
2166             the number of data samples actually written out will be the closest it can be to the
2167             number you specify for this parameter under the condition that equal number of
2168             samples will be created for each cluster.
2169              
2170             =item C<number_of_clusters_on_sphere>:
2171              
2172             Again as its name implies, this parameter specifies the number of clusters that will
2173             be produced on the surface of a unit sphere.
2174              
2175             =item C<show_hidden_in_3D_plots>:
2176              
2177             This parameter is important for the visualization of the clusters and it controls
2178             whether you will see the generated data on the back side of the sphere. If the
2179             clusters are not too spread out, you can set this parameter to 0 and see all the
2180             clusters all at once. However, when the clusters are spread out, it can be visually
2181             confusing to see the data on the back side of the sphere. Note that no matter how
2182             you set this parameter, you can interact with the 3D plot of the data and rotate it
2183             with your mouse pointer to see all of the data that is generated.
2184              
2185             =back
2186              
2187             =over 4
2188              
2189             =item B<gen_data_and_write_to_csv()>:
2190              
2191             $training_data_gen->gen_data_and_write_to_csv();
2192              
2193             As its name implies, this method generates the data according to the parameters
2194             specified in the DataGenerator constructor.
2195              
2196             =item B<visualize_data_on_sphere()>:
2197              
2198             $training_data_gen->visualize_data_on_sphere($output_file);
2199              
2200             You can use this method to visualize the clusters produced by the data generator.
2201             Since the clusters are located at randomly selected points on a unit sphere, by
2202             looking at the output visually, you can quickly reject what the data generator has
2203             produced and try again.
2204              
2205             =back
2206              
2207             =head1 HOW THE CLUSTERS ARE OUTPUT
2208              
2209             When the option C<terminal_output> is set in the constructor of the
2210             C class, the clusters are displayed on the terminal
2211             screen.
2212              
2213             And, when the option C<write_clusters_to_files> is set in the same constructor, the
2214             module dumps the clusters in files named
2215              
2216             cluster0.txt
2217             cluster1.txt
2218             cluster2.txt
2219             ...
2220             ...
2221              
2222             in the directory in which you execute the module. The number of such files will
2223             equal the number of clusters formed. All such existing files in the directory are
2224             destroyed before any fresh ones are created. Each cluster file contains the symbolic
2225             tags of the data samples in that cluster.
2226              
2227             Assuming that the data dimensionality is 3, if you have set the constructor parameter
2228             C<make_png_for_each_iteration>, the module will deposit in your directory printable PNG
2229             files that are point plots of the different clusters in the different iterations of
2230             the algorithm. Such printable files are also generated for the initial clusters at
2231             the beginning of the iterations and for the final clusters in Phase 1 of the
2232             algorithm. You will also see in your directory a PNG file for the clustering result
2233             produced by graph partitioning in Phase 2.
2234              
2235             Also, as mentioned previously, a call to C<linear_manifold_clusterer()> in your own
2236             code will return the clusters to you directly.
2237              
2238             =head1 REQUIRED
2239              
2240             This module requires the following modules:
2241              
2242             List::Util
2243             File::Basename
2244             Math::Random
2245             Graphics::GnuplotIF
2246             Math::GSL::Matrix
2247             POSIX
2248              
2249             =head1 THE C<examples> DIRECTORY
2250              
2251             The C<examples> directory contains the following four scripts that you might want to
2252             play with in order to become familiar with the module:
2253              
2254             example1.pl
2255              
2256             example2.pl
2257              
2258             example3.pl
2259              
2260             example4.pl
2261              
2262             These scripts demonstrate linear-manifold based clustering on the 3-dimensional data
2263             in the following three CSV files:
2264              
2265             3_clusters_on_a_sphere_498_samples.csv (used in example1.pl and example4.pl)
2266              
2267             3_clusters_on_a_sphere_3000_samples.csv (used in example2.pl)
2268              
2269             4_clusters_on_a_sphere_1000_samples.csv (used in example3.pl)
2270              
2271             Note that even though the first two of these files both contain exactly three
2272             clusters, the clusters look very different in the two data files. The clusters are
2273             much more spread out in C<3_clusters_on_a_sphere_3000_samples.csv>.
2274              
2275             The code in C<example4.pl> is special because it shows how you can call the
2276             C<auto_retry_clusterer()> method of the module for automatic repeated invocations of
2277             the clustering program until success is achieved. The value of the constructor
2278             parameter C<cluster_search_multiplier> is set to 1 in C<example4.pl>, implying that
2279             when you execute C<example4.pl> you will not be invoking Phase 2 of the algorithm.
2280             You might wish to change the value given to the parameter
2281             C<cluster_search_multiplier> to see how it affects the number of attempts needed to
2282             achieve success.
2283              
2284             The C<examples> directory also includes PNG image files that show some intermediate
2285             and the best final results that can be achieved by the first three examples, that
2286             is, for the scripts C<example1.pl>, C<example2.pl>, and C<example3.pl>. If you are on
2287             a Linux machine and if you have the C<ImageMagick> library installed, you can use the
2288             C<display> command to see the results in the PNG images. The results you get with
2289             your own runs of the three example scripts may or may not look like what you see in
2290             the outputs shown in the PNG files depending on how the module seeds the clusters.
2291             Your best results should look like what you see in the PNG images.
2292              
2293             The C<examples> directory also contains the following utility scripts:
2294              
2295             For generating the data for experiments with clustering:
2296              
2297             generate_data_on_a_sphere.pl
2298              
2299             For visualizing the data generated by the above script:
2300              
2301             data_visualizer.pl
2302              
2303             For cleaning up the examples directory:
2304              
2305             cleanup_directory.pl
2306              
2307             Invoking the C<cleanup_directory.pl> script will get rid of all the PNG image files
2308             that are generated by the module when you run it with the constructor option
2309             C<make_png_for_each_iteration> set to 1.
2310              
2311             =head1 EXPORT
2312              
2313             None by design.
2314              
2315             =head1 CAVEATS
2316              
2317             The performance of the algorithm depends much on the values you choose for the
2318             constructor parameters. And, even for the best choices for the parameters, the
2319             algorithm is not theoretically guaranteed to return the best results.
2320              
2321             Even after you have discovered the best choices for the constructor parameters, the
2322             best way to use this module is to try it multiple times on any given data and accept
2323             only those results that make the best intuitive sense.
2324              
2325             The module was designed with the hope that it would rather fail than give you wrong
2326             results. So if you see it failing a few times before it returns a good answer, that's
2327             a good thing.
2328              
2329             However, if the module fails too often or is too quick to give you wrong answers,
2330             that means the module is not working on your data. If that happens, I'd love to hear
2331             from you. That might indicate to me how I should enhance the power of this module
2332             for its future versions.
2333              
2334             =head1 BUGS
2335              
2336             Please notify the author if you encounter any bugs. When sending email, please place
2337             the string 'LinearManifoldDataClusterer' in the subject line.
2338              
2339             =head1 INSTALLATION
2340              
2341             Download the archive from CPAN in any directory of your choice. Unpack the archive
2342             with a command that on a Linux machine would look like:
2343              
2344             tar zxvf Algorithm-LinearManifoldDataClusterer-1.01.tar.gz
2345              
2346             This will create an installation directory for you whose name will be
2347             C<Algorithm-LinearManifoldDataClusterer-1.01>. Enter this directory and execute the
2348             following commands for a standard install of the module if you have root privileges:
2349              
2350             perl Makefile.PL
2351             make
2352             make test
2353             sudo make install
2354              
2355             If you do not have root privileges, you can carry out a non-standard install of the
2356             module in any directory of your choice by:
2357              
2358             perl Makefile.PL prefix=/some/other/directory/
2359             make
2360             make test
2361             make install
2362              
2363             With a non-standard install, you may also have to set your PERL5LIB environment
2364             variable so that this module can find the required other modules. How you do that
2365             would depend on what platform you are working on. In order to install this module in
2366             a Linux machine on which I use tcsh for the shell, I set the PERL5LIB environment
2367             variable by
2368              
2369             setenv PERL5LIB /some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
2370              
2371             If I used bash, I'd need to declare:
2372              
2373             export PERL5LIB=/some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
2374              
2375              
2376             =head1 THANKS
2377              
2378             I have learned much from my conversations with Donghun Kim whose research on face
2379             recognition in the wild involves clustering image data on manifolds. I have also had
2380             several fruitful conversations with Bharath Kumar Comandur and Tanmay Prakash with
2381             regard to the ideas that are incorporated in this module.
2382              
2383             =head1 AUTHOR
2384              
2385             Avinash Kak, kak@purdue.edu
2386              
2387             If you send email, please place the string "LinearManifoldDataClusterer" in your subject line to get past
2388             my spam filter.
2389              
2390             =head1 COPYRIGHT
2391              
2392             This library is free software; you can redistribute it and/or modify it under the
2393             same terms as Perl itself.
2394              
2395             Copyright 2015 Avinash Kak
2396              
2397             =cut
2398