blib/lib/Algorithm/LinearManifoldDataClusterer.pm | |||
---|---|---|---|
Criterion | Covered | Total | % |
statement | 25 | 27 | 92.5 |
branch | n/a | ||
condition | n/a | ||
subroutine | 9 | 9 | 100.0 |
pod | n/a | ||
total | 34 | 36 | 94.4 |
line | stmt | bran | cond | sub | pod | time | code |
---|---|---|---|---|---|---|---|
1 | package Algorithm::LinearManifoldDataClusterer; | ||||||
2 | |||||||
3 | #------------------------------------------------------------------------------------ | ||||||
4 | # Copyright (c) 2015 Avinash Kak. All rights reserved. This program is free | ||||||
5 | # software. You may modify and/or distribute it under the same terms as Perl itself. | ||||||
6 | # This copyright notice must remain attached to the file. | ||||||
7 | # | ||||||
8 | # Algorithm::LinearManifoldDataClusterer is a Perl module for clustering data that | ||||||
9 | # resides on a low-dimensional manifold in a high-dimensional measurement space. | ||||||
10 | # ----------------------------------------------------------------------------------- | ||||||
11 | |||||||
12 | 1 | 1 | 13460 | use 5.10.0; | |||
1 | 3 | ||||||
1 | 48 | ||||||
13 | 1 | 1 | 5 | use strict; | |||
1 | 1 | ||||||
1 | 33 | ||||||
14 | 1 | 1 | 5 | use warnings; | |||
1 | 5 | ||||||
1 | 30 | ||||||
15 | 1 | 1 | 4 | use Carp; | |||
1 | 3 | ||||||
1 | 81 | ||||||
16 | 1 | 1 | 7 | use List::Util qw(reduce any); | |||
1 | 2 | ||||||
1 | 123 | ||||||
17 | 1 | 1 | 6 | use File::Basename; | |||
1 | 1 | ||||||
1 | 79 | ||||||
18 | 1 | 1 | 1009 | use Math::Random; | |||
1 | 5355 | ||||||
1 | 98 | ||||||
19 | 1 | 1 | 579 | use Graphics::GnuplotIF; | |||
1 | 10104 | ||||||
1 | 66 | ||||||
20 | 1 | 1 | 1020 | use Math::GSL::Matrix; | |||
0 | |||||||
0 | |||||||
21 | use POSIX (); | ||||||
22 | |||||||
23 | our $VERSION = '1.0'; | ||||||
24 | |||||||
# Constructor: validates the keyword arguments and returns a blessed
# clusterer object.
#
# Required arguments: datafile, mask.  K is the number of clusters
# sought, P the dimensionality of the manifolds.  The three display
# flags default to 0 when omitted (matching the original behavior) and
# to 1 for any non-zero value supplied.
sub new {
    my ($class, %args) = @_;
    my @params = keys %args;
    croak "\nYou have used a wrong name for a keyword argument " .
          "--- perhaps a misspelling\n"
          if check_for_illegal_params(@params) == 0;
    bless {
        _datafile                     =>   $args{datafile} || croak("datafile required"),
        _mask                         =>   $args{mask} || croak("mask required"),
        _K                            =>   $args{K} || 0,
        _P                            =>   $args{P} || 0,
        _terminal_output              =>   $args{terminal_output} || 0,
        _max_iterations               =>   $args{max_iterations} || 0,
        _delta_reconstruction_error   =>   $args{delta_reconstruction_error} || 0.001,
        _delta_normalized_error       =>   undef,
        _cluster_search_multiplier    =>   $args{cluster_search_multiplier} || 1,
        # The `// 0' guards avoid "uninitialized value" warnings that the
        # original bare `== 0' tests produced for omitted arguments; an
        # omitted flag still evaluates to 0, exactly as before.
        _visualize_each_iteration     =>   ($args{visualize_each_iteration} // 0) == 0 ? 0 : 1,
        _show_hidden_in_3D_plots      =>   ($args{show_hidden_in_3D_plots} // 0) == 0 ? 0 : 1,
        _make_png_for_each_iteration  =>   ($args{make_png_for_each_iteration} // 0) == 0 ? 0 : 1,
        _debug                        =>   $args{debug} || 0,
        _N                            =>   0,
        # BUG FIX: the original computed K * cluster_search_multiplier on
        # the raw (possibly undef) arguments; an omitted multiplier warned
        # and silently forced _KM to 0 even though its documented default
        # (two entries above) is 1.  Apply the same defaults here.
        _KM                           =>   ($args{K} || 0) * ($args{cluster_search_multiplier} || 1),
        _data_hash                    =>   {},
        _data_tags                    =>   [],
        _data_dimensions              =>   0,
        _final_clusters               =>   [],
        _auto_retry_flag              =>   0,
        _num_iterations_actually_used =>   undef,
        _scale_factor                 =>   undef,
        _data_tags_to_cluster_label_hash                        =>   {},
        _final_reference_vecs_for_all_subspaces                 =>   [],
        _reconstruction_error_as_a_function_of_iteration        =>   [],
        _final_trailing_eigenvec_matrices_for_all_subspaces     =>   [],
        _subspace_construction_error_as_a_function_of_iteration =>   [],
    }, $class;
}
62 | |||||||
# Reads the CSV datafile named at construction time.  Each
# whitespace-separated record is "label,coord1,coord2,...".  Populates
# _data_hash (label => coordinate arrayref), _data_tags (labels in file
# order), _N (record count), and _data_dimensions (count of '1' chars
# in the mask).  Dies if the file is missing, unreadable, or not *.csv.
#
# NOTE(review): the mask only sets _data_dimensions here; the coordinate
# columns themselves are stored unfiltered -- confirm against callers.
sub get_data_from_csv {
    my $self = shift;
    my $filename = $self->{_datafile} || die "you did not specify a file with the data to be clustered";
    # Validate the extension BEFORE touching the filesystem (the original
    # opened the file first and only then checked the name).  The die
    # message also named the wrong method (`get_training_data_csv').
    die("Aborted. get_data_from_csv() is only for CSV files") unless $filename =~ /\.csv$/;
    my $mask = $self->{_mask};
    my @mask = split //, $mask;
    $self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
    print "data dimensionality: $self->{_data_dimensions} \n" if $self->{_terminal_output};
    # Three-arg open with a lexical handle (the original used a bareword
    # handle and 2-arg open, and never closed the file).
    open my $fh, '<', $filename or die "Unable to open $filename: $!";
    # Slurp the whole file and split into one token per record.
    my @all_data = do { local $/; split /\s+/, <$fh> };
    close $fh;
    my %data_hash = ();
    my @data_tags = ();
    foreach my $record (@all_data) {
        my @splits = split /,/, $record;
        my $record_name = shift @splits;
        $data_hash{$record_name} = \@splits;
        push @data_tags, $record_name;
    }
    $self->{_data_hash} = \%data_hash;
    $self->{_data_tags} = \@data_tags;
    $self->{_N} = scalar @data_tags;
}
86 | |||||||
# Computes the sample mean vector and covariance matrix for the cluster
# whose record labels are listed in $tag_set.
#
# Arguments:
#   $tag_set - arrayref of record labels; the coordinates for each label
#              live in $self->{_data_hash}.
# Returns:
#   ($mean_vec, $covariance) - a d x 1 Math::GSL::Matrix mean vector and
#   a d x d covariance matrix, where d = $self->{_data_dimensions}.
sub estimate_mean_and_covariance {
    my $self = shift;
    my $tag_set = shift;
    my $cluster_size = @$tag_set;
    # Component-wise sum of all the cluster's points (via the file's
    # add_point_coords helper), then divide by the cluster size to get
    # the centroid.
    my @cluster_center = @{$self->add_point_coords($tag_set)};
    @cluster_center = map {my $x = $_/$cluster_size; $x} @cluster_center;
    # for covariance calculation:
    my ($num_rows,$num_cols) = ($self->{_data_dimensions}, scalar(@$tag_set));
    print "\nThe data will be stuffed into a matrix of $num_rows rows and $num_cols columns\n\n"
        if $self->{_debug};
    my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
    my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
    # All the record labels are stored in the array $self->{_data_tags}. The actual
    # data for clustering is stored in a hash at $self->{_data_hash} whose keys are
    # the record labels; the value associated with each key is the array holding the
    # corresponding numerical multidimensional data.
    $mean_vec->set_col(0, \@cluster_center);
    if ($self->{_debug}) {
        print "\nDisplaying the mean vector for the cluster:\n";
        display_matrix( $mean_vec ) if $self->{_terminal_output};
    }
    # Each column of $matrix is one data point with the centroid
    # subtracted out.
    foreach my $j (0..$num_cols-1) {
        my $tag = $tag_set->[$j];
        my $data = $self->{_data_hash}->{$tag};
        my @diff_from_mean = vector_subtract($data, \@cluster_center);
        $matrix->set_col($j, \@diff_from_mean);
    }
    # Covariance = (1/N) * M * M-transpose, via Math::GSL::Matrix's
    # overloaded `*' and `*=' operators.
    my $transposed = transpose( $matrix );
    my $covariance = $matrix * $transposed;
    $covariance *= 1.0 / $num_cols;
    if ($self->{_debug}) {
        print "\nDisplaying the Covariance Matrix for cluster:";
        display_matrix( $covariance ) if $self->{_terminal_output};
    }
    return ($mean_vec, $covariance);
}
123 | |||||||
# Performs an eigen-decomposition of the given covariance matrix and
# returns the eigenvectors sorted by decreasing eigenvalue.
#
# Arguments:
#   $covariance - a Math::GSL::Matrix covariance matrix.
# Returns:
#   (\@sorted_eigenvecs, \@sorted_eigenvals) - eigenvectors (each an
#   arrayref of real components) and their eigenvalues, largest first.
# Dies if any eigenvector component comes back genuinely complex, which
# should not happen for a (symmetric, PSD) covariance matrix.
sub eigen_analysis_of_covariance {
    my $self = shift;
    my $covariance = shift;
    my ($eigenvalues, $eigenvectors) = $covariance->eigenpair;
    my $num_of_eigens = @$eigenvalues;
    my $largest_eigen_index = 0;
    my $smallest_eigen_index = 0;
    print "Eigenvalue 0: $eigenvalues->[0]\n" if $self->{_debug};
    # Track the extreme eigenvalue indexes while optionally dumping all
    # eigenvalues in debug mode.
    foreach my $i (1..$num_of_eigens-1) {
        $largest_eigen_index = $i if $eigenvalues->[$i] > $eigenvalues->[$largest_eigen_index];
        $smallest_eigen_index = $i if $eigenvalues->[$i] < $eigenvalues->[$smallest_eigen_index];
        print "Eigenvalue $i: $eigenvalues->[$i]\n" if $self->{_debug};
    }
    print "\nlargest eigen index: $largest_eigen_index\n" if $self->{_debug};
    print "\nsmallest eigen index: $smallest_eigen_index\n\n" if $self->{_debug};
    my @all_my_eigenvecs;
    foreach my $i (0..$num_of_eigens-1) {
        my @vec = $eigenvectors->[$i]->as_list;
        my @eigenvec;
        # NOTE(review): this parse assumes each component stringifies in
        # polar form "[magnitude,phase]" with phase "0" or "pi" for real
        # values -- a Math::GSL complex-number stringification detail;
        # confirm against the installed Math::GSL version.
        foreach my $ele (@vec) {
            my ($mag, $theta) = $ele =~ /\[(\d*\.?\d*e?[+-]?\d*),(\S+)\]/;
            if ($theta eq "0") {
                push @eigenvec, $mag;
            } elsif ($theta eq "pi") {
                push @eigenvec, -1.0 * $mag;
            } else {
                die "Eigendecomposition produced a complex eigenvector -- " .
                    "which should not happen for a covariance matrix!";
            }
        }
        print "Eigenvector $i: @eigenvec\n" if $self->{_debug};
        push @all_my_eigenvecs, \@eigenvec;
    }
    my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list;
    print "\nLargest eigenvector: @largest_eigen_vec\n" if $self->{_debug};
    # Sort eigenvector indexes by descending eigenvalue, then emit the
    # vectors and values in that order.
    my @sorted_eigenvec_indexes = sort {$eigenvalues->[$b] <=> $eigenvalues->[$a]} 0..@all_my_eigenvecs-1;
    my @sorted_eigenvecs;
    my @sorted_eigenvals;
    foreach my $i (0..@sorted_eigenvec_indexes-1) {
        $sorted_eigenvecs[$i] = $all_my_eigenvecs[$sorted_eigenvec_indexes[$i]];
        $sorted_eigenvals[$i] = $eigenvalues->[$sorted_eigenvec_indexes[$i]];
    }
    if ($self->{_debug}) {
        print "\nHere come sorted eigenvectors --- from the largest to the smallest:\n";
        foreach my $i (0..@sorted_eigenvecs-1) {
            print "eigenvec: @{$sorted_eigenvecs[$i]}   eigenvalue: $sorted_eigenvals[$i]\n";
        }
    }
    return (\@sorted_eigenvecs, \@sorted_eigenvals);
}
174 | |||||||
# Repeatedly invokes linear_manifold_clusterer() until it completes
# without dying (the main clusterer deliberately dies on, e.g., empty
# clusters when _auto_retry_flag is set).  Returns the clusters from the
# first successful run.  Note: retries indefinitely on persistent
# failure, just like the original.
sub auto_retry_clusterer {
    my $self = shift;
    $self->{_auto_retry_flag} = 1;
    my $retry_attempts = 1;
    # The original primed the global $@ with a dummy value to enter a
    # `while ($@)' loop, clobbering any error state the caller had in
    # $@.  An explicit loop expresses the same retry logic cleanly.
    while (1) {
        my $clusters = eval {
            $self->linear_manifold_clusterer();
        };
        if ($@) {
            if ($self->{_terminal_output}) {
                print "Clustering failed. Trying again. --- $@";
                print "\n\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n";
                print "VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV\n\n";
            }
            $retry_attempts++;
        } else {
            print "\n\nNumber of retry attempts: $retry_attempts\n\n" if $self->{_terminal_output};
            return $clusters;
        }
    }
}
198 | |||||||
# The main Phase-1 clustering engine.  Seeds KM = K * search_multiplier
# clusters at random, then iterates: (1) fit a P-dimensional linear
# subspace to each cluster via eigen-analysis of its covariance, (2)
# re-assign every data point to the subspace that reconstructs it with
# least error, (3) repair any clusters that went empty by fragmenting
# the worst cluster, (4) apply a unimodality correction when a cluster
# looks bimodal.  Iteration stops at _max_iterations or when the drop in
# total reconstruction error falls below _delta_normalized_error.  When
# search_multiplier > 1, the resulting sub-clusters are merged down to K
# final clusters by recursive graph partitioning.  Returns an arrayref
# of K cluster arrayrefs (of data tags).
sub linear_manifold_clusterer {
    my $self = shift;
    my $KM = $self->{_KM};
    my @initial_cluster_center_tags;
    my $visualization_msg;
    my @initial_cluster_center_indexes = $self->initialize_cluster_centers($KM, $self->{_N});
    print "Randomly selected indexes for cluster center tags: @initial_cluster_center_indexes\n"
        if $self->{_debug};
    @initial_cluster_center_tags = map {$self->{_data_tags}->[$_]} @initial_cluster_center_indexes;
    my @initial_cluster_center_coords = map {$self->{_data_hash}->{$_}} @initial_cluster_center_tags;
    if ($self->{_debug}) {
        foreach my $centroid (@initial_cluster_center_coords) {
            print "Initial cluster center coords: @{$centroid}\n";
        }
    }
    my $initial_clusters = $self->assign_data_to_clusters_initial(\@initial_cluster_center_coords);
    if ($self->{_data_dimensions} == 3) {
        $visualization_msg = "initial_clusters";
        $self->visualize_clusters_on_sphere($visualization_msg, $initial_clusters)
            if $self->{_visualize_each_iteration};
        $self->visualize_clusters_on_sphere($visualization_msg, $initial_clusters, "png")
            if $self->{_make_png_for_each_iteration};
    }
    foreach my $cluster (@$initial_clusters) {
        my ($mean, $covariance) = $self->estimate_mean_and_covariance($cluster);
        display_mean_and_covariance($mean, $covariance) if $self->{_debug};
    }
    my @clusters = @$initial_clusters;
    display_clusters(\@clusters) if $self->{_debug};
    my $iteration_index = 0;
    my $unimodal_correction_flag;       # set when a correction fires; not read below
    my $previous_min_value_for_unimodality_quotient;
    while ($iteration_index < $self->{_max_iterations}) {
        print "\n\n========================== STARTING ITERATION $iteration_index =====================\n\n"
            if $self->{_terminal_output};
        my $total_reconstruction_error_this_iteration = 0;
        my @subspace_construction_errors_this_iteration;
        my @trailing_eigenvec_matrices_for_all_subspaces;
        my @reference_vecs_for_all_subspaces;
        foreach my $cluster (@clusters) {
            next if @$cluster == 0;
            my ($mean, $covariance) = $self->estimate_mean_and_covariance($cluster);
            display_mean_and_covariance($mean, $covariance) if $self->{_debug};
            print "--------------end of displaying mean and covariance\n\n" if $self->{_debug};
            my ($eigenvecs, $eigenvals) = $self->eigen_analysis_of_covariance($covariance);
            # The trailing (smallest-eigenvalue) eigenvectors span the
            # directions orthogonal to the P-dimensional manifold; the
            # sum of their eigenvalues is the subspace construction error.
            my @trailing_eigenvecs = @{$eigenvecs}[$self->{_P} .. $self->{_data_dimensions}-1];
            my @trailing_eigenvals = @{$eigenvals}[$self->{_P} .. $self->{_data_dimensions}-1];
            my $subspace_construction_error = reduce {abs($a) + abs($b)} @trailing_eigenvals;
            push @subspace_construction_errors_this_iteration, $subspace_construction_error;
            my $trailing_eigenvec_matrix = Math::GSL::Matrix->new($self->{_data_dimensions},
                                                                  scalar(@trailing_eigenvecs));
            foreach my $j (0..@trailing_eigenvecs-1) {
                print "trailing eigenvec column: @{$trailing_eigenvecs[$j]}\n" if $self->{_debug};
                $trailing_eigenvec_matrix->set_col($j, $trailing_eigenvecs[$j]);
            }
            push @trailing_eigenvec_matrices_for_all_subspaces, $trailing_eigenvec_matrix;
            push @reference_vecs_for_all_subspaces, $mean;
        }
        $self->set_termination_reconstruction_error_threshold(\@reference_vecs_for_all_subspaces);
        my %best_subspace_based_partition_of_data;
        foreach my $i (0..$self->{_KM}-1) {
            $best_subspace_based_partition_of_data{$i} = [];
        }
        # Re-assign every data point to the subspace that reconstructs
        # it with least error.
        foreach my $data_tag (@{$self->{_data_tags}}) {
            my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
            $data_vec->set_col(0, $self->{_data_hash}->{$data_tag});
            my @errors = map {$self->reconstruction_error($data_vec,
                                        $trailing_eigenvec_matrices_for_all_subspaces[$_],
                                        $reference_vecs_for_all_subspaces[$_])}
                         0 .. $self->{_KM}-1;
            my ($minval, $index_for_closest_subspace) = minimum(\@errors);
            $total_reconstruction_error_this_iteration += $minval;
            push @{$best_subspace_based_partition_of_data{$index_for_closest_subspace}},
                 $data_tag;
        }
        print "Finished calculating the eigenvectors for the clusters produced by the previous\n" .
              "iteration and re-assigning the data samples to the new subspaces on the basis of\n".
              "the least reconstruction error.\n\n" .
              "Total reconstruction error in this iteration: $total_reconstruction_error_this_iteration\n"
            if $self->{_terminal_output};
        foreach my $i (0..$self->{_KM}-1) {
            $clusters[$i] = $best_subspace_based_partition_of_data{$i};
        }
        display_clusters(\@clusters) if $self->{_terminal_output};
        # Check if any cluster has lost all its elements.  If so, fragment the worst
        # existing cluster to create the additional clusters needed:
        if (any {@$_ == 0} @clusters) {
            die "empty cluster found" if $self->{_auto_retry_flag};
            print "\nOne or more clusters have become empty. Will carve out the needed clusters\n" .
                  "from the cluster with the largest subspace construction error.\n\n";
            my $how_many_extra_clusters_needed = $self->{_KM} - scalar(grep {@$_ != 0} @clusters);
            print "number of extra clusters needed at iteration $iteration_index: $how_many_extra_clusters_needed\n";
            # BUG FIX: the original (a) cleared
            # @subspace_construction_errors_this_iteration to () *before*
            # taking List::Util::max of it, so $max was always undef, and
            # (b) used List::Util::first -- which returns the matching
            # *value*, not its position -- as an array index.  Find the
            # index of the worst cluster before clearing the accumulators.
            # NOTE(review): the error list was built from the clusters as
            # they stood before re-assignment; assumed to still identify
            # the worst cluster -- confirm against upstream sources.
            my $maxindex = 0;
            foreach my $i (1 .. $#subspace_construction_errors_this_iteration) {
                $maxindex = $i
                    if $subspace_construction_errors_this_iteration[$i] >
                       $subspace_construction_errors_this_iteration[$maxindex];
            }
            $total_reconstruction_error_this_iteration = 0;
            @subspace_construction_errors_this_iteration = ();
            my @cluster_fragments = cluster_split($clusters[$maxindex],
                                                  $how_many_extra_clusters_needed + 1);
            my @newclusters;
            push @newclusters, @clusters[0 .. $maxindex-1];
            push @newclusters, @clusters[$maxindex+1 .. $self->{_KM}-1];
            push @newclusters, @cluster_fragments;
            @newclusters = grep {@$_ != 0} @newclusters;
            # _KM is updated to the number of clusters the fragmentation
            # actually produced.  (The original hid this assignment inside
            # the die condition: `unless $self->{_KM} = @newclusters'.)
            $self->{_KM} = scalar @newclusters;
            die "something went wrong with cluster fragmentation"
                unless $self->{_KM} > 0;
            @trailing_eigenvec_matrices_for_all_subspaces = ();
            @reference_vecs_for_all_subspaces = ();
            foreach my $cluster (@newclusters) {
                die "Linear Manifold Clustering did not work $!" if @$cluster == 0;
                # BUG FIX: these are method calls; the original invoked
                # them as plain subs, which would have bound $cluster /
                # $covariance to $self and crashed this branch.
                my ($mean, $covariance) = $self->estimate_mean_and_covariance($cluster);
                my ($eigenvecs, $eigenvals) = $self->eigen_analysis_of_covariance($covariance);
                my @trailing_eigenvecs = @{$eigenvecs}[$self->{_P} .. $self->{_data_dimensions}-1];
                my @trailing_eigenvals = @{$eigenvals}[$self->{_P} .. $self->{_data_dimensions}-1];
                my $subspace_construction_error = reduce {abs($a) + abs($b)} @trailing_eigenvals;
                push @subspace_construction_errors_this_iteration, $subspace_construction_error;
                my $trailing_eigenvec_matrix = Math::GSL::Matrix->new($self->{_data_dimensions},
                                                                      scalar(@trailing_eigenvecs));
                foreach my $j (0..@trailing_eigenvecs-1) {
                    $trailing_eigenvec_matrix->set_col($j, $trailing_eigenvecs[$j]);
                }
                push @trailing_eigenvec_matrices_for_all_subspaces, $trailing_eigenvec_matrix;
                push @reference_vecs_for_all_subspaces, $mean;
            }
            my %best_subspace_based_partition_of_data;
            foreach my $i (0..$self->{_KM}-1) {
                $best_subspace_based_partition_of_data{$i} = [];
            }
            foreach my $data_tag (@{$self->{_data_tags}}) {
                my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
                $data_vec->set_col(0, $self->{_data_hash}->{$data_tag});
                # BUG FIX: method call, as above.
                my @errors = map {$self->reconstruction_error($data_vec,
                                            $trailing_eigenvec_matrices_for_all_subspaces[$_],
                                            $reference_vecs_for_all_subspaces[$_])}
                             0 .. $self->{_KM}-1;
                my ($minval, $index_for_closest_subspace) = minimum(\@errors);
                $total_reconstruction_error_this_iteration += $minval;
                push @{$best_subspace_based_partition_of_data{$index_for_closest_subspace}},
                     $data_tag;
            }
            print "empty-cluster jag: total reconstruction error in this iteration: \n" .
                  "$total_reconstruction_error_this_iteration\n"
                if $self->{_debug};
            foreach my $i (0..$self->{_KM}-1) {
                $clusters[$i] = $best_subspace_based_partition_of_data{$i};
            }
            display_clusters(\@newclusters) if $self->{_terminal_output};
            @clusters = grep {@$_ != 0} @newclusters;
            die "linear manifold based algorithm does not appear to work in this case $!"
                unless @clusters == $self->{_KM};
        }# end of foreach my $cluster (@clusters) ... loop  followed by if clause for empty clusters
        if ($self->{_data_dimensions} == 3) {
            $visualization_msg = "clustering_at_iteration_$iteration_index";
            $self->visualize_clusters_on_sphere($visualization_msg, \@clusters)
                if $self->{_visualize_each_iteration};
            $self->visualize_clusters_on_sphere($visualization_msg, \@clusters, "png")
                if $self->{_make_png_for_each_iteration};
        }
        # Unimodality check: a low quotient suggests a cluster straddles
        # two manifolds; the worst two clusters are pooled and re-split.
        my @cluster_unimodality_quotients = map {$self->cluster_unimodality_quotient($clusters[$_],
                                       $reference_vecs_for_all_subspaces[$_])} 0..@clusters-1;
        my $min_value_for_unimodality_quotient = List::Util::min @cluster_unimodality_quotients;
        print "\nCluster unimodality quotients: @cluster_unimodality_quotients\n" if $self->{_terminal_output};
        die "\n\nBailing out!\n" .
            "It does not look like these iterations will lead to a good clustering result.\n" .
            "Program terminating. Try running again.\n"
            if defined($previous_min_value_for_unimodality_quotient)
               && ($min_value_for_unimodality_quotient < 0.4)
               && ($min_value_for_unimodality_quotient < (0.5 * $previous_min_value_for_unimodality_quotient));
        if ( $min_value_for_unimodality_quotient < 0.5 ) {
            $unimodal_correction_flag = 1;
            print "\nApplying unimodality correction:\n\n" if $self->{_terminal_output};
            my @sorted_cluster_indexes =
                sort {$cluster_unimodality_quotients[$b] <=> $cluster_unimodality_quotients[$a]} 0..@clusters-1;
            my @newclusters;
            foreach my $cluster_index (0..@clusters - 1) {
                push @newclusters, $clusters[$sorted_cluster_indexes[$cluster_index]];
            }
            @clusters = @newclusters;
            # Pool the two least-unimodal clusters, shuffle, and split
            # the pool into two fresh clusters of (near) equal size.
            my $worst_cluster = pop @clusters;
            print "\nthe worst cluster: @$worst_cluster\n" if $self->{_terminal_output};
            my $second_worst_cluster = pop @clusters;
            print "\nthe second worst cluster: @$second_worst_cluster\n" if $self->{_terminal_output};
            push @$worst_cluster, @$second_worst_cluster;
            fisher_yates_shuffle($worst_cluster);
            my @first_half = @$worst_cluster[0 .. int(scalar(@$worst_cluster)/2) - 1];
            my @second_half = @$worst_cluster[int(scalar(@$worst_cluster)/2) .. @$worst_cluster - 1];
            push @clusters, \@first_half;
            push @clusters, \@second_half;
            if ($self->{_terminal_output}) {
                print "\n\nShowing the clusters obtained after applying the unimodality correction:\n";
                display_clusters(\@clusters);
            }
        }
        # Convergence: stop when the reconstruction error improvement
        # drops below the (scale-normalized) threshold.  The original
        # peeked at the last recorded error by popping it and pushing it
        # back; reading the last element directly is equivalent.
        if (@{$self->{_reconstruction_error_as_a_function_of_iteration}} > 0) {
            my $last_recon_error = $self->{_reconstruction_error_as_a_function_of_iteration}->[-1];
            if (($last_recon_error - $total_reconstruction_error_this_iteration)
                < $self->{_delta_normalized_error}) {
                push @{$self->{_reconstruction_error_as_a_function_of_iteration}},
                     $total_reconstruction_error_this_iteration;
                last;
            }
        }
        push @{$self->{_reconstruction_error_as_a_function_of_iteration}},
             $total_reconstruction_error_this_iteration;
        $iteration_index++;
        $previous_min_value_for_unimodality_quotient = $min_value_for_unimodality_quotient;
    } # end of while loop on iteration_index
    $self->{_num_iterations_actually_used} =
        scalar @{$self->{_reconstruction_error_as_a_function_of_iteration}};
    if ($self->{_terminal_output}) {
        print "\nIterations of the main loop terminated at iteration number $iteration_index.\n";
        print "Will now invoke graph partitioning to discover dominant clusters and to\n" .
              "merge small clusters.\n\n" if $self->{_cluster_search_multiplier} > 1;
        print "Total reconstruction error as a function of iterations: " .
              "@{$self->{_reconstruction_error_as_a_function_of_iteration}}";
    }
    # now merge sub-clusters if cluster_search_multiplier > 1
    my @final_clusters;
    if ($self->{_cluster_search_multiplier} > 1) {
        print "\n\nInvoking recursive graph partitioning to merge small clusters\n\n";
        my @array_of_partitioned_cluster_groups = (\@clusters);
        my @partitioned_cluster_groups;
        my $how_many_clusters_looking_for = $self->{_K};
        # Recursively bipartition cluster groups; a singleton side of a
        # partition becomes one final cluster.
        while (scalar(@final_clusters) < $self->{_K}) {
            @partitioned_cluster_groups =
                $self->graph_partition(shift @array_of_partitioned_cluster_groups,
                                       $how_many_clusters_looking_for );
            if (@{$partitioned_cluster_groups[0]} == 1) {
                my $singular_cluster = shift @{$partitioned_cluster_groups[0]};
                push @final_clusters, $singular_cluster;
                $how_many_clusters_looking_for--;
                push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[1];
            } elsif (@{$partitioned_cluster_groups[1]} == 1) {
                my $singular_cluster = shift @{$partitioned_cluster_groups[1]};
                push @final_clusters, $singular_cluster;
                $how_many_clusters_looking_for--;
                push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[0];
            } else {
                push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[0];
                push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[1];
            }
        }
        my @data_clustered;
        foreach my $cluster (@final_clusters) {
            push @data_clustered, @$cluster;
        }
        # Any data tags left out by graph partitioning are assigned to
        # the nearest (least-reconstruction-error) final cluster.
        unless (scalar(@data_clustered) == scalar(@{$self->{_data_tags}})) {
            $self->{_final_clusters} = \@final_clusters;
            my %data_clustered = map {$_ => 1} @data_clustered;
            my @data_tags_not_clustered =
                grep {$_} map {exists $data_clustered{$_} ? undef : $_} @{$self->{_data_tags}};
            if ($self->{_terminal_output}) {
                print "\n\nNot all data clustered.  The most reliable clusters found by graph partitioning:\n";
                display_clusters(\@final_clusters);
                print "\n\nData not yet clustered:\n\n@data_tags_not_clustered\n";
            }
            if ($self->{_data_dimensions} == 3) {
                $visualization_msg = "$self->{_K}_best_clusters_produced_by_graph_partitioning";
                $self->visualize_clusters_on_sphere($visualization_msg, \@final_clusters)
                    if $self->{_visualize_each_iteration};
                $self->visualize_clusters_on_sphere($visualization_msg, \@final_clusters, "png")
                    if $self->{_make_png_for_each_iteration};
            }
            my %data_tags_to_cluster_label_hash;
            foreach my $i (0..@final_clusters-1) {
                map {$data_tags_to_cluster_label_hash{$_} = $i} @{$final_clusters[$i]};
            }
            $self->{_data_tags_to_cluster_label_hash} = \%data_tags_to_cluster_label_hash;
            foreach my $tag (@data_tags_not_clustered) {
                my $which_cluster = $self->which_cluster_for_new_element($tag);
                $self->{_data_tags_to_cluster_label_hash}->{$tag} = $which_cluster;
            }
            die "Some data elements are still missing from the final tally"
                unless scalar(keys %{$self->{_data_tags_to_cluster_label_hash}}) ==
                       scalar(@{$self->{_data_tags}});
            # Rebuild the final clusters from the completed tag-to-label map.
            my @new_final_clusters;
            map { foreach my $ele (keys %{$self->{_data_tags_to_cluster_label_hash}}) {
                      push @{$new_final_clusters[$_]}, $ele
                          if $self->{_data_tags_to_cluster_label_hash}->{$ele} == $_ }
            } 0..$self->{_K}-1;
            if ($self->{_debug}) {
                print "\ndisplaying the final clusters after accounting for unclustered data:\n";
                display_clusters(\@new_final_clusters);
            }
            $self->{_final_clusters} = \@new_final_clusters;
            @final_clusters = @new_final_clusters;
        }
    } else {
        @final_clusters = @clusters;
    }
    print "\n\nDisplaying final clustering results:\n\n" if $self->{_terminal_output};
    display_clusters(\@final_clusters) if $self->{_terminal_output};
    return \@final_clusters;
}
493 | |||||||
# Prints the Phase-1 iteration count and the per-iteration total
# reconstruction errors recorded during clustering.
sub display_reconstruction_errors_as_a_function_of_iterations {
    my $self = shift;
    my $iterations_used = $self->{_num_iterations_actually_used};
    my $error_history   = join ' ', @{$self->{_reconstruction_error_as_a_function_of_iteration}};
    print "\n\nNumber of iterations used in Phase 1: $iterations_used\n";
    print "\nTotal reconstruction error as a function of iterations in Phase 1: $error_history\n";
}
500 | |||||||
# Derives the convergence threshold for the main iteration loop.  The
# average Euclidean magnitude of the cluster reference (mean) vectors is
# stored as _scale_factor, and _delta_normalized_error is set to that
# average times the user-supplied _delta_reconstruction_error, so the
# termination test is scale-invariant with respect to the data.
#
# Argument: $all_ref_vecs - arrayref of d x 1 Math::GSL::Matrix mean vectors.
sub set_termination_reconstruction_error_threshold {
    my $self = shift;
    my $all_ref_vecs = shift;
    my @mean_vecs = @$all_ref_vecs;
    my $sum_of_mean_magnitudes = 0;
    foreach my $mean_vec (@mean_vecs) {
        # v' * v is a 1x1 matrix whose sole entry is the squared norm.
        my $squared_norm_matrix = transpose($mean_vec) * $mean_vec;
        my ($squared_norm) = $squared_norm_matrix->as_list;
        $sum_of_mean_magnitudes += sqrt($squared_norm);
    }
    my $average_magnitude = $sum_of_mean_magnitudes / @mean_vecs;
    $self->{_scale_factor} = $average_magnitude;
    $self->{_delta_normalized_error} = $average_magnitude * $self->{_delta_reconstruction_error};
}
513 | |||||||
# This method is called only in the `unless' clause at the end of the main
# linear_manifold_clusterer() method.  It is called to find the cluster labels for
# those data elements that were left unclustered by the main part of the algorithm
# when graph partitioning is used to merge similar sub-clusters.  The operating logic
# here is that graph partition yields K main clusters even though each main cluster
# may not yet be fully populated.
#
# Argument: $data_tag - the record label of the unclustered data element.
# Returns:  the index (0 .. K-1) of the final cluster whose subspace
#           reconstructs the element with least error.
sub which_cluster_for_new_element {
    my $self = shift;
    my $data_tag = shift;
    # The following `unless' clause is called only the first time the current method
    # is called: it lazily computes and caches, for each of the K final
    # clusters, the trailing-eigenvector matrix and the reference (mean)
    # vector that define the cluster's subspace.
    unless (@{$self->{_final_trailing_eigenvec_matrices_for_all_subspaces}} > 0) {
        my @trailing_eigenvec_matrices_for_all_subspaces;
        my @reference_vecs_for_all_subspaces;
        foreach my $cluster (@{$self->{_final_clusters}}) {
            my ($mean, $covariance) = $self->estimate_mean_and_covariance($cluster);
            my ($eigenvecs, $eigenvals) = $self->eigen_analysis_of_covariance($covariance);
            # Trailing eigenvectors = the (d - P) smallest-eigenvalue
            # directions, orthogonal to the cluster's manifold.
            my @trailing_eigenvecs = @{$eigenvecs}[$self->{_P} .. $self->{_data_dimensions}-1];
            my $trailing_eigenvec_matrix = Math::GSL::Matrix->new($self->{_data_dimensions},
                                                                  scalar(@trailing_eigenvecs));
            foreach my $j (0..@trailing_eigenvecs-1) {
                $trailing_eigenvec_matrix->set_col($j, $trailing_eigenvecs[$j]);
            }
            push @trailing_eigenvec_matrices_for_all_subspaces,$trailing_eigenvec_matrix;
            push @reference_vecs_for_all_subspaces, $mean;
        }
        $self->{_final_trailing_eigenvec_matrices_for_all_subspaces} =
                                             \@trailing_eigenvec_matrices_for_all_subspaces;
        $self->{_final_reference_vecs_for_all_subspaces} = \@reference_vecs_for_all_subspaces;
    }
    # Score the element against each final cluster's subspace and return
    # the index of the least-error subspace.
    my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
    $data_vec->set_col(0, $self->{_data_hash}->{$data_tag});
    my @errors = map {$self->reconstruction_error($data_vec,
                                $self->{_final_trailing_eigenvec_matrices_for_all_subspaces}->[$_],
                                $self->{_final_reference_vecs_for_all_subspaces}->[$_])}
                 0 .. $self->{_K}-1;
    my ($minval, $index_for_closest_subspace) = minimum(\@errors);
    return $index_for_closest_subspace;
}
553 | |||||||
# Partitions (and, for small K, merges) the given sub-clusters by spectral
# partitioning of a cluster-similarity graph.  Each sub-cluster is a node of the
# graph; the edge weight between two nodes combines (1) how well each cluster's
# data is reconstructed through the other cluster's local subspace and (2) the
# squared distance between the two cluster means.  The partition is read off the
# signs of the components of an eigenvector of the symmetric normalized graph
# Laplacian.
#
# Returns a pair of arrayrefs of clusters.  When $how_many_clusters_looking_for
# is at most 3, the first arrayref holds a single finished cluster (possibly
# formed by merging several nodes) and the second the clusters still to be
# processed; otherwise the nodes are simply split into a "positive" group and a
# "negative" group according to the partitioning eigenvector.
sub graph_partition {
    my $self = shift;
    my $clusters = shift;                       # ref to array of sub-clusters (tag lists)
    my $how_many_clusters_looking_for = shift;  # the target number of clusters, K
    print "\n\nGraph partitioning looking for $how_many_clusters_looking_for clusters\n\n";
    my $num_nodes = scalar @$clusters;
    my $W = Math::GSL::Matrix->new($num_nodes,$num_nodes);    # similarity (adjacency) matrix
    my $D = Math::GSL::Matrix->new($num_nodes,$num_nodes);    # degree matrix
    $D->identity;
    my $neg_sqrt_of_D = Math::GSL::Matrix->new($num_nodes,$num_nodes);   # will hold D^(-1/2)
    $neg_sqrt_of_D->identity;
    my @subspace_construction_errors;
    my @trailing_eigenvec_matrices_for_all_subspaces;
    my @reference_vecs_for_all_subspaces;
    # For every sub-cluster, estimate its local linear subspace: the mean serves
    # as the reference vector, and the trailing eigenvectors of the covariance
    # span the directions orthogonal to the local manifold patch.
    foreach my $cluster (@$clusters) {
        my ($mean, $covariance) = $self->estimate_mean_and_covariance($cluster);
        my ($eigenvecs, $eigenvals) = $self->eigen_analysis_of_covariance($covariance);
        my @trailing_eigenvecs = @{$eigenvecs}[$self->{_P} .. $self->{_data_dimensions}-1];
        my @trailing_eigenvals = @{$eigenvals}[$self->{_P} .. $self->{_data_dimensions}-1];
        # Sum of magnitudes of the trailing eigenvalues.  (With a single trailing
        # eigenvalue, reduce returns it without abs; covariance eigenvalues are
        # non-negative in exact arithmetic, so this is benign.)
        my $subspace_construction_error = reduce {abs($a) + abs($b)} @trailing_eigenvals;
        push @subspace_construction_errors, $subspace_construction_error;
        my $trailing_eigenvec_matrix = Math::GSL::Matrix->new($self->{_data_dimensions},
                                                           scalar(@trailing_eigenvecs));
        foreach my $j (0..@trailing_eigenvecs-1) {
            print "trailing eigenvec column: @{$trailing_eigenvecs[$j]}\n" if $self->{_debug};
            $trailing_eigenvec_matrix->set_col($j, $trailing_eigenvecs[$j]);
        }
        push @trailing_eigenvec_matrices_for_all_subspaces,$trailing_eigenvec_matrix;
        push @reference_vecs_for_all_subspaces, $mean;
    }
    # We consider the similarity matrix W to be a sum of two parts, W_recon_based
    # and W_dist_bet_means_based.  IMPORTANT: For coding reasons, we first store
    # the two similarity measures separately in those component matrices.  Our
    # goal is to fill them with the raw values while at the same time collecting
    # the information needed for normalizing the two separate measures.
    my $W_reconstruction_error_based = Math::GSL::Matrix->new($num_nodes,$num_nodes);
    my $W_dist_between_means_based = Math::GSL::Matrix->new($num_nodes,$num_nodes);
    my @all_pairwise_reconstruction_errors;
    my @all_dist_between_means_errors;
    foreach my $i (0..$num_nodes-1) {
        foreach my $j (0..$num_nodes-1) {
            my ($recon_error_similarity, $dist_bet_means) = $self->pairwise_cluster_similarity(
                                                 $clusters->[$i],
                                                 $trailing_eigenvec_matrices_for_all_subspaces[$i],
                                                 $reference_vecs_for_all_subspaces[$i],
                                                 $clusters->[$j],
                                                 $trailing_eigenvec_matrices_for_all_subspaces[$j],
                                                 $reference_vecs_for_all_subspaces[$j]);
            $W_reconstruction_error_based->set_elem($i, $j, $recon_error_similarity);
            $W_dist_between_means_based->set_elem($i, $j, $dist_bet_means);
            push @all_pairwise_reconstruction_errors, $recon_error_similarity;
            push @all_dist_between_means_errors, $dist_bet_means;
        }
    }
    # Normalizers: the average raw value of each measure over all node pairs.
    my $recon_error_normalizer = (reduce {$a + $b} @all_pairwise_reconstruction_errors) /
                                                (scalar @all_pairwise_reconstruction_errors);
    my $dist_bet_means_based_normalizer = (reduce {$a + $b} @all_dist_between_means_errors ) /
                                                (scalar @all_dist_between_means_errors );
    # A zero average means every pair is identical under a measure --- degenerate
    # clusters --- and the Gaussian kernel below would be meaningless:
    die "\n\nBailing out!\n" .
        "Dealing with highly defective clusters. Try again.\n"
        if ($recon_error_normalizer == 0) || ($dist_bet_means_based_normalizer == 0);
    # Convert each raw dissimilarity into a similarity in (0,1] through an
    # exponential kernel with the average value as the scale:
    foreach my $i (0..$num_nodes-1) {
        foreach my $j (0..$num_nodes-1) {
            my $recon_val = $W_reconstruction_error_based->get_elem($i,$j);
            my $new_recon_val = exp( -1.0 * $recon_val / $recon_error_normalizer );
            $W_reconstruction_error_based->set_elem($i,$j,$new_recon_val);
            my $mean_dist_val = $W_dist_between_means_based->get_elem($i,$j);
            my $new_mean_dist_val = exp( -1.0 * $mean_dist_val / $dist_bet_means_based_normalizer );
            $W_dist_between_means_based->set_elem($i,$j,$new_mean_dist_val);
        }
    }
    $W = $W_reconstruction_error_based + $W_dist_between_means_based;
    if ($self->{_debug}) {
        print "\nDisplaying the similarity matrix W for the cluster graph:\n";
        display_matrix($W) if $self->{_terminal_output};
    }
    # Node degrees: the i-th degree is the sum of row i of W (obtained here by
    # summing all columns of W into a single column vector).
    my $add_all_columns = Math::GSL::Matrix->new($num_nodes,1);
    foreach my $col (0..$num_nodes-1) {
        $add_all_columns += $W->col($col);
    }
    foreach my $i (0..$num_nodes-1) {
        $D->set_elem($i,$i, $add_all_columns->get_elem($i,0));
        $neg_sqrt_of_D->set_elem($i,$i, 1.0 / sqrt($add_all_columns->get_elem($i,0)));
    }
    # the Laplacian matrix:
    my $Laplacian = $D - $W;
    # the Symmetric Normalized Laplacian matrix A = D^(-1/2) L D^(-1/2):
    my $A = $neg_sqrt_of_D * $Laplacian * $neg_sqrt_of_D;
    # Zero out numerically tiny entries before the eigen-decomposition:
    foreach my $i (0..$num_nodes-1) {
        foreach my $j (0..$num_nodes-1) {
            $A->set_elem($i,$j,0) if abs($A->get_elem($i,$j)) < 0.01;
        }
    }
    if ($self->{_terminal_output}) {
        print "\nDisplaying the Symmetric Normalized Laplacian matrix A:\n" .
              "A = neg_sqrt(D) * Laplacian_matrix * neg_sqrt(D)\n";
        display_matrix( $A );
    }
    my ($eigenvalues, $eigenvectors) = $A->eigenpair;
    my $num_of_eigens = @$eigenvalues;
    my $largest_eigen_index = 0;
    my $smallest_eigen_index = 0;
    if ($self->{_debug2}) {
        print "Eigenvalue 0: $eigenvalues->[0]\n";
        foreach my $i (1..$num_of_eigens-1) {
            $largest_eigen_index = $i if $eigenvalues->[$i] > $eigenvalues->[$largest_eigen_index];
            $smallest_eigen_index = $i if $eigenvalues->[$i] < $eigenvalues->[$smallest_eigen_index];
            print "Eigenvalue $i: $eigenvalues->[$i]\n";
        }
        print "\nlargest eigen index: $largest_eigen_index\n";
        print "\nsmallest eigen index: $smallest_eigen_index\n\n";
    }
    # GSL returns eigenvectors with complex entries stringified as "[mag,theta]";
    # since A is symmetric its eigenvectors are real, so theta must be "0"
    # (positive component) or "pi" (negative component).  Anything else means the
    # decomposition went wrong.
    my @all_my_eigenvecs;
    foreach my $i (0..$num_of_eigens-1) {
        my @vec = $eigenvectors->[$i]->as_list;
        my @eigenvec;
        foreach my $ele (@vec) {
            my ($mag, $theta) = $ele =~ /\[(\d*\.?\d*e?[+-]?\d*),(\S+)\]/;
            if ($theta eq "0") {
                push @eigenvec, $mag;
            } elsif ($theta eq "pi") {
                push @eigenvec, -1.0 * $mag;
            } else {
                die "Eigendecomposition produced a complex eigenvector!";
            }
        }
        print "Eigenvector $i: @eigenvec\n" if $self->{_debug2};
        push @all_my_eigenvecs, \@eigenvec;
    }
    if ($self->{_debug2}) {
        my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list;
        print "\nLargest eigenvector of A: @largest_eigen_vec\n";
    }
    # Sort the eigenvectors by DESCENDING eigenvalue:
    my @sorted_eigenvec_indexes = sort {$eigenvalues->[$b] <=> $eigenvalues->[$a]} 0..@all_my_eigenvecs-1;
    print "sorted eigenvec indexes for A: @sorted_eigenvec_indexes\n" if $self->{_debug2};
    my @sorted_eigenvecs;
    my @sorted_eigenvals;
    foreach my $i (0..@sorted_eigenvec_indexes-1) {
        $sorted_eigenvecs[$i] = $all_my_eigenvecs[$sorted_eigenvec_indexes[$i]];
        $sorted_eigenvals[$i] = $eigenvalues->[$sorted_eigenvec_indexes[$i]];
    }
    if ($self->{_debug2}) {
        print "\nHere come sorted eigenvectors for A --- from the largest to the smallest:\n";
        foreach my $i (0..@sorted_eigenvecs-1) {
            print "eigenvec: @{$sorted_eigenvecs[$i]}   eigenvalue: $sorted_eigenvals[$i]\n";
        }
    }
    # With a descending sort, the second-to-last entry is the eigenvector for the
    # SECOND SMALLEST eigenvalue of A --- the spectral partitioning vector.  The
    # signs of its components assign each node to one side of the cut.
    my $best_partitioning_eigenvec = $sorted_eigenvecs[@sorted_eigenvec_indexes-2];
    print "\nBest graph partitioning eigenvector: @$best_partitioning_eigenvec\n" if $self->{_terminal_output};
    my $how_many_positive = reduce {$a + $b} map {$_ > 0 ? 1 : 0 } @$best_partitioning_eigenvec;
    my $how_many_negative = scalar(@$best_partitioning_eigenvec) - $how_many_positive;
    print "Have $how_many_positive positive and $how_many_negative negative elements in the partitioning vec\n"
        if $self->{_terminal_output};
    if ($how_many_clusters_looking_for <= 3) {
        # Small K: try to peel off one finished cluster.  A side of the cut with a
        # single node becomes a finished cluster directly; a side with at most
        # _cluster_search_multiplier nodes is merged into one finished cluster.
        my @merged_cluster;
        my $final_cluster;
        my @newclusters;
        if ($how_many_positive == 1) {
            foreach my $i (0..@$clusters-1) {
                if ($best_partitioning_eigenvec->[$i] > 0) {
                    $final_cluster = $clusters->[$i];
                } else {
                    push @newclusters, $clusters->[$i];
                }
            }
            return ([$final_cluster], \@newclusters);
        } elsif ($how_many_negative == 1) {
            foreach my $i (0..@$clusters-1) {
                if ($best_partitioning_eigenvec->[$i] < 0) {
                    $final_cluster = $clusters->[$i];
                } else {
                    push @newclusters, $clusters->[$i];
                }
            }
            return ([$final_cluster], \@newclusters);
        } elsif ($how_many_positive <= $self->{_cluster_search_multiplier}) {
            foreach my $i (0..@$clusters-1) {
                if ($best_partitioning_eigenvec->[$i] > 0) {
                    push @merged_cluster, @{$clusters->[$i]};
                } else {
                    push @newclusters, $clusters->[$i];
                }
            }
            return ([\@merged_cluster], \@newclusters);
        } elsif ($how_many_negative <= $self->{_cluster_search_multiplier}) {
            foreach my $i (0..@$clusters-1) {
                if ($best_partitioning_eigenvec->[$i] < 0) {
                    push @merged_cluster, @{$clusters->[$i]};
                } else {
                    push @newclusters, $clusters->[$i];
                }
            }
            return ([\@merged_cluster], \@newclusters);
        } else {
            # Neither side of the cut is small enough to form one final cluster:
            die "\n\nBailing out!\n\n" .
                "No consensus support for dominant clusters in the graph partitioning step\n" .
                "of the algorithm. This can be caused by bad random selection of initial\n" .
                "cluster centers.  Please run this program again.\n";
        }
    } else {
        # Larger K: just split the nodes by the sign of their eigenvector entry.
        my @positive_clusters;
        my @negative_clusters;
        foreach my $i (0..@$clusters-1) {
            if ($best_partitioning_eigenvec->[$i] > 0) {
                push @positive_clusters, $clusters->[$i];
            } else {
                push @negative_clusters, $clusters->[$i];
            }
        }
        return (\@positive_clusters, \@negative_clusters);
    }
}
767 | |||||||
# Computes two measures of (dis)similarity between a pair of clusters:
#
#   1. total_reconstruction_error: the sum of the errors incurred when the data
#      elements of cluster 1 are reconstructed through the subspace of cluster 2
#      and vice versa.  The smaller the value, the more alike the two local
#      manifold patches.
#
#   2. dist_between_means_based_error: the squared Euclidean distance between
#      the two cluster means (reference vectors).
#
# Both raw values are subsequently normalized and passed through an exponential
# kernel by graph_partition() to build the similarity matrix W.
#
# (An unused accumulator variable from the original implementation has been
# removed; no behavior has changed.)
sub pairwise_cluster_similarity {
    my $self = shift;
    my $cluster1 = shift;                        # ref to list of data tags
    my $trailing_eigenvec_matrix_cluster1 = shift;
    my $reference_vec_cluster1 = shift;          # mean of cluster 1 (GSL column vector)
    my $cluster2 = shift;
    my $trailing_eigenvec_matrix_cluster2 = shift;
    my $reference_vec_cluster2 = shift;          # mean of cluster 2 (GSL column vector)
    # Reconstruction error of every element of cluster 1 through cluster 2's subspace:
    my @errors_for_1_on_2 = map {my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
                                 $data_vec->set_col(0, $self->{_data_hash}->{$_});
                                 $self->reconstruction_error($data_vec,
                                                             $trailing_eigenvec_matrix_cluster2,
                                                             $reference_vec_cluster2)}
                            @$cluster1;
    # ... and of every element of cluster 2 through cluster 1's subspace:
    my @errors_for_2_on_1 = map {my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
                                 $data_vec->set_col(0, $self->{_data_hash}->{$_});
                                 $self->reconstruction_error($data_vec,
                                                             $trailing_eigenvec_matrix_cluster1,
                                                             $reference_vec_cluster1)}
                            @$cluster2;
    # Sum of magnitudes of the errors.  (reconstruction_error() returns a
    # quadratic form that is non-negative, so abs() is only a safety net.)
    my $type_1_error = reduce {abs($a) + abs($b)} @errors_for_1_on_2;
    my $type_2_error = reduce {abs($a) + abs($b)} @errors_for_2_on_1;
    my $total_reconstruction_error = $type_1_error + $type_2_error;
    # Squared distance between the two cluster means:
    my $diff_between_the_means = $reference_vec_cluster1 - $reference_vec_cluster2;
    my $dist_squared = transpose($diff_between_the_means) * $diff_between_the_means;
    my $dist_between_means_based_error = ($dist_squared->as_list())[0];
    return ($total_reconstruction_error, $dist_between_means_based_error);
}
798 | |||||||
# Delta-ball test for cluster unimodality.  For a given cluster and its mean,
# returns the fraction of the cluster's data points that lie inside an
# axis-aligned box centered at the mean whose half-width along each dimension is
# $delta times the data range along that dimension.  Presumably a value near 1
# indicates a compact, single-blob cluster while small values suggest the
# cluster straddles several modes --- confirm against the caller's usage.
sub cluster_unimodality_quotient {
    my $self = shift;
    my $cluster = shift;          # ref to a list of data tags
    my $mean = shift;             # cluster mean as a GSL matrix ref
    my $delta = 0.4 * $self->{_scale_factor}; # Radius of the delta ball along each dimension
    my @mean = $mean->as_list;
    my @data_tags_for_range_tests;
    # For each dimension, collect the tags whose coordinate falls within
    # +/- ($delta * range) of the mean coordinate along that dimension:
    foreach my $dimen (0..$self->{_data_dimensions}-1) {
        my @values = map {$_->[$dimen]} map {$self->{_data_hash}->{$_}} @$cluster;
        my ($min, $max) = (List::Util::min(@values), List::Util::max(@values));
        my $range = $max - $min;
        my $mean_along_this_dimen = $mean[$dimen];
        my @tags = grep {$_}
                   map { ( ($self->{_data_hash}->{$_}->[$dimen] > $mean_along_this_dimen - $delta * $range)
                           &&
                           ($self->{_data_hash}->{$_}->[$dimen] < $mean_along_this_dimen + $delta * $range) )
                         ? $_ : undef }
                   @$cluster;
        push @data_tags_for_range_tests, \@tags;
    }
    # Now find the intersection of the tag sets for each of the dimensions; only
    # tags that are close to the mean in EVERY dimension survive:
    my %intersection_hash;
    foreach my $dimen (0..$self->{_data_dimensions}-1) {
        my %tag_hash_for_this_dimen = map {$_ => 1} @{$data_tags_for_range_tests[$dimen]};
        if ($dimen == 0) {
            %intersection_hash = %tag_hash_for_this_dimen;
        } else {
            %intersection_hash = map {$_ => 1} grep {$tag_hash_for_this_dimen{$_}}
                                 keys %intersection_hash;
        }
    }
    my @intersection_set = keys %intersection_hash;
    # Fraction of the cluster that lies inside the delta ball:
    my $cluster_unimodality_index = scalar(@intersection_set) / scalar(@$cluster);
    return $cluster_unimodality_index;
}
835 | |||||||
# For a given cluster, searches for the reference vector (the point from which
# reconstruction errors for the cluster's subspace are measured) that minimizes
# the total reconstruction error.  The candidates consist of the cluster mean
# plus up to 100 points drawn uniformly at random from the per-dimension
# bounding box of the cluster; the best candidate is returned as an arrayref of
# coordinates.
#
# NOTE(review): the reconstruction error of each candidate is accumulated over
# ALL data tags (@{$self->{_data_tags}}), not just the members of @$cluster ---
# presumably intentional, but worth confirming.
#
# BUG FIX: the previous version located the best random candidate with
# List::Util::first, which returns the matching VALUE rather than its index, and
# then used that value as an index into @random_points --- with the additional
# off-by-one of the mean still occupying slot 0 of @random_points after its
# error had been shifted off the error list.  The index is now tracked
# explicitly via minimum(), and the mean (whose error was computed but never
# used before) is returned when it beats every random candidate.
sub find_best_ref_vector {
    my $self = shift;
    my $cluster = shift;                   # ref to list of data tags
    my $trailing_eigenvec_matrix = shift;  # spans directions orthogonal to the subspace
    my $mean = shift;                      # cluster mean as a GSL matrix ref
    my @min_bounds;
    my @max_bounds;
    my @ranges;
    # Per-dimension bounding box of the cluster's data points:
    foreach my $dimen (0..$self->{_data_dimensions}-1) {
        my @values = map {$_->[$dimen]} map {$self->{_data_hash}->{$_}} @$cluster;
        my ($min, $max) = (List::Util::min(@values), List::Util::max(@values));
        push @min_bounds, $min;
        push @max_bounds, $max;
        push @ranges, $max - $min;
    }
    print "min bounds are: @min_bounds\n";
    print "max bounds are: @max_bounds\n";
    my $max_iterations = 100;
    my @random_points;
    my $iteration = 0;
    # Draw candidate reference vectors uniformly from the bounding box:
    while ($iteration++ < $max_iterations) {
        my @coordinate_vec;
        foreach my $dimen (0..$self->{_data_dimensions}-1) {
            push @coordinate_vec, $min_bounds[$dimen] + rand($ranges[$dimen]);
        }
        push @random_points, \@coordinate_vec;
    }
    if ($self->{_debug}) {
        print "\nrandom points\n";
        print "@$_\n" for @random_points;
    }
    # The cluster mean is always candidate number 0:
    my @mean = $mean->as_list;
    unshift @random_points, \@mean;
    my @reconstruction_errors;
    foreach my $candidate_ref_vec (@random_points) {
        my $ref_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
        $ref_vec->set_col(0, $candidate_ref_vec);
        my $reconstruction_error_for_a_ref_vec = 0;
        foreach my $data_tag (@{$self->{_data_tags}}) {
            my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
            $data_vec->set_col(0, $self->{_data_hash}->{$data_tag});
            $reconstruction_error_for_a_ref_vec +=
                $self->reconstruction_error($data_vec,$trailing_eigenvec_matrix,$ref_vec);
        }
        push @reconstruction_errors, $reconstruction_error_for_a_ref_vec;
    }
    # Slot 0 holds the error for the mean; compare it against the best random candidate:
    my $recon_error_for_original_mean = shift @reconstruction_errors;
    my ($smallest_error_randomly_selected_ref_vecs, $minindex) = minimum(\@reconstruction_errors);
    return \@mean
        if !defined($minindex)
           || $recon_error_for_original_mean <= $smallest_error_randomly_selected_ref_vecs;
    # +1 because the mean still occupies slot 0 of @random_points:
    return $random_points[$minindex + 1];
}
889 | |||||||
## The reconstruction error of a data point X relative to a subspace is the
## squared length of the perpendicular from X to the hyperplane that defines the
## subspace on the manifold.  It is the quadratic form  d' E E' d  where d is the
## displacement of X from the subspace's reference vector and E is the matrix
## whose columns are the trailing eigenvectors (directions orthogonal to the
## subspace).
sub reconstruction_error {
    my ($self, $data_vec, $trailing_eigenvecs, $ref_vec) = @_;
    # Displacement of the data point from the reference point of the subspace:
    my $displacement = $data_vec - $ref_vec;
    # Squared norm of the projection of the displacement onto the span of the
    # trailing eigenvectors:
    my $error_squared = transpose($displacement) * $trailing_eigenvecs *
                        transpose($trailing_eigenvecs) * $displacement;
    # The product is a 1x1 matrix; extract its single entry as a scalar:
    return ($error_squared->as_list())[0];
}
903 | |||||||
# Returns a set of KM random integers that serve as indices into the data array;
# the data elements at those indices become the initial cluster centers.  The
# returned indices are guaranteed to be distinct.  In addition, a quality check
# is run on each candidate set: the ratio of the span (max - min) of the chosen
# indices to N, the total number of data points, must exceed 0.3 so that the
# initial centers are drawn from a reasonable spread of the data; otherwise the
# whole set is discarded and a fresh one is drawn.
#
# BUG FIX: the previous duplicate-elimination loop re-drew a colliding index but
# only re-checked it against the single index it had just collided with, so the
# fresh draw could silently duplicate an EARLIER index.  A hash of all indices
# drawn so far now makes the uniqueness guarantee reliable.
sub initialize_cluster_centers {
    my $self = shift;
    my $K = shift;                 # set to the parameter KM at the call site
    my $data_store_size = shift;   # N, the total number of data elements
    my @cluster_center_indices;
    while (1) {
        my %seen;                  # every index drawn so far in this attempt
        @cluster_center_indices = ();
        while (@cluster_center_indices < $K) {
            my $candidate = int rand( $data_store_size );
            next if $seen{$candidate}++;         # reject any duplicate outright
            push @cluster_center_indices, $candidate;
        }
        my $min = List::Util::min(@cluster_center_indices);
        my $max = List::Util::max(@cluster_center_indices);
        my $quality = ($max - $min) / $data_store_size;
        last if $quality > 0.3;                  # accept only well-spread sets
    }
    return @cluster_center_indices;
}
935 | |||||||
# Forms the initial clusters: every data element is assigned to whichever of the
# supplied cluster centers it is closest to in Euclidean distance.  Returns a
# ref to an array of clusters, each cluster a ref to a list of data tags.
sub assign_data_to_clusters_initial {
    my ($self, $center_list) = @_;
    my @cluster_centers = @$center_list;
    my @clusters;
    foreach my $tag (@{$self->{_data_tags}}) {
        # Distance from this element to every center; the nearest one wins.
        my @distances = map { $self->distance($tag, $_) } @cluster_centers;
        my (undef, $winning_center_index) = minimum(\@distances);
        push @{$clusters[$winning_center_index]}, $tag;
    }
    return \@clusters;
}
954 | |||||||
# Computes the Euclidean distance between a data point specified by its symbolic
# name in the master datafile and a point (such as the center of a cluster)
# given directly as a reference to a vector of coordinates.  Dies if the two
# vectors differ in dimensionality.
sub distance {
    my ($self, $ele1_id, $point) = @_;
    my @ele1 = @{$self->{_data_hash}->{$ele1_id}};   # coordinates looked up by tag
    my @ele2 = @$point;
    die "wrong data types for distance calculation\n" if @ele1 != @ele2;
    my $squared_sum = 0;
    $squared_sum += ($ele1[$_] - $ele2[$_])**2 for 0 .. $#ele1;
    return sqrt $squared_sum;
}
972 | |||||||
# Writes each cluster to its own file --- cluster0.txt, cluster1.txt, etc. --- in
# the current directory, after removing any cluster*.txt files left over from a
# previous run.  The data-element tags of a cluster are written in sorted order,
# separated by spaces, on a single line.
#
# BUG FIX: the previous implementation piped the output through the external
# `sort' utility (via a 2-arg bareword open of a shell pipeline), but since all
# tags were printed on a single line the external sort had nothing to sort.  The
# tags are now sorted in Perl and written through a lexical 3-arg open.
sub write_clusters_to_files {
    my $self = shift;
    my $clusters = shift;               # ref to array of clusters (tag lists)
    my @clusters = @$clusters;
    unlink glob "cluster*.txt";         # clean out results of any previous run
    foreach my $i (0..@clusters-1) {
        my $filename = "cluster" . $i . ".txt";
        print "\nWriting cluster $i to file $filename\n" if $self->{_terminal_output};
        open my $fh, '>', $filename or die "Unable to open file: $!";
        print {$fh} "$_ " for sort @{$clusters[$i]};
        print {$fh} "\n";
        close $fh or die "Unable to close file: $!";
    }
}
988 | |||||||
# Destructor: removes the temporary plotting file whose name is derived from the
# data file's basename with its extension replaced by ".txt" and a "__temp_"
# prefix added (see visualize_clusters_on_sphere()).
sub DESTROY {
    my $self = shift;
    my $temp_name = basename($self->{_datafile});
    $temp_name =~ s/\.\w+$/\.txt/;
    unlink "__temp_" . $temp_name;
}
994 | |||||||
995 | ################################## Visualization Code ################################### | ||||||
996 | |||||||
# Given a ref to a list of data-element names, returns a reference to the
# component-wise sum of the coordinate vectors of those elements.  The result
# has the full data dimensionality, initialized to zeros (so an empty name list
# yields an all-zero vector).
sub add_point_coords {
    my ($self, $ids) = @_;
    my $dimensionality = $self->{_data_dimensions};
    my @sum = (0.0) x $dimensionality;
    foreach my $name (@$ids) {
        my $coords = $self->{_data_hash}->{$name};
        $sum[$_] += $coords->[$_] for 0 .. $#{$coords};
    }
    return \@sum;
}
1015 | |||||||
# This is the main module version: renders the clusters on the surface of a
# sphere with Gnuplot.  A temporary data file (one blank-line-separated index
# block per cluster) is written for Gnuplot to read; each cluster is re-tagged
# with a letter A..Z so that the plot legend shows one letter per cluster.
#
# Arguments: a title string for the plot; a ref to the array of clusters (deep
# copied, so the caller's tag lists are not disturbed by the re-tagging below);
# an optional hardcopy format (a true value triggers file output); and an
# optional pause time in seconds (when undefined, the plot window persists).
sub visualize_clusters_on_sphere {
    my $self = shift;
    my $visualization_msg = shift;
    my $clusters = deep_copy_AoA(shift);   # deep copy: the tags get rewritten below
    my $hardcopy_format = shift;
    my $pause_time = shift;
    my $d = $self->{_data_dimensions};
    my $temp_file = "__temp_" . $self->{_datafile};
    $temp_file =~ s/\.\w+$/\.txt/;
    unlink $temp_file if -e $temp_file;
    open OUTPUT, ">$temp_file"
           or die "Unable to open a temp file in this directory: $!";
    my @all_tags = "A".."Z";
    my @retagged_clusters;
    # Rewrite each data tag "<old>_<num>" as "<letter>_<num> <coords>" so that the
    # cluster letter and the point's coordinates travel together in one record:
    foreach my $cluster (@$clusters) {
        my $label = shift @all_tags;
        my @retagged_cluster =
           map {$_ =~ s/^(\w+?)_(\w+)/$label . "_$2 @{$self->{_data_hash}->{$_}}"/e;$_} @$cluster;
        push @retagged_clusters, \@retagged_cluster;
    }
    my %clusters;
    # Regroup the records by their cluster letter and pull out the coordinates:
    foreach my $cluster (@retagged_clusters) {
        foreach my $record (@$cluster) {
            my @splits = grep $_, split /\s+/, $record;
            $splits[0] =~ /(\w+?)_.*/;
            my $primary_cluster_label = $1;
            my @coords = @splits[1..$d];
            push @{$clusters{$primary_cluster_label}}, \@coords;
        }
    }
    # One index block per cluster, in case-insensitive alphabetical order of the
    # cluster letters (Gnuplot's `index' directive selects the blocks below):
    foreach my $key (sort {"\L$a" cmp "\L$b"} keys %clusters) {
        map {print OUTPUT "$_"} map {"@$_\n"} @{$clusters{$key}};
        print OUTPUT "\n\n";
    }
    my @sorted_cluster_keys = sort {"\L$a" cmp "\L$b"} keys %clusters;
    close OUTPUT;
    my $plot;
    # Without a pause time the Gnuplot window must persist after we return:
    unless (defined $pause_time) {
        $plot = Graphics::GnuplotIF->new( persist => 1 );
    } else {
        $plot = Graphics::GnuplotIF->new();
    }
    my $arg_string = "";
    $plot->gnuplot_cmd( "set hidden3d" ) unless $self->{_show_hidden_in_3D_plots};
    $plot->gnuplot_cmd( "set title \"$visualization_msg\"" );
    $plot->gnuplot_cmd( "set noclip" );
    $plot->gnuplot_cmd( "set pointsize 2" );
    $plot->gnuplot_cmd( "set parametric" );
    $plot->gnuplot_cmd( "set size ratio 1" );
    $plot->gnuplot_cmd( "set xlabel \"X\"" );
    $plot->gnuplot_cmd( "set ylabel \"Y\"" );
    $plot->gnuplot_cmd( "set zlabel \"Z\"" );
    # NOTE(review): the terminal is hard-coded to png regardless of the value of
    # $hardcopy_format; only the output file's extension follows the argument.
    if ($hardcopy_format) {
        $plot->gnuplot_cmd( "set terminal png" );
        my $image_file_name = "$visualization_msg\.$hardcopy_format";
        $plot->gnuplot_cmd( "set output \"$image_file_name\"" );
        $plot->gnuplot_cmd( "unset hidden3d" );
    }
    # set the range for azimuth angles:
    $plot->gnuplot_cmd( "set urange [0:2*pi]" );
    # set the range for the elevation angles:
    $plot->gnuplot_cmd( "set vrange [-pi/2:pi/2]" );
    # Parametric functions for the sphere.  Its radius tracks _scale_factor (the
    # average magnitude of the cluster means) when that has been computed:
    if ($self->{_scale_factor}) {
        $plot->gnuplot_cmd( "r=$self->{_scale_factor}" );
    } else {
        $plot->gnuplot_cmd( "r=1" );
    }
    $plot->gnuplot_cmd( "fx(v,u) = r*cos(v)*cos(u)" );
    $plot->gnuplot_cmd( "fy(v,u) = r*cos(v)*sin(u)" );
    $plot->gnuplot_cmd( "fz(v) = r*sin(v)" );
    my $sphere_arg_str = "fx(v,u),fy(v,u),fz(v) notitle with lines lt 0,";
    foreach my $i (0..scalar(keys %clusters)-1) {
        my $j = $i + 1;
        # The following statement puts the titles on the data points
        $arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"$sorted_cluster_keys[$i] \" with points lt $j pt $j, ";
    }
    # Strip the trailing ", " from the accumulated plot arguments.  (The match
    # result assigned here is immediately overwritten by the capture $1.)
    $arg_string = $arg_string =~ /^(.*),[ ]+$/;
    $arg_string = $1;
    $plot->gnuplot_cmd( "splot $sphere_arg_str $arg_string" );
    $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
}
1100 | |||||||
1101 | |||||||
1102 | ################################### Support Routines ######################################## | ||||||
1103 | |||||||
# Splits a cluster into $how_many roughly equal contiguous fragments and returns
# the list of fragment arrayrefs.  Dies when the split is impossible (fewer
# elements than requested fragments, which previously caused a raw
# division-by-zero) or when an element fails to be accounted for.
#
# BUG FIX: the previous version computed the fragment index as int($i/$delta)
# without clamping, so when the cluster size was not a multiple of $how_many the
# leftover elements spilled into a spurious extra fragment and MORE than
# $how_many fragments were returned.  The index is now clamped so the last
# fragment absorbs the remainder.
sub cluster_split {
    my $cluster = shift;
    my $how_many = shift;
    die "cannot split a cluster of " . scalar(@$cluster) .
        " elements into $how_many fragments"
        if $how_many < 1 || @$cluster < $how_many;
    my @cluster_fragments;
    foreach my $i (0..$how_many-1) {
        $cluster_fragments[$i] = [];
    }
    # Nominal fragment size; int() rounds down, so the remainder is absorbed by
    # the last fragment via the clamp below.
    my $delta = int( scalar(@$cluster) / $how_many );
    foreach my $i (0..@$cluster-1) {
        my $frag_index = int($i / $delta);
        $frag_index = $how_many - 1 if $frag_index > $how_many - 1;
        push @{$cluster_fragments[$frag_index]}, $cluster->[$i];
    }
    # Sanity check: every element must land in exactly one fragment.
    my $how_many_accounted_for = reduce {$a + $b} map {scalar(@$_)} @cluster_fragments;
    foreach my $frag (@cluster_fragments) {
        print "fragment: @$frag\n";
    }
    die "the fragmentation could not account for all the data"
        unless @$cluster == $how_many_accounted_for;
    return @cluster_fragments;
}
1124 | |||||||
# Returns the minimum value in the given arrayref together with its positional
# index, as the two-element list ($min, $index).  The earliest occurrence wins
# ties; an empty array yields (undef, undef).
sub minimum {
    my $arr = shift;
    my ($min, $index);
    my $pos = 0;
    for my $val (@$arr) {
        if (!defined($min) || $val < $min) {
            ($min, $index) = ($val, $pos);
        }
        $pos++;
    }
    return ($min, $index);
}
1138 | |||||||
# Scans the given arrayref and returns the pair ($min, $max), skipping any undef
# entries.  Both return values are undef when the array holds no defined
# elements.
sub minmax {
    my $arr = shift;
    my ($min, $max);
    for my $val (@$arr) {
        next unless defined $val;               # undef entries are ignored
        if (!defined $min) {
            # First defined element seeds both extremes.
            ($min, $max) = ($val, $val);
        } elsif ($val < $min) {
            $min = $val;
        } elsif ($val > $max) {
            $max = $val;
        }
    }
    return ($min, $max);
}
1156 | |||||||
1157 | # Meant only for constructing a deep copy of an array of arrays: | ||||||
# Meant only for constructing a deep copy of an array of arrays: every scalar
# element is copied so mutations of the result never touch the original.
# Note: rows with no elements remain undef in the copy, matching the
# element-by-element assignment semantics.
sub deep_copy_AoA {
    my $source = shift;
    my $copy;
    for my $r (0 .. $#{$source}) {
        for my $c (0 .. $#{$source->[$r]}) {
            $copy->[$r][$c] = $source->[$r][$c];
        }
    }
    return $copy;
}
1168 | |||||||
1169 | # For displaying the individual clusters on a terminal screen. Each cluster is | ||||||
1170 | # displayed through the symbolic names associated with the data points. | ||||||
# For displaying the individual clusters on a terminal screen.  Each cluster
# is shown through the symbolic names associated with its data points.  Side
# effect: every cluster array is string-sorted in place.
sub display_clusters {
    my $clusters_ref = shift;
    my $cluster_index = 0;
    for my $cluster (@$clusters_ref) {
        @$cluster = sort @$cluster;                 # lexicographic, in place
        my $how_many_records = scalar @$cluster;
        print "\n\nCluster $cluster_index ($how_many_records records):\n";
        print " $_" for @$cluster;
        $cluster_index++;
    }
    print "\n\n";
}
1185 | |||||||
# Prints a labeled dump of a cluster's mean vector followed by its covariance
# matrix, delegating the matrix formatting to display_matrix().
sub display_mean_and_covariance {
    my ($mean, $covariance) = @_;
    # Mean first, then covariance, each under its own heading.
    for my $labeled (["mean", $mean], ["covariance", $covariance]) {
        my ($label, $matrix) = @$labeled;
        print "\nDisplay the $label:\n";
        display_matrix($matrix);
    }
}
1194 | |||||||
# Verifies that every keyword argument passed to the constructor is one of the
# legal parameter names.  Returns 1 when all names are legal (including the
# case of no arguments at all) and 0 as soon as an unrecognized name is seen.
sub check_for_illegal_params {
    my @params = @_;
    my @legal_params = qw / datafile
                            mask
                            K
                            P
                            terminal_output
                            cluster_search_multiplier
                            max_iterations
                            delta_reconstruction_error
                            visualize_each_iteration
                            show_hidden_in_3D_plots
                            make_png_for_each_iteration
                            debug
                          /;
    foreach my $param (@params) {
        # Bail out on the first illegal name.
        return 0 unless any { $param eq $_ } @legal_params;
    }
    # All names matched (or there were none).  Previously an empty @params
    # fell through and returned undef, which the caller's `== 0' test treated
    # as an illegal-parameter condition (with an uninitialized-value warning).
    return 1;
}
1223 | |||||||
# Pretty-prints a Math::GSL::Matrix row by row, four decimal places per entry.
sub display_matrix {
    my $matrix = shift;
    my $nrows = $matrix->rows();
    my $ncols = $matrix->cols();
    print "\nDisplaying a matrix of size $nrows rows and $ncols columns:\n";
    foreach my $i (0..$nrows-1) {
        # Plain loop instead of map-in-void-context for the printf side effect.
        printf("%.4f ", $_) for $matrix->row($i)->as_list;
        print "\n";
    }
    print "\n\n";
}
1238 | |||||||
# Returns a new Math::GSL::Matrix that is the transpose of the argument:
# row i of the input becomes column i of the result.
sub transpose {
    my $matrix = shift;
    my ($rows_in, $cols_in) = ($matrix->rows(), $matrix->cols());
    my $flipped = Math::GSL::Matrix->new($cols_in, $rows_in);
    for my $r (0 .. $rows_in - 1) {
        my @row_values = $matrix->row($r)->as_list;
        $flipped->set_col($r, \@row_values);
    }
    return $flipped;
}
1250 | |||||||
# Element-wise difference of two equal-length arrayrefs ($vec1 - $vec2),
# returned as a flat list.  Dies when the lengths differ.
sub vector_subtract {
    my ($vec1, $vec2) = @_;
    die "wrong data types for vector subtract calculation\n" if @$vec1 != @$vec2;
    return map { $vec1->[$_] - $vec2->[$_] } 0 .. $#{$vec1};
}
1261 | |||||||
1262 | # from perl docs: | ||||||
# In-place Fisher-Yates shuffle of an arrayref (the classic algorithm from
# the Perl FAQ): walk from the top down, swapping each slot with a random
# slot at or below it.
sub fisher_yates_shuffle {
    my $arr = shift;
    for (my $last = $#{$arr}; $last > 0; $last--) {
        my $pick = int rand($last + 1);
        ($arr->[$last], $arr->[$pick]) = ($arr->[$pick], $arr->[$last]);
    }
}
1271 | |||||||
1272 | ######################### Generating Synthetic Data for Manifold Clustering ########################## | ||||||
1273 | |||||||
1274 | ################################## Class DataGenerator ######################################## | ||||||
1275 | |||||||
1276 | ## The embedded class defined below is for generating synthetic data for | ||||||
1277 | ## experimenting with linear manifold clustering when the data resides on the | ||||||
1278 | ## surface of a sphere. See the script generate_data_on_a_sphere.pl in the | ||||||
1279 | ## `examples' directory for how to specify the number of clusters and the spread of | ||||||
1280 | ## each cluster in the data that is generated. | ||||||
1281 | |||||||
1282 | package DataGenerator; | ||||||
1283 | |||||||
1284 | use strict; | ||||||
1285 | use Carp; | ||||||
1286 | |||||||
# Constructor for the synthetic-data generator.  output_file and
# total_number_of_samples_needed are required; the remaining parameters get
# defaults.  Croaks on any misspelled keyword argument.
sub new {
    my ($class, %args) = @_;
    my @params = keys %args;
    croak "\nYou have used a wrong name for a keyword argument " .
          "--- perhaps a misspelling\n"
          if _check_for_illegal_params3(@params) == 0;
    bless {
        _output_file                    =>   $args{output_file}
                                             || croak("name for output_file required"),
        _total_number_of_samples_needed =>   $args{total_number_of_samples_needed}
                                             || croak("total_number_of_samples_needed required"),
        _number_of_clusters_on_sphere   =>   $args{number_of_clusters_on_sphere} || 3,
        # defined-or (//) so an explicit 0 --- a documented, meaningful value
        # for these two parameters (see show_hidden_in_3D_plots => 0 in the
        # SYNOPSIS) --- is no longer silently replaced by the default, as it
        # was with ||:
        _cluster_width                  =>   $args{cluster_width} // 0.1,
        _show_hidden_in_3D_plots        =>   $args{show_hidden_in_3D_plots} // 1,
        _debug                          =>   $args{debug} || 0,
    }, $class;
}
1304 | |||||||
# Validates the keyword arguments accepted by DataGenerator->new().  Returns
# 1 when every name is legal (including the no-arguments case) and 0 on the
# first unrecognized name.
sub _check_for_illegal_params3 {
    my @params = @_;
    my @legal_params = qw / output_file
                            total_number_of_samples_needed
                            number_of_clusters_on_sphere
                            cluster_width
                            show_hidden_in_3D_plots
                          /;
    foreach my $param (@params) {
        # Bail out on the first illegal name.
        return 0 unless any { $param eq $_ } @legal_params;
    }
    # All names matched (or there were none).  Previously an empty @params
    # returned undef, which the caller's `== 0' comparison treated as an
    # error (with an uninitialized-value warning).
    return 1;
}
1326 | |||||||
1327 | ## We first generate a set of points randomly on the unit sphere --- the number of | ||||||
1328 | ## points being equal to the number of clusters desired. These points will serve as | ||||||
1329 | ## cluster means (or, as cluster centroids) subsequently when we ask | ||||||
1330 | ## Math::Random::random_multivariate_normal($N, @m, @covar) to return $N number of | ||||||
1331 | ## points on the sphere. The second argument is the cluster mean and the third | ||||||
1332 | ## argument the cluster covariance. For the synthetic data, we set the cluster | ||||||
1333 | ## covariance to a 2x2 diagonal matrix, with the (0,0) element corresponding to the | ||||||
1334 | ## variance along the azimuth direction and the (1,1) element corresponding to the | ||||||
1335 | ## variance along the elevation direction. | ||||||
1336 | ## | ||||||
1337 | ## When you generate the points in the 2D spherical coordinates of | ||||||
1338 | ## (azimuth,elevation), you also need `wrap-around' logic for those points yielded by | ||||||
1339 | ## the multivariate-normal function whose azimuth angle is outside the interval | ||||||
1340 | ## (0,360) and/or whose elevation angle is outside the interval (-90,90). | ||||||
1341 | ## | ||||||
1342 | ## Note that the first of the two dimensions for which the multivariate-normal | ||||||
1343 | ## function returns the points is for the azimuth angle and the second for the | ||||||
1344 | ## elevation angle. | ||||||
1345 | ## | ||||||
1346 | ## With regard to the relationship of the Cartesian coordinates to the spherical | ||||||
1347 | ## (azimuth, elevation) coordinates, we assume that (x,y) is the horizontal plane | ||||||
1348 | ## and z the vertical axis. The elevation angle theta is measure with respect to | ||||||
1349 | ## the XY-plane. The highest point on the sphere (the Zenith) corresponds to the | ||||||
1350 | ## elevation angle of +90 and the lowest points on the sphere (the Nadir) | ||||||
1351 | ## corresponds to the elevation angle of -90. The azimuth is measured with respect | ||||||
1352 | ## X-axis. The range of the azimuth is from 0 to 360 degrees. The elevation is | ||||||
1353 | ## measured from the XY plane and its range is (-90,90) degrees. | ||||||
# Generates the synthetic data set --- $K Gaussian clusters of points on the
# unit sphere --- and writes it out as CSV records of the form
# label,x,y,z to the file named in the constructor.  Cluster centers are
# drawn uniformly at random in (azimuth, elevation) space; per-cluster
# samples come from a 2D Gaussian whose diagonal covariance is scaled by the
# _cluster_width constructor parameter.
sub gen_data_and_write_to_csv {
    my $self = shift;
    my $K = $self->{_number_of_clusters_on_sphere};
    # $N is the number of samples to be generated for each cluster:
    my $N = int($self->{_total_number_of_samples_needed} / $K);
    my $output_file = $self->{_output_file};
    # Designated all of the data elements in a cluster by a letter that is followed by
    # an integer that identifies a specific data element.
    my @point_labels = ('a'..'z');
    # Our first job is to define $K random points in the 2D space (azimuth,
    # elevation) to serve as cluster centers on the surface of the sphere. This we
    # do by calling a uniformly distributed 1-D random number generator, first for
    # the azimuth and then for the elevation in the loop shown below:
    my @cluster_centers;
    my @covariances;
    foreach my $i (0..$K-1) {
        my $azimuth = rand(360);
        # NOTE(review): rand(90) - 90 yields elevations only in (-90,0), i.e.
        # the southern hemisphere, although the documented elevation range is
        # (-90,90).  Confirm whether rand(180) - 90 was intended.
        my $elevation = rand(90) - 90;
        my @mean = ($azimuth, $elevation);
        push @cluster_centers, \@mean;
        my $cluster_covariance;
        # The j-th dimension is for azimuth and k-th for elevation for the directions
        # to surface of the sphere:
        foreach my $j (0..1) {
            foreach my $k (0..1) {
                # Diagonal covariance: the azimuth variance is scaled against
                # the full 360-degree range and the elevation variance against
                # the 180-degree range; off-diagonal terms are zero.
                $cluster_covariance->[$j]->[$k] = ($self->{_cluster_width} * 360.0) ** 2
                                  if $j == 0 && $k == 0;
                $cluster_covariance->[$j]->[$k] = ($self->{_cluster_width} * 180.0) ** 2
                                  if $j == 1 && $k == 1;
                $cluster_covariance->[$j]->[$k] = 0.0 if $j != $k;
            }
        }
        push @covariances, $cluster_covariance;
    }
    if ($self->{_debug}) {
        foreach my $i (0..$K-1) {
            print "\n\nCluster center: @{$cluster_centers[$i]}\n";
            print "\nCovariance:\n";
            foreach my $j (0..1) {
                foreach my $k (0..1) {
                    print "$covariances[$i]->[$j]->[$k] ";
                }
                print "\n";
            }
        }
    }
    my @data_dump;
    foreach my $i (0..$K-1) {
        my @m = @{shift @cluster_centers};
        my @covar = @{shift @covariances};
        # Draw $N (azimuth, elevation) samples from this cluster's 2D Gaussian:
        my @new_data = Math::Random::random_multivariate_normal($N, @m, @covar);
        if ($self->{_debug}) {
            print "\nThe points for cluster $i:\n";
            map { print "@$_ "; } @new_data;
            print "\n\n";
        }
        # Wrap-around logic: fold azimuths back into [0,360) and reflect
        # elevations that spill past a pole back into (-90,90), rotating the
        # azimuth by 180 degrees when a pole is crossed.
        my @wrapped_data;
        foreach my $d (@new_data) {
            my $wrapped_d;
            if ($d->[0] >= 360.0) {
                $wrapped_d->[0] = $d->[0] - 360.0;
            } elsif ($d->[0] < 0) {
                $wrapped_d->[0] = 360.0 - abs($d->[0]);
            }
            if ($d->[1] >= 90.0) {
                $wrapped_d->[0] = POSIX::fmod($d->[0] + 180.0, 360);
                $wrapped_d->[1] = 180.0 - $d->[1];
            } elsif ($d->[1] < -90.0) {
                $wrapped_d->[0] = POSIX::fmod($d->[0] + 180, 360);
                $wrapped_d->[1] = -180.0 - $d->[1];
            }
            # Coordinates that needed no wrapping pass through unchanged:
            $wrapped_d->[0] = $d->[0] unless defined $wrapped_d->[0];
            $wrapped_d->[1] = $d->[1] unless defined $wrapped_d->[1];
            push @wrapped_data, $wrapped_d;
        }
        if ($self->{_debug}) {
            print "\nThe unwrapped points for cluster $i:\n";
            map { print "@$_ "; } @wrapped_data;
            print "\n\n";
        }
        # Tag each point with a label like `a_0', `a_1', ... for cluster `a':
        my $label = $point_labels[$i];
        my $j = 0;
        @new_data = map {unshift @$_, $label."_".$j; $j++; $_} @wrapped_data;
        push @data_dump, @new_data;
    }
    if ($self->{_debug}) {
        print "\n\nThe labeled points for clusters:\n";
        map { print "@$_\n"; } @data_dump;
    }
    # Shuffle so the output file does not list the clusters in order:
    fisher_yates_shuffle( \@data_dump );
    # NOTE(review): bareword handle plus 2-arg open with no error check; a
    # checked 3-arg open with a lexical handle would be safer.
    open OUTPUT, ">$output_file";
    my $total_num_of_points = $N * $K;
    print "Total number of data points that will be written out to the file: $total_num_of_points\n"
        if $self->{_debug};
    foreach my $ele (@data_dump) {
        my ($x,$y,$z);
        my $label = $ele->[0];
        my $azimuth = $ele->[1];
        my $elevation = $ele->[2];
        # NOTE(review): $azimuth and $elevation are in degrees at this point,
        # but Perl's cos()/sin() take radians --- confirm whether the missing
        # degree-to-radian conversion is intentional.
        $x = cos($elevation) * cos($azimuth);
        $y = cos($elevation) * sin($azimuth);
        $z = sin($elevation);
        my $csv_str = join ",", ($label,$x,$y,$z);
        print OUTPUT "$csv_str\n";
    }
    print "\n\n";
    print "Data written out to file $output_file\n" if $self->{_debug};
    close OUTPUT;
}
1463 | |||||||
1464 | # This version for the embedded class for data generation | ||||||
# This version for the embedded class for data generation.
# Reads the CSV data file produced by gen_data_and_write_to_csv(), groups the
# points by their letter cluster label, writes them to a gnuplot-friendly
# temp file (one double-blank-line-separated block per cluster), and displays
# them on a unit sphere drawn with gnuplot parametric surfaces.
sub visualize_data_on_sphere {
    my $self = shift;
    my $datafile = shift;
    my $filename = File::Basename::basename($datafile);
    my $temp_file = "__temp_" . $filename;
    $temp_file =~ s/\.\w+$/\.txt/;
    unlink $temp_file if -e $temp_file;
    # 3-arg opens with lexical handles, both checked:
    open my $OUT, '>', $temp_file
        or die "Unable to open a temp file in this directory: $!";
    # Open the caller-supplied path rather than just its basename, so the
    # data file is found even when it lives in another directory:
    open my $IN, '<', $datafile or die "Unable to open $datafile: $!";
    local $/ = undef;
    # Slurp the whole file and split it into one CSV record per line:
    my @all_records = split /\s+/, <$IN>;
    close $IN;
    my %clusters;
    foreach my $record (@all_records) {
        my @splits = split /,/, $record;
        my $record_name = shift @splits;
        # The letter prefix of a record name (e.g. `a' in `a_23') identifies
        # the cluster the point belongs to:
        $record_name =~ /(\w+?)_.*/;
        my $primary_cluster_label = $1;
        push @{$clusters{$primary_cluster_label}}, \@splits;
    }
    foreach my $key (sort {"\L$a" cmp "\L$b"} keys %clusters) {
        print {$OUT} "@$_\n" for @{$clusters{$key}};
        print {$OUT} "\n\n";
    }
    my @sorted_cluster_keys = sort {"\L$a" cmp "\L$b"} keys %clusters;
    close $OUT;
    my $plot = Graphics::GnuplotIF->new( persist => 1 );
    my $arg_string = "";
    $plot->gnuplot_cmd( "set noclip" );
    $plot->gnuplot_cmd( "set hidden3d" ) unless $self->{_show_hidden_in_3D_plots};
    $plot->gnuplot_cmd( "set pointsize 2" );
    $plot->gnuplot_cmd( "set parametric" );
    $plot->gnuplot_cmd( "set size ratio 1" );
    $plot->gnuplot_cmd( "set xlabel \"X\"" );
    $plot->gnuplot_cmd( "set ylabel \"Y\"" );
    $plot->gnuplot_cmd( "set zlabel \"Z\"" );
    # set the range for azimuth angles:
    $plot->gnuplot_cmd( "set urange [0:2*pi]" );
    # set the range for the elevation angles:
    $plot->gnuplot_cmd( "set vrange [-pi/2:pi/2]" );
    # Parametric functions for the sphere
    $plot->gnuplot_cmd( "r=1" );
    $plot->gnuplot_cmd( "fx(v,u) = r*cos(v)*cos(u)" );
    $plot->gnuplot_cmd( "fy(v,u) = r*cos(v)*sin(u)" );
    $plot->gnuplot_cmd( "fz(v) = r*sin(v)" );
    my $sphere_arg_str = "fx(v,u),fy(v,u),fz(v) notitle with lines lt 0,";
    foreach my $i (0..scalar(keys %clusters)-1) {
        my $j = $i + 1;
        # The following statement puts the titles on the data points
        $arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"$sorted_cluster_keys[$i] \" with points lt $j pt $j, ";
    }
    # Trim the trailing ", " left behind by the loop above:
    $arg_string =~ s/,[ ]+$//;
    $plot->gnuplot_cmd( "splot $sphere_arg_str $arg_string" );
}
1521 | |||||||
# Destructor: removes the "__temp_*.txt" scratch file that
# visualize_data_on_sphere() derives from the output filename.
sub DESTROY {
    use File::Basename;
    my $base = basename($_[0]->{_output_file});
    $base =~ s/\.\w+$/\.txt/;
    unlink "__temp_" . $base;
}
1528 | |||||||
1529 | # from perl docs: | ||||||
# In-place random permutation of an arrayref (Fisher-Yates, as given in the
# Perl FAQ): each position from the top down is exchanged with a uniformly
# chosen position at or below it.
sub fisher_yates_shuffle {
    my $deck = shift;
    my $top = scalar @$deck;
    while (--$top) {
        my $swap_with = int rand($top + 1);
        @$deck[$top, $swap_with] = @$deck[$swap_with, $top];
    }
}
1538 | |||||||
1539 | 1; | ||||||
1540 | |||||||
1541 | =pod | ||||||
1542 | |||||||
1543 | =head1 NAME | ||||||
1544 | |||||||
1545 | Algorithm::LinearManifoldDataClusterer --- for clustering data that resides on a | ||||||
1546 | low-dimensional manifold in a high-dimensional measurement space | ||||||
1547 | |||||||
1548 | =head1 SYNOPSIS | ||||||
1549 | |||||||
1550 | # You'd begin with: | ||||||
1551 | |||||||
1552 | use Algorithm::LinearManifoldDataClusterer; | ||||||
1553 | |||||||
1554 | # You'd next name the data file: | ||||||
1555 | |||||||
1556 | my $datafile = "mydatafile.csv"; | ||||||
1557 | |||||||
1558 | # Your data must be in the CSV format, with one of the columns containing a unique | ||||||
1559 | # symbolic tag for each data record. You tell the module which column has the | ||||||
1560 | # symbolic tag and which columns to use for clustering through a mask such as | ||||||
1561 | |||||||
1562 | my $mask = "N111"; | ||||||
1563 | |||||||
1564 | # which says that the symbolic tag is in the first column and that the numerical | ||||||
1565 | # data in the next three columns is to be used for clustering. If your data file | ||||||
1566 | # had, say, five columns and you wanted only the last three columns to be | ||||||
1567 | # clustered, the mask would become `N0111' assuming that that the symbolic tag is | ||||||
1568 | # still in the first column. | ||||||
1569 | |||||||
1570 | # Now you must construct an instance of the clusterer through a call such as: | ||||||
1571 | |||||||
1572 | my $clusterer = Algorithm::LinearManifoldDataClusterer->new( | ||||||
1573 | datafile => $datafile, | ||||||
1574 | mask => $mask, | ||||||
1575 | K => 3, | ||||||
1576 | P => 2, | ||||||
1577 | max_iterations => 15, | ||||||
1578 | cluster_search_multiplier => 2, | ||||||
1579 | delta_reconstruction_error => 0.001, | ||||||
1580 | terminal_output => 1, | ||||||
1581 | visualize_each_iteration => 1, | ||||||
1582 | show_hidden_in_3D_plots => 1, | ||||||
1583 | make_png_for_each_iteration => 1, | ||||||
1584 | ); | ||||||
1585 | |||||||
1586 | # where the parameter K specifies the number of clusters you expect to find in | ||||||
1587 | # your data and the parameter P is the dimensionality of the manifold on which the | ||||||
1588 | # data resides. The parameter cluster_search_multiplier is for increasing the | ||||||
1589 | # odds that the random seeds chosen initially for clustering will populate all the | ||||||
1590 | # clusters. Set this parameter to a low number like 2 or 3. The parameter | ||||||
1591 | # max_iterations places a hard limit on the number of iterations that the | ||||||
1592 | # algorithm is allowed. The actual number of iterations is controlled by the | ||||||
1593 | # parameter delta_reconstruction_error. The iterations stop when the change in | ||||||
1594 | # the total "reconstruction error" from one iteration to the next is smaller than | ||||||
1595 | # the value specified by delta_reconstruction_error. | ||||||
1596 | |||||||
1597 | # Next, you must get the module to read the data for clustering: | ||||||
1598 | |||||||
1599 | $clusterer->get_data_from_csv(); | ||||||
1600 | |||||||
1601 | # Finally, you invoke linear manifold clustering by: | ||||||
1602 | |||||||
1603 | my $clusters = $clusterer->linear_manifold_clusterer(); | ||||||
1604 | |||||||
1605 | # The value returned by this call is a reference to an array of anonymous arrays, | ||||||
1606 | # with each anonymous array holding one cluster. If you wish, you can have the | ||||||
1607 | # module write the clusters to individual files by the following call: | ||||||
1608 | |||||||
1609 | $clusterer->write_clusters_to_files($clusters); | ||||||
1610 | |||||||
1611 | # If you want to see how the reconstruction error changes with the iterations, you | ||||||
1612 | # can make the call: | ||||||
1613 | |||||||
1614 | $clusterer->display_reconstruction_errors_as_a_function_of_iterations(); | ||||||
1615 | |||||||
1616 | # When your data is 3-dimensional and when the clusters reside on a surface that | ||||||
1617 | # is more or less spherical, you can visualize the clusters by calling | ||||||
1618 | |||||||
1619 | $clusterer->visualize_clusters_on_sphere("final clustering", $clusters); | ||||||
1620 | |||||||
1621 | # where the first argument is a label to be displayed in the 3D plot and the | ||||||
1622 | # second argument the value returned by calling linear_manifold_clusterer(). | ||||||
1623 | |||||||
1624 | # SYNTHETIC DATA GENERATION: | ||||||
1625 | |||||||
1626 | # The module includes an embedded class, DataGenerator, for generating synthetic | ||||||
1627 | # three-dimensional data that can be used to experiment with the clustering code. | ||||||
1628 | # The synthetic data, written out to a CSV file, consists of Gaussian clusters on | ||||||
1629 | # the surface of a sphere. You can control the number of clusters, the width of | ||||||
1630 | # each cluster, and the number of samples in the clusters by giving appropriate | ||||||
1631 | # values to the constructor parameters as shown below: | ||||||
1632 | |||||||
1633 | use strict; | ||||||
1634 | use Algorithm::LinearManifoldDataClusterer; | ||||||
1635 | |||||||
1636 | my $output_file = "4_clusters_on_a_sphere_1000_samples.csv"; | ||||||
1637 | |||||||
1638 | my $training_data_gen = DataGenerator->new( | ||||||
1639 | output_file => $output_file, | ||||||
1640 | cluster_width => 0.015, | ||||||
1641 | total_number_of_samples_needed => 1000, | ||||||
1642 | number_of_clusters_on_sphere => 4, | ||||||
1643 | show_hidden_in_3D_plots => 0, | ||||||
1644 | ); | ||||||
1645 | $training_data_gen->gen_data_and_write_to_csv(); | ||||||
1646 | $training_data_gen->visualize_data_on_sphere($output_file); | ||||||
1647 | |||||||
1648 | |||||||
1649 | =head1 DESCRIPTION | ||||||
1650 | |||||||
1651 | If you are new to machine learning and data clustering on linear and nonlinear | ||||||
1652 | manifolds, your first question is likely to be: What is a manifold? A manifold is a | ||||||
1653 | space that is locally Euclidean. And a space is locally Euclidean if it allows for | ||||||
1654 | the points in a small neighborhood to be represented by Cartesian coordinates and if | ||||||
1655 | the distances between the points in the neighborhood are given by the Euclidean | ||||||
1656 | metric. For an example, the set of all points on the surface of a sphere does NOT | ||||||
1657 | constitute a Euclidean space. Nonetheless, if you confined your attention to a small | ||||||
1658 | enough neighborhood around a point, the space would seem to be locally Euclidean. | ||||||
1659 | The surface of a sphere is a 2-dimensional manifold embedded in a 3-dimensional | ||||||
1660 | space. A plane in a 3-dimensional space is also a 2-dimensional manifold. You would | ||||||
1661 | think of the surface of a sphere as a nonlinear manifold, whereas a plane would be a | ||||||
1662 | linear manifold. However, note that any nonlinear manifold is locally a linear | ||||||
1663 | manifold. That is, given a sufficiently small neighborhood on a nonlinear manifold, | ||||||
1664 | you can always think of it as a locally flat surface. | ||||||
1665 | |||||||
1666 | As to why we need machine learning and data clustering on manifolds, there exist many | ||||||
1667 | important applications in which the measured data resides on a nonlinear manifold. | ||||||
1668 | For example, when you record images of a human face from different angles, all the | ||||||
1669 | image pixels taken together fall on a low-dimensional surface in a high-dimensional | ||||||
1670 | measurement space. The same is believed to be true for the satellite images of a land | ||||||
1671 | mass that are recorded with the sun at different angles with respect to the direction | ||||||
1672 | of the camera. | ||||||
1673 | |||||||
1674 | Reducing the dimensionality of the sort of data mentioned above is critical to the | ||||||
1675 | proper functioning of downstream classification algorithms, and the most popular | ||||||
1676 | traditional method for dimensionality reduction is the Principal Components Analysis | ||||||
1677 | (PCA) algorithm. However, using PCA is tantamount to passing a linear least-squares | ||||||
1678 | hyperplane through the surface on which the data actually resides. As to why that | ||||||
1679 | might be a bad thing to do, just imagine the consequences of assuming that your data | ||||||
1680 | falls on a straight line when, in reality, it falls on a strongly curving arc. This | ||||||
1681 | is exactly what happens with PCA --- it gives you a linear manifold approximation to | ||||||
1682 | your data that may actually reside on a curved surface. | ||||||
1683 | |||||||
1684 | That brings us to the purpose of this module, which is to cluster data that resides | ||||||
1685 | on a nonlinear manifold. Since a nonlinear manifold is locally linear, we can think | ||||||
1686 | of each data cluster on a nonlinear manifold as falling on a locally linear portion | ||||||
1687 | of the manifold, meaning on a hyperplane. The logic of the module is based on | ||||||
1688 | finding a set of hyperplanes that best describes the data, with each hyperplane | ||||||
1689 | derived from a local data cluster. This is like constructing a piecewise linear | ||||||
1690 | approximation to data that falls on a curve as opposed to constructing a single | ||||||
1691 | straight line approximation to all of the data. So whereas the frequently used PCA | ||||||
1692 | algorithm gives you a single hyperplane approximation to all your data, what this | ||||||
1693 | module returns is a set of hyperplane approximations, with each hyperplane derived by | ||||||
1694 | applying the PCA algorithm locally to a data cluster. | ||||||
1695 | |||||||
1696 | That brings us to the problem of how to actually discover the best set of hyperplane | ||||||
1697 | approximations to the data. What is probably the most popular algorithm today for | ||||||
1698 | that purpose is based on the following key idea: Given a set of subspaces to which a | ||||||
1699 | data element can be assigned, you assign it to that subspace for which the | ||||||
B<reconstruction error> for that data element
is the B<least>.
1702 | |||||||
1703 | To understand the notions of B |
||||||
1704 | the traditional approach of dimensionality reduction by the PCA algorithm. The PCA | ||||||
1705 | algorithm consists of: (1) Subtracting from each data element the global mean of the | ||||||
1706 | data; (2) Calculating the covariance matrix of the data; (3) Carrying out an | ||||||
1707 | eigendecomposition of the covariance matrix and ordering the eigenvectors according | ||||||
to decreasing values of the corresponding eigenvalues; (4) Forming a B<subspace> by
1709 | discarding the trailing eigenvectors whose corresponding eigenvalues are relatively | ||||||
1710 | small; and, finally, (5) projecting all the data elements into the subspace so | ||||||
1711 | formed. The error incurred in representing a data element by its projection into the | ||||||
subspace is known as the B<reconstruction error>.  This error is the projection of
1713 | the data element into the space spanned by the discarded trailing eigenvectors. | ||||||
1714 | |||||||
1715 | I | ||||||
1716 | subspace in the manner described above, we construct a set of subspaces, one for each | ||||||
1717 | data cluster on the nonlinear manifold. After the subspaces have been constructed, a | ||||||
1718 | data element is assigned to that subspace for which the reconstruction error is the | ||||||
1719 | least.> On the face of it, this sounds like a chicken-and-egg sort of a problem. You | ||||||
1720 | need to have already clustered the data in order to construct the subspaces at | ||||||
1721 | different places on the manifold so that you can figure out which cluster to place a | ||||||
1722 | data element in. | ||||||
1723 | |||||||
1724 | Such problems, when they do possess a solution, are best tackled through iterative | ||||||
1725 | algorithms in which you start with a guess for the final solution, you rearrange the | ||||||
1726 | measured data on the basis of the guess, and you then use the new arrangement of the | ||||||
1727 | data to refine the guess. Subsequently, you iterate through the second and the third | ||||||
1728 | steps until you do not see any discernible changes in the new arrangements of the | ||||||
1729 | data. This forms the basis of the clustering algorithm that is described under | ||||||
1730 | B |
||||||
1731 | article "Dimension Reduction by Local Principal Component Analysis" by Kambhatla and | ||||||
1732 | Leen that appeared in the journal Neural Computation in 1997. | ||||||
1733 | |||||||
1734 | Unfortunately, experiments show that the algorithm as proposed by Kambhatla and Leen | ||||||
1735 | is much too sensitive to how the clusters are seeded initially. To get around this | ||||||
1736 | limitation of the basic clustering-by-minimization-of-reconstruction-error, this | ||||||
1737 | module implements a two phased approach. In B |
||||||
1738 | effect in our search for clusters by looking for C |
||||||
1739 | C |
||||||
1740 | be visited by one or more of the C |
||||||
1741 | where C |
||||||
1742 | C |
||||||
1743 | together in order to form the final C |
||||||
1744 | of the algorithm. | ||||||
1745 | |||||||
1746 | For the cluster merging operation in Phase 2, we model the C |
||||||
1747 | nodes of an attributed graph in which the weight given to an edge connecting a pair | ||||||
1748 | of nodes is a measure of the similarity between the two clusters corresponding to the | ||||||
1749 | two nodes. Subsequently, we use spectral clustering to merge the most similar nodes | ||||||
1750 | in our quest to partition the data into C |
||||||
1751 | Shi-Malik normalized cuts algorithm. The pairwise node similarity required by this | ||||||
1752 | algorithm is measured by the C |
||||||
1753 | C |
||||||
1754 | when all of the data elements in one cluster are projected into the other's subspace | ||||||
1755 | and vice versa, the greater the similarity between two clusters. Additionally, the | ||||||
1756 | smaller the distance between the mean vectors of the clusters, the greater the | ||||||
1757 | similarity between two clusters. The overall similarity between a pair of clusters | ||||||
1758 | is a combination of these two similarity measures. | ||||||
1759 | |||||||
1760 | =head1 SUMMARY OF THE ALGORITHM | ||||||
1761 | |||||||
1762 | We now present a summary of the two phases of the algorithm implemented in this | ||||||
1763 | module. Note particularly the important role played by the constructor parameter | ||||||
1764 | C |
||||||
1765 | parameter is greater than 1 that Phase 2 of the algorithm kicks in. | ||||||
1766 | |||||||
1767 | =over 4 | ||||||
1768 | |||||||
1769 | =item B |
||||||
1770 | |||||||
1771 | Through iterative minimization of the total reconstruction error, this phase of the | ||||||
1772 | algorithm returns C |
||||||
1773 | expect to find in your data and where C |
||||||
1774 | constructor parameter C |
||||||
1775 | account of the sensitivity of the reconstruction-error based clustering to how the | ||||||
1776 | clusters are initially seeded, our goal is to look for C |
||||||
1777 | of increasing the odds that each of the C |
||||||
1778 | the beginning of the algorithm. | ||||||
1779 | |||||||
1780 | =over 4 | ||||||
1781 | |||||||
1782 | =item Step 1: | ||||||
1783 | |||||||
1784 | Randomly choose C |
||||||
1785 | |||||||
1786 | =item Step 2: | ||||||
1787 | |||||||
1788 | Construct initial C |
||||||
1789 | whose seed it is closest to. | ||||||
1790 | |||||||
1791 | =item Step 3: | ||||||
1792 | |||||||
1793 | Calculate the mean and the covariance matrix for each of the C |
||||||
1794 | carry out an eigendecomposition of the covariance matrix. Order the eigenvectors in | ||||||
1795 | decreasing order of the corresponding eigenvalues. The first C eigenvectors |
||||||
1796 | define the subspace for that cluster. Use the space spanned by the remaining | ||||||
1797 | eigenvectors --- we refer to them as the trailing eigenvectors --- for calculating | ||||||
1798 | the reconstruction error. | ||||||
1799 | |||||||
1800 | =item Step 4 | ||||||
1801 | |||||||
1802 | Taking into account the mean associated with each cluster, re-cluster the entire data | ||||||
1803 | set on the basis of the least reconstruction error. That is, assign each data | ||||||
1804 | element to that subspace for which it has the smallest reconstruction error. | ||||||
1805 | Calculate the total reconstruction error associated with all the data elements. (See | ||||||
1806 | the definition of the reconstruction error in the C |
||||||
1807 | |||||||
1808 | =item Step 5 | ||||||
1809 | |||||||
1810 | Stop iterating if the change in the total reconstruction error from the previous | ||||||
1811 | iteration to the current iteration is less than the value specified by the constructor | ||||||
1812 | parameter C |
||||||
1813 | |||||||
1814 | =back | ||||||
1815 | |||||||
1816 | =item B |
||||||
1817 | |||||||
1818 | This phase of the algorithm uses graph partitioning to merge the C |
||||||
1819 | into the C |
||||||
1820 | steps are presented below is invoked recursively, let's assume that we have C |
||||||
1821 | clusters that need to be merged by invoking the Shi-Malik spectral clustering | ||||||
1822 | algorithm as described below: | ||||||
1823 | |||||||
1824 | =over 4 | ||||||
1825 | |||||||
1826 | =item Step 1 | ||||||
1827 | |||||||
1828 | Form a graph whose C |
||||||
1829 | invocation of this step, we have C |
||||||
1830 | |||||||
1831 | =item Step 2 | ||||||
1832 | |||||||
1833 | Construct an C |
||||||
1834 | element of this matrix is the pairwise similarity between the clusters indexed C | ||||||
1835 | and C |
||||||
1836 | The total reconstruction error when the data elements in the cluster indexed C |
||||||
1837 | projected into the subspace for the cluster indexed C and vice versa. And (2) The | ||||||
1838 | distance between the mean vectors associated with the two clusters. | ||||||
1839 | |||||||
1840 | =item Step 3 | ||||||
1841 | |||||||
1842 | Calculate the symmetric normalized Laplacian of the similarity matrix. We use C | ||||||
1843 | to denote the symmetric normalized Laplacian. | ||||||
1844 | |||||||
1845 | =item Step 4 | ||||||
1846 | |||||||
1847 | Carry out an eigendecomposition of the C matrix and choose the eigenvector | ||||||
1848 | corresponding to the second smallest eigenvalue for bipartitioning the graph on the | ||||||
1849 | basis of the sign of the values in the eigenvector. | ||||||
1850 | |||||||
1851 | =item Step 5 | ||||||
1852 | |||||||
1853 | If the bipartition of the previous step yields one-versus-the-rest kind of a | ||||||
1854 | partition, add the `one' cluster to the output list of clusters and invoke graph | ||||||
1855 | partitioning recursively on the `rest' by going back to Step 1. On the other hand, | ||||||
1856 | if the cardinality of both the partitions returned by Step 4 exceeds 1, invoke graph | ||||||
1857 | partitioning recursively on both partitions. Stop when the list of clusters in the | ||||||
1858 | output list equals C |
||||||
1859 | |||||||
1860 | =item Step 6 | ||||||
1861 | |||||||
1862 | In general, the C |
||||||
1863 | all of the data. So, for the final step, assign each data element not covered by the | ||||||
1864 | C |
||||||
1865 | |||||||
1866 | =back | ||||||
1867 | |||||||
1868 | =back | ||||||
1869 | |||||||
1870 | =head1 FAIL-FIRST BIAS OF THE MODULE | ||||||
1871 | |||||||
1872 | As you would expect from all such iterative algorithms, the module carries no | ||||||
1873 | theoretical guarantee that it will give you correct results. But what does that mean? | ||||||
1874 | Suppose you create synthetic data that consists of Gaussian looking disjoint clusters | ||||||
1875 | on the surface of a sphere, would the module always succeed in separating out the | ||||||
1876 | clusters? The module carries no guarantees to that effect --- especially considering | ||||||
1877 | that Phase 1 of the algorithm is sensitive to how the clusters are seeded at the | ||||||
1878 | beginning. Although this sensitivity is mitigated by the cluster merging step when | ||||||
1879 | greater-than-1 value is given to the constructor option C |
||||||
1880 | a plain vanilla implementation of the steps in Phase 1 and Phase 2 would nonetheless | ||||||
1881 | carry significant risk that you'll end up with incorrect clustering results. | ||||||
1882 | |||||||
1883 | To further reduce this risk, the module has been programmed so that it terminates | ||||||
1884 | immediately if it suspects that the cluster solution being developed is unlikely to | ||||||
1885 | be fruitful. The heuristics used for such termination are conservative --- since the | ||||||
1886 | cost of termination is only that the user will have to run the code again, which at | ||||||
1887 | worst only carries an element of annoyance with it. The three "Fail First" | ||||||
1888 | heuristics currently programmed into the module are based on simple "unimodality | ||||||
1889 | testing", testing for "congruent clusters," and testing for dominant cluster support | ||||||
1890 | in the final stage of the recursive invocation of the graph partitioning step. The | ||||||
1891 | unimodality testing is as elementary as it can be --- it only checks for the number | ||||||
1892 | of data samples within a certain radius of the mean in relation to the total number | ||||||
1893 | of data samples in the cluster. | ||||||
1894 | |||||||
1895 | When the program terminates under such conditions, it prints out the following message | ||||||
1896 | in your terminal window: | ||||||
1897 | |||||||
1898 | Bailing out! | ||||||
1899 | |||||||
1900 | Given the very simple nature of testing that is carried out for the "Fail First" bias, do | ||||||
1901 | not be surprised if the results you get for your data simply look wrong. If most | ||||||
1902 | runs of the module produce wrong results for your application, that means that the | ||||||
1903 | module logic needs to be strengthened further. The author of this module would love | ||||||
1904 | to hear from you if that is the case. | ||||||
1905 | |||||||
1906 | =head1 METHODS | ||||||
1907 | |||||||
1908 | The module provides the following methods for linear-manifold based clustering, for | ||||||
1909 | cluster visualization, and for the generation of data for testing the clustering algorithm: | ||||||
1910 | |||||||
1911 | =over 4 | ||||||
1912 | |||||||
1913 | =item B |
||||||
1914 | |||||||
1915 | my $clusterer = Algorithm::LinearManifoldDataClusterer->new( | ||||||
1916 | datafile => $datafile, | ||||||
1917 | mask => $mask, | ||||||
1918 | K => $K, | ||||||
1919 | P => $P, | ||||||
1920 | cluster_search_multiplier => $C, | ||||||
1921 | max_iterations => $max_iter, | ||||||
1922 | delta_reconstruction_error => 0.001, | ||||||
1923 | terminal_output => 1, | ||||||
1924 | write_clusters_to_files => 1, | ||||||
1925 | visualize_each_iteration => 1, | ||||||
1926 | show_hidden_in_3D_plots => 1, | ||||||
1927 | make_png_for_each_iteration => 1, | ||||||
1928 | ); | ||||||
1929 | |||||||
1930 | A call to C |
||||||
1931 | C |
||||||
1932 | |||||||
1933 | =back | ||||||
1934 | |||||||
1935 | =head2 Constructor Parameters | ||||||
1936 | |||||||
1937 | =over 8 | ||||||
1938 | |||||||
1939 | =item C |
||||||
1940 | |||||||
1941 | This parameter names the data file that contains the multidimensional data records | ||||||
1942 | you want the module to cluster. This file must be in CSV format and each record in | ||||||
1943 | the file must include a symbolic tag for the record. Here are first few rows of such | ||||||
1944 | a CSV file in the C |
||||||
1945 | |||||||
1946 | d_161,0.0739248630173239,0.231119293395665,-0.970112873251437 | ||||||
1947 | a_59,0.459932215884786,0.0110216469739639,0.887885623314902 | ||||||
1948 | a_225,0.440503220903039,-0.00543366086464691,0.897734586447273 | ||||||
1949 | a_203,0.441656364946433,0.0437191337788422,0.896118459046532 | ||||||
1950 | ... | ||||||
1951 | ... | ||||||
1952 | |||||||
1953 | What you see in the first column --- C |
||||||
1954 | the symbolic tags associated with four 3-dimensional data records. | ||||||
1955 | |||||||
1956 | =item C |
||||||
1957 | |||||||
1958 | This parameter supplies the mask to be applied to the columns of your data file. For | ||||||
1959 | the data file whose first few records are shown above, the mask should be C |
||||||
1960 | since the symbolic tag is in the first column of the CSV file and since, presumably, | ||||||
1961 | you want to cluster the data in the next three columns. | ||||||
1962 | |||||||
1963 | =item C |
||||||
1964 | |||||||
1965 | This parameter supplies the number of clusters you are looking for. | ||||||
1966 | |||||||
1967 | =item C : |
||||||
1968 | |||||||
1969 | This parameter specifies the dimensionality of the manifold on which the data resides. | ||||||
1970 | |||||||
1971 | =item C |
||||||
1972 | |||||||
1973 | As should be clear from the C |
||||||
1974 | a very important role in the successful clustering of your data. As explained in | ||||||
1975 | C |
||||||
1976 | the minimization of the reconstruction error --- is sensitive to the choice of the | ||||||
1977 | cluster seeds that are selected randomly at the beginning of the algorithm. Should | ||||||
1978 | it happen that the seeds miss one or more of the clusters, the clustering produced is | ||||||
1979 | likely to not be correct. By giving an integer value to C |
||||||
1980 | that is greater than 1, you'll increase the odds that the randomly selected seeds | ||||||
1981 | will see all clusters. When you set C |
||||||
1982 | Phase 1 of the algorithm to construct C |
||||||
1983 | clusters. Subsequently, in Phase 2, the module uses inter-cluster similarity based | ||||||
1984 | graph partitioning to merge the C |
||||||
1985 | |||||||
1986 | =item C |
||||||
1987 | |||||||
1988 | This hard limits the number of iterations in Phase 1 of the algorithm. Ordinarily, | ||||||
1989 | the iterations stop automatically when the change in the total reconstruction error | ||||||
1990 | from one iteration to the next is less than the value specified by the parameter | ||||||
1991 | C |
||||||
1992 | |||||||
1993 | =item C |
||||||
1994 | |||||||
1995 | It is this parameter that determines when the iterations will actually terminate in | ||||||
1996 | Phase 1 of the algorithm. When the difference in the total reconstruction error from | ||||||
1997 | one iteration to the next is less than the value given to this parameter, the | ||||||
1998 | iterations come to an end. B | ||||||
1999 | data samples that need to be clustered, the larger must be the value given to this | ||||||
2000 | parameter. That makes intuitive sense since the total reconstruction error is the | ||||||
2001 | sum of all such errors for all of the data elements.> Unfortunately, the best value | ||||||
2002 | for this parameter does NOT appear to depend linearly on the total number of data | ||||||
2003 | records to be clustered. | ||||||
2004 | |||||||
2005 | =item C |
||||||
2006 | |||||||
2007 | When this parameter is set, you will see in your terminal window the different | ||||||
2008 | clusters as lists of the symbolic tags associated with the data records. You will | ||||||
2009 | also see in your terminal window the output produced by the different steps of the | ||||||
2010 | graph partitioning algorithm as smaller clusters are merged to form the final C |
||||||
2011 | clusters --- assuming that you set the parameter C |
||||||
2012 | integer value that is greater than 1. | ||||||
2013 | |||||||
2014 | =item C |
||||||
2015 | |||||||
2016 | As its name implies, when this option is set to 1, you'll see 3D plots of the | ||||||
2017 | clustering results for each iteration --- but only if your data is 3-dimensional. | ||||||
2018 | |||||||
2019 | =item C |
||||||
2020 | |||||||
2021 | This parameter is important for controlling the visualization of the clusters on the | ||||||
2022 | surface of a sphere. If the clusters are too spread out, seeing all of the clusters | ||||||
2023 | all at once can be visually confusing. When you set this parameter, the clusters on | ||||||
2024 | the back side of the sphere will not be visible. Note that no matter how you set | ||||||
2025 | this parameter, you can interact with the 3D plot of the data and rotate it with your | ||||||
2026 | mouse pointer to see all of the data that is output by the clustering code. | ||||||
2027 | |||||||
2028 | =item C |
||||||
2029 | |||||||
2030 | If you set this option to 1, the module will output a Gnuplot in the form of a PNG | ||||||
2031 | image for each iteration in Phase 1 of the algorithm. In Phase 2, the module will | ||||||
2032 | output the clustering result produced by the graph partitioning algorithm. | ||||||
2033 | |||||||
2034 | =back | ||||||
2035 | |||||||
2036 | =over | ||||||
2037 | |||||||
2038 | =item B |
||||||
2039 | |||||||
2040 | $clusterer->get_data_from_csv(); | ||||||
2041 | |||||||
2042 | As you can guess from its name, the method extracts the data from the CSV file named | ||||||
2043 | in the constructor. | ||||||
2044 | |||||||
2045 | =item B |
||||||
2046 | |||||||
2047 | $clusterer->linear_manifold_clusterer(); | ||||||
2048 | |||||||
2049 | or | ||||||
2050 | |||||||
2051 | my $clusters = $clusterer->linear_manifold_clusterer(); | ||||||
2052 | |||||||
2053 | This is the main call to the linear-manifold based clusterer. The first call works | ||||||
2054 | by side-effect, meaning that you will see the clusters in your terminal window and | ||||||
2055 | they would be written out to disk files (depending on the constructor options you | ||||||
2056 | have set). The second call also returns the clusters as a reference to an array of | ||||||
2057 | anonymous arrays, each holding the symbolic tags for a cluster. | ||||||
2058 | |||||||
2059 | =item B |
||||||
2060 | |||||||
2061 | $clusterer->display_reconstruction_errors_as_a_function_of_iterations(); | ||||||
2062 | |||||||
2063 | This method would normally be called after the clustering is completed to see how the | ||||||
2064 | reconstruction errors decreased with the iterations in Phase 1 of the overall | ||||||
2065 | algorithm. | ||||||
2066 | |||||||
2067 | =item B |
||||||
2068 | |||||||
2069 | $clusterer->write_clusters_to_files($clusters); | ||||||
2070 | |||||||
2071 | As its name implies, when you call this method, the final clusters would be written | ||||||
2072 | out to disk files. The files have names like: | ||||||
2073 | |||||||
2074 | cluster0.txt | ||||||
2075 | cluster1.txt | ||||||
2076 | cluster2.txt | ||||||
2077 | ... | ||||||
2078 | ... | ||||||
2079 | |||||||
2080 | Before the clusters are written to these files, the module destroys all files with | ||||||
2081 | such names in the directory in which you call the module. | ||||||
2082 | |||||||
2083 | =item B |
||||||
2084 | |||||||
2085 | $clusterer->visualize_clusters_on_sphere("final clustering", $clusters); | ||||||
2086 | |||||||
2087 | or | ||||||
2088 | |||||||
2089 | $clusterer->visualize_clusters_on_sphere("final_clustering", $clusters, "png"); | ||||||
2090 | |||||||
2091 | If your data is 3-dimensional and it resides on the surface of a sphere (or in the | ||||||
2092 | vicinity of such a surface), you may be able to use these methods for the | ||||||
2093 | visualization of the clusters produced by the algorithm. The first invocation | ||||||
2094 | produces a Gnuplot in a terminal window that you can rotate with your mouse pointer. | ||||||
2095 | The second invocation produces a `.png' image of the plot. | ||||||
2096 | |||||||
2097 | =item B |
||||||
2098 | |||||||
2099 | $clusterer->auto_retry_clusterer(); | ||||||
2100 | |||||||
2101 | or | ||||||
2102 | |||||||
2103 | my $clusters = $clusterer->auto_retry_clusterer(); | ||||||
2104 | |||||||
2105 | As mentioned earlier, the module is programmed in such a way that it is more likely | ||||||
2106 | to fail than to give you a wrong answer. If manually trying the clusterer repeatedly | ||||||
2107 | on a data file is frustrating, you can use C |
||||||
2108 | make repeated attempts for you. See the script C |
||||||
2109 | C |
||||||
2110 | |||||||
2111 | =back | ||||||
2112 | |||||||
2113 | =head1 GENERATING SYNTHETIC DATA FOR EXPERIMENTING WITH THE CLUSTERER | ||||||
2114 | |||||||
2115 | The module file also contains a class named C |
||||||
2116 | data for experimenting with linear-manifold based clustering. At this time, only | ||||||
2117 | 3-dimensional data that resides in the form of Gaussian clusters on the surface of a | ||||||
2118 | sphere is generated. The generated data is placed in a CSV file. You construct an | ||||||
2119 | instance of the C |
||||||
2120 | |||||||
2121 | =over 4 | ||||||
2122 | |||||||
2123 | =item B |
||||||
2124 | |||||||
2125 | my $training_data_gen = DataGenerator->new( | ||||||
2126 | output_file => $output_file, | ||||||
2127 | cluster_width => 0.0005, | ||||||
2128 | total_number_of_samples_needed => 1000, | ||||||
2129 | number_of_clusters_on_sphere => 4, | ||||||
2130 | show_hidden_in_3D_plots => 0, | ||||||
2131 | ); | ||||||
2132 | |||||||
2133 | =back | ||||||
2134 | |||||||
2135 | =head2 Parameters for the DataGenerator constructor: | ||||||
2136 | |||||||
2137 | =over 8 | ||||||
2138 | |||||||
2139 | =item C |
||||||
2140 | |||||||
2141 | The numeric values are generated using a bivariate Gaussian distribution whose two | ||||||
2142 | independent variables are the azimuth and the elevation angles on the surface of a | ||||||
2143 | unit sphere. The mean of each cluster is chosen randomly and its covariance set in | ||||||
2144 | proportion to the value supplied for the C<cluster_width> parameter. | ||||||
2145 | |||||||
2146 | =item C |
||||||
2147 | |||||||
2148 | This parameter controls the spread of each cluster on the surface of the unit sphere. | ||||||
2149 | |||||||
2150 | =item C |
||||||
2151 | |||||||
2152 | As its name implies, this parameter specifies the total number of data samples that | ||||||
2153 | will be written out to the output file --- provided this number is divisible by the | ||||||
2154 | number of clusters you asked for. If the divisibility condition is not satisfied, | ||||||
2155 | the number of data samples actually written out will be the closest it can be to the | ||||||
2156 | number you specify for this parameter under the condition that equal number of | ||||||
2157 | samples will be created for each cluster. | ||||||
2158 | |||||||
2159 | =item C |
||||||
2160 | |||||||
2161 | Again as its name implies, this parameter specifies the number of clusters that will | ||||||
2162 | be produced on the surface of a unit sphere. | ||||||
2163 | |||||||
2164 | =item C |
||||||
2165 | |||||||
2166 | This parameter is important for the visualization of the clusters and it controls | ||||||
2167 | whether you will see the generated data on the back side of the sphere. If the | ||||||
2168 | clusters are not too spread out, you can set this parameter to 0 and see all the | ||||||
2169 | clusters all at once. However, when the clusters are spread out, it can be visually | ||||||
2170 | confusing to see the data on the back side of the sphere. Note that no matter how | ||||||
2171 | you set this parameter, you can interact with the 3D plot of the data and rotate it | ||||||
2172 | with your mouse pointer to see all of the data that is generated. | ||||||
2173 | |||||||
2174 | =back | ||||||
2175 | |||||||
2176 | =over 4 | ||||||
2177 | |||||||
2178 | =item B |
||||||
2179 | |||||||
2180 | $training_data_gen->gen_data_and_write_to_csv(); | ||||||
2181 | |||||||
2182 | As its name implies, this method generates the data according to the parameters | ||||||
2183 | specified in the DataGenerator constructor. | ||||||
2184 | |||||||
2185 | =item B |
||||||
2186 | |||||||
2187 | $training_data_gen->visualize_data_on_sphere($output_file); | ||||||
2188 | |||||||
2189 | You can use this method to visualize the clusters produced by the data generator. | ||||||
2190 | Since the clusters are located at randomly selected points on a unit sphere, by | ||||||
2191 | looking at the output visually, you can quickly reject what the data generator has | ||||||
2192 | produced and try again. | ||||||
2193 | |||||||
2194 | =back | ||||||
2195 | |||||||
2196 | =head1 HOW THE CLUSTERS ARE OUTPUT | ||||||
2197 | |||||||
2198 | When the option C |
||||||
2199 | C |
||||||
2200 | screen. | ||||||
2201 | |||||||
2202 | And, when the option C |
||||||
2203 | module dumps the clusters in files named | ||||||
2204 | |||||||
2205 | cluster0.txt | ||||||
2206 | cluster1.txt | ||||||
2207 | cluster2.txt | ||||||
2208 | ... | ||||||
2209 | ... | ||||||
2210 | |||||||
2211 | in the directory in which you execute the module. The number of such files will | ||||||
2212 | equal the number of clusters formed. All such existing files in the directory are | ||||||
2213 | destroyed before any fresh ones are created. Each cluster file contains the symbolic | ||||||
2214 | tags of the data samples in that cluster. | ||||||
2215 | |||||||
2216 | Assuming that the data dimensionality is 3, if you have set the constructor parameter | ||||||
2217 | C |
||||||
2218 | files that are point plots of the different clusters in the different iterations of | ||||||
2219 | the algorithm. Such printable files are also generated for the initial clusters at | ||||||
2220 | the beginning of the iterations and for the final clusters in Phase 1 of the | ||||||
2221 | algorithm. You will also see in your directory a PNG file for the clustering result | ||||||
2222 | produced by graph partitioning in Phase 2. | ||||||
2223 | |||||||
2224 | Also, as mentioned previously, a call to C |
||||||
2225 | code will return the clusters to you directly. | ||||||
2226 | |||||||
2227 | =head1 REQUIRED | ||||||
2228 | |||||||
2229 | This module requires the following modules: | ||||||
2230 | |||||||
2231 | List::Util | ||||||
2232 | File::Basename | ||||||
2233 | Math::Random | ||||||
2234 | Graphics::GnuplotIF | ||||||
2235 | Math::GSL::Matrix | ||||||
2236 | POSIX | ||||||
2237 | |||||||
2238 | =head1 THE C |
||||||
2239 | |||||||
2240 | The C |
||||||
2241 | play with in order to become familiar with the module: | ||||||
2242 | |||||||
2243 | example1.pl | ||||||
2244 | |||||||
2245 | example2.pl | ||||||
2246 | |||||||
2247 | example3.pl | ||||||
2248 | |||||||
2249 | example4.pl | ||||||
2250 | |||||||
2251 | These scripts demonstrate linear-manifold based clustering on the 3-dimensional data | ||||||
2252 | in the following three CSV files: | ||||||
2253 | |||||||
2254 | 3_clusters_on_a_sphere_498_samples.csv (used in example1.pl and example4.pl) | ||||||
2255 | |||||||
2256 | 3_clusters_on_a_sphere_3000_samples.csv (used in example2.pl) | ||||||
2257 | |||||||
2258 | 4_clusters_on_a_sphere_1000_samples.csv (used in example3.pl) | ||||||
2259 | |||||||
2260 | Note that even though the first two of these files both contain exactly three | ||||||
2261 | clusters, the clusters look very different in the two data files. The clusters are | ||||||
2262 | much more spread out in C<3_clusters_on_a_sphere_3000_samples.csv>. | ||||||
2263 | |||||||
2264 | The code in C |
||||||
2265 | C |
||||||
2266 | the clustering program until success is achieved. The value of the constructor | ||||||
2267 | parameter C |
||||||
2268 | when you execute C |
||||||
2269 | You might wish to change the value given to the parameter | ||||||
2270 | C |
||||||
2271 | achieve success. | ||||||
2272 | |||||||
2273 | The C |
||||||
2274 | and the best final results that can be achieved by the first three examples, that | ||||||
2275 | is, for the scripts C |
||||||
2276 | a Linux machine and if you have the C |
||||||
2277 | C |
||||||
2278 | your own runs of the three example scripts may or may not look like what you see in | ||||||
2279 | the outputs shown in the PNG files depending on how the module seeds the clusters. | ||||||
2280 | Your best results should look like what you see in the PNG images. | ||||||
2281 | |||||||
2282 | The C |
||||||
2283 | |||||||
2284 | For generating the data for experiments with clustering: | ||||||
2285 | |||||||
2286 | generate_data_on_a_sphere.pl | ||||||
2287 | |||||||
2288 | For visualizing the data generated by the above script: | ||||||
2289 | |||||||
2290 | data_visualizer.pl | ||||||
2291 | |||||||
2292 | For cleaning up the examples directory: | ||||||
2293 | |||||||
2294 | cleanup_directory.pl | ||||||
2295 | |||||||
2296 | Invoking the C |
||||||
2297 | that are generated by the module when you run it with the constructor option | ||||||
2298 | C |
||||||
2299 | |||||||
2300 | =head1 EXPORT | ||||||
2301 | |||||||
2302 | None by design. | ||||||
2303 | |||||||
2304 | =head1 CAVEATS | ||||||
2305 | |||||||
2306 | The performance of the algorithm depends much on the values you choose for the | ||||||
2307 | constructor parameters. And, even for the best choices for the parameters, the | ||||||
2308 | algorithm is not theoretically guaranteed to return the best results. | ||||||
2309 | |||||||
2310 | Even after you have discovered the best choices for the constructor parameters, the | ||||||
2311 | best way to use this module is to try it multiple times on any given data and accept | ||||||
2312 | only those results that make the best intuitive sense. | ||||||
2313 | |||||||
2314 | The module was designed with the hope that it would rather fail than give you wrong | ||||||
2315 | results. So if you see it failing a few times before it returns a good answer, that's | ||||||
2316 | a good thing. | ||||||
2317 | |||||||
2318 | However, if the module fails too often or is too quick to give you wrong answers, | ||||||
2319 | that means the module is not working on your data. If that happens, I'd love to hear | ||||||
2320 | from you. That might indicate to me how I should enhance the power of this module | ||||||
2321 | for its future versions. | ||||||
2322 | |||||||
2323 | =head1 BUGS | ||||||
2324 | |||||||
2325 | Please notify the author if you encounter any bugs. When sending email, please place | ||||||
2326 | the string 'LinearManifoldDataClusterer' in the subject line. | ||||||
2327 | |||||||
2328 | =head1 INSTALLATION | ||||||
2329 | |||||||
2330 | Download the archive from CPAN in any directory of your choice. Unpack the archive | ||||||
2331 | with a command that on a Linux machine would look like: | ||||||
2332 | |||||||
2333 | tar zxvf Algorithm-LinearManifoldDataClusterer-1.0.tar.gz | ||||||
2334 | |||||||
2335 | This will create an installation directory for you whose name will be | ||||||
2336 | C |
||||||
2337 | for a standard install of the module if you have root privileges: | ||||||
2338 | |||||||
2339 | perl Makefile.PL | ||||||
2340 | make | ||||||
2341 | make test | ||||||
2342 | sudo make install | ||||||
2343 | |||||||
2344 | If you do not have root privileges, you can carry out a non-standard install the | ||||||
2345 | module in any directory of your choice by: | ||||||
2346 | |||||||
2347 | perl Makefile.PL prefix=/some/other/directory/ | ||||||
2348 | make | ||||||
2349 | make test | ||||||
2350 | make install | ||||||
2351 | |||||||
2352 | With a non-standard install, you may also have to set your PERL5LIB environment | ||||||
2353 | variable so that this module can find the required other modules. How you do that | ||||||
2354 | would depend on what platform you are working on. In order to install this module in | ||||||
2355 | a Linux machine on which I use tcsh for the shell, I set the PERL5LIB environment | ||||||
2356 | variable by | ||||||
2357 | |||||||
2358 | setenv PERL5LIB /some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/ | ||||||
2359 | |||||||
2360 | If I used bash, I'd need to declare: | ||||||
2361 | |||||||
2362 | export PERL5LIB=/some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/ | ||||||
2363 | |||||||
2364 | |||||||
2365 | =head1 THANKS | ||||||
2366 | |||||||
2367 | I have learned much from my conversations with Donghun Kim whose research on face | ||||||
2368 | recognition in the wild involves clustering image data on manifolds. I have also had | ||||||
2369 | several fruitful conversations with Bharath Kumar Comandur and Tanmay Prakash with | ||||||
2370 | regard to the ideas that are incorporated in this module. | ||||||
2371 | |||||||
2372 | =head1 AUTHOR | ||||||
2373 | |||||||
2374 | Avinash Kak, kak@purdue.edu | ||||||
2375 | |||||||
2376 | If you send email, please place the string "LinearManifoldDataClusterer" in your subject line to get past | ||||||
2377 | my spam filter. | ||||||
2378 | |||||||
2379 | =head1 COPYRIGHT | ||||||
2380 | |||||||
2381 | This library is free software; you can redistribute it and/or modify it under the | ||||||
2382 | same terms as Perl itself. | ||||||
2383 | |||||||
2384 | Copyright 2015 Avinash Kak | ||||||
2385 | |||||||
2386 | =cut | ||||||
2387 |