blib/lib/Algorithm/LinearManifoldDataClusterer.pm | |||
---|---|---|---|
Criterion | Covered | Total | % |
statement | 25 | 27 | 92.6 |
branch | n/a | ||
condition | n/a | ||
subroutine | 9 | 9 | 100.0 |
pod | n/a | ||
total | 34 | 36 | 94.4 |
line | stmt | bran | cond | sub | pod | time | code |
---|---|---|---|---|---|---|---|
1 | package Algorithm::LinearManifoldDataClusterer; | ||||||
2 | |||||||
3 | #------------------------------------------------------------------------------------ | ||||||
4 | # Copyright (c) 2015 Avinash Kak. All rights reserved. This program is free | ||||||
5 | # software. You may modify and/or distribute it under the same terms as Perl itself. | ||||||
6 | # This copyright notice must remain attached to the file. | ||||||
7 | # | ||||||
8 | # Algorithm::LinearManifoldDataClusterer is a Perl module for clustering data that | ||||||
9 | # resides on a low-dimensional manifold in a high-dimensional measurement space. | ||||||
10 | # ----------------------------------------------------------------------------------- | ||||||
11 | |||||||
12 | 1 | 1 | 13073 | use 5.10.0; | |||
1 | 3 | ||||||
1 | 45 | ||||||
13 | 1 | 1 | 5 | use strict; | |||
1 | 1 | ||||||
1 | 28 | ||||||
14 | 1 | 1 | 3 | use warnings; | |||
1 | 4 | ||||||
1 | 30 | ||||||
15 | 1 | 1 | 4 | use Carp; | |||
1 | 0 | ||||||
1 | 71 | ||||||
16 | 1 | 1 | 4 | use List::Util qw(reduce any); | |||
1 | 1 | ||||||
1 | 83 | ||||||
17 | 1 | 1 | 4 | use File::Basename; | |||
1 | 2 | ||||||
1 | 66 | ||||||
18 | 1 | 1 | 594 | use Math::Random; | |||
1 | 5336 | ||||||
1 | 86 | ||||||
19 | 1 | 1 | 584 | use Graphics::GnuplotIF; | |||
1 | 9525 | ||||||
1 | 57 | ||||||
20 | 1 | 1 | 1048 | use Math::GSL::Matrix; | |||
0 | |||||||
0 | |||||||
21 | use POSIX (); | ||||||
22 | |||||||
23 | our $VERSION = '1.01'; | ||||||
24 | |||||||
25 | # Constructor: | ||||||
26 | sub new { | ||||||
27 | my ($class, %args) = @_; | ||||||
28 | my @params = keys %args; | ||||||
29 | croak "\nYou have used a wrong name for a keyword argument " . | ||||||
30 | "--- perhaps a misspelling\n" | ||||||
31 | if check_for_illegal_params(@params) == 0; | ||||||
32 | bless { | ||||||
33 | _datafile => $args{datafile} || croak("datafile required"), | ||||||
34 | _mask => $args{mask} || croak("mask required"), | ||||||
35 | _K => $args{K} || 0, | ||||||
36 | _P => $args{P} || 0, | ||||||
37 | _terminal_output => $args{terminal_output} || 0, | ||||||
38 | _max_iterations => $args{max_iterations} || 0, | ||||||
39 | _delta_reconstruction_error => $args{delta_reconstruction_error} || 0.001, | ||||||
40 | _delta_normalized_error => undef, | ||||||
41 | _cluster_search_multiplier => $args{cluster_search_multiplier} || 1, | ||||||
42 | _visualize_each_iteration => $args{visualize_each_iteration} == 0 ? 0 : 1, | ||||||
43 | _show_hidden_in_3D_plots => $args{show_hidden_in_3D_plots} == 0 ? 0 : 1, | ||||||
44 | _make_png_for_each_iteration => $args{make_png_for_each_iteration} == 0 ? 0 : 1, | ||||||
45 | _debug => $args{debug} || 0, | ||||||
46 | _N => 0, | ||||||
47 | _KM => $args{K} * $args{cluster_search_multiplier}, | ||||||
48 | _data_hash => {}, | ||||||
49 | _data_tags => [], | ||||||
50 | _data_dimensions => 0, | ||||||
51 | _final_clusters => [], | ||||||
52 | _auto_retry_flag => 0, | ||||||
53 | _num_iterations_actually_used => undef, | ||||||
54 | _scale_factor => undef, | ||||||
55 | _data_tags_to_cluster_label_hash => {}, | ||||||
56 | _final_reference_vecs_for_all_subspaces => [], | ||||||
57 | _reconstruction_error_as_a_function_of_iteration => [], | ||||||
58 | _final_trailing_eigenvec_matrices_for_all_subspaces => [], | ||||||
59 | _subspace_construction_error_as_a_function_of_iteration => [], | ||||||
60 | }, $class; | ||||||
61 | } | ||||||
62 | |||||||
63 | sub get_data_from_csv { | ||||||
64 | my $self = shift; | ||||||
65 | my $filename = $self->{_datafile} || die "you did not specify a file with the data to be clustered"; | ||||||
66 | my $mask = $self->{_mask}; | ||||||
67 | my @mask = split //, $mask; | ||||||
68 | $self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask; | ||||||
69 | print "data dimensionality: $self->{_data_dimensions} \n" if $self->{_terminal_output}; | ||||||
70 | open FILEIN, $filename or die "Unable to open $filename: $!"; | ||||||
71 | die("Aborted. get_training_data_csv() is only for CSV files") unless $filename =~ /\.csv$/; | ||||||
72 | local $/ = undef; | ||||||
73 | my @all_data = split /\s+/, <FILEIN>; | ||||||
74 | my %data_hash = (); | ||||||
75 | my @data_tags = (); | ||||||
76 | foreach my $record (@all_data) { | ||||||
77 | my @splits = split /,/, $record; | ||||||
78 | my $record_name = shift @splits; | ||||||
79 | $data_hash{$record_name} = \@splits; | ||||||
80 | push @data_tags, $record_name; | ||||||
81 | } | ||||||
82 | $self->{_data_hash} = \%data_hash; | ||||||
83 | $self->{_data_tags} = \@data_tags; | ||||||
84 | $self->{_N} = scalar @data_tags; | ||||||
85 | } | ||||||
86 | |||||||
87 | sub estimate_mean_and_covariance { | ||||||
88 | my $self = shift; | ||||||
89 | my $tag_set = shift; | ||||||
90 | my $cluster_size = @$tag_set; | ||||||
91 | my @cluster_center = @{$self->add_point_coords($tag_set)}; | ||||||
92 | @cluster_center = map {my $x = $_/$cluster_size; $x} @cluster_center; | ||||||
93 | # for covariance calculation: | ||||||
94 | my ($num_rows,$num_cols) = ($self->{_data_dimensions}, scalar(@$tag_set)); | ||||||
95 | print "\nThe data will be stuffed into a matrix of $num_rows rows and $num_cols columns\n\n" | ||||||
96 | if $self->{_debug}; | ||||||
97 | my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols); | ||||||
98 | my $mean_vec = Math::GSL::Matrix->new($num_rows,1); | ||||||
99 | # All the record labels are stored in the array $self->{_data_tags}. The actual | ||||||
100 | # data for clustering is stored in a hash at $self->{_data_hash} whose keys are | ||||||
101 | # the record labels; the value associated with each key is the array holding the | ||||||
102 | # corresponding numerical multidimensional data. | ||||||
103 | $mean_vec->set_col(0, \@cluster_center); | ||||||
104 | if ($self->{_debug}) { | ||||||
105 | print "\nDisplaying the mean vector for the cluster:\n"; | ||||||
106 | display_matrix( $mean_vec ) if $self->{_terminal_output}; | ||||||
107 | } | ||||||
108 | foreach my $j (0..$num_cols-1) { | ||||||
109 | my $tag = $tag_set->[$j]; | ||||||
110 | my $data = $self->{_data_hash}->{$tag}; | ||||||
111 | my @diff_from_mean = vector_subtract($data, \@cluster_center); | ||||||
112 | $matrix->set_col($j, \@diff_from_mean); | ||||||
113 | } | ||||||
114 | my $transposed = transpose( $matrix ); | ||||||
115 | my $covariance = $matrix * $transposed; | ||||||
116 | $covariance *= 1.0 / $num_cols; | ||||||
117 | if ($self->{_debug}) { | ||||||
118 | print "\nDisplaying the Covariance Matrix for cluster:"; | ||||||
119 | display_matrix( $covariance ) if $self->{_terminal_output}; | ||||||
120 | } | ||||||
121 | return ($mean_vec, $covariance); | ||||||
122 | } | ||||||
123 | |||||||
124 | sub eigen_analysis_of_covariance { | ||||||
125 | my $self = shift; | ||||||
126 | my $covariance = shift; | ||||||
127 | my ($eigenvalues, $eigenvectors) = $covariance->eigenpair; | ||||||
128 | my $num_of_eigens = @$eigenvalues; | ||||||
129 | my $largest_eigen_index = 0; | ||||||
130 | my $smallest_eigen_index = 0; | ||||||
131 | print "Eigenvalue 0: $eigenvalues->[0]\n" if $self->{_debug}; | ||||||
132 | foreach my $i (1..$num_of_eigens-1) { | ||||||
133 | $largest_eigen_index = $i if $eigenvalues->[$i] > $eigenvalues->[$largest_eigen_index]; | ||||||
134 | $smallest_eigen_index = $i if $eigenvalues->[$i] < $eigenvalues->[$smallest_eigen_index]; | ||||||
135 | print "Eigenvalue $i: $eigenvalues->[$i]\n" if $self->{_debug}; | ||||||
136 | } | ||||||
137 | print "\nlargest eigen index: $largest_eigen_index\n" if $self->{_debug}; | ||||||
138 | print "\nsmallest eigen index: $smallest_eigen_index\n\n" if $self->{_debug}; | ||||||
139 | my @all_my_eigenvecs; | ||||||
140 | foreach my $i (0..$num_of_eigens-1) { | ||||||
141 | my @vec = $eigenvectors->[$i]->as_list; | ||||||
142 | my @eigenvec; | ||||||
143 | foreach my $ele (@vec) { | ||||||
144 | my ($mag, $theta) = $ele =~ /\[(\d*\.?\d*e?[+-]?\d*),(\S+)\]/; | ||||||
145 | if ($theta eq "0") { | ||||||
146 | push @eigenvec, $mag; | ||||||
147 | } elsif ($theta eq "pi") { | ||||||
148 | push @eigenvec, -1.0 * $mag; | ||||||
149 | } else { | ||||||
150 | die "Eigendecomposition produced a complex eigenvector -- " . | ||||||
151 | "which should not happen for a covariance matrix!"; | ||||||
152 | } | ||||||
153 | } | ||||||
154 | print "Eigenvector $i: @eigenvec\n" if $self->{_debug}; | ||||||
155 | push @all_my_eigenvecs, \@eigenvec; | ||||||
156 | } | ||||||
157 | my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list; | ||||||
158 | print "\nLargest eigenvector: @largest_eigen_vec\n" if $self->{_debug}; | ||||||
159 | my @sorted_eigenvec_indexes = sort {$eigenvalues->[$b] <=> $eigenvalues->[$a]} 0..@all_my_eigenvecs-1; | ||||||
160 | my @sorted_eigenvecs; | ||||||
161 | my @sorted_eigenvals; | ||||||
162 | foreach my $i (0..@sorted_eigenvec_indexes-1) { | ||||||
163 | $sorted_eigenvecs[$i] = $all_my_eigenvecs[$sorted_eigenvec_indexes[$i]]; | ||||||
164 | $sorted_eigenvals[$i] = $eigenvalues->[$sorted_eigenvec_indexes[$i]]; | ||||||
165 | } | ||||||
166 | if ($self->{_debug}) { | ||||||
167 | print "\nHere come sorted eigenvectors --- from the largest to the smallest:\n"; | ||||||
168 | foreach my $i (0..@sorted_eigenvecs-1) { | ||||||
169 | print "eigenvec: @{$sorted_eigenvecs[$i]} eigenvalue: $sorted_eigenvals[$i]\n"; | ||||||
170 | } | ||||||
171 | } | ||||||
172 | return (\@sorted_eigenvecs, \@sorted_eigenvals); | ||||||
173 | } | ||||||
174 | |||||||
175 | sub auto_retry_clusterer { | ||||||
176 | my $self = shift; | ||||||
177 | $self->{_auto_retry_flag} = 1; | ||||||
178 | my $clusters; | ||||||
179 | $@ = 1; | ||||||
180 | my $retry_attempts = 1; | ||||||
181 | while ($@) { | ||||||
182 | eval { | ||||||
183 | $clusters = $self->linear_manifold_clusterer(); | ||||||
184 | }; | ||||||
185 | if ($@) { | ||||||
186 | if ($self->{_terminal_output}) { | ||||||
187 | print "Clustering failed. Trying again. --- $@"; | ||||||
188 | print "\n\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"; | ||||||
189 | print "VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV\n\n"; | ||||||
190 | } | ||||||
191 | $retry_attempts++; | ||||||
192 | } else { | ||||||
193 | print "\n\nNumber of retry attempts: $retry_attempts\n\n" if $self->{_terminal_output}; | ||||||
194 | return $clusters; | ||||||
195 | } | ||||||
196 | } | ||||||
197 | } | ||||||
198 | |||||||
199 | sub linear_manifold_clusterer { | ||||||
200 | my $self = shift; | ||||||
201 | my $KM = $self->{_KM}; | ||||||
202 | my @initial_cluster_center_tags; | ||||||
203 | my $visualization_msg; | ||||||
204 | my @initial_cluster_center_indexes = $self->initialize_cluster_centers($KM, $self->{_N}); | ||||||
205 | print "Randomly selected indexes for cluster center tags: @initial_cluster_center_indexes\n" | ||||||
206 | if $self->{_debug}; | ||||||
207 | @initial_cluster_center_tags = map {$self->{_data_tags}->[$_]} @initial_cluster_center_indexes; | ||||||
208 | my @initial_cluster_center_coords = map {$self->{_data_hash}->{$_}} @initial_cluster_center_tags; | ||||||
209 | if ($self->{_debug}) { | ||||||
210 | foreach my $centroid (@initial_cluster_center_coords) { | ||||||
211 | print "Initial cluster center coords: @{$centroid}\n"; | ||||||
212 | } | ||||||
213 | } | ||||||
214 | my $initial_clusters = $self->assign_data_to_clusters_initial(\@initial_cluster_center_coords); | ||||||
215 | if ($self->{_data_dimensions} == 3) { | ||||||
216 | $visualization_msg = "initial_clusters"; | ||||||
217 | $self->visualize_clusters_on_sphere($visualization_msg, $initial_clusters) | ||||||
218 | if $self->{_visualize_each_iteration}; | ||||||
219 | $self->visualize_clusters_on_sphere($visualization_msg, $initial_clusters, "png") | ||||||
220 | if $self->{_make_png_for_each_iteration}; | ||||||
221 | } | ||||||
222 | foreach my $cluster (@$initial_clusters) { | ||||||
223 | my ($mean, $covariance) = $self->estimate_mean_and_covariance($cluster); | ||||||
224 | display_mean_and_covariance($mean, $covariance) if $self->{_debug}; | ||||||
225 | } | ||||||
226 | my @clusters = @$initial_clusters; | ||||||
227 | display_clusters(\@clusters) if $self->{_debug}; | ||||||
228 | my $iteration_index = 0; | ||||||
229 | my $unimodal_correction_flag; | ||||||
230 | my $previous_min_value_for_unimodality_quotient; | ||||||
231 | while ($iteration_index < $self->{_max_iterations}) { | ||||||
232 | print "\n\n========================== STARTING ITERATION $iteration_index =====================\n\n" | ||||||
233 | if $self->{_terminal_output}; | ||||||
234 | my $total_reconstruction_error_this_iteration = 0; | ||||||
235 | my @subspace_construction_errors_this_iteration; | ||||||
236 | my @trailing_eigenvec_matrices_for_all_subspaces; | ||||||
237 | my @reference_vecs_for_all_subspaces; | ||||||
238 | foreach my $cluster (@clusters) { | ||||||
239 | next if @$cluster == 0; | ||||||
240 | my ($mean, $covariance) = $self->estimate_mean_and_covariance($cluster); | ||||||
241 | display_mean_and_covariance($mean, $covariance) if $self->{_debug}; | ||||||
242 | print "--------------end of displaying mean and covariance\n\n" if $self->{_debug}; | ||||||
243 | my ($eigenvecs, $eigenvals) = $self->eigen_analysis_of_covariance($covariance); | ||||||
244 | my @trailing_eigenvecs = @{$eigenvecs}[$self->{_P} .. $self->{_data_dimensions}-1]; | ||||||
245 | my @trailing_eigenvals = @{$eigenvals}[$self->{_P} .. $self->{_data_dimensions}-1]; | ||||||
246 | my $subspace_construction_error = reduce {abs($a) + abs($b)} @trailing_eigenvals; | ||||||
247 | push @subspace_construction_errors_this_iteration, $subspace_construction_error; | ||||||
248 | my $trailing_eigenvec_matrix = Math::GSL::Matrix->new($self->{_data_dimensions}, | ||||||
249 | scalar(@trailing_eigenvecs)); | ||||||
250 | foreach my $j (0..@trailing_eigenvecs-1) { | ||||||
251 | print "trailing eigenvec column: @{$trailing_eigenvecs[$j]}\n" if $self->{_debug}; | ||||||
252 | $trailing_eigenvec_matrix->set_col($j, $trailing_eigenvecs[$j]); | ||||||
253 | } | ||||||
254 | push @trailing_eigenvec_matrices_for_all_subspaces,$trailing_eigenvec_matrix; | ||||||
255 | push @reference_vecs_for_all_subspaces, $mean; | ||||||
256 | } | ||||||
257 | $self->set_termination_reconstruction_error_threshold(\@reference_vecs_for_all_subspaces); | ||||||
258 | my %best_subspace_based_partition_of_data; | ||||||
259 | foreach my $i (0..$self->{_KM}-1) { | ||||||
260 | $best_subspace_based_partition_of_data{$i} = []; | ||||||
261 | } | ||||||
262 | foreach my $data_tag (@{$self->{_data_tags}}) { | ||||||
263 | my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1); | ||||||
264 | $data_vec->set_col(0, $self->{_data_hash}->{$data_tag}); | ||||||
265 | my @errors = map {$self->reconstruction_error($data_vec, | ||||||
266 | $trailing_eigenvec_matrices_for_all_subspaces[$_], | ||||||
267 | $reference_vecs_for_all_subspaces[$_])} | ||||||
268 | 0 .. $self->{_KM}-1; | ||||||
269 | my ($minval, $index_for_closest_subspace) = minimum(\@errors); | ||||||
270 | $total_reconstruction_error_this_iteration += $minval; | ||||||
271 | push @{$best_subspace_based_partition_of_data{$index_for_closest_subspace}}, | ||||||
272 | $data_tag; | ||||||
273 | } | ||||||
274 | print "Finished calculating the eigenvectors for the clusters produced by the previous\n" . | ||||||
275 | "iteration and re-assigning the data samples to the new subspaces on the basis of\n". | ||||||
276 | "the least reconstruction error.\n\n" . | ||||||
277 | "Total reconstruction error in this iteration: $total_reconstruction_error_this_iteration\n" | ||||||
278 | if $self->{_terminal_output}; | ||||||
279 | foreach my $i (0..$self->{_KM}-1) { | ||||||
280 | $clusters[$i] = $best_subspace_based_partition_of_data{$i}; | ||||||
281 | } | ||||||
282 | display_clusters(\@clusters) if $self->{_terminal_output}; | ||||||
283 | # Check if any cluster has lost all its elements. If so, fragment the worst | ||||||
284 | # existing cluster to create the additional clusters needed: | ||||||
285 | if (any {@$_ == 0} @clusters) { | ||||||
286 | die "empty cluster found" if $self->{_auto_retry_flag}; | ||||||
287 | print "\nOne or more clusters have become empty. Will carve out the needed clusters\n" . | ||||||
288 | "from the cluster with the largest subspace construction error.\n\n"; | ||||||
289 | $total_reconstruction_error_this_iteration = 0; | ||||||
290 | @subspace_construction_errors_this_iteration = (); | ||||||
291 | my $how_many_extra_clusters_needed = $self->{_KM} - scalar(grep {@$_ != 0} @clusters); | ||||||
292 | print "number of extra clusters needed at iteration $iteration_index: $how_many_extra_clusters_needed\n"; | ||||||
293 | my $max = List::Util::max @subspace_construction_errors_this_iteration; | ||||||
294 | my $maxindex = List::Util::first {$_ == $max} @subspace_construction_errors_this_iteration; | ||||||
295 | my @cluster_fragments = cluster_split($clusters[$maxindex], | ||||||
296 | $how_many_extra_clusters_needed + 1); | ||||||
297 | my @newclusters; | ||||||
298 | push @newclusters, @clusters[0 .. $maxindex-1]; | ||||||
299 | push @newclusters, @clusters[$maxindex+1 .. $self->{_KM}-1]; | ||||||
300 | push @newclusters, @cluster_fragments; | ||||||
301 | @newclusters = grep {@$_ != 0} @newclusters; | ||||||
302 | die "something went wrong with cluster fragmentation" | ||||||
303 | unless $self->{_KM} = @newclusters; | ||||||
304 | @trailing_eigenvec_matrices_for_all_subspaces = (); | ||||||
305 | @reference_vecs_for_all_subspaces = (); | ||||||
306 | foreach my $cluster (@newclusters) { | ||||||
307 | die "Linear Manifold Clustering did not work $!" if @$cluster == 0; | ||||||
308 | my ($mean, $covariance) = estimate_mean_and_covariance($cluster); | ||||||
309 | my ($eigenvecs, $eigenvals) = eigen_analysis_of_covariance($covariance); | ||||||
310 | my @trailing_eigenvecs = @{$eigenvecs}[$self->{_P} .. $self->{_data_dimensions}-1]; | ||||||
311 | my @trailing_eigenvals = @{$eigenvals}[$self->{_P} .. $self->{_data_dimensions}-1]; | ||||||
312 | my $subspace_construction_error = reduce {abs($a) + abs($b)} @trailing_eigenvals; | ||||||
313 | push @subspace_construction_errors_this_iteration, $subspace_construction_error; | ||||||
314 | my $trailing_eigenvec_matrix = Math::GSL::Matrix->new($self->{_data_dimensions}, | ||||||
315 | scalar(@trailing_eigenvecs)); | ||||||
316 | foreach my $j (0..@trailing_eigenvecs-1) { | ||||||
317 | $trailing_eigenvec_matrix->set_col($j, $trailing_eigenvecs[$j]); | ||||||
318 | } | ||||||
319 | push @trailing_eigenvec_matrices_for_all_subspaces,$trailing_eigenvec_matrix; | ||||||
320 | push @reference_vecs_for_all_subspaces, $mean; | ||||||
321 | } | ||||||
322 | my %best_subspace_based_partition_of_data; | ||||||
323 | foreach my $i (0..$self->{_KM}-1) { | ||||||
324 | $best_subspace_based_partition_of_data{$i} = []; | ||||||
325 | } | ||||||
326 | foreach my $data_tag (@{$self->{_data_tags}}) { | ||||||
327 | my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1); | ||||||
328 | $data_vec->set_col(0, $self->{_data_hash}->{$data_tag}); | ||||||
329 | my @errors = map {reconstruction_error($data_vec, | ||||||
330 | $trailing_eigenvec_matrices_for_all_subspaces[$_], | ||||||
331 | $reference_vecs_for_all_subspaces[$_])} | ||||||
332 | 0 .. $self->{_KM}-1; | ||||||
333 | my ($minval, $index_for_closest_subspace) = minimum(\@errors); | ||||||
334 | $total_reconstruction_error_this_iteration += $minval; | ||||||
335 | push @{$best_subspace_based_partition_of_data{$index_for_closest_subspace}}, | ||||||
336 | $data_tag; | ||||||
337 | } | ||||||
338 | print "empty-cluster jag: total reconstruction error in this iteration: \n" . | ||||||
339 | "$total_reconstruction_error_this_iteration\n" | ||||||
340 | if $self->{_debug}; | ||||||
341 | foreach my $i (0..$self->{_KM}-1) { | ||||||
342 | $clusters[$i] = $best_subspace_based_partition_of_data{$i}; | ||||||
343 | } | ||||||
344 | display_clusters(\@newclusters) if $self->{_terminal_output}; | ||||||
345 | @clusters = grep {@$_ != 0} @newclusters; | ||||||
346 | die "linear manifold based algorithm does not appear to work in this case $!" | ||||||
347 | unless @clusters == $self->{_KM}; | ||||||
348 | }# end of foreach my $cluster (@clusters) ... loop followed by if clause for empty clusters | ||||||
349 | if ($self->{_data_dimensions} == 3) { | ||||||
350 | $visualization_msg = "clustering_at_iteration_$iteration_index"; | ||||||
351 | $self->visualize_clusters_on_sphere($visualization_msg, \@clusters) | ||||||
352 | if $self->{_visualize_each_iteration}; | ||||||
353 | $self->visualize_clusters_on_sphere($visualization_msg, \@clusters, "png") | ||||||
354 | if $self->{_make_png_for_each_iteration}; | ||||||
355 | } | ||||||
356 | my @cluster_unimodality_quotients = map {$self->cluster_unimodality_quotient($clusters[$_], | ||||||
357 | $reference_vecs_for_all_subspaces[$_])} 0..@clusters-1; | ||||||
358 | my $min_value_for_unimodality_quotient = List::Util::min @cluster_unimodality_quotients; | ||||||
359 | print "\nCluster unimodality quotients: @cluster_unimodality_quotients\n" if $self->{_terminal_output}; | ||||||
360 | die "\n\nBailing out!\n" . | ||||||
361 | "It does not look like these iterations will lead to a good clustering result.\n" . | ||||||
362 | "Program terminating. Try running again.\n" | ||||||
363 | if defined($previous_min_value_for_unimodality_quotient) | ||||||
364 | && ($min_value_for_unimodality_quotient < 0.4) | ||||||
365 | && ($min_value_for_unimodality_quotient < (0.5 * $previous_min_value_for_unimodality_quotient)); | ||||||
366 | if ( $min_value_for_unimodality_quotient < 0.5 ) { | ||||||
367 | $unimodal_correction_flag = 1; | ||||||
368 | print "\nApplying unimodality correction:\n\n" if $self->{_terminal_output}; | ||||||
369 | my @sorted_cluster_indexes = | ||||||
370 | sort {$cluster_unimodality_quotients[$b] <=> $cluster_unimodality_quotients[$a]} 0..@clusters-1; | ||||||
371 | my @newclusters; | ||||||
372 | foreach my $cluster_index (0..@clusters - 1) { | ||||||
373 | push @newclusters, $clusters[$sorted_cluster_indexes[$cluster_index]]; | ||||||
374 | } | ||||||
375 | @clusters = @newclusters; | ||||||
376 | my $worst_cluster = pop @clusters; | ||||||
377 | print "\nthe worst cluster: @$worst_cluster\n" if $self->{_terminal_output}; | ||||||
378 | my $second_worst_cluster = pop @clusters; | ||||||
379 | print "\nthe second worst cluster: @$second_worst_cluster\n" if $self->{_terminal_output}; | ||||||
380 | push @$worst_cluster, @$second_worst_cluster; | ||||||
381 | fisher_yates_shuffle($worst_cluster); | ||||||
382 | my @first_half = @$worst_cluster[0 .. int(scalar(@$worst_cluster)/2) - 1]; | ||||||
383 | my @second_half = @$worst_cluster[int(scalar(@$worst_cluster)/2) .. @$worst_cluster - 1]; | ||||||
384 | push @clusters, \@first_half; | ||||||
385 | push @clusters, \@second_half; | ||||||
386 | if ($self->{_terminal_output}) { | ||||||
387 | print "\n\nShowing the clusters obtained after applying the unimodality correction:\n"; | ||||||
388 | display_clusters(\@clusters); | ||||||
389 | } | ||||||
390 | } | ||||||
391 | if (@{$self->{_reconstruction_error_as_a_function_of_iteration}} > 0) { | ||||||
392 | my $last_recon_error = pop @{$self->{_reconstruction_error_as_a_function_of_iteration}}; | ||||||
393 | push @{$self->{_reconstruction_error_as_a_function_of_iteration}}, $last_recon_error; | ||||||
394 | if (($last_recon_error - $total_reconstruction_error_this_iteration) | ||||||
395 | < $self->{_delta_normalized_error}) { | ||||||
396 | push @{$self->{_reconstruction_error_as_a_function_of_iteration}}, | ||||||
397 | $total_reconstruction_error_this_iteration; | ||||||
398 | last; | ||||||
399 | } | ||||||
400 | } | ||||||
401 | push @{$self->{_reconstruction_error_as_a_function_of_iteration}}, | ||||||
402 | $total_reconstruction_error_this_iteration; | ||||||
403 | $iteration_index++; | ||||||
404 | $previous_min_value_for_unimodality_quotient = $min_value_for_unimodality_quotient; | ||||||
405 | } # end of while loop on iteration_index | ||||||
406 | $self->{_num_iterations_actually_used} = | ||||||
407 | scalar @{$self->{_reconstruction_error_as_a_function_of_iteration}}; | ||||||
408 | if ($self->{_terminal_output}) { | ||||||
409 | print "\nIterations of the main loop terminated at iteration number $iteration_index.\n"; | ||||||
410 | print "Will now invoke graph partitioning to discover dominant clusters and to\n" . | ||||||
411 | "merge small clusters.\n\n" if $self->{_cluster_search_multiplier} > 1; | ||||||
412 | print "Total reconstruction error as a function of iterations: " . | ||||||
413 | "@{$self->{_reconstruction_error_as_a_function_of_iteration}}"; | ||||||
414 | } | ||||||
415 | # now merge sub-clusters if cluster_search_multiplier > 1 | ||||||
416 | my @final_clusters; | ||||||
417 | if ($self->{_cluster_search_multiplier} > 1) { | ||||||
418 | print "\n\nInvoking recursive graph partitioning to merge small clusters\n\n"; | ||||||
419 | my @array_of_partitioned_cluster_groups = (\@clusters); | ||||||
420 | my @partitioned_cluster_groups; | ||||||
421 | my $how_many_clusters_looking_for = $self->{_K}; | ||||||
422 | while (scalar(@final_clusters) < $self->{_K}) { | ||||||
423 | @partitioned_cluster_groups = | ||||||
424 | $self->graph_partition(shift @array_of_partitioned_cluster_groups, | ||||||
425 | $how_many_clusters_looking_for ); | ||||||
426 | if (@{$partitioned_cluster_groups[0]} == 1) { | ||||||
427 | my $singular_cluster = shift @{$partitioned_cluster_groups[0]}; | ||||||
428 | push @final_clusters, $singular_cluster; | ||||||
429 | $how_many_clusters_looking_for--; | ||||||
430 | push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[1]; | ||||||
431 | } elsif (@{$partitioned_cluster_groups[1]} == 1) { | ||||||
432 | my $singular_cluster = shift @{$partitioned_cluster_groups[1]}; | ||||||
433 | push @final_clusters, $singular_cluster; | ||||||
434 | $how_many_clusters_looking_for--; | ||||||
435 | push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[0]; | ||||||
436 | } else { | ||||||
437 | push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[0]; | ||||||
438 | push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[1]; | ||||||
439 | } | ||||||
440 | } | ||||||
441 | my @data_clustered; | ||||||
442 | foreach my $cluster (@final_clusters) { | ||||||
443 | push @data_clustered, @$cluster; | ||||||
444 | } | ||||||
445 | unless (scalar(@data_clustered) == scalar(@{$self->{_data_tags}})) { | ||||||
446 | $self->{_final_clusters} = \@final_clusters; | ||||||
447 | my %data_clustered = map {$_ => 1} @data_clustered; | ||||||
448 | my @data_tags_not_clustered = | ||||||
449 | grep {$_} map {exists $data_clustered{$_} ? undef : $_} @{$self->{_data_tags}}; | ||||||
450 | if ($self->{_terminal_output}) { | ||||||
451 | print "\n\nNot all data clustered. The most reliable clusters found by graph partitioning:\n"; | ||||||
452 | display_clusters(\@final_clusters); | ||||||
453 | print "\n\nData not yet clustered:\n\n@data_tags_not_clustered\n"; | ||||||
454 | } | ||||||
455 | if ($self->{_data_dimensions} == 3) { | ||||||
456 | $visualization_msg = "$self->{_K}_best_clusters_produced_by_graph_partitioning"; | ||||||
457 | $self->visualize_clusters_on_sphere($visualization_msg, \@final_clusters) | ||||||
458 | if $self->{_visualize_each_iteration}; | ||||||
459 | $self->visualize_clusters_on_sphere($visualization_msg, \@final_clusters, "png") | ||||||
460 | if $self->{_make_png_for_each_iteration}; | ||||||
461 | } | ||||||
462 | my %data_tags_to_cluster_label_hash; | ||||||
463 | foreach my $i (0..@final_clusters-1) { | ||||||
464 | map {$data_tags_to_cluster_label_hash{$_} = $i} @{$final_clusters[$i]}; | ||||||
465 | } | ||||||
466 | $self->{_data_tags_to_cluster_label_hash} = \%data_tags_to_cluster_label_hash; | ||||||
467 | foreach my $tag (@data_tags_not_clustered) { | ||||||
468 | my $which_cluster = $self->which_cluster_for_new_element($tag); | ||||||
469 | $self->{_data_tags_to_cluster_label_hash}->{$tag} = $which_cluster; | ||||||
470 | } | ||||||
471 | die "Some data elements are still missing from the final tally" | ||||||
472 | unless scalar(keys %{$self->{_data_tags_to_cluster_label_hash}}) == | ||||||
473 | scalar(@{$self->{_data_tags}}); | ||||||
474 | my @new_final_clusters; | ||||||
475 | map { foreach my $ele (keys %{$self->{_data_tags_to_cluster_label_hash}}) { | ||||||
476 | push @{$new_final_clusters[$_]}, $ele | ||||||
477 | if $self->{_data_tags_to_cluster_label_hash}->{$ele} == $_ } | ||||||
478 | } 0..$self->{_K}-1; | ||||||
479 | if ($self->{_debug}) { | ||||||
480 | print "\ndisplaying the final clusters after accounting for unclustered data:\n"; | ||||||
481 | display_clusters(\@new_final_clusters); | ||||||
482 | } | ||||||
483 | $self->{_final_clusters} = \@new_final_clusters; | ||||||
484 | @final_clusters = @new_final_clusters; | ||||||
485 | } | ||||||
486 | } else { | ||||||
487 | @final_clusters = @clusters; | ||||||
488 | } | ||||||
489 | print "\n\nDisplaying final clustering results:\n\n" if $self->{_terminal_output}; | ||||||
490 | display_clusters(\@final_clusters) if $self->{_terminal_output}; | ||||||
491 | return \@final_clusters; | ||||||
492 | } | ||||||
493 | |||||||
494 | sub display_reconstruction_errors_as_a_function_of_iterations { | ||||||
495 | my $self = shift; | ||||||
496 | print "\n\nNumber of iterations used in Phase 1: $self->{_num_iterations_actually_used}\n"; | ||||||
497 | print "\nTotal reconstruction error as a function of iterations in Phase 1: " . | ||||||
498 | "@{$self->{_reconstruction_error_as_a_function_of_iteration}}\n"; | ||||||
499 | } | ||||||
500 | |||||||
501 | sub set_termination_reconstruction_error_threshold { | ||||||
502 | my $self = shift; | ||||||
503 | my $all_ref_vecs = shift; | ||||||
504 | my @mean_vecs = @$all_ref_vecs; | ||||||
505 | my $sum_of_mean_magnitudes = reduce {$a+$b} map { my $result = transpose($_) * $_; | ||||||
506 | my @result = $result->as_list; | ||||||
507 | sqrt($result[0]) | ||||||
508 | } @mean_vecs; | ||||||
509 | $self->{_scale_factor} = $sum_of_mean_magnitudes / @mean_vecs; | ||||||
510 | $self->{_delta_normalized_error} = ($sum_of_mean_magnitudes / @mean_vecs ) * | ||||||
511 | $self->{_delta_reconstruction_error}; | ||||||
512 | } | ||||||
513 | |||||||
514 | # This method is called only in the `unless' clause at the end of the main | ||||||
515 | # linear_manifold_clusterer() method. It is called to find the cluster labels for | ||||||
516 | # those data elements that were left unclustered by the main part of the algorithm | ||||||
517 | # when graph partitioning is used to merge similar sub-clusters. The operating logic | ||||||
518 | # here is that graph partition yields K main clusters even though each main cluster | ||||||
519 | # may not yet be fully populated. | ||||||
# Assigns an as-yet-unclustered data element (named by $data_tag) to one of
# the K final clusters: the element goes to the cluster whose subspace
# reconstructs it with the smallest error.  Returns the index of that
# cluster.  On the first call only, the trailing-eigenvector matrix and the
# reference (mean) vector for each final cluster are computed and cached on
# the instance so that subsequent calls are cheap.
sub which_cluster_for_new_element {
    my $self = shift;
    my $data_tag = shift;
    # The following `unless' clause is called only the first time the current method
    # is called:
    unless (@{$self->{_final_trailing_eigenvec_matrices_for_all_subspaces}} > 0) {
        my @trailing_eigenvec_matrices_for_all_subspaces;
        my @reference_vecs_for_all_subspaces;
        foreach my $cluster (@{$self->{_final_clusters}}) {
            my ($mean, $covariance) = $self->estimate_mean_and_covariance($cluster);
            my ($eigenvecs, $eigenvals) = $self->eigen_analysis_of_covariance($covariance);
            # Keep the trailing (data_dimensions - P) eigenvectors — the
            # directions orthogonal to the cluster's P-dimensional manifold.
            # NOTE(review): assumes eigen_analysis_of_covariance() returns the
            # eigenvectors sorted by decreasing eigenvalue — confirm there.
            my @trailing_eigenvecs = @{$eigenvecs}[$self->{_P} .. $self->{_data_dimensions}-1];
            my $trailing_eigenvec_matrix = Math::GSL::Matrix->new($self->{_data_dimensions},
                                                                  scalar(@trailing_eigenvecs));
            foreach my $j (0..@trailing_eigenvecs-1) {
                $trailing_eigenvec_matrix->set_col($j, $trailing_eigenvecs[$j]);
            }
            push @trailing_eigenvec_matrices_for_all_subspaces,$trailing_eigenvec_matrix;
            push @reference_vecs_for_all_subspaces, $mean;
        }
        $self->{_final_trailing_eigenvec_matrices_for_all_subspaces} =
                                           \@trailing_eigenvec_matrices_for_all_subspaces;
        $self->{_final_reference_vecs_for_all_subspaces} = \@reference_vecs_for_all_subspaces;
    }
    # Wrap the element's coordinates in a GSL column vector and measure its
    # reconstruction error in each of the K cached subspaces:
    my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
    $data_vec->set_col(0, $self->{_data_hash}->{$data_tag});
    my @errors = map {$self->reconstruction_error($data_vec,
                            $self->{_final_trailing_eigenvec_matrices_for_all_subspaces}->[$_],
                            $self->{_final_reference_vecs_for_all_subspaces}->[$_])}
                 0 .. $self->{_K}-1;
    my ($minval, $index_for_closest_subspace) = minimum(\@errors);
    return $index_for_closest_subspace;
}
553 | |||||||
# Merges similar sub-clusters by spectral partitioning of a cluster-similarity
# graph.  Each current cluster is a graph node; edge weights combine (1) how
# well each cluster's data is reconstructed in the other cluster's subspace
# and (2) the distance between the cluster means.  Returns a two-element list
# of array refs: the clusters on one side of the graph cut and those on the
# other (for <= 3 sought clusters, the first element may be a single merged
# final cluster).  Dies when the clusters are too degenerate to partition.
sub graph_partition {
    my $self = shift;
    my $clusters = shift;
    my $how_many_clusters_looking_for = shift;
    print "\n\nGraph partitioning looking for $how_many_clusters_looking_for clusters\n\n";
    my $num_nodes = scalar @$clusters;
    # W: similarity (adjacency) matrix; D: diagonal degree matrix;
    # neg_sqrt_of_D: D^(-1/2), used to symmetrically normalize the Laplacian.
    my $W = Math::GSL::Matrix->new($num_nodes,$num_nodes);
    my $D = Math::GSL::Matrix->new($num_nodes,$num_nodes);
    $D->identity;
    my $neg_sqrt_of_D = Math::GSL::Matrix->new($num_nodes,$num_nodes);
    $neg_sqrt_of_D->identity;
    my @subspace_construction_errors;
    my @trailing_eigenvec_matrices_for_all_subspaces;
    my @reference_vecs_for_all_subspaces;
    # Characterize each cluster by its mean and by the trailing eigenvectors
    # of its covariance (the directions orthogonal to the cluster manifold):
    foreach my $cluster (@$clusters) {
        my ($mean, $covariance) = $self->estimate_mean_and_covariance($cluster);
        my ($eigenvecs, $eigenvals) = $self->eigen_analysis_of_covariance($covariance);
        my @trailing_eigenvecs = @{$eigenvecs}[$self->{_P} .. $self->{_data_dimensions}-1];
        my @trailing_eigenvals = @{$eigenvals}[$self->{_P} .. $self->{_data_dimensions}-1];
        # Sum of magnitudes of the trailing eigenvalues (the abs() calls are
        # harmless on the running sum since the terms are summed magnitudes):
        my $subspace_construction_error = reduce {abs($a) + abs($b)} @trailing_eigenvals;
        push @subspace_construction_errors, $subspace_construction_error;
        my $trailing_eigenvec_matrix = Math::GSL::Matrix->new($self->{_data_dimensions},
                                                              scalar(@trailing_eigenvecs));
        foreach my $j (0..@trailing_eigenvecs-1) {
            print "trailing eigenvec column: @{$trailing_eigenvecs[$j]}\n" if $self->{_debug};
            $trailing_eigenvec_matrix->set_col($j, $trailing_eigenvecs[$j]);
        }
        push @trailing_eigenvec_matrices_for_all_subspaces,$trailing_eigenvec_matrix;
        push @reference_vecs_for_all_subspaces, $mean;
    }
    # We consider the similarity matrix W to be a sum of two parts W_recon_based and
    # W_dist_bet_means based.  IMPORTANT: For coding reasons, we first store the two
    # similarity measures separately in W_recon_based and W_dist_bet_means based.  Our
    # goal is to fill up these component matrices with the raw values while at the
    # same time collecting information needed for normalizing these two separate
    # measures of similarity.
    my $W_reconstruction_error_based = Math::GSL::Matrix->new($num_nodes,$num_nodes);
    my $W_dist_between_means_based = Math::GSL::Matrix->new($num_nodes,$num_nodes);
    my @all_pairwise_reconstruction_errors;
    my @all_dist_between_means_errors;
    foreach my $i (0..$num_nodes-1) {
        foreach my $j (0..$num_nodes-1) {
            my ($recon_error_similarity, $dist_bet_means) = $self->pairwise_cluster_similarity(
                                             $clusters->[$i],
                                             $trailing_eigenvec_matrices_for_all_subspaces[$i],
                                             $reference_vecs_for_all_subspaces[$i],
                                             $clusters->[$j],
                                             $trailing_eigenvec_matrices_for_all_subspaces[$j],
                                             $reference_vecs_for_all_subspaces[$j]);
            $W_reconstruction_error_based->set_elem($i, $j, $recon_error_similarity);
            $W_dist_between_means_based->set_elem($i, $j, $dist_bet_means);
            push @all_pairwise_reconstruction_errors, $recon_error_similarity;
            push @all_dist_between_means_errors, $dist_bet_means;
        }
    }
    # Normalize each raw measure by its average over all node pairs:
    my $recon_error_normalizer = (reduce {$a + $b} @all_pairwise_reconstruction_errors) /
                                                (scalar @all_pairwise_reconstruction_errors);
    my $dist_bet_means_based_normalizer = (reduce {$a + $b} @all_dist_between_means_errors ) /
                                                (scalar @all_dist_between_means_errors );
    die "\n\nBailing out!\n" .
        "Dealing with highly defective clusters. Try again.\n"
        if ($recon_error_normalizer == 0) || ($dist_bet_means_based_normalizer == 0);
    # Map each raw dissimilarity x into a similarity exp(-x / average):
    foreach my $i (0..$num_nodes-1) {
        foreach my $j (0..$num_nodes-1) {
            my $recon_val = $W_reconstruction_error_based->get_elem($i,$j);
            my $new_recon_val = exp( -1.0 * $recon_val / $recon_error_normalizer );
            $W_reconstruction_error_based->set_elem($i,$j,$new_recon_val);
            my $mean_dist_val = $W_dist_between_means_based->get_elem($i,$j);
            my $new_mean_dist_val = exp( -1.0 * $mean_dist_val / $dist_bet_means_based_normalizer );
            $W_dist_between_means_based->set_elem($i,$j,$new_mean_dist_val);
        }
    }
    $W = $W_reconstruction_error_based + $W_dist_between_means_based;
    if ($self->{_debug}) {
        print "\nDisplaying the similarity matrix W for the cluster graph:\n";
        display_matrix($W) if $self->{_terminal_output};
    }
    # Node degrees: the i-th degree is the sum of row i of W (accumulated
    # here column-by-column into a single column vector):
    my $add_all_columns = Math::GSL::Matrix->new($num_nodes,1);
    foreach my $col (0..$num_nodes-1) {
        $add_all_columns += $W->col($col);
    }
    foreach my $i (0..$num_nodes-1) {
        $D->set_elem($i,$i, $add_all_columns->get_elem($i,0));
        $neg_sqrt_of_D->set_elem($i,$i, 1.0 / sqrt($add_all_columns->get_elem($i,0)));
    }
    # the Laplacian matrix:
    my $Laplacian = $D - $W;
    # the Symmetric Normalized Laplacian matrix A:
    my $A = $neg_sqrt_of_D * $Laplacian * $neg_sqrt_of_D;
    # Zero out numerically insignificant entries before eigenanalysis:
    foreach my $i (0..$num_nodes-1) {
        foreach my $j (0..$num_nodes-1) {
            $A->set_elem($i,$j,0) if abs($A->get_elem($i,$j)) < 0.01;
        }
    }
    if ($self->{_terminal_output}) {
        print "\nDisplaying the Symmetric Normalized Laplacian matrix A:\n" .
              "A = neg_sqrt(D) * Laplacian_matrix * neg_sqrt(D)\n";
        display_matrix( $A );
    }
    my ($eigenvalues, $eigenvectors) = $A->eigenpair;
    my $num_of_eigens = @$eigenvalues;
    my $largest_eigen_index = 0;
    my $smallest_eigen_index = 0;
    if ($self->{_debug2}) {
        print "Eigenvalue 0:   $eigenvalues->[0]\n";
        foreach my $i (1..$num_of_eigens-1) {
            $largest_eigen_index = $i if $eigenvalues->[$i] > $eigenvalues->[$largest_eigen_index];
            $smallest_eigen_index = $i if $eigenvalues->[$i] < $eigenvalues->[$smallest_eigen_index];
            print "Eigenvalue $i:   $eigenvalues->[$i]\n";
        }
        print "\nlargest eigen index: $largest_eigen_index\n";
        print "\nsmallest eigen index: $smallest_eigen_index\n\n";
    }
    # GSL returns eigenvector components in polar "[magnitude,angle]" form;
    # decode each into a signed real (angle 0 => +mag, angle pi => -mag).
    my @all_my_eigenvecs;
    foreach my $i (0..$num_of_eigens-1) {
        my @vec = $eigenvectors->[$i]->as_list;
        my @eigenvec;
        foreach my $ele (@vec) {
            my ($mag, $theta) = $ele =~ /\[(\d*\.?\d*e?[+-]?\d*),(\S+)\]/;
            if ($theta eq "0") {
                push @eigenvec, $mag;
            } elsif ($theta eq "pi") {
                push @eigenvec, -1.0 * $mag;
            } else {
                die "Eigendecomposition produced a complex eigenvector!";
            }
        }
        print "Eigenvector $i:   @eigenvec\n" if $self->{_debug2};
        push @all_my_eigenvecs, \@eigenvec;
    }
    if ($self->{_debug2}) {
        my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list;
        print "\nLargest eigenvector of A: @largest_eigen_vec\n";
    }
    # Sort eigenvectors by decreasing eigenvalue:
    my @sorted_eigenvec_indexes = sort {$eigenvalues->[$b] <=> $eigenvalues->[$a]} 0..@all_my_eigenvecs-1;
    print "sorted eigenvec indexes for A: @sorted_eigenvec_indexes\n" if $self->{_debug2};
    my @sorted_eigenvecs;
    my @sorted_eigenvals;
    foreach my $i (0..@sorted_eigenvec_indexes-1) {
        $sorted_eigenvecs[$i] = $all_my_eigenvecs[$sorted_eigenvec_indexes[$i]];
        $sorted_eigenvals[$i] = $eigenvalues->[$sorted_eigenvec_indexes[$i]];
    }
    if ($self->{_debug2}) {
        print "\nHere come sorted eigenvectors for A --- from the largest to the smallest:\n";
        foreach my $i (0..@sorted_eigenvecs-1) {
            print "eigenvec:  @{$sorted_eigenvecs[$i]}       eigenvalue: $sorted_eigenvals[$i]\n";
        }
    }
    # The eigenvector for the second-smallest eigenvalue (a Fiedler-style
    # vector) partitions the graph: positive vs. negative components.
    my $best_partitioning_eigenvec = $sorted_eigenvecs[@sorted_eigenvec_indexes-2];
    print "\nBest graph partitioning eigenvector: @$best_partitioning_eigenvec\n" if $self->{_terminal_output};
    my $how_many_positive = reduce {$a + $b} map {$_ > 0 ? 1 : 0 } @$best_partitioning_eigenvec;
    my $how_many_negative = scalar(@$best_partitioning_eigenvec) - $how_many_positive;
    print "Have $how_many_positive positive and $how_many_negative negative elements in the partitioning vec\n"
        if $self->{_terminal_output};
    if ($how_many_clusters_looking_for <= 3) {
        my @merged_cluster;
        my $final_cluster;
        my @newclusters;
        # A lone positive (or negative) node is already a final cluster; a
        # small group on one side gets merged into one final cluster; the
        # other side is returned for further partitioning.
        if ($how_many_positive == 1) {
            foreach my $i (0..@$clusters-1) {
                if ($best_partitioning_eigenvec->[$i] > 0) {
                    $final_cluster = $clusters->[$i];
                } else {
                    push @newclusters, $clusters->[$i];
                }
            }
            return ([$final_cluster], \@newclusters);
        } elsif ($how_many_negative == 1) {
            foreach my $i (0..@$clusters-1) {
                if ($best_partitioning_eigenvec->[$i] < 0) {
                    $final_cluster = $clusters->[$i];
                } else {
                    push @newclusters, $clusters->[$i];
                }
            }
            return ([$final_cluster], \@newclusters);
        } elsif ($how_many_positive <= $self->{_cluster_search_multiplier}) {
            foreach my $i (0..@$clusters-1) {
                if ($best_partitioning_eigenvec->[$i] > 0) {
                    push @merged_cluster, @{$clusters->[$i]};
                } else {
                    push @newclusters, $clusters->[$i];
                }
            }
            return ([\@merged_cluster], \@newclusters);
        } elsif ($how_many_negative <= $self->{_cluster_search_multiplier}) {
            foreach my $i (0..@$clusters-1) {
                if ($best_partitioning_eigenvec->[$i] < 0) {
                    push @merged_cluster, @{$clusters->[$i]};
                } else {
                    push @newclusters, $clusters->[$i];
                }
            }
            return ([\@merged_cluster], \@newclusters);
        } else {
            die "\n\nBailing out!\n\n" .
                "No consensus support for dominant clusters in the graph partitioning step\n" .
                "of the algorithm. This can be caused by bad random selection of initial\n" .
                "cluster centers.  Please run this program again.\n";
        }
    } else {
        # Looking for many clusters: simply split the nodes by the sign of
        # their component in the partitioning eigenvector.
        my @positive_clusters;
        my @negative_clusters;
        foreach my $i (0..@$clusters-1) {
            if ($best_partitioning_eigenvec->[$i] > 0) {
                push @positive_clusters, $clusters->[$i];
            } else {
                push @negative_clusters, $clusters->[$i];
            }
        }
        return (\@positive_clusters, \@negative_clusters);
    }
}
767 | |||||||
# Computes two raw (unnormalized) dissimilarity measures between two
# clusters, each given by its data tags, its trailing-eigenvector matrix and
# its reference (mean) vector:
#   1. the total reconstruction error obtained when each cluster's points
#      are projected into the *other* cluster's subspace (symmetric sum), and
#   2. the squared Euclidean distance between the two cluster means.
# Returns both as a two-element list; smaller values mean more similar
# clusters.  The caller (graph_partition) normalizes and exponentiates them.
sub pairwise_cluster_similarity {
    my $self = shift;
    my $cluster1 = shift;
    my $trailing_eigenvec_matrix_cluster1 = shift;
    my $reference_vec_cluster1 = shift;
    my $cluster2 = shift;
    my $trailing_eigenvec_matrix_cluster2 = shift;
    my $reference_vec_cluster2 = shift;
    my $total_reconstruction_error_in_this_iteration = 0;   # NOTE(review): unused
    # Error of cluster1's points reconstructed in cluster2's subspace:
    my @errors_for_1_on_2 = map {my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
                                 $data_vec->set_col(0, $self->{_data_hash}->{$_});
                                 $self->reconstruction_error($data_vec,
                                                             $trailing_eigenvec_matrix_cluster2,
                                                             $reference_vec_cluster2)}
                            @$cluster1;
    # And vice versa:
    my @errors_for_2_on_1 = map {my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
                                 $data_vec->set_col(0, $self->{_data_hash}->{$_});
                                 $self->reconstruction_error($data_vec,
                                                             $trailing_eigenvec_matrix_cluster1,
                                                             $reference_vec_cluster1)}
                            @$cluster2;
    # The abs() calls are harmless: reconstruction errors are squared
    # distances and therefore non-negative.
    my $type_1_error = reduce {abs($a) + abs($b)} @errors_for_1_on_2;
    my $type_2_error = reduce {abs($a) + abs($b)} @errors_for_2_on_1;
    my $total_reconstruction_error = $type_1_error + $type_2_error;
    # Squared distance between the means: d' * d is a 1x1 matrix.
    my $diff_between_the_means = $reference_vec_cluster1 - $reference_vec_cluster2;
    my $dist_squared = transpose($diff_between_the_means) * $diff_between_the_means;
    my @dist_squared_as_list = $dist_squared->as_list();
    my $dist_between_means_based_error = shift @dist_squared_as_list;
    return ($total_reconstruction_error, $dist_between_means_based_error);
}
798 | |||||||
# The next method tests cluster unimodality with a "delta ball": along each
# dimension it checks which cluster members fall within a band around the
# cluster mean, and measures what fraction of the cluster survives all bands.
# Returns the fraction of a cluster's data elements that lie, along every
# dimension simultaneously, within a "delta ball" band centered on the
# cluster mean.  The band half-width along a dimension is $delta times that
# dimension's data range within the cluster.  A quotient near 1 suggests a
# unimodal cluster.
sub cluster_unimodality_quotient {
    my ($self, $cluster, $mean) = @_;
    # Half-width of the delta ball, as a fraction of each dimension's range:
    my $delta = 0.4 * $self->{_scale_factor};
    my @mean_coords = $mean->as_list;
    my @per_dimension_tag_sets;
    for my $dimen (0 .. $self->{_data_dimensions} - 1) {
        my @values = map { $self->{_data_hash}->{$_}->[$dimen] } @$cluster;
        my $range = List::Util::max(@values) - List::Util::min(@values);
        my $center = $mean_coords[$dimen];
        my ($lo, $hi) = ($center - $delta * $range, $center + $delta * $range);
        # Set of tags whose coordinate along this dimension is inside the band:
        my %in_band = map { $_ => 1 }
                      grep { my $v = $self->{_data_hash}->{$_}->[$dimen];
                             ($v > $lo) && ($v < $hi) } @$cluster;
        push @per_dimension_tag_sets, \%in_band;
    }
    # A tag survives only if it is inside the band along every dimension:
    my @surviving_tags = keys %{ $per_dimension_tag_sets[0] };
    for my $dimen (1 .. $#per_dimension_tag_sets) {
        my $this_set = $per_dimension_tag_sets[$dimen];
        @surviving_tags = grep { $this_set->{$_} } @surviving_tags;
    }
    return scalar(@surviving_tags) / scalar(@$cluster);
}
835 | |||||||
# From among the cluster's own mean and 100 randomly generated candidate
# points inside the cluster's bounding box, returns (as an array ref of
# coordinates) the reference vector that yields the smallest total
# reconstruction error over all the data.
#
# Bugs fixed: the original used List::Util::first() to locate the smallest
# error, which returns the matching *value* rather than its positional
# index, and then used that value to index @random_points; it also ignored
# that the cluster mean occupies slot 0 of @random_points, and computed the
# mean's own error only to discard it.  We now use this module's minimum()
# helper, which returns (value, index), and let the mean compete with the
# random candidates on an equal footing.  The bounding-box prints are now
# guarded by the _debug flag like the other diagnostics here.
sub find_best_ref_vector {
    my $self = shift;
    my $cluster = shift;                 # array ref of data tags
    my $trailing_eigenvec_matrix = shift;
    my $mean = shift;                    # a Math::GSL::Matrix column vector
    my @min_bounds;
    my @max_bounds;
    my @ranges;
    # Per-dimension bounding box of the cluster:
    foreach my $dimen (0..$self->{_data_dimensions}-1) {
        my @values = map {$_->[$dimen]} map {$self->{_data_hash}->{$_}} @$cluster;
        my ($min, $max) = (List::Util::min(@values), List::Util::max(@values));
        push @min_bounds, $min;
        push @max_bounds, $max;
        push @ranges, $max - $min;
    }
    if ($self->{_debug}) {
        print "min bounds are: @min_bounds\n";
        print "max bounds are: @max_bounds\n";
    }
    # Generate the random candidate reference vectors:
    my $max_iterations = 100;
    my @random_points;
    my $iteration = 0;
    while ($iteration++ < $max_iterations) {
        my @coordinate_vec;
        foreach my $dimen (0..$self->{_data_dimensions}-1) {
            push @coordinate_vec, $min_bounds[$dimen] + rand($ranges[$dimen]);
        }
        push @random_points, \@coordinate_vec;
    }
    if ($self->{_debug}) {
        print "\nrandom points\n";
        print "@$_\n" for @random_points;
    }
    # The cluster mean competes as candidate 0:
    my @mean = $mean->as_list;
    unshift @random_points, \@mean;
    # Total reconstruction error over ALL data for each candidate:
    my @reconstruction_errors;
    foreach my $candidate_ref_vec (@random_points) {
        my $ref_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
        $ref_vec->set_col(0, $candidate_ref_vec);
        my $reconstruction_error_for_a_ref_vec = 0;
        foreach my $data_tag (@{$self->{_data_tags}}) {
            my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
            $data_vec->set_col(0, $self->{_data_hash}->{$data_tag});
            $reconstruction_error_for_a_ref_vec +=
                $self->reconstruction_error($data_vec,$trailing_eigenvec_matrix,$ref_vec);
        }
        push @reconstruction_errors, $reconstruction_error_for_a_ref_vec;
    }
    my ($smallest_error, $best_index) = minimum(\@reconstruction_errors);
    return $random_points[$best_index];
}
889 | |||||||
890 | ## The reconstruction error relates to the size of the perpendicular from a data | ||||||
891 | ## point X to the hyperplane that defines a given subspace on the manifold. | ||||||
# Squared length of the perpendicular from a data point to the hyperplane
# that defines a given subspace on the manifold, computed as
# (X - ref)' * E * E' * (X - ref) with E the trailing-eigenvector matrix.
# All three arguments are Math::GSL::Matrix refs; returns a plain scalar.
sub reconstruction_error {
    my ($self, $data_vec, $trailing_eigenvecs, $ref_vec) = @_;
    my $displacement = $data_vec - $ref_vec;
    # The product below is a 1x1 matrix; pull out its sole entry.
    my $error_matrix = transpose($displacement) * $trailing_eigenvecs *
                       transpose($trailing_eigenvecs) * $displacement;
    my ($squared_error) = $error_matrix->as_list();
    return $squared_error;
}
903 | |||||||
904 | # Returns a set of KM random integers. These serve as indices to reach into the data | ||||||
905 | # array. A data element whose index is one of the random numbers returned by this | ||||||
906 | # routine serves as an initial cluster center. Note the quality check it runs on the | ||||||
907 | # list of the random integers constructed. We first make sure that all the random | ||||||
908 | # integers returned are different. Subsequently, we carry out a quality assessment | ||||||
909 | # of the random integers constructed. This quality measure consists of the ratio of | ||||||
910 | # the values spanned by the random integers to the value of N, the total number of | ||||||
911 | # data points to be clustered. Currently, if this ratio is less than 0.3, we discard | ||||||
912 | # the K integers and try again. | ||||||
# Returns a set of K distinct random integers in [0, data_store_size).  They
# serve as indices into the data array; the elements at those indices become
# the initial cluster centers.  A quality check requires that the chosen
# indices span more than 30% of the dataset; otherwise they are discarded
# and drawn again.
#
# Bugs fixed: the original pairwise re-draw loop could leave duplicate
# indices behind (a re-drawn index i was only re-checked against one earlier
# index at a time); K == 1 looped forever because its span is always zero;
# and K > data_store_size also looped forever.  We now draw through a hash,
# which guarantees distinctness, skip the span check for K == 1, and die
# early on an impossible request.
sub initialize_cluster_centers {
    my $self = shift;
    my $K = shift;   # This value is set to the parameter KM in the call to this subroutine
    my $data_store_size = shift;
    die "cannot choose $K distinct cluster centers from $data_store_size data points\n"
        if $K > $data_store_size;
    while (1) {
        # Draw until we have K distinct indices:
        my %chosen;
        while (keys %chosen < $K) {
            $chosen{ int rand($data_store_size) } = 1;
        }
        my @cluster_center_indices = keys %chosen;
        # Span check is meaningless for a single center:
        return @cluster_center_indices if $K == 1;
        my ($min, $max) = (List::Util::min(@cluster_center_indices),
                           List::Util::max(@cluster_center_indices));
        return @cluster_center_indices if ($max - $min) / $data_store_size > 0.3;
    }
}
935 | |||||||
936 | # The purpose of this routine is to form initial clusters by assigning the data | ||||||
937 | # samples to the initial clusters formed by the previous routine on the basis of the | ||||||
938 | # best proximity of the data samples to the different cluster centers. | ||||||
# Forms the initial clusters: every data sample is assigned to the cluster
# center it is closest to in Euclidean distance.  Takes an array ref of
# center coordinate vectors; returns an array ref of clusters, each a list
# of data tags.  (A center that attracts no samples leaves an undef slot.)
sub assign_data_to_clusters_initial {
    my ($self, $centers_ref) = @_;
    my @clusters;
    foreach my $tag (@{$self->{_data_tags}}) {
        # Distance from this sample to every cluster center:
        my @distances = map { $self->distance($tag, $_) } @$centers_ref;
        my (undef, $closest_center_index) = minimum(\@distances);
        push @{$clusters[$closest_center_index]}, $tag;
    }
    return \@clusters;
}
954 | |||||||
955 | # The following routine is for computing the distance between a data point specified | ||||||
956 | # by its symbolic name in the master datafile and a point (such as the center of a | ||||||
957 | # cluster) expressed as a vector of coordinates: | ||||||
# Euclidean distance between a data point named by its symbolic tag in the
# master datafile and an explicit coordinate vector (such as a cluster
# center) given as an array ref.  Dies when the dimensionalities disagree.
sub distance {
    my ($self, $tag, $point_ref) = @_;
    my @point1 = @{$self->{_data_hash}->{$tag}};
    my @point2 = @$point_ref;
    die "wrong data types for distance calculation\n" if @point1 != @point2;
    my $sum_of_squares = 0;
    $sum_of_squares += ($point1[$_] - $point2[$_]) ** 2 for 0 .. $#point1;
    return sqrt $sum_of_squares;
}
972 | |||||||
# Writes each cluster to its own file --- cluster0.txt, cluster1.txt, ... ---
# with the cluster's data tags sorted and space-separated on one line.  Any
# cluster files left over from a previous run are removed first.
#
# Bug fixed: the original piped a single newline-less line through the
# external `sort' utility; sort(1) orders *lines*, so the tags were never
# actually sorted.  We now sort in Perl and write the file directly, using a
# lexical filehandle and three-arg open instead of a bareword handle.
sub write_clusters_to_files {
    my $self = shift;
    my $clusters = shift;
    unlink glob "cluster*.txt";
    foreach my $i (0 .. $#{$clusters}) {
        my $filename = "cluster" . $i . ".txt";
        print "\nWriting cluster $i to file $filename\n" if $self->{_terminal_output};
        open my $fh, '>', $filename or die "Unable to open file: $!";
        print {$fh} "$_ " for sort @{$clusters->[$i]};
        print {$fh} "\n" if @{$clusters->[$i]};
        close $fh or die "Unable to close file: $!";
    }
}
988 | |||||||
# Destructor: removes the "__temp_<datafile-basename>.txt" scratch file
# that visualize_clusters_on_sphere() writes into the current directory.
sub DESTROY {
    my $filename = basename($_[0]->{_datafile});
    $filename =~ s/\.\w+$/\.txt/;
    unlink "__temp_" . $filename;
}
994 | |||||||
995 | ################################## Visualization Code ################################### | ||||||
996 | |||||||
# Returns (as an array ref) the component-wise sum of the coordinate
# vectors of the data elements named in the given array ref of tags.
sub add_point_coords {
    my ($self, $ids_ref) = @_;
    my $dimensionality = $self->{_data_dimensions};
    my @sum = (0.0) x $dimensionality;
    foreach my $id (@$ids_ref) {
        my $coords = $self->{_data_hash}->{$id};
        my $axis = 0;
        $sum[$axis++] += $_ for @$coords;
    }
    return \@sum;
}
1015 | |||||||
1016 | # This is the main module version: | ||||||
# Renders the given clusters as labeled point sets on (or around) a sphere
# using gnuplot.  Each cluster's data tags are re-labeled A, B, C, ... and
# the point coordinates are written to a temporary file that gnuplot reads;
# the sphere itself is drawn parametrically, with radius equal to the data
# scale factor when one has been computed.  If $hardcopy_format is given,
# the plot is also written to "<msg>.<format>" (always via the png
# terminal).  If $pause_time is given, the plot window closes after that
# many seconds; otherwise it persists.
sub visualize_clusters_on_sphere {
    my $self = shift;
    my $visualization_msg = shift;
    my $clusters = deep_copy_AoA(shift);   # copied: tags are rewritten below
    my $hardcopy_format = shift;
    my $pause_time = shift;
    my $d = $self->{_data_dimensions};
    my $temp_file = "__temp_" . $self->{_datafile};
    $temp_file =~ s/\.\w+$/\.txt/;
    unlink $temp_file if -e $temp_file;
    open OUTPUT, ">$temp_file"
           or die "Unable to open a temp file in this directory: $!";
    # Rewrite each tag "<old>_<num>" as "<label>_<num> <coords...>", where
    # <label> is a fresh single letter per cluster:
    my @all_tags = "A".."Z";
    my @retagged_clusters;
    foreach my $cluster (@$clusters) {
        my $label = shift @all_tags;
        my @retagged_cluster =
           map {$_ =~ s/^(\w+?)_(\w+)/$label . "_$2 @{$self->{_data_hash}->{$_}}"/e;$_} @$cluster;
        push @retagged_clusters, \@retagged_cluster;
    }
    # Group the coordinate records by their new single-letter cluster label:
    my %clusters;
    foreach my $cluster (@retagged_clusters) {
        foreach my $record (@$cluster) {
            my @splits = grep $_, split /\s+/, $record;
            $splits[0] =~ /(\w+?)_.*/;
            my $primary_cluster_label = $1;
            my @coords = @splits[1..$d];
            push @{$clusters{$primary_cluster_label}}, \@coords;
        }
    }
    # One blank-line-separated gnuplot "index" block per cluster:
    foreach my $key (sort {"\L$a" cmp "\L$b"} keys %clusters) {
        map {print OUTPUT "$_"} map {"@$_\n"} @{$clusters{$key}};
        print OUTPUT "\n\n";
    }
    my @sorted_cluster_keys = sort {"\L$a" cmp "\L$b"} keys %clusters;
    close OUTPUT;
    my $plot;
    unless (defined $pause_time) {
        $plot = Graphics::GnuplotIF->new( persist => 1 );
    } else {
        $plot = Graphics::GnuplotIF->new();
    }
    my $arg_string = "";
    $plot->gnuplot_cmd( "set hidden3d" ) unless $self->{_show_hidden_in_3D_plots};
    $plot->gnuplot_cmd( "set title \"$visualization_msg\"" );
    $plot->gnuplot_cmd( "set noclip" );
    $plot->gnuplot_cmd( "set pointsize 2" );
    $plot->gnuplot_cmd( "set parametric" );
    $plot->gnuplot_cmd( "set size ratio 1" );
    $plot->gnuplot_cmd( "set xlabel \"X\"" );
    $plot->gnuplot_cmd( "set ylabel \"Y\"" );
    $plot->gnuplot_cmd( "set zlabel \"Z\"" );
    if ($hardcopy_format) {
        # NOTE(review): terminal is always png regardless of the requested
        # format; only the output file's extension changes --- confirm intent.
        $plot->gnuplot_cmd( "set terminal png" );
        my $image_file_name = "$visualization_msg\.$hardcopy_format";
        $plot->gnuplot_cmd( "set output \"$image_file_name\"" );
        $plot->gnuplot_cmd( "unset hidden3d" );
    }
    # set the range for azimuth angles:
    $plot->gnuplot_cmd( "set urange [0:2*pi]" );
    # set the range for the elevation angles:
    $plot->gnuplot_cmd( "set vrange [-pi/2:pi/2]" );
    # Parametric functions for the sphere
    # $plot->gnuplot_cmd( "r=1" );
    if ($self->{_scale_factor}) {
        $plot->gnuplot_cmd( "r=$self->{_scale_factor}" );
    } else {
        $plot->gnuplot_cmd( "r=1" );
    }
    $plot->gnuplot_cmd( "fx(v,u) = r*cos(v)*cos(u)" );
    $plot->gnuplot_cmd( "fy(v,u) = r*cos(v)*sin(u)" );
    $plot->gnuplot_cmd( "fz(v) = r*sin(v)" );
    my $sphere_arg_str = "fx(v,u),fy(v,u),fz(v) notitle with lines lt 0,";
    foreach my $i (0..scalar(keys %clusters)-1) {
        my $j = $i + 1;
        # The following statement puts the titles on the data points
        $arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"$sorted_cluster_keys[$i] \" with points lt $j pt $j, ";
    }
    # Strip the trailing ", ": the match populates $1, which is then
    # assigned over the boolean match result placed in $arg_string.
    $arg_string = $arg_string =~ /^(.*),[ ]+$/;
    $arg_string = $1;
    $plot->gnuplot_cmd( "splot $sphere_arg_str $arg_string" );
    $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
}
1100 | |||||||
1101 | |||||||
1102 | ################################### Support Routines ######################################## | ||||||
1103 | |||||||
# Splits a cluster (an array ref of data tags) into $how_many contiguous
# fragments of near-equal size and returns them as a list of array refs.
# The remainder, when the cluster size is not an exact multiple of
# $how_many, spills into the last fragment.
#
# Bugs fixed: the original indexed fragments with int($i/$delta) without
# clamping, so a non-exact division silently produced an extra
# (how_many+1)-th fragment; it also divided by zero when the cluster had
# fewer elements than $how_many, and declared an unused counter $j.
sub cluster_split {
    my $cluster = shift;
    my $how_many = shift;
    die "cannot split a cluster into $how_many fragments" if $how_many < 1;
    my @cluster_fragments = map { [] } 0 .. $how_many - 1;
    # Nominal fragment size, never below 1:
    my $delta = int( scalar(@$cluster) / $how_many );
    $delta = 1 if $delta < 1;
    foreach my $i (0..@$cluster-1) {
        my $frag_index = int($i / $delta);
        # Clamp so that the remainder goes into the last fragment:
        $frag_index = $how_many - 1 if $frag_index > $how_many - 1;
        push @{$cluster_fragments[$frag_index]}, $cluster->[$i];
    }
    my $how_many_accounted_for = reduce {$a + $b} map {scalar(@$_)} @cluster_fragments;
    foreach my $frag (@cluster_fragments) {
        print "fragment: @$frag\n";
    }
    die "the fragmentation could not account for all the data"
        unless @$cluster == $how_many_accounted_for;
    return @cluster_fragments;
}
1124 | |||||||
1125 | # Returns the minimum value and its positional index in an array | ||||||
# Returns the smallest value in the given array ref together with its
# positional index, as the two-element list (value, index).
sub minimum {
    my $arr = shift;
    my ($smallest, $where);
    my $pos = 0;
    for my $val (@$arr) {
        ($smallest, $where) = ($val, $pos) if !defined($smallest) || $val < $smallest;
        $pos++;
    }
    return ($smallest, $where);
}
1138 | |||||||
# Returns the smallest and the largest value in the given array ref as the
# two-element list (min, max), silently skipping undefined entries.
sub minmax {
    my $arr = shift;
    my ($lo, $hi);
    for my $val (@$arr) {
        next unless defined $val;
        if (!defined $lo) {
            # First defined entry seeds both bounds:
            ($lo, $hi) = ($val, $val);
        } elsif ($val < $lo) {
            $lo = $val;
        } elsif ($val > $hi) {
            $hi = $val;
        }
    }
    return ($lo, $hi);
}
1156 | |||||||
# Meant only for constructing a deep copy of an array of arrays.
# Fixed so that an empty outer array copies to [] (the original returned
# undef) and an empty inner array copies to [] (the original left that
# slot undef because its element-by-element loop never executed).
sub deep_copy_AoA {
    my $ref_in  = shift;
    my $ref_out = [];
    foreach my $i (0 .. @{$ref_in} - 1) {
        # Fresh anonymous array per row so the copy shares no storage
        # with the original.
        $ref_out->[$i] = [ @{ $ref_in->[$i] } ];
    }
    return $ref_out;
}
1168 | |||||||
# For displaying the individual clusters on a terminal screen.  Each
# cluster is displayed through the symbolic names associated with its
# data points.  NOTE: sorts each cluster in place as a side effect.
sub display_clusters {
    my $clusters = shift;
    my $cluster_index = 0;
    for my $cluster (@$clusters) {
        @$cluster = sort @$cluster;
        my $num_records = scalar @$cluster;
        print "\n\nCluster $cluster_index ($num_records records):\n";
        print " $_" for @$cluster;
        $cluster_index++;
    }
    print "\n\n";
}
1185 | |||||||
# Prints the mean vector and the covariance matrix of a cluster, each
# via display_matrix().
sub display_mean_and_covariance {
    my ($mean, $covariance) = @_;
    print "\nDisplay the mean:\n";
    display_matrix($mean);
    print "\nDisplay the covariance:\n";
    display_matrix($covariance);
}
1194 | |||||||
# Returns 1 when every keyword in @params is a recognized constructor
# option, 0 as soon as an unrecognized keyword is found.
#
# Fixed: the original left its flag undef for an empty @params list, and
# since undef == 0 is true the caller's `== 0' check then croaked on a
# perfectly legal (if empty) argument list.  An empty list is trivially
# legal.  The convoluted reset-flag-inside-the-inner-loop scan is replaced
# with List::Util::any, which the file already imports.
sub check_for_illegal_params {
    my @params = @_;
    my @legal_params = qw / datafile
                            mask
                            K
                            P
                            terminal_output
                            cluster_search_multiplier
                            max_iterations
                            delta_reconstruction_error
                            visualize_each_iteration
                            show_hidden_in_3D_plots
                            make_png_for_each_iteration
                            debug
                          /;
    foreach my $param (@params) {
        return 0 unless any { $param eq $_ } @legal_params;
    }
    return 1;
}
1223 | |||||||
# Pretty-prints a Math::GSL::Matrix, one row per line, each entry with
# four decimal places.  The original used map in void context purely for
# its printing side effect; a plain loop states the intent.
sub display_matrix {
    my $matrix = shift;
    my $nrows  = $matrix->rows();
    my $ncols  = $matrix->cols();
    print "\nDisplaying a matrix of size $nrows rows and $ncols columns:\n";
    foreach my $i (0 .. $nrows - 1) {
        my @row_as_list = $matrix->row($i)->as_list;
        printf("%.4f ", $_) foreach @row_as_list;
        print "\n";
    }
    print "\n\n";
}
1238 | |||||||
# Returns a new Math::GSL::Matrix that is the transpose of the argument:
# row i of the input becomes column i of the output.
sub transpose {
    my $matrix = shift;
    my ($nrows, $ncols) = ($matrix->rows(), $matrix->cols());
    my $result = Math::GSL::Matrix->new($ncols, $nrows);
    for my $row_index (0 .. $nrows - 1) {
        my @row_values = $matrix->row($row_index)->as_list;
        $result->set_col($row_index, \@row_values);
    }
    return $result;
}
1250 | |||||||
# Element-wise difference of two equal-length array refs ($vec1 - $vec2),
# returned as a list.  Dies when the lengths differ.
sub vector_subtract {
    my ($vec1, $vec2) = @_;
    die "wrong data types for vector subtract calculation\n"
        unless @$vec1 == @$vec2;
    return map { $vec1->[$_] - $vec2->[$_] } 0 .. $#$vec1;
}
1261 | |||||||
# In-place random permutation of the array (Fisher-Yates, from the Perl
# FAQ).  Fixed: for an empty array the original's `my $i = @$arr;
# while (--$i)' decremented to -1 (truthy) and looped effectively
# forever through negative indices; the counted loop below terminates
# immediately for arrays of size 0 or 1.
sub fisher_yates_shuffle {
    my $arr = shift;
    for (my $i = $#$arr; $i > 0; $i--) {
        my $j = int rand($i + 1);
        @$arr[$i, $j] = @$arr[$j, $i];
    }
}
1271 | |||||||
1272 | ######################### Generating Synthetic Data for Manifold Clustering ########################## | ||||||
1273 | |||||||
1274 | ################################## Class DataGenerator ######################################## | ||||||
1275 | |||||||
1276 | ## The embedded class defined below is for generating synthetic data for | ||||||
1277 | ## experimenting with linear manifold clustering when the data resides on the | ||||||
1278 | ## surface of a sphere. See the script generate_data_on_a_sphere.pl in the | ||||||
1279 | ## `examples' directory for how to specify the number of clusters and the spread of | ||||||
1280 | ## each cluster in the data that is generated. | ||||||
1281 | |||||||
1282 | package DataGenerator; | ||||||
1283 | |||||||
1284 | use strict; | ||||||
1285 | use Carp; | ||||||
1286 | |||||||
# Constructor for the synthetic-data generator.
#
# Fixed: show_hidden_in_3D_plots and debug now use the defined-or
# operator (//, available since the file's `use 5.10.0') instead of ||.
# With ||, an explicit show_hidden_in_3D_plots => 0 -- exactly what the
# module's own SYNOPSIS demonstrates -- was silently turned back into
# the default of 1.  The || guards on the count/width options are kept,
# since 0 is not a usable value for either.
sub new {
    my ($class, %args) = @_;
    my @params = keys %args;
    croak "\nYou have used a wrong name for a keyword argument " .
          "--- perhaps a misspelling\n"
          if _check_for_illegal_params3(@params) == 0;
    bless {
        _output_file                    => $args{output_file}
                           || croak("name for output_file required"),
        _total_number_of_samples_needed => $args{total_number_of_samples_needed}
                           || croak("total_number_of_samples_needed required"),
        _number_of_clusters_on_sphere   => $args{number_of_clusters_on_sphere} || 3,
        _cluster_width                  => $args{cluster_width} || 0.1,
        _show_hidden_in_3D_plots        => $args{show_hidden_in_3D_plots} // 1,
        _debug                          => $args{debug} // 0,
    }, $class;
}
1304 | |||||||
# Validates the DataGenerator constructor keywords.  Returns 1 when all
# of @params are recognized, 0 otherwise.
#
# Fixes two defects in the original: (a) `debug' was missing from the
# legal list even though the constructor reads $args{debug}, so passing
# debug => 1 croaked; (b) an empty @params left the flag undef, which
# compares equal to 0 and caused a spurious croak.
sub _check_for_illegal_params3 {
    my @params = @_;
    my @legal_params = qw / output_file
                            total_number_of_samples_needed
                            number_of_clusters_on_sphere
                            cluster_width
                            show_hidden_in_3D_plots
                            debug
                          /;
    foreach my $param (@params) {
        my $is_legal = 0;
        foreach my $legal (@legal_params) {
            if ($param eq $legal) {
                $is_legal = 1;
                last;
            }
        }
        return 0 unless $is_legal;
    }
    return 1;
}
1326 | |||||||
1327 | ## We first generate a set of points randomly on the unit sphere --- the number of | ||||||
1328 | ## points being equal to the number of clusters desired. These points will serve as | ||||||
1329 | ## cluster means (or, as cluster centroids) subsequently when we ask | ||||||
1330 | ## Math::Random::random_multivariate_normal($N, @m, @covar) to return $N number of | ||||||
1331 | ## points on the sphere. The second argument is the cluster mean and the third | ||||||
1332 | ## argument the cluster covariance. For the synthetic data, we set the cluster | ||||||
1333 | ## covariance to a 2x2 diagonal matrix, with the (0,0) element corresponding to the | ||||||
1334 | ## variance along the azimuth direction and the (1,1) element corresponding to the | ||||||
1335 | ## variance along the elevation direction. | ||||||
1336 | ## | ||||||
1337 | ## When you generate the points in the 2D spherical coordinates of | ||||||
1338 | ## (azimuth,elevation), you also need `wrap-around' logic for those points yielded by | ||||||
1339 | ## the multivariate-normal function whose azimuth angle is outside the interval | ||||||
1340 | ## (0,360) and/or whose elevation angle is outside the interval (-90,90). | ||||||
1341 | ## | ||||||
1342 | ## Note that the first of the two dimensions for which the multivariate-normal | ||||||
1343 | ## function returns the points is for the azimuth angle and the second for the | ||||||
1344 | ## elevation angle. | ||||||
1345 | ## | ||||||
1346 | ## With regard to the relationship of the Cartesian coordinates to the spherical | ||||||
1347 | ## (azimuth, elevation) coordinates, we assume that (x,y) is the horizontal plane | ||||||
1348 | ## and z the vertical axis. The elevation angle theta is measure with respect to | ||||||
1349 | ## the XY-plane. The highest point on the sphere (the Zenith) corresponds to the | ||||||
1350 | ## elevation angle of +90 and the lowest points on the sphere (the Nadir) | ||||||
1351 | ## corresponds to the elevation angle of -90. The azimuth is measured with respect | ||||||
1352 | ## X-axis. The range of the azimuth is from 0 to 360 degrees. The elevation is | ||||||
1353 | ## measured from the XY plane and its range is (-90,90) degrees. | ||||||
# Generates $K Gaussian clusters of points on the unit sphere and writes
# them to the CSV file named in the constructor, one record per line:
# "<label>,<x>,<y>,<z>".  Cluster centers are drawn uniformly at random
# in (azimuth, elevation); per-cluster samples come from
# Math::Random::random_multivariate_normal with a diagonal covariance
# scaled by the constructor's cluster_width.
sub gen_data_and_write_to_csv {
    my $self = shift;
    my $K = $self->{_number_of_clusters_on_sphere};
    # $N is the number of samples to be generated for each cluster:
    my $N = int($self->{_total_number_of_samples_needed} / $K);
    my $output_file = $self->{_output_file};
    # Designated all of the data elements in a cluster by a letter that is followed by
    # an integer that identifies a specific data element.
    # NOTE(review): with more than 26 clusters @point_labels runs out and
    # $label becomes undef -- confirm K <= 26 is an intended limit.
    my @point_labels = ('a'..'z');
    # Our first job is to define $K random points in the 2D space (azimuth,
    # elevation) to serve as cluster centers on the surface of the sphere. This we
    # do by calling a uniformly distributed 1-D random number generator, first for
    # the azimuth and then for the elevation in the loop shown below:
    my @cluster_centers;
    my @covariances;
    foreach my $i (0..$K-1) {
        my $azimuth = rand(360);
        # NOTE(review): rand(90) - 90 yields elevations only in (-90, 0),
        # i.e. cluster centers only on the southern hemisphere, although
        # the header comment above says the elevation range is (-90, 90)
        # -- confirm whether rand(180) - 90 was intended.
        my $elevation = rand(90) - 90;
        my @mean = ($azimuth, $elevation);
        push @cluster_centers, \@mean;
        my $cluster_covariance;
        # The j-th dimension is for azimuth and k-th for elevation for the directions
        # to surface of the sphere:
        foreach my $j (0..1) {
            foreach my $k (0..1) {
                # Diagonal covariance: azimuth variance scaled to the full
                # 360-degree range, elevation variance to the 180-degree range.
                $cluster_covariance->[$j]->[$k] = ($self->{_cluster_width} * 360.0) ** 2
                    if $j == 0 && $k == 0;
                $cluster_covariance->[$j]->[$k] = ($self->{_cluster_width} * 180.0) ** 2
                    if $j == 1 && $k == 1;
                $cluster_covariance->[$j]->[$k] = 0.0 if $j != $k;
            }
        }
        push @covariances, $cluster_covariance;
    }
    if ($self->{_debug}) {
        foreach my $i (0..$K-1) {
            print "\n\nCluster center: @{$cluster_centers[$i]}\n";
            print "\nCovariance:\n";
            foreach my $j (0..1) {
                foreach my $k (0..1) {
                    print "$covariances[$i]->[$j]->[$k] ";
                }
                print "\n";
            }
        }
    }
    my @data_dump;
    foreach my $i (0..$K-1) {
        my @m = @{shift @cluster_centers};
        my @covar = @{shift @covariances};
        my @new_data = Math::Random::random_multivariate_normal($N, @m, @covar);
        if ($self->{_debug}) {
            print "\nThe points for cluster $i:\n";
            map { print "@$_ "; } @new_data;
            print "\n\n";
        }
        # Wrap-around logic: fold azimuths back into [0, 360) and reflect
        # elevations that overshoot the poles back into (-90, 90), flipping
        # the azimuth by 180 degrees when crossing a pole.
        my @wrapped_data;
        foreach my $d (@new_data) {
            my $wrapped_d;
            if ($d->[0] >= 360.0) {
                $wrapped_d->[0] = $d->[0] - 360.0;
            } elsif ($d->[0] < 0) {
                $wrapped_d->[0] = 360.0 - abs($d->[0]);
            }
            if ($d->[1] >= 90.0) {
                $wrapped_d->[0] = POSIX::fmod($d->[0] + 180.0, 360);
                $wrapped_d->[1] = 180.0 - $d->[1];
            } elsif ($d->[1] < -90.0) {
                $wrapped_d->[0] = POSIX::fmod($d->[0] + 180, 360);
                $wrapped_d->[1] = -180.0 - $d->[1];
            }
            # Dimensions already in range pass through unchanged.
            $wrapped_d->[0] = $d->[0] unless defined $wrapped_d->[0];
            $wrapped_d->[1] = $d->[1] unless defined $wrapped_d->[1];
            push @wrapped_data, $wrapped_d;
        }
        if ($self->{_debug}) {
            print "\nThe unwrapped points for cluster $i:\n";
            map { print "@$_ "; } @wrapped_data;
            print "\n\n";
        }
        # Prefix each record with a label like "a_0", "a_1", ... so the true
        # cluster membership survives into the output file.
        my $label = $point_labels[$i];
        my $j = 0;
        @new_data = map {unshift @$_, $label."_".$j; $j++; $_} @wrapped_data;
        push @data_dump, @new_data;
    }
    if ($self->{_debug}) {
        print "\n\nThe labeled points for clusters:\n";
        map { print "@$_\n"; } @data_dump;
    }
    # Randomize record order so the file does not present clusters contiguously.
    fisher_yates_shuffle( \@data_dump );
    # NOTE(review): bareword handle, two-arg open, no error check --
    # prefer: open my $fh, '>', $output_file or die "...: $!";
    open OUTPUT, ">$output_file";
    my $total_num_of_points = $N * $K;
    print "Total number of data points that will be written out to the file: $total_num_of_points\n"
        if $self->{_debug};
    foreach my $ele (@data_dump) {
        my ($x,$y,$z);
        my $label = $ele->[0];
        my $azimuth = $ele->[1];
        my $elevation = $ele->[2];
        # NOTE(review): $azimuth and $elevation are in DEGREES here, but
        # Perl's cos()/sin() take RADIANS.  The points still land on the
        # unit sphere (cos^2 + sin^2 = 1 regardless), but the degree
        # values wrap around the circle many times, scrambling the
        # angular layout of the clusters -- confirm whether a
        # degrees-to-radians conversion was intended.
        $x = cos($elevation) * cos($azimuth);
        $y = cos($elevation) * sin($azimuth);
        $z = sin($elevation);
        my $csv_str = join ",", ($label,$x,$y,$z);
        print OUTPUT "$csv_str\n";
    }
    print "\n\n";
    print "Data written out to file $output_file\n" if $self->{_debug};
    close OUTPUT;
}
1463 | |||||||
# This version for the embedded class for data generation.  Reads the
# generated CSV file, groups the records by their cluster label, writes
# them to a temp file with blank-line separators (one gnuplot "index"
# per cluster), and plots them on a wireframe unit sphere.
#
# Fixes: the slurp expression had lost its <INPUT> operand
# (`split /\s+/, ;' does not compile); the bareword two-arg opens are
# replaced with checked three-arg lexical opens; the trailing-comma
# strip now uses s/// instead of an m// whose failure would have left a
# stale $1 in $arg_string.
sub visualize_data_on_sphere {
    my $self = shift;
    my $datafile = shift;
    my $filename = File::Basename::basename($datafile);
    my $temp_file = "__temp_" . $filename;
    $temp_file =~ s/\.\w+$/\.txt/;
    unlink $temp_file if -e $temp_file;
    open my $out, '>', $temp_file
        or die "Unable to open a temp file in this directory: $!";
    # NOTE(review): this opens the basename, not $datafile itself, so a
    # path-qualified datafile must live in the current directory --
    # preserved from the original; confirm intent.
    open my $in, '<', $filename or die "Unable to open $filename: $!";
    my $slurped = do { local $/; <$in> };
    close $in;
    my @all_records = split /\s+/, $slurped;
    my %clusters;
    foreach my $record (@all_records) {
        my @splits = split /,/, $record;
        my $record_name = shift @splits;
        # Records are labeled like "a_17"; the part before the underscore
        # names the cluster.
        $record_name =~ /(\w+?)_.*/;
        my $primary_cluster_label = $1;
        push @{$clusters{$primary_cluster_label}}, \@splits;
    }
    foreach my $key (sort {"\L$a" cmp "\L$b"} keys %clusters) {
        print {$out} map {"@$_\n"} @{$clusters{$key}};
        # Double blank line: gnuplot treats each such chunk as a new index.
        print {$out} "\n\n";
    }
    my @sorted_cluster_keys = sort {"\L$a" cmp "\L$b"} keys %clusters;
    close $out;
    my $plot = Graphics::GnuplotIF->new( persist => 1 );
    my $arg_string = "";
    $plot->gnuplot_cmd( "set noclip" );
    $plot->gnuplot_cmd( "set hidden3d" ) unless $self->{_show_hidden_in_3D_plots};
    $plot->gnuplot_cmd( "set pointsize 2" );
    $plot->gnuplot_cmd( "set parametric" );
    $plot->gnuplot_cmd( "set size ratio 1" );
    $plot->gnuplot_cmd( "set xlabel \"X\"" );
    $plot->gnuplot_cmd( "set ylabel \"Y\"" );
    $plot->gnuplot_cmd( "set zlabel \"Z\"" );
    # set the range for azimuth angles:
    $plot->gnuplot_cmd( "set urange [0:2*pi]" );
    # set the range for the elevation angles:
    $plot->gnuplot_cmd( "set vrange [-pi/2:pi/2]" );
    # Parametric functions for the sphere
    $plot->gnuplot_cmd( "r=1" );
    $plot->gnuplot_cmd( "fx(v,u) = r*cos(v)*cos(u)" );
    $plot->gnuplot_cmd( "fy(v,u) = r*cos(v)*sin(u)" );
    $plot->gnuplot_cmd( "fz(v) = r*sin(v)" );
    my $sphere_arg_str = "fx(v,u),fy(v,u),fz(v) notitle with lines lt 0,";
    foreach my $i (0..scalar(keys %clusters)-1) {
        my $j = $i + 1;
        # The following statement puts the titles on the data points
        $arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"$sorted_cluster_keys[$i] \" with points lt $j pt $j, ";
    }
    $arg_string =~ s/,[ ]+$//;
    $plot->gnuplot_cmd( "splot $sphere_arg_str $arg_string" );
}
1521 | |||||||
# Destructor: removes the "__temp_*.txt" scratch file that
# visualize_data_on_sphere() derives from the output-file name.
sub DESTROY {
    use File::Basename;
    my $self = shift;
    my $filename = basename($self->{_output_file});
    $filename =~ s/\.\w+$/\.txt/;
    unlink "__temp_" . $filename;
}
1528 | |||||||
# In-place random permutation of the array (Fisher-Yates, from the Perl
# FAQ).  Same fix as the main-package copy: the original's
# `my $i = @$arr; while (--$i)' never terminated for an empty array,
# because --$i went from 0 to -1 (truthy) and kept decrementing.
sub fisher_yates_shuffle {
    my $arr = shift;
    for (my $i = $#$arr; $i > 0; $i--) {
        my $j = int rand($i + 1);
        @$arr[$i, $j] = @$arr[$j, $i];
    }
}
1538 | |||||||
1539 | 1; | ||||||
1540 | |||||||
1541 | =pod | ||||||
1542 | |||||||
1543 | =head1 NAME | ||||||
1544 | |||||||
1545 | Algorithm::LinearManifoldDataClusterer --- for clustering data that resides on a | ||||||
1546 | low-dimensional manifold in a high-dimensional measurement space | ||||||
1547 | |||||||
1548 | =head1 SYNOPSIS | ||||||
1549 | |||||||
1550 | # You'd begin with: | ||||||
1551 | |||||||
1552 | use Algorithm::LinearManifoldDataClusterer; | ||||||
1553 | |||||||
1554 | # You'd next name the data file: | ||||||
1555 | |||||||
1556 | my $datafile = "mydatafile.csv"; | ||||||
1557 | |||||||
1558 | # Your data must be in the CSV format, with one of the columns containing a unique | ||||||
1559 | # symbolic tag for each data record. You tell the module which column has the | ||||||
1560 | # symbolic tag and which columns to use for clustering through a mask such as | ||||||
1561 | |||||||
1562 | my $mask = "N111"; | ||||||
1563 | |||||||
1564 | # which says that the symbolic tag is in the first column and that the numerical | ||||||
1565 | # data in the next three columns is to be used for clustering. If your data file | ||||||
1566 | # had, say, five columns and you wanted only the last three columns to be | ||||||
1567 | # clustered, the mask would become `N0111' assuming that that the symbolic tag is | ||||||
1568 | # still in the first column. | ||||||
1569 | |||||||
1570 | # Now you must construct an instance of the clusterer through a call such as: | ||||||
1571 | |||||||
1572 | my $clusterer = Algorithm::LinearManifoldDataClusterer->new( | ||||||
1573 | datafile => $datafile, | ||||||
1574 | mask => $mask, | ||||||
1575 | K => 3, | ||||||
1576 | P => 2, | ||||||
1577 | max_iterations => 15, | ||||||
1578 | cluster_search_multiplier => 2, | ||||||
1579 | delta_reconstruction_error => 0.001, | ||||||
1580 | terminal_output => 1, | ||||||
1581 | visualize_each_iteration => 1, | ||||||
1582 | show_hidden_in_3D_plots => 1, | ||||||
1583 | make_png_for_each_iteration => 1, | ||||||
1584 | ); | ||||||
1585 | |||||||
1586 | # where the parameter K specifies the number of clusters you expect to find in | ||||||
1587 | # your data and the parameter P is the dimensionality of the manifold on which the | ||||||
1588 | # data resides. The parameter cluster_search_multiplier is for increasing the | ||||||
1589 | # odds that the random seeds chosen initially for clustering will populate all the | ||||||
1590 | # clusters. Set this parameter to a low number like 2 or 3. The parameter | ||||||
1591 | # max_iterations places a hard limit on the number of iterations that the | ||||||
1592 | # algorithm is allowed. The actual number of iterations is controlled by the | ||||||
1593 | # parameter delta_reconstruction_error. The iterations stop when the change in | ||||||
1594 | # the total "reconstruction error" from one iteration to the next is smaller than | ||||||
1595 | # the value specified by delta_reconstruction_error. | ||||||
1596 | |||||||
1597 | # Next, you must get the module to read the data for clustering: | ||||||
1598 | |||||||
1599 | $clusterer->get_data_from_csv(); | ||||||
1600 | |||||||
1601 | # Finally, you invoke linear manifold clustering by: | ||||||
1602 | |||||||
1603 | my $clusters = $clusterer->linear_manifold_clusterer(); | ||||||
1604 | |||||||
1605 | # The value returned by this call is a reference to an array of anonymous arrays, | ||||||
1606 | # with each anonymous array holding one cluster. If you wish, you can have the | ||||||
1607 | # module write the clusters to individual files by the following call: | ||||||
1608 | |||||||
1609 | $clusterer->write_clusters_to_files($clusters); | ||||||
1610 | |||||||
1611 | # If you want to see how the reconstruction error changes with the iterations, you | ||||||
1612 | # can make the call: | ||||||
1613 | |||||||
1614 | $clusterer->display_reconstruction_errors_as_a_function_of_iterations(); | ||||||
1615 | |||||||
1616 | # When your data is 3-dimensional and when the clusters reside on a surface that | ||||||
1617 | # is more or less spherical, you can visualize the clusters by calling | ||||||
1618 | |||||||
1619 | $clusterer->visualize_clusters_on_sphere("final clustering", $clusters); | ||||||
1620 | |||||||
1621 | # where the first argument is a label to be displayed in the 3D plot and the | ||||||
1622 | # second argument the value returned by calling linear_manifold_clusterer(). | ||||||
1623 | |||||||
1624 | # SYNTHETIC DATA GENERATION: | ||||||
1625 | |||||||
1626 | # The module includes an embedded class, DataGenerator, for generating synthetic | ||||||
1627 | # three-dimensional data that can be used to experiment with the clustering code. | ||||||
1628 | # The synthetic data, written out to a CSV file, consists of Gaussian clusters on | ||||||
1629 | # the surface of a sphere. You can control the number of clusters, the width of | ||||||
1630 | # each cluster, and the number of samples in the clusters by giving appropriate | ||||||
1631 | # values to the constructor parameters as shown below: | ||||||
1632 | |||||||
1633 | use strict; | ||||||
1634 | use Algorithm::LinearManifoldDataClusterer; | ||||||
1635 | |||||||
1636 | my $output_file = "4_clusters_on_a_sphere_1000_samples.csv"; | ||||||
1637 | |||||||
1638 | my $training_data_gen = DataGenerator->new( | ||||||
1639 | output_file => $output_file, | ||||||
1640 | cluster_width => 0.015, | ||||||
1641 | total_number_of_samples_needed => 1000, | ||||||
1642 | number_of_clusters_on_sphere => 4, | ||||||
1643 | show_hidden_in_3D_plots => 0, | ||||||
1644 | ); | ||||||
1645 | $training_data_gen->gen_data_and_write_to_csv(); | ||||||
1646 | $training_data_gen->visualize_data_on_sphere($output_file); | ||||||
1647 | |||||||
1648 | |||||||
1649 | =head1 CHANGES | ||||||
1650 | |||||||
1651 | Version 1.01: Typos and other errors removed in the documentation. Also included in | ||||||
1652 | the documentation a link to a tutorial on data processing on manifolds. | ||||||
1653 | |||||||
1654 | |||||||
1655 | =head1 DESCRIPTION | ||||||
1656 | |||||||
1657 | If you are new to machine learning and data clustering on linear and nonlinear | ||||||
1658 | manifolds, your first question is likely to be: What is a manifold? A manifold is a | ||||||
1659 | space that is locally Euclidean. And a space is locally Euclidean if it allows for | ||||||
1660 | the points in a small neighborhood to be represented by, say, the Cartesian | ||||||
1661 | coordinates and if the distances between the points in the neighborhood are given by | ||||||
1662 | the Euclidean metric. For an example, the set of all points on the surface of a | ||||||
1663 | sphere does NOT constitute a Euclidean space. Nonetheless, if you confined your | ||||||
1664 | attention to a small enough neighborhood around a point, the space would seem to be | ||||||
1665 | locally Euclidean. The surface of a sphere is a 2-dimensional manifold embedded in a | ||||||
1666 | 3-dimensional space. A plane in a 3-dimensional space is also a 2-dimensional | ||||||
1667 | manifold. You would think of the surface of a sphere as a nonlinear manifold, whereas | ||||||
1668 | a plane would be a linear manifold. However, note that any nonlinear manifold is | ||||||
1669 | locally a linear manifold. That is, given a sufficiently small neighborhood on a | ||||||
1670 | nonlinear manifold, you can always think of it as a locally flat surface. | ||||||
1671 | |||||||
1672 | As to why we need machine learning and data clustering on manifolds, there exist many | ||||||
1673 | important applications in which the measured data resides on a nonlinear manifold. | ||||||
1674 | For example, when you record images of a human face from different angles, all the | ||||||
1675 | image pixels taken together fall on a low-dimensional surface in a high-dimensional | ||||||
1676 | measurement space. The same is believed to be true for the satellite images of a land | ||||||
1677 | mass that are recorded with the sun at different angles with respect to the direction | ||||||
1678 | of the camera. | ||||||
1679 | |||||||
1680 | Reducing the dimensionality of the sort of data mentioned above is critical to the | ||||||
1681 | proper functioning of downstream classification algorithms, and the most popular | ||||||
1682 | traditional method for dimensionality reduction is the Principal Components Analysis | ||||||
1683 | (PCA) algorithm. However, using PCA is tantamount to passing a linear least-squares | ||||||
1684 | hyperplane through the surface on which the data actually resides. As to why that | ||||||
1685 | might be a bad thing to do, just imagine the consequences of assuming that your data | ||||||
1686 | falls on a straight line when, in reality, it falls on a strongly curving arc. This | ||||||
1687 | is exactly what happens with PCA --- it gives you a linear manifold approximation to | ||||||
1688 | your data that may actually reside on a curved surface. | ||||||
1689 | |||||||
1690 | That brings us to the purpose of this module, which is to cluster data that resides | ||||||
1691 | on a nonlinear manifold. Since a nonlinear manifold is locally linear, we can think | ||||||
1692 | of each data cluster on a nonlinear manifold as falling on a locally linear portion | ||||||
1693 | of the manifold, meaning on a hyperplane. The logic of the module is based on | ||||||
1694 | finding a set of hyperplanes that best describes the data, with each hyperplane | ||||||
1695 | derived from a local data cluster. This is like constructing a piecewise linear | ||||||
1696 | approximation to data that falls on a curve as opposed to constructing a single | ||||||
1697 | straight line approximation to all of the data. So whereas the frequently used PCA | ||||||
1698 | algorithm gives you a single hyperplane approximation to all your data, what this | ||||||
1699 | module returns is a set of hyperplane approximations, with each hyperplane derived by | ||||||
1700 | applying the PCA algorithm locally to a data cluster. | ||||||
1701 | |||||||
1702 | That brings us to the problem of how to actually discover the best set of hyperplane | ||||||
1703 | approximations to the data. What is probably the most popular algorithm today for | ||||||
1704 | that purpose is based on the following key idea: Given a set of subspaces to which a | ||||||
1705 | data element can be assigned, you assign it to that subspace for which the | ||||||
B<reconstruction error>
is the B<least>.
1708 | |||||||
To understand the notions of B<subspace> and B<reconstruction error>, let's review
1710 | the traditional approach of dimensionality reduction by the PCA algorithm. The PCA | ||||||
1711 | algorithm consists of: (1) Subtracting from each data element the global mean of the | ||||||
1712 | data; (2) Calculating the covariance matrix of the data; (3) Carrying out an | ||||||
1713 | eigendecomposition of the covariance matrix and ordering the eigenvectors according | ||||||
to decreasing values of the corresponding eigenvalues; (4) Forming a B<subspace> by
1715 | discarding the trailing eigenvectors whose corresponding eigenvalues are relatively | ||||||
1716 | small; and, finally, (5) projecting all the data elements into the subspace so | ||||||
1717 | formed. The error incurred in representing a data element by its projection into the | ||||||
subspace is known as the B<reconstruction error> and is equal to the projection of
1719 | the data element into the space spanned by the discarded trailing eigenvectors. | ||||||
1720 | |||||||
1721 | I | ||||||
1722 | subspace in the manner described above, we construct a set of subspaces, one for each | ||||||
1723 | data cluster on the nonlinear manifold. After the subspaces have been constructed, a | ||||||
1724 | data element is assigned to that subspace for which the reconstruction error is the | ||||||
1725 | least.> On the face of it, this sounds like a chicken-and-egg sort of a problem. You | ||||||
1726 | need to have already clustered the data in order to construct the subspaces at | ||||||
1727 | different places on the manifold so that you can figure out which cluster to place a | ||||||
1728 | data element in. | ||||||
1729 | |||||||
1730 | Such problems, when they do possess a solution, are best tackled through iterative | ||||||
1731 | algorithms in which you start with a guess for the final solution, you rearrange the | ||||||
1732 | measured data on the basis of the guess, and you then use the new arrangement of the | ||||||
1733 | data to refine the guess. Subsequently, you iterate through the second and the third | ||||||
1734 | steps until you do not see any discernible changes in the new arrangements of the | ||||||
1735 | data. This forms the basis of the clustering algorithm that is described under | ||||||
1736 | B |
||||||
1737 | article "Dimension Reduction by Local Principal Component Analysis" by Kambhatla and | ||||||
1738 | Leen that appeared in the journal Neural Computation in 1997. | ||||||
1739 | |||||||
1740 | Unfortunately, experiments show that the algorithm as proposed by Kambhatla and Leen | ||||||
1741 | is much too sensitive to how the clusters are seeded initially. To get around this | ||||||
1742 | limitation of the basic clustering-by-minimization-of-reconstruction-error, this | ||||||
1743 | module implements a two phased approach. In B |
||||||
1744 | effect in our search for clusters by looking for C |
||||||
1745 | C |
||||||
1746 | be visited by one or more of the C |
||||||
1747 | where C |
||||||
1748 | C |
||||||
1749 | together in order to form the final C |
||||||
1750 | of the algorithm. | ||||||
1751 | |||||||
1752 | For the cluster merging operation in Phase 2, we model the C |
||||||
1753 | nodes of an attributed graph in which the weight given to an edge connecting a pair | ||||||
1754 | of nodes is a measure of the similarity between the two clusters corresponding to the | ||||||
1755 | two nodes. Subsequently, we use spectral clustering to merge the most similar nodes | ||||||
1756 | in our quest to partition the data into C |
||||||
1757 | Shi-Malik normalized cuts algorithm. The pairwise node similarity required by this | ||||||
1758 | algorithm is measured by the C |
||||||
1759 | C |
||||||
1760 | when all of the data elements in one cluster are projected into the other's subspace | ||||||
1761 | and vice versa, the greater the similarity between two clusters. Additionally, the | ||||||
1762 | smaller the distance between the mean vectors of the clusters, the greater the | ||||||
1763 | similarity between two clusters. The overall similarity between a pair of clusters | ||||||
1764 | is a combination of these two similarity measures. | ||||||
1765 | |||||||
1766 | For additional information regarding the theoretical underpinnings of the algorithm | ||||||
1767 | implemented in this module, visit | ||||||
1768 | L |
||||||
1769 | |||||||
1770 | |||||||
1771 | =head1 SUMMARY OF THE ALGORITHM | ||||||
1772 | |||||||
1773 | We now present a summary of the two phases of the algorithm implemented in this | ||||||
1774 | module. Note particularly the important role played by the constructor parameter | ||||||
1775 | C |
||||||
1776 | parameter is greater than 1 that Phase 2 of the algorithm kicks in. | ||||||
1777 | |||||||
1778 | =over 4 | ||||||
1779 | |||||||
1780 | =item B |
||||||
1781 | |||||||
1782 | Through iterative minimization of the total reconstruction error, this phase of the | ||||||
1783 | algorithm returns C |
||||||
1784 | expect to find in your data and where C |
||||||
1785 | constructor parameter C |
||||||
1786 | account of the sensitivity of the reconstruction-error based clustering to how the | ||||||
1787 | clusters are initially seeded, our goal is to look for C |
||||||
1788 | of increasing the odds that each of the C |
||||||
1789 | the beginning of the algorithm. | ||||||
1790 | |||||||
1791 | =over 4 | ||||||
1792 | |||||||
1793 | =item Step 1: | ||||||
1794 | |||||||
1795 | Randomly choose C |
||||||
1796 | |||||||
1797 | =item Step 2: | ||||||
1798 | |||||||
1799 | Construct initial C |
||||||
1800 | whose seed it is closest to. | ||||||
1801 | |||||||
1802 | =item Step 3: | ||||||
1803 | |||||||
1804 | Calculate the mean and the covariance matrix for each of the C |
||||||
1805 | carry out an eigendecomposition of the covariance matrix. Order the eigenvectors in | ||||||
1806 | decreasing order of the corresponding eigenvalues. The first C eigenvectors |
||||||
1807 | define the subspace for that cluster. Use the space spanned by the remaining | ||||||
1808 | eigenvectors --- we refer to them as the trailing eigenvectors --- for calculating | ||||||
1809 | the reconstruction error. | ||||||
1810 | |||||||
1811 | =item Step 4: | ||||||
1812 | |||||||
1813 | Taking into account the mean associated with each cluster, re-cluster the entire data | ||||||
1814 | set on the basis of the least reconstruction error. That is, assign each data | ||||||
1815 | element to that subspace for which it has the smallest reconstruction error. | ||||||
1816 | Calculate the total reconstruction error associated with all the data elements. (See | ||||||
1817 | the definition of the reconstruction error in the C |
||||||
1818 | |||||||
1819 | =item Step 5: | ||||||
1820 | |||||||
1821 | Stop iterating if the change in the total reconstruction error from the previous | ||||||
1822 | iteration to the current iteration is less than the value specified by the constructor | ||||||
1823 | parameter C |
||||||
1824 | |||||||
1825 | =back | ||||||
1826 | |||||||
1827 | =item B |
||||||
1828 | |||||||
1829 | This phase of the algorithm uses graph partitioning to merge the C |
||||||
1830 | into the C |
||||||
1831 | steps are presented below is invoked recursively, let's assume that we have C |
||||||
1832 | clusters that need to be merged by invoking the Shi-Malik spectral clustering | ||||||
1833 | algorithm as described below: | ||||||
1834 | |||||||
1835 | =over 4 | ||||||
1836 | |||||||
1837 | =item Step 1: | ||||||
1838 | |||||||
1839 | Form a graph whose C |
||||||
1840 | invocation of this step, we have C |
||||||
1841 | |||||||
1842 | =item Step 2: | ||||||
1843 | |||||||
1844 | Construct an C |
||||||
1845 | element of this matrix is the pairwise similarity between the clusters indexed C | ||||||
1846 | and C |
||||||
1847 | The total reconstruction error when the data elements in the cluster indexed C |
||||||
1848 | projected into the subspace for the cluster indexed C and vice versa. And (2) The | ||||||
1849 | distance between the mean vectors associated with the two clusters. | ||||||
1850 | |||||||
1851 | =item Step 3: | ||||||
1852 | |||||||
1853 | Calculate the symmetric normalized Laplacian of the similarity matrix. We use C | ||||||
1854 | to denote the symmetric normalized Laplacian. | ||||||
1855 | |||||||
1856 | =item Step 4: | ||||||
1857 | |||||||
1858 | Carry out an eigendecomposition of the C matrix and choose the eigenvector | ||||||
1859 | corresponding to the second smallest eigenvalue for bipartitioning the graph on the | ||||||
1860 | basis of the sign of the values in the eigenvector. | ||||||
1861 | |||||||
1862 | =item Step 5: | ||||||
1863 | |||||||
1864 | If the bipartition of the previous step yields one-versus-the-rest kind of a | ||||||
1865 | partition, add the `one' cluster to the output list of clusters and invoke graph | ||||||
1866 | partitioning recursively on the `rest' by going back to Step 1. On the other hand, | ||||||
1867 | if the cardinality of both the partitions returned by Step 4 exceeds 1, invoke graph | ||||||
1868 | partitioning recursively on both partitions. Stop when the list of clusters in the | ||||||
1869 | output list equals C |
||||||
1870 | |||||||
1871 | =item Step 6: | ||||||
1872 | |||||||
1873 | In general, the C |
||||||
1874 | all of the data. So, for the final step, assign each data element not covered by the | ||||||
1875 | C |
||||||
1876 | |||||||
1877 | =back | ||||||
1878 | |||||||
1879 | =back | ||||||
1880 | |||||||
1881 | =head1 FAIL-FIRST BIAS OF THE MODULE | ||||||
1882 | |||||||
1883 | As you would expect for all such iterative algorithms, the module carries no | ||||||
1884 | theoretical guarantee that it will give you correct results. But what does that mean? | ||||||
1885 | Suppose you create synthetic data that consists of Gaussian looking disjoint clusters | ||||||
1886 | on the surface of a sphere, would the module always succeed in separating out the | ||||||
1887 | clusters? The module carries no guarantees to that effect --- especially considering | ||||||
1888 | that Phase 1 of the algorithm is sensitive to how the clusters are seeded at the | ||||||
1889 | beginning. Although this sensitivity is mitigated by the cluster merging step when | ||||||
1890 | a greater-than-1 value is given to the constructor option C |
||||||
1891 | a plain vanilla implementation of the steps in Phase 1 and Phase 2 would nonetheless | ||||||
1892 | carry significant risk that you'll end up with incorrect clustering results. | ||||||
1893 | |||||||
1894 | To further reduce this risk, the module has been programmed so that it terminates | ||||||
1895 | immediately if it suspects that the cluster solution being developed is unlikely to | ||||||
1896 | be fruitful. The heuristics used for such terminations are conservative --- since | ||||||
1897 | the cost of termination is only that the user will have to run the code again, which | ||||||
1898 | at worst only carries an element of annoyance with it. The three "Fail First" | ||||||
1899 | heuristics currently programmed into the module are based on simple "unimodality | ||||||
1900 | testing", testing for "congruent clusters," and testing for dominant cluster support | ||||||
1901 | in the final stage of the recursive invocation of the graph partitioning step. The | ||||||
1902 | unimodality testing is as elementary as it can be --- it only checks for the number | ||||||
1903 | of data samples within a certain radius of the mean in relation to the total number | ||||||
1904 | of data samples in the cluster. | ||||||
1905 | |||||||
1906 | When the program terminates under such conditions, it prints out the following message | ||||||
1907 | in your terminal window: | ||||||
1908 | |||||||
1909 | Bailing out! | ||||||
1910 | |||||||
1911 | Given the very simple nature of testing that is carried out for the "Fail First" bias, do | ||||||
1912 | not be surprised if the results you get for your data simply look wrong. If most | ||||||
1913 | runs of the module produce wrong results for your application, that means that the | ||||||
1914 | module logic needs to be strengthened further. The author of this module would love | ||||||
1915 | to hear from you if that is the case. | ||||||
1916 | |||||||
1917 | =head1 METHODS | ||||||
1918 | |||||||
1919 | The module provides the following methods for linear-manifold based clustering, for | ||||||
1920 | cluster visualization, and for the generation of data for testing the clustering algorithm: | ||||||
1921 | |||||||
1922 | =over 4 | ||||||
1923 | |||||||
1924 | =item B |
||||||
1925 | |||||||
1926 | my $clusterer = Algorithm::LinearManifoldDataClusterer->new( | ||||||
1927 | datafile => $datafile, | ||||||
1928 | mask => $mask, | ||||||
1929 | K => $K, | ||||||
1930 | P => $P, | ||||||
1931 | cluster_search_multiplier => $C, | ||||||
1932 | max_iterations => $max_iter, | ||||||
1933 | delta_reconstruction_error => 0.001, | ||||||
1934 | terminal_output => 1, | ||||||
1935 | write_clusters_to_files => 1, | ||||||
1936 | visualize_each_iteration => 1, | ||||||
1937 | show_hidden_in_3D_plots => 1, | ||||||
1938 | make_png_for_each_iteration => 1, | ||||||
1939 | ); | ||||||
1940 | |||||||
1941 | A call to C |
||||||
1942 | C |
||||||
1943 | |||||||
1944 | =back | ||||||
1945 | |||||||
1946 | =head2 Constructor Parameters | ||||||
1947 | |||||||
1948 | =over 8 | ||||||
1949 | |||||||
1950 | =item C |
||||||
1951 | |||||||
1952 | This parameter names the data file that contains the multidimensional data records | ||||||
1953 | you want the module to cluster. This file must be in CSV format and each record in | ||||||
1954 | the file must include a symbolic tag for the record. Here are first few rows of such | ||||||
1955 | a CSV file in the C |
||||||
1956 | |||||||
1957 | d_161,0.0739248630173239,0.231119293395665,-0.970112873251437 | ||||||
1958 | a_59,0.459932215884786,0.0110216469739639,0.887885623314902 | ||||||
1959 | a_225,0.440503220903039,-0.00543366086464691,0.897734586447273 | ||||||
1960 | a_203,0.441656364946433,0.0437191337788422,0.896118459046532 | ||||||
1961 | ... | ||||||
1962 | ... | ||||||
1963 | |||||||
1964 | What you see in the first column --- C |
||||||
1965 | the symbolic tags associated with four 3-dimensional data records. | ||||||
1966 | |||||||
1967 | =item C |
||||||
1968 | |||||||
1969 | This parameter supplies the mask to be applied to the columns of your data file. For | ||||||
1970 | the data file whose first few records are shown above, the mask should be C |
||||||
1971 | since the symbolic tag is in the first column of the CSV file and since, presumably, | ||||||
1972 | you want to cluster the data in the next three columns. | ||||||
1973 | |||||||
1974 | =item C |
||||||
1975 | |||||||
1976 | This parameter supplies the number of clusters you are looking for. | ||||||
1977 | |||||||
1978 | =item C : |
||||||
1979 | |||||||
1980 | This parameter specifies the dimensionality of the manifold on which the data resides. | ||||||
1981 | |||||||
1982 | =item C |
||||||
1983 | |||||||
1984 | As should be clear from the C |
||||||
1985 | a very important role in the successful clustering of your data. As explained in | ||||||
1986 | C |
||||||
1987 | the minimization of the reconstruction error --- is sensitive to the choice of the | ||||||
1988 | cluster seeds that are selected randomly at the beginning of the algorithm. Should | ||||||
1989 | it happen that the seeds miss one or more of the clusters, the clustering produced is | ||||||
1990 | likely to not be correct. By giving an integer value to C |
||||||
1991 | that is greater than 1, you'll increase the odds that the randomly selected seeds | ||||||
1992 | will see all clusters. When you set C |
||||||
1993 | Phase 1 of the algorithm to construct C |
||||||
1994 | clusters. Subsequently, in Phase 2, the module uses inter-cluster similarity based | ||||||
1995 | graph partitioning to merge the C |
||||||
1996 | |||||||
1997 | =item C |
||||||
1998 | |||||||
1999 | This hard limits the number of iterations in Phase 1 of the algorithm. Ordinarily, | ||||||
2000 | the iterations stop automatically when the change in the total reconstruction error | ||||||
2001 | from one iteration to the next is less than the value specified by the parameter | ||||||
2002 | C |
||||||
2003 | |||||||
2004 | =item C |
||||||
2005 | |||||||
2006 | It is this parameter that determines when the iterations will actually terminate in | ||||||
2007 | Phase 1 of the algorithm. When the difference in the total reconstruction error from | ||||||
2008 | one iteration to the next is less than the value given to this parameter, the | ||||||
2009 | iterations come to an end. B | ||||||
2010 | data samples that need to be clustered, the larger must be the value given to this | ||||||
2011 | parameter. That makes intuitive sense since the total reconstruction error is the | ||||||
2012 | sum of all such errors for all of the data elements.> Unfortunately, the best value | ||||||
2013 | for this parameter does NOT appear to depend linearly on the total number of data | ||||||
2014 | records to be clustered. | ||||||
2015 | |||||||
2016 | =item C |
||||||
2017 | |||||||
2018 | When this parameter is set, you will see in your terminal window the different | ||||||
2019 | clusters as lists of the symbolic tags associated with the data records. You will | ||||||
2020 | also see in your terminal window the output produced by the different steps of the | ||||||
2021 | graph partitioning algorithm as smaller clusters are merged to form the final C |
||||||
2022 | clusters --- assuming that you set the parameter C |
||||||
2023 | integer value that is greater than 1. | ||||||
2024 | |||||||
2025 | =item C |
||||||
2026 | |||||||
2027 | As its name implies, when this option is set to 1, you'll see 3D plots of the | ||||||
2028 | clustering results for each iteration --- but only if your data is 3-dimensional. | ||||||
2029 | |||||||
2030 | =item C |
||||||
2031 | |||||||
2032 | This parameter is important for controlling the visualization of the clusters on the | ||||||
2033 | surface of a sphere. If the clusters are too spread out, seeing all of the clusters | ||||||
2034 | all at once can be visually confusing. When you set this parameter, the clusters on | ||||||
2035 | the back side of the sphere will not be visible. Note that no matter how you set | ||||||
2036 | this parameter, you can interact with the 3D plot of the data and rotate it with your | ||||||
2037 | mouse pointer to see all of the data that is output by the clustering code. | ||||||
2038 | |||||||
2039 | =item C |
||||||
2040 | |||||||
2041 | If you set this option to 1, the module will output a Gnuplot in the form of a PNG | ||||||
2042 | image for each iteration in Phase 1 of the algorithm. In Phase 2, the module will | ||||||
2043 | output the clustering result produced by the graph partitioning algorithm. | ||||||
2044 | |||||||
2045 | =back | ||||||
2046 | |||||||
2047 | =over | ||||||
2048 | |||||||
2049 | =item B |
||||||
2050 | |||||||
2051 | $clusterer->get_data_from_csv(); | ||||||
2052 | |||||||
2053 | As you can guess from its name, the method extracts the data from the CSV file named | ||||||
2054 | in the constructor. | ||||||
2055 | |||||||
2056 | =item B |
||||||
2057 | |||||||
2058 | $clusterer->linear_manifold_clusterer(); | ||||||
2059 | |||||||
2060 | or | ||||||
2061 | |||||||
2062 | my $clusters = $clusterer->linear_manifold_clusterer(); | ||||||
2063 | |||||||
2064 | This is the main call to the linear-manifold based clusterer. The first call works | ||||||
2065 | by side-effect, meaning that you will see the clusters in your terminal window and | ||||||
2066 | they would be written out to disk files (depending on the constructor options you | ||||||
2067 | have set). The second call also returns the clusters as a reference to an array of | ||||||
2068 | anonymous arrays, each holding the symbolic tags for a cluster. | ||||||
2069 | |||||||
2070 | =item B |
||||||
2071 | |||||||
2072 | $clusterer->display_reconstruction_errors_as_a_function_of_iterations(); | ||||||
2073 | |||||||
2074 | This method would normally be called after the clustering is completed to see how the | ||||||
2075 | reconstruction errors decreased with the iterations in Phase 1 of the overall | ||||||
2076 | algorithm. | ||||||
2077 | |||||||
2078 | =item B |
||||||
2079 | |||||||
2080 | $clusterer->write_clusters_to_files($clusters); | ||||||
2081 | |||||||
2082 | As its name implies, when you call this method, the final clusters would be written | ||||||
2083 | out to disk files. The files have names like: | ||||||
2084 | |||||||
2085 | cluster0.txt | ||||||
2086 | cluster1.txt | ||||||
2087 | cluster2.txt | ||||||
2088 | ... | ||||||
2089 | ... | ||||||
2090 | |||||||
2091 | Before the clusters are written to these files, the module destroys all files with | ||||||
2092 | such names in the directory in which you call the module. | ||||||
2093 | |||||||
2094 | =item B |
||||||
2095 | |||||||
2096 | $clusterer->visualize_clusters_on_sphere("final clustering", $clusters); | ||||||
2097 | |||||||
2098 | or | ||||||
2099 | |||||||
2100 | $clusterer->visualize_clusters_on_sphere("final_clustering", $clusters, "png"); | ||||||
2101 | |||||||
2102 | If your data is 3-dimensional and it resides on the surface of a sphere (or in the | ||||||
2103 | vicinity of such a surface), you may be able to use these methods for the | ||||||
2104 | visualization of the clusters produced by the algorithm. The first invocation | ||||||
2105 | produces a Gnuplot in a terminal window that you can rotate with your mouse pointer. | ||||||
2106 | The second invocation produces a `.png' image of the plot. | ||||||
2107 | |||||||
2108 | =item B |
||||||
2109 | |||||||
2110 | $clusterer->auto_retry_clusterer(); | ||||||
2111 | |||||||
2112 | or | ||||||
2113 | |||||||
2114 | my $clusters = $clusterer->auto_retry_clusterer(); | ||||||
2115 | |||||||
2116 | As mentioned earlier, the module is programmed in such a way that it is more likely | ||||||
2117 | to fail than to give you a wrong answer. If manually trying the clusterer repeatedly | ||||||
2118 | on a data file is frustrating, you can use C |
||||||
2119 | make repeated attempts for you. See the script C |
||||||
2120 | C |
||||||
2121 | |||||||
2122 | =back | ||||||
2123 | |||||||
2124 | =head1 GENERATING SYNTHETIC DATA FOR EXPERIMENTING WITH THE CLUSTERER | ||||||
2125 | |||||||
2126 | The module file also contains a class named C |
||||||
2127 | data for experimenting with linear-manifold based clustering. At this time, only | ||||||
2128 | 3-dimensional data that resides in the form of Gaussian clusters on the surface of a | ||||||
2129 | sphere is generated. The generated data is placed in a CSV file. You construct an | ||||||
2130 | instance of the C |
||||||
2131 | |||||||
2132 | =over 4 | ||||||
2133 | |||||||
2134 | =item B |
||||||
2135 | |||||||
2136 | my $training_data_gen = DataGenerator->new( | ||||||
2137 | output_file => $output_file, | ||||||
2138 | cluster_width => 0.0005, | ||||||
2139 | total_number_of_samples_needed => 1000, | ||||||
2140 | number_of_clusters_on_sphere => 4, | ||||||
2141 | show_hidden_in_3D_plots => 0, | ||||||
2142 | ); | ||||||
2143 | |||||||
2144 | =back | ||||||
2145 | |||||||
2146 | =head2 Parameters for the DataGenerator constructor: | ||||||
2147 | |||||||
2148 | =over 8 | ||||||
2149 | |||||||
2150 | =item C |
||||||
2151 | |||||||
2152 | The numeric values are generated using a bivariate Gaussian distribution whose two | ||||||
2153 | independent variables are the azimuth and the elevation angles on the surface of a | ||||||
2154 | unit sphere. The mean of each cluster is chosen randomly and its covariance set in | ||||||
2155 | proportion to the value supplied for the C<cluster_width> parameter. | ||||||
2156 | |||||||
2157 | =item C |
||||||
2158 | |||||||
2159 | This parameter controls the spread of each cluster on the surface of the unit sphere. | ||||||
2160 | |||||||
2161 | =item C |
||||||
2162 | |||||||
2163 | As its name implies, this parameter specifies the total number of data samples that | ||||||
2164 | will be written out to the output file --- provided this number is divisible by the | ||||||
2165 | number of clusters you asked for. If the divisibility condition is not satisfied, | ||||||
2166 | the number of data samples actually written out will be the closest it can be to the | ||||||
2167 | number you specify for this parameter under the condition that an equal number of | ||||||
2168 | samples will be created for each cluster. | ||||||
2169 | |||||||
2170 | =item C |
||||||
2171 | |||||||
2172 | Again as its name implies, this parameter specifies the number of clusters that will | ||||||
2173 | be produced on the surface of a unit sphere. | ||||||
2174 | |||||||
2175 | =item C |
||||||
2176 | |||||||
2177 | This parameter is important for the visualization of the clusters and it controls | ||||||
2178 | whether you will see the generated data on the back side of the sphere. If the | ||||||
2179 | clusters are not too spread out, you can set this parameter to 0 and see all the | ||||||
2180 | clusters all at once. However, when the clusters are spread out, it can be visually | ||||||
2181 | confusing to see the data on the back side of the sphere. Note that no matter how | ||||||
2182 | you set this parameter, you can interact with the 3D plot of the data and rotate it | ||||||
2183 | with your mouse pointer to see all of the data that is generated. | ||||||
2184 | |||||||
2185 | =back | ||||||
2186 | |||||||
2187 | =over 4 | ||||||
2188 | |||||||
2189 | =item B |
||||||
2190 | |||||||
2191 | $training_data_gen->gen_data_and_write_to_csv(); | ||||||
2192 | |||||||
2193 | As its name implies, this method generates the data according to the parameters | ||||||
2194 | specified in the DataGenerator constructor. | ||||||
2195 | |||||||
2196 | =item B |
||||||
2197 | |||||||
2198 | $training_data_gen->visualize_data_on_sphere($output_file); | ||||||
2199 | |||||||
2200 | You can use this method to visualize the clusters produced by the data generator. | ||||||
2201 | Since the clusters are located at randomly selected points on a unit sphere, by | ||||||
2202 | looking at the output visually, you can quickly reject what the data generator has | ||||||
2203 | produced and try again. | ||||||
2204 | |||||||
2205 | =back | ||||||
2206 | |||||||
2207 | =head1 HOW THE CLUSTERS ARE OUTPUT | ||||||
2208 | |||||||
2209 | When the option C |
||||||
2210 | C |
||||||
2211 | screen. | ||||||
2212 | |||||||
2213 | And, when the option C |
||||||
2214 | module dumps the clusters in files named | ||||||
2215 | |||||||
2216 | cluster0.txt | ||||||
2217 | cluster1.txt | ||||||
2218 | cluster2.txt | ||||||
2219 | ... | ||||||
2220 | ... | ||||||
2221 | |||||||
2222 | in the directory in which you execute the module. The number of such files will | ||||||
2223 | equal the number of clusters formed. All such existing files in the directory are | ||||||
2224 | destroyed before any fresh ones are created. Each cluster file contains the symbolic | ||||||
2225 | tags of the data samples in that cluster. | ||||||
2226 | |||||||
2227 | Assuming that the data dimensionality is 3, if you have set the constructor parameter | ||||||
2228 | C |
||||||
2229 | files that are point plots of the different clusters in the different iterations of | ||||||
2230 | the algorithm. Such printable files are also generated for the initial clusters at | ||||||
2231 | the beginning of the iterations and for the final clusters in Phase 1 of the | ||||||
2232 | algorithm. You will also see in your directory a PNG file for the clustering result | ||||||
2233 | produced by graph partitioning in Phase 2. | ||||||
2234 | |||||||
2235 | Also, as mentioned previously, a call to C |
||||||
2236 | code will return the clusters to you directly. | ||||||
2237 | |||||||
2238 | =head1 REQUIRED | ||||||
2239 | |||||||
2240 | This module requires the following modules: | ||||||
2241 | |||||||
2242 | List::Util | ||||||
2243 | File::Basename | ||||||
2244 | Math::Random | ||||||
2245 | Graphics::GnuplotIF | ||||||
2246 | Math::GSL::Matrix | ||||||
2247 | POSIX | ||||||
2248 | |||||||
2249 | =head1 THE C |
||||||
2250 | |||||||
2251 | The C |
||||||
2252 | play with in order to become familiar with the module: | ||||||
2253 | |||||||
2254 | example1.pl | ||||||
2255 | |||||||
2256 | example2.pl | ||||||
2257 | |||||||
2258 | example3.pl | ||||||
2259 | |||||||
2260 | example4.pl | ||||||
2261 | |||||||
2262 | These scripts demonstrate linear-manifold based clustering on the 3-dimensional data | ||||||
2263 | in the following three CSV files: | ||||||
2264 | |||||||
2265 | 3_clusters_on_a_sphere_498_samples.csv (used in example1.pl and example4.pl) | ||||||
2266 | |||||||
2267 | 3_clusters_on_a_sphere_3000_samples.csv (used in example2.pl) | ||||||
2268 | |||||||
2269 | 4_clusters_on_a_sphere_1000_samples.csv (used in example3.pl) | ||||||
2270 | |||||||
2271 | Note that even though the first two of these files both contain exactly three | ||||||
2272 | clusters, the clusters look very different in the two data files. The clusters are | ||||||
2273 | much more spread out in C<3_clusters_on_a_sphere_3000_samples.csv>. | ||||||
2274 | |||||||
2275 | The code in C |
||||||
2276 | C |
||||||
2277 | the clustering program until success is achieved. The value of the constructor | ||||||
2278 | parameter C |
||||||
2279 | when you execute C |
||||||
2280 | You might wish to change the value given to the parameter | ||||||
2281 | C |
||||||
2282 | achieve success. | ||||||
2283 | |||||||
2284 | The C |
||||||
2285 | and the best final results that can be achieved by the first three examples, that | ||||||
2286 | is, for the scripts C |
||||||
2287 | a Linux machine and if you have the C |
||||||
2288 | C |
||||||
2289 | your own runs of the three example scripts may or may not look like what you see in | ||||||
2290 | the outputs shown in the PNG files depending on how the module seeds the clusters. | ||||||
2291 | Your best results should look like what you see in the PNG images. | ||||||
2292 | |||||||
2293 | The C |
||||||
2294 | |||||||
2295 | For generating the data for experiments with clustering: | ||||||
2296 | |||||||
2297 | generate_data_on_a_sphere.pl | ||||||
2298 | |||||||
2299 | For visualizing the data generated by the above script: | ||||||
2300 | |||||||
2301 | data_visualizer.pl | ||||||
2302 | |||||||
2303 | For cleaning up the examples directory: | ||||||
2304 | |||||||
2305 | cleanup_directory.pl | ||||||
2306 | |||||||
2307 | Invoking the C |
||||||
2308 | that are generated by the module when you run it with the constructor option | ||||||
2309 | C |
||||||
2310 | |||||||
2311 | =head1 EXPORT | ||||||
2312 | |||||||
2313 | None by design. | ||||||
2314 | |||||||
2315 | =head1 CAVEATS | ||||||
2316 | |||||||
2317 | The performance of the algorithm depends much on the values you choose for the | ||||||
2318 | constructor parameters. And, even for the best choices for the parameters, the | ||||||
2319 | algorithm is not theoretically guaranteed to return the best results. | ||||||
2320 | |||||||
2321 | Even after you have discovered the best choices for the constructor parameters, the | ||||||
2322 | best way to use this module is to try it multiple times on any given data and accept | ||||||
2323 | only those results that make the best intuitive sense. | ||||||
2324 | |||||||
2325 | The module was designed with the hope that it would rather fail than give you wrong | ||||||
2326 | results. So if you see it failing a few times before it returns a good answer, that's | ||||||
2327 | a good thing. | ||||||
2328 | |||||||
2329 | However, if the module fails too often or is too quick to give you wrong answers, | ||||||
2330 | that means the module is not working on your data. If that happens, I'd love to hear | ||||||
2331 | from you. That might indicate to me how I should enhance the power of this module | ||||||
2332 | for its future versions. | ||||||
2333 | |||||||
2334 | =head1 BUGS | ||||||
2335 | |||||||
2336 | Please notify the author if you encounter any bugs. When sending email, please place | ||||||
2337 | the string 'LinearManifoldDataClusterer' in the subject line. | ||||||
2338 | |||||||
2339 | =head1 INSTALLATION | ||||||
2340 | |||||||
2341 | Download the archive from CPAN in any directory of your choice. Unpack the archive | ||||||
2342 | with a command that on a Linux machine would look like: | ||||||
2343 | |||||||
2344 | tar zxvf Algorithm-LinearManifoldDataClusterer-1.01.tar.gz | ||||||
2345 | |||||||
2346 | This will create an installation directory for you whose name will be | ||||||
2347 | C |
||||||
2348 | for a standard install of the module if you have root privileges: | ||||||
2349 | |||||||
2350 | perl Makefile.PL | ||||||
2351 | make | ||||||
2352 | make test | ||||||
2353 | sudo make install | ||||||
2354 | |||||||
2355 | If you do not have root privileges, you can carry out a non-standard install of the | ||||||
2356 | module in any directory of your choice by: | ||||||
2357 | |||||||
2358 | perl Makefile.PL prefix=/some/other/directory/ | ||||||
2359 | make | ||||||
2360 | make test | ||||||
2361 | make install | ||||||
2362 | |||||||
2363 | With a non-standard install, you may also have to set your PERL5LIB environment | ||||||
2364 | variable so that this module can find the required other modules. How you do that | ||||||
2365 | would depend on what platform you are working on. In order to install this module in | ||||||
2366 | a Linux machine on which I use tcsh for the shell, I set the PERL5LIB environment | ||||||
2367 | variable by | ||||||
2368 | |||||||
2369 | setenv PERL5LIB /some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/ | ||||||
2370 | |||||||
2371 | If I used bash, I'd need to declare: | ||||||
2372 | |||||||
2373 | export PERL5LIB=/some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/ | ||||||
2374 | |||||||
2375 | |||||||
2376 | =head1 THANKS | ||||||
2377 | |||||||
2378 | I have learned much from my conversations with Donghun Kim whose research on face | ||||||
2379 | recognition in the wild involves clustering image data on manifolds. I have also had | ||||||
2380 | several fruitful conversations with Bharath Kumar Comandur and Tanmay Prakash with | ||||||
2381 | regard to the ideas that are incorporated in this module. | ||||||
2382 | |||||||
2383 | =head1 AUTHOR | ||||||
2384 | |||||||
2385 | Avinash Kak, kak@purdue.edu | ||||||
2386 | |||||||
2387 | If you send email, please place the string "LinearManifoldDataClusterer" in your subject line to get past | ||||||
2388 | my spam filter. | ||||||
2389 | |||||||
2390 | =head1 COPYRIGHT | ||||||
2391 | |||||||
2392 | This library is free software; you can redistribute it and/or modify it under the | ||||||
2393 | same terms as Perl itself. | ||||||
2394 | |||||||
2395 | Copyright 2015 Avinash Kak | ||||||
2396 | |||||||
2397 | =cut | ||||||
2398 |