| blib/lib/Algorithm/LinearManifoldDataClusterer.pm | |||
|---|---|---|---|
| Criterion | Covered | Total | % |
| statement | 25 | 27 | 92.6 |
| branch | n/a | ||
| condition | n/a | ||
| subroutine | 9 | 9 | 100.0 |
| pod | n/a | ||
| total | 34 | 36 | 94.4 |
| line | stmt | bran | cond | sub | pod | time | code |
|---|---|---|---|---|---|---|---|
| 1 | package Algorithm::LinearManifoldDataClusterer; | ||||||
| 2 | |||||||
| 3 | #------------------------------------------------------------------------------------ | ||||||
| 4 | # Copyright (c) 2015 Avinash Kak. All rights reserved. This program is free | ||||||
| 5 | # software. You may modify and/or distribute it under the same terms as Perl itself. | ||||||
| 6 | # This copyright notice must remain attached to the file. | ||||||
| 7 | # | ||||||
| 8 | # Algorithm::LinearManifoldDataClusterer is a Perl module for clustering data that | ||||||
| 9 | # resides on a low-dimensional manifold in a high-dimensional measurement space. | ||||||
| 10 | # ----------------------------------------------------------------------------------- | ||||||
| 11 | |||||||
| 12 | 1 | 1 | 13073 | use 5.10.0; | |||
| 1 | 3 | ||||||
| 1 | 45 | ||||||
| 13 | 1 | 1 | 5 | use strict; | |||
| 1 | 1 | ||||||
| 1 | 28 | ||||||
| 14 | 1 | 1 | 3 | use warnings; | |||
| 1 | 4 | ||||||
| 1 | 30 | ||||||
| 15 | 1 | 1 | 4 | use Carp; | |||
| 1 | 0 | ||||||
| 1 | 71 | ||||||
| 16 | 1 | 1 | 4 | use List::Util qw(reduce any); | |||
| 1 | 1 | ||||||
| 1 | 83 | ||||||
| 17 | 1 | 1 | 4 | use File::Basename; | |||
| 1 | 2 | ||||||
| 1 | 66 | ||||||
| 18 | 1 | 1 | 594 | use Math::Random; | |||
| 1 | 5336 | ||||||
| 1 | 86 | ||||||
| 19 | 1 | 1 | 584 | use Graphics::GnuplotIF; | |||
| 1 | 9525 | ||||||
| 1 | 57 | ||||||
| 20 | 1 | 1 | 1048 | use Math::GSL::Matrix; | |||
| 0 | |||||||
| 0 | |||||||
| 21 | use POSIX (); | ||||||
| 22 | |||||||
| 23 | our $VERSION = '1.01'; | ||||||
| 24 | |||||||
| 25 | # Constructor: | ||||||
| 26 | sub new { | ||||||
| 27 | my ($class, %args) = @_; | ||||||
| 28 | my @params = keys %args; | ||||||
| 29 | croak "\nYou have used a wrong name for a keyword argument " . | ||||||
| 30 | "--- perhaps a misspelling\n" | ||||||
| 31 | if check_for_illegal_params(@params) == 0; | ||||||
| 32 | bless { | ||||||
| 33 | _datafile => $args{datafile} || croak("datafile required"), | ||||||
| 34 | _mask => $args{mask} || croak("mask required"), | ||||||
| 35 | _K => $args{K} || 0, | ||||||
| 36 | _P => $args{P} || 0, | ||||||
| 37 | _terminal_output => $args{terminal_output} || 0, | ||||||
| 38 | _max_iterations => $args{max_iterations} || 0, | ||||||
| 39 | _delta_reconstruction_error => $args{delta_reconstruction_error} || 0.001, | ||||||
| 40 | _delta_normalized_error => undef, | ||||||
| 41 | _cluster_search_multiplier => $args{cluster_search_multiplier} || 1, | ||||||
| 42 | _visualize_each_iteration => $args{visualize_each_iteration} == 0 ? 0 : 1, | ||||||
| 43 | _show_hidden_in_3D_plots => $args{show_hidden_in_3D_plots} == 0 ? 0 : 1, | ||||||
| 44 | _make_png_for_each_iteration => $args{make_png_for_each_iteration} == 0 ? 0 : 1, | ||||||
| 45 | _debug => $args{debug} || 0, | ||||||
| 46 | _N => 0, | ||||||
| 47 | _KM => $args{K} * $args{cluster_search_multiplier}, | ||||||
| 48 | _data_hash => {}, | ||||||
| 49 | _data_tags => [], | ||||||
| 50 | _data_dimensions => 0, | ||||||
| 51 | _final_clusters => [], | ||||||
| 52 | _auto_retry_flag => 0, | ||||||
| 53 | _num_iterations_actually_used => undef, | ||||||
| 54 | _scale_factor => undef, | ||||||
| 55 | _data_tags_to_cluster_label_hash => {}, | ||||||
| 56 | _final_reference_vecs_for_all_subspaces => [], | ||||||
| 57 | _reconstruction_error_as_a_function_of_iteration => [], | ||||||
| 58 | _final_trailing_eigenvec_matrices_for_all_subspaces => [], | ||||||
| 59 | _subspace_construction_error_as_a_function_of_iteration => [], | ||||||
| 60 | }, $class; | ||||||
| 61 | } | ||||||
| 62 | |||||||
| 63 | sub get_data_from_csv { | ||||||
| 64 | my $self = shift; | ||||||
| 65 | my $filename = $self->{_datafile} || die "you did not specify a file with the data to be clustered"; | ||||||
| 66 | my $mask = $self->{_mask}; | ||||||
| 67 | my @mask = split //, $mask; | ||||||
| 68 | $self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask; | ||||||
| 69 | print "data dimensionality: $self->{_data_dimensions} \n" if $self->{_terminal_output}; | ||||||
| 70 | open FILEIN, $filename or die "Unable to open $filename: $!"; | ||||||
| 71 | die("Aborted. get_training_data_csv() is only for CSV files") unless $filename =~ /\.csv$/; | ||||||
| 72 | local $/ = undef; | ||||||
| 73 | my @all_data = split /\s+/, <FILEIN>; | ||||||
| 74 | my %data_hash = (); | ||||||
| 75 | my @data_tags = (); | ||||||
| 76 | foreach my $record (@all_data) { | ||||||
| 77 | my @splits = split /,/, $record; | ||||||
| 78 | my $record_name = shift @splits; | ||||||
| 79 | $data_hash{$record_name} = \@splits; | ||||||
| 80 | push @data_tags, $record_name; | ||||||
| 81 | } | ||||||
| 82 | $self->{_data_hash} = \%data_hash; | ||||||
| 83 | $self->{_data_tags} = \@data_tags; | ||||||
| 84 | $self->{_N} = scalar @data_tags; | ||||||
| 85 | } | ||||||
| 86 | |||||||
| 87 | sub estimate_mean_and_covariance { | ||||||
| 88 | my $self = shift; | ||||||
| 89 | my $tag_set = shift; | ||||||
| 90 | my $cluster_size = @$tag_set; | ||||||
| 91 | my @cluster_center = @{$self->add_point_coords($tag_set)}; | ||||||
| 92 | @cluster_center = map {my $x = $_/$cluster_size; $x} @cluster_center; | ||||||
| 93 | # for covariance calculation: | ||||||
| 94 | my ($num_rows,$num_cols) = ($self->{_data_dimensions}, scalar(@$tag_set)); | ||||||
| 95 | print "\nThe data will be stuffed into a matrix of $num_rows rows and $num_cols columns\n\n" | ||||||
| 96 | if $self->{_debug}; | ||||||
| 97 | my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols); | ||||||
| 98 | my $mean_vec = Math::GSL::Matrix->new($num_rows,1); | ||||||
| 99 | # All the record labels are stored in the array $self->{_data_tags}. The actual | ||||||
| 100 | # data for clustering is stored in a hash at $self->{_data_hash} whose keys are | ||||||
| 101 | # the record labels; the value associated with each key is the array holding the | ||||||
| 102 | # corresponding numerical multidimensional data. | ||||||
| 103 | $mean_vec->set_col(0, \@cluster_center); | ||||||
| 104 | if ($self->{_debug}) { | ||||||
| 105 | print "\nDisplaying the mean vector for the cluster:\n"; | ||||||
| 106 | display_matrix( $mean_vec ) if $self->{_terminal_output}; | ||||||
| 107 | } | ||||||
| 108 | foreach my $j (0..$num_cols-1) { | ||||||
| 109 | my $tag = $tag_set->[$j]; | ||||||
| 110 | my $data = $self->{_data_hash}->{$tag}; | ||||||
| 111 | my @diff_from_mean = vector_subtract($data, \@cluster_center); | ||||||
| 112 | $matrix->set_col($j, \@diff_from_mean); | ||||||
| 113 | } | ||||||
| 114 | my $transposed = transpose( $matrix ); | ||||||
| 115 | my $covariance = $matrix * $transposed; | ||||||
| 116 | $covariance *= 1.0 / $num_cols; | ||||||
| 117 | if ($self->{_debug}) { | ||||||
| 118 | print "\nDisplaying the Covariance Matrix for cluster:"; | ||||||
| 119 | display_matrix( $covariance ) if $self->{_terminal_output}; | ||||||
| 120 | } | ||||||
| 121 | return ($mean_vec, $covariance); | ||||||
| 122 | } | ||||||
| 123 | |||||||
| 124 | sub eigen_analysis_of_covariance { | ||||||
| 125 | my $self = shift; | ||||||
| 126 | my $covariance = shift; | ||||||
| 127 | my ($eigenvalues, $eigenvectors) = $covariance->eigenpair; | ||||||
| 128 | my $num_of_eigens = @$eigenvalues; | ||||||
| 129 | my $largest_eigen_index = 0; | ||||||
| 130 | my $smallest_eigen_index = 0; | ||||||
| 131 | print "Eigenvalue 0: $eigenvalues->[0]\n" if $self->{_debug}; | ||||||
| 132 | foreach my $i (1..$num_of_eigens-1) { | ||||||
| 133 | $largest_eigen_index = $i if $eigenvalues->[$i] > $eigenvalues->[$largest_eigen_index]; | ||||||
| 134 | $smallest_eigen_index = $i if $eigenvalues->[$i] < $eigenvalues->[$smallest_eigen_index]; | ||||||
| 135 | print "Eigenvalue $i: $eigenvalues->[$i]\n" if $self->{_debug}; | ||||||
| 136 | } | ||||||
| 137 | print "\nlargest eigen index: $largest_eigen_index\n" if $self->{_debug}; | ||||||
| 138 | print "\nsmallest eigen index: $smallest_eigen_index\n\n" if $self->{_debug}; | ||||||
| 139 | my @all_my_eigenvecs; | ||||||
| 140 | foreach my $i (0..$num_of_eigens-1) { | ||||||
| 141 | my @vec = $eigenvectors->[$i]->as_list; | ||||||
| 142 | my @eigenvec; | ||||||
| 143 | foreach my $ele (@vec) { | ||||||
| 144 | my ($mag, $theta) = $ele =~ /\[(\d*\.?\d*e?[+-]?\d*),(\S+)\]/; | ||||||
| 145 | if ($theta eq "0") { | ||||||
| 146 | push @eigenvec, $mag; | ||||||
| 147 | } elsif ($theta eq "pi") { | ||||||
| 148 | push @eigenvec, -1.0 * $mag; | ||||||
| 149 | } else { | ||||||
| 150 | die "Eigendecomposition produced a complex eigenvector -- " . | ||||||
| 151 | "which should not happen for a covariance matrix!"; | ||||||
| 152 | } | ||||||
| 153 | } | ||||||
| 154 | print "Eigenvector $i: @eigenvec\n" if $self->{_debug}; | ||||||
| 155 | push @all_my_eigenvecs, \@eigenvec; | ||||||
| 156 | } | ||||||
| 157 | my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list; | ||||||
| 158 | print "\nLargest eigenvector: @largest_eigen_vec\n" if $self->{_debug}; | ||||||
| 159 | my @sorted_eigenvec_indexes = sort {$eigenvalues->[$b] <=> $eigenvalues->[$a]} 0..@all_my_eigenvecs-1; | ||||||
| 160 | my @sorted_eigenvecs; | ||||||
| 161 | my @sorted_eigenvals; | ||||||
| 162 | foreach my $i (0..@sorted_eigenvec_indexes-1) { | ||||||
| 163 | $sorted_eigenvecs[$i] = $all_my_eigenvecs[$sorted_eigenvec_indexes[$i]]; | ||||||
| 164 | $sorted_eigenvals[$i] = $eigenvalues->[$sorted_eigenvec_indexes[$i]]; | ||||||
| 165 | } | ||||||
| 166 | if ($self->{_debug}) { | ||||||
| 167 | print "\nHere come sorted eigenvectors --- from the largest to the smallest:\n"; | ||||||
| 168 | foreach my $i (0..@sorted_eigenvecs-1) { | ||||||
| 169 | print "eigenvec: @{$sorted_eigenvecs[$i]} eigenvalue: $sorted_eigenvals[$i]\n"; | ||||||
| 170 | } | ||||||
| 171 | } | ||||||
| 172 | return (\@sorted_eigenvecs, \@sorted_eigenvals); | ||||||
| 173 | } | ||||||
| 174 | |||||||
| 175 | sub auto_retry_clusterer { | ||||||
| 176 | my $self = shift; | ||||||
| 177 | $self->{_auto_retry_flag} = 1; | ||||||
| 178 | my $clusters; | ||||||
| 179 | $@ = 1; | ||||||
| 180 | my $retry_attempts = 1; | ||||||
| 181 | while ($@) { | ||||||
| 182 | eval { | ||||||
| 183 | $clusters = $self->linear_manifold_clusterer(); | ||||||
| 184 | }; | ||||||
| 185 | if ($@) { | ||||||
| 186 | if ($self->{_terminal_output}) { | ||||||
| 187 | print "Clustering failed. Trying again. --- $@"; | ||||||
| 188 | print "\n\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"; | ||||||
| 189 | print "VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV\n\n"; | ||||||
| 190 | } | ||||||
| 191 | $retry_attempts++; | ||||||
| 192 | } else { | ||||||
| 193 | print "\n\nNumber of retry attempts: $retry_attempts\n\n" if $self->{_terminal_output}; | ||||||
| 194 | return $clusters; | ||||||
| 195 | } | ||||||
| 196 | } | ||||||
| 197 | } | ||||||
| 198 | |||||||
| 199 | sub linear_manifold_clusterer { | ||||||
| 200 | my $self = shift; | ||||||
| 201 | my $KM = $self->{_KM}; | ||||||
| 202 | my @initial_cluster_center_tags; | ||||||
| 203 | my $visualization_msg; | ||||||
| 204 | my @initial_cluster_center_indexes = $self->initialize_cluster_centers($KM, $self->{_N}); | ||||||
| 205 | print "Randomly selected indexes for cluster center tags: @initial_cluster_center_indexes\n" | ||||||
| 206 | if $self->{_debug}; | ||||||
| 207 | @initial_cluster_center_tags = map {$self->{_data_tags}->[$_]} @initial_cluster_center_indexes; | ||||||
| 208 | my @initial_cluster_center_coords = map {$self->{_data_hash}->{$_}} @initial_cluster_center_tags; | ||||||
| 209 | if ($self->{_debug}) { | ||||||
| 210 | foreach my $centroid (@initial_cluster_center_coords) { | ||||||
| 211 | print "Initial cluster center coords: @{$centroid}\n"; | ||||||
| 212 | } | ||||||
| 213 | } | ||||||
| 214 | my $initial_clusters = $self->assign_data_to_clusters_initial(\@initial_cluster_center_coords); | ||||||
| 215 | if ($self->{_data_dimensions} == 3) { | ||||||
| 216 | $visualization_msg = "initial_clusters"; | ||||||
| 217 | $self->visualize_clusters_on_sphere($visualization_msg, $initial_clusters) | ||||||
| 218 | if $self->{_visualize_each_iteration}; | ||||||
| 219 | $self->visualize_clusters_on_sphere($visualization_msg, $initial_clusters, "png") | ||||||
| 220 | if $self->{_make_png_for_each_iteration}; | ||||||
| 221 | } | ||||||
| 222 | foreach my $cluster (@$initial_clusters) { | ||||||
| 223 | my ($mean, $covariance) = $self->estimate_mean_and_covariance($cluster); | ||||||
| 224 | display_mean_and_covariance($mean, $covariance) if $self->{_debug}; | ||||||
| 225 | } | ||||||
| 226 | my @clusters = @$initial_clusters; | ||||||
| 227 | display_clusters(\@clusters) if $self->{_debug}; | ||||||
| 228 | my $iteration_index = 0; | ||||||
| 229 | my $unimodal_correction_flag; | ||||||
| 230 | my $previous_min_value_for_unimodality_quotient; | ||||||
| 231 | while ($iteration_index < $self->{_max_iterations}) { | ||||||
| 232 | print "\n\n========================== STARTING ITERATION $iteration_index =====================\n\n" | ||||||
| 233 | if $self->{_terminal_output}; | ||||||
| 234 | my $total_reconstruction_error_this_iteration = 0; | ||||||
| 235 | my @subspace_construction_errors_this_iteration; | ||||||
| 236 | my @trailing_eigenvec_matrices_for_all_subspaces; | ||||||
| 237 | my @reference_vecs_for_all_subspaces; | ||||||
| 238 | foreach my $cluster (@clusters) { | ||||||
| 239 | next if @$cluster == 0; | ||||||
| 240 | my ($mean, $covariance) = $self->estimate_mean_and_covariance($cluster); | ||||||
| 241 | display_mean_and_covariance($mean, $covariance) if $self->{_debug}; | ||||||
| 242 | print "--------------end of displaying mean and covariance\n\n" if $self->{_debug}; | ||||||
| 243 | my ($eigenvecs, $eigenvals) = $self->eigen_analysis_of_covariance($covariance); | ||||||
| 244 | my @trailing_eigenvecs = @{$eigenvecs}[$self->{_P} .. $self->{_data_dimensions}-1]; | ||||||
| 245 | my @trailing_eigenvals = @{$eigenvals}[$self->{_P} .. $self->{_data_dimensions}-1]; | ||||||
| 246 | my $subspace_construction_error = reduce {abs($a) + abs($b)} @trailing_eigenvals; | ||||||
| 247 | push @subspace_construction_errors_this_iteration, $subspace_construction_error; | ||||||
| 248 | my $trailing_eigenvec_matrix = Math::GSL::Matrix->new($self->{_data_dimensions}, | ||||||
| 249 | scalar(@trailing_eigenvecs)); | ||||||
| 250 | foreach my $j (0..@trailing_eigenvecs-1) { | ||||||
| 251 | print "trailing eigenvec column: @{$trailing_eigenvecs[$j]}\n" if $self->{_debug}; | ||||||
| 252 | $trailing_eigenvec_matrix->set_col($j, $trailing_eigenvecs[$j]); | ||||||
| 253 | } | ||||||
| 254 | push @trailing_eigenvec_matrices_for_all_subspaces,$trailing_eigenvec_matrix; | ||||||
| 255 | push @reference_vecs_for_all_subspaces, $mean; | ||||||
| 256 | } | ||||||
| 257 | $self->set_termination_reconstruction_error_threshold(\@reference_vecs_for_all_subspaces); | ||||||
| 258 | my %best_subspace_based_partition_of_data; | ||||||
| 259 | foreach my $i (0..$self->{_KM}-1) { | ||||||
| 260 | $best_subspace_based_partition_of_data{$i} = []; | ||||||
| 261 | } | ||||||
| 262 | foreach my $data_tag (@{$self->{_data_tags}}) { | ||||||
| 263 | my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1); | ||||||
| 264 | $data_vec->set_col(0, $self->{_data_hash}->{$data_tag}); | ||||||
| 265 | my @errors = map {$self->reconstruction_error($data_vec, | ||||||
| 266 | $trailing_eigenvec_matrices_for_all_subspaces[$_], | ||||||
| 267 | $reference_vecs_for_all_subspaces[$_])} | ||||||
| 268 | 0 .. $self->{_KM}-1; | ||||||
| 269 | my ($minval, $index_for_closest_subspace) = minimum(\@errors); | ||||||
| 270 | $total_reconstruction_error_this_iteration += $minval; | ||||||
| 271 | push @{$best_subspace_based_partition_of_data{$index_for_closest_subspace}}, | ||||||
| 272 | $data_tag; | ||||||
| 273 | } | ||||||
| 274 | print "Finished calculating the eigenvectors for the clusters produced by the previous\n" . | ||||||
| 275 | "iteration and re-assigning the data samples to the new subspaces on the basis of\n". | ||||||
| 276 | "the least reconstruction error.\n\n" . | ||||||
| 277 | "Total reconstruction error in this iteration: $total_reconstruction_error_this_iteration\n" | ||||||
| 278 | if $self->{_terminal_output}; | ||||||
| 279 | foreach my $i (0..$self->{_KM}-1) { | ||||||
| 280 | $clusters[$i] = $best_subspace_based_partition_of_data{$i}; | ||||||
| 281 | } | ||||||
| 282 | display_clusters(\@clusters) if $self->{_terminal_output}; | ||||||
| 283 | # Check if any cluster has lost all its elements. If so, fragment the worst | ||||||
| 284 | # existing cluster to create the additional clusters needed: | ||||||
| 285 | if (any {@$_ == 0} @clusters) { | ||||||
| 286 | die "empty cluster found" if $self->{_auto_retry_flag}; | ||||||
| 287 | print "\nOne or more clusters have become empty. Will carve out the needed clusters\n" . | ||||||
| 288 | "from the cluster with the largest subspace construction error.\n\n"; | ||||||
| 289 | $total_reconstruction_error_this_iteration = 0; | ||||||
| 290 | @subspace_construction_errors_this_iteration = (); | ||||||
| 291 | my $how_many_extra_clusters_needed = $self->{_KM} - scalar(grep {@$_ != 0} @clusters); | ||||||
| 292 | print "number of extra clusters needed at iteration $iteration_index: $how_many_extra_clusters_needed\n"; | ||||||
| 293 | my $max = List::Util::max @subspace_construction_errors_this_iteration; | ||||||
| 294 | my $maxindex = List::Util::first {$_ == $max} @subspace_construction_errors_this_iteration; | ||||||
| 295 | my @cluster_fragments = cluster_split($clusters[$maxindex], | ||||||
| 296 | $how_many_extra_clusters_needed + 1); | ||||||
| 297 | my @newclusters; | ||||||
| 298 | push @newclusters, @clusters[0 .. $maxindex-1]; | ||||||
| 299 | push @newclusters, @clusters[$maxindex+1 .. $self->{_KM}-1]; | ||||||
| 300 | push @newclusters, @cluster_fragments; | ||||||
| 301 | @newclusters = grep {@$_ != 0} @newclusters; | ||||||
| 302 | die "something went wrong with cluster fragmentation" | ||||||
| 303 | unless $self->{_KM} = @newclusters; | ||||||
| 304 | @trailing_eigenvec_matrices_for_all_subspaces = (); | ||||||
| 305 | @reference_vecs_for_all_subspaces = (); | ||||||
| 306 | foreach my $cluster (@newclusters) { | ||||||
| 307 | die "Linear Manifold Clustering did not work $!" if @$cluster == 0; | ||||||
| 308 | my ($mean, $covariance) = estimate_mean_and_covariance($cluster); | ||||||
| 309 | my ($eigenvecs, $eigenvals) = eigen_analysis_of_covariance($covariance); | ||||||
| 310 | my @trailing_eigenvecs = @{$eigenvecs}[$self->{_P} .. $self->{_data_dimensions}-1]; | ||||||
| 311 | my @trailing_eigenvals = @{$eigenvals}[$self->{_P} .. $self->{_data_dimensions}-1]; | ||||||
| 312 | my $subspace_construction_error = reduce {abs($a) + abs($b)} @trailing_eigenvals; | ||||||
| 313 | push @subspace_construction_errors_this_iteration, $subspace_construction_error; | ||||||
| 314 | my $trailing_eigenvec_matrix = Math::GSL::Matrix->new($self->{_data_dimensions}, | ||||||
| 315 | scalar(@trailing_eigenvecs)); | ||||||
| 316 | foreach my $j (0..@trailing_eigenvecs-1) { | ||||||
| 317 | $trailing_eigenvec_matrix->set_col($j, $trailing_eigenvecs[$j]); | ||||||
| 318 | } | ||||||
| 319 | push @trailing_eigenvec_matrices_for_all_subspaces,$trailing_eigenvec_matrix; | ||||||
| 320 | push @reference_vecs_for_all_subspaces, $mean; | ||||||
| 321 | } | ||||||
| 322 | my %best_subspace_based_partition_of_data; | ||||||
| 323 | foreach my $i (0..$self->{_KM}-1) { | ||||||
| 324 | $best_subspace_based_partition_of_data{$i} = []; | ||||||
| 325 | } | ||||||
| 326 | foreach my $data_tag (@{$self->{_data_tags}}) { | ||||||
| 327 | my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1); | ||||||
| 328 | $data_vec->set_col(0, $self->{_data_hash}->{$data_tag}); | ||||||
| 329 | my @errors = map {reconstruction_error($data_vec, | ||||||
| 330 | $trailing_eigenvec_matrices_for_all_subspaces[$_], | ||||||
| 331 | $reference_vecs_for_all_subspaces[$_])} | ||||||
| 332 | 0 .. $self->{_KM}-1; | ||||||
| 333 | my ($minval, $index_for_closest_subspace) = minimum(\@errors); | ||||||
| 334 | $total_reconstruction_error_this_iteration += $minval; | ||||||
| 335 | push @{$best_subspace_based_partition_of_data{$index_for_closest_subspace}}, | ||||||
| 336 | $data_tag; | ||||||
| 337 | } | ||||||
| 338 | print "empty-cluster jag: total reconstruction error in this iteration: \n" . | ||||||
| 339 | "$total_reconstruction_error_this_iteration\n" | ||||||
| 340 | if $self->{_debug}; | ||||||
| 341 | foreach my $i (0..$self->{_KM}-1) { | ||||||
| 342 | $clusters[$i] = $best_subspace_based_partition_of_data{$i}; | ||||||
| 343 | } | ||||||
| 344 | display_clusters(\@newclusters) if $self->{_terminal_output}; | ||||||
| 345 | @clusters = grep {@$_ != 0} @newclusters; | ||||||
| 346 | die "linear manifold based algorithm does not appear to work in this case $!" | ||||||
| 347 | unless @clusters == $self->{_KM}; | ||||||
| 348 | }# end of foreach my $cluster (@clusters) ... loop followed by if clause for empty clusters | ||||||
| 349 | if ($self->{_data_dimensions} == 3) { | ||||||
| 350 | $visualization_msg = "clustering_at_iteration_$iteration_index"; | ||||||
| 351 | $self->visualize_clusters_on_sphere($visualization_msg, \@clusters) | ||||||
| 352 | if $self->{_visualize_each_iteration}; | ||||||
| 353 | $self->visualize_clusters_on_sphere($visualization_msg, \@clusters, "png") | ||||||
| 354 | if $self->{_make_png_for_each_iteration}; | ||||||
| 355 | } | ||||||
| 356 | my @cluster_unimodality_quotients = map {$self->cluster_unimodality_quotient($clusters[$_], | ||||||
| 357 | $reference_vecs_for_all_subspaces[$_])} 0..@clusters-1; | ||||||
| 358 | my $min_value_for_unimodality_quotient = List::Util::min @cluster_unimodality_quotients; | ||||||
| 359 | print "\nCluster unimodality quotients: @cluster_unimodality_quotients\n" if $self->{_terminal_output}; | ||||||
| 360 | die "\n\nBailing out!\n" . | ||||||
| 361 | "It does not look like these iterations will lead to a good clustering result.\n" . | ||||||
| 362 | "Program terminating. Try running again.\n" | ||||||
| 363 | if defined($previous_min_value_for_unimodality_quotient) | ||||||
| 364 | && ($min_value_for_unimodality_quotient < 0.4) | ||||||
| 365 | && ($min_value_for_unimodality_quotient < (0.5 * $previous_min_value_for_unimodality_quotient)); | ||||||
| 366 | if ( $min_value_for_unimodality_quotient < 0.5 ) { | ||||||
| 367 | $unimodal_correction_flag = 1; | ||||||
| 368 | print "\nApplying unimodality correction:\n\n" if $self->{_terminal_output}; | ||||||
| 369 | my @sorted_cluster_indexes = | ||||||
| 370 | sort {$cluster_unimodality_quotients[$b] <=> $cluster_unimodality_quotients[$a]} 0..@clusters-1; | ||||||
| 371 | my @newclusters; | ||||||
| 372 | foreach my $cluster_index (0..@clusters - 1) { | ||||||
| 373 | push @newclusters, $clusters[$sorted_cluster_indexes[$cluster_index]]; | ||||||
| 374 | } | ||||||
| 375 | @clusters = @newclusters; | ||||||
| 376 | my $worst_cluster = pop @clusters; | ||||||
| 377 | print "\nthe worst cluster: @$worst_cluster\n" if $self->{_terminal_output}; | ||||||
| 378 | my $second_worst_cluster = pop @clusters; | ||||||
| 379 | print "\nthe second worst cluster: @$second_worst_cluster\n" if $self->{_terminal_output}; | ||||||
| 380 | push @$worst_cluster, @$second_worst_cluster; | ||||||
| 381 | fisher_yates_shuffle($worst_cluster); | ||||||
| 382 | my @first_half = @$worst_cluster[0 .. int(scalar(@$worst_cluster)/2) - 1]; | ||||||
| 383 | my @second_half = @$worst_cluster[int(scalar(@$worst_cluster)/2) .. @$worst_cluster - 1]; | ||||||
| 384 | push @clusters, \@first_half; | ||||||
| 385 | push @clusters, \@second_half; | ||||||
| 386 | if ($self->{_terminal_output}) { | ||||||
| 387 | print "\n\nShowing the clusters obtained after applying the unimodality correction:\n"; | ||||||
| 388 | display_clusters(\@clusters); | ||||||
| 389 | } | ||||||
| 390 | } | ||||||
| 391 | if (@{$self->{_reconstruction_error_as_a_function_of_iteration}} > 0) { | ||||||
| 392 | my $last_recon_error = pop @{$self->{_reconstruction_error_as_a_function_of_iteration}}; | ||||||
| 393 | push @{$self->{_reconstruction_error_as_a_function_of_iteration}}, $last_recon_error; | ||||||
| 394 | if (($last_recon_error - $total_reconstruction_error_this_iteration) | ||||||
| 395 | < $self->{_delta_normalized_error}) { | ||||||
| 396 | push @{$self->{_reconstruction_error_as_a_function_of_iteration}}, | ||||||
| 397 | $total_reconstruction_error_this_iteration; | ||||||
| 398 | last; | ||||||
| 399 | } | ||||||
| 400 | } | ||||||
| 401 | push @{$self->{_reconstruction_error_as_a_function_of_iteration}}, | ||||||
| 402 | $total_reconstruction_error_this_iteration; | ||||||
| 403 | $iteration_index++; | ||||||
| 404 | $previous_min_value_for_unimodality_quotient = $min_value_for_unimodality_quotient; | ||||||
| 405 | } # end of while loop on iteration_index | ||||||
| 406 | $self->{_num_iterations_actually_used} = | ||||||
| 407 | scalar @{$self->{_reconstruction_error_as_a_function_of_iteration}}; | ||||||
| 408 | if ($self->{_terminal_output}) { | ||||||
| 409 | print "\nIterations of the main loop terminated at iteration number $iteration_index.\n"; | ||||||
| 410 | print "Will now invoke graph partitioning to discover dominant clusters and to\n" . | ||||||
| 411 | "merge small clusters.\n\n" if $self->{_cluster_search_multiplier} > 1; | ||||||
| 412 | print "Total reconstruction error as a function of iterations: " . | ||||||
| 413 | "@{$self->{_reconstruction_error_as_a_function_of_iteration}}"; | ||||||
| 414 | } | ||||||
| 415 | # now merge sub-clusters if cluster_search_multiplier > 1 | ||||||
| 416 | my @final_clusters; | ||||||
| 417 | if ($self->{_cluster_search_multiplier} > 1) { | ||||||
| 418 | print "\n\nInvoking recursive graph partitioning to merge small clusters\n\n"; | ||||||
| 419 | my @array_of_partitioned_cluster_groups = (\@clusters); | ||||||
| 420 | my @partitioned_cluster_groups; | ||||||
| 421 | my $how_many_clusters_looking_for = $self->{_K}; | ||||||
| 422 | while (scalar(@final_clusters) < $self->{_K}) { | ||||||
| 423 | @partitioned_cluster_groups = | ||||||
| 424 | $self->graph_partition(shift @array_of_partitioned_cluster_groups, | ||||||
| 425 | $how_many_clusters_looking_for ); | ||||||
| 426 | if (@{$partitioned_cluster_groups[0]} == 1) { | ||||||
| 427 | my $singular_cluster = shift @{$partitioned_cluster_groups[0]}; | ||||||
| 428 | push @final_clusters, $singular_cluster; | ||||||
| 429 | $how_many_clusters_looking_for--; | ||||||
| 430 | push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[1]; | ||||||
| 431 | } elsif (@{$partitioned_cluster_groups[1]} == 1) { | ||||||
| 432 | my $singular_cluster = shift @{$partitioned_cluster_groups[1]}; | ||||||
| 433 | push @final_clusters, $singular_cluster; | ||||||
| 434 | $how_many_clusters_looking_for--; | ||||||
| 435 | push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[0]; | ||||||
| 436 | } else { | ||||||
| 437 | push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[0]; | ||||||
| 438 | push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[1]; | ||||||
| 439 | } | ||||||
| 440 | } | ||||||
| 441 | my @data_clustered; | ||||||
| 442 | foreach my $cluster (@final_clusters) { | ||||||
| 443 | push @data_clustered, @$cluster; | ||||||
| 444 | } | ||||||
| 445 | unless (scalar(@data_clustered) == scalar(@{$self->{_data_tags}})) { | ||||||
| 446 | $self->{_final_clusters} = \@final_clusters; | ||||||
| 447 | my %data_clustered = map {$_ => 1} @data_clustered; | ||||||
| 448 | my @data_tags_not_clustered = | ||||||
| 449 | grep {$_} map {exists $data_clustered{$_} ? undef : $_} @{$self->{_data_tags}}; | ||||||
| 450 | if ($self->{_terminal_output}) { | ||||||
| 451 | print "\n\nNot all data clustered. The most reliable clusters found by graph partitioning:\n"; | ||||||
| 452 | display_clusters(\@final_clusters); | ||||||
| 453 | print "\n\nData not yet clustered:\n\n@data_tags_not_clustered\n"; | ||||||
| 454 | } | ||||||
| 455 | if ($self->{_data_dimensions} == 3) { | ||||||
| 456 | $visualization_msg = "$self->{_K}_best_clusters_produced_by_graph_partitioning"; | ||||||
| 457 | $self->visualize_clusters_on_sphere($visualization_msg, \@final_clusters) | ||||||
| 458 | if $self->{_visualize_each_iteration}; | ||||||
| 459 | $self->visualize_clusters_on_sphere($visualization_msg, \@final_clusters, "png") | ||||||
| 460 | if $self->{_make_png_for_each_iteration}; | ||||||
| 461 | } | ||||||
| 462 | my %data_tags_to_cluster_label_hash; | ||||||
| 463 | foreach my $i (0..@final_clusters-1) { | ||||||
| 464 | map {$data_tags_to_cluster_label_hash{$_} = $i} @{$final_clusters[$i]}; | ||||||
| 465 | } | ||||||
| 466 | $self->{_data_tags_to_cluster_label_hash} = \%data_tags_to_cluster_label_hash; | ||||||
| 467 | foreach my $tag (@data_tags_not_clustered) { | ||||||
| 468 | my $which_cluster = $self->which_cluster_for_new_element($tag); | ||||||
| 469 | $self->{_data_tags_to_cluster_label_hash}->{$tag} = $which_cluster; | ||||||
| 470 | } | ||||||
| 471 | die "Some data elements are still missing from the final tally" | ||||||
| 472 | unless scalar(keys %{$self->{_data_tags_to_cluster_label_hash}}) == | ||||||
| 473 | scalar(@{$self->{_data_tags}}); | ||||||
| 474 | my @new_final_clusters; | ||||||
| 475 | map { foreach my $ele (keys %{$self->{_data_tags_to_cluster_label_hash}}) { | ||||||
| 476 | push @{$new_final_clusters[$_]}, $ele | ||||||
| 477 | if $self->{_data_tags_to_cluster_label_hash}->{$ele} == $_ } | ||||||
| 478 | } 0..$self->{_K}-1; | ||||||
| 479 | if ($self->{_debug}) { | ||||||
| 480 | print "\ndisplaying the final clusters after accounting for unclustered data:\n"; | ||||||
| 481 | display_clusters(\@new_final_clusters); | ||||||
| 482 | } | ||||||
| 483 | $self->{_final_clusters} = \@new_final_clusters; | ||||||
| 484 | @final_clusters = @new_final_clusters; | ||||||
| 485 | } | ||||||
| 486 | } else { | ||||||
| 487 | @final_clusters = @clusters; | ||||||
| 488 | } | ||||||
| 489 | print "\n\nDisplaying final clustering results:\n\n" if $self->{_terminal_output}; | ||||||
| 490 | display_clusters(\@final_clusters) if $self->{_terminal_output}; | ||||||
| 491 | return \@final_clusters; | ||||||
| 492 | } | ||||||
| 493 | |||||||
| 494 | sub display_reconstruction_errors_as_a_function_of_iterations { | ||||||
| 495 | my $self = shift; | ||||||
| 496 | print "\n\nNumber of iterations used in Phase 1: $self->{_num_iterations_actually_used}\n"; | ||||||
| 497 | print "\nTotal reconstruction error as a function of iterations in Phase 1: " . | ||||||
| 498 | "@{$self->{_reconstruction_error_as_a_function_of_iteration}}\n"; | ||||||
| 499 | } | ||||||
| 500 | |||||||
| 501 | sub set_termination_reconstruction_error_threshold { | ||||||
| 502 | my $self = shift; | ||||||
| 503 | my $all_ref_vecs = shift; | ||||||
| 504 | my @mean_vecs = @$all_ref_vecs; | ||||||
| 505 | my $sum_of_mean_magnitudes = reduce {$a+$b} map { my $result = transpose($_) * $_; | ||||||
| 506 | my @result = $result->as_list; | ||||||
| 507 | sqrt($result[0]) | ||||||
| 508 | } @mean_vecs; | ||||||
| 509 | $self->{_scale_factor} = $sum_of_mean_magnitudes / @mean_vecs; | ||||||
| 510 | $self->{_delta_normalized_error} = ($sum_of_mean_magnitudes / @mean_vecs ) * | ||||||
| 511 | $self->{_delta_reconstruction_error}; | ||||||
| 512 | } | ||||||
| 513 | |||||||
| 514 | # This method is called only in the `unless' clause at the end of the main | ||||||
| 515 | # linear_manifold_clusterer() method. It is called to find the cluster labels for | ||||||
| 516 | # those data elements that were left unclustered by the main part of the algorithm | ||||||
| 517 | # when graph partitioning is used to merge similar sub-clusters. The operating logic | ||||||
| 518 | # here is that graph partition yields K main clusters even though each main cluster | ||||||
| 519 | # may not yet be fully populated. | ||||||
sub which_cluster_for_new_element {
    my $self = shift;
    my $data_tag = shift;
    # Lazy one-time initialization, performed only on the first call: compute and
    # cache, for each of the K final clusters, the matrix of trailing eigenvectors
    # (the directions orthogonal to the cluster's P-dimensional subspace) and the
    # cluster mean used as the subspace's reference vector.
    unless (@{$self->{_final_trailing_eigenvec_matrices_for_all_subspaces}} > 0) {
        my @trailing_eigenvec_matrices_for_all_subspaces;
        my @reference_vecs_for_all_subspaces;
        foreach my $cluster (@{$self->{_final_clusters}}) {
            my ($mean, $covariance) = $self->estimate_mean_and_covariance($cluster);
            my ($eigenvecs, $eigenvals) = $self->eigen_analysis_of_covariance($covariance);
            # Keep eigenvectors at positions _P .. d-1 (the trailing ones).
            my @trailing_eigenvecs = @{$eigenvecs}[$self->{_P} .. $self->{_data_dimensions}-1];
            my $trailing_eigenvec_matrix = Math::GSL::Matrix->new($self->{_data_dimensions},
                                                                  scalar(@trailing_eigenvecs));
            # One trailing eigenvector per column of the matrix.
            foreach my $j (0..@trailing_eigenvecs-1) {
                $trailing_eigenvec_matrix->set_col($j, $trailing_eigenvecs[$j]);
            }
            push @trailing_eigenvec_matrices_for_all_subspaces,$trailing_eigenvec_matrix;
            push @reference_vecs_for_all_subspaces, $mean;
        }
        $self->{_final_trailing_eigenvec_matrices_for_all_subspaces} =
                                         \@trailing_eigenvec_matrices_for_all_subspaces;
        $self->{_final_reference_vecs_for_all_subspaces} = \@reference_vecs_for_all_subspaces;
    }
    # Assign the element to the subspace that reconstructs it with the smallest
    # error; the returned value is the index of that cluster/subspace.
    my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
    $data_vec->set_col(0, $self->{_data_hash}->{$data_tag});
    my @errors = map {$self->reconstruction_error($data_vec,
                              $self->{_final_trailing_eigenvec_matrices_for_all_subspaces}->[$_],
                              $self->{_final_reference_vecs_for_all_subspaces}->[$_])}
                 0 .. $self->{_K}-1;
    my ($minval, $index_for_closest_subspace) = minimum(\@errors);
    return $index_for_closest_subspace;
}
| 553 | |||||||
# Spectral partitioning of the similarity graph whose nodes are the sub-clusters.
# Edge weights combine two similarity measures: (1) how well each cluster's data is
# reconstructed in the other cluster's subspace, and (2) the distance between the
# two cluster means.  The graph is split by the signs of the components of the
# eigenvector belonging to the second-smallest eigenvalue of the symmetric
# normalized Laplacian (the Fiedler-vector analog).  Returns a pair of array refs
# of clusters: for small K, (newly finalized clusters, clusters needing further
# processing); otherwise (positive-side clusters, negative-side clusters).
sub graph_partition {
    my $self = shift;
    my $clusters = shift;                        # array ref of clusters (each an array ref of tags)
    my $how_many_clusters_looking_for = shift;   # the K the caller ultimately wants
    print "\n\nGraph partitioning looking for $how_many_clusters_looking_for clusters\n\n";
    my $num_nodes = scalar @$clusters;
    my $W = Math::GSL::Matrix->new($num_nodes,$num_nodes);    # similarity (adjacency) matrix
    my $D = Math::GSL::Matrix->new($num_nodes,$num_nodes);    # degree matrix
    $D->identity;
    my $neg_sqrt_of_D = Math::GSL::Matrix->new($num_nodes,$num_nodes);   # will hold D^(-1/2)
    $neg_sqrt_of_D->identity;
    my @subspace_construction_errors;
    my @trailing_eigenvec_matrices_for_all_subspaces;
    my @reference_vecs_for_all_subspaces;
    # For every sub-cluster, estimate its mean and the trailing eigenvectors of its
    # covariance (the directions orthogonal to the cluster's P-dimensional subspace).
    foreach my $cluster (@$clusters) {
        my ($mean, $covariance) = $self->estimate_mean_and_covariance($cluster);
        my ($eigenvecs, $eigenvals) = $self->eigen_analysis_of_covariance($covariance);
        my @trailing_eigenvecs = @{$eigenvecs}[$self->{_P} .. $self->{_data_dimensions}-1];
        my @trailing_eigenvals = @{$eigenvals}[$self->{_P} .. $self->{_data_dimensions}-1];
        # Sum of magnitudes of the trailing eigenvalues = variance NOT captured by
        # the cluster's subspace.
        my $subspace_construction_error = reduce {abs($a) + abs($b)} @trailing_eigenvals;
        push @subspace_construction_errors, $subspace_construction_error;
        my $trailing_eigenvec_matrix = Math::GSL::Matrix->new($self->{_data_dimensions},
                                                              scalar(@trailing_eigenvecs));
        foreach my $j (0..@trailing_eigenvecs-1) {
            print "trailing eigenvec column: @{$trailing_eigenvecs[$j]}\n" if $self->{_debug};
            $trailing_eigenvec_matrix->set_col($j, $trailing_eigenvecs[$j]);
        }
        push @trailing_eigenvec_matrices_for_all_subspaces,$trailing_eigenvec_matrix;
        push @reference_vecs_for_all_subspaces, $mean;
    }
    # We consider the similarity matrix W to be a sum of two parts W_recon_based and
    # W_dist_bet_means based.  IMPORTANT: For coding reasons, we first store the two
    # similarity measures separately in W_recon_based and W_dist_bet_means based.  Our
    # goal is to fill up these component matrices with the raw values while at the
    # same time collecting information needed for normalizing these two separate
    # measures of similarity.
    my $W_reconstruction_error_based = Math::GSL::Matrix->new($num_nodes,$num_nodes);
    my $W_dist_between_means_based = Math::GSL::Matrix->new($num_nodes,$num_nodes);
    my @all_pairwise_reconstruction_errors;
    my @all_dist_between_means_errors;
    foreach my $i (0..$num_nodes-1) {
        foreach my $j (0..$num_nodes-1) {
            my ($recon_error_similarity, $dist_bet_means) = $self->pairwise_cluster_similarity(
                                               $clusters->[$i],
                                               $trailing_eigenvec_matrices_for_all_subspaces[$i],
                                               $reference_vecs_for_all_subspaces[$i],
                                               $clusters->[$j],
                                               $trailing_eigenvec_matrices_for_all_subspaces[$j],
                                               $reference_vecs_for_all_subspaces[$j]);
            $W_reconstruction_error_based->set_elem($i, $j, $recon_error_similarity);
            $W_dist_between_means_based->set_elem($i, $j, $dist_bet_means);
            push @all_pairwise_reconstruction_errors, $recon_error_similarity;
            push @all_dist_between_means_errors, $dist_bet_means;
        }
    }
    # Normalize each raw measure by its average over all node pairs so the two
    # measures end up on comparable scales.
    my $recon_error_normalizer = (reduce {$a + $b} @all_pairwise_reconstruction_errors) /
                                                  (scalar @all_pairwise_reconstruction_errors);
    my $dist_bet_means_based_normalizer = (reduce {$a + $b} @all_dist_between_means_errors ) /
                                                  (scalar @all_dist_between_means_errors );
    die "\n\nBailing out!\n" .
        "Dealing with highly defective clusters. Try again.\n"
        if ($recon_error_normalizer == 0) || ($dist_bet_means_based_normalizer == 0);
    # Convert the raw dissimilarities into similarities with a decaying exponential.
    foreach my $i (0..$num_nodes-1) {
        foreach my $j (0..$num_nodes-1) {
            my $recon_val = $W_reconstruction_error_based->get_elem($i,$j);
            my $new_recon_val = exp( -1.0 * $recon_val / $recon_error_normalizer );
            $W_reconstruction_error_based->set_elem($i,$j,$new_recon_val);
            my $mean_dist_val = $W_dist_between_means_based->get_elem($i,$j);
            my $new_mean_dist_val = exp( -1.0 * $mean_dist_val / $dist_bet_means_based_normalizer );
            $W_dist_between_means_based->set_elem($i,$j,$new_mean_dist_val);
        }
    }
    $W = $W_reconstruction_error_based + $W_dist_between_means_based;
    if ($self->{_debug}) {
        print "\nDisplaying the similarity matrix W for the cluster graph:\n";
        display_matrix($W) if $self->{_terminal_output};
    }
    # Node degrees: the degree of node i is the sum of row i of W.
    my $add_all_columns = Math::GSL::Matrix->new($num_nodes,1);
    foreach my $col (0..$num_nodes-1) {
        $add_all_columns += $W->col($col);
    }
    foreach my $i (0..$num_nodes-1) {
        $D->set_elem($i,$i, $add_all_columns->get_elem($i,0));
        $neg_sqrt_of_D->set_elem($i,$i, 1.0 / sqrt($add_all_columns->get_elem($i,0)));
    }
    # the Laplacian matrix:
    my $Laplacian = $D - $W;
    # the Symmetric Normalized Laplacian matrix A:
    my $A = $neg_sqrt_of_D * $Laplacian * $neg_sqrt_of_D;
    # Zero out numerically insignificant entries before the eigendecomposition.
    foreach my $i (0..$num_nodes-1) {
        foreach my $j (0..$num_nodes-1) {
            $A->set_elem($i,$j,0) if abs($A->get_elem($i,$j)) < 0.01;
        }
    }
    if ($self->{_terminal_output}) {
        print "\nDisplaying the Symmetric Normalized Laplacian matrix A:\n" .
              "A = neg_sqrt(D) * Laplacian_matrix * neg_sqrt(D)\n";
        display_matrix( $A );
    }
    my ($eigenvalues, $eigenvectors) = $A->eigenpair;
    my $num_of_eigens = @$eigenvalues;
    my $largest_eigen_index = 0;
    my $smallest_eigen_index = 0;
    if ($self->{_debug2}) {
        print "Eigenvalue 0: $eigenvalues->[0]\n";
        foreach my $i (1..$num_of_eigens-1) {
            $largest_eigen_index = $i if $eigenvalues->[$i] > $eigenvalues->[$largest_eigen_index];
            $smallest_eigen_index = $i if $eigenvalues->[$i] < $eigenvalues->[$smallest_eigen_index];
            print "Eigenvalue $i: $eigenvalues->[$i]\n";
        }
        print "\nlargest eigen index: $largest_eigen_index\n";
        print "\nsmallest eigen index: $smallest_eigen_index\n\n";
    }
    my @all_my_eigenvecs;
    foreach my $i (0..$num_of_eigens-1) {
        my @vec = $eigenvectors->[$i]->as_list;
        my @eigenvec;
        # Each element stringifies as "[magnitude,angle]".  NOTE(review): the parse
        # below assumes an angle of "0" or "pi" marks a real-valued component
        # (positive or negative respectively) -- confirm against Math::GSL docs.
        foreach my $ele (@vec) {
            my ($mag, $theta) = $ele =~ /\[(\d*\.?\d*e?[+-]?\d*),(\S+)\]/;
            if ($theta eq "0") {
                push @eigenvec, $mag;
            } elsif ($theta eq "pi") {
                push @eigenvec, -1.0 * $mag;
            } else {
                die "Eigendecomposition produced a complex eigenvector!";
            }
        }
        print "Eigenvector $i: @eigenvec\n" if $self->{_debug2};
        push @all_my_eigenvecs, \@eigenvec;
    }
    if ($self->{_debug2}) {
        my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list;
        print "\nLargest eigenvector of A: @largest_eigen_vec\n";
    }
    # Order the eigenvectors by eigenvalue, largest first.
    my @sorted_eigenvec_indexes = sort {$eigenvalues->[$b] <=> $eigenvalues->[$a]} 0..@all_my_eigenvecs-1;
    print "sorted eigenvec indexes for A: @sorted_eigenvec_indexes\n" if $self->{_debug2};
    my @sorted_eigenvecs;
    my @sorted_eigenvals;
    foreach my $i (0..@sorted_eigenvec_indexes-1) {
        $sorted_eigenvecs[$i] = $all_my_eigenvecs[$sorted_eigenvec_indexes[$i]];
        $sorted_eigenvals[$i] = $eigenvalues->[$sorted_eigenvec_indexes[$i]];
    }
    if ($self->{_debug2}) {
        print "\nHere come sorted eigenvectors for A --- from the largest to the smallest:\n";
        foreach my $i (0..@sorted_eigenvecs-1) {
            print "eigenvec: @{$sorted_eigenvecs[$i]} eigenvalue: $sorted_eigenvals[$i]\n";
        }
    }
    # The eigenvector for the second-smallest eigenvalue (second-to-last in the
    # descending order above) partitions the graph by the signs of its components.
    my $best_partitioning_eigenvec = $sorted_eigenvecs[@sorted_eigenvec_indexes-2];
    print "\nBest graph partitioning eigenvector: @$best_partitioning_eigenvec\n" if $self->{_terminal_output};
    my $how_many_positive = reduce {$a + $b} map {$_ > 0 ? 1 : 0 } @$best_partitioning_eigenvec;
    my $how_many_negative = scalar(@$best_partitioning_eigenvec) - $how_many_positive;
    print "Have $how_many_positive positive and $how_many_negative negative elements in the partitioning vec\n"
        if $self->{_terminal_output};
    if ($how_many_clusters_looking_for <= 3) {
        # When only a few final clusters are wanted, peel off one final cluster per
        # call: the side of the partition containing a single sub-cluster (or at
        # most _cluster_search_multiplier sub-clusters, merged together) becomes a
        # final cluster; the remaining sub-clusters go back for further processing.
        my @merged_cluster;
        my $final_cluster;
        my @newclusters;
        if ($how_many_positive == 1) {
            foreach my $i (0..@$clusters-1) {
                if ($best_partitioning_eigenvec->[$i] > 0) {
                    $final_cluster = $clusters->[$i];
                } else {
                    push @newclusters, $clusters->[$i];
                }
            }
            return ([$final_cluster], \@newclusters);
        } elsif ($how_many_negative == 1) {
            foreach my $i (0..@$clusters-1) {
                if ($best_partitioning_eigenvec->[$i] < 0) {
                    $final_cluster = $clusters->[$i];
                } else {
                    push @newclusters, $clusters->[$i];
                }
            }
            return ([$final_cluster], \@newclusters);
        } elsif ($how_many_positive <= $self->{_cluster_search_multiplier}) {
            foreach my $i (0..@$clusters-1) {
                if ($best_partitioning_eigenvec->[$i] > 0) {
                    push @merged_cluster, @{$clusters->[$i]};
                } else {
                    push @newclusters, $clusters->[$i];
                }
            }
            return ([\@merged_cluster], \@newclusters);
        } elsif ($how_many_negative <= $self->{_cluster_search_multiplier}) {
            foreach my $i (0..@$clusters-1) {
                if ($best_partitioning_eigenvec->[$i] < 0) {
                    push @merged_cluster, @{$clusters->[$i]};
                } else {
                    push @newclusters, $clusters->[$i];
                }
            }
            return ([\@merged_cluster], \@newclusters);
        } else {
            die "\n\nBailing out!\n\n" .
                "No consensus support for dominant clusters in the graph partitioning step\n" .
                "of the algorithm. This can be caused by bad random selection of initial\n" .
                "cluster centers. Please run this program again.\n";
        }
    } else {
        # For larger K, simply split the cluster graph into its positive-sign and
        # negative-sign halves.
        my @positive_clusters;
        my @negative_clusters;
        foreach my $i (0..@$clusters-1) {
            if ($best_partitioning_eigenvec->[$i] > 0) {
                push @positive_clusters, $clusters->[$i];
            } else {
                push @negative_clusters, $clusters->[$i];
            }
        }
        return (\@positive_clusters, \@negative_clusters);
    }
}
| 767 | |||||||
# Computes two raw measures of dissimilarity between two clusters: (1) the total
# reconstruction error obtained when each cluster's data elements are projected
# into the OTHER cluster's subspace, and (2) the squared distance between the two
# cluster means.  Smaller values mean more similar clusters.  Returns the two raw
# measures as a list; the caller (graph_partition) normalizes them.
sub pairwise_cluster_similarity {
    my $self = shift;
    my $cluster1 = shift;                            # array ref of data tags
    my $trailing_eigenvec_matrix_cluster1 = shift;   # basis orthogonal to cluster 1's subspace
    my $reference_vec_cluster1 = shift;              # mean of cluster 1 (GSL column vector)
    my $cluster2 = shift;
    my $trailing_eigenvec_matrix_cluster2 = shift;
    my $reference_vec_cluster2 = shift;
    # (The original declared an unused accumulator here; removed.)
    # How well does cluster 2's subspace explain cluster 1's data?
    my @errors_for_1_on_2 = map {my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
                                 $data_vec->set_col(0, $self->{_data_hash}->{$_});
                                 $self->reconstruction_error($data_vec,
                                                             $trailing_eigenvec_matrix_cluster2,
                                                             $reference_vec_cluster2)}
                            @$cluster1;
    # ... and how well does cluster 1's subspace explain cluster 2's data?
    my @errors_for_2_on_1 = map {my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
                                 $data_vec->set_col(0, $self->{_data_hash}->{$_});
                                 $self->reconstruction_error($data_vec,
                                                             $trailing_eigenvec_matrix_cluster1,
                                                             $reference_vec_cluster1)}
                            @$cluster2;
    # Sum of magnitudes of the per-element errors in each direction.
    my $type_1_error = reduce {abs($a) + abs($b)} @errors_for_1_on_2;
    my $type_2_error = reduce {abs($a) + abs($b)} @errors_for_2_on_1;
    my $total_reconstruction_error = $type_1_error + $type_2_error;
    # Squared Euclidean distance between the means: (m1-m2)' * (m1-m2), a 1x1 matrix.
    my $diff_between_the_means = $reference_vec_cluster1 - $reference_vec_cluster2;
    my $dist_squared = transpose($diff_between_the_means) * $diff_between_the_means;
    my @dist_squared_as_list = $dist_squared->as_list();
    my $dist_between_means_based_error = shift @dist_squared_as_list;
    return ($total_reconstruction_error, $dist_between_means_based_error);
}
| 798 | |||||||
| 799 | # delta ball | ||||||
# Measures how unimodal a cluster is: the fraction of the cluster's data elements
# that fall, in EVERY dimension, inside a "delta ball" centered on the cluster mean
# whose radius along each dimension is a fraction of that dimension's value range.
sub cluster_unimodality_quotient {
    my ($self, $cluster, $mean) = @_;
    # Radius of the delta ball along each dimension, on the scale of the data.
    my $delta = 0.4 * $self->{_scale_factor};
    my @mean_coords = $mean->as_list;
    # For each tag, count in how many dimensions it lies inside the delta ball.
    my %dims_inside_for;
    foreach my $dimen (0..$self->{_data_dimensions}-1) {
        my @values_along_dimen = map { $self->{_data_hash}->{$_}->[$dimen] } @$cluster;
        my $range = List::Util::max(@values_along_dimen) -
                    List::Util::min(@values_along_dimen);
        my $center = $mean_coords[$dimen];
        my ($lo, $hi) = ($center - $delta * $range, $center + $delta * $range);
        foreach my $tag (@$cluster) {
            my $val = $self->{_data_hash}->{$tag}->[$dimen];
            $dims_inside_for{$tag}++ if ($val > $lo) && ($val < $hi);
        }
    }
    # A tag is inside the ball only if it passed the range test in all dimensions
    # (equivalent to intersecting the per-dimension tag sets).
    my $dims = $self->{_data_dimensions};
    my $how_many_inside = grep { $dims_inside_for{$_} == $dims } keys %dims_inside_for;
    return $how_many_inside / scalar(@$cluster);
}
| 835 | |||||||
# Searches for the reference vector (subspace origin) of a cluster that yields the
# smallest total reconstruction error over all data points.  Candidates are the
# cluster mean plus 100 points drawn uniformly at random from the cluster's
# per-dimension bounding box.  Returns the winning candidate as an array ref of
# coordinates.
sub find_best_ref_vector {
    my $self = shift;
    my $cluster = shift;                     # array ref of data tags
    my $trailing_eigenvec_matrix = shift;    # basis orthogonal to the cluster's subspace
    my $mean = shift;                        # a GSL matrix ref (cluster mean)
    # Per-dimension bounding box of the cluster's data.
    my @min_bounds;
    my @max_bounds;
    my @ranges;
    foreach my $dimen (0..$self->{_data_dimensions}-1) {
        my @values = map {$_->[$dimen]} map {$self->{_data_hash}->{$_}} @$cluster;
        my ($min, $max) = (List::Util::min(@values), List::Util::max(@values));
        push @min_bounds, $min;
        push @max_bounds, $max;
        push @ranges, $max - $min;
    }
    # These prints were unconditional in the original; they are diagnostics, so
    # gate them behind the debug flag.
    if ($self->{_debug}) {
        print "min bounds are: @min_bounds\n";
        print "max bounds are: @max_bounds\n";
    }
    my $max_iterations = 100;
    my @random_points;
    my $iteration = 0;
    while ($iteration++ < $max_iterations) {
        my @coordinate_vec;
        foreach my $dimen (0..$self->{_data_dimensions}-1) {
            push @coordinate_vec, $min_bounds[$dimen] + rand($ranges[$dimen]);
        }
        push @random_points, \@coordinate_vec;
    }
    if ($self->{_debug}) {
        print "\nrandom points\n";
        map {print "@$_\n"} @random_points;
    }
    # The cluster mean is always one of the candidates.
    my @mean = $mean->as_list;
    unshift @random_points, \@mean;
    # BUGFIX: the original located the winner with
    #     List::Util::first { $_ == $min } @reconstruction_errors
    # which returns the minimum *value*, not its positional index, and then used
    # that value as an index into @random_points (additionally off by one because
    # the mean's error had been shifted off the list).  Track the index of the
    # smallest error directly instead.
    my $best_index;
    my $smallest_error;
    foreach my $candidate_index (0..$#random_points) {
        my $ref_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
        $ref_vec->set_col(0, $random_points[$candidate_index]);
        my $reconstruction_error_for_a_ref_vec = 0;
        foreach my $data_tag (@{$self->{_data_tags}}) {
            my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
            $data_vec->set_col(0, $self->{_data_hash}->{$data_tag});
            my $error = $self->reconstruction_error($data_vec,$trailing_eigenvec_matrix,$ref_vec);
            $reconstruction_error_for_a_ref_vec += $error;
        }
        if ((!defined $smallest_error) || ($reconstruction_error_for_a_ref_vec < $smallest_error)) {
            $smallest_error = $reconstruction_error_for_a_ref_vec;
            $best_index = $candidate_index;
        }
    }
    return $random_points[$best_index];
}
| 889 | |||||||
## The reconstruction error is the squared length of the perpendicular dropped from
## a data point X onto the hyperplane that defines a given subspace on the
## manifold: (X-r)' * E * E' * (X-r), where E holds the trailing eigenvectors and
## r is the subspace's reference vector.
sub reconstruction_error {
    my ($self, $data_vec, $trailing_eigenvecs, $ref_vec) = @_;
    my $displacement = $data_vec - $ref_vec;
    my $error_squared = transpose($displacement) * $trailing_eigenvecs *
                        transpose($trailing_eigenvecs) * $displacement;
    # The product is a 1x1 matrix; pull out its single entry as a plain scalar.
    my ($error_squared_as_scalar) = $error_squared->as_list();
    return $error_squared_as_scalar;
}
| 903 | |||||||
| 904 | # Returns a set of KM random integers. These serve as indices to reach into the data | ||||||
| 905 | # array. A data element whose index is one of the random numbers returned by this | ||||||
| 906 | # routine serves as an initial cluster center. Note the quality check it runs on the | ||||||
| 907 | # list of the random integers constructed. We first make sure that all the random | ||||||
| 908 | # integers returned are different. Subsequently, we carry out a quality assessment | ||||||
| 909 | # of the random integers constructed. This quality measure consists of the ratio of | ||||||
| 910 | # the values spanned by the random integers to the value of N, the total number of | ||||||
| 911 | # data points to be clustered. Currently, if this ratio is less than 0.3, we discard | ||||||
| 912 | # the K integers and try again. | ||||||
# Returns K distinct random indices into the data array; the data elements at
# those indices serve as the initial cluster centers.  The whole draw is retried
# until the span of the chosen indices covers more than 30% of the data store,
# which guards against a badly bunched-up random selection.
sub initialize_cluster_centers {
    my $self = shift;
    my $K = shift;                # set to the parameter KM at the call site
    my $data_store_size = shift;
    my @cluster_center_indices;
    while (1) {
        @cluster_center_indices = ();
        foreach my $i (0..$K-1) {
            my $candidate = int rand( $data_store_size );
            # BUGFIX: re-draw until the candidate differs from ALL previously
            # chosen indices.  The original re-drew while comparing against one
            # earlier index at a time, so a fresh draw could silently collide with
            # an index it had already been checked against, yielding duplicate
            # cluster centers.
            $candidate = int rand( $data_store_size )
                while any { $_ == $candidate } @cluster_center_indices;
            push @cluster_center_indices, $candidate;
        }
        # Quality check: the chosen indices must span a reasonable fraction of the
        # data store; otherwise discard them and try again.
        my ($min, $max) = (List::Util::min(@cluster_center_indices),
                           List::Util::max(@cluster_center_indices));
        my $quality = ($max - $min) / $data_store_size;
        last if $quality > 0.3;
    }
    return @cluster_center_indices;
}
| 935 | |||||||
# Forms the initial clusters: every data sample is assigned to whichever of the
# given cluster centers it is closest to (Euclidean distance).  Returns a ref to
# an array of clusters, each cluster an array ref of data tags.
sub assign_data_to_clusters_initial {
    my ($self, $centers_ref) = @_;
    my @clusters;
    foreach my $tag (@{$self->{_data_tags}}) {
        my @distances_to_centers = map { $self->distance($tag, $_) } @$centers_ref;
        # minimum() returns (smallest value, its index); the index picks the cluster.
        my ($min_dist, $winning_center_index) = minimum( \@distances_to_centers );
        push @{$clusters[$winning_center_index]}, $tag;
    }
    return \@clusters;
}
| 954 | |||||||
# Euclidean distance between a data point, specified by its symbolic name in the
# master datafile, and a point (such as a cluster center) given as an array ref of
# coordinates.
sub distance {
    my ($self, $tag, $point_ref) = @_;
    my @point_a = @{$self->{_data_hash}->{$tag}};
    my @point_b = @{$point_ref};
    die "wrong data types for distance calculation\n" unless @point_a == @point_b;
    my $squared_sum = 0;
    $squared_sum += ($point_a[$_] - $point_b[$_]) ** 2 for 0 .. $#point_a;
    return sqrt $squared_sum;
}
| 972 | |||||||
# Writes each cluster to its own file (cluster0.txt, cluster1.txt, ...), with the
# data tags in each file sorted lexicographically and separated by spaces.  Any
# cluster*.txt files left over from a previous run are removed first.
sub write_clusters_to_files {
    my $self = shift;
    my $clusters = shift;
    my @clusters = @$clusters;
    unlink glob "cluster*.txt";
    foreach my $i (0..@clusters-1) {
        my $filename = "cluster" . $i . ".txt";
        print "\nWriting cluster $i to file $filename\n" if $self->{_terminal_output};
        # BUGFIX: the original piped the space-separated tags (written without any
        # newlines) through the external `sort` command, so sort saw a single line
        # and sorted nothing; it also used a bareword filehandle and a 2-arg shell
        # pipe open.  Sort the tags in Perl and write with a lexical handle.
        open my $fh, '>', $filename or die "Unable to open file: $!";
        foreach my $ele (sort @{$clusters[$i]}) {
            print $fh "$ele ";
        }
        close $fh or die "Unable to close file $filename: $!";
    }
}
| 988 | |||||||
# Destructor: removes the temporary visualization file ("__temp_<basename>.txt")
# created for this object's datafile.  Guards against objects that never recorded
# a datafile (e.g. when the constructor died early), which would otherwise trigger
# uninitialized-value warnings from basename().
sub DESTROY {
    my $self = shift;
    return unless defined $self->{_datafile};
    my $filename = basename($self->{_datafile});
    $filename =~ s/\.\w+$/\.txt/;
    unlink "__temp_" . $filename;
}
| 994 | |||||||
| 995 | ################################## Visualization Code ################################### | ||||||
| 996 | |||||||
# Sums, component by component, the coordinate vectors of the named data elements.
# Returns the sums as a ref to an array of _data_dimensions accumulators.
sub add_point_coords {
    my ($self, $ids_ref) = @_;
    my $dims = $self->{_data_dimensions};
    # Start every accumulator at 0.0 so an empty id list yields a zero vector.
    my @component_sums = (0.0) x $dims;
    foreach my $id (@$ids_ref) {
        my $coords = $self->{_data_hash}->{$id};
        $component_sums[$_] += $coords->[$_] for 0 .. $#$coords;
    }
    return \@component_sums;
}
| 1015 | |||||||
| 1016 | # This is the main module version: | ||||||
# This is the main module version: renders the clusters as colored point sets on a
# 3D sphere with gnuplot.  The cluster data is re-tagged with letter labels
# A, B, C, ..., dumped (grouped by label) into a temp file, and plotted over a
# parametric sphere whose radius is the data's scale factor.  When a hardcopy
# format is given, the plot goes to an image file instead of the screen; when a
# pause time is given, the plot window closes after that many seconds.
sub visualize_clusters_on_sphere {
    my $self = shift;
    my $visualization_msg = shift;
    # Work on a deep copy so the caller's cluster structure is not mutated by the
    # re-tagging substitution below.
    my $clusters = deep_copy_AoA(shift);
    my $hardcopy_format = shift;       # e.g. "png"; false means on-screen display
    my $pause_time = shift;            # seconds; undef means a persistent window
    my $d = $self->{_data_dimensions};
    my $temp_file = "__temp_" . $self->{_datafile};
    $temp_file =~ s/\.\w+$/\.txt/;
    unlink $temp_file if -e $temp_file;
    open OUTPUT, ">$temp_file"
        or die "Unable to open a temp file in this directory: $!";
    my @all_tags = "A".."Z";
    my @retagged_clusters;
    # Replace each tag's original prefix with a single-letter cluster label and
    # append the element's coordinates: "A_17 x y z ...".
    foreach my $cluster (@$clusters) {
        my $label = shift @all_tags;
        my @retagged_cluster =
            map {$_ =~ s/^(\w+?)_(\w+)/$label . "_$2 @{$self->{_data_hash}->{$_}}"/e;$_} @$cluster;
        push @retagged_clusters, \@retagged_cluster;
    }
    # Group the coordinate vectors by their single-letter cluster label.
    my %clusters;
    foreach my $cluster (@retagged_clusters) {
        foreach my $record (@$cluster) {
            my @splits = grep $_, split /\s+/, $record;
            $splits[0] =~ /(\w+?)_.*/;
            my $primary_cluster_label = $1;
            my @coords = @splits[1..$d];
            push @{$clusters{$primary_cluster_label}}, \@coords;
        }
    }
    # Write one blank-line-separated block per cluster; gnuplot addresses the
    # blocks with its "index" keyword below.
    foreach my $key (sort {"\L$a" cmp "\L$b"} keys %clusters) {
        map {print OUTPUT "$_"} map {"@$_\n"} @{$clusters{$key}};
        print OUTPUT "\n\n";
    }
    my @sorted_cluster_keys = sort {"\L$a" cmp "\L$b"} keys %clusters;
    close OUTPUT;
    my $plot;
    # Without a pause time the gnuplot window must outlive this process.
    unless (defined $pause_time) {
        $plot = Graphics::GnuplotIF->new( persist => 1 );
    } else {
        $plot = Graphics::GnuplotIF->new();
    }
    my $arg_string = "";
    $plot->gnuplot_cmd( "set hidden3d" ) unless $self->{_show_hidden_in_3D_plots};
    $plot->gnuplot_cmd( "set title \"$visualization_msg\"" );
    $plot->gnuplot_cmd( "set noclip" );
    $plot->gnuplot_cmd( "set pointsize 2" );
    $plot->gnuplot_cmd( "set parametric" );
    $plot->gnuplot_cmd( "set size ratio 1" );
    $plot->gnuplot_cmd( "set xlabel \"X\"" );
    $plot->gnuplot_cmd( "set ylabel \"Y\"" );
    $plot->gnuplot_cmd( "set zlabel \"Z\"" );
    # NOTE(review): the terminal is hard-wired to png regardless of the requested
    # $hardcopy_format (only the output file's extension changes) -- confirm
    # whether other formats were ever intended.
    if ($hardcopy_format) {
        $plot->gnuplot_cmd( "set terminal png" );
        my $image_file_name = "$visualization_msg\.$hardcopy_format";
        $plot->gnuplot_cmd( "set output \"$image_file_name\"" );
        $plot->gnuplot_cmd( "unset hidden3d" );
    }
    # set the range for azimuth angles:
    $plot->gnuplot_cmd( "set urange [0:2*pi]" );
    # set the range for the elevation angles:
    $plot->gnuplot_cmd( "set vrange [-pi/2:pi/2]" );
    # Parametric functions for the sphere
    # $plot->gnuplot_cmd( "r=1" );
    # Sphere radius: the data's scale factor when one has been computed, else 1.
    if ($self->{_scale_factor}) {
        $plot->gnuplot_cmd( "r=$self->{_scale_factor}" );
    } else {
        $plot->gnuplot_cmd( "r=1" );
    }
    $plot->gnuplot_cmd( "fx(v,u) = r*cos(v)*cos(u)" );
    $plot->gnuplot_cmd( "fy(v,u) = r*cos(v)*sin(u)" );
    $plot->gnuplot_cmd( "fz(v) = r*sin(v)" );
    my $sphere_arg_str = "fx(v,u),fy(v,u),fz(v) notitle with lines lt 0,";
    # One plot clause per cluster block in the temp file, each with its own line
    # type, point type, and cluster-letter title.
    foreach my $i (0..scalar(keys %clusters)-1) {
        my $j = $i + 1;
        # The following statement puts the titles on the data points
        $arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"$sorted_cluster_keys[$i] \" with points lt $j pt $j, ";
    }
    # Strip the trailing ", " left by the loop above (capture via the match, then
    # pull the cleaned string out of $1).
    $arg_string = $arg_string =~ /^(.*),[ ]+$/;
    $arg_string = $1;
    $plot->gnuplot_cmd( "splot $sphere_arg_str $arg_string" );
    $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
}
| 1100 | |||||||
| 1101 | |||||||
| 1102 | ################################### Support Routines ######################################## | ||||||
| 1103 | |||||||
# Splits a cluster (an array ref of data-element names) into $how_many roughly
# equal contiguous fragments and returns them as a list of array refs.
#
# Fixes over the original logic:
#   - when @$cluster is not an exact multiple of $how_many, int($i/$delta)
#     could run past $how_many-1 and silently create extra fragments; the
#     index is now clamped so exactly $how_many fragments come back, with
#     the last one absorbing the remainder elements;
#   - dies with an explanatory message when the cluster holds fewer elements
#     than $how_many (the original would have divided by zero);
#   - dropped the unused counter variable.
sub cluster_split {
    my ($cluster, $how_many) = @_;
    my @cluster_fragments = map { [] } 0 .. $how_many - 1;
    my $delta = int( scalar(@$cluster) / $how_many );
    die "cannot split " . scalar(@$cluster) . " data elements into $how_many fragments"
        if $delta < 1;
    foreach my $i (0 .. @$cluster - 1) {
        # clamp the fragment index so remainder elements land in the last fragment
        my $which = int($i / $delta);
        $which = $how_many - 1 if $which > $how_many - 1;
        push @{$cluster_fragments[$which]}, $cluster->[$i];
    }
    my $how_many_accounted_for = reduce { $a + $b } map { scalar(@$_) } @cluster_fragments;
    foreach my $frag (@cluster_fragments) {
        print "fragment: @$frag\n";
    }
    die "the fragmentation could not account for all the data"
        unless @$cluster == $how_many_accounted_for;
    return @cluster_fragments;
}
| 1124 | |||||||
# Returns the smallest value found in an array (given by reference) along with
# the positional index at which it occurs.  On ties, the first occurrence wins.
sub minimum {
    my $values = shift;
    my ($smallest, $where);
    for my $pos (0 .. $#{$values}) {
        next if defined $smallest && $values->[$pos] >= $smallest;
        ($smallest, $where) = ($values->[$pos], $pos);
    }
    return ($smallest, $where);
}
| 1138 | |||||||
# Scans an array (given by reference) and returns the pair ($min, $max),
# silently skipping undefined entries.  Both values come back undef when the
# array contains no defined elements.
sub minmax {
    my $values = shift;
    my ($low, $high);
    for my $v (@$values) {
        next unless defined $v;
        if (!defined $low) {
            ($low, $high) = ($v, $v);
        } elsif ($v < $low) {
            $low = $v;
        } elsif ($v > $high) {
            $high = $v;
        }
    }
    return ($low, $high);
}
| 1156 | |||||||
# Constructs a deep copy of an array of arrays (of scalars) so that later
# mutation of the input does not affect the copy.  Meant only for two-level
# structures; returns undef for an empty input, matching the long-standing
# behavior of this helper.
sub deep_copy_AoA {
    my $source = shift;
    my $duplicate;
    for my $row (0 .. $#{$source}) {
        for my $col (0 .. $#{$source->[$row]}) {
            $duplicate->[$row][$col] = $source->[$row][$col];
        }
    }
    return $duplicate;
}
| 1168 | |||||||
# Prints each cluster to the terminal, identified by the symbolic names
# associated with its data points.  NOTE: sorts every cluster in place as a
# side effect (the inner array refs are shared with the caller).
sub display_clusters {
    my $clusters_ref = shift;
    my $cluster_num = 0;
    for my $cluster (@$clusters_ref) {
        @$cluster = sort @$cluster;
        my $size = scalar @$cluster;
        print "\n\nCluster $cluster_num ($size records):\n";
        print " $_" for @$cluster;
        $cluster_num++;
    }
    print "\n\n";
}
| 1185 | |||||||
# Convenience wrapper that prints a mean vector followed by a covariance
# matrix, each through display_matrix().
sub display_mean_and_covariance {
    my ($mean, $covariance) = @_;
    print "\nDisplay the mean:\n";
    display_matrix($mean);
    print "\nDisplay the covariance:\n";
    display_matrix($covariance);
}
| 1194 | |||||||
# Validates constructor keyword-argument names.  Returns 1 when every name in
# @params is legal and 0 as soon as an unrecognized name is seen.
#
# Fixes over the original: an empty @params list now returns 1 instead of
# undef (callers compare the result with `== 0' and would have croaked on a
# perfectly legal argument-free call), and the nested flag-juggling loops are
# replaced by a hash lookup.
sub check_for_illegal_params {
    my @params = @_;
    my %legal = map { $_ => 1 } qw/ datafile
                                    mask
                                    K
                                    P
                                    terminal_output
                                    cluster_search_multiplier
                                    max_iterations
                                    delta_reconstruction_error
                                    visualize_each_iteration
                                    show_hidden_in_3D_plots
                                    make_png_for_each_iteration
                                    debug
                                  /;
    foreach my $param (@params) {
        return 0 unless $legal{$param};
    }
    return 1;
}
| 1223 | |||||||
# Prints a Math::GSL::Matrix to the terminal, one row per line, each entry
# formatted to four decimal places.
sub display_matrix {
    my $matrix = shift;
    my $nrows = $matrix->rows();
    my $ncols = $matrix->cols();
    print "\nDisplaying a matrix of size $nrows rows and $ncols columns:\n";
    foreach my $i (0..$nrows-1) {
        my @row_as_list = $matrix->row($i)->as_list;
        # plain for loop --- the original used map in void context purely
        # for its printing side effect
        printf("%.4f ", $_) for @row_as_list;
        print "\n";
    }
    print "\n\n";
}
| 1238 | |||||||
# Returns a new Math::GSL::Matrix that is the transpose of the argument:
# row $i of the input becomes column $i of the result.
sub transpose {
    my $input = shift;
    my ($nrows, $ncols) = ($input->rows(), $input->cols());
    my $result = Math::GSL::Matrix->new($ncols, $nrows);
    for my $row_idx (0 .. $nrows - 1) {
        my @row_values = $input->row($row_idx)->as_list;
        $result->set_col($row_idx, \@row_values);
    }
    return $result;
}
| 1250 | |||||||
# Element-wise difference of two equal-length vectors (array refs); returns
# the resulting values as a list.  Dies when the lengths differ.
sub vector_subtract {
    my ($vec1, $vec2) = @_;
    die "wrong data types for vector subtract calculation\n" if @$vec1 != @$vec2;
    return map { $vec1->[$_] - $vec2->[$_] } 0 .. $#{$vec1};
}
| 1261 | |||||||
# In-place Fisher-Yates shuffle of the array whose reference is passed in
# (adapted from the perl documentation).
sub fisher_yates_shuffle {
    my $arr = shift;
    for (my $pos = $#{$arr}; $pos > 0; $pos--) {
        my $pick = int rand($pos + 1);
        ($arr->[$pos], $arr->[$pick]) = ($arr->[$pick], $arr->[$pos]);
    }
}
| 1271 | |||||||
| 1272 | ######################### Generating Synthetic Data for Manifold Clustering ########################## | ||||||
| 1273 | |||||||
| 1274 | ################################## Class DataGenerator ######################################## | ||||||
| 1275 | |||||||
| 1276 | ## The embedded class defined below is for generating synthetic data for | ||||||
| 1277 | ## experimenting with linear manifold clustering when the data resides on the | ||||||
| 1278 | ## surface of a sphere. See the script generate_data_on_a_sphere.pl in the | ||||||
| 1279 | ## `examples' directory for how to specify the number of clusters and the spread of | ||||||
| 1280 | ## each cluster in the data that is generated. | ||||||
| 1281 | |||||||
| 1282 | package DataGenerator; | ||||||
| 1283 | |||||||
| 1284 | use strict; | ||||||
| 1285 | use Carp; | ||||||
| 1286 | |||||||
# Constructor for the synthetic-data generator.  Recognized arguments:
#   output_file                     (required) CSV file the samples are written to
#   total_number_of_samples_needed  (required) total sample count over all clusters
#   number_of_clusters_on_sphere    number of Gaussian clusters (default 3)
#   cluster_width                   relative spread of each cluster (default 0.1)
#   show_hidden_in_3D_plots         gnuplot hidden-line toggle (default 1)
#   debug                           extra diagnostic printout (default 0)
sub new {
    my ($class, %args) = @_;
    my @params = keys %args;
    croak "\nYou have used a wrong name for a keyword argument " .
          "--- perhaps a misspelling\n"
          if _check_for_illegal_params3(@params) == 0;
    bless {
        _output_file                    =>   $args{output_file}
                                             || croak("name for output_file required"),
        _total_number_of_samples_needed =>   $args{total_number_of_samples_needed}
                                             || croak("total_number_of_samples_needed required"),
        _number_of_clusters_on_sphere   =>   $args{number_of_clusters_on_sphere} || 3,
        _cluster_width                  =>   $args{cluster_width} || 0.1,
        # defined-or here: the original `|| 1' silently turned an explicit 0
        # into 1, so the documented `show_hidden_in_3D_plots => 0' setting
        # could never take effect
        _show_hidden_in_3D_plots        =>   $args{show_hidden_in_3D_plots} // 1,
        _debug                          =>   $args{debug} || 0,
    }, $class;
}
| 1304 | |||||||
# Validates the DataGenerator constructor's keyword-argument names.  Returns
# 1 when every name in @params is legal (including when @params is empty) and
# 0 as soon as an unrecognized name is seen.
#
# Fixes over the original: `debug' is now accepted (new() reads $args{debug}
# but the old legal list rejected it), and an empty argument list returns 1
# instead of undef (callers compare the result with `== 0').
sub _check_for_illegal_params3 {
    my @params = @_;
    my %legal = map { $_ => 1 } qw/ output_file
                                    total_number_of_samples_needed
                                    number_of_clusters_on_sphere
                                    cluster_width
                                    show_hidden_in_3D_plots
                                    debug
                                  /;
    foreach my $param (@params) {
        return 0 unless $legal{$param};
    }
    return 1;
}
| 1326 | |||||||
| 1327 | ## We first generate a set of points randomly on the unit sphere --- the number of | ||||||
| 1328 | ## points being equal to the number of clusters desired. These points will serve as | ||||||
| 1329 | ## cluster means (or, as cluster centroids) subsequently when we ask | ||||||
| 1330 | ## Math::Random::random_multivariate_normal($N, @m, @covar) to return $N number of | ||||||
| 1331 | ## points on the sphere. The second argument is the cluster mean and the third | ||||||
| 1332 | ## argument the cluster covariance. For the synthetic data, we set the cluster | ||||||
| 1333 | ## covariance to a 2x2 diagonal matrix, with the (0,0) element corresponding to the | ||||||
| 1334 | ## variance along the azimuth direction and the (1,1) element corresponding to the | ||||||
| 1335 | ## variance along the elevation direction. | ||||||
| 1336 | ## | ||||||
| 1337 | ## When you generate the points in the 2D spherical coordinates of | ||||||
| 1338 | ## (azimuth,elevation), you also need `wrap-around' logic for those points yielded by | ||||||
| 1339 | ## the multivariate-normal function whose azimuth angle is outside the interval | ||||||
| 1340 | ## (0,360) and/or whose elevation angle is outside the interval (-90,90). | ||||||
| 1341 | ## | ||||||
| 1342 | ## Note that the first of the two dimensions for which the multivariate-normal | ||||||
| 1343 | ## function returns the points is for the azimuth angle and the second for the | ||||||
| 1344 | ## elevation angle. | ||||||
| 1345 | ## | ||||||
| 1346 | ## With regard to the relationship of the Cartesian coordinates to the spherical | ||||||
| 1347 | ## (azimuth, elevation) coordinates, we assume that (x,y) is the horizontal plane | ||||||
| 1348 | ## and z the vertical axis. The elevation angle theta is measure with respect to | ||||||
| 1349 | ## the XY-plane. The highest point on the sphere (the Zenith) corresponds to the | ||||||
| 1350 | ## elevation angle of +90 and the lowest points on the sphere (the Nadir) | ||||||
| 1351 | ## corresponds to the elevation angle of -90. The azimuth is measured with respect | ||||||
| 1352 | ## X-axis. The range of the azimuth is from 0 to 360 degrees. The elevation is | ||||||
| 1353 | ## measured from the XY plane and its range is (-90,90) degrees. | ||||||
# Generates $K Gaussian clusters of points on the surface of the unit sphere
# and writes them out, one CSV record per point (label,x,y,z), to the output
# file named at construction time.
sub gen_data_and_write_to_csv {
    my $self = shift;
    my $K = $self->{_number_of_clusters_on_sphere};
    # $N is the number of samples to be generated for each cluster:
    my $N = int($self->{_total_number_of_samples_needed} / $K);
    my $output_file = $self->{_output_file};
    # All data elements in a cluster are designated by a letter followed by an
    # integer that identifies the specific data element:
    my @point_labels = ('a'..'z');
    # First define $K random points in the 2D (azimuth, elevation) space to
    # serve as cluster centers on the surface of the sphere, via a uniformly
    # distributed 1-D random number generator.
    # NOTE(review): rand(90) - 90 confines every center elevation to (-90,0),
    # i.e. the lower hemisphere only, although the commentary above this sub
    # states the elevation range is (-90,90) --- confirm whether intentional.
    my @cluster_centers;
    my @covariances;
    foreach my $i (0..$K-1) {
        my $azimuth = rand(360);
        my $elevation = rand(90) - 90;
        my @mean = ($azimuth, $elevation);
        push @cluster_centers, \@mean;
        my $cluster_covariance;
        # j-th dimension is azimuth, k-th is elevation; the covariance is a
        # 2x2 diagonal whose variances scale with _cluster_width:
        foreach my $j (0..1) {
            foreach my $k (0..1) {
                $cluster_covariance->[$j]->[$k] = ($self->{_cluster_width} * 360.0) ** 2
                    if $j == 0 && $k == 0;
                $cluster_covariance->[$j]->[$k] = ($self->{_cluster_width} * 180.0) ** 2
                    if $j == 1 && $k == 1;
                $cluster_covariance->[$j]->[$k] = 0.0 if $j != $k;
            }
        }
        push @covariances, $cluster_covariance;
    }
    if ($self->{_debug}) {
        foreach my $i (0..$K-1) {
            print "\n\nCluster center: @{$cluster_centers[$i]}\n";
            print "\nCovariance:\n";
            foreach my $j (0..1) {
                foreach my $k (0..1) {
                    print "$covariances[$i]->[$j]->[$k] ";
                }
                print "\n";
            }
        }
    }
    my @data_dump;
    foreach my $i (0..$K-1) {
        my @m = @{shift @cluster_centers};
        my @covar = @{shift @covariances};
        my @new_data = Math::Random::random_multivariate_normal($N, @m, @covar);
        if ($self->{_debug}) {
            print "\nThe points for cluster $i:\n";
            print "@$_ " for @new_data;
            print "\n\n";
        }
        # Wrap-around logic for samples falling outside the (0,360) azimuth
        # interval and/or the (-90,90) elevation interval:
        my @wrapped_data;
        foreach my $d (@new_data) {
            my $wrapped_d;
            if ($d->[0] >= 360.0) {
                $wrapped_d->[0] = $d->[0] - 360.0;
            } elsif ($d->[0] < 0) {
                $wrapped_d->[0] = 360.0 - abs($d->[0]);
            }
            if ($d->[1] >= 90.0) {
                $wrapped_d->[0] = POSIX::fmod($d->[0] + 180.0, 360);
                $wrapped_d->[1] = 180.0 - $d->[1];
            } elsif ($d->[1] < -90.0) {
                $wrapped_d->[0] = POSIX::fmod($d->[0] + 180, 360);
                $wrapped_d->[1] = -180.0 - $d->[1];
            }
            $wrapped_d->[0] = $d->[0] unless defined $wrapped_d->[0];
            $wrapped_d->[1] = $d->[1] unless defined $wrapped_d->[1];
            push @wrapped_data, $wrapped_d;
        }
        if ($self->{_debug}) {
            print "\nThe unwrapped points for cluster $i:\n";
            print "@$_ " for @wrapped_data;
            print "\n\n";
        }
        my $label = $point_labels[$i];
        my $j = 0;
        @new_data = map {unshift @$_, $label."_".$j; $j++; $_} @wrapped_data;
        push @data_dump, @new_data;
    }
    if ($self->{_debug}) {
        print "\n\nThe labeled points for clusters:\n";
        print "@$_\n" for @data_dump;
    }
    fisher_yates_shuffle( \@data_dump );
    # three-arg open on a lexical filehandle with an error check (the original
    # used an unchecked two-arg open on a bareword handle):
    open my $out_fh, '>', $output_file
        or die "Unable to open $output_file for writing: $!";
    my $total_num_of_points = $N * $K;
    print "Total number of data points that will be written out to the file: $total_num_of_points\n"
        if $self->{_debug};
    foreach my $ele (@data_dump) {
        my $label = $ele->[0];
        my $azimuth = $ele->[1];
        my $elevation = $ele->[2];
        # NOTE(review): $azimuth/$elevation were generated in degrees but are
        # handed straight to cos()/sin(), which take radians.  The points still
        # land on the unit sphere, but confirm whether a degree-to-radian
        # conversion was intended here.
        my $x = cos($elevation) * cos($azimuth);
        my $y = cos($elevation) * sin($azimuth);
        my $z = sin($elevation);
        my $csv_str = join ",", ($label,$x,$y,$z);
        print {$out_fh} "$csv_str\n";
    }
    print "\n\n";
    print "Data written out to file $output_file\n" if $self->{_debug};
    close $out_fh or die "Unable to close $output_file: $!";
}
| 1463 | |||||||
# DataGenerator's version of the sphere visualizer: reads the CSV file written
# by gen_data_and_write_to_csv(), groups the records by the letter portion of
# their labels, writes a gnuplot-friendly temp file, and displays the clusters
# on a wire-frame unit sphere.
#
# Fixes: the data file is opened via the caller-supplied path (the original
# opened only its basename, failing for files outside the current directory);
# the slurped read is restored (the filehandle read had been dropped); both
# opens are three-arg with lexical filehandles.
sub visualize_data_on_sphere {
    my $self = shift;
    my $datafile = shift;
    my $filename = File::Basename::basename($datafile);
    my $temp_file = "__temp_" . $filename;
    $temp_file =~ s/\.\w+$/\.txt/;
    unlink $temp_file if -e $temp_file;
    open my $out_fh, '>', $temp_file
        or die "Unable to open a temp file in this directory: $!";
    open my $in_fh, '<', $datafile or die "Unable to open $datafile: $!";
    local $/ = undef;                      # slurp mode
    my @all_records = split /\s+/, <$in_fh>;
    my %clusters;
    foreach my $record (@all_records) {
        my @splits = split /,/, $record;
        my $record_name = shift @splits;
        # the cluster letter is whatever precedes the underscore in the label:
        $record_name =~ /(\w+?)_.*/;
        my $primary_cluster_label = $1;
        push @{$clusters{$primary_cluster_label}}, \@splits;
    }
    # one gnuplot "index" block (separated by blank lines) per cluster:
    foreach my $key (sort {"\L$a" cmp "\L$b"} keys %clusters) {
        print {$out_fh} map {"@$_\n"} @{$clusters{$key}};
        print {$out_fh} "\n\n";
    }
    my @sorted_cluster_keys = sort {"\L$a" cmp "\L$b"} keys %clusters;
    close $out_fh or die "Unable to close $temp_file: $!";
    my $plot = Graphics::GnuplotIF->new( persist => 1 );
    my $arg_string = "";
    $plot->gnuplot_cmd( "set noclip" );
    $plot->gnuplot_cmd( "set hidden3d" ) unless $self->{_show_hidden_in_3D_plots};
    $plot->gnuplot_cmd( "set pointsize 2" );
    $plot->gnuplot_cmd( "set parametric" );
    $plot->gnuplot_cmd( "set size ratio 1" );
    $plot->gnuplot_cmd( "set xlabel \"X\"" );
    $plot->gnuplot_cmd( "set ylabel \"Y\"" );
    $plot->gnuplot_cmd( "set zlabel \"Z\"" );
    # set the range for the azimuth angles:
    $plot->gnuplot_cmd( "set urange [0:2*pi]" );
    # set the range for the elevation angles:
    $plot->gnuplot_cmd( "set vrange [-pi/2:pi/2]" );
    # parametric functions for the unit sphere:
    $plot->gnuplot_cmd( "r=1" );
    $plot->gnuplot_cmd( "fx(v,u) = r*cos(v)*cos(u)" );
    $plot->gnuplot_cmd( "fy(v,u) = r*cos(v)*sin(u)" );
    $plot->gnuplot_cmd( "fz(v) = r*sin(v)" );
    my $sphere_arg_str = "fx(v,u),fy(v,u),fz(v) notitle with lines lt 0,";
    foreach my $i (0..scalar(keys %clusters)-1) {
        my $j = $i + 1;
        # the title entries put the cluster letters in the plot legend:
        $arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"$sorted_cluster_keys[$i] \" with points lt $j pt $j, ";
    }
    $arg_string =~ s/,[ ]+$//;             # drop the trailing ", "
    $plot->gnuplot_cmd( "splot $sphere_arg_str $arg_string" );
}
| 1521 | |||||||
# Removes the temporary plotting file created for visualization when the
# generator object is garbage-collected.
sub DESTROY {
    use File::Basename;
    my $self = shift;
    my $base = basename($self->{_output_file});
    $base =~ s/\.\w+$/\.txt/;
    unlink "__temp_" . $base;
}
| 1528 | |||||||
# In-place random shuffle (Fisher-Yates), as given in the perl documentation.
sub fisher_yates_shuffle {
    my $deck = shift;
    for (my $top = $#{$deck}; $top > 0; $top--) {
        my $swap = int rand($top + 1);
        @$deck[$top, $swap] = @$deck[$swap, $top];
    }
}
| 1538 | |||||||
| 1539 | 1; | ||||||
| 1540 | |||||||
| 1541 | =pod | ||||||
| 1542 | |||||||
| 1543 | =head1 NAME | ||||||
| 1544 | |||||||
| 1545 | Algorithm::LinearManifoldDataClusterer --- for clustering data that resides on a | ||||||
| 1546 | low-dimensional manifold in a high-dimensional measurement space | ||||||
| 1547 | |||||||
| 1548 | =head1 SYNOPSIS | ||||||
| 1549 | |||||||
| 1550 | # You'd begin with: | ||||||
| 1551 | |||||||
| 1552 | use Algorithm::LinearManifoldDataClusterer; | ||||||
| 1553 | |||||||
| 1554 | # You'd next name the data file: | ||||||
| 1555 | |||||||
| 1556 | my $datafile = "mydatafile.csv"; | ||||||
| 1557 | |||||||
| 1558 | # Your data must be in the CSV format, with one of the columns containing a unique | ||||||
| 1559 | # symbolic tag for each data record. You tell the module which column has the | ||||||
| 1560 | # symbolic tag and which columns to use for clustering through a mask such as | ||||||
| 1561 | |||||||
| 1562 | my $mask = "N111"; | ||||||
| 1563 | |||||||
| 1564 | # which says that the symbolic tag is in the first column and that the numerical | ||||||
| 1565 | # data in the next three columns is to be used for clustering. If your data file | ||||||
| 1566 | # had, say, five columns and you wanted only the last three columns to be | ||||||
| 1567 | # clustered, the mask would become `N0111' assuming that that the symbolic tag is | ||||||
| 1568 | # still in the first column. | ||||||
| 1569 | |||||||
| 1570 | # Now you must construct an instance of the clusterer through a call such as: | ||||||
| 1571 | |||||||
| 1572 | my $clusterer = Algorithm::LinearManifoldDataClusterer->new( | ||||||
| 1573 | datafile => $datafile, | ||||||
| 1574 | mask => $mask, | ||||||
| 1575 | K => 3, | ||||||
| 1576 | P => 2, | ||||||
| 1577 | max_iterations => 15, | ||||||
| 1578 | cluster_search_multiplier => 2, | ||||||
| 1579 | delta_reconstruction_error => 0.001, | ||||||
| 1580 | terminal_output => 1, | ||||||
| 1581 | visualize_each_iteration => 1, | ||||||
| 1582 | show_hidden_in_3D_plots => 1, | ||||||
| 1583 | make_png_for_each_iteration => 1, | ||||||
| 1584 | ); | ||||||
| 1585 | |||||||
| 1586 | # where the parameter K specifies the number of clusters you expect to find in | ||||||
| 1587 | # your data and the parameter P is the dimensionality of the manifold on which the | ||||||
| 1588 | # data resides. The parameter cluster_search_multiplier is for increasing the | ||||||
| 1589 | # odds that the random seeds chosen initially for clustering will populate all the | ||||||
| 1590 | # clusters. Set this parameter to a low number like 2 or 3. The parameter | ||||||
| 1591 | # max_iterations places a hard limit on the number of iterations that the | ||||||
| 1592 | # algorithm is allowed. The actual number of iterations is controlled by the | ||||||
| 1593 | # parameter delta_reconstruction_error. The iterations stop when the change in | ||||||
| 1594 | # the total "reconstruction error" from one iteration to the next is smaller than | ||||||
| 1595 | # the value specified by delta_reconstruction_error. | ||||||
| 1596 | |||||||
| 1597 | # Next, you must get the module to read the data for clustering: | ||||||
| 1598 | |||||||
| 1599 | $clusterer->get_data_from_csv(); | ||||||
| 1600 | |||||||
| 1601 | # Finally, you invoke linear manifold clustering by: | ||||||
| 1602 | |||||||
| 1603 | my $clusters = $clusterer->linear_manifold_clusterer(); | ||||||
| 1604 | |||||||
| 1605 | # The value returned by this call is a reference to an array of anonymous arrays, | ||||||
| 1606 | # with each anonymous array holding one cluster. If you wish, you can have the | ||||||
| 1607 | # module write the clusters to individual files by the following call: | ||||||
| 1608 | |||||||
| 1609 | $clusterer->write_clusters_to_files($clusters); | ||||||
| 1610 | |||||||
| 1611 | # If you want to see how the reconstruction error changes with the iterations, you | ||||||
| 1612 | # can make the call: | ||||||
| 1613 | |||||||
| 1614 | $clusterer->display_reconstruction_errors_as_a_function_of_iterations(); | ||||||
| 1615 | |||||||
| 1616 | # When your data is 3-dimensional and when the clusters reside on a surface that | ||||||
| 1617 | # is more or less spherical, you can visualize the clusters by calling | ||||||
| 1618 | |||||||
| 1619 | $clusterer->visualize_clusters_on_sphere("final clustering", $clusters); | ||||||
| 1620 | |||||||
| 1621 | # where the first argument is a label to be displayed in the 3D plot and the | ||||||
| 1622 | # second argument the value returned by calling linear_manifold_clusterer(). | ||||||
| 1623 | |||||||
| 1624 | # SYNTHETIC DATA GENERATION: | ||||||
| 1625 | |||||||
| 1626 | # The module includes an embedded class, DataGenerator, for generating synthetic | ||||||
| 1627 | # three-dimensional data that can be used to experiment with the clustering code. | ||||||
| 1628 | # The synthetic data, written out to a CSV file, consists of Gaussian clusters on | ||||||
| 1629 | # the surface of a sphere. You can control the number of clusters, the width of | ||||||
| 1630 | # each cluster, and the number of samples in the clusters by giving appropriate | ||||||
| 1631 | # values to the constructor parameters as shown below: | ||||||
| 1632 | |||||||
| 1633 | use strict; | ||||||
| 1634 | use Algorithm::LinearManifoldDataClusterer; | ||||||
| 1635 | |||||||
| 1636 | my $output_file = "4_clusters_on_a_sphere_1000_samples.csv"; | ||||||
| 1637 | |||||||
| 1638 | my $training_data_gen = DataGenerator->new( | ||||||
| 1639 | output_file => $output_file, | ||||||
| 1640 | cluster_width => 0.015, | ||||||
| 1641 | total_number_of_samples_needed => 1000, | ||||||
| 1642 | number_of_clusters_on_sphere => 4, | ||||||
| 1643 | show_hidden_in_3D_plots => 0, | ||||||
| 1644 | ); | ||||||
| 1645 | $training_data_gen->gen_data_and_write_to_csv(); | ||||||
| 1646 | $training_data_gen->visualize_data_on_sphere($output_file); | ||||||
| 1647 | |||||||
| 1648 | |||||||
| 1649 | =head1 CHANGES | ||||||
| 1650 | |||||||
| 1651 | Version 1.01: Typos and other errors removed in the documentation. Also included in | ||||||
| 1652 | the documentation a link to a tutorial on data processing on manifolds. | ||||||
| 1653 | |||||||
| 1654 | |||||||
| 1655 | =head1 DESCRIPTION | ||||||
| 1656 | |||||||
| 1657 | If you are new to machine learning and data clustering on linear and nonlinear | ||||||
| 1658 | manifolds, your first question is likely to be: What is a manifold? A manifold is a | ||||||
| 1659 | space that is locally Euclidean. And a space is locally Euclidean if it allows for | ||||||
| 1660 | the points in a small neighborhood to be represented by, say, the Cartesian | ||||||
| 1661 | coordinates and if the distances between the points in the neighborhood are given by | ||||||
| 1662 | the Euclidean metric. For an example, the set of all points on the surface of a | ||||||
| 1663 | sphere does NOT constitute a Euclidean space. Nonetheless, if you confined your | ||||||
| 1664 | attention to a small enough neighborhood around a point, the space would seem to be | ||||||
| 1665 | locally Euclidean. The surface of a sphere is a 2-dimensional manifold embedded in a | ||||||
| 1666 | 3-dimensional space. A plane in a 3-dimensional space is also a 2-dimensional | ||||||
| 1667 | manifold. You would think of the surface of a sphere as a nonlinear manifold, whereas | ||||||
| 1668 | a plane would be a linear manifold. However, note that any nonlinear manifold is | ||||||
| 1669 | locally a linear manifold. That is, given a sufficiently small neighborhood on a | ||||||
| 1670 | nonlinear manifold, you can always think of it as a locally flat surface. | ||||||
| 1671 | |||||||
| 1672 | As to why we need machine learning and data clustering on manifolds, there exist many | ||||||
| 1673 | important applications in which the measured data resides on a nonlinear manifold. | ||||||
| 1674 | For example, when you record images of a human face from different angles, all the | ||||||
| 1675 | image pixels taken together fall on a low-dimensional surface in a high-dimensional | ||||||
| 1676 | measurement space. The same is believed to be true for the satellite images of a land | ||||||
| 1677 | mass that are recorded with the sun at different angles with respect to the direction | ||||||
| 1678 | of the camera. | ||||||
| 1679 | |||||||
| 1680 | Reducing the dimensionality of the sort of data mentioned above is critical to the | ||||||
| 1681 | proper functioning of downstream classification algorithms, and the most popular | ||||||
| 1682 | traditional method for dimensionality reduction is the Principal Components Analysis | ||||||
| 1683 | (PCA) algorithm. However, using PCA is tantamount to passing a linear least-squares | ||||||
| 1684 | hyperplane through the surface on which the data actually resides. As to why that | ||||||
| 1685 | might be a bad thing to do, just imagine the consequences of assuming that your data | ||||||
| 1686 | falls on a straight line when, in reality, it falls on a strongly curving arc. This | ||||||
| 1687 | is exactly what happens with PCA --- it gives you a linear manifold approximation to | ||||||
| 1688 | your data that may actually reside on a curved surface. | ||||||
| 1689 | |||||||
| 1690 | That brings us to the purpose of this module, which is to cluster data that resides | ||||||
| 1691 | on a nonlinear manifold. Since a nonlinear manifold is locally linear, we can think | ||||||
| 1692 | of each data cluster on a nonlinear manifold as falling on a locally linear portion | ||||||
| 1693 | of the manifold, meaning on a hyperplane. The logic of the module is based on | ||||||
| 1694 | finding a set of hyperplanes that best describes the data, with each hyperplane | ||||||
| 1695 | derived from a local data cluster. This is like constructing a piecewise linear | ||||||
| 1696 | approximation to data that falls on a curve as opposed to constructing a single | ||||||
| 1697 | straight line approximation to all of the data. So whereas the frequently used PCA | ||||||
| 1698 | algorithm gives you a single hyperplane approximation to all your data, what this | ||||||
| 1699 | module returns is a set of hyperplane approximations, with each hyperplane derived by | ||||||
| 1700 | applying the PCA algorithm locally to a data cluster. | ||||||
| 1701 | |||||||
| 1702 | That brings us to the problem of how to actually discover the best set of hyperplane | ||||||
| 1703 | approximations to the data. What is probably the most popular algorithm today for | ||||||
| 1704 | that purpose is based on the following key idea: Given a set of subspaces to which a | ||||||
| 1705 | data element can be assigned, you assign it to that subspace for which the | ||||||
| 1706 | B |
||||||
| 1707 | is B |
||||||
| 1708 | |||||||
| 1709 | To understand the notions of B |
||||||
| 1710 | the traditional approach of dimensionality reduction by the PCA algorithm. The PCA | ||||||
| 1711 | algorithm consists of: (1) Subtracting from each data element the global mean of the | ||||||
| 1712 | data; (2) Calculating the covariance matrix of the data; (3) Carrying out an | ||||||
| 1713 | eigendecomposition of the covariance matrix and ordering the eigenvectors according | ||||||
| 1714 | to decreasing values of the corresponding eigenvalues; (4) Forming a B |
||||||
| 1715 | discarding the trailing eigenvectors whose corresponding eigenvalues are relatively | ||||||
| 1716 | small; and, finally, (5) projecting all the data elements into the subspace so | ||||||
| 1717 | formed. The error incurred in representing a data element by its projection into the | ||||||
| 1718 | subspace is known as the B |
||||||
| 1719 | the data element into the space spanned by the discarded trailing eigenvectors. | ||||||
| 1720 | |||||||
| 1721 | I | ||||||
| 1722 | subspace in the manner described above, we construct a set of subspaces, one for each | ||||||
| 1723 | data cluster on the nonlinear manifold. After the subspaces have been constructed, a | ||||||
| 1724 | data element is assigned to that subspace for which the reconstruction error is the | ||||||
| 1725 | least.> On the face of it, this sounds like a chicken-and-egg sort of a problem. You | ||||||
| 1726 | need to have already clustered the data in order to construct the subspaces at | ||||||
| 1727 | different places on the manifold so that you can figure out which cluster to place a | ||||||
| 1728 | data element in. | ||||||
| 1729 | |||||||
| 1730 | Such problems, when they do possess a solution, are best tackled through iterative | ||||||
| 1731 | algorithms in which you start with a guess for the final solution, you rearrange the | ||||||
| 1732 | measured data on the basis of the guess, and you then use the new arrangement of the | ||||||
| 1733 | data to refine the guess. Subsequently, you iterate through the second and the third | ||||||
| 1734 | steps until you do not see any discernible changes in the new arrangements of the | ||||||
| 1735 | data. This forms the basis of the clustering algorithm that is described under | ||||||
| 1736 | B |
||||||
| 1737 | article "Dimension Reduction by Local Principal Component Analysis" by Kambhatla and | ||||||
| 1738 | Leen that appeared in the journal Neural Computation in 1997. | ||||||
| 1739 | |||||||
| 1740 | Unfortunately, experiments show that the algorithm as proposed by Kambhatla and Leen | ||||||
| 1741 | is much too sensitive to how the clusters are seeded initially. To get around this | ||||||
| 1742 | limitation of the basic clustering-by-minimization-of-reconstruction-error, this | ||||||
| 1743 | module implements a two phased approach. In B |
||||||
| 1744 | effect in our search for clusters by looking for C |
||||||
| 1745 | C |
||||||
| 1746 | be visited by one or more of the C |
||||||
| 1747 | where C |
||||||
| 1748 | C |
||||||
| 1749 | together in order to form the final C |
||||||
| 1750 | of the algorithm. | ||||||
| 1751 | |||||||
| 1752 | For the cluster merging operation in Phase 2, we model the C |
||||||
| 1753 | nodes of an attributed graph in which the weight given to an edge connecting a pair | ||||||
| 1754 | of nodes is a measure of the similarity between the two clusters corresponding to the | ||||||
| 1755 | two nodes. Subsequently, we use spectral clustering to merge the most similar nodes | ||||||
| 1756 | in our quest to partition the data into C |
||||||
| 1757 | Shi-Malik normalized cuts algorithm. The pairwise node similarity required by this | ||||||
| 1758 | algorithm is measured by the C |
||||||
| 1759 | C |
||||||
| 1760 | when all of the data elements in one cluster are projected into the other's subspace | ||||||
| 1761 | and vice versa, the greater the similarity between two clusters. Additionally, the | ||||||
| 1762 | smaller the distance between the mean vectors of the clusters, the greater the | ||||||
| 1763 | similarity between two clusters. The overall similarity between a pair of clusters | ||||||
| 1764 | is a combination of these two similarity measures. | ||||||
| 1765 | |||||||
| 1766 | For additional information regarding the theoretical underpinnings of the algorithm | ||||||
| 1767 | implemented in this module, visit | ||||||
| 1768 | L |
||||||
| 1769 | |||||||
| 1770 | |||||||
| 1771 | =head1 SUMMARY OF THE ALGORITHM | ||||||
| 1772 | |||||||
| 1773 | We now present a summary of the two phases of the algorithm implemented in this | ||||||
| 1774 | module. Note particularly the important role played by the constructor parameter | ||||||
| 1775 | C |
||||||
| 1776 | parameter is greater than 1 that Phase 2 of the algorithm kicks in. | ||||||
| 1777 | |||||||
| 1778 | =over 4 | ||||||
| 1779 | |||||||
| 1780 | =item B |
||||||
| 1781 | |||||||
| 1782 | Through iterative minimization of the total reconstruction error, this phase of the | ||||||
| 1783 | algorithm returns C |
||||||
| 1784 | expect to find in your data and where C |
||||||
| 1785 | constructor parameter C |
||||||
| 1786 | account of the sensitivity of the reconstruction-error based clustering to how the | ||||||
| 1787 | clusters are initially seeded, our goal is to look for C |
||||||
| 1788 | of increasing the odds that each of the C |
||||||
| 1789 | the beginning of the algorithm. | ||||||
| 1790 | |||||||
| 1791 | =over 4 | ||||||
| 1792 | |||||||
| 1793 | =item Step 1: | ||||||
| 1794 | |||||||
| 1795 | Randomly choose C |
||||||
| 1796 | |||||||
| 1797 | =item Step 2: | ||||||
| 1798 | |||||||
| 1799 | Construct initial C |
||||||
| 1800 | whose seed it is closest to. | ||||||
| 1801 | |||||||
| 1802 | =item Step 3: | ||||||
| 1803 | |||||||
| 1804 | Calculate the mean and the covariance matrix for each of the C |
||||||
| 1805 | carry out an eigendecomposition of the covariance matrix. Order the eigenvectors in | ||||||
| 1806 | decreasing order of the corresponding eigenvalues. The first C eigenvectors |
||||||
| 1807 | define the subspace for that cluster. Use the space spanned by the remaining | ||||||
| 1808 | eigenvectors --- we refer to them as the trailing eigenvectors --- for calculating | ||||||
| 1809 | the reconstruction error. | ||||||
| 1810 | |||||||
| 1811 | =item Step 4: | ||||||
| 1812 | |||||||
| 1813 | Taking into account the mean associated with each cluster, re-cluster the entire data | ||||||
| 1814 | set on the basis of the least reconstruction error. That is, assign each data | ||||||
| 1815 | element to that subspace for which it has the smallest reconstruction error. | ||||||
| 1816 | Calculate the total reconstruction error associated with all the data elements. (See | ||||||
| 1817 | the definition of the reconstruction error in the C |
||||||
| 1818 | |||||||
| 1819 | =item Step 5: | ||||||
| 1820 | |||||||
| 1821 | Stop iterating if the change in the total reconstruction error from the previous | ||||||
| 1822 | iteration to the current iteration is less than the value specified by the constructor | ||||||
| 1823 | parameter C |
||||||
| 1824 | |||||||
| 1825 | =back | ||||||
| 1826 | |||||||
| 1827 | =item B |
||||||
| 1828 | |||||||
| 1829 | This phase of the algorithm uses graph partitioning to merge the C |
||||||
| 1830 | into the C |
||||||
| 1831 | steps are presented below is invoked recursively, let's assume that we have C |
||||||
| 1832 | clusters that need to be merged by invoking the Shi-Malik spectral clustering | ||||||
| 1833 | algorithm as described below: | ||||||
| 1834 | |||||||
| 1835 | =over 4 | ||||||
| 1836 | |||||||
| 1837 | =item Step 1: | ||||||
| 1838 | |||||||
| 1839 | Form a graph whose C |
||||||
| 1840 | invocation of this step, we have C |
||||||
| 1841 | |||||||
| 1842 | =item Step 2: | ||||||
| 1843 | |||||||
| 1844 | Construct an C |
||||||
| 1845 | element of this matrix is the pairwise similarity between the clusters indexed C | ||||||
| 1846 | and C |
||||||
| 1847 | The total reconstruction error when the data elements in the cluster indexed C |
||||||
| 1848 | projected into the subspace for the cluster indexed C and vice versa. And (2) The | ||||||
| 1849 | distance between the mean vectors associated with the two clusters. | ||||||
| 1850 | |||||||
| 1851 | =item Step 3: | ||||||
| 1852 | |||||||
| 1853 | Calculate the symmetric normalized Laplacian of the similarity matrix. We use C | ||||||
| 1854 | to denote the symmetric normalized Laplacian. | ||||||
| 1855 | |||||||
| 1856 | =item Step 4: | ||||||
| 1857 | |||||||
| 1858 | Carry out an eigendecomposition of the C matrix and choose the eigenvector | ||||||
| 1859 | corresponding to the second smallest eigenvalue for bipartitioning the graph on the | ||||||
| 1860 | basis of the sign of the values in the eigenvector. | ||||||
| 1861 | |||||||
| 1862 | =item Step 5: | ||||||
| 1863 | |||||||
| 1864 | If the bipartition of the previous step yields one-versus-the-rest kind of a | ||||||
| 1865 | partition, add the `one' cluster to the output list of clusters and invoke graph | ||||||
| 1866 | partitioning recursively on the `rest' by going back to Step 1. On the other hand, | ||||||
| 1867 | if the cardinality of both the partitions returned by Step 4 exceeds 1, invoke graph | ||||||
| 1868 | partitioning recursively on both partitions. Stop when the list of clusters in the | ||||||
| 1869 | output list equals C |
||||||
| 1870 | |||||||
| 1871 | =item Step 6: | ||||||
| 1872 | |||||||
| 1873 | In general, the C |
||||||
| 1874 | all of the data. So, for the final step, assign each data element not covered by the | ||||||
| 1875 | C |
||||||
| 1876 | |||||||
| 1877 | =back | ||||||
| 1878 | |||||||
| 1879 | =back | ||||||
| 1880 | |||||||
| 1881 | =head1 FAIL-FIRST BIAS OF THE MODULE | ||||||
| 1882 | |||||||
| 1883 | As you would expect for all such iterative algorithms, the module carries no | ||||||
| 1884 | theoretical guarantee that it will give you correct results. But what does that mean? | ||||||
| 1885 | Suppose you create synthetic data that consists of Gaussian looking disjoint clusters | ||||||
| 1886 | on the surface of a sphere, would the module always succeed in separating out the | ||||||
| 1887 | clusters? The module carries no guarantees to that effect --- especially considering | ||||||
| 1888 | that Phase 1 of the algorithm is sensitive to how the clusters are seeded at the | ||||||
| 1889 | beginning. Although this sensitivity is mitigated by the cluster merging step when | ||||||
| 1890 | a greater-than-1 value is given to the constructor option C |
||||||
| 1891 | a plain vanilla implementation of the steps in Phase 1 and Phase 2 would nonetheless | ||||||
| 1892 | carry significant risk that you'll end up with incorrect clustering results. | ||||||
| 1893 | |||||||
| 1894 | To further reduce this risk, the module has been programmed so that it terminates | ||||||
| 1895 | immediately if it suspects that the cluster solution being developed is unlikely to | ||||||
| 1896 | be fruitful. The heuristics used for such terminations are conservative --- since | ||||||
| 1897 | the cost of termination is only that the user will have to run the code again, which | ||||||
| 1898 | at worst only carries an element of annoyance with it. The three "Fail First" | ||||||
| 1899 | heuristics currently programmed into the module are based on simple "unimodality | ||||||
| 1900 | testing", testing for "congruent clusters," and testing for dominant cluster support | ||||||
| 1901 | in the final stage of the recursive invocation of the graph partitioning step. The | ||||||
| 1902 | unimodality testing is as elementary as it can be --- it only checks for the number | ||||||
| 1903 | of data samples within a certain radius of the mean in relation to the total number | ||||||
| 1904 | of data samples in the cluster. | ||||||
| 1905 | |||||||
| 1906 | When the program terminates under such conditions, it prints out the following message | ||||||
| 1907 | in your terminal window: | ||||||
| 1908 | |||||||
| 1909 | Bailing out! | ||||||
| 1910 | |||||||
| 1911 | Given the very simple nature of testing that is carried out for the "Fail First" bias, do | ||||||
| 1912 | not be surprised if the results you get for your data simply look wrong. If most | ||||||
| 1913 | runs of the module produce wrong results for your application, that means that the | ||||||
| 1914 | module logic needs to be strengthened further. The author of this module would love | ||||||
| 1915 | to hear from you if that is the case. | ||||||
| 1916 | |||||||
| 1917 | =head1 METHODS | ||||||
| 1918 | |||||||
| 1919 | The module provides the following methods for linear-manifold based clustering, for | ||||||
| 1920 | cluster visualization, and for the generation of data for testing the clustering algorithm: | ||||||
| 1921 | |||||||
| 1922 | =over 4 | ||||||
| 1923 | |||||||
| 1924 | =item B |
||||||
| 1925 | |||||||
| 1926 | my $clusterer = Algorithm::LinearManifoldDataClusterer->new( | ||||||
| 1927 | datafile => $datafile, | ||||||
| 1928 | mask => $mask, | ||||||
| 1929 | K => $K, | ||||||
| 1930 | P => $P, | ||||||
| 1931 | cluster_search_multiplier => $C, | ||||||
| 1932 | max_iterations => $max_iter, | ||||||
| 1933 | delta_reconstruction_error => 0.001, | ||||||
| 1934 | terminal_output => 1, | ||||||
| 1935 | write_clusters_to_files => 1, | ||||||
| 1936 | visualize_each_iteration => 1, | ||||||
| 1937 | show_hidden_in_3D_plots => 1, | ||||||
| 1938 | make_png_for_each_iteration => 1, | ||||||
| 1939 | ); | ||||||
| 1940 | |||||||
| 1941 | A call to C |
||||||
| 1942 | C |
||||||
| 1943 | |||||||
| 1944 | =back | ||||||
| 1945 | |||||||
| 1946 | =head2 Constructor Parameters | ||||||
| 1947 | |||||||
| 1948 | =over 8 | ||||||
| 1949 | |||||||
| 1950 | =item C |
||||||
| 1951 | |||||||
| 1952 | This parameter names the data file that contains the multidimensional data records | ||||||
| 1953 | you want the module to cluster. This file must be in CSV format and each record in | ||||||
| 1954 | the file must include a symbolic tag for the record. Here are first few rows of such | ||||||
| 1955 | a CSV file in the C |
||||||
| 1956 | |||||||
| 1957 | d_161,0.0739248630173239,0.231119293395665,-0.970112873251437 | ||||||
| 1958 | a_59,0.459932215884786,0.0110216469739639,0.887885623314902 | ||||||
| 1959 | a_225,0.440503220903039,-0.00543366086464691,0.897734586447273 | ||||||
| 1960 | a_203,0.441656364946433,0.0437191337788422,0.896118459046532 | ||||||
| 1961 | ... | ||||||
| 1962 | ... | ||||||
| 1963 | |||||||
| 1964 | What you see in the first column --- C |
||||||
| 1965 | the symbolic tags associated with four 3-dimensional data records. | ||||||
| 1966 | |||||||
| 1967 | =item C |
||||||
| 1968 | |||||||
| 1969 | This parameter supplies the mask to be applied to the columns of your data file. For | ||||||
| 1970 | the data file whose first few records are shown above, the mask should be C |
||||||
| 1971 | since the symbolic tag is in the first column of the CSV file and since, presumably, | ||||||
| 1972 | you want to cluster the data in the next three columns. | ||||||
| 1973 | |||||||
| 1974 | =item C |
||||||
| 1975 | |||||||
| 1976 | This parameter supplies the number of clusters you are looking for. | ||||||
| 1977 | |||||||
| 1978 | =item C : |
||||||
| 1979 | |||||||
| 1980 | This parameter specifies the dimensionality of the manifold on which the data resides. | ||||||
| 1981 | |||||||
| 1982 | =item C |
||||||
| 1983 | |||||||
| 1984 | As should be clear from the C |
||||||
| 1985 | a very important role in the successful clustering of your data. As explained in | ||||||
| 1986 | C |
||||||
| 1987 | the minimization of the reconstruction error --- is sensitive to the choice of the | ||||||
| 1988 | cluster seeds that are selected randomly at the beginning of the algorithm. Should | ||||||
| 1989 | it happen that the seeds miss one or more of the clusters, the clustering produced is | ||||||
| 1990 | likely to not be correct. By giving an integer value to C |
||||||
| 1991 | that is greater than 1, you'll increase the odds that the randomly selected seeds | ||||||
| 1992 | will see all clusters. When you set C |
||||||
| 1993 | Phase 1 of the algorithm to construct C |
||||||
| 1994 | clusters. Subsequently, in Phase 2, the module uses inter-cluster similarity based | ||||||
| 1995 | graph partitioning to merge the C |
||||||
| 1996 | |||||||
| 1997 | =item C |
||||||
| 1998 | |||||||
| 1999 | This hard limits the number of iterations in Phase 1 of the algorithm. Ordinarily, | ||||||
| 2000 | the iterations stop automatically when the change in the total reconstruction error | ||||||
| 2001 | from one iteration to the next is less than the value specified by the parameter | ||||||
| 2002 | C |
||||||
| 2003 | |||||||
| 2004 | =item C |
||||||
| 2005 | |||||||
| 2006 | It is this parameter that determines when the iterations will actually terminate in | ||||||
| 2007 | Phase 1 of the algorithm. When the difference in the total reconstruction error from | ||||||
| 2008 | one iteration to the next is less than the value given to this parameter, the | ||||||
| 2009 | iterations come to an end. B | ||||||
| 2010 | data samples that need to be clustered, the larger must be the value given to this | ||||||
| 2011 | parameter. That makes intuitive sense since the total reconstruction error is the | ||||||
| 2012 | sum of all such errors for all of the data elements.> Unfortunately, the best value | ||||||
| 2013 | for this parameter does NOT appear to depend linearly on the total number of data | ||||||
| 2014 | records to be clustered. | ||||||
| 2015 | |||||||
| 2016 | =item C |
||||||
| 2017 | |||||||
| 2018 | When this parameter is set, you will see in your terminal window the different | ||||||
| 2019 | clusters as lists of the symbolic tags associated with the data records. You will | ||||||
| 2020 | also see in your terminal window the output produced by the different steps of the | ||||||
| 2021 | graph partitioning algorithm as smaller clusters are merged to form the final C |
||||||
| 2022 | clusters --- assuming that you set the parameter C |
||||||
| 2023 | integer value that is greater than 1. | ||||||
| 2024 | |||||||
| 2025 | =item C |
||||||
| 2026 | |||||||
| 2027 | As its name implies, when this option is set to 1, you'll see 3D plots of the | ||||||
| 2028 | clustering results for each iteration --- but only if your data is 3-dimensional. | ||||||
| 2029 | |||||||
| 2030 | =item C |
||||||
| 2031 | |||||||
| 2032 | This parameter is important for controlling the visualization of the clusters on the | ||||||
| 2033 | surface of a sphere. If the clusters are too spread out, seeing all of the clusters | ||||||
| 2034 | all at once can be visually confusing. When you set this parameter, the clusters on | ||||||
| 2035 | the back side of the sphere will not be visible. Note that no matter how you set | ||||||
| 2036 | this parameter, you can interact with the 3D plot of the data and rotate it with your | ||||||
| 2037 | mouse pointer to see all of the data that is output by the clustering code. | ||||||
| 2038 | |||||||
| 2039 | =item C |
||||||
| 2040 | |||||||
| 2041 | If you set this option to 1, the module will output a Gnuplot in the form of a PNG | ||||||
| 2042 | image for each iteration in Phase 1 of the algorithm. In Phase 2, the module will | ||||||
| 2043 | output the clustering result produced by the graph partitioning algorithm. | ||||||
| 2044 | |||||||
| 2045 | =back | ||||||
| 2046 | |||||||
| 2047 | =over | ||||||
| 2048 | |||||||
| 2049 | =item B |
||||||
| 2050 | |||||||
| 2051 | $clusterer->get_data_from_csv(); | ||||||
| 2052 | |||||||
| 2053 | As you can guess from its name, the method extracts the data from the CSV file named | ||||||
| 2054 | in the constructor. | ||||||
| 2055 | |||||||
| 2056 | =item B |
||||||
| 2057 | |||||||
| 2058 | $clusterer->linear_manifold_clusterer(); | ||||||
| 2059 | |||||||
| 2060 | or | ||||||
| 2061 | |||||||
| 2062 | my $clusters = $clusterer->linear_manifold_clusterer(); | ||||||
| 2063 | |||||||
| 2064 | This is the main call to the linear-manifold based clusterer. The first call works | ||||||
| 2065 | by side-effect, meaning that you will see the clusters in your terminal window and | ||||||
| 2066 | they would be written out to disk files (depending on the constructor options you | ||||||
| 2067 | have set). The second call also returns the clusters as a reference to an array of | ||||||
| 2068 | anonymous arrays, each holding the symbolic tags for a cluster. | ||||||
| 2069 | |||||||
| 2070 | =item B |
||||||
| 2071 | |||||||
| 2072 | $clusterer->display_reconstruction_errors_as_a_function_of_iterations(); | ||||||
| 2073 | |||||||
| 2074 | This method would normally be called after the clustering is completed to see how the | ||||||
| 2075 | reconstruction errors decreased with the iterations in Phase 1 of the overall | ||||||
| 2076 | algorithm. | ||||||
| 2077 | |||||||
| 2078 | =item B |
||||||
| 2079 | |||||||
| 2080 | $clusterer->write_clusters_to_files($clusters); | ||||||
| 2081 | |||||||
| 2082 | As its name implies, when you call this method, the final clusters would be written | ||||||
| 2083 | out to disk files. The files have names like: | ||||||
| 2084 | |||||||
| 2085 | cluster0.txt | ||||||
| 2086 | cluster1.txt | ||||||
| 2087 | cluster2.txt | ||||||
| 2088 | ... | ||||||
| 2089 | ... | ||||||
| 2090 | |||||||
| 2091 | Before the clusters are written to these files, the module destroys all files with | ||||||
| 2092 | such names in the directory in which you call the module. | ||||||
| 2093 | |||||||
| 2094 | =item B |
||||||
| 2095 | |||||||
| 2096 | $clusterer->visualize_clusters_on_sphere("final clustering", $clusters); | ||||||
| 2097 | |||||||
| 2098 | or | ||||||
| 2099 | |||||||
| 2100 | $clusterer->visualize_clusters_on_sphere("final_clustering", $clusters, "png"); | ||||||
| 2101 | |||||||
| 2102 | If your data is 3-dimensional and it resides on the surface of a sphere (or in the | ||||||
| 2103 | vicinity of such a surface), you may be able to use these methods for the | ||||||
| 2104 | visualization of the clusters produced by the algorithm. The first invocation | ||||||
| 2105 | produces a Gnuplot in a terminal window that you can rotate with your mouse pointer. | ||||||
| 2106 | The second invocation produces a `.png' image of the plot. | ||||||
| 2107 | |||||||
| 2108 | =item B |
||||||
| 2109 | |||||||
| 2110 | $clusterer->auto_retry_clusterer(); | ||||||
| 2111 | |||||||
| 2112 | or | ||||||
| 2113 | |||||||
| 2114 | my $clusters = $clusterer->auto_retry_clusterer(); | ||||||
| 2115 | |||||||
| 2116 | As mentioned earlier, the module is programmed in such a way that it is more likely | ||||||
| 2117 | to fail than to give you a wrong answer. If manually trying the clusterer repeatedly | ||||||
| 2118 | on a data file is frustrating, you can use C |
||||||
| 2119 | make repeated attempts for you. See the script C |
||||||
| 2120 | C |
||||||
| 2121 | |||||||
| 2122 | =back | ||||||
| 2123 | |||||||
| 2124 | =head1 GENERATING SYNTHETIC DATA FOR EXPERIMENTING WITH THE CLUSTERER | ||||||
| 2125 | |||||||
| 2126 | The module file also contains a class named C |
||||||
| 2127 | data for experimenting with linear-manifold based clustering. At this time, only | ||||||
| 2128 | 3-dimensional data that resides in the form of Gaussian clusters on the surface of a | ||||||
| 2129 | sphere is generated. The generated data is placed in a CSV file. You construct an | ||||||
| 2130 | instance of the C |
||||||
| 2131 | |||||||
| 2132 | =over 4 | ||||||
| 2133 | |||||||
| 2134 | =item B |
||||||
| 2135 | |||||||
| 2136 | my $training_data_gen = DataGenerator->new( | ||||||
| 2137 | output_file => $output_file, | ||||||
| 2138 | cluster_width => 0.0005, | ||||||
| 2139 | total_number_of_samples_needed => 1000, | ||||||
| 2140 | number_of_clusters_on_sphere => 4, | ||||||
| 2141 | show_hidden_in_3D_plots => 0, | ||||||
| 2142 | ); | ||||||
| 2143 | |||||||
| 2144 | =back | ||||||
| 2145 | |||||||
| 2146 | =head2 Parameters for the DataGenerator constructor: | ||||||
| 2147 | |||||||
| 2148 | =over 8 | ||||||
| 2149 | |||||||
| 2150 | =item C |
||||||
| 2151 | |||||||
| 2152 | The numeric values are generated using a bivariate Gaussian distribution whose two | ||||||
| 2153 | independent variables are the azimuth and the elevation angles on the surface of a | ||||||
| 2154 | unit sphere. The mean of each cluster is chosen randomly and its covariance set in | ||||||
| 2155 | proportion to the value supplied for the C<cluster_width> parameter. | ||||||
| 2156 | |||||||
| 2157 | =item C |
||||||
| 2158 | |||||||
| 2159 | This parameter controls the spread of each cluster on the surface of the unit sphere. | ||||||
| 2160 | |||||||
| 2161 | =item C |
||||||
| 2162 | |||||||
| 2163 | As its name implies, this parameter specifies the total number of data samples that | ||||||
| 2164 | will be written out to the output file --- provided this number is divisible by the | ||||||
| 2165 | number of clusters you asked for. If the divisibility condition is not satisfied, | ||||||
| 2166 | the number of data samples actually written out will be the closest it can be to the | ||||||
| 2167 | number you specify for this parameter under the condition that an equal number of | ||||||
| 2168 | samples will be created for each cluster. | ||||||
| 2169 | |||||||
| 2170 | =item C |
||||||
| 2171 | |||||||
| 2172 | Again as its name implies, this parameter specifies the number of clusters that will | ||||||
| 2173 | be produced on the surface of a unit sphere. | ||||||
| 2174 | |||||||
| 2175 | =item C |
||||||
| 2176 | |||||||
| 2177 | This parameter is important for the visualization of the clusters and it controls | ||||||
| 2178 | whether you will see the generated data on the back side of the sphere. If the | ||||||
| 2179 | clusters are not too spread out, you can set this parameter to 0 and see all the | ||||||
| 2180 | clusters all at once. However, when the clusters are spread out, it can be visually | ||||||
| 2181 | confusing to see the data on the back side of the sphere. Note that no matter how | ||||||
| 2182 | you set this parameter, you can interact with the 3D plot of the data and rotate it | ||||||
| 2183 | with your mouse pointer to see all of the data that is generated. | ||||||
| 2184 | |||||||
| 2185 | =back | ||||||
| 2186 | |||||||
| 2187 | =over 4 | ||||||
| 2188 | |||||||
| 2189 | =item B |
||||||
| 2190 | |||||||
| 2191 | $training_data_gen->gen_data_and_write_to_csv(); | ||||||
| 2192 | |||||||
| 2193 | As its name implies, this method generates the data according to the parameters | ||||||
| 2194 | specified in the DataGenerator constructor. | ||||||
| 2195 | |||||||
| 2196 | =item B |
||||||
| 2197 | |||||||
| 2198 | $training_data_gen->visualize_data_on_sphere($output_file); | ||||||
| 2199 | |||||||
| 2200 | You can use this method to visualize the clusters produced by the data generator. | ||||||
| 2201 | Since the clusters are located at randomly selected points on a unit sphere, by | ||||||
| 2202 | looking at the output visually, you can quickly reject what the data generator has | ||||||
| 2203 | produced and try again. | ||||||
| 2204 | |||||||
| 2205 | =back | ||||||
| 2206 | |||||||
| 2207 | =head1 HOW THE CLUSTERS ARE OUTPUT | ||||||
| 2208 | |||||||
| 2209 | When the option C |
||||||
| 2210 | C |
||||||
| 2211 | screen. | ||||||
| 2212 | |||||||
| 2213 | And, when the option C |
||||||
| 2214 | module dumps the clusters in files named | ||||||
| 2215 | |||||||
| 2216 | cluster0.txt | ||||||
| 2217 | cluster1.txt | ||||||
| 2218 | cluster2.txt | ||||||
| 2219 | ... | ||||||
| 2220 | ... | ||||||
| 2221 | |||||||
| 2222 | in the directory in which you execute the module. The number of such files will | ||||||
| 2223 | equal the number of clusters formed. All such existing files in the directory are | ||||||
| 2224 | destroyed before any fresh ones are created. Each cluster file contains the symbolic | ||||||
| 2225 | tags of the data samples in that cluster. | ||||||
| 2226 | |||||||
| 2227 | Assuming that the data dimensionality is 3, if you have set the constructor parameter | ||||||
| 2228 | C |
||||||
| 2229 | files that are point plots of the different clusters in the different iterations of | ||||||
| 2230 | the algorithm. Such printable files are also generated for the initial clusters at | ||||||
| 2231 | the beginning of the iterations and for the final clusters in Phase 1 of the | ||||||
| 2232 | algorithm. You will also see in your directory a PNG file for the clustering result | ||||||
| 2233 | produced by graph partitioning in Phase 2. | ||||||
| 2234 | |||||||
| 2235 | Also, as mentioned previously, a call to C |
||||||
| 2236 | code will return the clusters to you directly. | ||||||
| 2237 | |||||||
| 2238 | =head1 REQUIRED | ||||||
| 2239 | |||||||
| 2240 | This module requires the following modules: | ||||||
| 2241 | |||||||
| 2242 | List::Util | ||||||
| 2243 | File::Basename | ||||||
| 2244 | Math::Random | ||||||
| 2245 | Graphics::GnuplotIF | ||||||
| 2246 | Math::GSL::Matrix | ||||||
| 2247 | POSIX | ||||||
| 2248 | |||||||
| 2249 | =head1 THE C |
||||||
| 2250 | |||||||
| 2251 | The C |
||||||
| 2252 | play with in order to become familiar with the module: | ||||||
| 2253 | |||||||
| 2254 | example1.pl | ||||||
| 2255 | |||||||
| 2256 | example2.pl | ||||||
| 2257 | |||||||
| 2258 | example3.pl | ||||||
| 2259 | |||||||
| 2260 | example4.pl | ||||||
| 2261 | |||||||
| 2262 | These scripts demonstrate linear-manifold based clustering on the 3-dimensional data | ||||||
| 2263 | in the following three CSV files: | ||||||
| 2264 | |||||||
| 2265 | 3_clusters_on_a_sphere_498_samples.csv (used in example1.pl and example4.pl) | ||||||
| 2266 | |||||||
| 2267 | 3_clusters_on_a_sphere_3000_samples.csv (used in example2.pl) | ||||||
| 2268 | |||||||
| 2269 | 4_clusters_on_a_sphere_1000_samples.csv (used in example3.pl) | ||||||
| 2270 | |||||||
| 2271 | Note that even though the first two of these files both contain exactly three | ||||||
| 2272 | clusters, the clusters look very different in the two data files. The clusters are | ||||||
| 2273 | much more spread out in C<3_clusters_on_a_sphere_3000_samples.csv>. | ||||||
| 2274 | |||||||
| 2275 | The code in C |
||||||
| 2276 | C |
||||||
| 2277 | the clustering program until success is achieved. The value of the constructor | ||||||
| 2278 | parameter C |
||||||
| 2279 | when you execute C |
||||||
| 2280 | You might wish to change the value given to the parameter | ||||||
| 2281 | C |
||||||
| 2282 | achieve success. | ||||||
| 2283 | |||||||
| 2284 | The C |
||||||
| 2285 | and the best final results that can be achieved by the first three examples, that | ||||||
| 2286 | is, for the scripts C |
||||||
| 2287 | a Linux machine and if you have the C |
||||||
| 2288 | C |
||||||
| 2289 | your own runs of the three example scripts may or may not look like what you see in | ||||||
| 2290 | the outputs shown in the PNG files depending on how the module seeds the clusters. | ||||||
| 2291 | Your best results should look like what you see in the PNG images. | ||||||
| 2292 | |||||||
| 2293 | The C |
||||||
| 2294 | |||||||
| 2295 | For generating the data for experiments with clustering: | ||||||
| 2296 | |||||||
| 2297 | generate_data_on_a_sphere.pl | ||||||
| 2298 | |||||||
| 2299 | For visualizing the data generated by the above script: | ||||||
| 2300 | |||||||
| 2301 | data_visualizer.pl | ||||||
| 2302 | |||||||
| 2303 | For cleaning up the examples directory: | ||||||
| 2304 | |||||||
| 2305 | cleanup_directory.pl | ||||||
| 2306 | |||||||
| 2307 | Invoking the C<cleanup_directory.pl> script will get rid of all the output files | ||||||
| 2308 | that are generated by the module when you run it with the constructor option | ||||||
| 2309 | C |
||||||
| 2310 | |||||||
| 2311 | =head1 EXPORT | ||||||
| 2312 | |||||||
| 2313 | None by design. | ||||||
| 2314 | |||||||
| 2315 | =head1 CAVEATS | ||||||
| 2316 | |||||||
| 2317 | The performance of the algorithm depends much on the values you choose for the | ||||||
| 2318 | constructor parameters. And, even for the best choices for the parameters, the | ||||||
| 2319 | algorithm is not theoretically guaranteed to return the best results. | ||||||
| 2320 | |||||||
| 2321 | Even after you have discovered the best choices for the constructor parameters, the | ||||||
| 2322 | best way to use this module is to try it multiple times on any given data and accept | ||||||
| 2323 | only those results that make the best intuitive sense. | ||||||
| 2324 | |||||||
| 2325 | The module was designed with the hope that it would rather fail than give you wrong | ||||||
| 2326 | results. So if you see it failing a few times before it returns a good answer, that's | ||||||
| 2327 | a good thing. | ||||||
| 2328 | |||||||
| 2329 | However, if the module fails too often or is too quick to give you wrong answers, | ||||||
| 2330 | that means the module is not working on your data. If that happens, I'd love to hear | ||||||
| 2331 | from you. That might indicate to me how I should enhance the power of this module | ||||||
| 2332 | for its future versions. | ||||||
| 2333 | |||||||
| 2334 | =head1 BUGS | ||||||
| 2335 | |||||||
| 2336 | Please notify the author if you encounter any bugs. When sending email, please place | ||||||
| 2337 | the string 'LinearManifoldDataClusterer' in the subject line. | ||||||
| 2338 | |||||||
| 2339 | =head1 INSTALLATION | ||||||
| 2340 | |||||||
| 2341 | Download the archive from CPAN in any directory of your choice. Unpack the archive | ||||||
| 2342 | with a command that on a Linux machine would look like: | ||||||
| 2343 | |||||||
| 2344 | tar zxvf Algorithm-LinearManifoldDataClusterer-1.01.tar.gz | ||||||
| 2345 | |||||||
| 2346 | This will create an installation directory for you whose name will be | ||||||
| 2347 | C<Algorithm-LinearManifoldDataClusterer-1.01>. Enter this directory and execute the following commands | ||||||
| 2348 | for a standard install of the module if you have root privileges: | ||||||
| 2349 | |||||||
| 2350 | perl Makefile.PL | ||||||
| 2351 | make | ||||||
| 2352 | make test | ||||||
| 2353 | sudo make install | ||||||
| 2354 | |||||||
| 2355 | If you do not have root privileges, you can carry out a non-standard install of the | ||||||
| 2356 | module in any directory of your choice by: | ||||||
| 2357 | |||||||
| 2358 | perl Makefile.PL prefix=/some/other/directory/ | ||||||
| 2359 | make | ||||||
| 2360 | make test | ||||||
| 2361 | make install | ||||||
| 2362 | |||||||
| 2363 | With a non-standard install, you may also have to set your PERL5LIB environment | ||||||
| 2364 | variable so that this module can find the required other modules. How you do that | ||||||
| 2365 | would depend on what platform you are working on. In order to install this module in | ||||||
| 2366 | a Linux machine on which I use tcsh for the shell, I set the PERL5LIB environment | ||||||
| 2367 | variable by | ||||||
| 2368 | |||||||
| 2369 | setenv PERL5LIB /some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/ | ||||||
| 2370 | |||||||
| 2371 | If I used bash, I'd need to declare: | ||||||
| 2372 | |||||||
| 2373 | export PERL5LIB=/some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/ | ||||||
| 2374 | |||||||
| 2375 | |||||||
| 2376 | =head1 THANKS | ||||||
| 2377 | |||||||
| 2378 | I have learned much from my conversations with Donghun Kim whose research on face | ||||||
| 2379 | recognition in the wild involves clustering image data on manifolds. I have also had | ||||||
| 2380 | several fruitful conversations with Bharath Kumar Comandur and Tanmay Prakash with | ||||||
| 2381 | regard to the ideas that are incorporated in this module. | ||||||
| 2382 | |||||||
| 2383 | =head1 AUTHOR | ||||||
| 2384 | |||||||
| 2385 | Avinash Kak, kak@purdue.edu | ||||||
| 2386 | |||||||
| 2387 | If you send email, please place the string "LinearManifoldDataClusterer" in your subject line to get past | ||||||
| 2388 | my spam filter. | ||||||
| 2389 | |||||||
| 2390 | =head1 COPYRIGHT | ||||||
| 2391 | |||||||
| 2392 | This library is free software; you can redistribute it and/or modify it under the | ||||||
| 2393 | same terms as Perl itself. | ||||||
| 2394 | |||||||
| 2395 | Copyright 2015 Avinash Kak | ||||||
| 2396 | |||||||
| 2397 | =cut | ||||||
| 2398 |