File Coverage

blib/lib/Algorithm/DecisionTree.pm
Criterion Covered Total %
statement 774 2249 34.4
branch 193 708 27.2
condition 41 234 17.5
subroutine 44 92 47.8
pod 8 40 20.0
total 1060 3323 31.9


line stmt bran cond sub pod time code
1             package Algorithm::DecisionTree;
2              
3             #--------------------------------------------------------------------------------------
4             # Copyright (c) 2017 Avinash Kak. All rights reserved. This program is free
5             # software. You may modify and/or distribute it under the same terms as Perl itself.
6             # This copyright notice must remain attached to the file.
7             #
8             # Algorithm::DecisionTree is a Perl module for decision-tree based classification of
9             # multidimensional data.
10             # -------------------------------------------------------------------------------------
11              
12             #use 5.10.0;
13 1     1   22561 use strict;
  1         4  
  1         46  
14 1     1   9 use warnings;
  1         3  
  1         51  
15 1     1   7 use Carp;
  1         9  
  1         21944  
16              
17             our $VERSION = '3.43';
18              
19             ############################################ Constructor ##############################################
20             sub new {
21 1     1 1 559 my ($class, %args, $eval_or_boosting_mode);
22 1 50       6 if (@_ % 2 != 0) {
23 1         7 ($class, %args) = @_;
24             } else {
25 0         0 $class = shift;
26 0         0 $eval_or_boosting_mode = shift;
27 0 0 0     0 die unless $eval_or_boosting_mode eq 'evalmode' || $eval_or_boosting_mode eq 'boostingmode';
28 0 0       0 die "Only one string arg allowed in eval and boosting modes" if @_;
29             }
30 1 50       5 unless ($eval_or_boosting_mode) {
31 1         5 my @params = keys %args;
32 1 50       5 croak "\nYou have used a wrong name for a keyword argument --- perhaps a misspelling\n"
33             if check_for_illegal_params2(@params) == 0;
34             }
35             bless {
36             _training_datafile => $args{training_datafile},
37             _entropy_threshold => $args{entropy_threshold} || 0.01,
38             _max_depth_desired => exists $args{max_depth_desired} ?
39             $args{max_depth_desired} : undef,
40             _debug1 => $args{debug1} || 0,
41             _debug2 => $args{debug2} || 0,
42             _debug3 => $args{debug3} || 0,
43             _csv_class_column_index => $args{csv_class_column_index} || undef,
44             _csv_columns_for_features => $args{csv_columns_for_features} || undef,
45             _symbolic_to_numeric_cardinality_threshold
46             => $args{symbolic_to_numeric_cardinality_threshold} || 10,
47             _number_of_histogram_bins => $args{number_of_histogram_bins} || undef,
48 1 50 50     76 _csv_cleanup_needed => $args{csv_cleanup_needed} || 0,
      50        
      50        
      50        
      50        
      50        
      50        
      50        
      50        
49             _training_data => [],
50             _root_node => undef,
51             _probability_cache => {},
52             _entropy_cache => {},
53             _training_data_hash => {},
54             _features_and_values_hash => {},
55             _samples_class_label_hash => {},
56             _class_names => [],
57             _class_priors => [],
58             _class_priors_hash => {},
59             _feature_names => [],
60             _numeric_features_valuerange_hash => {},
61             _sampling_points_for_numeric_feature_hash => {},
62             _feature_values_how_many_uniques_hash => {},
63             _prob_distribution_numeric_features_hash => {},
64             _histogram_delta_hash => {},
65             _num_of_histogram_bins_hash => {},
66             }, $class;
67              
68             }
69              
70             #################################### Classify with Decision Tree #######################################
71              
72             ## Classifies one test sample at a time using the decision tree constructed from
73             ## your training file. The data record for the test sample must be supplied as
74             ## shown in the scripts in the `examples' subdirectory. See the scripts
75             ## construct_dt_and_classify_one_sample_caseX.pl in that subdirectory.
76             sub classify {
77 0     0 1 0 my $self = shift;
78 0         0 my $root_node = shift;
79 0         0 my $feature_and_values = shift;
80 0         0 my $numregex = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
81 0         0 my @features_and_values = @$feature_and_values;
82 0         0 @features_and_values = @{deep_copy_array(\@features_and_values)};
  0         0  
83 0 0       0 die "\n\nError in the names you have used for features and/or values. " .
84             "Try using the csv_cleanup_needed option in the constructor call."
85             unless $self->check_names_used(\@features_and_values);
86 0         0 my @new_features_and_values = ();
87 0         0 my $pattern = '(\S+)\s*=\s*(\S+)';
88 0         0 foreach my $feature_and_value (@features_and_values) {
89 0         0 $feature_and_value =~ /$pattern/;
90 0         0 my ($feature, $value) = ($1, $2);
91 0         0 my $newvalue = $value;
92 0         0 my @unique_values_for_feature = @{$self->{_features_and_unique_values_hash}->{$feature}};
  0         0  
93 0         0 my $not_all_values_float = 0;
94 0 0       0 map {$not_all_values_float = 1 if $_ !~ /^$numregex$/} @unique_values_for_feature;
  0         0  
95 0 0 0     0 if (! contained_in($feature, keys %{$self->{_prob_distribution_numeric_features_hash}}) &&
  0         0  
96             $not_all_values_float == 0) {
97 0         0 $newvalue = closest_sampling_point($value, \@unique_values_for_feature);
98             }
99 0         0 push @new_features_and_values, "$feature" . '=' . "$newvalue";
100             }
101 0         0 @features_and_values = @new_features_and_values;
102 0 0       0 print "\nCL1 New feature and values: @features_and_values\n" if $self->{_debug3};
103 0         0 my %answer = ();
104 0         0 foreach my $class_name (@{$self->{_class_names}}) {
  0         0  
105 0         0 $answer{$class_name} = undef;
106             }
107 0         0 $answer{'solution_path'} = [];
108 0         0 my %classification = %{$self->recursive_descent_for_classification($root_node,
  0         0  
109             \@features_and_values,\%answer)};
110 0         0 @{$answer{'solution_path'}} = reverse @{$answer{'solution_path'}};
  0         0  
  0         0  
111 0 0       0 if ($self->{_debug3}) {
112 0         0 print "\nCL2 The classification:\n";
113 0         0 foreach my $class_name (@{$self->{_class_names}}) {
  0         0  
114 0         0 print " $class_name with probability $classification{$class_name}\n";
115             }
116             }
117 0         0 my %classification_for_display = ();
118 0         0 foreach my $item (keys %classification) {
119 0 0       0 if ($item ne 'solution_path') {
120 0         0 $classification_for_display{$item} = sprintf("%0.3f", $classification{$item});
121             } else {
122 0         0 my @outlist = ();
123 0         0 foreach my $x (@{$classification{$item}}) {
  0         0  
124 0         0 push @outlist, "NODE$x";
125             }
126 0         0 $classification_for_display{$item} = \@outlist;
127             }
128             }
129 0         0 return \%classification_for_display;
130             }
131              
132             sub recursive_descent_for_classification {
133 0     0 0 0 my $self = shift;
134 0         0 my $node = shift;
135 0         0 my $features_and_values = shift;
136 0         0 my $answer = shift;
137 0         0 my @features_and_values = @$features_and_values;
138 0         0 my %answer = %$answer;
139 0         0 my @children = @{$node->get_children()};
  0         0  
140 0 0       0 if (@children == 0) {
141 0         0 my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
  0         0  
142 0         0 foreach my $i (0..@{$self->{_class_names}}-1) {
  0         0  
143 0         0 $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
144             }
145 0         0 push @{$answer{'solution_path'}}, $node->get_serial_num();
  0         0  
146 0         0 return \%answer;
147             }
148 0         0 my $feature_tested_at_node = $node->get_feature();
149             print "\nCLRD1 Feature tested at node for classification: $feature_tested_at_node\n"
150 0 0       0 if $self->{_debug3};
151 0         0 my $value_for_feature;
152             my $path_found;
153 0         0 my $pattern = '(\S+)\s*=\s*(\S+)';
154 0         0 foreach my $feature_and_value (@features_and_values) {
155 0         0 $feature_and_value =~ /$pattern/;
156 0 0       0 $value_for_feature = $2 if $feature_tested_at_node eq $1;
157             }
158             # The following clause introduced in Version 3.20
159 0 0       0 if (!defined $value_for_feature) {
160 0         0 my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
  0         0  
161 0         0 foreach my $i (0..@{$self->{_class_names}}-1) {
  0         0  
162 0         0 $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
163             }
164 0         0 push @{$answer{'solution_path'}}, $node->get_serial_num();
  0         0  
165 0         0 return \%answer;
166             }
167 0 0       0 if ($value_for_feature) {
168 0 0       0 if (contained_in($feature_tested_at_node, keys %{$self->{_prob_distribution_numeric_features_hash}})) {
  0         0  
169 0 0       0 print( "\nCLRD2 In the truly numeric section") if $self->{_debug3};
170 0         0 my $pattern1 = '(.+)<(.+)';
171 0         0 my $pattern2 = '(.+)>(.+)';
172 0         0 foreach my $child (@children) {
173 0         0 my @branch_features_and_values = @{$child->get_branch_features_and_values_or_thresholds()};
  0         0  
174 0         0 my $last_feature_and_value_on_branch = $branch_features_and_values[-1];
175 0 0       0 if ($last_feature_and_value_on_branch =~ /$pattern1/) {
176 0         0 my ($feature, $threshold) = ($1,$2);
177 0 0       0 if ($value_for_feature <= $threshold) {
178 0         0 $path_found = 1;
179 0         0 %answer = %{$self->recursive_descent_for_classification($child,
  0         0  
180             $features_and_values,\%answer)};
181 0         0 push @{$answer{'solution_path'}}, $node->get_serial_num();
  0         0  
182 0         0 last;
183             }
184             }
185 0 0       0 if ($last_feature_and_value_on_branch =~ /$pattern2/) {
186 0         0 my ($feature, $threshold) = ($1,$2);
187 0 0       0 if ($value_for_feature > $threshold) {
188 0         0 $path_found = 1;
189 0         0 %answer = %{$self->recursive_descent_for_classification($child,
  0         0  
190             $features_and_values,\%answer)};
191 0         0 push @{$answer{'solution_path'}}, $node->get_serial_num();
  0         0  
192 0         0 last;
193             }
194             }
195             }
196 0 0       0 return \%answer if $path_found;
197             } else {
198 0         0 my $feature_value_combo = "$feature_tested_at_node" . '=' . "$value_for_feature";
199             print "\nCLRD3 In the symbolic section with feature_value_combo: $feature_value_combo\n"
200 0 0       0 if $self->{_debug3};
201 0         0 foreach my $child (@children) {
202 0         0 my @branch_features_and_values = @{$child->get_branch_features_and_values_or_thresholds()};
  0         0  
203 0 0       0 print "\nCLRD4 branch features and values: @branch_features_and_values\n" if $self->{_debug3};
204 0         0 my $last_feature_and_value_on_branch = $branch_features_and_values[-1];
205 0 0       0 if ($last_feature_and_value_on_branch eq $feature_value_combo) {
206 0         0 %answer = %{$self->recursive_descent_for_classification($child,
  0         0  
207             $features_and_values,\%answer)};
208 0         0 push @{$answer{'solution_path'}}, $node->get_serial_num();
  0         0  
209 0         0 $path_found = 1;
210 0         0 last;
211             }
212             }
213 0 0       0 return \%answer if $path_found;
214             }
215             }
216 0 0       0 if (! $path_found) {
217 0         0 my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
  0         0  
218 0         0 foreach my $i (0..@{$self->{_class_names}}-1) {
  0         0  
219 0         0 $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
220             }
221 0         0 push @{$answer{'solution_path'}}, $node->get_serial_num();
  0         0  
222             }
223 0         0 return \%answer;
224             }
225              
226             ## If you want classification to be carried out by engaging a human user in a
227             ## question-answer session, this is the method to use for that purpose. See, for
228             ## example, the script classify_by_asking_questions.pl in the `examples'
229             ## subdirectory for an illustration of how to do that.
230             sub classify_by_asking_questions {
231 0     0 1 0 my $self = shift;
232 0         0 my $root_node = shift;
233 0         0 my %answer = ();
234 0         0 foreach my $class_name (@{$self->{_class_names}}) {
  0         0  
235 0         0 $answer{$class_name} = undef;
236             }
237 0         0 $answer{'solution_path'} = [];
238 0         0 my %scratchpad_for_numeric_answers = ();
239 0         0 foreach my $feature_name (keys %{$self->{_prob_distribution_numeric_features_hash}}) {
  0         0  
240 0         0 $scratchpad_for_numeric_answers{$feature_name} = undef;
241             }
242 0         0 my %classification = %{$self->interactive_recursive_descent_for_classification($root_node,
  0         0  
243             \%answer, \%scratchpad_for_numeric_answers)};
244 0         0 @{$classification{'solution_path'}} = reverse @{$classification{'solution_path'}};
  0         0  
  0         0  
245 0         0 my %classification_for_display = ();
246 0         0 foreach my $item (keys %classification) {
247 0 0       0 if ($item ne 'solution_path') {
248 0         0 $classification_for_display{$item} = sprintf("%0.3f", $classification{$item});
249             } else {
250 0         0 my @outlist = ();
251 0         0 foreach my $x (@{$classification{$item}}) {
  0         0  
252 0         0 push @outlist, "NODE$x";
253             }
254 0         0 $classification_for_display{$item} = \@outlist;
255             }
256             }
257 0         0 return \%classification_for_display;
258             }
259              
260             sub interactive_recursive_descent_for_classification {
261 0     0 0 0 my $self = shift;
262 0         0 my $node = shift;
263 0         0 my $answer = shift;
264 0         0 my $scratchpad_for_numerics = shift;
265 0         0 my %answer = %$answer;
266 0         0 my %scratchpad_for_numerics = %$scratchpad_for_numerics;
267 0         0 my $pattern1 = '(.+)<(.+)';
268 0         0 my $pattern2 = '(.+)>(.+)';
269 0         0 my $user_value_for_feature;
270 0         0 my @children = @{$node->get_children()};
  0         0  
271 0 0       0 if (@children == 0) {
272 0         0 my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
  0         0  
273 0         0 foreach my $i (0..@{$self->{_class_names}}-1) {
  0         0  
274 0         0 $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
275             }
276 0         0 push @{$answer{'solution_path'}}, $node->get_serial_num();
  0         0  
277 0         0 return \%answer;
278             }
279 0         0 my @list_of_branch_attributes_to_children = ();
280 0         0 foreach my $child (@children) {
281 0         0 my @branch_features_and_values = @{$child->get_branch_features_and_values_or_thresholds()};
  0         0  
282 0         0 my $feature_and_value_on_branch = $branch_features_and_values[-1];
283 0         0 push @list_of_branch_attributes_to_children, $feature_and_value_on_branch;
284             }
285 0         0 my $feature_tested_at_node = $node->get_feature();
286 0         0 my $feature_value_combo;
287 0         0 my $path_found = 0;
288 0 0       0 if (contained_in($feature_tested_at_node, keys %{$self->{_prob_distribution_numeric_features_hash}})) {
  0         0  
289 0 0       0 if ($scratchpad_for_numerics{$feature_tested_at_node}) {
290 0         0 $user_value_for_feature = $scratchpad_for_numerics{$feature_tested_at_node};
291             } else {
292 0         0 my @valuerange = @{$self->{_numeric_features_valuerange_hash}->{$feature_tested_at_node}};
  0         0  
293 0         0 while (1) {
294 0         0 print "\nWhat is the value for the feature $feature_tested_at_node ?\n";
295 0         0 print "\nEnter a value in the range (@valuerange): ";
296 0         0 $user_value_for_feature = ;
297 0         0 $user_value_for_feature =~ s/\r?\n?$//;
298 0         0 $user_value_for_feature =~ s/^\s*(\S+)\s*$/$1/;
299 0         0 my $answer_found = 0;
300 0 0 0     0 if ($user_value_for_feature >= $valuerange[0] && $user_value_for_feature <= $valuerange[1]) {
301 0         0 $answer_found = 1;
302 0         0 last;
303             }
304 0 0       0 last if $answer_found;
305 0         0 print("You entered illegal value. Let's try again")
306             }
307 0         0 $scratchpad_for_numerics{$feature_tested_at_node} = $user_value_for_feature;
308             }
309 0         0 foreach my $i (0..@list_of_branch_attributes_to_children-1) {
310 0         0 my $branch_attribute = $list_of_branch_attributes_to_children[$i];
311 0 0       0 if ($branch_attribute =~ /$pattern1/) {
312 0         0 my ($feature,$threshold) = ($1,$2);
313 0 0       0 if ($user_value_for_feature <= $threshold) {
314 0         0 %answer = %{$self->interactive_recursive_descent_for_classification($children[$i],
  0         0  
315             \%answer, \%scratchpad_for_numerics)};
316 0         0 $path_found = 1;
317 0         0 push @{$answer{'solution_path'}}, $node->get_serial_num();
  0         0  
318 0         0 last;
319             }
320             }
321 0 0       0 if ($branch_attribute =~ /$pattern2/) {
322 0         0 my ($feature,$threshold) = ($1,$2);
323 0 0       0 if ($user_value_for_feature > $threshold) {
324 0         0 %answer = %{$self->interactive_recursive_descent_for_classification($children[$i],
  0         0  
325             \%answer, \%scratchpad_for_numerics)};
326 0         0 $path_found = 1;
327 0         0 push @{$answer{'solution_path'}}, $node->get_serial_num();
  0         0  
328 0         0 last;
329             }
330             }
331             }
332 0 0       0 return \%answer if $path_found;
333             } else {
334 0         0 my @possible_values_for_feature = @{$self->{_features_and_unique_values_hash}->{$feature_tested_at_node}};
  0         0  
335 0         0 while (1) {
336 0         0 print "\nWhat is the value for the feature $feature_tested_at_node ?\n";
337 0         0 print "\nEnter a value from the list (@possible_values_for_feature): ";
338 0         0 $user_value_for_feature = ;
339 0         0 $user_value_for_feature =~ s/\r?\n?$//;
340 0         0 $user_value_for_feature =~ s/^\s*(\S+)\s*$/$1/;
341 0         0 my $answer_found = 0;
342 0 0       0 if (contained_in($user_value_for_feature, @possible_values_for_feature)) {
343 0         0 $answer_found = 1;
344 0         0 last;
345             }
346 0 0       0 last if $answer_found;
347 0         0 print("You entered illegal value. Let's try again");
348             }
349 0         0 $feature_value_combo = "$feature_tested_at_node=$user_value_for_feature";
350 0         0 foreach my $i (0..@list_of_branch_attributes_to_children-1) {
351 0         0 my $branch_attribute = $list_of_branch_attributes_to_children[$i];
352 0 0       0 if ($branch_attribute eq $feature_value_combo) {
353 0         0 %answer = %{$self->interactive_recursive_descent_for_classification($children[$i],
  0         0  
354             \%answer, \%scratchpad_for_numerics)};
355 0         0 $path_found = 1;
356 0         0 push @{$answer{'solution_path'}}, $node->get_serial_num();
  0         0  
357 0         0 last;
358             }
359             }
360 0 0       0 return \%answer if $path_found;
361             }
362 0 0       0 if (! $path_found) {
363 0         0 my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
  0         0  
364 0         0 foreach my $i (0..@{$self->{_class_names}}-1) {
  0         0  
365 0         0 $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
366             }
367 0         0 push @{$answer{'solution_path'}}, $node->get_serial_num();
  0         0  
368             }
369 0         0 return \%answer;
370             }
371              
372             ###################################### Decision Tree Construction ####################################
373              
374             ## At the root node, we find the best feature that yields the greatest reduction in
375             ## class entropy from the entropy based on just the class priors. The logic for
376             ## finding this feature is different for symbolic features and for numeric features.
377             ## That logic is built into the method shown later for best feature calculations.
378             sub construct_decision_tree_classifier {
379 1     1 1 18 print "\nConstructing the decision tree ...\n";
380 1         6 my $self = shift;
381 1 50       7 if ($self->{_debug3}) {
382 0         0 $self->determine_data_condition();
383 0         0 print "\nStarting construction of the decision tree:\n";
384             }
385 1         4 my @class_probabilities = map {$self->prior_probability_for_class($_)} @{$self->{_class_names}};
  2         12  
  1         6  
386 1 50       9 if ($self->{_debug3}) {
387 0         0 print "\nPrior class probabilities: @class_probabilities\n";
388 0         0 print "\nClass names: @{$self->{_class_names}}\n";
  0         0  
389             }
390 1         8 my $entropy = $self->class_entropy_on_priors();
391 1 50       9 print "\nClass entropy on priors: $entropy\n" if $self->{_debug3};
392 1         35 my $root_node = DTNode->new(undef, $entropy, \@class_probabilities, [], $self, 'root');
393 1         4 $root_node->set_class_names(\@{$self->{_class_names}});
  1         8  
394 1         4 $self->{_root_node} = $root_node;
395 1         7 $self->recursive_descent($root_node);
396 1         11 return $root_node;
397             }
398              
399             ## After the root node of the decision tree is calculated by the previous methods,
400             ## we invoke this method recursively to create the rest of the tree. At each node,
401             ## we find the feature that achieves the largest entropy reduction with regard to
402             ## the partitioning of the training data samples that correspond to that node.
403             sub recursive_descent {
404 46     46 0 119 my $self = shift;
405 46         109 my $node = shift;
406             print "\n==================== ENTERING RECURSIVE DESCENT ==========================\n"
407 46 50       182 if $self->{_debug3};
408 46         171 my $node_serial_number = $node->get_serial_num();
409 46         106 my @features_and_values_or_thresholds_on_branch = @{$node->get_branch_features_and_values_or_thresholds()};
  46         175  
410 46         165 my $existing_node_entropy = $node->get_node_entropy();
411 46 50       315 if ($self->{_debug3}) {
412 0         0 print "\nRD1 NODE SERIAL NUMBER: $node_serial_number\n";
413 0         0 print "\nRD2 Existing Node Entropy: $existing_node_entropy\n";
414 0         0 print "\nRD3 features_and_values_or_thresholds_on_branch: @features_and_values_or_thresholds_on_branch\n";
415 0         0 my @class_probs = @{$node->get_class_probabilities()};
  0         0  
416 0         0 print "\nRD4 Class probabilities: @class_probs\n";
417             }
418 46 50       249 if ($existing_node_entropy < $self->{_entropy_threshold}) {
419 0 0       0 print "\nRD5 returning because existing node entropy is below threshold\n" if $self->{_debug3};
420 0         0 return;
421             }
422 46         108 my @copy_of_path_attributes = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
  46         157  
423 46         245 my ($best_feature, $best_feature_entropy, $best_feature_val_entropies, $decision_val) =
424             $self->best_feature_calculator(\@copy_of_path_attributes, $existing_node_entropy);
425 46         329 $node->set_feature($best_feature);
426 46 50       205 $node->display_node() if $self->{_debug3};
427 46 50 33     188 if (defined($self->{_max_depth_desired}) &&
428             (@features_and_values_or_thresholds_on_branch >= $self->{_max_depth_desired})) {
429 0 0       0 print "\nRD6 REACHED LEAF NODE AT MAXIMUM DEPTH ALLOWED\n" if $self->{_debug3};
430 0         0 return;
431             }
432 46 50       175 return if ! defined $best_feature;
433 46 50       161 if ($self->{_debug3}) {
434 0         0 print "\nRD7 Existing entropy at node: $existing_node_entropy\n";
435 0         0 print "\nRD8 Calculated best feature is $best_feature and its value $decision_val\n";
436 0         0 print "\nRD9 Best feature entropy: $best_feature_entropy\n";
437 0         0 print "\nRD10 Calculated entropies for different values of best feature: @$best_feature_val_entropies\n";
438             }
439 46         140 my $entropy_gain = $existing_node_entropy - $best_feature_entropy;
440 46 50       174 print "\nRD11 Expected entropy gain at this node: $entropy_gain\n" if $self->{_debug3};
441 46 100       189 if ($entropy_gain > $self->{_entropy_threshold}) {
442 26 50 33     154 if (exists $self->{_numeric_features_valuerange_hash}->{$best_feature} &&
443             $self->{_feature_values_how_many_uniques_hash}->{$best_feature} >
444             $self->{_symbolic_to_numeric_cardinality_threshold}) {
445 0         0 my $best_threshold = $decision_val; # as returned by best feature calculator
446 0         0 my ($best_entropy_for_less, $best_entropy_for_greater) = @$best_feature_val_entropies;
447             my @extended_branch_features_and_values_or_thresholds_for_lessthan_child =
448 0         0 @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
  0         0  
449             my @extended_branch_features_and_values_or_thresholds_for_greaterthan_child =
450 0         0 @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
  0         0  
451 0         0 my $feature_threshold_combo_for_less_than = "$best_feature" . '<' . "$best_threshold";
452 0         0 my $feature_threshold_combo_for_greater_than = "$best_feature" . '>' . "$best_threshold";
453 0         0 push @extended_branch_features_and_values_or_thresholds_for_lessthan_child,
454             $feature_threshold_combo_for_less_than;
455 0         0 push @extended_branch_features_and_values_or_thresholds_for_greaterthan_child,
456             $feature_threshold_combo_for_greater_than;
457 0 0       0 if ($self->{_debug3}) {
458 0         0 print "\nRD12 extended_branch_features_and_values_or_thresholds_for_lessthan_child: " .
459             "@extended_branch_features_and_values_or_thresholds_for_lessthan_child\n";
460 0         0 print "\nRD13 extended_branch_features_and_values_or_thresholds_for_greaterthan_child: " .
461             "@extended_branch_features_and_values_or_thresholds_for_greaterthan_child\n";
462             }
463             my @class_probabilities_for_lessthan_child_node =
464 0         0 map {$self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($_,
465 0         0 \@extended_branch_features_and_values_or_thresholds_for_lessthan_child)} @{$self->{_class_names}};
  0         0  
466             my @class_probabilities_for_greaterthan_child_node =
467 0         0 map {$self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($_,
468 0         0 \@extended_branch_features_and_values_or_thresholds_for_greaterthan_child)} @{$self->{_class_names}};
  0         0  
469 0 0       0 if ($self->{_debug3}) {
470 0         0 print "\nRD14 class entropy for going down lessthan child: $best_entropy_for_less\n";
471 0         0 print "\nRD15 class_entropy_for_going_down_greaterthan_child: $best_entropy_for_greater\n";
472             }
473 0 0       0 if ($best_entropy_for_less < $existing_node_entropy - $self->{_entropy_threshold}) {
474 0         0 my $left_child_node = DTNode->new(undef, $best_entropy_for_less,
475             \@class_probabilities_for_lessthan_child_node,
476             \@extended_branch_features_and_values_or_thresholds_for_lessthan_child, $self);
477 0         0 $node->add_child_link($left_child_node);
478 0         0 $self->recursive_descent($left_child_node);
479             }
480 0 0       0 if ($best_entropy_for_greater < $existing_node_entropy - $self->{_entropy_threshold}) {
481 0         0 my $right_child_node = DTNode->new(undef, $best_entropy_for_greater,
482             \@class_probabilities_for_greaterthan_child_node,
483             \@extended_branch_features_and_values_or_thresholds_for_greaterthan_child, $self);
484 0         0 $node->add_child_link($right_child_node);
485 0         0 $self->recursive_descent($right_child_node);
486             }
487             } else {
488             print "\nRD16 RECURSIVE DESCENT: In section for symbolic features for creating children"
489 26 50       108 if $self->{_debug3};
490 26         68 my @values_for_feature = @{$self->{_features_and_unique_values_hash}->{$best_feature}};
  26         189  
491 26 50       116 print "\nRD17 Values for feature $best_feature are @values_for_feature\n" if $self->{_debug3};
492 26         95 my @feature_value_combos = sort map {"$best_feature" . '=' . $_} @values_for_feature;
  88         495  
493 26         88 my @class_entropies_for_children = ();
494 26         126 foreach my $feature_and_value_index (0..@feature_value_combos-1) {
495             print "\nRD18 Creating a child node for: $feature_value_combos[$feature_and_value_index]\n"
496 88 50       328 if $self->{_debug3};
497 88         186 my @extended_branch_features_and_values_or_thresholds;
498 88 100       296 if (! @features_and_values_or_thresholds_on_branch) {
499 4         18 @extended_branch_features_and_values_or_thresholds =
500             ($feature_value_combos[$feature_and_value_index]);
501             } else {
502             @extended_branch_features_and_values_or_thresholds =
503 84         177 @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
  84         258  
504 84         279 push @extended_branch_features_and_values_or_thresholds,
505             $feature_value_combos[$feature_and_value_index];
506             }
507             my @class_probabilities =
508 176         568 map {$self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($_,
509 88         191 \@extended_branch_features_and_values_or_thresholds)} @{$self->{_class_names}};
  88         290  
510 88         310 my $class_entropy_for_child =
511             $self->class_entropy_for_a_given_sequence_of_features_and_values_or_thresholds(
512             \@extended_branch_features_and_values_or_thresholds);
513 88 50       297 if ($self->{_debug3}) {
514 0         0 print "\nRD19 branch attributes: @extended_branch_features_and_values_or_thresholds\n";
515 0         0 print "\nRD20 class entropy for child: $class_entropy_for_child\n";
516             }
517 88 100       302 if ($existing_node_entropy - $class_entropy_for_child > $self->{_entropy_threshold}) {
518 45         290 my $child_node = DTNode->new(undef, $class_entropy_for_child,
519             \@class_probabilities, \@extended_branch_features_and_values_or_thresholds, $self);
520 45         227 $node->add_child_link($child_node);
521 45         289 $self->recursive_descent($child_node);
522             } else {
523 43 50       281 print "\nRD21 This child will NOT result in a node\n" if $self->{_debug3};
524             }
525             }
526             }
527             } else {
528             print "\nRD22 REACHED LEAF NODE NATURALLY for: @features_and_values_or_thresholds_on_branch\n"
529 20 50       74 if $self->{_debug3};
530 20         159 return;
531             }
532             }
533              
534             ## This is the heart of the decision tree constructor. Its main job is to figure
535             ## out the best feature to use for partitioning the training data samples that
536             ## correspond to the current node. The search for the best feature is carried out
537             ## differently for symbolic features and for numeric features. For a symbolic
538             ## feature, the method estimates the entropy for each value of the feature and then
539             ## averages out these entropies as a measure of the discriminatory power of that
540             ## features. For a numeric feature, on the other hand, it estimates the entropy
541             ## reduction that can be achieved if were to partition the set of training samples
542             ## for each possible threshold. For a numeric feature, all possible sampling points
543             ## relevant to the node in question are considered as candidates for thresholds.
544             sub best_feature_calculator {
545 46     46 0 108 my $self = shift;
546 46         105 my $features_and_values_or_thresholds_on_branch = shift;
547 46         120 my $existing_node_entropy = shift;
548 46         217 my @features_and_values_or_thresholds_on_branch = @$features_and_values_or_thresholds_on_branch;
549 46         109 my $pattern1 = '(.+)=(.+)';
550 46         106 my $pattern2 = '(.+)<(.+)';
551 46         145 my $pattern3 = '(.+)>(.+)';
552 46         125 my @all_symbolic_features = ();
553 46         112 foreach my $feature_name (@{$self->{_feature_names}}) {
  46         153  
554             push @all_symbolic_features, $feature_name
555 184 50       611 if ! exists $self->{_prob_distribution_numeric_features_hash}->{$feature_name};
556             }
557 46         150 my @symbolic_features_already_used = ();
558 46         136 foreach my $feature_and_value_or_threshold (@features_and_values_or_thresholds_on_branch) {
559 138 50       1156 push @symbolic_features_already_used, $1 if $feature_and_value_or_threshold =~ /$pattern1/;
560             }
561 46         150 my @symbolic_features_not_yet_used;
562 46         130 foreach my $x (@all_symbolic_features) {
563 184 100       539 push @symbolic_features_not_yet_used, $x unless contained_in($x, @symbolic_features_already_used);
564             }
565 46         128 my @true_numeric_types = ();
566 46         103 my @symbolic_types = ();
567 46         160 my @true_numeric_types_feature_names = ();
568 46         116 my @symbolic_types_feature_names = ();
569 46         118 foreach my $item (@features_and_values_or_thresholds_on_branch) {
570 138 50       1047 if ($item =~ /$pattern2/) {
    50          
    50          
571 0         0 push @true_numeric_types, $item;
572 0         0 push @true_numeric_types_feature_names, $1;
573             } elsif ($item =~ /$pattern3/) {
574 0         0 push @true_numeric_types, $item;
575 0         0 push @true_numeric_types_feature_names, $1;
576             } elsif ($item =~ /$pattern1/) {
577 138         342 push @symbolic_types, $item;
578 138         439 push @symbolic_types_feature_names, $1;
579             } else {
580 0         0 die "format error in the representation of feature and values or thresholds";
581             }
582             }
583 46         182 my %seen = ();
584 46 0       130 @true_numeric_types_feature_names = grep {$_ if !$seen{$_}++} @true_numeric_types_feature_names;
  0         0  
585 46         123 %seen = ();
586 46 50       141 @symbolic_types_feature_names = grep {$_ if !$seen{$_}++} @symbolic_types_feature_names;
  138         765  
587             my @bounded_intervals_numeric_types =
588 46         112 @{$self->find_bounded_intervals_for_numeric_features(\@true_numeric_types)};
  46         205  
589             # Calculate the upper and the lower bounds to be used when searching for the best
590             # threshold for each of the numeric features that are in play at the current node:
591 46         141 my (%upperbound, %lowerbound);
592 46         144 foreach my $feature (@true_numeric_types_feature_names) {
593 0         0 $upperbound{$feature} = undef;
594 0         0 $lowerbound{$feature} = undef;
595             }
596 46         130 foreach my $item (@bounded_intervals_numeric_types) {
597 0         0 foreach my $feature_grouping (@$item) {
598 0 0       0 if ($feature_grouping->[1] eq '>') {
599 0         0 $lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];
600             } else {
601 0         0 $upperbound{$feature_grouping->[0]} = $feature_grouping->[2];
602             }
603             }
604             }
605 46         141 my %entropy_values_for_different_features = ();
606 46         114 my %partitioning_point_child_entropies_hash = ();
607 46         107 my %partitioning_point_threshold = ();
608 46         118 my %entropies_for_different_values_of_symbolic_feature = ();
609 46         106 foreach my $feature (@{$self->{_feature_names}}) {
  46         159  
610 184         448 $entropy_values_for_different_features{$feature} = [];
611 184         453 $partitioning_point_child_entropies_hash{$feature} = {};
612 184         374 $partitioning_point_threshold{$feature} = undef;
613 184         519 $entropies_for_different_values_of_symbolic_feature{$feature} = [];
614             }
615 46         112 foreach my $i (0..@{$self->{_feature_names}}-1) {
  46         167  
616 184         544 my $feature_name = $self->{_feature_names}->[$i];
617 184 50       545 print "\n\nBFC1 FEATURE BEING CONSIDERED: $feature_name\n" if $self->{_debug3};
618 184 100 33     590 if (contained_in($feature_name, @symbolic_features_already_used)) {
    50          
619 138         431 next;
620 46         231 } elsif (contained_in($feature_name, keys %{$self->{_numeric_features_valuerange_hash}}) &&
621             $self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
622             $self->{_symbolic_to_numeric_cardinality_threshold}) {
623 0         0 my @values = @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
  0         0  
624 0 0       0 print "\nBFC4 values for $feature_name are @values\n" if $self->{_debug3};
625 0         0 my @newvalues = ();
626 0 0       0 if (contained_in($feature_name, @true_numeric_types_feature_names)) {
627 0 0 0     0 if (defined($upperbound{$feature_name}) && defined($lowerbound{$feature_name}) &&
    0 0        
    0 0        
    0 0        
628             $lowerbound{$feature_name} >= $upperbound{$feature_name}) {
629 0         0 next;
630             } elsif (defined($upperbound{$feature_name}) && defined($lowerbound{$feature_name}) &&
631             $lowerbound{$feature_name} < $upperbound{$feature_name}) {
632 0         0 foreach my $x (@values) {
633 0 0 0     0 push @newvalues, $x if $x > $lowerbound{$feature_name} && $x <= $upperbound{$feature_name};
634             }
635             } elsif (defined($upperbound{$feature_name})) {
636 0         0 foreach my $x (@values) {
637 0 0       0 push @newvalues, $x if $x <= $upperbound{$feature_name};
638             }
639             } elsif (defined($lowerbound{$feature_name})) {
640 0         0 foreach my $x (@values) {
641 0 0       0 push @newvalues, $x if $x > $lowerbound{$feature_name};
642             }
643             } else {
644 0         0 die "Error is bound specifications in best feature calculator";
645             }
646             } else {
647 0         0 @newvalues = @{deep_copy_array(\@values)};
  0         0  
648             }
649 0 0       0 next if @newvalues == 0;
650 0         0 my @partitioning_entropies = ();
651 0         0 foreach my $value (@newvalues) {
652 0         0 my $feature_and_less_than_value_string = "$feature_name" . '<' . "$value";
653 0         0 my $feature_and_greater_than_value_string = "$feature_name" . '>' . "$value";
654 0         0 my @for_left_child;
655             my @for_right_child;
656 0 0       0 if (@features_and_values_or_thresholds_on_branch) {
657 0         0 @for_left_child = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
  0         0  
658 0         0 push @for_left_child, $feature_and_less_than_value_string;
659 0         0 @for_right_child = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
  0         0  
660 0         0 push @for_right_child, $feature_and_greater_than_value_string;
661             } else {
662 0         0 @for_left_child = ($feature_and_less_than_value_string);
663 0         0 @for_right_child = ($feature_and_greater_than_value_string);
664             }
665 0         0 my $entropy1 = $self->class_entropy_for_less_than_threshold_for_feature(
666             \@features_and_values_or_thresholds_on_branch, $feature_name, $value);
667 0         0 my $entropy2 = $self->class_entropy_for_greater_than_threshold_for_feature(
668             \@features_and_values_or_thresholds_on_branch, $feature_name, $value);
669 0         0 my $partitioning_entropy = $entropy1 *
670             $self->probability_of_a_sequence_of_features_and_values_or_thresholds(\@for_left_child) +
671             $entropy2 *
672             $self->probability_of_a_sequence_of_features_and_values_or_thresholds(\@for_right_child);
673              
674 0         0 push @partitioning_entropies, $partitioning_entropy;
675 0         0 $partitioning_point_child_entropies_hash{$feature_name}{$value} = [$entropy1, $entropy2];
676             }
677 0         0 my ($min_entropy, $best_partition_point_index) = minimum(\@partitioning_entropies);
678 0 0       0 if ($min_entropy < $existing_node_entropy) {
679 0         0 $partitioning_point_threshold{$feature_name} = $newvalues[$best_partition_point_index];
680 0         0 $entropy_values_for_different_features{$feature_name} = $min_entropy;
681             }
682             } else {
683 46 50       190 print "\nBFC2: Entering section reserved for symbolic features\n" if $self->{_debug3};
684 46 50       175 print "\nBFC3 Feature name: $feature_name\n" if $self->{_debug3};
685 46         140 my %seen;
686 159 50       1182 my @values = grep {$_ ne 'NA' && !$seen{$_}++}
687 46         117 @{$self->{_features_and_unique_values_hash}->{$feature_name}};
  46         223  
688 46         282 @values = sort @values;
689 46 50       210 print "\nBFC4 values for feature $feature_name are @values\n" if $self->{_debug3};
690              
691 46         134 my $entropy = 0;
692 46         154 foreach my $value (@values) {
693 159         674 my $feature_value_string = "$feature_name" . '=' . "$value";
694 159 50       536 print "\nBFC4 feature_value_string: $feature_value_string\n" if $self->{_debug3};
695 159         313 my @extended_attributes = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
  159         528  
696 159 100       570 if (@features_and_values_or_thresholds_on_branch) {
697 145         385 push @extended_attributes, $feature_value_string;
698             } else {
699 14         62 @extended_attributes = ($feature_value_string);
700             }
701 159         608 $entropy +=
702             $self->class_entropy_for_a_given_sequence_of_features_and_values_or_thresholds(\@extended_attributes) *
703             $self->probability_of_a_sequence_of_features_and_values_or_thresholds(\@extended_attributes);
704             print "\nBFC5 Entropy calculated for symbolic feature value choice ($feature_name,$value) " .
705 159 50       595 "is $entropy\n" if $self->{_debug3};
706 159         377 push @{$entropies_for_different_values_of_symbolic_feature{$feature_name}}, $entropy;
  159         799  
707             }
708 46 50       225 if ($entropy < $existing_node_entropy) {
709 46         465 $entropy_values_for_different_features{$feature_name} = $entropy;
710             }
711             }
712             }
713 46         189 my $min_entropy_for_best_feature;
714             my $best_feature_name;
715 46         224 foreach my $feature_nom (keys %entropy_values_for_different_features) {
716 184 100       478 if (!defined($best_feature_name)) {
717 46         106 $best_feature_name = $feature_nom;
718 46         157 $min_entropy_for_best_feature = $entropy_values_for_different_features{$feature_nom};
719             } else {
720 138 100       503 if ($entropy_values_for_different_features{$feature_nom} < $min_entropy_for_best_feature) {
721 43         131 $best_feature_name = $feature_nom;
722 43         118 $min_entropy_for_best_feature = $entropy_values_for_different_features{$feature_nom};
723             }
724             }
725             }
726 46         120 my $threshold_for_best_feature;
727 46 50       155 if (exists $partitioning_point_threshold{$best_feature_name}) {
728 46         126 $threshold_for_best_feature = $partitioning_point_threshold{$best_feature_name};
729             } else {
730 0         0 $threshold_for_best_feature = undef;
731             }
732 46         116 my $best_feature_entropy = $min_entropy_for_best_feature;
733 46         112 my @val_based_entropies_to_be_returned;
734             my $decision_val_to_be_returned;
735 46 50 33     236 if (exists $self->{_numeric_features_valuerange_hash}->{$best_feature_name} &&
736             $self->{_feature_values_how_many_uniques_hash}->{$best_feature_name} >
737             $self->{_symbolic_to_numeric_cardinality_threshold}) {
738             @val_based_entropies_to_be_returned =
739 0         0 @{$partitioning_point_child_entropies_hash{$best_feature_name}{$threshold_for_best_feature}};
  0         0  
740             } else {
741 46         125 @val_based_entropies_to_be_returned = ();
742             }
743 46 50       152 if (exists $partitioning_point_threshold{$best_feature_name}) {
744 46         110 $decision_val_to_be_returned = $partitioning_point_threshold{$best_feature_name};
745             } else {
746 0         0 $decision_val_to_be_returned = undef;
747             }
748             print "\nBFC6 Val based entropies to be returned for feature $best_feature_name are " .
749 46 50       174 "@val_based_entropies_to_be_returned\n" if $self->{_debug3};
750 46         1506 return ($best_feature_name, $best_feature_entropy, \@val_based_entropies_to_be_returned,
751             $decision_val_to_be_returned);
752             }
753              
754             ######################################### Entropy Calculators #####################################
755              
756             sub class_entropy_on_priors {
757 1     1 0 5 my $self = shift;
758             return $self->{_entropy_cache}->{'priors'}
759 1 50       8 if exists $self->{_entropy_cache}->{"priors"};
760 1         4 my @class_names = @{$self->{_class_names}};
  1         7  
761 1         4 my $entropy;
762 1         5 foreach my $class (@class_names) {
763 2         11 my $prob = $self->prior_probability_for_class($class);
764 2 50 33     28 my $log_prob = log($prob) / log(2) if ($prob >= 0.0001) && ($prob <= 0.999) ;
765 2 50       10 $log_prob = 0 if $prob < 0.0001; # since X.log(X)->0 as X->0
766 2 50       10 $log_prob = 0 if $prob > 0.999; # since log(1) = 0
767 2 100       10 if (!defined $entropy) {
768 1         4 $entropy = -1.0 * $prob * $log_prob;
769 1         5 next;
770             }
771 1         7 $entropy += -1.0 * $prob * $log_prob;
772             }
773 1         5 $self->{_entropy_cache}->{'priors'} = $entropy;
774 1         5 return $entropy;
775             }
776              
777             sub entropy_scanner_for_a_numeric_feature {
778 0     0 0 0 local $| = 1;
779 0         0 my $self = shift;
780 0         0 my $feature = shift;
781 0         0 my @all_sampling_points = @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature}};
  0         0  
782 0         0 my @entropies_for_less_than_thresholds = ();
783 0         0 my @entropies_for_greater_than_thresholds = ();
784 0         0 foreach my $point (@all_sampling_points) {
785 0         0 print ". ";
786 0         0 push @entropies_for_less_than_thresholds,
787             $self->class_entropy_for_less_than_threshold_for_feature([], $feature, $point);
788 0         0 push @entropies_for_greater_than_thresholds,
789             $self->class_entropy_for_greater_than_threshold_for_feature([], $feature, $point);
790             }
791 0         0 print "\n\nSCANNER: All entropies less than thresholds for feature $feature are: ".
792             "@entropies_for_less_than_thresholds\n";
793 0         0 print "\nSCANNER: All entropies greater than thresholds for feature $feature are: ".
794             "@entropies_for_greater_than_thresholds\n";
795             }
796              
797             sub class_entropy_for_less_than_threshold_for_feature {
798 0     0 0 0 my $self = shift;
799 0         0 my $arr = shift;
800 0         0 my $feature = shift;
801 0         0 my $threshold = shift;
802 0         0 my @array_of_features_and_values_or_thresholds = @$arr;
803 0         0 my $feature_threshold_combo = "$feature" . '<' . "$threshold";
804 0         0 my $sequence = join ":", @array_of_features_and_values_or_thresholds;
805 0         0 $sequence .= ":" . $feature_threshold_combo;
806 0 0       0 return $self->{_entropy_cache}->{$sequence} if exists $self->{_entropy_cache}->{$sequence};
807             my @copy_of_array_of_features_and_values_or_thresholds =
808 0         0 @{deep_copy_array(\@array_of_features_and_values_or_thresholds)};
  0         0  
809 0         0 push @copy_of_array_of_features_and_values_or_thresholds, $feature_threshold_combo;
810 0         0 my $entropy = 0;
811 0         0 foreach my $class_name (@{$self->{_class_names}}) {
  0         0  
812 0         0 my $log_prob = undef;
813 0         0 my $prob = $self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds(
814             $class_name, \@copy_of_array_of_features_and_values_or_thresholds);
815 0 0 0     0 if ($prob >= 0.0001 && $prob <= 0.999) {
    0          
    0          
816 0         0 $log_prob = log($prob) / log(2.0);
817             } elsif ($prob < 0.0001) {
818 0         0 $log_prob = 0;
819             } elsif ($prob > 0.999) {
820 0         0 $log_prob = 0;
821             } else {
822 0         0 die "An error has occurred in log_prob calculation";
823             }
824 0         0 $entropy += -1.0 * $prob * $log_prob;
825             }
826 0 0       0 if (abs($entropy) < 0.0000001) {
827 0         0 $entropy = 0.0;
828             }
829 0         0 $self->{_entropy_cache}->{$sequence} = $entropy;
830 0         0 return $entropy;
831             }
832              
833             sub class_entropy_for_greater_than_threshold_for_feature {
834 0     0 0 0 my $self = shift;
835 0         0 my $arr = shift;
836 0         0 my $feature = shift;
837 0         0 my $threshold = shift;
838 0         0 my @array_of_features_and_values_or_thresholds = @$arr;
839 0         0 my $feature_threshold_combo = "$feature" . '>' . "$threshold";
840 0         0 my $sequence = join ":", @array_of_features_and_values_or_thresholds;
841 0         0 $sequence .= ":" . $feature_threshold_combo;
842 0 0       0 return $self->{_entropy_cache}->{$sequence} if exists $self->{_entropy_cache}->{$sequence};
843             my @copy_of_array_of_features_and_values_or_thresholds =
844 0         0 @{deep_copy_array(\@array_of_features_and_values_or_thresholds)};
  0         0  
845 0         0 push @copy_of_array_of_features_and_values_or_thresholds, $feature_threshold_combo;
846 0         0 my $entropy = 0;
847 0         0 foreach my $class_name (@{$self->{_class_names}}) {
  0         0  
848 0         0 my $log_prob = undef;
849 0         0 my $prob = $self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds(
850             $class_name, \@copy_of_array_of_features_and_values_or_thresholds);
851 0 0 0     0 if ($prob >= 0.0001 && $prob <= 0.999) {
    0          
    0          
852 0         0 $log_prob = log($prob) / log(2.0);
853             } elsif ($prob < 0.0001) {
854 0         0 $log_prob = 0;
855             } elsif ($prob > 0.999) {
856 0         0 $log_prob = 0;
857             } else {
858 0         0 die "An error has occurred in log_prob calculation";
859             }
860 0         0 $entropy += -1.0 * $prob * $log_prob;
861             }
862 0 0       0 if (abs($entropy) < 0.0000001) {
863 0         0 $entropy = 0.0;
864             }
865 0         0 $self->{_entropy_cache}->{$sequence} = $entropy;
866 0         0 return $entropy;
867             }
868              
869             sub class_entropy_for_a_given_sequence_of_features_and_values_or_thresholds {
870 247     247 0 566 my $self = shift;
871 247         456 my $array_of_features_and_values_or_thresholds = shift;
872 247         670 my @array_of_features_and_values_or_thresholds = @$array_of_features_and_values_or_thresholds;
873 247         725 my $sequence = join ":", @array_of_features_and_values_or_thresholds;
874 247 100       1033 return $self->{_entropy_cache}->{$sequence} if exists $self->{_entropy_cache}->{$sequence};
875 159         363 my $entropy = 0;
876 159         310 foreach my $class_name (@{$self->{_class_names}}) {
  159         469  
877 318         698 my $log_prob = undef;
878 318         1015 my $prob = $self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds(
879             $class_name, \@array_of_features_and_values_or_thresholds);
880 318 50 33     1672 if ($prob >= 0.0001 && $prob <= 0.999) {
    0          
    0          
881 318         1056 $log_prob = log($prob) / log(2.0);
882             } elsif ($prob < 0.0001) {
883 0         0 $log_prob = 0;
884             } elsif ($prob > 0.999) {
885 0         0 $log_prob = 0;
886             } else {
887 0         0 die "An error has occurred in log_prob calculation";
888             }
889 318         918 $entropy += -1.0 * $prob * $log_prob;
890             }
891 159 50       593 if (abs($entropy) < 0.0000001) {
892 0         0 $entropy = 0.0;
893             }
894 159         578 $self->{_entropy_cache}->{$sequence} = $entropy;
895 159         791 return $entropy;
896             }
897              
898              
899             ##################################### Probability Calculators ########################################
900              
901             sub prior_probability_for_class {
902 4     4 0 10 my $self = shift;
903 4         11 my $class = shift;
904 4         14 my $class_name_in_cache = "prior" . '::' . $class;
905             return $self->{_probability_cache}->{$class_name_in_cache}
906 4 50       25 if exists $self->{_probability_cache}->{$class_name_in_cache};
907 0         0 my $total_num_of_samples = keys %{$self->{_samples_class_label_hash}};
  0         0  
908 0         0 my @values = values %{$self->{_samples_class_label_hash}};
  0         0  
909 0         0 foreach my $class_name (@{$self->{_class_names}}) {
  0         0  
910 0         0 my @trues = grep {$_ eq $class_name} @values;
  0         0  
911 0         0 my $prior_for_this_class = (1.0 * @trues) / $total_num_of_samples;
912 0         0 my $this_class_name_in_cache = "prior" . '::' . $class_name;
913 0         0 $self->{_probability_cache}->{$this_class_name_in_cache} = $prior_for_this_class;
914             }
915 0         0 return $self->{_probability_cache}->{$class_name_in_cache};
916             }
917              
918             sub calculate_class_priors {
919 1     1 1 19 my $self = shift;
920 1 50       4 return if scalar keys %{$self->{_class_priors_hash}} > 1;
  1         10  
921 1         5 foreach my $class_name (@{$self->{_class_names}}) {
  1         6  
922 2         11 my $class_name_in_cache = "prior::$class_name";
923 2         6 my $total_num_of_samples = scalar keys %{$self->{_samples_class_label_hash}};
  2         10  
924 2         6 my @all_values = values %{$self->{_samples_class_label_hash}};
  2         28  
925 2         8 my @trues_for_this_class = grep {$_ eq $class_name} @all_values;
  70         196  
926 2         11 my $prior_for_this_class = (1.0 * (scalar @trues_for_this_class)) / $total_num_of_samples;
927 2         11 $self->{_class_priors_hash}->{$class_name} = $prior_for_this_class;
928 2         9 my $this_class_name_in_cache = "prior::$class_name";
929 2         17 $self->{_probability_cache}->{$this_class_name_in_cache} = $prior_for_this_class;
930             }
931 1 50       9 if ($self->{_debug1}) {
932 0         0 foreach my $class (sort keys %{$self->{_class_priors_hash}}) {
  0         0  
933 0         0 print "$class => $self->{_class_priors_hash}->{$class}\n";
934             }
935             }
936             }
937              
938             sub calculate_first_order_probabilities {
939 1     1 1 21 print "\nEstimating probabilities...\n";
940 1         3 my $self = shift;
941 1         2 foreach my $feature (@{$self->{_feature_names}}) {
  1         4  
942 4         30 $self->probability_of_feature_value($feature, undef);
943 4 50       28 if ($self->{_debug2}) {
944 0 0       0 if (exists $self->{_prob_distribution_numeric_features_hash}->{$feature}) {
945 0         0 print "\nPresenting probability distribution for a numeric feature:\n";
946 0         0 foreach my $sampling_point (sort {$a <=> $b} keys
  0         0  
947 0         0 %{$self->{_prob_distribution_numeric_features_hash}->{$feature}}) {
948 0         0 my $sampling_pt_for_display = sprintf("%.2f", $sampling_point);
949             print "$feature :: $sampling_pt_for_display=" . sprintf("%.5f",
950 0         0 $self->{_prob_distribution_numeric_features_hash}->{$feature}{$sampling_point}) . "\n";
951             }
952             } else {
953 0         0 print "\nPresenting probabilities for the values of a feature considered to be symbolic:\n";
954 0         0 my @values_for_feature = @{$self->{_features_and_unique_values_hash}->{$feature}};
  0         0  
955 0         0 foreach my $value (sort @values_for_feature) {
956 0         0 my $prob = $self->probability_of_feature_value($feature,$value);
957 0         0 print "$feature :: $value = " . sprintf("%.5f", $prob) . "\n";
958             }
959             }
960             }
961             }
962             }
963              
964             sub probability_of_feature_value {
965 441     441 0 938 my $self = shift;
966 441         919 my $feature_name = shift;
967 441         942 my $value = shift;
968 441 50 66     2687 $value = sprintf("%.1f", $value) if defined($value) && $value =~ /^\d+$/;
969 441 50 66     2082 if (defined($value) && exists($self->{_sampling_points_for_numeric_feature_hash}->{$feature_name})) {
970             $value = closest_sampling_point($value,
971 0         0 $self->{_sampling_points_for_numeric_feature_hash}->{$feature_name});
972             }
973 441         849 my $feature_and_value;
974 441 100       1224 if (defined($value)) {
975 437         1160 $feature_and_value = "$feature_name=$value";
976             }
977 441 50 66     2348 if (defined($value) && exists($self->{_probability_cache}->{$feature_and_value})) {
978 437         1732 return $self->{_probability_cache}->{$feature_and_value};
979             }
980 4         15 my ($histogram_delta, $num_of_histogram_bins, @valuerange, $diffrange) = (undef,undef,undef,undef);
981 4 50       19 if (exists $self->{_numeric_features_valuerange_hash}->{$feature_name}) {
982 0 0       0 if ($self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
983             $self->{_symbolic_to_numeric_cardinality_threshold}) {
984 0 0       0 if (! exists $self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}) {
985 0         0 @valuerange = @{$self->{_numeric_features_valuerange_hash}->{$feature_name}};
  0         0  
986 0         0 $diffrange = $valuerange[1] - $valuerange[0];
987 0         0 my %seen = ();
988 0 0 0     0 my @unique_values_for_feature = sort {$a <=> $b} grep {$_ if $_ ne 'NA' && !$seen{$_}++}
  0         0  
989 0         0 @{$self->{_features_and_values_hash}->{$feature_name}};
  0         0  
990 0         0 my @diffs = sort {$a <=> $b} map {$unique_values_for_feature[$_] -
  0         0  
  0         0  
991             $unique_values_for_feature[$_-1]} 1..@unique_values_for_feature-1;
992 0         0 my $median_diff = $diffs[int(@diffs/2) - 1];
993 0         0 $histogram_delta = $median_diff * 2;
994 0 0       0 if ($histogram_delta < $diffrange / 500.0) {
995 0 0       0 if (defined $self->{_number_of_histogram_bins}) {
996 0         0 $histogram_delta = $diffrange / $self->{_number_of_histogram_bins};
997             } else {
998 0         0 $histogram_delta = $diffrange / 500.0;
999             }
1000             }
1001 0         0 $self->{_histogram_delta_hash}->{$feature_name} = $histogram_delta;
1002 0         0 $num_of_histogram_bins = int($diffrange / $histogram_delta) + 1;
1003 0         0 $self->{_num_of_histogram_bins_hash}->{$feature_name} = $num_of_histogram_bins;
1004 0         0 my @sampling_points_for_feature = map {$valuerange[0] + $histogram_delta * $_}
  0         0  
1005             0..$num_of_histogram_bins-1;
1006 0         0 $self->{_sampling_points_for_numeric_feature_hash}->{$feature_name} =
1007             \@sampling_points_for_feature;
1008             }
1009             }
1010             }
1011 4 50       15 if (exists $self->{_numeric_features_valuerange_hash}->{$feature_name}) {
1012 0 0       0 if ($self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
1013             $self->{_symbolic_to_numeric_cardinality_threshold}) {
1014             my @sampling_points_for_feature =
1015 0         0 @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
  0         0  
1016 0         0 my @counts_at_sampling_points = (0) x @sampling_points_for_feature;
1017 0         0 my @actual_values_for_feature = grep {$_ ne 'NA'}
1018 0         0 @{$self->{_features_and_values_hash}->{$feature_name}};
  0         0  
1019 0         0 foreach my $i (0..@sampling_points_for_feature-1) {
1020 0         0 foreach my $j (0..@actual_values_for_feature-1) {
1021 0 0       0 if (abs($sampling_points_for_feature[$i]-$actual_values_for_feature[$j]) < $histogram_delta) {
1022 0         0 $counts_at_sampling_points[$i]++
1023             }
1024             }
1025             }
1026 0         0 my $total_counts = 0;
1027 0         0 map {$total_counts += $_} @counts_at_sampling_points;
  0         0  
1028 0         0 my @probs = map {$_ / (1.0 * $total_counts)} @counts_at_sampling_points;
  0         0  
1029 0         0 my %bin_prob_hash = ();
1030 0         0 foreach my $i (0..@sampling_points_for_feature-1) {
1031 0         0 $bin_prob_hash{$sampling_points_for_feature[$i]} = $probs[$i];
1032             }
1033 0         0 $self->{_prob_distribution_numeric_features_hash}->{$feature_name} = \%bin_prob_hash;
1034 0         0 my @values_for_feature = map "$feature_name=$_", map {sprintf("%.5f", $_)}
  0         0  
1035             @sampling_points_for_feature;
1036 0         0 foreach my $i (0..@values_for_feature-1) {
1037 0         0 $self->{_probability_cache}->{$values_for_feature[$i]} = $probs[$i];
1038             }
1039 0 0 0     0 if (defined($value) && exists $self->{_probability_cache}->{$feature_and_value}) {
1040 0         0 return $self->{_probability_cache}->{$feature_and_value};
1041             } else {
1042 0         0 return 0;
1043             }
1044             } else {
1045 0         0 my %seen = ();
1046 0 0 0     0 my @values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++}
1047 0         0 @{$self->{_features_and_values_hash}->{$feature_name}};
  0         0  
1048 0         0 @values_for_feature = map {"$feature_name=$_"} @values_for_feature;
  0         0  
1049 0         0 my @value_counts = (0) x @values_for_feature;
1050             # foreach my $sample (sort {sample_index($a) cmp sample_index($b)}keys %{$self->{_training_data_hash}}){
1051 0         0 foreach my $sample (sort {sample_index($a) <=> sample_index($b)}keys %{$self->{_training_data_hash}}){
  0         0  
  0         0  
1052 0         0 my @features_and_values = @{$self->{_training_data_hash}->{$sample}};
  0         0  
1053 0         0 foreach my $i (0..@values_for_feature-1) {
1054 0         0 foreach my $current_value (@features_and_values) {
1055 0 0       0 $value_counts[$i]++ if $values_for_feature[$i] eq $current_value;
1056             }
1057             }
1058             }
1059 0         0 my $total_counts = 0;
1060 0         0 map {$total_counts += $_} @value_counts;
  0         0  
1061 0 0       0 die "PFV Something is wrong with your training file. It contains no training samples \
1062             for feature named $feature_name" if $total_counts == 0;
1063 0         0 my @probs = map {$_ / (1.0 * $total_counts)} @value_counts;
  0         0  
1064 0         0 foreach my $i (0..@values_for_feature-1) {
1065 0         0 $self->{_probability_cache}->{$values_for_feature[$i]} = $probs[$i];
1066             }
1067 0 0 0     0 if (defined($value) && exists $self->{_probability_cache}->{$feature_and_value}) {
1068 0         0 return $self->{_probability_cache}->{$feature_and_value};
1069             } else {
1070 0         0 return 0;
1071             }
1072             }
1073             } else {
1074             # This section is only for purely symbolic features:
1075 4         12 my @values_for_feature = @{$self->{_features_and_values_hash}->{$feature_name}};
  4         33  
1076 4         15 @values_for_feature = map {"$feature_name=$_"} @values_for_feature;
  140         272  
1077 4         21 my @value_counts = (0) x @values_for_feature;
1078             # foreach my $sample (sort {sample_index($a) cmp sample_index($b)} keys %{$self->{_training_data_hash}}) {
1079 4         10 foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %{$self->{_training_data_hash}}) {
  544         1144  
  4         53  
1080 140         281 my @features_and_values = @{$self->{_training_data_hash}->{$sample}};
  140         657  
1081 140         401 foreach my $i (0..@values_for_feature-1) {
1082 4900         9059 for my $current_value (@features_and_values) {
1083 19600 100       47918 $value_counts[$i]++ if $values_for_feature[$i] eq $current_value;
1084             }
1085             }
1086             }
1087 4         56 foreach my $i (0..@values_for_feature-1) {
1088             $self->{_probability_cache}->{$values_for_feature[$i]} =
1089 140         221 $value_counts[$i] / (1.0 * scalar(keys %{$self->{_training_data_hash}}));
  140         382  
1090             }
1091 4 50 33     32 if (defined($value) && exists $self->{_probability_cache}->{$feature_and_value}) {
1092 0         0 return $self->{_probability_cache}->{$feature_and_value};
1093             } else {
1094 4         40 return 0;
1095             }
1096             }
1097             }
1098              
1099             sub probability_of_feature_value_given_class {
1100 882     882 0 1879 my $self = shift;
1101 882         1754 my $feature_name = shift;
1102 882         1672 my $feature_value = shift;
1103 882         1720 my $class_name = shift;
1104 882 50 33     4736 $feature_value = sprintf("%.1f", $feature_value) if defined($feature_value) && $feature_value =~ /^\d+$/;
1105 882 50 33     4195 if (defined($feature_value) && exists($self->{_sampling_points_for_numeric_feature_hash}->{$feature_name})) {
1106             $feature_value = closest_sampling_point($feature_value,
1107 0         0 $self->{_sampling_points_for_numeric_feature_hash}->{$feature_name});
1108             }
1109 882         1780 my $feature_value_class;
1110 882 50       2234 if (defined($feature_value)) {
1111 882         2648 $feature_value_class = "$feature_name=$feature_value" . "::" . "$class_name";
1112             }
1113 882 100 66     4649 if (defined($feature_value) && exists($self->{_probability_cache}->{$feature_value_class})) {
1114             print "\nNext answer returned by cache for feature $feature_name and " .
1115 874 50       2536 "value $feature_value given class $class_name\n" if $self->{_debug2};
1116 874         3495 return $self->{_probability_cache}->{$feature_value_class};
1117             }
1118 8         42 my ($histogram_delta, $num_of_histogram_bins, @valuerange, $diffrange) = (undef,undef,undef,undef);
1119              
1120 8 50       45 if (exists $self->{_numeric_features_valuerange_hash}->{$feature_name}) {
1121 0 0       0 if ($self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
1122             $self->{_symbolic_to_numeric_cardinality_threshold}) {
1123 0         0 $histogram_delta = $self->{_histogram_delta_hash}->{$feature_name};
1124 0         0 $num_of_histogram_bins = $self->{_num_of_histogram_bins_hash}->{$feature_name};
1125 0         0 @valuerange = @{$self->{_numeric_features_valuerange_hash}->{$feature_name}};
  0         0  
1126 0         0 $diffrange = $valuerange[1] - $valuerange[0];
1127             }
1128             }
1129 8         24 my @samples_for_class = ();
1130             # Accumulate all samples names for the given class:
1131 8         22 foreach my $sample_name (keys %{$self->{_samples_class_label_hash}}) {
  8         98  
1132 280 100       907 if ($self->{_samples_class_label_hash}->{$sample_name} eq $class_name) {
1133 140         384 push @samples_for_class, $sample_name;
1134             }
1135             }
1136 8 50       54 if (exists($self->{_numeric_features_valuerange_hash}->{$feature_name})) {
1137 0 0       0 if ($self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
1138             $self->{_symbolic_to_numeric_cardinality_threshold}) {
1139             my @sampling_points_for_feature =
1140 0         0 @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
  0         0  
1141 0         0 my @counts_at_sampling_points = (0) x @sampling_points_for_feature;
1142 0         0 my @actual_feature_values_for_samples_in_class = ();
1143 0         0 foreach my $sample (@samples_for_class) {
1144 0         0 foreach my $feature_and_value (@{$self->{_training_data_hash}->{$sample}}) {
  0         0  
1145 0         0 my $pattern = '(.+)=(.+)';
1146 0         0 $feature_and_value =~ /$pattern/;
1147 0         0 my ($feature, $value) = ($1, $2);
1148 0 0 0     0 if (($feature eq $feature_name) && ($value ne 'NA')) {
1149 0         0 push @actual_feature_values_for_samples_in_class, $value;
1150             }
1151             }
1152             }
1153 0         0 foreach my $i (0..@sampling_points_for_feature-1) {
1154 0         0 foreach my $j (0..@actual_feature_values_for_samples_in_class-1) {
1155 0 0       0 if (abs($sampling_points_for_feature[$i] -
1156             $actual_feature_values_for_samples_in_class[$j]) < $histogram_delta) {
1157 0         0 $counts_at_sampling_points[$i]++;
1158             }
1159             }
1160             }
1161 0         0 my $total_counts = 0;
1162 0         0 map {$total_counts += $_} @counts_at_sampling_points;
  0         0  
1163 0 0       0 die "PFVC1 Something is wrong with your training file. It contains no training " .
1164             "samples for Class $class_name and Feature $feature_name" if $total_counts == 0;
1165 0         0 my @probs = map {$_ / (1.0 * $total_counts)} @counts_at_sampling_points;
  0         0  
1166 0         0 my @values_for_feature_and_class = map {"$feature_name=$_" . "::" . "$class_name"}
  0         0  
1167             @sampling_points_for_feature;
1168 0         0 foreach my $i (0..@values_for_feature_and_class-1) {
1169 0         0 $self->{_probability_cache}->{$values_for_feature_and_class[$i]} = $probs[$i];
1170             }
1171 0 0       0 if (exists $self->{_probability_cache}->{$feature_value_class}) {
1172 0         0 return $self->{_probability_cache}->{$feature_value_class};
1173             } else {
1174 0         0 return 0;
1175             }
1176             } else {
1177             # This section is for numeric features that will be treated symbolically
1178 0         0 my %seen = ();
1179 0 0 0     0 my @values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++}
1180 0         0 @{$self->{_features_and_values_hash}->{$feature_name}};
  0         0  
1181 0         0 @values_for_feature = map {"$feature_name=$_"} @values_for_feature;
  0         0  
1182 0         0 my @value_counts = (0) x @values_for_feature;
1183 0         0 foreach my $sample (@samples_for_class) {
1184 0         0 my @features_and_values = @{$self->{_training_data_hash}->{$sample}};
  0         0  
1185 0         0 foreach my $i (0..@values_for_feature-1) {
1186 0         0 foreach my $current_value (@features_and_values) {
1187 0 0       0 $value_counts[$i]++ if $values_for_feature[$i] eq $current_value;
1188             }
1189             }
1190             }
1191 0         0 my $total_counts = 0;
1192 0         0 map {$total_counts += $_} @value_counts;
  0         0  
1193 0 0       0 die "PFVC2 Something is wrong with your training file. It contains no training " .
1194             "samples for Class $class_name and Feature $feature_name" if $total_counts == 0;
1195             # We normalize by total_count because the probabilities are conditioned on a given class
1196 0         0 foreach my $i (0..@values_for_feature-1) {
1197 0         0 my $feature_and_value_and_class = "$values_for_feature[$i]" . "::" . "$class_name";
1198 0         0 $self->{_probability_cache}->{$feature_and_value_and_class} =
1199             $value_counts[$i] / (1.0 * $total_counts);
1200             }
1201 0 0       0 if (exists $self->{_probability_cache}->{$feature_value_class}) {
1202 0         0 return $self->{_probability_cache}->{$feature_value_class};
1203             } else {
1204 0         0 return 0;
1205             }
1206             }
1207             } else {
1208             # This section is for purely symbolic features
1209 8         25 my @values_for_feature = @{$self->{_features_and_values_hash}->{$feature_name}};
  8         107  
1210 8         35 my %seen = ();
1211 280 100 66     1926 @values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++}
1212 8         20 @{$self->{_features_and_values_hash}->{$feature_name}};
  8         36  
1213 8         33 @values_for_feature = map {"$feature_name=$_"} @values_for_feature;
  28         118  
1214 8         41 my @value_counts = (0) x @values_for_feature;
1215 8         31 foreach my $sample (@samples_for_class) {
1216 140         331 my @features_and_values = @{$self->{_training_data_hash}->{$sample}};
  140         631  
1217 140         440 foreach my $i (0..@values_for_feature-1) {
1218 490         1165 foreach my $current_value (@features_and_values) {
1219 1960 100       6044 $value_counts[$i]++ if $values_for_feature[$i] eq $current_value;
1220             }
1221             }
1222             }
1223 8         22 my $total_counts = 0;
1224 8         28 map {$total_counts += $_} @value_counts;
  28         79  
1225 8 50       39 die "PFVC2 Something is wrong with your training file. It contains no training " .
1226             "samples for Class $class_name and Feature $feature_name" if $total_counts == 0;
1227             # We normalize by total_count because the probabilities are conditioned on a given class
1228 8         41 foreach my $i (0..@values_for_feature-1) {
1229 28         110 my $feature_and_value_and_class = "$values_for_feature[$i]" . "::" . "$class_name";
1230 28         189 $self->{_probability_cache}->{$feature_and_value_and_class} =
1231             $value_counts[$i] / (1.0 * $total_counts);
1232             }
1233 8 50       43 if (exists $self->{_probability_cache}->{$feature_value_class}) {
1234 8         89 return $self->{_probability_cache}->{$feature_value_class};
1235             } else {
1236 0         0 return 0;
1237             }
1238             }
1239             }
1240              
1241             sub probability_of_feature_less_than_threshold {
1242 0     0 0 0 my $self = shift;
1243 0         0 my $feature_name = shift;
1244 0         0 my $threshold = shift;
1245 0         0 my $feature_threshold_combo = "$feature_name" . '<' . "$threshold";
1246             return $self->{_probability_cache}->{$feature_threshold_combo}
1247 0 0       0 if (exists $self->{_probability_cache}->{$feature_threshold_combo});
1248 0 0       0 my @all_values = grep {$_ if $_ ne 'NA'} @{$self->{_features_and_values_hash}->{$feature_name}};
  0         0  
  0         0  
1249 0 0       0 my @all_values_less_than_threshold = grep {$_ if $_ <= $threshold} @all_values;
  0         0  
1250 0         0 my $probability = (1.0 * @all_values_less_than_threshold) / @all_values;
1251 0         0 $self->{_probability_cache}->{$feature_threshold_combo} = $probability;
1252 0         0 return $probability;
1253             }
1254              
1255             sub probability_of_feature_less_than_threshold_given_class {
1256 0     0 0 0 my $self = shift;
1257 0         0 my $feature_name = shift;
1258 0         0 my $threshold = shift;
1259 0         0 my $class_name = shift;
1260 0         0 my $feature_threshold_class_combo = "$feature_name" . "<" . "$threshold" . "::" . "$class_name";
1261             return $self->{_probability_cache}->{$feature_threshold_class_combo}
1262 0 0       0 if (exists $self->{_probability_cache}->{$feature_threshold_class_combo});
1263 0         0 my @data_samples_for_class = ();
1264             # Accumulate all samples names for the given class:
1265 0         0 foreach my $sample_name (keys %{$self->{_samples_class_label_hash}}) {
  0         0  
1266             push @data_samples_for_class, $sample_name
1267 0 0       0 if $self->{_samples_class_label_hash}->{$sample_name} eq $class_name;
1268             }
1269 0         0 my @actual_feature_values_for_samples_in_class = ();
1270 0         0 foreach my $sample (@data_samples_for_class) {
1271 0         0 foreach my $feature_and_value (@{$self->{_training_data_hash}->{$sample}}) {
  0         0  
1272 0         0 my $pattern = '(.+)=(.+)';
1273 0         0 $feature_and_value =~ /$pattern/;
1274 0         0 my ($feature,$value) = ($1,$2);
1275 0 0 0     0 push @actual_feature_values_for_samples_in_class, $value
1276             if $feature eq $feature_name && $value ne 'NA';
1277             }
1278             }
1279 0 0       0 my @actual_points_for_feature_less_than_threshold = grep {$_ if $_ <= $threshold} @actual_feature_values_for_samples_in_class;
  0         0  
1280             # The condition in the assignment that follows was a bug correction in Version 3.20
1281 0 0       0 my $probability = @actual_feature_values_for_samples_in_class > 0 ? ((1.0 * @actual_points_for_feature_less_than_threshold) / @actual_feature_values_for_samples_in_class) : 0.0;
1282 0         0 $self->{_probability_cache}->{$feature_threshold_class_combo} = $probability;
1283 0         0 return $probability;
1284             }
1285              
1286             # This method requires that all truly numeric types only be expressed as '<' or '>'
1287             # constructs in the array of branch features and thresholds
1288             sub probability_of_a_sequence_of_features_and_values_or_thresholds {
1289 477     477 0 1038 my $self = shift;
1290 477         880 my $arr = shift;
1291 477         1471 my @array_of_features_and_values_or_thresholds = @$arr;
1292 477 50       1506 return if scalar @array_of_features_and_values_or_thresholds == 0;
1293 477         1469 my $sequence = join ':', @array_of_features_and_values_or_thresholds;
1294 477 100       2124 return $self->{_probability_cache}->{$sequence} if exists $self->{_probability_cache}->{$sequence};
1295 145         319 my $probability = undef;
1296 145         316 my $pattern1 = '(.+)=(.+)';
1297 145         306 my $pattern2 = '(.+)<(.+)';
1298 145         293 my $pattern3 = '(.+)>(.+)';
1299 145         320 my @true_numeric_types = ();
1300 145         312 my @true_numeric_types_feature_names = ();
1301 145         317 my @symbolic_types = ();
1302 145         317 my @symbolic_types_feature_names = ();
1303 145         371 foreach my $item (@array_of_features_and_values_or_thresholds) {
1304 437 50       1933 if ($item =~ /$pattern2/) {
    50          
1305 0         0 push @true_numeric_types, $item;
1306 0         0 my ($feature,$value) = ($1,$2);
1307 0         0 push @true_numeric_types_feature_names, $feature;
1308             } elsif ($item =~ /$pattern3/) {
1309 0         0 push @true_numeric_types, $item;
1310 0         0 my ($feature,$value) = ($1,$2);
1311 0         0 push @true_numeric_types_feature_names, $feature;
1312             } else {
1313 437         1003 push @symbolic_types, $item;
1314 437         1668 $item =~ /$pattern1/;
1315 437         1581 my ($feature,$value) = ($1,$2);
1316 437         1209 push @symbolic_types_feature_names, $feature;
1317             }
1318             }
1319 145         390 my %seen1 = ();
1320 145 0       360 @true_numeric_types_feature_names = grep {$_ if !$seen1{$_}++} @true_numeric_types_feature_names;
  0         0  
1321 145         325 my %seen2 = ();
1322 145 50       378 @symbolic_types_feature_names = grep {$_ if !$seen2{$_}++} @symbolic_types_feature_names;
  437         2337  
1323 145         524 my $bounded_intervals_numeric_types = $self->find_bounded_intervals_for_numeric_features(\@true_numeric_types);
1324             print_array_with_msg("POS: Answer returned by find_bounded: ",
1325 145 50       494 $bounded_intervals_numeric_types) if $self->{_debug2};
1326             # Calculate the upper and the lower bounds to be used when searching for the best
1327             # threshold for each of the numeric features that are in play at the current node:
1328 145         360 my (%upperbound, %lowerbound);
1329 145         318 foreach my $feature_name (@true_numeric_types_feature_names) {
1330 0         0 $upperbound{$feature_name} = undef;
1331 0         0 $lowerbound{$feature_name} = undef;
1332             }
1333 145         439 foreach my $item (@$bounded_intervals_numeric_types) {
1334 0         0 foreach my $feature_grouping (@$item) {
1335 0 0       0 if ($feature_grouping->[1] eq '>') {
1336 0         0 $lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];
1337             } else {
1338 0         0 $upperbound{$feature_grouping->[0]} = $feature_grouping->[2];
1339             }
1340             }
1341             }
1342 145         374 foreach my $feature_name (@true_numeric_types_feature_names) {
1343 0 0 0     0 if (defined($lowerbound{$feature_name}) && defined($upperbound{$feature_name}) &&
    0 0        
    0 0        
    0 0        
      0        
1344             $upperbound{$feature_name} <= $lowerbound{$feature_name}) {
1345 0         0 return 0;
1346             } elsif (defined($lowerbound{$feature_name}) && defined($upperbound{$feature_name})) {
1347 0 0       0 if (! $probability) {
1348             $probability = $self->probability_of_feature_less_than_threshold($feature_name,
1349             $upperbound{$feature_name}) -
1350 0         0 $self->probability_of_feature_less_than_threshold($feature_name, $lowerbound{$feature_name});
1351             } else {
1352             $probability *= ($self->probability_of_feature_less_than_threshold($feature_name,
1353             $upperbound{$feature_name}) -
1354 0         0 $self->probability_of_feature_less_than_threshold($feature_name, $lowerbound{$feature_name}))
1355             }
1356             } elsif (defined($upperbound{$feature_name}) && ! defined($lowerbound{$feature_name})) {
1357 0 0       0 if (! $probability) {
1358             $probability = $self->probability_of_feature_less_than_threshold($feature_name,
1359 0         0 $upperbound{$feature_name});
1360             } else {
1361             $probability *= $self->probability_of_feature_less_than_threshold($feature_name,
1362 0         0 $upperbound{$feature_name});
1363             }
1364             } elsif (defined($lowerbound{$feature_name}) && ! defined($upperbound{$feature_name})) {
1365 0 0       0 if (! $probability) {
1366             $probability = 1.0 - $self->probability_of_feature_less_than_threshold($feature_name,
1367 0         0 $lowerbound{$feature_name});
1368             } else {
1369             $probability *= (1.0 - $self->probability_of_feature_less_than_threshold($feature_name,
1370 0         0 $lowerbound{$feature_name}));
1371             }
1372             } else {
1373 0         0 die("Ill formatted call to 'probability_of_sequence' method");
1374             }
1375             }
1376 145         367 foreach my $feature_and_value (@symbolic_types) {
1377 437 50       2771 if ($feature_and_value =~ /$pattern1/) {
1378 437         1751 my ($feature,$value) = ($1,$2);
1379 437 100       1326 if (! $probability) {
1380 145         501 $probability = $self->probability_of_feature_value($feature, $value);
1381             } else {
1382 292         837 $probability *= $self->probability_of_feature_value($feature, $value);
1383             }
1384             }
1385             }
1386 145         606 $self->{_probability_cache}->{$sequence} = $probability;
1387 145         746 return $probability;
1388             }
1389              
1390             ## The following method requires that all truly numeric types only be expressed as
1391             ## '<' or '>' constructs in the array of branch features and thresholds
1392             sub probability_of_a_sequence_of_features_and_values_or_thresholds_given_class {
1393 318     318 0 665 my $self = shift;
1394 318         592 my $arr = shift;
1395 318         723 my $class_name = shift;
1396 318         901 my @array_of_features_and_values_or_thresholds = @$arr;
1397 318 50       984 return if scalar @array_of_features_and_values_or_thresholds == 0;
1398 318         958 my $sequence = join ':', @array_of_features_and_values_or_thresholds;
1399 318         870 my $sequence_with_class = "$sequence" . "::" . $class_name;
1400             return $self->{_probability_cache}->{$sequence_with_class}
1401 318 100       1203 if exists $self->{_probability_cache}->{$sequence_with_class};
1402 298         618 my $probability = undef;
1403 298         606 my $pattern1 = '(.+)=(.+)';
1404 298         587 my $pattern2 = '(.+)<(.+)';
1405 298         550 my $pattern3 = '(.+)>(.+)';
1406 298         660 my @true_numeric_types = ();
1407 298         593 my @true_numeric_types_feature_names = ();
1408 298         652 my @symbolic_types = ();
1409 298         625 my @symbolic_types_feature_names = ();
1410 298         657 foreach my $item (@array_of_features_and_values_or_thresholds) {
1411 882 50       3740 if ($item =~ /$pattern2/) {
    50          
1412 0         0 push @true_numeric_types, $item;
1413 0         0 my ($feature,$value) = ($1,$2);
1414 0         0 push @true_numeric_types_feature_names, $feature;
1415             } elsif ($item =~ /$pattern3/) {
1416 0         0 push @true_numeric_types, $item;
1417 0         0 my ($feature,$value) = ($1,$2);
1418 0         0 push @true_numeric_types_feature_names, $feature;
1419             } else {
1420 882         2021 push @symbolic_types, $item;
1421 882         3527 $item =~ /$pattern1/;
1422 882         3261 my ($feature,$value) = ($1,$2);
1423 882         2508 push @symbolic_types_feature_names, $feature;
1424             }
1425             }
1426 298         808 my %seen1 = ();
1427 298 0       717 @true_numeric_types_feature_names = grep {$_ if !$seen1{$_}++} @true_numeric_types_feature_names;
  0         0  
1428 298         621 my %seen2 = ();
1429 298 50       772 @symbolic_types_feature_names = grep {$_ if !$seen2{$_}++} @symbolic_types_feature_names;
  882         4830  
1430 298         1110 my $bounded_intervals_numeric_types = $self->find_bounded_intervals_for_numeric_features(\@true_numeric_types);
1431             print_array_with_msg("POSC: Answer returned by find_bounded: ",
1432 298 50       1013 $bounded_intervals_numeric_types) if $self->{_debug2};
1433             # Calculate the upper and the lower bounds to be used when searching for the best
1434             # threshold for each of the numeric features that are in play at the current node:
1435 298         629 my (%upperbound, %lowerbound);
1436 298         645 foreach my $feature_name (@true_numeric_types_feature_names) {
1437 0         0 $upperbound{$feature_name} = undef;
1438 0         0 $lowerbound{$feature_name} = undef;
1439             }
1440 298         676 foreach my $item (@$bounded_intervals_numeric_types) {
1441 0         0 foreach my $feature_grouping (@$item) {
1442 0 0       0 if ($feature_grouping->[1] eq '>') {
1443 0         0 $lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];
1444             } else {
1445 0         0 $upperbound{$feature_grouping->[0]} = $feature_grouping->[2];
1446             }
1447             }
1448             }
1449 298         689 foreach my $feature_name (@true_numeric_types_feature_names) {
1450 0 0 0     0 if ($lowerbound{$feature_name} && $upperbound{$feature_name} &&
    0 0        
    0 0        
    0 0        
      0        
1451             $upperbound{$feature_name} <= $lowerbound{$feature_name}) {
1452 0         0 return 0;
1453             } elsif (defined($lowerbound{$feature_name}) && defined($upperbound{$feature_name})) {
1454 0 0       0 if (! $probability) {
1455              
1456             $probability = $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1457             $upperbound{$feature_name}, $class_name) -
1458             $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1459 0         0 $lowerbound{$feature_name}, $class_name);
1460             } else {
1461             $probability *= ($self->probability_of_feature_less_than_threshold_given_class($feature_name,
1462             $upperbound{$feature_name}, $class_name) -
1463             $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1464 0         0 $lowerbound{$feature_name}, $class_name))
1465             }
1466             } elsif (defined($upperbound{$feature_name}) && ! defined($lowerbound{$feature_name})) {
1467 0 0       0 if (! $probability) {
1468             $probability = $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1469 0         0 $upperbound{$feature_name}, $class_name);
1470             } else {
1471             $probability *= $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1472 0         0 $upperbound{$feature_name}, $class_name);
1473             }
1474             } elsif (defined($lowerbound{$feature_name}) && ! defined($upperbound{$feature_name})) {
1475 0 0       0 if (! $probability) {
1476             $probability = 1.0 - $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1477 0         0 $lowerbound{$feature_name}, $class_name);
1478             } else {
1479             $probability *= (1.0 - $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1480 0         0 $lowerbound{$feature_name}, $class_name));
1481             }
1482             } else {
1483 0         0 die("Ill formatted call to 'probability of sequence given class' method");
1484             }
1485             }
1486 298         651 foreach my $feature_and_value (@symbolic_types) {
1487 882 50       5684 if ($feature_and_value =~ /$pattern1/) {
1488 882         3452 my ($feature,$value) = ($1,$2);
1489 882 100       2372 if (! $probability) {
1490 298         1090 $probability = $self->probability_of_feature_value_given_class($feature, $value, $class_name);
1491             } else {
1492 584         1678 $probability *= $self->probability_of_feature_value_given_class($feature, $value, $class_name);
1493             }
1494             }
1495             }
1496 298         1302 $self->{_probability_cache}->{$sequence_with_class} = $probability;
1497 298         1690 return $probability;
1498             }
1499              
1500             sub probability_of_a_class_given_sequence_of_features_and_values_or_thresholds {
1501 494     494 0 1036 my $self = shift;
1502 494         957 my $class_name = shift;
1503 494         889 my $arr = shift;
1504 494         1370 my @array_of_features_and_values_or_thresholds = @$arr;
1505 494         1349 my $sequence = join ':', @array_of_features_and_values_or_thresholds;
1506 494         1298 my $class_and_sequence = "$class_name" . "::" . $sequence;
1507             return $self->{_probability_cache}->{$class_and_sequence}
1508 494 100       2370 if exists $self->{_probability_cache}->{$class_and_sequence};
1509 159         385 my @array_of_class_probabilities = (0) x scalar @{$self->{_class_names}};
  159         573  
1510 159         391 foreach my $i (0..@{$self->{_class_names}}-1) {
  159         501  
1511 318         761 my $class_name = $self->{_class_names}->[$i];
1512 318         1356 my $prob = $self->probability_of_a_sequence_of_features_and_values_or_thresholds_given_class(
1513             \@array_of_features_and_values_or_thresholds, $class_name);
1514 318 50       1127 if ($prob < 0.000001) {
1515 0         0 $array_of_class_probabilities[$i] = 0.0;
1516 0         0 next;
1517             }
1518 318         1122 my $prob_of_feature_sequence = $self->probability_of_a_sequence_of_features_and_values_or_thresholds(
1519             \@array_of_features_and_values_or_thresholds);
1520             # die "PCS Something is wrong with your sequence of feature values and thresholds in " .
1521             # "probability_of_a_class_given_sequence_of_features_and_values_or_thresholds()"
1522             # if ! $prob_of_feature_sequence;
1523 318         1094 my $prior = $self->{_class_priors_hash}->{$self->{_class_names}->[$i]};
1524 318 50       822 if ($prob_of_feature_sequence) {
1525 318         1220 $array_of_class_probabilities[$i] = $prob * $prior / $prob_of_feature_sequence;
1526             } else {
1527 0         0 $array_of_class_probabilities[$i] = $prior;
1528             }
1529             }
1530 159         363 my $sum_probability;
1531 159         433 map {$sum_probability += $_} @array_of_class_probabilities;
  318         931  
1532 159 50       491 if ($sum_probability == 0) {
1533 0         0 @array_of_class_probabilities = map {1.0 / (scalar @{$self->{_class_names}})}
  0         0  
1534 0         0 (0..@{$self->{_class_names}}-1);
  0         0  
1535             } else {
1536 159         392 @array_of_class_probabilities = map {$_ * 1.0 / $sum_probability} @array_of_class_probabilities;
  318         1035  
1537             }
1538 159         376 foreach my $i (0..@{$self->{_class_names}}-1) {
  159         571  
1539 318         1161 my $this_class_and_sequence = "$self->{_class_names}->[$i]" . "::" . "$sequence";
1540 318         1501 $self->{_probability_cache}->{$this_class_and_sequence} = $array_of_class_probabilities[$i];
1541             }
1542 159         636 return $self->{_probability_cache}->{$class_and_sequence};
1543             }
1544              
1545             ####################################### Class Based Utilities ##########################################
1546              
1547             ## Given a list of branch attributes for the numeric features of the form, say,
1548             ## ['g2<1','g2<2','g2<3','age>34','age>36','age>37'], this method returns the
1549             ## smallest list that is relevant for the purpose of calculating the probabilities.
1550             ## To explain, the probability that the feature `g2' is less than 1 AND, at the same
1551             ## time, less than 2, AND, at the same time, less than 3, is the same as the
1552             ## probability that the feature less than 1. Similarly, the probability that 'age'
1553             ## is greater than 34 and also greater than 37 is the same as `age' being greater
1554             ## than 37.
1555             sub find_bounded_intervals_for_numeric_features {
1556 489     489 0 1104 my $self = shift;
1557 489         912 my $arr = shift;
1558 489         1061 my @arr = @$arr;
1559 489         917 my @features = @{$self->{_feature_names}};
  489         1651  
1560 489         1170 my @arr1 = map {my @x = split /(>|<)/, $_; \@x} @arr;
  0         0  
  0         0  
1561 489 50       1456 print_array_with_msg("arr1", \@arr1) if $self->{_debug2};
1562 489         1027 my @arr3 = ();
1563 489         1133 foreach my $feature_name (@features) {
1564 1956         3893 my @temp = ();
1565 1956         3831 foreach my $x (@arr1) {
1566 0 0 0     0 push @temp, $x if @$x > 0 && $x->[0] eq $feature_name;
1567             }
1568 1956 50       5964 push @arr3, \@temp if @temp > 0;
1569             }
1570 489 50       1444 print_array_with_msg("arr3", \@arr3) if $self->{_debug2};
1571             # Sort each list so that '<' entries occur before '>' entries:
1572 489         915 my @arr4;
1573 489         992 foreach my $li (@arr3) {
1574 0         0 my @sorted = sort {$a->[1] cmp $b->[1]} @$li;
  0         0  
1575 0         0 push @arr4, \@sorted;
1576             }
1577 489 50       1387 print_array_with_msg("arr4", \@arr4) if $self->{_debug2};
1578 489         904 my @arr5;
1579 489         1030 foreach my $li (@arr4) {
1580 0         0 my @temp1 = ();
1581 0         0 my @temp2 = ();
1582 0         0 foreach my $inner (@$li) {
1583 0 0       0 if ($inner->[1] eq '<') {
1584 0         0 push @temp1, $inner;
1585             } else {
1586 0         0 push @temp2, $inner;
1587             }
1588             }
1589 0 0 0     0 if (@temp1 > 0 && @temp2 > 0) {
    0          
1590 0         0 push @arr5, [\@temp1, \@temp2];
1591             } elsif (@temp1 > 0) {
1592 0         0 push @arr5, [\@temp1];
1593             } else {
1594 0         0 push @arr5, [\@temp2];
1595             }
1596             }
1597 489 50       1396 print_array_with_msg("arr5", \@arr5) if $self->{_debug2};
1598 489         1069 my @arr6 = ();
1599 489         1036 foreach my $li (@arr5) {
1600 0         0 my @temp1 = ();
1601 0         0 foreach my $inner (@$li) {
1602 0         0 my @sorted = sort {$a->[2] <=> $b->[2]} @$inner;
  0         0  
1603 0         0 push @temp1, \@sorted;
1604             }
1605 0         0 push @arr6, \@temp1;
1606             }
1607 489 50       1301 print_array_with_msg("arr6", \@arr6) if $self->{_debug2};
1608 489         1034 my @arr9 = ();
1609 489         1042 foreach my $li (@arr6) {
1610 0         0 foreach my $alist (@$li) {
1611 0         0 my @newalist = ();
1612 0 0       0 if ($alist->[0][1] eq '<') {
1613 0         0 push @newalist, $alist->[0];
1614             } else {
1615 0         0 push @newalist, $alist->[-1];
1616             }
1617 0 0       0 if ($alist->[0][1] ne $alist->[-1][1]) {
1618 0         0 push @newalist, $alist->[-1];
1619             }
1620 0         0 push @arr9, \@newalist;
1621             }
1622             }
1623 489 50       1284 print_array_with_msg('arr9', \@arr9) if $self->{_debug2};
1624 489         1787 return \@arr9;
1625              
1626             }
1627              
1628             ## This method is used to verify that you used legal feature names in the test
1629             ## sample that you want to classify with the decision tree.
1630             sub check_names_used {
1631 0     0 0 0 my $self = shift;
1632 0         0 my $features_and_values_test_data = shift;
1633 0         0 my @features_and_values_test_data = @$features_and_values_test_data;
1634 0         0 my $pattern = '(\S+)\s*=\s*(\S+)';
1635 0         0 foreach my $feature_and_value (@features_and_values_test_data) {
1636 0         0 $feature_and_value =~ /$pattern/;
1637 0         0 my ($feature,$value) = ($1,$2);
1638 0 0 0     0 die "Your test data has formatting error" unless defined($feature) && defined($value);
1639 0 0       0 return 0 unless contained_in($feature, @{$self->{_feature_names}});
  0         0  
1640             }
1641 0         0 return 1;
1642             }
1643              
1644             ####################################### Data Condition Calculator ######################################
1645              
1646             ## This method estimates the worst-case fan-out of the decision tree taking into
1647             ## account the number of values (and therefore the number of branches emanating from
1648             ## a node) for the symbolic features.
1649             sub determine_data_condition {
1650 0     0 0 0 my $self = shift;
1651 0         0 my $num_of_features = scalar @{$self->{_feature_names}};
  0         0  
1652 0         0 my @values = ();
1653 0         0 my @number_of_values;
1654 0         0 foreach my $feature (keys %{$self->{_features_and_unique_values_hash}}) {
  0         0  
1655 0         0 push @values, @{$self->{_features_and_unique_values_hash}->{$feature}}
1656 0 0       0 if ! contained_in($feature, keys %{$self->{_numeric_features_valuerange_hash}});
  0         0  
1657 0         0 push @number_of_values, scalar @values;
1658             }
1659 0 0       0 return if ! @values;
1660 0         0 print "Number of features: $num_of_features\n";
1661 0         0 my @minmax = minmax(\@number_of_values);
1662 0         0 my $max_num_values = $minmax[1];
1663 0         0 print "Largest number of values for symbolic features is: $max_num_values\n";
1664 0         0 my $estimated_number_of_nodes = $max_num_values ** $num_of_features;
1665 0         0 print "\nWORST CASE SCENARIO: The decision tree COULD have as many as $estimated_number_of_nodes " .
1666             "nodes. The exact number of nodes created depends critically on " .
1667             "the entropy_threshold used for node expansion (the default value " .
1668             "for this threshold is 0.01) and on the value set for max_depth_desired " .
1669             "for the depth of the tree.\n";
1670 0 0       0 if ($estimated_number_of_nodes > 10000) {
1671 0         0 print "\nTHIS IS WAY TOO MANY NODES. Consider using a relatively " .
1672             "large value for entropy_threshold and/or a small value for " .
1673             "for max_depth_desired to reduce the number of nodes created.\n";
1674 0         0 print "\nDo you wish to continue anyway? Enter 'y' for yes: ";
1675 0         0 my $answer = ;
1676 0         0 $answer =~ s/\r?\n?$//;
1677 0   0     0 while ( ($answer !~ /y(es)?/i) && ($answer !~ /n(o)?/i) ) {
1678 0         0 print "\nAnswer not recognized. Let's try again. Enter 'y' or 'n': ";
1679 0         0 $answer = ;
1680 0         0 $answer =~ s/\r?\n?$//;
1681             }
1682 0 0       0 die unless $answer =~ /y(es)?/i;
1683             }
1684             }
1685              
1686              
1687             #################################### Read Training Data From File ######################################
1688              
1689              
1690             sub get_training_data {
1691 1     1 1 7 my $self = shift;
1692 1 50       10 die("Aborted. get_training_data_csv() is only for CSV files") unless $self->{_training_datafile} =~ /\.csv$/;
1693 1         4 my %class_names = ();
1694 1         3 my %all_record_ids_with_class_labels;
1695             my $firstline;
1696 1         0 my %data_hash;
1697 1         4 $|++;
1698 1   50     25 open FILEIN, $self->{_training_datafile} || die "unable to open $self->{_training_datafile}: $!";
1699 1         4 my $record_index = 0;
1700 1         2 my $firsetline;
1701 1         16 while () {
1702 36 50       115 next if /^[ ]*\r?\n?$/;
1703 36         112 $_ =~ s/\r?\n?$//;
1704 36 50       84 my $record = $self->{_csv_cleanup_needed} ? cleanup_csv($_) : $_;
1705 36 100       77 if ($record_index == 0) {
1706 1         2 $firstline = $record;
1707 1         3 $record_index++;
1708 1         11 next;
1709             }
1710 35         152 my @parts = split /,/, $record;
1711 35         66 my $classname = $parts[$self->{_csv_class_column_index}];
1712 35         49 $class_names{$classname} = 1;
1713 35         56 my $record_label = shift @parts;
1714 35         81 $record_label =~ s/^\s*\"|\"\s*$//g;
1715 35         77 $data_hash{$record_label} = \@parts;
1716 35         59 $all_record_ids_with_class_labels{$record_label} = $classname;
1717 35 50       76 print "." if $record_index % 10000 == 0;
1718 35         108 $record_index++;
1719             }
1720 1         9 close FILEIN;
1721 1         3 $|--;
1722 1         5 $self->{_how_many_total_training_samples} = $record_index - 1; # must subtract 1 for the header record
1723 1 50       4 print "\n\nTotal number of training samples: $self->{_how_many_total_training_samples}\n" if $self->{_debug1};
1724 1         11 my @all_feature_names = split /,/, substr($firstline, index($firstline,','));
1725 1         4 my $class_column_heading = $all_feature_names[$self->{_csv_class_column_index}];
1726 1         4 my @all_class_names = sort map {"$class_column_heading=$_"} keys %class_names;
  2         12  
1727 1         3 my @feature_names = map {$all_feature_names[$_]} @{$self->{_csv_columns_for_features}};
  4         9  
  1         3  
1728 1         5 my %class_for_sample_hash = map {"sample_" . $_ => "$class_column_heading=" . $data_hash{$_}->[$self->{_csv_class_column_index} - 1 ] } keys %data_hash;
  35         96  
1729 1         10 my @sample_names = map {"sample_$_"} keys %data_hash;
  35         59  
1730 1 50       8 my %feature_values_for_samples_hash = map {my $sampleID = $_; "sample_" . $sampleID => [map {my $fname = $all_feature_names[$_]; $fname . "=" . eval{$data_hash{$sampleID}->[$_-1] =~ /^\d+$/ ? sprintf("%.1f", $data_hash{$sampleID}->[$_-1] ) : $data_hash{$sampleID}->[$_-1] } } @{$self->{_csv_columns_for_features}} ] } keys %data_hash;
  35         59  
  35         55  
  140         220  
  140         223  
  140         516  
  35         58  
1731 1 50       10 my %features_and_values_hash = map { my $a = $_; {$all_feature_names[$a] => [ map {my $b = $_; $b =~ /^\d+$/ ? sprintf("%.1f",$b) : $b} map {$data_hash{$_}->[$a-1]} keys %data_hash ]} } @{$self->{_csv_columns_for_features}};
  4         15  
  4         11  
  4         31  
  140         300  
  140         518  
  140         387  
  1         5  
1732 1         4 my %numeric_features_valuerange_hash = ();
1733 1         3 my %feature_values_how_many_uniques_hash = ();
1734 1         4 my %features_and_unique_values_hash = ();
1735 1         3 my $numregex = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
1736 1         5 foreach my $feature (keys %features_and_values_hash) {
1737 4         9 my %seen1 = ();
1738 140 100 66     655 my @unique_values_for_feature = sort grep {$_ if $_ ne 'NA' && !$seen1{$_}++}
1739 4         8 @{$features_and_values_hash{$feature}};
  4         16  
1740 4         15 $feature_values_how_many_uniques_hash{$feature} = scalar @unique_values_for_feature;
1741 4         11 my $not_all_values_float = 0;
1742 4 50       11 map {$not_all_values_float = 1 if $_ !~ /^$numregex$/} @unique_values_for_feature;
  14         152  
1743 4 50       13 if ($not_all_values_float == 0) {
1744 0         0 my @minmaxvalues = minmax(\@unique_values_for_feature);
1745 0         0 $numeric_features_valuerange_hash{$feature} = \@minmaxvalues;
1746             }
1747 4         17 $features_and_unique_values_hash{$feature} = \@unique_values_for_feature;
1748             }
1749 1 50       7 if ($self->{_debug1}) {
1750 0         0 print "\nAll class names: @all_class_names\n";
1751 0         0 print "\nEach sample data record:\n";
1752 0         0 foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %feature_values_for_samples_hash) {
  0         0  
1753 0         0 print "$sample => @{$feature_values_for_samples_hash{$sample}}\n";
  0         0  
1754             }
1755 0         0 print "\nclass label for each data sample:\n";
1756 0         0 foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %class_for_sample_hash) {
  0         0  
1757 0         0 print "$sample => $class_for_sample_hash{$sample}\n";
1758             }
1759 0         0 print "\nFeatures used: @feature_names\n\n";
1760 0         0 print "\nfeatures and the values taken by them:\n";
1761 0         0 foreach my $feature (sort keys %features_and_values_hash) {
1762 0         0 print "$feature => @{$features_and_values_hash{$feature}}\n";
  0         0  
1763             }
1764 0         0 print "\nnumeric features and their ranges:\n";
1765 0         0 foreach my $feature (sort keys %numeric_features_valuerange_hash) {
1766 0         0 print "$feature => @{$numeric_features_valuerange_hash{$feature}}\n";
  0         0  
1767             }
1768 0         0 print "\nnumber of unique values in each feature:\n";
1769 0         0 foreach my $feature (sort keys %feature_values_how_many_uniques_hash) {
1770 0         0 print "$feature => $feature_values_how_many_uniques_hash{$feature}\n";
1771             }
1772             }
1773 1         4 $self->{_class_names} = \@all_class_names;
1774 1         4 $self->{_feature_names} = \@feature_names;
1775 1         4 $self->{_samples_class_label_hash} = \%class_for_sample_hash;
1776 1         3 $self->{_training_data_hash} = \%feature_values_for_samples_hash;
1777 1         4 $self->{_features_and_values_hash} = \%features_and_values_hash;
1778 1         12 $self->{_features_and_unique_values_hash} = \%features_and_unique_values_hash;
1779 1         3 $self->{_numeric_features_valuerange_hash} = \%numeric_features_valuerange_hash;
1780 1         44 $self->{_feature_values_how_many_uniques_hash} = \%feature_values_how_many_uniques_hash;
1781             }
1782              
1783             sub show_training_data {
1784 0     0 1 0 my $self = shift;
1785 0         0 my @class_names = @{$self->{_class_names}};
  0         0  
1786 0         0 my %features_and_values_hash = %{$self->{_features_and_values_hash}};
  0         0  
1787 0         0 my %samples_class_label_hash = %{$self->{_samples_class_label_hash}};
  0         0  
1788 0         0 my %training_data_hash = %{$self->{_training_data_hash}};
  0         0  
1789 0         0 print "\n\nClass Names: @class_names\n";
1790 0         0 print "\n\nFeatures and Their Values:\n\n";
1791 0         0 while ( my ($k, $v) = each %features_and_values_hash ) {
1792 0         0 print "$k ---> @{$features_and_values_hash{$k}}\n";
  0         0  
1793             }
1794 0         0 print "\n\nSamples vs. Class Labels:\n\n";
1795 0         0 foreach my $kee (sort {sample_index($a) <=> sample_index($b)} keys %samples_class_label_hash) {
  0         0  
1796 0         0 print "$kee => $samples_class_label_hash{$kee}\n";
1797             }
1798 0         0 print "\n\nTraining Samples:\n\n";
1799 0         0 foreach my $kee (sort {sample_index($a) <=> sample_index($b)}
  0         0  
1800             keys %training_data_hash) {
1801 0         0 print "$kee => @{$training_data_hash{$kee}}\n";
  0         0  
1802             }
1803             }
1804              
1805             sub get_class_names {
1806 0     0 0 0 my $self = shift;
1807 0         0 return @{$self->{_class_names}}
  0         0  
1808             }
1809              
1810             ########################################## Utility Routines ############################################
1811              
1812             sub closest_sampling_point {
1813 0     0 0 0 my $value = shift;
1814 0         0 my $arr_ref = shift;
1815 0         0 my @arr = @{$arr_ref};
  0         0  
1816 0         0 my @compare = map {abs($_ - $value)} @arr;
  0         0  
1817 0         0 my ($minval,$index) = minimum(\@compare);
1818 0         0 return $arr[$index];
1819             }
1820              
1821             ## returns the array index that contains a specified STRING value: (meant only for array of strings)
1822             sub get_index_at_value {
1823 0     0 0 0 my $value = shift;
1824 0         0 my @array = @{shift @_};
  0         0  
1825 0         0 foreach my $i (0..@array-1) {
1826 0 0       0 return $i if $value eq $array[$i];
1827             }
1828             }
1829              
1830             ## When the training data is read from a CSV file, we assume that the first column
1831             ## of each data record contains a unique integer identifier for the record in that
1832             ## row. This training data is stored in a hash whose keys are the prefix 'sample_'
1833             ## followed by the identifying integers. The purpose of this function is to return
1834             ## the identifying integer associated with a data record.
1835             sub sample_index {
1836 1088     1088 0 1757 my $arg = shift;
1837 1088         2447 $arg =~ /_(.+)$/;
1838 1088         2685 return $1;
1839             }
1840              
1841             # Returns the minimum value and its positional index in an array
1842             sub minimum {
1843 0     0 0 0 my $arr = shift;
1844 0         0 my $min;
1845             my $index;
1846 0         0 foreach my $i (0..@{$arr}-1) {
  0         0  
1847 0 0 0     0 if ( (!defined $min) || ($arr->[$i] < $min) ) {
1848 0         0 $index = $i;
1849 0         0 $min = $arr->[$i];
1850             }
1851             }
1852 0         0 return ($min, $index);
1853             }
1854              
1855             # Returns an array of two values, the min and the max, of an array of floats
1856             sub minmax {
1857 0     0 0 0 my $arr = shift;
1858 0         0 my ($min, $max);
1859 0         0 foreach my $i (0..@{$arr}-1) {
  0         0  
1860 0 0 0     0 if ( (!defined $min) || ($arr->[$i] < $min) ) {
1861 0         0 $min = $arr->[$i];
1862             }
1863 0 0 0     0 if ( (!defined $max) || ($arr->[$i] > $max) ) {
1864 0         0 $max = $arr->[$i];
1865             }
1866             }
1867 0         0 return ($min, $max);
1868             }
1869              
1870             # checks whether an element is in an array:
1871             sub contained_in {
1872 414     414 0 858 my $ele = shift;
1873 414         1227 my @array = @_;
1874 414         858 my $count = 0;
1875 414 100       908 map {$count++ if $ele eq $_} @array;
  1104         3997  
1876 414         1727 return $count;
1877             }
1878              
1879             # Meant only for an array of strings (no nesting):
1880             sub deep_copy_array {
1881 289     289 0 619 my $ref_in = shift;
1882 289         578 my $ref_out;
1883 289 100       885 return [] if scalar @$ref_in == 0;
1884 274         525 foreach my $i (0..@{$ref_in}-1) {
  274         994  
1885 628         1783 $ref_out->[$i] = $ref_in->[$i];
1886             }
1887 274         986 return $ref_out;
1888             }
1889              
1890             sub check_for_illegal_params2 {
1891 1     1 0 3 my @params = @_;
1892 1         7 my @legal_params = qw / training_datafile
1893             entropy_threshold
1894             max_depth_desired
1895             csv_class_column_index
1896             csv_columns_for_features
1897             symbolic_to_numeric_cardinality_threshold
1898             number_of_histogram_bins
1899             csv_cleanup_needed
1900             debug1
1901             debug2
1902             debug3
1903             /;
1904 1         7 my $found_match_flag;
1905 1         3 foreach my $param (@params) {
1906 3         5 foreach my $legal (@legal_params) {
1907 10         16 $found_match_flag = 0;
1908 10 100       20 if ($param eq $legal) {
1909 3         4 $found_match_flag = 1;
1910 3         5 last;
1911             }
1912             }
1913 3 50       9 last if $found_match_flag == 0;
1914             }
1915 1         9 return $found_match_flag;
1916             }
1917              
1918             sub print_array_with_msg {
1919 0     0 0 0 my $message = shift;
1920 0         0 my $arr = shift;
1921 0         0 print "\n$message: ";
1922 0         0 print_nested_array( $arr );
1923             }
1924              
1925             sub print_nested_array {
1926 0     0 0 0 my $arr = shift;
1927 0         0 my @arr = @$arr;
1928 0         0 print "[";
1929 0         0 foreach my $item (@arr) {
1930 0 0       0 if (ref $item) {
1931 0         0 print_nested_array($item);
1932             } else {
1933 0         0 print "$item";
1934             }
1935             }
1936 0         0 print "]";
1937             }
1938              
1939             sub cleanup_csv {
1940 0     0 0 0 my $line = shift;
1941 0         0 $line =~ tr/\/:?()[]{}'/ /;
1942             # my @double_quoted = substr($line, index($line,',')) =~ /\"[^\"]+\"/g;
1943 0         0 my @double_quoted = substr($line, index($line,',')) =~ /\"[^\"]*\"/g;
1944 0         0 for (@double_quoted) {
1945 0         0 my $item = $_;
1946 0         0 $item = substr($item, 1, -1);
1947 0         0 $item =~ s/^\s+|,|\s+$//g;
1948 0         0 $item = join '_', split /\s+/, $item;
1949 0         0 substr($line, index($line, $_), length($_)) = $item;
1950             }
1951 0         0 my @white_spaced = $line =~ /,(\s*[^,]+)(?=,|$)/g;
1952 0         0 for (@white_spaced) {
1953 0         0 my $item = $_;
1954 0         0 $item =~ s/\s+/_/g;
1955 0         0 $item =~ s/^\s*_|_\s*$//g;
1956 0         0 substr($line, index($line, $_), length($_)) = $item;
1957             }
1958 0         0 $line =~ s/,\s*(?=,|$)/,NA/g;
1959 0         0 return $line;
1960             }
1961              
1962             ######################################### Class EvalTrainingData ########################################
1963              
1964             ## This subclass of the DecisionTree class is used to evaluate the quality of your
1965             ## training data by running a 10-fold cross-validation test on it. This test divides
1966             ## all of the training data into ten parts, with nine parts used for training a
1967             ## decision tree and one part used for testing its ability to classify correctly.
1968             ## This selection of nine parts for training and one part for testing is carried out
1969             ## in all of the ten different possible ways. This testing functionality can also
1970             ## be used to find the best values to use for the constructor parameters
1971             ## entropy_threshold, max_depth_desired, and
1972             ## symbolic_to_numeric_cardinality_threshold.
1973              
1974             ## Only the CSV training files can be evaluated in this manner (because only CSV
1975             ## training are allowed to have numeric features --- which is the more interesting
1976             ## case for evaluation analytics.
1977              
1978             package EvalTrainingData;
1979              
1980             @EvalTrainingData::ISA = ('Algorithm::DecisionTree');
1981              
1982             sub new {
1983 0     0   0 my $class = shift;
1984 0         0 my $instance = Algorithm::DecisionTree->new(@_);
1985 0         0 bless $instance, $class;
1986             }
1987              
1988             sub evaluate_training_data {
1989 0     0   0 my $self = shift;
1990 0         0 my $evaldebug = 0;
1991             die "The data evaluation function in the module can only be used when your " .
1992 0 0       0 "training data is in a CSV file" unless $self->{_training_datafile} =~ /\.csv$/;
1993 0         0 print "\nWill run a 10-fold cross-validation test on your training data to test its " .
1994             "class-discriminatory power:\n";
1995 0         0 my %all_training_data = %{$self->{_training_data_hash}};
  0         0  
1996 0         0 my @all_sample_names = sort {Algorithm::DecisionTree::sample_index($a) <=>
  0         0  
1997             Algorithm::DecisionTree::sample_index($b)} keys %all_training_data;
1998 0         0 my $fold_size = int(0.1 * (scalar keys %all_training_data));
1999 0         0 print "fold size: $fold_size\n";
2000 0         0 my %confusion_matrix = ();
2001 0         0 foreach my $class_name (@{$self->{_class_names}}) {
  0         0  
2002 0         0 foreach my $inner_class_name (@{$self->{_class_names}}) {
  0         0  
2003 0         0 $confusion_matrix{$class_name}->{$inner_class_name} = 0;
2004             }
2005             }
2006 0         0 foreach my $fold_index (0..9) {
2007 0         0 print "\nStarting the iteration indexed $fold_index of the 10-fold cross-validation test\n";
2008 0         0 my @testing_samples = @all_sample_names[$fold_size * $fold_index .. $fold_size * ($fold_index+1) - 1];
2009 0         0 my @training_samples = (@all_sample_names[0 .. $fold_size * $fold_index-1],
2010             @all_sample_names[$fold_size * ($fold_index+1) .. (scalar keys %all_training_data) - 1]);
2011 0         0 my %testing_data = ();
2012 0         0 foreach my $x (@testing_samples) {
2013 0         0 $testing_data{$x} = $all_training_data{$x};
2014             }
2015 0         0 my %training_data = ();
2016 0         0 foreach my $x (@training_samples) {
2017 0         0 $training_data{$x} = $all_training_data{$x};
2018             }
2019 0         0 my $trainingDT = Algorithm::DecisionTree->new('evalmode');
2020 0         0 $trainingDT->{_training_data_hash} = \%training_data;
2021 0         0 $trainingDT->{_class_names} = $self->{_class_names};
2022 0         0 $trainingDT->{_feature_names} = $self->{_feature_names};
2023 0         0 $trainingDT->{_entropy_threshold} = $self->{_entropy_threshold};
2024 0         0 $trainingDT->{_max_depth_desired} = $self->{_max_depth_desired};
2025             $trainingDT->{_symbolic_to_numeric_cardinality_threshold} =
2026 0         0 $self->{_symbolic_to_numeric_cardinality_threshold};
2027 0         0 foreach my $sample_name (@training_samples) {
2028             $trainingDT->{_samples_class_label_hash}->{$sample_name} =
2029 0         0 $self->{_samples_class_label_hash}->{$sample_name};
2030             }
2031 0         0 foreach my $feature (keys %{$self->{_features_and_values_hash}}) {
  0         0  
2032 0         0 $trainingDT->{_features_and_values_hash}->{$feature} = ();
2033             }
2034 0         0 my $pattern = '(\S+)\s*=\s*(\S+)';
2035 0         0 foreach my $item (sort {Algorithm::DecisionTree::sample_index($a) <=>
  0         0  
2036             Algorithm::DecisionTree::sample_index($b)}
2037 0         0 keys %{$trainingDT->{_training_data_hash}}) {
2038 0         0 foreach my $feature_and_value (@{$trainingDT->{_training_data_hash}->{$item}}) {
  0         0  
2039 0         0 $feature_and_value =~ /$pattern/;
2040 0         0 my ($feature,$value) = ($1,$2);
2041 0 0       0 push @{$trainingDT->{_features_and_values_hash}->{$feature}}, $value if $value ne 'NA';
  0         0  
2042             }
2043             }
2044 0         0 foreach my $feature (keys %{$trainingDT->{_features_and_values_hash}}) {
  0         0  
2045 0         0 my %seen = ();
2046 0 0 0     0 my @unique_values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++}
2047 0         0 @{$trainingDT->{_features_and_values_hash}->{$feature}};
  0         0  
2048 0 0       0 if (Algorithm::DecisionTree::contained_in($feature,
2049 0         0 keys %{$self->{_numeric_features_valuerange_hash}})) {
2050 0         0 @unique_values_for_feature = sort {$a <=> $b} @unique_values_for_feature;
  0         0  
2051             } else {
2052 0         0 @unique_values_for_feature = sort @unique_values_for_feature;
2053             }
2054 0         0 $trainingDT->{_features_and_unique_values_hash}->{$feature} = \@unique_values_for_feature;
2055             }
2056 0         0 foreach my $feature (keys %{$self->{_numeric_features_valuerange_hash}}) {
  0         0  
2057             my @minmaxvalues = Algorithm::DecisionTree::minmax(
2058 0         0 \@{$trainingDT->{_features_and_unique_values_hash}->{$feature}});
  0         0  
2059 0         0 $trainingDT->{_numeric_features_valuerange_hash}->{$feature} = \@minmaxvalues;
2060             }
2061 0 0       0 if ($evaldebug) {
2062 0         0 print "\n\nprinting samples in the testing set: @testing_samples\n";
2063 0         0 print "\n\nPrinting features and their values in the training set:\n";
2064 0         0 foreach my $item (sort keys %{$trainingDT->{_features_and_values_hash}}) {
  0         0  
2065 0         0 print "$item => @{$trainingDT->{_features_and_values_hash}->{$item}}\n";
  0         0  
2066             }
2067 0         0 print "\n\nPrinting unique values for features:\n";
2068 0         0 foreach my $item (sort keys %{$trainingDT->{_features_and_unique_values_hash}}) {
  0         0  
2069 0         0 print "$item => @{$trainingDT->{_features_and_unique_values_hash}->{$item}}\n";
  0         0  
2070             }
2071 0         0 print "\n\nPrinting unique value ranges for features:\n";
2072 0         0 foreach my $item (sort keys %{$trainingDT->{_numeric_features_valuerange_hash}}) {
  0         0  
2073 0         0 print "$item => @{$trainingDT->{_numeric_features_valuerange_hash}->{$item}}\n";
  0         0  
2074             }
2075             }
2076 0         0 foreach my $feature (keys %{$self->{_features_and_unique_values_hash}}) {
  0         0  
2077             $trainingDT->{_feature_values_how_many_uniques_hash}->{$feature} =
2078 0         0 scalar @{$trainingDT->{_features_and_unique_values_hash}->{$feature}};
  0         0  
2079             }
2080 0 0       0 $trainingDT->{_debug2} = 1 if $evaldebug;
2081 0         0 $trainingDT->calculate_first_order_probabilities();
2082 0         0 $trainingDT->calculate_class_priors();
2083 0         0 my $root_node = $trainingDT->construct_decision_tree_classifier();
2084 0 0       0 $root_node->display_decision_tree(" ") if $evaldebug;
2085 0         0 foreach my $test_sample_name (@testing_samples) {
2086 0         0 my @test_sample_data = @{$all_training_data{$test_sample_name}};
  0         0  
2087 0 0       0 print "original data in test sample: @test_sample_data\n" if $evaldebug;
2088 0 0 0     0 @test_sample_data = grep {$_ if $_ && $_ !~ /=NA$/} @test_sample_data;
  0         0  
2089 0 0       0 print "filtered data in test sample: @test_sample_data\n" if $evaldebug;
2090 0         0 my %classification = %{$trainingDT->classify($root_node, \@test_sample_data)};
  0         0  
2091 0         0 my @solution_path = @{$classification{'solution_path'}};
  0         0  
2092 0         0 delete $classification{'solution_path'};
2093 0         0 my @which_classes = keys %classification;
2094 0         0 @which_classes = sort {$classification{$b} <=> $classification{$a}} @which_classes;
  0         0  
2095 0         0 my $most_likely_class_label = $which_classes[0];
2096 0 0       0 if ($evaldebug) {
2097 0         0 print "\nClassification:\n\n";
2098 0         0 print " class probability\n";
2099 0         0 print " ---------- -----------\n";
2100 0         0 foreach my $which_class (@which_classes) {
2101 0         0 my $classstring = sprintf("%-30s", $which_class);
2102 0         0 my $valuestring = sprintf("%-30s", $classification{$which_class});
2103 0         0 print " $classstring $valuestring\n";
2104             }
2105 0         0 print "\nSolution path in the decision tree: @solution_path\n";
2106 0         0 print "\nNumber of nodes created: " . $root_node->how_many_nodes() . "\n";
2107             }
2108 0         0 my $true_class_label_for_sample = $self->{_samples_class_label_hash}->{$test_sample_name};
2109 0 0       0 print "$test_sample_name: true_class: $true_class_label_for_sample " .
2110             "estimated_class: $most_likely_class_label\n" if $evaldebug;
2111 0         0 $confusion_matrix{$true_class_label_for_sample}->{$most_likely_class_label} += 1;
2112             }
2113             }
2114 0         0 print "\n\n DISPLAYING THE CONFUSION MATRIX FOR THE 10-FOLD CROSS-VALIDATION TEST:\n\n\n";
2115 0         0 my $matrix_header = " " x 30;
2116 0         0 foreach my $class_name (@{$self->{_class_names}}) {
  0         0  
2117 0         0 $matrix_header .= sprintf("%-30s", $class_name);
2118             }
2119 0         0 print "\n" . $matrix_header . "\n\n";
2120 0         0 foreach my $row_class_name (sort keys %confusion_matrix) {
2121 0         0 my $row_display = sprintf("%-30s", $row_class_name);
2122 0         0 foreach my $col_class_name (sort keys %{$confusion_matrix{$row_class_name}}) {
  0         0  
2123 0         0 $row_display .= sprintf( "%-30u", $confusion_matrix{$row_class_name}->{$col_class_name} );
2124             }
2125 0         0 print "$row_display\n\n";
2126             }
2127 0         0 print "\n\n";
2128 0         0 my ($diagonal_sum, $off_diagonal_sum) = (0,0);
2129 0         0 foreach my $row_class_name (sort keys %confusion_matrix) {
2130 0         0 foreach my $col_class_name (sort keys %{$confusion_matrix{$row_class_name}}) {
  0         0  
2131 0 0       0 if ($row_class_name eq $col_class_name) {
2132 0         0 $diagonal_sum += $confusion_matrix{$row_class_name}->{$col_class_name};
2133             } else {
2134 0         0 $off_diagonal_sum += $confusion_matrix{$row_class_name}->{$col_class_name};
2135             }
2136             }
2137             }
2138 0         0 my $data_quality_index = 100.0 * $diagonal_sum / ($diagonal_sum + $off_diagonal_sum);
2139 0         0 print "\nTraining Data Quality Index: $data_quality_index (out of a possible maximum of 100)\n";
2140 0 0 0     0 if ($data_quality_index <= 80) {
    0 0        
    0 0        
    0          
2141 0         0 print "\nYour training data does not possess much class discriminatory " .
2142             "information. It could be that the classes are inherently not well " .
2143             "separable or that your constructor parameter choices are not appropriate.\n";
2144             } elsif ($data_quality_index > 80 && $data_quality_index <= 90) {
2145 0         0 print "\nYour training data possesses some class discriminatory information " .
2146             "but it may not be sufficient for real-world applications. You might " .
2147             "try tweaking the constructor parameters to see if that improves the " .
2148             "class discriminations.\n";
2149             } elsif ($data_quality_index > 90 && $data_quality_index <= 95) {
2150 0         0 print "\nYour training data appears to possess good class discriminatory " .
2151             "information. Whether or not it is acceptable would depend on your " .
2152             "application.\n";
2153             } elsif ($data_quality_index > 95 && $data_quality_index <= 98) {
2154 0         0 print "\nYour training data is of excellent quality.\n";
2155             } else {
2156 0         0 print "\nYour training data is perfect.\n";
2157             }
2158              
2159             }
2160              
2161              
2162             ############################################# Class DTNode #############################################
2163              
2164             # The nodes of the decision tree are instances of this class:
2165              
2166             package DTNode;
2167              
2168 1     1   23 use strict;
  1         5  
  1         39  
2169 1     1   14 use Carp;
  1         4  
  1         1578  
2170              
2171             # $feature is the feature test at the current node. $branch_features_and_values is
2172             # an anonymous array holding the feature names and corresponding values on the path
2173             # from the root to the current node:
2174             sub new {
2175 46     46   235 my ($class, $feature, $entropy, $class_probabilities,
2176             $branch_features_and_values_or_thresholds, $dt, $root_or_not) = @_;
2177 46 100       229 $root_or_not = '' if !defined $root_or_not;
2178 46 100       187 if ($root_or_not eq 'root') {
2179 1         5 $dt->{nodes_created} = -1;
2180 1         8 $dt->{class_names} = undef;
2181             }
2182 46         400 my $self = {
2183             _dt => $dt,
2184             _feature => $feature,
2185             _node_creation_entropy => $entropy,
2186             _class_probabilities => $class_probabilities,
2187             _branch_features_and_values_or_thresholds => $branch_features_and_values_or_thresholds,
2188             _linked_to => [],
2189             };
2190 46         153 bless $self, $class;
2191 46         199 $self->{_serial_number} = $self->get_next_serial_num();
2192 46         136 return $self;
2193             }
2194              
2195             sub how_many_nodes {
2196 0     0   0 my $self = shift;
2197 0         0 return $self->{_dt}->{nodes_created} + 1;
2198             }
2199              
2200             sub set_class_names {
2201 1     1   5 my $self = shift;
2202 1         4 my $class_names_list = shift;
2203 1         4 $self->{_dt}->{class_names} = $class_names_list;
2204             }
2205              
2206             sub get_class_names {
2207 0     0   0 my $self = shift;
2208 0         0 return $self->{_dt}->{class_names};
2209             }
2210              
2211             sub get_next_serial_num {
2212 46     46   110 my $self = shift;
2213 46         158 $self->{_dt}->{nodes_created} += 1;
2214 46         154 return $self->{_dt}->{nodes_created};
2215             }
2216              
2217             sub get_serial_num {
2218 46     46   115 my $self = shift;
2219 46         135 $self->{_serial_number};
2220             }
2221              
2222             # this returns the feature test at the current node
2223             sub get_feature {
2224 0     0   0 my $self = shift;
2225 0         0 return $self->{ _feature };
2226             }
2227              
2228             sub set_feature {
2229 46     46   114 my $self = shift;
2230 46         105 my $feature = shift;
2231 46         168 $self->{_feature} = $feature;
2232             }
2233              
2234             sub get_node_entropy {
2235 46     46   103 my $self = shift;
2236 46         132 return $self->{_node_creation_entropy};
2237             }
2238              
2239             sub get_class_probabilities {
2240 0     0   0 my $self = shift;
2241 0         0 return $self->{ _class_probabilities};
2242             }
2243              
2244             sub get_branch_features_and_values_or_thresholds {
2245 46     46   117 my $self = shift;
2246 46         178 return $self->{_branch_features_and_values_or_thresholds};
2247             }
2248              
2249             sub add_to_branch_features_and_values {
2250 0     0   0 my $self = shift;
2251 0         0 my $feature_and_value = shift;
2252 0         0 push @{$self->{ _branch_features_and_values }}, $feature_and_value;
  0         0  
2253             }
2254              
2255             sub get_children {
2256 0     0   0 my $self = shift;
2257 0         0 return $self->{_linked_to};
2258             }
2259              
2260             sub add_child_link {
2261 45     45   136 my ($self, $new_node, ) = @_;
2262 45         103 push @{$self->{_linked_to}}, $new_node;
  45         151  
2263             }
2264              
2265             sub delete_all_links {
2266 0     0   0 my $self = shift;
2267 0         0 $self->{_linked_to} = undef;
2268             }
2269              
2270             sub display_node {
2271 0     0   0 my $self = shift;
2272 0   0     0 my $feature_at_node = $self->get_feature() || " ";
2273 0         0 my $node_creation_entropy_at_node = $self->get_node_entropy();
2274 0         0 my $print_node_creation_entropy_at_node = sprintf("%.3f", $node_creation_entropy_at_node);
2275 0         0 my @class_probabilities = @{$self->get_class_probabilities()};
  0         0  
2276 0         0 my @class_probabilities_for_display = map {sprintf("%0.3f", $_)} @class_probabilities;
  0         0  
2277 0         0 my $serial_num = $self->get_serial_num();
2278 0         0 my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};
  0         0  
2279 0         0 print "\n\nNODE $serial_num" .
2280             ":\n Branch features and values to this node: @branch_features_and_values_or_thresholds" .
2281             "\n Class probabilities at current node: @class_probabilities_for_display" .
2282             "\n Entropy at current node: $print_node_creation_entropy_at_node" .
2283             "\n Best feature test at current node: $feature_at_node\n\n";
2284             }
2285              
2286             sub display_decision_tree {
2287 0     0   0 my $self = shift;
2288 0         0 my $offset = shift;
2289 0         0 my $serial_num = $self->get_serial_num();
2290 0 0       0 if (@{$self->get_children()} > 0) {
  0         0  
2291 0   0     0 my $feature_at_node = $self->get_feature() || " ";
2292 0         0 my $node_creation_entropy_at_node = $self->get_node_entropy();
2293 0         0 my $print_node_creation_entropy_at_node = sprintf("%.3f", $node_creation_entropy_at_node);
2294 0         0 my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};
  0         0  
2295 0         0 my @class_probabilities = @{$self->get_class_probabilities()};
  0         0  
2296 0         0 my @print_class_probabilities = map {sprintf("%0.3f", $_)} @class_probabilities;
  0         0  
2297 0         0 my @class_names = @{$self->get_class_names()};
  0         0  
2298             my @print_class_probabilities_with_class =
2299 0         0 map {"$class_names[$_]" . '=>' . $print_class_probabilities[$_]} 0..@class_names-1;
  0         0  
2300 0         0 print "NODE $serial_num: $offset BRANCH TESTS TO NODE: @branch_features_and_values_or_thresholds\n";
2301 0         0 my $second_line_offset = "$offset" . " " x (8 + length("$serial_num"));
2302 0         0 print "$second_line_offset" . "Decision Feature: $feature_at_node Node Creation Entropy: " ,
2303             "$print_node_creation_entropy_at_node Class Probs: @print_class_probabilities_with_class\n\n";
2304 0         0 $offset .= " ";
2305 0         0 foreach my $child (@{$self->get_children()}) {
  0         0  
2306 0         0 $child->display_decision_tree($offset);
2307             }
2308             } else {
2309 0         0 my $node_creation_entropy_at_node = $self->get_node_entropy();
2310 0         0 my $print_node_creation_entropy_at_node = sprintf("%.3f", $node_creation_entropy_at_node);
2311 0         0 my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};
  0         0  
2312 0         0 my @class_probabilities = @{$self->get_class_probabilities()};
  0         0  
2313 0         0 my @print_class_probabilities = map {sprintf("%0.3f", $_)} @class_probabilities;
  0         0  
2314 0         0 my @class_names = @{$self->get_class_names()};
  0         0  
2315             my @print_class_probabilities_with_class =
2316 0         0 map {"$class_names[$_]" . '=>' . $print_class_probabilities[$_]} 0..@class_names-1;
  0         0  
2317 0         0 print "NODE $serial_num: $offset BRANCH TESTS TO LEAF NODE: @branch_features_and_values_or_thresholds\n";
2318 0         0 my $second_line_offset = "$offset" . " " x (8 + length("$serial_num"));
2319 0         0 print "$second_line_offset" . "Node Creation Entropy: $print_node_creation_entropy_at_node " .
2320             "Class Probs: @print_class_probabilities_with_class\n\n";
2321             }
2322             }
2323              
2324              
2325             ############################## Generate Your Own Numeric Training Data #################################
2326             ############################# Class TrainingDataGeneratorNumeric ################################
2327              
2328             ## See the script generate_training_data_numeric.pl in the examples
2329             ## directory on how to use this class for generating your own numeric training and
2330             ## test data. The training and test data are generated in accordance with the
2331             ## specifications you place in the parameter file that is supplied as an argument to
2332             ## the constructor of this class.
2333              
2334             package TrainingDataGeneratorNumeric;
2335              
2336 1     1   20 use strict;
  1         3  
  1         36  
2337 1     1   8 use Carp;
  1         4  
  1         1485  
2338              
2339             sub new {
2340 0     0   0 my ($class, %args) = @_;
2341 0         0 my @params = keys %args;
2342 0 0       0 croak "\nYou have used a wrong name for a keyword argument " .
2343             "--- perhaps a misspelling\n"
2344             if check_for_illegal_params3(@params) == 0;
2345             bless {
2346             _output_training_csv_file => $args{'output_training_csv_file'}
2347             || croak("name for output_training_csv_file required"),
2348             _output_test_csv_file => $args{'output_test_csv_file'}
2349             || croak("name for output_test_csv_file required"),
2350             _parameter_file => $args{'parameter_file'}
2351             || croak("parameter_file required"),
2352             _number_of_samples_for_training => $args{'number_of_samples_for_training'}
2353             || croak("number_of_samples_for_training"),
2354             _number_of_samples_for_testing => $args{'number_of_samples_for_testing'}
2355             || croak("number_of_samples_for_testing"),
2356 0   0     0 _debug => $args{debug} || 0,
      0        
      0        
      0        
      0        
      0        
2357             _class_names => [],
2358             _class_names_and_priors => {},
2359             _features_with_value_range => {},
2360             _features_ordered => [],
2361             _classes_and_their_param_values => {},
2362             }, $class;
2363             }
2364              
2365             sub check_for_illegal_params3 {
2366 0     0   0 my @params = @_;
2367 0         0 my @legal_params = qw / output_training_csv_file
2368             output_test_csv_file
2369             parameter_file
2370             number_of_samples_for_training
2371             number_of_samples_for_testing
2372             debug
2373             /;
2374 0         0 my $found_match_flag;
2375 0         0 foreach my $param (@params) {
2376 0         0 foreach my $legal (@legal_params) {
2377 0         0 $found_match_flag = 0;
2378 0 0       0 if ($param eq $legal) {
2379 0         0 $found_match_flag = 1;
2380 0         0 last;
2381             }
2382             }
2383 0 0       0 last if $found_match_flag == 0;
2384             }
2385 0         0 return $found_match_flag;
2386             }
2387              
2388             ## The training data generated by an instance of the class
2389             ## TrainingDataGeneratorNumeric is based on the specs you place in a parameter that
2390             ## you supply to the class constructor through a constructor variable called
2391             ## `parameter_file'. This method is for parsing the parameter file in order to
2392             ## order to determine the names to be used for the different data classes, their
2393             ## means, and their variances.
2394             sub read_parameter_file_numeric {
2395 0     0   0 my $self = shift;
2396 0         0 my @class_names = ();
2397 0         0 my %class_names_and_priors = ();
2398 0         0 my %features_with_value_range = ();
2399 0         0 my %classes_and_their_param_values = ();
2400             # my $regex8 = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
2401 0   0     0 open FILE, $self->{_parameter_file} || die "unable to open parameter file: $!";
2402 0         0 my @params = ;
2403 0         0 my $params = join "", @params;
2404 0         0 my $regex = 'class names: ([\w ]+)\W*class priors: ([\d. ]+)';
2405 0         0 $params =~ /$regex/si;
2406 0         0 my ($class_names, $class_priors) = ($1, $2);
2407 0         0 @class_names = split ' ', $class_names;
2408 0         0 my @class_priors = split ' ', $class_priors;
2409 0         0 foreach my $i (0..@class_names-1) {
2410 0         0 $class_names_and_priors{$class_names[$i]} = $class_priors[$i];
2411             }
2412 0 0       0 if ($self->{_debug}) {
2413 0         0 foreach my $cname (keys %class_names_and_priors) {
2414 0         0 print "$cname => $class_names_and_priors{$cname}\n";
2415             }
2416             }
2417 0         0 $regex = 'feature name: \w*.*?value range: [\d\. -]+';
2418 0         0 my @features = $params =~ /$regex/gsi;
2419 0         0 my @features_ordered;
2420 0         0 $regex = 'feature name: (\w+)\W*?value range:\s*([\d. -]+)';
2421 0         0 foreach my $feature (@features) {
2422 0         0 $feature =~ /$regex/i;
2423 0         0 my $feature_name = $1;
2424 0         0 push @features_ordered, $feature_name;
2425 0         0 my @value_range = split ' ', $2;
2426 0         0 $features_with_value_range{$feature_name} = \@value_range;
2427             }
2428 0 0       0 if ($self->{_debug}) {
2429 0         0 foreach my $fname (keys %features_with_value_range) {
2430 0         0 print "$fname => @{$features_with_value_range{$fname}}\n";
  0         0  
2431             }
2432             }
2433 0         0 foreach my $i (0..@class_names-1) {
2434 0         0 $classes_and_their_param_values{$class_names[$i]} = {};
2435             }
2436 0         0 $regex = 'params for class: \w*?\W+?mean:[\d\. ]+\W*?covariance:\W+?(?:[ \d.]+\W+?)+';
2437 0         0 my @class_params = $params =~ /$regex/gsi;
2438 0         0 $regex = 'params for class: (\w+)\W*?mean:\s*([\d. -]+)\W*covariance:\s*([\s\d.]+)';
2439 0         0 foreach my $class_param (@class_params) {
2440 0         0 $class_param =~ /$regex/gsi;
2441 0         0 my $class_name = $1;
2442 0         0 my @class_mean = split ' ', $2;
2443 0         0 $classes_and_their_param_values{$class_name}->{'mean'} = \@class_mean;
2444 0         0 my $class_param_string = $3;
2445 0         0 my @covar_rows = split '\n', $class_param_string;
2446 0         0 my @covar_matrix;
2447 0         0 foreach my $row (@covar_rows) {
2448 0         0 my @row = split ' ', $row;
2449 0         0 push @covar_matrix, \@row;
2450             }
2451 0         0 $classes_and_their_param_values{$class_name}->{'covariance'} = \@covar_matrix;
2452             }
2453 0 0       0 if ($self->{_debug}) {
2454 0         0 print "\nThe class parameters are:\n\n";
2455 0         0 foreach my $cname (keys %classes_and_their_param_values) {
2456 0         0 print "\nFor class name $cname:\n";
2457 0         0 my %params_hash = %{$classes_and_their_param_values{$cname}};
  0         0  
2458 0         0 foreach my $x (keys %params_hash) {
2459 0 0       0 if ($x eq 'mean') {
2460 0         0 print " $x => @{$params_hash{$x}}\n";
  0         0  
2461             } else {
2462 0 0       0 if ($x eq 'covariance') {
2463 0         0 print " The covariance matrix:\n";
2464 0         0 my @matrix = @{$params_hash{'covariance'}};
  0         0  
2465 0         0 foreach my $row (@matrix) {
2466 0         0 print " @$row\n";
2467             }
2468             }
2469             }
2470             }
2471             }
2472             }
2473 0         0 $self->{_class_names} = \@class_names;
2474 0         0 $self->{_class_names_and_priors} = \%class_names_and_priors;
2475 0         0 $self->{_features_with_value_range} = \%features_with_value_range;
2476 0         0 $self->{_classes_and_their_param_values} = \%classes_and_their_param_values;
2477 0         0 $self->{_features_ordered} = \@features_ordered;
2478             }
2479              
2480             ## After the parameter file is parsed by the previous method, this method calls on
2481             ## Math::Random::random_multivariate_normal() to generate the training and test data
2482             ## samples. Your training and test data can be of any number of of dimensions, can
2483             ## have any mean, and any covariance. The training and test data must obviously be
2484             ## drawn from the same distribution.
2485             sub gen_numeric_training_and_test_data_and_write_to_csv {
2486 1     1   864 use Math::Random;
  1         8174  
  1         1453  
2487 0     0   0 my $self = shift;
2488 0         0 my %training_samples_for_class;
2489             my %test_samples_for_class;
2490 0         0 foreach my $class_name (@{$self->{_class_names}}) {
  0         0  
2491 0         0 $training_samples_for_class{$class_name} = [];
2492 0         0 $test_samples_for_class{$class_name} = [];
2493             }
2494 0         0 foreach my $class_name (keys %{$self->{_classes_and_their_param_values}}) {
  0         0  
2495 0         0 my @mean = @{$self->{_classes_and_their_param_values}->{$class_name}->{'mean'}};
  0         0  
2496 0         0 my @covariance = @{$self->{_classes_and_their_param_values}->{$class_name}->{'covariance'}};
  0         0  
2497             my @new_training_data = Math::Random::random_multivariate_normal(
2498 0         0 $self->{_number_of_samples_for_training} * $self->{_class_names_and_priors}->{$class_name},
2499             @mean, @covariance );
2500             my @new_test_data = Math::Random::random_multivariate_normal(
2501 0         0 $self->{_number_of_samples_for_testing} * $self->{_class_names_and_priors}->{$class_name},
2502             @mean, @covariance );
2503 0 0       0 if ($self->{_debug}) {
2504 0         0 print "training data for class $class_name:\n";
2505 0         0 foreach my $x (@new_training_data) {print "@$x\n";}
  0         0  
2506 0         0 print "\n\ntest data for class $class_name:\n";
2507 0         0 foreach my $x (@new_test_data) {print "@$x\n";}
  0         0  
2508             }
2509 0         0 $training_samples_for_class{$class_name} = \@new_training_data;
2510 0         0 $test_samples_for_class{$class_name} = \@new_test_data;
2511             }
2512 0         0 my @training_data_records = ();
2513 0         0 my @test_data_records = ();
2514 0         0 foreach my $class_name (keys %training_samples_for_class) {
2515             my $num_of_samples_for_training = $self->{_number_of_samples_for_training} *
2516 0         0 $self->{_class_names_and_priors}->{$class_name};
2517             my $num_of_samples_for_testing = $self->{_number_of_samples_for_testing} *
2518 0         0 $self->{_class_names_and_priors}->{$class_name};
2519 0         0 foreach my $sample_index (0..$num_of_samples_for_training-1) {
2520 0         0 my @training_vector = @{$training_samples_for_class{$class_name}->[$sample_index]};
  0         0  
2521 0         0 @training_vector = map {sprintf("%.3f", $_)} @training_vector;
  0         0  
2522 0         0 my $training_data_record = "$class_name," . join(",", @training_vector) . "\n";
2523 0         0 push @training_data_records, $training_data_record;
2524             }
2525 0         0 foreach my $sample_index (0..$num_of_samples_for_testing-1) {
2526 0         0 my @test_vector = @{$test_samples_for_class{$class_name}->[$sample_index]};
  0         0  
2527 0         0 @test_vector = map {sprintf("%.3f", $_)} @test_vector;
  0         0  
2528 0         0 my $test_data_record = "$class_name," . join(",", @test_vector) . "\n";
2529 0         0 push @test_data_records, $test_data_record;
2530             }
2531             }
2532 0         0 fisher_yates_shuffle(\@training_data_records);
2533 0         0 fisher_yates_shuffle(\@test_data_records);
2534 0 0       0 if ($self->{_debug}) {
2535 0         0 foreach my $record (@training_data_records) {
2536 0         0 print "$record";
2537             }
2538 0         0 foreach my $record (@test_data_records) {
2539 0         0 print "$record";
2540             }
2541             }
2542 0         0 open OUTPUT, ">$self->{_output_training_csv_file}";
2543 0         0 my @feature_names_training = @{$self->{_features_ordered}};
  0         0  
2544 0         0 my @quoted_feature_names_training = map {"\"$_\""} @feature_names_training;
  0         0  
2545 0         0 my $first_row_training = '"",' . "\"class_name\"," . join ",", @quoted_feature_names_training;
2546 0         0 print OUTPUT "$first_row_training\n";
2547 0         0 foreach my $i (0..@training_data_records-1) {
2548 0         0 my $i1 = $i+1;
2549 0         0 my $sample_record = "\"$i1\",$training_data_records[$i]";
2550 0         0 print OUTPUT "$sample_record";
2551             }
2552 0         0 close OUTPUT;
2553 0         0 open OUTPUT, ">$self->{_output_test_csv_file}";
2554 0         0 my @feature_names_testing = keys %{$self->{_features_with_value_range}};
  0         0  
2555 0         0 my @quoted_feature_names_testing = map {"\"$_\""} @feature_names_testing;
  0         0  
2556 0         0 my $first_row_testing = '"",' . "\"class_name\"," . join ",", @quoted_feature_names_testing;
2557 0         0 print OUTPUT "$first_row_testing\n";
2558 0         0 foreach my $i (0..@test_data_records-1) {
2559 0         0 my $i1 = $i+1;
2560 0         0 my $sample_record = "\"$i1\",$test_data_records[$i]";
2561 0         0 print OUTPUT "$sample_record";
2562             }
2563 0         0 close OUTPUT;
2564             }
2565              
2566             # from perl docs:
2567             sub fisher_yates_shuffle {
2568 0     0   0 my $arr = shift;
2569 0         0 my $i = @$arr;
2570 0         0 while (--$i) {
2571 0         0 my $j = int rand( $i + 1 );
2572 0         0 @$arr[$i, $j] = @$arr[$j, $i];
2573             }
2574             }
2575              
2576             ########################### Generate Your Own Symbolic Training Data ###############################
2577             ########################### Class TrainingDataGeneratorSymbolic #############################
2578              
2579             ## See the sample script generate_training_and_test_data_symbolic.pl for how to use
2580             ## this class for generating purely symbolic training and test data. The data is
2581             ## generated according to the specifications you place in a parameter file whose
2582             ## name you supply as one of constructor arguments.
2583             package TrainingDataGeneratorSymbolic;
2584              
2585 1     1   19 use strict;
  1         5  
  1         39  
2586 1     1   9 use Carp;
  1         3  
  1         3354  
2587              
2588             sub new {
2589 1     1   32 my ($class, %args) = @_;
2590 1         7 my @params = keys %args;
2591 1 50       8 croak "\nYou have used a wrong name for a keyword argument " .
2592             "--- perhaps a misspelling\n"
2593             if check_for_illegal_params4(@params) == 0;
2594             bless {
2595             _output_training_datafile => $args{'output_training_datafile'}
2596             || die("name for output_training_datafile required"),
2597             _parameter_file => $args{'parameter_file'}
2598             || die("parameter_file required"),
2599             _number_of_samples_for_training => $args{'number_of_samples_for_training'}
2600             || die("number_of_samples_for_training required"),
2601 1   50     31 _debug => $args{debug} || 0,
      50        
      50        
      50        
2602             _class_names => [],
2603             _class_priors => [],
2604             _features_and_values_hash => {},
2605             _bias_hash => {},
2606             _training_sample_records => {},
2607             }, $class;
2608             }
2609              
2610             sub check_for_illegal_params4 {
2611 1     1   5 my @params = @_;
2612 1         5 my @legal_params = qw / output_training_datafile
2613             parameter_file
2614             number_of_samples_for_training
2615             debug
2616             /;
2617 1         4 my $found_match_flag;
2618 1         4 foreach my $param (@params) {
2619 3         10 foreach my $legal (@legal_params) {
2620 6         15 $found_match_flag = 0;
2621 6 100       23 if ($param eq $legal) {
2622 3         9 $found_match_flag = 1;
2623 3         8 last;
2624             }
2625             }
2626 3 50       14 last if $found_match_flag == 0;
2627             }
2628 1         8 return $found_match_flag;
2629             }
2630              
2631             ## Read a parameter file for generating symbolic training data. See the script
2632             ## generate_symbolic_training_data_symbolic.pl in the Examples directory for how to
2633             ## pass the name of the parameter file to the constructor of the
2634             ## TrainingDataGeneratorSymbolic class.
2635             sub read_parameter_file_symbolic {
2636 1     1   9 my $self = shift;
2637 1         10 my $debug = $self->{_debug};
2638 1         5 my $number_of_training_samples = $self->{_number_of_samples_for_training};
2639 1         3 my $input_parameter_file = $self->{_parameter_file};
2640 1 50       7 croak "Forgot to supply parameter file" if ! defined $input_parameter_file;
2641 1         3 my $output_file_training = $self->{_output_training_datafile};
2642 1         4 my $output_file_testing = $self->{_output_test_datafile};
2643 1         4 my @all_params;
2644             my $param_string;
2645 1   33     36 open INPUT, $input_parameter_file || "unable to open parameter file: $!";
2646 1         53 @all_params = ;
2647 1         9 @all_params = grep { $_ !~ /^[ ]*#/ } @all_params;
  40         150  
2648 1         5 @all_params = grep { $_ =~ s/\r?\n?$//} @all_params;
  36         173  
2649 1         9 $param_string = join ' ', @all_params;
2650 1         16 my ($class_names, $class_priors, $rest_param) =
2651             $param_string =~ /^\s*class names:(.*?)\s*class priors:(.*?)(feature: .*)/;
2652 1 50       12 my @class_names = grep {defined($_) && length($_) > 0} split /\s+/, $1;
  3         26  
2653 1         25 push @{$self->{_class_names}}, @class_names;
  1         7  
2654 1 50       16 my @class_priors = grep {defined($_) && length($_) > 0} split /\s+/, $2;
  3         24  
2655 1         4 push @{$self->{_class_priors}}, @class_priors;
  1         6  
2656 1         15 my ($feature_string, $bias_string) = $rest_param =~ /(feature:.*?) (bias:.*)/;
2657 1         4 my %features_and_values_hash;
2658 1         24 my @features = split /(feature[:])/, $feature_string;
2659 1 50       5 @features = grep {defined($_) && length($_) > 0} @features;
  9         52  
2660 1         4 foreach my $item (@features) {
2661 8 100       38 next if $item =~ /feature/;
2662 4         24 my @splits = split / /, $item;
2663 4 50       14 @splits = grep {defined($_) && length($_) > 0} @splits;
  27         148  
2664 4         16 foreach my $i (0..@splits-1) {
2665 22 100       67 if ($i == 0) {
2666 4         20 $features_and_values_hash{$splits[0]} = [];
2667             } else {
2668 18 100       70 next if $splits[$i] =~ /values/;
2669 14         29 push @{$features_and_values_hash{$splits[0]}}, $splits[$i];
  14         56  
2670             }
2671             }
2672             }
2673 1         6 $self->{_features_and_values_hash} = \%features_and_values_hash;
2674 1         4 my %bias_hash = %{$self->{_bias_hash}};
  1         6  
2675 1         20 my @biases = split /(bias[:]\s*class[:])/, $bias_string;
2676 1 50       5 @biases = grep {defined($_) && length($_) > 0} @biases;
  5         35  
2677 1         5 foreach my $item (@biases) {
2678 4 100       21 next if $item =~ /bias/;
2679 2         24 my @splits = split /\s+/, $item;
2680 2 50       8 @splits = grep {defined($_) && length($_) > 0} @splits;
  18         97  
2681 2         6 my $feature_name;
2682 2         16 foreach my $i (0..@splits-1) {
2683 16 100       89 if ($i == 0) {
    100          
2684 2         10 $bias_hash{$splits[0]} = {};
2685             } elsif ($splits[$i] =~ /(^.+)[:]$/) {
2686 8         29 $feature_name = $1;
2687 8         37 $bias_hash{$splits[0]}->{$feature_name} = [];
2688             } else {
2689 6 50       28 next if !defined $feature_name;
2690 6 50       20 push @{$bias_hash{$splits[0]}->{$feature_name}}, $splits[$i]
  6         26  
2691             if defined $feature_name;
2692             }
2693             }
2694             }
2695 1         5 $self->{_bias_hash} = \%bias_hash;
2696 1 50       16 if ($debug) {
2697 0         0 print "\n\nClass names: @class_names\n";
2698 0         0 my $num_of_classes = @class_names;
2699 0         0 print "Class priors: @class_priors\n";
2700 0         0 print "Number of classes: $num_of_classes\n";
2701 0         0 print "\nHere are the features and their possible values:\n";
2702 0         0 while ( my ($k, $v) = each %features_and_values_hash ) {
2703 0         0 print "$k ===> @$v\n";
2704             }
2705 0         0 print "\nHere is the biasing for each class:\n";
2706 0         0 while ( my ($k, $v) = each %bias_hash ) {
2707 0         0 print "$k:\n";
2708 0         0 while ( my ($k1, $v1) = each %$v ) {
2709 0         0 print " $k1 ===> @$v1\n";
2710             }
2711             }
2712             }
2713             }
2714              
2715             ## This method generates training data according to the specifications placed in a
2716             ## parameter file that is read by the previous method.
2717             sub gen_symbolic_training_data {
2718 1     1   11 my $self = shift;
2719 1         3 my @class_names = @{$self->{_class_names}};
  1         7  
2720 1         4 my @class_priors = @{$self->{_class_priors}};
  1         5  
2721 1         3 my %training_sample_records;
2722 1         3 my %features_and_values_hash = %{$self->{_features_and_values_hash}};
  1         8  
2723 1         4 my %bias_hash = %{$self->{_bias_hash}};
  1         5  
2724 1         5 my $how_many_training_samples = $self->{_number_of_samples_for_training};
2725 1         4 my $how_many_test_samples = $self->{_number_of_samples_for_testing};
2726 1         3 my %class_priors_to_unit_interval_map;
2727 1         3 my $accumulated_interval = 0;
2728 1         6 foreach my $i (0..@class_names-1) {
2729 2         13 $class_priors_to_unit_interval_map{$class_names[$i]}
2730             = [$accumulated_interval, $accumulated_interval + $class_priors[$i]];
2731 2         8 $accumulated_interval += $class_priors[$i];
2732             }
2733 1 50       6 if ($self->{_debug}) {
2734 0         0 print "Mapping of class priors to unit interval: \n";
2735 0         0 while ( my ($k, $v) = each %class_priors_to_unit_interval_map ) {
2736 0         0 print "$k => @$v\n";
2737             }
2738 0         0 print "\n\n";
2739             }
2740 1         10 my $ele_index = 0;
2741 1         6 while ($ele_index < $how_many_training_samples) {
2742 35         86 my $sample_name = "sample" . "_$ele_index";
2743 35         108 $training_sample_records{$sample_name} = [];
2744             # Generate class label for this training sample:
2745 35         109 my $roll_the_dice = rand(1.0);
2746 35         56 my $class_label;
2747 35         95 foreach my $class_name (keys %class_priors_to_unit_interval_map ) {
2748 52         93 my $v = $class_priors_to_unit_interval_map{$class_name};
2749 52 100 66     245 if ( ($roll_the_dice >= $v->[0]) && ($roll_the_dice <= $v->[1]) ) {
2750 35         67 push @{$training_sample_records{$sample_name}},
  35         103  
2751             "class=" . $class_name;
2752 35         69 $class_label = $class_name;
2753 35         66 last;
2754             }
2755             }
2756 35         92 foreach my $feature (keys %features_and_values_hash) {
2757 140         259 my @values = @{$features_and_values_hash{$feature}};
  140         411  
2758 140         303 my $bias_string = $bias_hash{$class_label}->{$feature}->[0];
2759 140         289 my $no_bias = 1.0 / @values;
2760 140 100       482 $bias_string = "$values[0]" . "=$no_bias" if !defined $bias_string;
2761 140         232 my %value_priors_to_unit_interval_map;
2762 140         700 my @splits = split /\s*=\s*/, $bias_string;
2763 140         342 my $chosen_for_bias_value = $splits[0];
2764 140         230 my $chosen_bias = $splits[1];
2765 140         365 my $remaining_bias = 1 - $chosen_bias;
2766 140         298 my $remaining_portion_bias = $remaining_bias / (@values -1);
2767 140 50       301 @splits = grep {defined($_) && length($_) > 0} @splits;
  280         1359  
2768 140         291 my $accumulated = 0;
2769 140         335 foreach my $i (0..@values-1) {
2770 490 100       1220 if ($values[$i] eq $chosen_for_bias_value) {
2771 140         407 $value_priors_to_unit_interval_map{$values[$i]}
2772             = [$accumulated, $accumulated + $chosen_bias];
2773 140         304 $accumulated += $chosen_bias;
2774             } else {
2775 350         866 $value_priors_to_unit_interval_map{$values[$i]}
2776             = [$accumulated, $accumulated + $remaining_portion_bias];
2777 350         677 $accumulated += $remaining_portion_bias;
2778             }
2779             }
2780 140         276 my $roll_the_dice = rand(1.0);
2781 140         230 my $value_label;
2782 140         353 foreach my $value_name (keys %value_priors_to_unit_interval_map ) {
2783 307         546 my $v = $value_priors_to_unit_interval_map{$value_name};
2784 307 100 100     1282 if ( ($roll_the_dice >= $v->[0])
2785             && ($roll_the_dice <= $v->[1]) ) {
2786 140         232 push @{$training_sample_records{$sample_name}},
  140         490  
2787             $feature . "=" . $value_name;
2788 140         260 $value_label = $value_name;
2789 140         352 last;
2790             }
2791             }
2792 140 50       676 if ($self->{_debug}) {
2793 0         0 print "mapping feature value priors for '$feature' " .
2794             "to unit interval: \n";
2795 0         0 while ( my ($k, $v) =
2796             each %value_priors_to_unit_interval_map ) {
2797 0         0 print "$k => @$v\n";
2798             }
2799 0         0 print "\n\n";
2800             }
2801             }
2802 35         120 $ele_index++;
2803             }
2804 1         4 $self->{_training_sample_records} = \%training_sample_records;
2805 1 50       6 if ($self->{_debug}) {
2806 0         0 print "\n\nPRINTING TRAINING RECORDS:\n\n";
2807 0         0 foreach my $kee (sort {sample_index($a) <=> sample_index($b)} keys %training_sample_records) {
  0         0  
2808 0         0 print "$kee => @{$training_sample_records{$kee}}\n\n";
  0         0  
2809             }
2810             }
2811 1         4 my $output_training_file = $self->{_output_training_datafile};
2812 1 50       5 print "\n\nDISPLAYING TRAINING RECORDS:\n\n" if $self->{_debug};
2813 1         120 open FILEHANDLE, ">$output_training_file";
2814 1         10 my @features = sort keys %features_and_values_hash;
2815 1         4 my $title_string = ',class';
2816 1         4 foreach my $feature_name (@features) {
2817 4         9 $title_string .= ',' . $feature_name;
2818             }
2819 1         12 print FILEHANDLE "$title_string\n";
2820 1         15 my @sample_names = sort {$a <=> $b} map { $_ =~ s/^sample_//; $_ } sort keys %training_sample_records;
  105         156  
  35         76  
  35         63  
2821 1         6 my $record_string = '';
2822 1         3 foreach my $sample_name (@sample_names) {
2823 35         61 $record_string .= "$sample_name,";
2824 35         44 my @record = @{$training_sample_records{"sample_$sample_name"}};
  35         111  
2825 35         59 my %item_parts_hash;
2826 35         50 foreach my $item (@record) {
2827 175         431 my @splits = grep $_, split /=/, $item;
2828 175         365 $item_parts_hash{$splits[0]} = $splits[1];
2829             }
2830 35         76 $record_string .= $item_parts_hash{"class"};
2831 35         64 delete $item_parts_hash{"class"};
2832 35         102 my @kees = sort keys %item_parts_hash;
2833 35         67 foreach my $kee (@kees) {
2834 140         232 $record_string .= ",$item_parts_hash{$kee}";
2835             }
2836 35         67 print FILEHANDLE "$record_string\n";
2837 35         85 $record_string = '';
2838             }
2839 1         62 close FILEHANDLE;
2840             }
2841              
2842             sub sample_index {
2843 0     0     my $arg = shift;
2844 0           $arg =~ /_(.+)$/;
2845 0           return $1;
2846             }
2847              
2848             ################################# Decision Tree Introspection #######################################
2849             ################################# Class DTIntrospection #######################################
2850              
2851             package DTIntrospection;
2852              
2853             ## Instances constructed from this class can provide explanations for the
2854             ## classification decisions at the nodes of a decision tree.
2855             ##
2856             ## When used in the interactive mode, the decision-tree introspection made possible
2857             ## by this class provides answers to the following three questions: (1) List of the
2858             ## training samples that fall in the portion of the feature space that corresponds
2859             ## to a node of the decision tree; (2) The probabilities associated with the last
2860             ## feature test that led to the node; and (3) The class probabilities predicated on
2861             ## just the last feature test on the path to that node.
2862             ##
2863             ## CAVEAT: It is possible for a node to exist even when there are no training
2864             ## samples in the portion of the feature space that corresponds to the node. That
2865             ## is because a decision tree is based on the probability densities estimated from
2866             ## the training data. When training data is non-uniformly distributed, it is
2867             ## possible for the probability associated with a point in the feature space to be
2868             ## non-zero even when there are no training samples at or in the vicinity of that
2869             ## point.
2870             ##
2871             ## For a node to exist even where there are no training samples in the portion of
2872             ## the feature space that belongs to the node is an indication of the generalization
2873             ## ability of decision-tree based classification.
2874             ##
2875             ## When used in a non-interactive mode, an instance of this class can be used to
2876             ## create a tabular display that shows what training samples belong directly to the
2877             ## portion of the feature space that corresponds to each node of the decision tree.
2878             ## An instance of this class can also construct a tabular display that shows how the
2879             ## influence of each training sample propagates in the decision tree. For each
2880             ## training sample, this display first shows the list of nodes that came into
2881             ## existence through feature test(s) that used the data provided by that sample.
2882             ## This list for each training sample is followed by a subtree of the nodes that owe
2883             ## their existence indirectly to the training sample. A training sample influences a
2884             ## node indirectly if the node is a descendant of another node that is affected
2885             ## directly by the training sample.
2886              
2887 1     1   18 use strict;
  1         5  
  1         47  
2888 1     1   9 use Carp;
  1         3  
  1         3391  
2889              
2890             sub new {
2891 0     0     my ($class, $dt) = @_;
2892 0 0         croak "The argument supplied to the DTIntrospection constructor must be of type DecisionTree"
2893             unless ref($dt) eq "Algorithm::DecisionTree";
2894             bless {
2895             _dt => $dt,
2896             _root_dtnode => $dt->{_root_node},
2897 0           _samples_at_nodes_hash => {},
2898             _branch_features_to_nodes_hash => {},
2899             _sample_to_node_mapping_direct_hash => {},
2900             _node_serial_num_to_node_hash => {},
2901             _awareness_raising_msg_shown => 0,
2902             _debug => 0,
2903             }, $class;
2904             }
2905              
2906             sub initialize {
2907 0     0     my $self = shift;
2908             croak "You must first construct the decision tree before using the DTIntrospection class"
2909 0 0         unless $self->{_root_dtnode};
2910 0           $self->recursive_descent($self->{_root_dtnode});
2911             }
2912              
2913             sub recursive_descent {
2914 0     0     my $self = shift;
2915 0           my $node = shift;
2916 0           my $node_serial_number = $node->get_serial_num();
2917 0           my $branch_features_and_values_or_thresholds = $node->get_branch_features_and_values_or_thresholds();
2918 0 0         print "\nAt node $node_serial_number: the branch features and values are: @{$branch_features_and_values_or_thresholds}\n" if $self->{_debug};
  0            
2919 0           $self->{_node_serial_num_to_node_hash}->{$node_serial_number} = $node;
2920 0           $self->{_branch_features_to_nodes_hash}->{$node_serial_number} = $branch_features_and_values_or_thresholds;
2921 0           my @samples_at_node = ();
2922 0           foreach my $item (@$branch_features_and_values_or_thresholds) {
2923 0           my $samples_for_feature_value_combo = $self->get_samples_for_feature_value_combo($item);
2924 0 0         unless (@samples_at_node) {
2925 0           @samples_at_node = @$samples_for_feature_value_combo;
2926             } else {
2927 0           my @accum;
2928 0           foreach my $sample (@samples_at_node) {
2929 0 0         push @accum, $sample if Algorithm::DecisionTree::contained_in($sample, @$samples_for_feature_value_combo);
2930             }
2931 0           @samples_at_node = @accum;
2932             }
2933 0 0         last unless @samples_at_node;
2934             }
2935 0           @samples_at_node = sort {Algorithm::DecisionTree::sample_index($a) <=> Algorithm::DecisionTree::sample_index($b)} @samples_at_node;
  0            
2936 0 0         print "Node: $node_serial_number the samples are: [@samples_at_node]\n" if ($self->{_debug});
2937 0           $self->{_samples_at_nodes_hash}->{$node_serial_number} = \@samples_at_node;
2938 0 0         if (@samples_at_node) {
2939 0           foreach my $sample (@samples_at_node) {
2940 0 0         if (! exists $self->{_sample_to_node_mapping_direct_hash}->{$sample}) {
2941 0           $self->{_sample_to_node_mapping_direct_hash}->{$sample} = [$node_serial_number];
2942             } else {
2943 0           push @{$self->{_sample_to_node_mapping_direct_hash}->{$sample}}, $node_serial_number;
  0            
2944             }
2945             }
2946             }
2947 0           my $children = $node->get_children();
2948 0           foreach my $child (@$children) {
2949 0           $self->recursive_descent($child);
2950             }
2951             }
2952              
2953             sub display_training_samples_at_all_nodes_direct_influence_only {
2954 0     0     my $self = shift;
2955             croak "You must first construct the decision tree before using the DT Introspection class."
2956 0 0         unless $self->{_root_dtnode};
2957 0           $self->recursive_descent_for_showing_samples_at_a_node($self->{_root_dtnode});
2958             }
2959              
2960             sub recursive_descent_for_showing_samples_at_a_node{
2961 0     0     my $self = shift;
2962 0           my $node = shift;
2963 0           my $node_serial_number = $node->get_serial_num();
2964 0           my $branch_features_and_values_or_thresholds = $node->get_branch_features_and_values_or_thresholds();
2965 0 0         if (exists $self->{_samples_at_nodes_hash}->{$node_serial_number}) {
2966 0 0         print "\nAt node $node_serial_number: the branch features and values are: [@{$branch_features_and_values_or_thresholds}]\n" if $self->{_debug};
  0            
2967 0           print "Node $node_serial_number: the samples are: [@{$self->{_samples_at_nodes_hash}->{$node_serial_number}}]\n";
  0            
2968             }
2969 0           map $self->recursive_descent_for_showing_samples_at_a_node($_), @{$node->get_children()};
  0            
2970             }
2971              
2972             sub display_training_samples_to_nodes_influence_propagation {
2973 0     0     my $self = shift;
2974 0           foreach my $sample (sort {Algorithm::DecisionTree::sample_index($a) <=> Algorithm::DecisionTree::sample_index($b)} keys %{$self->{_dt}->{_training_data_hash}}) {
  0            
  0            
2975 0 0         if (exists $self->{_sample_to_node_mapping_direct_hash}->{$sample}) {
2976 0           my $nodes_directly_affected = $self->{_sample_to_node_mapping_direct_hash}->{$sample};
2977 0           print "\n$sample:\n nodes affected directly: [@{$nodes_directly_affected}]\n";
  0            
2978 0           print " nodes affected through probabilistic generalization:\n";
2979 0           map $self->recursive_descent_for_sample_to_node_influence($_, $nodes_directly_affected, " "), @$nodes_directly_affected;
2980             }
2981             }
2982             }
2983              
2984             sub recursive_descent_for_sample_to_node_influence {
2985 0     0     my $self = shift;
2986 0           my $node_serial_num = shift;
2987 0           my $nodes_already_accounted_for = shift;
2988 0           my $offset = shift;
2989 0           $offset .= " ";
2990 0           my $node = $self->{_node_serial_num_to_node_hash}->{$node_serial_num};
2991 0           my @children = map $_->get_serial_num(), @{$node->get_children()};
  0            
2992 0           my @children_affected = grep {!Algorithm::DecisionTree::contained_in($_, @{$nodes_already_accounted_for})} @children;
  0            
  0            
2993 0 0         if (@children_affected) {
2994 0           print "$offset $node_serial_num => [@children_affected]\n";
2995             }
2996 0           map $self->recursive_descent_for_sample_to_node_influence($_, \@children_affected, $offset), @children_affected;
2997             }
2998              
2999             sub get_samples_for_feature_value_combo {
3000 0     0     my $self = shift;
3001 0           my $feature_value_combo = shift;
3002 0           my ($feature,$op,$value) = $self->extract_feature_op_val($feature_value_combo);
3003 0           my @samples = ();
3004 0 0         if ($op eq '=') {
    0          
    0          
3005 0           @samples = grep Algorithm::DecisionTree::contained_in($feature_value_combo, @{$self->{_dt}->{_training_data_hash}->{$_}}), keys %{$self->{_dt}->{_training_data_hash}};
  0            
  0            
3006             } elsif ($op eq '<') {
3007 0           foreach my $sample (keys %{$self->{_dt}->{_training_data_hash}}) {
  0            
3008 0           my @features_and_values = @{$self->{_dt}->{_training_data_hash}->{$sample}};
  0            
3009 0           foreach my $item (@features_and_values) {
3010 0           my ($feature_data,$op_data,$val_data) = $self->extract_feature_op_val($item);
3011 0 0 0       if (($val_data ne 'NA') && ($feature eq $feature_data) && ($val_data <= $value)) {
      0        
3012 0           push @samples, $sample;
3013 0           last;
3014             }
3015             }
3016             }
3017             } elsif ($op eq '>') {
3018 0           foreach my $sample (keys %{$self->{_dt}->{_training_data_hash}}) {
  0            
3019 0           my @features_and_values = @{$self->{_dt}->{_training_data_hash}->{$sample}};
  0            
3020 0           foreach my $item (@features_and_values) {
3021 0           my ($feature_data,$op_data,$val_data) = $self->extract_feature_op_val($item);
3022 0 0 0       if (($val_data ne 'NA') && ($feature eq $feature_data) && ($val_data > $value)) {
      0        
3023 0           push @samples, $sample;
3024 0           last;
3025             }
3026             }
3027             }
3028             } else {
3029 0           die "Something strange is going on";
3030             }
3031 0           return \@samples;
3032             }
3033              
3034             sub extract_feature_op_val {
3035 0     0     my $self = shift;
3036 0           my $feature_value_combo = shift;
3037 0           my $pattern1 = '(.+)=(.+)';
3038 0           my $pattern2 = '(.+)<(.+)';
3039 0           my $pattern3 = '(.+)>(.+)';
3040 0           my ($feature,$value,$op);
3041 0 0         if ($feature_value_combo =~ /$pattern2/) {
    0          
    0          
3042 0           ($feature,$op,$value) = ($1,'<',$2);
3043             } elsif ($feature_value_combo =~ /$pattern3/) {
3044 0           ($feature,$op,$value) = ($1,'>',$2);
3045             } elsif ($feature_value_combo =~ /$pattern1/) {
3046 0           ($feature,$op,$value) = ($1,'=',$2);
3047             }
3048 0           return ($feature,$op,$value);
3049             }
3050              
3051             sub explain_classifications_at_multiple_nodes_interactively {
3052 0     0     my $self = shift;
3053             croak "You called explain_classification_at_multiple_nodes_interactively() without " .
3054             "first initializing the DTIntrospection instance in your code. Aborting."
3055 0 0         unless $self->{_samples_at_nodes_hash};
3056 0           print "\n\nIn order for the decision tree to introspect\n\n";
3057 0           print " DO YOU ACCEPT the fact that, in general, a region of the feature space\n" .
3058             " that corresponds to a DT node may have NON-ZERO probabilities associated\n" .
3059             " with it even when there are NO training data points in that region?\n" .
3060             "\nEnter 'y' for yes or any other character for no: ";
3061 0           my $ans = ;
3062 0           $ans =~ s/^\s*|\s*$//g;
3063 0 0         die "\n Since you answered 'no' to a very real theoretical possibility, no explanations possible for the classification decisions in the decision tree. Aborting!\n" if $ans !~ /^ye?s?$/;
3064 0           $self->{_awareness_raising_msg_shown} = 1;
3065 0           while (1) {
3066 0           my $node_id;
3067             my $ans;
3068 0           while (1) {
3069 0           print "\nEnter the integer ID of a node: ";
3070 0           $ans = ;
3071 0           $ans =~ s/^\s*|\s*$//g;
3072 0 0         return if $ans =~ /^exit$/;
3073 0 0         last if Algorithm::DecisionTree::contained_in($ans, keys %{$self->{_samples_at_nodes_hash}});
  0            
3074 0           print "\nYour answer must be an integer ID of a node. Try again or enter 'exit'.\n";
3075             }
3076 0           $node_id = $ans;
3077 0           $self->explain_classification_at_one_node($node_id)
3078             }
3079             }
3080              
3081             sub explain_classification_at_one_node {
3082 0     0     my $self = shift;
3083 0           my $node_id = shift;
3084             croak "You called explain_classification_at_one_node() without first initializing " .
3085 0 0         "the DTIntrospection instance in your code. Aborting." unless $self->{_samples_at_nodes_hash};
3086 0 0         unless (exists $self->{_samples_at_nodes_hash}->{$node_id}) {
3087 0           print "Node $node_id is not a node in the tree\n";
3088 0           return;
3089             }
3090 0 0         unless ($self->{_awareness_raising_msg_shown}) {
3091 0           print "\n\nIn order for the decision tree to introspect at Node $node_id: \n\n";
3092 0           print " DO YOU ACCEPT the fact that, in general, a region of the feature space\n" .
3093             " that corresponds to a DT node may have NON-ZERO probabilities associated\n" .
3094             " with it even when there are NO training data points in that region?\n" .
3095             "\nEnter 'y' for yes or any other character for no: ";
3096 0           my $ans = ;
3097 0           $ans =~ s/^\s*|\s*$//g;
3098 0 0         die "\n Since you answered 'no' to a very real theoretical possibility, no explanations possible for the classification decision at node $node_id\n" if $ans !~ /^ye?s?$/;
3099             }
3100 0           my @samples_at_node = @{$self->{_samples_at_nodes_hash}->{$node_id}};
  0            
3101 0           my @branch_features_to_node = @{$self->{_branch_features_to_nodes_hash}->{$node_id}};
  0            
3102             # my @class_names = @{$self->get_class_names()};
3103 0           my @class_names = $self->{_dt}->get_class_names();
3104 0           my $class_probabilities = $self->{_root_dtnode}->get_class_probabilities();
3105 0           my ($feature,$op,$value) = $self->extract_feature_op_val( $branch_features_to_node[-1] );
3106 0 0         my $msg = @samples_at_node == 0
3107             ? "\n\n There are NO training data samples directly in the region of the feature space assigned to node $node_id: @samples_at_node\n\n"
3108             : "\n Samples in the portion of the feature space assigned to Node $node_id: @samples_at_node\n";
3109 0           $msg .= "\n Features tests on the branch to node $node_id: [@branch_features_to_node]\n\n";
3110 0           $msg .= "\n Would you like to see the probability associated with the last feature test on the branch leading to Node $node_id?\n";
3111 0           $msg .= "\n Enter 'y' if yes and any other character for 'no': ";
3112 0           print $msg;
3113 0           my $ans = ;
3114 0           $ans =~ s/^\s*|\s*$//g;
3115 0 0         if ($ans =~ /^ye?s?$/) {
3116 0           my $sequence = [$branch_features_to_node[-1]];
3117 0           my $prob = $self->{_dt}->probability_of_a_sequence_of_features_and_values_or_thresholds($sequence);
3118 0           print "\n probability of @{$sequence} is: $prob\n";
  0            
3119             }
3120 0           $msg = "\n Using Bayes rule, would you like to see the class probabilities predicated on just the last feature test on the branch leading to Node $node_id?\n";
3121 0           $msg .= "\n Enter 'y' for yes and any other character for no: ";
3122 0           print $msg;
3123 0           $ans = ;
3124 0           $ans =~ s/^\s*|\s*$//g;
3125 0 0         if ($ans =~ /^ye?s?$/) {
3126 0           my $sequence = [$branch_features_to_node[-1]];
3127 0           foreach my $cls (@class_names) {
3128 0           my $prob = $self->{_dt}->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($cls, $sequence);
3129 0           print "\n probability of class $cls given just the feature test @{$sequence} is: $prob\n";
  0            
3130             }
3131             } else {
3132 0           print "goodbye\n";
3133             }
3134 0           print "\n Finished supplying information on Node $node_id\n\n";
3135             }
3136              
3137             1;
3138              
3139             =pod
3140              
3141             =encoding UTF8
3142              
3143             =head1 NAME
3144              
3145             Algorithm::DecisionTree - A Perl module for decision-tree based classification of
3146             multidimensional data.
3147              
3148              
3149             =head1 SYNOPSIS
3150              
3151             # FOR CONSTRUCTING A DECISION TREE AND FOR CLASSIFYING A SAMPLE:
3152              
3153             # In general, your call for constructing an instance of the DecisionTree class
3154             # will look like:
3155              
3156             my $training_datafile = "stage3cancer.csv";
3157             my $dt = Algorithm::DecisionTree->new(
3158             training_datafile => $training_datafile,
3159             csv_class_column_index => 2,
3160             csv_columns_for_features => [3,4,5,6,7,8],
3161             entropy_threshold => 0.01,
3162             max_depth_desired => 8,
3163             symbolic_to_numeric_cardinality_threshold => 10,
3164             csv_cleanup_needed => 1,
3165             );
3166              
3167             # The constructor option `csv_class_column_index' informs the module as to which
3168             # column of your CSV file contains the class label. THE COLUMN INDEXING IS ZERO
3169             # BASED. The constructor option `csv_columns_for_features' specifies which columns
3170             # are to be used for feature values. The first row of the CSV file must specify
3171             # the names of the features. See examples of CSV files in the `Examples'
3172             # subdirectory.
3173              
3174             # The option `symbolic_to_numeric_cardinality_threshold' is also important. For
3175             # the example shown above, if an ostensibly numeric feature takes on only 10 or
3176             # fewer different values in your training datafile, it will be treated like a
3177             # symbolic features. The option `entropy_threshold' determines the granularity
3178             # with which the entropies are sampled for the purpose of calculating entropy gain
3179             # with a particular choice of decision threshold for a numeric feature or a feature
3180             # value for a symbolic feature.
3181              
3182             # The option 'csv_cleanup_needed' is by default set to 0. If you set it
3183             # to 1, that would cause all line records in your CSV file to be "sanitized" before
3184             # they are used for constructing a decision tree. You need this option if your CSV
3185             # file uses double-quoted field names and field values in the line records and if
3186             # such double-quoted strings are allowed to include commas for, presumably, better
3187             # readability.
3188              
3189             # After you have constructed an instance of the DecisionTree class as shown above,
3190             # you read in the training data file and initialize the probability cache by
3191             # calling:
3192              
3193             $dt->get_training_data();
3194             $dt->calculate_first_order_probabilities();
3195             $dt->calculate_class_priors();
3196              
3197             # Next you construct a decision tree for your training data by calling:
3198              
3199             $root_node = $dt->construct_decision_tree_classifier();
3200              
3201             # where $root_node is an instance of the DTNode class that is also defined in the
3202             # module file. Now you are ready to classify a new data record. Let's say that
3203             # your data record looks like:
3204              
3205             my @test_sample = qw / g2=4.2
3206             grade=2.3
3207             gleason=4
3208             eet=1.7
3209             age=55.0
3210             ploidy=diploid /;
3211              
3212             # You can classify it by calling:
3213              
3214             my $classification = $dt->classify($root_node, \@test_sample);
3215              
3216             # The call to `classify()' returns a reference to a hash whose keys are the class
3217             # names and the values the associated classification probabilities. This hash also
3218             # includes another key-value pair for the solution path from the root node to the
3219             # leaf node at which the final classification was carried out.
3220              
3221              
3222             =head1 CHANGES
3223              
3224             B This version fixes a bug in the C function.
3225             The source of the bug was a typo in a regex component meant for matching with white
3226             space. I have also made one additional change to this function to increase its
3227             versatility. With this change, you are now allowed to have empty strings as values
3228             for features.
3229              
3230             B This version reintroduces C as an optional
3231             parameter in the module constructor. This was done in response to several requests
3232             received from the user community. (Previously, all line records from a CSV file were
3233             processed by the C function no matter what.) The main point made by
3234             the users was that invoking C when there was no need for CSV clean-up
3235             extracted a performance penalty when ingesting large database files with tens of
3236             thousands of line records. In addition to making C optional, I
3237             have also tweaked up the code in the C function in order to extract
3238             data from a larger range of messy CSV files.
3239              
3240             B All the changes made in this version relate to the construction of
3241             regression trees. I have fixed a couple of bugs in the calculation of the regression
3242             coefficients. Additionally, the C class now comes with a new
3243             constructor parameter named C. For most cases, you'd set this
3244             parameter to 0, which causes the regression coefficients to be estimated through
3245             linear least-squares minimization.
3246              
3247             B In addition to constructing decision trees, this version of the
3248             module also allows you to construct regression trees. The regression tree capability
3249             has been packed into a separate subclass, named C, of the main
3250             C class. The subdirectory C in the main
3251             installation directory illustrates how you can use this new functionality of the
3252             module.
3253              
3254             B This version incorporates four very significant upgrades/changes to
3255             the C module: B<(1)> The CSV cleanup is now the default. So you do not
3256             have to set any special parameters in the constructor calls to initiate CSV
3257             cleanup. B<(2)> In the form of a new Perl class named C,
3258             this module provides you with an easy-to-use programming interface for attempting
3259             needle-in-a-haystack solutions for the case when your training data is overwhelmingly
3260             dominated by a single class. You need to set the constructor parameter
3261             C to invoke the logic that constructs multiple
3262             decision trees, each using the minority class samples along with samples drawn
3263             randomly from the majority class. The final classification is made through a
3264             majority vote from all the decision trees. B<(3)> Assuming you are faced with a
3265             big-data problem --- in the sense that you have been given a training database with a
3266             very large number of training records --- the class C will
3267             also let you construct multiple decision trees by pulling training data randomly from
3268             your training database (without paying attention to the relative populations of the
3269             classes). The final classification decision for a test sample is based on a majority
3270             vote from all the decision trees thus constructed. See the C
3271             directory for how to use these new features of the module. And, finally, B<(4)>
3272             Support for the old-style '.dat' training files has been dropped in this version.
3273              
3274             B This version makes it easier to use a CSV training file that
3275             violates the assumption that a comma be used only to separate the different field
3276             values in a line record. Some large econometrics databases use double-quoted values
3277             for fields, and these values may contain commas (presumably for better readability).
3278             This version also allows you to specify the leftmost entry in the first CSV record
3279             that names all the fields. Previously, this entry was required to be an empty
3280             double-quoted string. I have also made some minor changes to the
3281             'C' method to make it more user friendly for large
3282             training files that may contain tens of thousands of records. When pulling training
3283             data from such files, this method prints out a dot on the terminal screen for every
3284             10000 records it has processed.
3285              
3286             B This version brings the boosting capability to the C
3287             module.
3288              
3289             B This version adds bagging to the C module. If your
3290             training dataset is large enough, you can ask the module to construct multiple
3291             decision trees using data bags extracted from your dataset. The module can show you
3292             the results returned by the individual decision trees and also the results obtained
3293             by taking a majority vote of the classification decisions made by the individual
3294             trees. You can specify any arbitrary extent of overlap between the data bags.
3295              
3296             B The introspection capability in this version packs more of a punch.
3297             For each training data sample, you can now figure out not only the decision-tree
3298             nodes that are affected directly by that sample, but also those nodes that are
3299             affected indirectly through the generalization achieved by the probabilistic modeling
3300             of the data. The 'examples' directory of this version includes additional scripts
3301             that illustrate these enhancements to the introspection capability. See the section
3302             "The Introspection API" for a declaration of the introspection related methods, old
3303             and new.
3304              
3305             B In response to requests from several users, this version includes a new
3306             capability: You can now ask the module to introspect about the classification
3307             decisions returned by the decision tree. Toward that end, the module includes a new
3308             class named C. Perhaps the most important bit of information you
3309             are likely to seek through DT introspection is the list of the training samples that
3310             fall directly in the portion of the feature space that is assigned to a node.
3311             B When training samples are non-uniformly distributed in the underlying
3312             feature space, IT IS POSSIBLE FOR A NODE TO EXIST EVEN WHEN NO TRAINING SAMPLES FALL
3313             IN THE PORTION OF THE FEATURE SPACE ASSIGNED TO THE NODE. B<(This is an important
3314             part of the generalization achieved by probabilistic modeling of the training data.)>
3315             For additional information related to DT introspection, see the section titled
3316             "DECISION TREE INTROSPECTION" in this documentation page.
3317              
3318             B fixes a bug in the part of the module that some folks use for generating
3319             synthetic data for experimenting with decision tree construction and classification.
3320             In the class C that is a part of the module, there
3321             was a problem with the order in which the features were recorded from the
3322             user-supplied parameter file. The basic code for decision tree construction and
3323             classification remains unchanged.
3324              
3325             B further downshifts the required version of Perl for this module. This
3326             was a result of testing the module with Version 5.10.1 of Perl. Only one statement
3327             in the module code needed to be changed for the module to work with the older version
3328             of Perl.
3329              
3330             B fixes the C restriction on the required Perl version. This
3331             version should work with Perl versions 5.14.0 and higher.
3332              
3333             B changes the required version of Perl from 5.18.0 to 5.14.0. Everything
3334             else remains the same.
3335              
3336             B should prove more robust when the probability distribution for the
3337             values of a feature is expected to be heavy-tailed; that is, when the supposedly rare
3338             observations can occur with significant probabilities. A new option in the
3339             DecisionTree constructor lets the user specify the precision with which the
3340             probability distributions are estimated for such features.
3341              
3342             B fixes a bug that was caused by the explicitly set zero values for
3343             numerical features being misconstrued as "false" in the conditional statements in
3344             some of the method definitions.
3345              
3346             B makes it easier to write code for classifying in one go all of your test
3347             data samples in a CSV file. The bulk classifications obtained can be written out to
3348             either a CSV file or to a regular text file. See the script
3349             C in the C directory for how to
3350             classify all of your test data records in a CSV file. This version also includes
3351             improved code for generating synthetic numeric/symbolic training and test data
3352             records for experimenting with the decision tree classifier.
3353              
3354             B allows you to test the quality of your training data by running a 10-fold
3355             cross-validation test on the data. This test divides all of the training data into
3356             ten parts, with nine parts used for training a decision tree and one part used for
3357             testing its ability to classify correctly. This selection of nine parts for training
3358             and one part for testing is carried out in all of the ten different ways that are
3359             possible. This testing functionality in Version 2.1 can also be used to find the
3360             best values to use for the constructor parameters C,
3361             C, and C.
3362              
3363             B Now you can use both numeric and
3364             symbolic features for constructing a decision tree. A feature is numeric if it can
3365             take any floating-point value over an interval.
3366              
3367             B fixes a bug in the code that was triggered by 0 being declared as one of
3368             the features values in the training datafile. Version 1.71 also include an additional
3369             safety feature that is useful for training datafiles that contain a very large number
3370             of features. The new version makes sure that the number of values you declare for
3371             each sample record matches the number of features declared at the beginning of the
3372             training datafile.
3373              
3374             B includes safety checks on the consistency of the data you place in your
3375             training datafile. When a training file contains thousands of samples, it is
3376             difficult to manually check that you used the same class names in your sample records
3377             that you declared at top of your training file or that the values you have for your
3378             features are legal vis-a-vis the earlier declarations of the values in the training
3379             file. Another safety feature incorporated in this version is the non-consideration
3380             of classes that are declared at the top of the training file but that have no sample
3381             records in the file.
3382              
3383             B uses probability caching much more extensively compared to the previous
3384             versions. This should result in faster construction of large decision trees.
3385             Another new feature in Version 1.6 is the use of a decision tree for interactive
3386             classification. In this mode, after you have constructed a decision tree from the
3387             training data, the user is prompted for answers to the questions pertaining to the
3388             feature tests at the nodes of the tree.
3389              
3390             Some key elements of the documentation were cleaned up and made more readable in
3391             B. The implementation code remains unchanged from Version 1.4.
3392              
3393             B should make things faster (and easier) for folks who want to use this
3394             module with training data that creates very large decision trees (that is, trees with
3395             tens of thousands or more decision nodes). The speedup in Version 1.4 has been
3396             achieved by eliminating duplicate calculation of probabilities as the tree grows. In
3397             addition, this version provides an additional constructor parameter,
3398             C for controlling the size of the decision tree. This is in
3399             addition to the tree size control achieved by the parameter C that
3400             was introduced in Version 1.3. Since large decision trees can take a long time to
3401             create, you may find yourself wishing you could store the tree you just created in a
3402             disk file and that, subsequently, you could use the stored tree for classification
3403             work. The C directory contains two scripts, C and
3404             C, that show how you can do exactly that with the
3405             help of Perl's C module.
3406              
3407             B addresses the issue that arises when the header of a training datafile
3408             declares a certain possible value for a feature but that (feature,value) pair does
3409             NOT show up anywhere in the training data. Version 1.3 also makes it possible for a
3410             user to control the size of the decision tree by changing the value of the parameter
3411             C Additionally, Version 1.3 includes a method called
3412             C that displays useful information regarding the size and
3413             some other attributes of the training data. It also warns the user if the training
3414             data might result in a decision tree that would simply be much too large --- unless
3415             the user controls the size with the entropy_threshold parameter.
3416              
3417             In addition to the removal of a couple of serious bugs, B incorporates a
3418             number of enhancements: (1) Version 1.2 includes checks on the names of the features
3419             and values used in test data --- this is the data you want to classify with the
3420             decision tree classifier constructed by this module. (2) Version 1.2 includes a
3421             separate constructor for generating test data. To make it easier to generate test
3422             data whose probabilistic parameters may not be identical to that used for the
3423             training data, I have used separate routines for generating the test data. (3)
3424             Version 1.2 also includes in its examples directory a script that classifies the test
3425             data in a file and outputs the class labels into another file. This is for folks who
3426             do not wish to write their own scripts using this module. (4) Version 1.2 also
3427             includes addition to the documentation regarding the issue of numeric values for
3428             features.
3429              
3430             =head1 DESCRIPTION
3431              
3432             B is a I module for constructing a decision tree from
3433             a training datafile containing multidimensional data. In one form or another,
3434             decision trees have been around for about fifty years. From a statistical
3435             perspective, they are closely related to classification and regression by recursive
3436             partitioning of multidimensional data. Early work that demonstrated the usefulness
3437             of such partitioning of data for classification and regression can be traced to the
3438             work of Terry Therneau in the early 1980's in the statistics community, and to the
3439             work of Ross Quinlan in the mid 1990's in the machine learning community.
3440              
3441             For those not familiar with decision tree ideas, the traditional way to classify
3442             multidimensional data is to start with a feature space whose dimensionality is the
3443             same as that of the data. Each feature in this space corresponds to the attribute
3444             that each dimension of the data measures. You then use the training data to carve up
3445             the feature space into different regions, each corresponding to a different class.
3446             Subsequently, when you try to classify a new data sample, you locate it in the
3447             feature space and find the class label of the region to which it belongs. One can
3448             also give the new data point the same class label as that of the nearest training
3449             sample. This is referred to as the nearest neighbor classification. There exist
3450             hundreds of variations of varying power on these two basic approaches to the
3451             classification of multidimensional data.
3452              
3453             A decision tree classifier works differently. When you construct a decision tree,
3454             you select for the root node a feature test that partitions the training data in a
3455             way that causes maximal disambiguation of the class labels associated with the data.
3456             In terms of information content as measured by entropy, such a feature test would
3457             cause maximum reduction in class entropy in going from all of the training data taken
3458             together to the data as partitioned by the feature test. You then drop from the root
3459             node a set of child nodes, one for each partition of the training data created by the
3460             feature test at the root node. When your features are purely symbolic, you'll have
3461             one child node for each value of the feature chosen for the feature test at the root.
3462             When the test at the root involves a numeric feature, you find the decision threshold
3463             for the feature that best bipartitions the data and you drop from the root node two
3464             child nodes, one for each partition. Now at each child node you pose the same
3465             question that you posed when you found the best feature to use at the root: Which
3466             feature at the child node in question would maximally disambiguate the class labels
3467             associated with the training data corresponding to that child node?
3468              
3469             As the reader would expect, the two key steps in any approach to decision-tree based
3470             classification are the construction of the decision tree itself from a file
3471             containing the training data, and then using the decision tree thus obtained for
3472             classifying new data.
3473              
3474             What is cool about decision tree classification is that it gives you soft
3475             classification, meaning it may associate more than one class label with a given data
3476             vector. When this happens, it may mean that your classes are indeed overlapping in
3477             the underlying feature space. It could also mean that you simply have not supplied
3478             sufficient training data to the decision tree classifier. For a tutorial
3479             introduction to how a decision tree is constructed and used, visit
3480             L
3481              
3482             This module also allows you to generate your own synthetic training and test
3483             data. Generating your own training data, using it for constructing a decision-tree
3484             classifier, and subsequently testing the classifier on a synthetically generated
3485             test set of data is a good way to develop greater proficiency with decision trees.
3486              
3487              
3488             =head1 WHAT PRACTICAL PROBLEM IS SOLVED BY THIS MODULE
3489              
3490             If you are new to the concept of a decision tree, their practical utility is best
3491             understood with an example that only involves symbolic features. However, as
3492             mentioned earlier, versions of the module higher than 2.0 allow you to use both
3493             symbolic and numeric features.
3494              
3495             Consider the following scenario: Let's say you are running a small investment company
3496             that employs a team of stockbrokers who make buy/sell decisions for the customers of
3497             your company. Assume that your company has asked the traders to make each investment
3498             decision on the basis of the following four criteria:
3499              
3500             price_to_earnings_ratio (P_to_E)
3501              
3502             price_to_sales_ratio (P_to_S)
3503              
3504             return_on_equity (R_on_E)
3505              
3506             market_share (MS)
3507              
3508             Since you are the boss, you keep track of the buy/sell decisions made by the
3509             individual traders. But one unfortunate day, all of your traders decide to quit
3510             because you did not pay them enough. So what do you do? If you had a module like
3511             the one here, you could still run your company and do so in such a way that, on the
3512             average, would do better than any of the individual traders who worked for your
3513             company. This is what you do: You pool together the individual trader buy/sell
3514             decisions you have accumulated during the last one year. This pooled information is
3515             likely to look like:
3516              
3517              
3518             example buy/sell P_to_E P_to_S R_on_E MS
3519             ============================================================+=
3520              
3521             example_1 buy high low medium low
3522             example_2 buy medium medium low low
3523             example_3 sell low medium low high
3524             ....
3525             ....
3526              
3527             This data, when formatted according to CSV, would constitute your training file. You
3528             could feed this file into the module by calling:
3529              
3530             my $dt = Algorithm::DecisionTree->new(
3531             training_datafile => $training_datafile,
3532             csv_class_column_index => 1,
3533             csv_columns_for_features => [2,3,4,5],
3534             );
3535             $dt->get_training_data();
3536             $dt->calculate_first_order_probabilities();
3537             $dt->calculate_class_priors();
3538              
3539             Subsequently, you would construct a decision tree by calling:
3540              
3541             my $root_node = $dt->construct_decision_tree_classifier();
3542              
3543             Now you and your company (with practically no employees) are ready to service the
3544             customers again. Suppose your computer needs to make a buy/sell decision about an
3545             investment prospect that is best described by:
3546              
3547             price_to_earnings_ratio = low
3548             price_to_sales_ratio = very_low
3549             return_on_equity = none
3550             market_share = medium
3551              
3552             All that your computer would need to do would be to construct a data vector like
3553              
3554             my @data = qw / P_to_E=low
3555             P_to_S=very_low
3556             R_on_E=none
3557             MS=medium /;
3558              
3559             and call the decision tree classifier you just constructed by
3560              
3561             $dt->classify($root_node, \@data);
3562              
3563             The answer returned will be 'buy' and 'sell', along with the associated
3564             probabilities. So if the probability of 'buy' is considerably greater than the
3565             probability of 'sell', that's what you should instruct your computer to do.
3566              
3567             The chances are that, on the average, this approach would beat the performance of any
3568             of your individual traders who worked for you previously since the buy/sell decisions
3569             made by the computer would be based on the collective wisdom of all your previous
3570             traders. B
3571             captured by the silly little example here. However, it does nicely the convey the
3572             sense in which the current module could be used.>
3573              
3574             =head1 SYMBOLIC FEATURES VERSUS NUMERIC FEATURES
3575              
3576             A feature is symbolic when its values are compared using string comparison operators.
3577             By the same token, a feature is numeric when its values are compared using numeric
3578             comparison operators. Having said that, features that take only a small number of
3579             numeric values in the training data can be treated symbolically provided you are
3580             careful about handling their values in the test data. At the least, you have to set
3581             the test data value for such a feature to its closest value in the training data.
3582             The module does that automatically for you for those numeric features for which the
3583             number different numeric values is less than a user-specified threshold. For those
3584             numeric features that the module is allowed to treat symbolically, this snapping of
3585             the values of the features in the test data to the small set of values in the training
3586             data is carried out automatically by the module. That is, after a user has told the
3587             module which numeric features to treat symbolically, the user need not worry about
3588             how the feature values appear in the test data.
3589              
3590             The constructor parameter C let's you tell
3591             the module when to consider an otherwise numeric feature symbolically. Suppose you
3592             set this parameter to 10, that means that all numeric looking features that take 10
3593             or fewer different values in the training datafile will be considered to be symbolic
3594             features by the module. See the tutorial at
3595             L for
3596             further information on the implementation issues related to the symbolic and numeric
3597             features.
3598              
3599             =head1 FEATURES WITH NOT SO "NICE" STATISTICAL PROPERTIES
3600              
3601             For the purpose of estimating the probabilities, it is necessary to sample the range
3602             of values taken on by a numerical feature. For features with "nice" statistical
3603             properties, this sampling interval is set to the median of the differences between
3604             the successive feature values in the training data. (Obviously, as you would expect,
3605             you first sort all the values for a feature before computing the successive
3606             differences.) This logic will not work for the sort of a feature described below.
3607              
3608             Consider a feature whose values are heavy-tailed, and, at the same time, the values
3609             span a million to one range. What I mean by heavy-tailed is that rare values can
3610             occur with significant probabilities. It could happen that most of the values for
3611             such a feature are clustered at one of the two ends of the range. At the same time,
3612             there may exist a significant number of values near the end of the range that is less
3613             populated. (Typically, features related to human economic activities --- such as
3614             wealth, incomes, etc. --- are of this type.) With the logic described in the
3615             previous paragraph, you could end up with a sampling interval that is much too small,
3616             which could result in millions of sampling points for the feature if you are not
3617             careful.
3618              
3619             Beginning with Version 2.22, you have two options in dealing with such features. You
3620             can choose to go with the default behavior of the module, which is to sample the
3621             value range for such a feature over a maximum of 500 points. Or, you can supply an
3622             additional option to the constructor that sets a user-defined value for the number of
3623             points to use. The name of the option is C. The following
3624             script
3625              
3626             construct_dt_for_heavytailed.pl
3627              
3628             in the C directory shows an example of how to call the constructor of the
3629             module with the C option.
3630              
3631              
3632             =head1 TESTING THE QUALITY OF YOUR TRAINING DATA
3633              
3634             Versions 2.1 and higher include a new class named C, derived from
3635             the main class C, that runs a 10-fold cross-validation test on your
3636             training data to test its ability to discriminate between the classes mentioned in
3637             the training file.
3638              
3639             The 10-fold cross-validation test divides all of the training data into ten parts,
3640             with nine parts used for training a decision tree and one part used for testing its
3641             ability to classify correctly. This selection of nine parts for training and one part
3642             for testing is carried out in all of the ten different possible ways.
3643              
3644             The following code fragment illustrates how you invoke the testing function of the
3645             EvalTrainingData class:
3646              
3647             my $training_datafile = "training.csv";
3648             my $eval_data = EvalTrainingData->new(
3649             training_datafile => $training_datafile,
3650             csv_class_column_index => 1,
3651             csv_columns_for_features => [2,3],
3652             entropy_threshold => 0.01,
3653             max_depth_desired => 3,
3654             symbolic_to_numeric_cardinality_threshold => 10,
3655             csv_cleanup_needed => 1,
3656             );
3657             $eval_data->get_training_data();
3658             $eval_data->evaluate_training_data()
3659              
3660             The last statement above prints out a Confusion Matrix and the value of Training Data
3661             Quality Index on a scale of 0 to 100, with 100 designating perfect training data.
3662             The Confusion Matrix shows how the different classes were mislabeled in the 10-fold
3663             cross-validation test.
3664              
3665             This testing functionality can also be used to find the best values to use for the
3666             constructor parameters C, C, and
3667             C.
3668              
3669             The following two scripts in the C directory illustrate the use of the
3670             C class for testing the quality of your data:
3671              
3672             evaluate_training_data1.pl
3673             evaluate_training_data2.pl
3674              
3675              
3676             =head1 HOW TO MAKE THE BEST CHOICES FOR THE CONSTRUCTOR PARAMETERS
3677              
3678             Assuming your training data is good, the quality of the results you get from a
3679             decision tree would depend on the choices you make for the constructor parameters
3680             C, C, and
3681             C. You can optimize your choices for
3682             these parameters by running the 10-fold cross-validation test that is made available
3683             in Versions 2.2 and higher through the new class C that is included
3684             in the module file. A description of how to run this test is in the previous section
3685             of this document.
3686              
3687              
3688             =head1 DECISION TREE INTROSPECTION
3689              
3690             Starting with Version 2.30, you can ask the C class of the module to
3691             explain the classification decisions made at the different nodes of the decision
3692             tree.
3693              
3694             Perhaps the most important bit of information you are likely to seek through DT
3695             introspection is the list of the training samples that fall directly in the portion
3696             of the feature space that is assigned to a node.
3697              
3698             However, note that, when training samples are non-uniformly distributed in the
3699             underlying feature space, it is possible for a node to exist even when there are no
3700             training samples in the portion of the feature space assigned to the node. That is
3701             because the decision tree is constructed from the probability densities estimated
3702             from the training data. When the training samples are non-uniformly distributed, it
3703             is entirely possible for the estimated probability densities to be non-zero in a
3704             small region around a point even when there are no training samples specifically in
3705             that region. (After you have created a statistical model for, say, the height
3706             distribution of people in a community, the model may return a non-zero probability
3707             for the height values in a small interval even if the community does not include a
3708             single individual whose height falls in that interval.)
3709              
3710             That a decision-tree node can exist even when there are no training samples in that
3711             portion of the feature space that belongs to the node is an important indication of
3712             the generalization ability of a decision-tree-based classifier.
3713              
3714             In light of the explanation provided above, before the DTIntrospection class supplies
3715             any answers at all, it asks you to accept the fact that features can take on non-zero
3716             probabilities at a point in the feature space even though there are zero training
3717             samples at that point (or in a small region around that point). If you do not accept
3718             this rudimentary fact, the introspection class will not yield any answers (since you
3719             are not going to believe the answers anyway).
3720              
3721             The point made above implies that the path leading to a node in the decision tree may
3722             test a feature for a certain value or threshold despite the fact that the portion of
3723             the feature space assigned to that node is devoid of any training data.
3724              
3725             See the following three scripts in the Examples directory for how to carry out DT
3726             introspection:
3727              
3728             introspection_in_a_loop_interactive.pl
3729              
3730             introspection_show_training_samples_at_all_nodes_direct_influence.pl
3731              
3732             introspection_show_training_samples_to_nodes_influence_propagation.pl
3733              
3734             The first script places you in an interactive session in which you will first be
3735             asked for the node number you are interested in. Subsequently, you will be asked for
3736             whether or not you are interested in specific questions that the introspection can
3737             provide answers for. The second script descends down the decision tree and shows for
3738             each node the training samples that fall directly in the portion of the feature space
3739             assigned to that node. The third script shows for each training sample how it
3740             affects the decision-tree nodes either directly or indirectly through the
3741             generalization achieved by the probabilistic modeling of the data.
3742              
3743             The output of the script
3744             C looks like:
3745              
3746             Node 0: the samples are: None
3747             Node 1: the samples are: [sample_46 sample_58]
3748             Node 2: the samples are: [sample_1 sample_4 sample_7 .....]
3749             Node 3: the samples are: []
3750             Node 4: the samples are: []
3751             ...
3752             ...
3753              
3754             The nodes for which no samples are listed come into existence through
3755             the generalization achieved by the probabilistic modeling of the data.
3756              
3757             The output produced by the script
3758             C looks like
3759              
3760             sample_1:
3761             nodes affected directly: [2 5 19 23]
3762             nodes affected through probabilistic generalization:
3763             2=> [3 4 25]
3764             25=> [26]
3765             5=> [6]
3766             6=> [7 13]
3767             7=> [8 11]
3768             8=> [9 10]
3769             11=> [12]
3770             13=> [14 18]
3771             14=> [15 16]
3772             16=> [17]
3773             19=> [20]
3774             20=> [21 22]
3775             23=> [24]
3776            
3777             sample_4:
3778             nodes affected directly: [2 5 6 7 11]
3779             nodes affected through probabilistic generalization:
3780             2=> [3 4 25]
3781             25=> [26]
3782             5=> [19]
3783             19=> [20 23]
3784             20=> [21 22]
3785             23=> [24]
3786             6=> [13]
3787             13=> [14 18]
3788             14=> [15 16]
3789             16=> [17]
3790             7=> [8]
3791             8=> [9 10]
3792             11=> [12]
3793            
3794             ...
3795             ...
3796             ...
3797              
3798             For each training sample, the display shown above first presents the list of nodes
3799             that are directly affected by the sample. A node is affected directly by a sample if
3800             the latter falls in the portion of the feature space that belongs to the former.
3801             Subsequently, for each training sample, the display shows a subtree of the nodes that
3802             are affected indirectly by the sample through the generalization achieved by the
3803             probabilistic modeling of the data. In general, a node is affected indirectly by a
3804             sample if it is a descendant of another node that is affected directly.
3805              
3806             Also see the section titled B regarding how to invoke the
3807             introspection capabilities of the module in your own code.
3808              
3809             =head1 METHODS
3810              
3811             The module provides the following methods for constructing a decision tree from
3812             training data in a disk file and for classifying new data records with the decision
3813             tree thus constructed:
3814              
3815             =over 4
3816              
3817             =item B
3818              
3819             my $dt = Algorithm::DecisionTree->new(
3820             training_datafile => $training_datafile,
3821             csv_class_column_index => 2,
3822             csv_columns_for_features => [3,4,5,6,7,8],
3823             entropy_threshold => 0.01,
3824             max_depth_desired => 8,
3825             symbolic_to_numeric_cardinality_threshold => 10,
3826             csv_cleanup_needed => 1,
3827             );
3828              
3829             A call to C constructs a new instance of the C class.
3830             For this call to make sense, the training data in the training datafile must be
3831             in the CSV format.
3832              
3833             =back
3834              
3835             =head2 The Constructor Parameters
3836              
3837             =over 8
3838              
3839             =item C:
3840              
3841             This parameter supplies the name of the file that contains the training data.
3842              
3843             =item C:
3844              
3845             When using a CSV file for your training data, this parameter supplies the zero-based
3846             column index for the column that contains the class label for each data record in the
3847             training file.
3848              
3849              
3850             =item C:
3851              
3852             You need to set this parameter to 1 if your CSV file has double quoted strings (which
3853             may include commas) as values for the fields and if such values are allowed to
3854             include commas for, presumably, better readability.
3855              
3856             =item C:
3857              
3858             When using a CSV file for your training data, this parameter supplies a list of
3859             columns corresponding to the features you wish to use for decision tree construction.
3860             Each column is specified by its zero-based index.
3861              
3862             =item C:
3863              
3864             This parameter sets the granularity with which the entropies are sampled by the
3865             module. For example, a feature test at a node in the decision tree is acceptable if
3866             the entropy gain achieved by the test exceeds this threshold. The larger the value
3867             you choose for this parameter, the smaller the tree. Its default value is 0.001.
3868              
3869             =item C:
3870              
3871             This parameter sets the maximum depth of the decision tree. For obvious reasons, the
3872             smaller the value you choose for this parameter, the smaller the tree.
3873              
3874             =item C:
3875              
3876             This parameter allows the module to treat an otherwise numeric feature symbolically
3877             if the number of different values the feature takes in the training data file does
3878             not exceed the value of this parameter.
3879              
3880             =item C:
3881              
3882             This parameter gives the user the option to set the number of points at which the
3883             value range for a feature should be sampled for estimating the probabilities. This
3884             parameter is effective only for those features that occupy a large value range and
3885             whose probability distributions are heavy tailed. B
3886             when you have a very large training dataset:> In general, the larger the dataset, the
3887             smaller the smallest difference between any two values for a numeric feature in
3888             relation to the overall range of values for that feature. In such cases, the module
3889             may use too large a number of bins for estimating the probabilities and that may slow
3890             down the calculation of the decision tree. You can get around this difficulty by
3891             explicitly giving a value to the 'C' parameter.
3892              
3893             =back
3894              
3895              
3896             You can choose the best values to use for the last three constructor parameters by
3897             running a 10-fold cross-validation test on your training data through the class
3898             C that comes with Versions 2.1 and higher of this module. See the
3899             section "TESTING THE QUALITY OF YOUR TRAINING DATA" of this document page.
3900              
3901             =over
3902              
3903             =item B
3904              
3905             After you have constructed a new instance of the C class,
3906             you must now read in the training data that is the file named in the call to the
3907             constructor. This you do by:
3908              
3909             $dt->get_training_data();
3910              
3911              
3912             =item B
3913              
3914             If you wish to see the training data that was just digested by the module,
3915             call
3916              
3917             $dt->show_training_data();
3918              
3919             =item B
3920              
3921             =item B
3922              
3923             After the module has read the training data file, it needs to initialize the
3924             probability cache. This you do by invoking:
3925              
3926             $dt->calculate_first_order_probabilities()
3927             $dt->calculate_class_priors()
3928              
3929             =item B
3930              
3931             With the probability cache initialized, it is time to construct a decision tree
3932             classifier. This you do by
3933              
3934             my $root_node = $dt->construct_decision_tree_classifier();
3935              
3936             This call returns an instance of type C. The C class is defined
3937             within the main package file. So, don't forget, that C<$root_node> in the above
3938             example call will be instantiated to an object of type C.
3939              
3940             =item B<$root_nodeC<< -> >>display_decision_tree(" "):>
3941              
3942             $root_node->display_decision_tree(" ");
3943              
3944             This will display the decision tree in your terminal window by using a recursively
3945             determined offset for each node as the display routine descends down the tree.
3946              
3947             I have intentionally left the syntax fragment C<$root_node> in the above call to
3948             remind the reader that C is NOT called on the instance of
3949             the C we constructed earlier, but on the C instance returned by
3950             the call to C.
3951              
3952             =item B
3953              
3954             Let's say you want to classify the following data record:
3955              
3956             my @test_sample = qw / g2=4.2
3957             grade=2.3
3958             gleason=4
3959             eet=1.7
3960             age=55.0
3961             ploidy=diploid /;
3962              
3963             you'd make the following call:
3964              
3965             my $classification = $dt->classify($root_node, \@test_sample);
3966              
3967             where, again, C<$root_node> is an instance of type C returned by the call to
3968             C. The variable C<$classification> holds a
3969             reference to a hash whose keys are the class names and whose values the associated
3970             probabilities. The hash that is returned by the above call also includes a special
3971             key-value pair for a key named C. The value associated with this key
3972             is an anonymous array that holds the path, in the form of a list of nodes, from the
3973             root node to the leaf node in the decision tree where the final classification was
3974             made.
3975              
3976              
3977             =item B
3978              
3979             This method allows you to use a decision-tree based classifier in an interactive
3980             mode. In this mode, a user is prompted for answers to the questions pertaining to
3981             the feature tests at the nodes of the tree. The syntax for invoking this method is:
3982              
3983             my $classification = $dt->classify_by_asking_questions($root_node);
3984              
3985             where C<$dt> is an instance of the C class returned by a
3986             call to C and C<$root_node> the root node of the decision tree returned by a
3987             call to C.
3988              
3989             =back
3990              
3991              
3992             =head1 THE INTROSPECTION API
3993              
3994             To construct an instance of C, you call
3995              
3996             my $introspector = DTIntrospection->new($dt);
3997              
3998             where you supply the instance of the C class you used for constructing
3999             the decision tree through the parameter C<$dt>. After you have constructed an
4000             instance of the introspection class, you must initialize it by
4001              
4002             $introspector->initialize();
4003              
4004             Subsequently, you can invoke either of the following methods:
4005              
4006             $introspector->explain_classification_at_one_node($node);
4007              
4008             $introspector->explain_classifications_at_multiple_nodes_interactively();
4009              
4010             depending on whether you want introspection at a single specified node or inside an
4011             infinite loop for an arbitrary number of nodes.
4012              
4013             If you want to output a tabular display that shows for each node in the decision tree
4014             all the training samples that fall in the portion of the feature space that belongs
4015             to that node, call
4016              
4017             $introspector->display_training_samples_at_all_nodes_direct_influence_only();
4018              
4019             If you want to output a tabular display that shows for each training sample a list of
4020             all the nodes that are affected directly AND indirectly by that sample, call
4021              
4022             $introspector->display_training_training_samples_to_nodes_influence_propagation();
4023              
4024             A training sample affects a node directly if the sample falls in the portion of the
4025             features space assigned to that node. On the other hand, a training sample is
4026             considered to affect a node indirectly if the node is a descendant of a node that is
4027             affected directly by the sample.
4028              
4029              
4030             =head1 BULK CLASSIFICATION OF DATA RECORDS
4031              
4032             For large test datasets, you would obviously want to process an entire file of test
4033             data at a time. The following scripts in the C directory illustrate how you
4034             can do that:
4035              
4036             classify_test_data_in_a_file.pl
4037              
4038             This script requires three command-line arguments, the first argument names the
4039             training datafile, the second the test datafile, and the third the file in which the
4040             classification results are to be deposited.
4041              
4042             The other examples directories, C, C, and
4043             C, also contain scripts that illustrate how to carry out
4044             bulk classification of data records when you wish to take advantage of bagging,
4045             boosting, or tree randomization. In their respective directories, these scripts are
4046             named:
4047              
4048             bagging_for_bulk_classification.pl
4049             boosting_for_bulk_classification.pl
4050             classify_database_records.pl
4051              
4052              
4053             =head1 HOW THE CLASSIFICATION RESULTS ARE DISPLAYED
4054              
4055             It depends on whether you apply the classifier at once to all the data samples in a
4056             file, or whether you feed one data sample at a time into the classifier.
4057              
4058             In general, the classifier returns soft classification for a test data vector. What
4059             that means is that, in general, the classifier will list all the classes to which a
4060             given data vector could belong and the probability of each such class label for the
4061             data vector. Run the examples scripts in the Examples directory to see how the output
4062             of classification can be displayed.
4063              
4064             With regard to the soft classifications returned by this classifier, if the
4065             probability distributions for the different classes overlap in the underlying feature
4066             space, you would want the classifier to return all of the applicable class labels for
4067             a data vector along with the corresponding class probabilities. Another reason for
4068             why the decision tree classifier may associate significant probabilities with
4069             multiple class labels is that you used inadequate number of training samples to
4070             induce the decision tree. The good thing is that the classifier does not lie to you
4071             (unlike, say, a hard classification rule that would return a single class label
4072             corresponding to the partitioning of the underlying feature space). The decision
4073             tree classifier give you the best classification that can be made given the training
4074             data you fed into it.
4075              
4076              
4077             =head1 USING BAGGING
4078              
4079             Starting with Version 3.0, you can use the class C that
4080             comes with the module to incorporate bagging in your decision tree based
4081             classification. Bagging means constructing multiple decision trees for different
4082             (possibly overlapping) segments of the data extracted from your training dataset and
4083             then aggregating the decisions made by the individual decision trees for the final
4084             classification. The aggregation of the classification decisions can average out the
4085             noise and bias that may otherwise affect the classification decision obtained from
4086             just one tree.
4087              
4088             =over 4
4089              
4090             =item B
4091              
4092             A typical call to the constructor for the C class looks
4093             like:
4094              
4095             use Algorithm::DecisionTreeWithBagging;
4096            
4097             my $training_datafile = "stage3cancer.csv";
4098            
4099             my $dtbag = Algorithm::DecisionTreeWithBagging->new(
4100             training_datafile => $training_datafile,
4101             csv_class_column_index => 2,
4102             csv_columns_for_features => [3,4,5,6,7,8],
4103             entropy_threshold => 0.01,
4104             max_depth_desired => 8,
4105             symbolic_to_numeric_cardinality_threshold => 10,
4106             how_many_bags => 4,
4107             bag_overlap_fraction => 0.2,
4108             csv_cleanup_needed => 1,
4109             );
4110            
4111             Note in particular the following two constructor parameters:
4112            
4113             how_many_bags
4114              
4115             bag_overlap_fraction
4116              
4117             where, as the name implies, the parameter C controls how many bags
4118             (and, therefore, how many decision trees) will be constructed from your training
4119             dataset; and where the parameter C controls the degree of
4120             overlap between the bags. To understand what exactly is achieved by setting the
4121             parameter C to 0.2 in the above example, let's say that the
4122             non-overlapping partitioning of the training data between the bags results in 100
4123             training samples per bag. With bag_overlap_fraction set to 0.2, additional 20 samples
4124             drawn randomly from the other bags will be added to the data in each bag.
4125              
4126             =back
4127              
4128             =head2 B class>
4129              
4130             =over 8
4131              
4132             =item B
4133              
4134             This method reads your training datafile, randomizes it, and then partitions it into
4135             the specified number of bags. Subsequently, if the constructor parameter
4136             C is non-zero, it adds to each bag additional samples drawn at
4137             random from the other bags. The number of these additional samples added to each bag
4138             is controlled by the constructor parameter C. If this
4139             parameter is set to, say, 0.2, the size of each bag will grow by 20% with the samples
4140             drawn from the other bags.
4141              
4142             =item B
4143              
4144             Shows for each bag the names of the training data samples in that bag.
4145              
4146             =item B
4147              
4148             Calls on the appropriate methods of the main C class to estimate the
4149             first-order probabilities from the data samples in each bag.
4150              
4151             =item B
4152              
4153             Calls on the appropriate method of the main C class to estimate the
4154             class priors for the data samples in each bag.
4155              
4156             =item B
4157              
4158             Calls on the appropriate method of the main C class to construct a
4159             decision tree from the training data in each bag.
4160              
4161             =item B
4162              
4163             Display separately the decision tree for each bag..
4164              
4165             =item B
4166              
4167             Calls on the appropriate methods of the main C class to classify the
4168             argument test sample.
4169              
4170             =item B
4171              
4172             Displays separately the classification decision made by each the decision tree
4173             constructed for each bag.
4174              
4175             =item B
4176              
4177             Using majority voting, this method aggregates the classification decisions made by
4178             the individual decision trees into a single decision.
4179              
4180             =back
4181              
4182             See the example scripts in the directory C for how to call these
4183             methods for classifying individual samples and for bulk classification when you place
4184             all your test samples in a single file.
4185              
4186             =head1 USING BOOSTING
4187              
4188             Starting with Version 3.20, you can use the class C for
4189             constructing a boosted decision-tree classifier. Boosting results in a cascade of
4190             decision trees in which each decision tree is constructed with samples that are
4191             mostly those that are misclassified by the previous decision tree. To be precise,
4192             you create a probability distribution over the training samples for the selection of
4193             samples for training each decision tree in the cascade. To start out, the
4194             distribution is uniform over all of the samples. Subsequently, this probability
4195             distribution changes according to the misclassifications by each tree in the cascade:
4196             if a sample is misclassified by a given tree in the cascade, the probability of its
4197             being selected for training the next tree is increased significantly. You also
4198             associate a trust factor with each decision tree depending on its power to classify
4199             correctly all of the training data samples. After a cascade of decision trees is
4200             constructed in this manner, you construct a final classifier that calculates the
4201             class label for a test data sample by taking into account the classification
4202             decisions made by each individual tree in the cascade, the decisions being weighted
4203             by the trust factors associated with the individual classifiers. These boosting
4204             notions --- generally referred to as the AdaBoost algorithm --- are based on a now
4205             celebrated paper "A Decision-Theoretic Generalization of On-Line Learning and an
4206             Application to Boosting" by Yoav Freund and Robert Schapire that appeared in 1995 in
4207             the Proceedings of the 2nd European Conf. on Computational Learning Theory. For a
4208             tutorial introduction to AdaBoost, see L
4209              
4210             Keep in mind the fact that, ordinarily, the theoretical guarantees provided by
4211             boosting apply only to the case of binary classification. Additionally, your
4212             training dataset must capture all of the significant statistical variations in the
4213             classes represented therein.
4214              
4215             =over 4
4216              
4217             =item B
4218              
4219             If you'd like to experiment with boosting, a typical call to the constructor for the
4220             C class looks like:
4221              
4222             use Algorithm::BoostedDecisionTree;
4223             my $training_datafile = "training6.csv";
4224             my $boosted = Algorithm::BoostedDecisionTree->new(
4225             training_datafile => $training_datafile,
4226             csv_class_column_index => 1,
4227             csv_columns_for_features => [2,3],
4228             entropy_threshold => 0.01,
4229             max_depth_desired => 8,
4230             symbolic_to_numeric_cardinality_threshold => 10,
4231             how_many_stages => 4,
4232             csv_cleanup_needed => 1,
4233             );
4234              
4235             Note in particular the constructor parameter:
4236            
4237             how_many_stages
4238              
4239             As its name implies, this parameter controls how many stages will be used in the
4240             boosted decision tree classifier. As mentioned above, a separate decision tree is
4241             constructed for each stage of boosting using a set of training samples that are drawn
4242             through a probability distribution maintained over the entire training dataset.
4243              
4244             =back
4245              
4246             =head2 B class>
4247              
4248             =over 8
4249              
4250             =item B
4251              
4252             This method reads your training datafile, creates the data structures from the data
4253             ingested for constructing the base decision tree.
4254              
4255             =item B
4256              
4257             Writes to the standard output the training data samples and also some relevant
4258             properties of the features used in the training dataset.
4259              
4260             =item B
4261              
4262             Calls on the appropriate methods of the main C class to estimate the
4263             first-order probabilities and the class priors.
4264              
4265             =item B
4266              
4267             Calls on the appropriate method of the main C class to construct the
4268             base decision tree.
4269              
4270             =item B
4271              
4272             Displays the base decision tree in your terminal window. (The textual form of the
4273             decision tree is written out to the standard output.)
4274              
4275             =item B
4276              
4277             Uses the AdaBoost algorithm to construct a cascade of decision trees. As mentioned
4278             earlier, the training samples for each tree in the cascade are drawn using a
4279             probability distribution over the entire training dataset. This probability
4280             distribution for any given tree in the cascade is heavily influenced by which
4281             training samples are misclassified by the previous tree.
4282              
4283             =item B
4284              
4285             Displays separately in your terminal window the decision tree constructed for each
4286             stage of the cascade. (The textual form of the trees is written out to the standard
4287             output.)
4288              
4289             =item B
4290              
4291             Calls on each decision tree in the cascade to classify the argument C<$test_sample>.
4292              
4293             =item B
4294              
4295             You can call this method to display in your terminal window the classification
4296             decision made by each decision tree in the cascade. The method also prints out the
4297             trust factor associated with each decision tree. It is important to look
4298             simultaneously at the classification decision and the trust factor for each tree ---
4299             since a classification decision made by a specific tree may appear bizarre for a
4300             given test sample. This method is useful primarily for debugging purposes.
4301              
4302             =item B
4303              
4304             As with the previous method, this method is useful mostly for debugging. It returns
4305             class labels for the samples misclassified by the stage whose integer index is
4306             supplied as an argument to the method. Say you have 10 stages in your cascade. The
4307             value of the argument C would go from 0 to 9, with 0 corresponding to
4308             the base tree.
4309              
4310             =item B
4311              
4312             Uses the "final classifier" formula of the AdaBoost algorithm to pool together the
4313             classification decisions made by the individual trees while taking into account the
4314             trust factors associated with the trees. As mentioned earlier, we associate with
4315             each tree of the cascade a trust factor that depends on the overall misclassification
4316             rate associated with that tree.
4317              
4318             =back
4319              
4320             See the example scripts in the C subdirectory for how to call the
4321             methods listed above for classifying individual data samples with boosting and for
4322             bulk classification when you place all your test samples in a single file.
4323              
4324              
4325             =head1 USING RANDOMIZED DECISION TREES
4326              
4327             Consider the following two situations that call for using randomized decision trees,
4328             meaning multiple decision trees that are trained using data extracted randomly from a
4329             large database of training samples:
4330              
4331             (1) Consider a two-class problem for which the training database is grossly
4332             imbalanced in how many majority-class samples it contains vis-a-vis the number of
4333             minority class samples. Let's assume for a moment that the ratio of majority class
4334             samples to minority class samples is 1000 to 1. Let's also assume that you have a
4335             test dataset that is drawn randomly from the same population mixture from which the
4336             training database was created. Now consider a stupid data classification program
4337             that classifies everything as belonging to the majority class. If you measure the
4338             classification accuracy rate as the ratio of the number of samples correctly
4339             classified to the total number of test samples selected randomly from the population,
4340             this classifier would work with an accuracy of 99.99%.
4341              
4342             (2) Let's now consider another situation in which we are faced with a huge training
4343             database but in which every class is equally well represented. Feeding all the data
4344             into a single decision tree would be akin to polling all of the population of the
4345             United States for measuring the Coke-versus-Pepsi preference in the country. You are
4346             likely to get better results if you construct multiple decision trees, each trained
4347             with a collection of training samples drawn randomly from the training database.
4348             After you have created all the decision trees, your final classification decision
4349             could then be based on, say, majority voting by the trees.
4350              
4351             In summary, the C class allows you to solve the following
4352             two problems: (1) Data classification using the needle-in-a-haystack metaphor, that
4353             is, when a vast majority of your training samples belong to just one class. And (2)
4354             You have access to a very large database of training samples and you wish to
4355             construct an ensemble of decision trees for classification.
4356              
4357             =over 4
4358              
4359             =item B
4360              
4361             Here is how you'd call the C constructor for
4362             needle-in-a-haystack classification:
4363              
4364             use Algorithm::RandomizedTreesForBigData;
4365             my $training_datafile = "your_database.csv";
4366             my $rt = Algorithm::RandomizedTreesForBigData->new(
4367             training_datafile => $training_datafile,
4368             csv_class_column_index => 48,
4369             csv_columns_for_features => [24,32,33,34,41],
4370             entropy_threshold => 0.01,
4371             max_depth_desired => 8,
4372             symbolic_to_numeric_cardinality_threshold => 10,
4373             how_many_trees => 5,
4374             looking_for_needles_in_haystack => 1,
4375             csv_cleanup_needed => 1,
4376             );
4377              
4378             Note in particular the constructor parameters:
4379              
4380             looking_for_needles_in_haystack
4381             how_many_trees
4382              
4383             The first of these parameters, C, invokes the logic for
4384             constructing an ensemble of decision trees, each based on a training dataset that
4385             uses all of the minority class samples, and a random drawing from the majority class
4386             samples.
4387              
4388             Here is how you'd call the C constructor for a more
4389             general attempt at constructing an ensemble of decision trees, with each tree trained
4390             with randomly drawn samples from a large database of training data (without paying
4391             attention to the differences in the sizes of the populations for the different
4392             classes):
4393              
4394             use Algorithm::RandomizedTreesForBigData;
4395             my $training_datafile = "your_database.csv";
4396             my $rt = Algorithm::RandomizedTreesForBigData->new(
4397             training_datafile => $training_datafile,
4398             csv_class_column_index => 2,
4399             csv_columns_for_features => [3,4,5,6,7,8],
4400             entropy_threshold => 0.01,
4401             max_depth_desired => 8,
4402             symbolic_to_numeric_cardinality_threshold => 10,
4403             how_many_trees => 3,
4404             how_many_training_samples_per_tree => 50,
4405             csv_cleanup_needed => 1,
4406             );
4407              
4408             Note in particular the constructor parameters:
4409              
4410             how_many_training_samples_per_tree
4411             how_many_trees
4412              
4413             When you set the C parameter, you are not allowed
4414             to also set the C parameter, and vice versa.
4415              
4416             =back
4417              
4418             =head2 B class>
4419              
4420             =over 8
4421              
4422             =item B
4423              
4424             What this method does depends on which of the two constructor parameters ---
4425             C or C --- is
4426             set. When the former is set, it creates a collection of training datasets for
4427             C number of decision trees, with each dataset being a mixture of the
4428             minority class and sample drawn randomly from the majority class. However, when the
4429             latter option is set, all the datasets are drawn randomly from the training database
4430             with no particular attention given to the relative populations of the two classes.
4431              
4432             =item B
4433              
4434             As the name implies, this method shows the training data being used for all the
4435             decision trees. This method is useful for debugging purposes using small datasets.
4436              
4437             =item B
4438              
4439             Calls on the appropriate method of the main C to estimate the
4440             first-order probabilities for the training dataset to be used for each decision tree.
4441              
4442             =item B
4443              
4444             Calls on the appropriate method of the main C class to estimate the
4445             class priors for the training dataset to be used for each decision tree.
4446              
4447             =item B
4448              
4449             Calls on the appropriate method of the main C class to construct the
4450             decision trees.
4451              
4452             =item B
4453              
4454             Displays all the decision trees in your terminal window. (The textual form of the
4455             decision trees is written out to the standard output.)
4456              
4457             =item B
4458              
4459             The test_sample is sent to each decision tree for classification.
4460              
4461             =item B
4462              
4463             The classification decisions returned by the individual decision trees are written
4464             out to the standard output.
4465              
4466             =item B
4467              
4468             This method aggregates the classification results returned by the individual decision
4469             trees and returns the majority decision.
4470              
4471             =back
4472              
4473             =head1 CONSTRUCTING REGRESSION TREES:
4474              
4475             Decision tree based modeling requires that the class labels be distinct. That is,
4476             the training dataset must contain a relatively small number of discrete class labels
4477             for all of your data records if you want to model the data with one or more decision
4478             trees. However, when one is trying to understand all of the associational
4479             relationships that exist in a large database, one often runs into situations where,
4480             instead of discrete class labels, you have a continuously valued variable as a
4481             dependent variable whose values are predicated on a set of feature values. It is for
4482             such situations that you will find useful the new class C that is now
4483             a part of the C module. The C class has been
4484             programmed as a subclass of the main C class.
4485              
4486             You can think of regression with a regression tree as a powerful generalization of
4487             the very commonly used Linear Regression algorithms. Although you can certainly
4488             carry out polynomial regression with run-of-the-mill Linear Regression algorithms for
4489             modeling nonlinearities between the predictor variables and the dependent variable,
4490             specifying the degree of the polynomial is often tricky. Additionally, a polynomial
4491             can inject continuities between the predictor and the predicted variables that may
4492             not really exist in the real data. Regression trees, on the other hand, give you a
4493             piecewise linear relationship between the predictor and the predicted variables that
4494             is freed from the constraints of superimposed continuities at the joins between the
4495             different segments. See the following tutorial for further information regarding the
4496             standard linear regression approach and the regression that can be achieved with the
4497             RegressionTree class in this module:
4498             L
4499              
4500             The RegressionTree class in the current version of the module assumes that all of
4501             your data is numerical. That is, unlike what is possible with the DecisionTree class
4502             (and the other more closely related classes in this module) that allow your training
4503             file to contain a mixture of numerical and symbolic data, the RegressionTree class
4504             requires that ALL of your data be numerical. I hope to relax this constraint in
4505             future versions of this module. Obviously, the dependent variable will always be
4506             numerical for regression.
4507              
4508             See the example scripts in the directory C if you wish to become
4509             more familiar with the regression capabilities of the module.
4510              
4511             =over 4
4512              
4513             =item B
4514              
4515             my $training_datafile = "gendata5.csv";
4516             my $rt = Algorithm::RegressionTree->new(
4517             training_datafile => $training_datafile,
4518             dependent_variable_column => 2,
4519             predictor_columns => [1],
4520             mse_threshold => 0.01,
4521             max_depth_desired => 2,
4522             jacobian_choice => 0,
4523             csv_cleanup_needed => 1,
4524             );
4525              
4526             Note in particular the constructor parameters:
4527              
4528             dependent_variable
4529             predictor_columns
4530             mse_threshold
4531             jacobian_choice
4532              
4533             The first of these parameters, C, is set to the column index in
4534             the CSV file for the dependent variable. The second constructor parameter,
4535             C, tells the system as to which columns contain values for the
4536             predictor variables. The third parameter, C, is for deciding when to
4537             partition the data at a node into two child nodes as a regression tree is being
4538             constructed. If the minmax of MSE (Mean Squared Error) that can be achieved by
4539             partitioning any of the features at a node is smaller than C, that
4540             node becomes a leaf node of the regression tree.
4541              
4542             The last parameter, C, must be set to either 0 or 1 or 2. Its
4543             default value is 0. When this parameter equals 0, the regression coefficients are
4544             calculated using the linear least-squares method and no further "refinement" of the
4545             coefficients is carried out using gradient descent. This is the fastest way to
4546             calculate the regression coefficients. When C is set to 1, you get
4547             a weak version of gradient descent in which the Jacobian is set to the "design
4548             matrix" itself. Choosing 2 for C results in a more reasonable
4549             approximation to the Jacobian. That, however, is at a cost of much longer
4550             computation time. B For most cases, using 0 for C is the
4551             best choice. See my tutorial "I" for why
4552             that is the case.
4553              
4554             =back
4555              
4556             =head2 B class>
4557              
4558             =over 8
4559              
4560             =item B
4561              
4562             Only CSV training datafiles are allowed. Additionally, the first record in the file
4563             must list the names of the fields, and the first column must contain an integer ID
4564             for each record.
4565              
4566             =item B
4567              
4568             As the name implies, this is the method that construct a regression tree.
4569              
4570             =item B
4571              
4572             Displays the regression tree, as the name implies. The white-space string argument
4573             specifies the offset to use in displaying the child nodes in relation to a parent
4574             node.
4575              
4576             =item B
4577              
4578             You call this method after you have constructed a regression tree if you want to
4579             calculate the prediction for one sample. The parameter C<$root_node> is what is
4580             returned by the call C. The formatting of the argument
4581             bound to the C<$test_sample> parameter is important. To elaborate, let's say you are
4582             using two variables named C<$xvar1> and C<$xvar2> as your predictor variables. In
4583             this case, the C<$test_sample> parameter will be bound to a list that will look like
4584              
4585             ['xvar1 = 23.4', 'xvar2 = 12.9']
4586              
4587             Arbitrary amount of white space, including none, on the two sides of the equality
4588             symbol is allowed in the construct shown above. A call to this method returns a
4589             dictionary with two key-value pairs. One of the keys is called C and
4590             the other C. The value associated with key C is the path
4591             in the regression tree to the leaf node that yielded the prediction. And the value
4592             associated with the key C is the answer you are looking for.
4593              
4594             =item B
4595              
4596             This call calculates the predictions for all of the predictor variables data in your
4597             training file. The parameter C<$root_node> is what is returned by the call to
4598             C. The values for the dependent variable thus predicted
4599             can be seen by calling C, which is the method mentioned below.
4600              
4601             =item B
4602              
4603             This method displays the results obtained by calling the prediction method of the
4604             previous entry. This method also creates a hardcopy of the plots and saves it as a
4605             C<.png> disk file. The name of this output file is always C.
4606              
4607             =item B
4608              
4609             This method carries out an error analysis of the predictions for the samples in your
4610             training datafile. It shows you the overall MSE (Mean Squared Error) with tree-based
4611             regression, the MSE for the data samples at each of the leaf nodes of the regression
4612             tree, and the MSE for the plain old Linear Regression as applied to all of the data.
4613             The parameter C<$root_node> in the call syntax is what is returned by the call to
4614             C.
4615              
4616             =item B
4617              
4618             Call this method if you want to apply the regression tree to all your test data in a
4619             disk file. The predictions for all of the test samples in the disk file are written
4620             out to another file whose name is the same as that of the test file except for the
4621             addition of C<_output> in the name of the file. The parameter C<$filename> is the
4622             name of the disk file that contains the test data. And the parameter C<$columns> is a
4623             list of the column indices for the predictor variables in the test file.
4624              
4625             =back
4626              
4627             =head1 GENERATING SYNTHETIC TRAINING DATA
4628              
4629             The module file contains the following additional classes: (1)
4630             C, and (2) C for
4631             generating synthetic training data.
4632              
4633             The class C outputs one CSV file for the
4634             training data and another one for the test data for experimenting with numeric
4635             features. The numeric values are generated using a multivariate Gaussian
4636             distribution whose mean and covariance are specified in a parameter file. See the
4637             file C in the C directory for an example of such a
4638             parameter file. Note that the dimensionality of the data is inferred from the
4639             information you place in the parameter file.
4640              
4641             The class C generates synthetic training for the
4642             purely symbolic case. The relative frequencies of the different possible values for
4643             the features is controlled by the biasing information you place in a parameter file.
4644             See C for an example of such a file.
4645              
4646              
4647             =head1 THE C DIRECTORY
4648              
4649             See the C directory in the distribution for how to construct a decision
4650             tree, and how to then classify new data using the decision tree. To become more
4651             familiar with the module, run the scripts
4652              
4653             construct_dt_and_classify_one_sample_case1.pl
4654             construct_dt_and_classify_one_sample_case2.pl
4655             construct_dt_and_classify_one_sample_case3.pl
4656             construct_dt_and_classify_one_sample_case4.pl
4657              
4658             The first script is for the purely symbolic case, the second for the case that
4659             involves both numeric and symbolic features, the third for the case of purely numeric
4660             features, and the last for the case when the training data is synthetically generated
4661             by the script C.
4662              
4663             Next run the following script as it is for bulk classification of data records placed
4664             in a CSV file:
4665              
4666             classify_test_data_in_a_file.pl training4.csv test4.csv out4.csv
4667              
4668             The script first constructs a decision tree using the training data in the training
4669             file supplied by the first argument file C. The script then
4670             calculates the class label for each data record in the test data file supplied
4671             through the second argument file, C. The estimated class labels are
4672             written out to the output file which in the call shown above is C. An
4673             important thing to note here is that your test file --- in this case C ---
4674             must have a column for class labels. Obviously, in real-life situations, there will
4675             be no class labels in this column. What that is the case, you can place an empty
4676             string C<""> there for each data record. This is demonstrated by the following call:
4677              
4678             classify_test_data_in_a_file.pl training4.csv test4_no_class_labels.csv out4.csv
4679              
4680             The following script in the C directory
4681              
4682             classify_by_asking_questions.pl
4683              
4684             shows how you can use a decision-tree classifier interactively. In this mode, you
4685             first construct the decision tree from the training data and then the user is
4686             prompted for answers to the feature tests at the nodes of the tree.
4687              
4688             If your training data has a feature whose values span a large range and, at the same
4689             time, are characterized by a heavy-tail distribution, you should look at the script
4690              
4691             construct_dt_for_heavytailed.pl
4692              
4693             to see how to use the option C in the call to the
4694             constructor. This option was introduced in Version 2.22 for dealing with such
4695             features. If you do not set this option, the module will use the default value of
4696             500 for the number of points at which to sample the value range for such a feature.
4697              
4698             The C directory also contains the following scripts:
4699              
4700             generate_training_data_numeric.pl
4701             generate_training_data_symbolic.pl
4702              
4703             that show how you can use the module to generate synthetic training. Synthetic
4704             training is generated according to the specifications laid out in a parameter file.
4705             There are constraints on how the information is laid out in a parameter file. See
4706             the files C and C in the C directory
4707             for how to structure these files.
4708              
4709             The C directory of Versions 2.1 and higher of the module also contains the
4710             following two scripts:
4711              
4712             evaluate_training_data1.pl
4713             evaluate_training_data2.pl
4714              
4715             that illustrate how the Perl class C can be used to evaluate the
4716             quality of your training data (as long as it resides in a `C<.csv>' file.) This new
4717             class is a subclass of the C class in the module file. See the README
4718             in the C directory for further information regarding these two scripts.
4719              
4720             The C directory of Versions 2.31 and higher of the module contains the
4721             following three scripts:
4722              
4723             introspection_in_a_loop_interactive.pl
4724              
4725             introspection_show_training_samples_at_all_nodes_direct_influence.pl
4726              
4727             introspection_show_training_samples_to_nodes_influence_propagation.pl
4728              
4729             The first script illustrates how to use the C class of the module
4730             interactively for generating explanations for the classification decisions made at
4731             the nodes of the decision tree. In the interactive session you are first asked for
4732             the node number you are interested in. Subsequently, you are asked for whether or
4733             not you are interested in specific questions that the introspector can provide
4734             answers for. The second script generates a tabular display that shows for each node
4735             of the decision tree a list of the training samples that fall directly in the portion
4736             of the feature space assigned that node. (As mentioned elsewhere in this
4737             documentation, when this list is empty for a node, that means the node is a result of
4738             the generalization achieved by probabilistic modeling of the data. Note that this
4739             module constructs a decision tree NOT by partitioning the set of training samples,
4740             BUT by partitioning the domains of the probability density functions.) The third
4741             script listed above also generates a tabular display, but one that shows how the
4742             influence of each training sample propagates in the tree. This display first shows
4743             the list of nodes that are affected directly by the data in a training sample. This
4744             list is followed by an indented display of the nodes that are affected indirectly by
4745             the training sample. A training sample affects a node indirectly if the node is a
4746             descendant of one of the nodes affected directly.
4747              
4748             The latest addition to the Examples directory is the script:
4749              
4750             get_indexes_associated_with_fields.py
4751              
4752             As to why you may find this script useful, note that large database files may have
4753             hundreds of fields and it is not always easy to figure out what numerical index is
4754             associated with a given field. At the same time, the constructor of the DecisionTree
4755             module requires that the field that holds the class label and the fields that contain
4756             the feature values be specified by their numerical zero-based indexes. If you have a
4757             large database and you are faced with this problem, you can run this script to see
4758             the zero-based numerical index values associated with the different columns of your
4759             CSV file.
4760              
4761              
4762             =head1 THE C DIRECTORY
4763              
4764             The C directory contains the following scripts:
4765              
4766             bagging_for_classifying_one_test_sample.pl
4767            
4768             bagging_for_bulk_classification.pl
4769              
4770             As the names of the scripts imply, the first shows how to call the different methods
4771             of the C class for classifying a single test sample. When
4772             you are classifying a single test sample, you can also see how each bag is
4773             classifying the test sample. You can, for example, display the training data used in
4774             each bag, the decision tree constructed for each bag, etc.
4775              
4776             The second script is for the case when you place all of the test samples in a single
4777             file. The demonstration script displays for each test sample a single aggregate
4778             classification decision that is obtained through majority voting by all the decision
4779             trees.
4780              
4781              
4782             =head1 THE C DIRECTORY
4783              
4784             The C subdirectory in the main installation directory contains the
4785             following three scripts:
4786              
4787             boosting_for_classifying_one_test_sample_1.pl
4788              
4789             boosting_for_classifying_one_test_sample_2.pl
4790              
4791             boosting_for_bulk_classification.pl
4792              
4793             As the names of the first two scripts imply, these show how to call the different
4794             methods of the C class for classifying a single test sample.
4795             When you are classifying a single test sample, you can see how each stage of the
4796             cascade of decision trees is classifying the test sample. You can also view each
4797             decision tree separately and also see the trust factor associated with the tree.
4798              
4799             The third script is for the case when you place all of the test samples in a single
4800             file. The demonstration script outputs for each test sample a single aggregate
4801             classification decision that is obtained through trust-factor weighted majority
4802             voting by all the decision trees.
4803              
4804             =head1 THE C DIRECTORY
4805              
4806             The C directory shows example scripts that you can use to
4807             become more familiar with the C class for solving
4808             needle-in-a-haystack and big-data data classification problems. These scripts are:
4809              
4810             randomized_trees_for_classifying_one_test_sample_1.pl
4811              
4812             randomized_trees_for_classifying_one_test_sample_2.pl
4813              
4814             classify_database_records.pl
4815              
4816             The first script shows the constructor options to use for solving a
4817             needle-in-a-haystack problem --- that is, a problem in which a vast majority of the
4818             training data belongs to just one class. The second script shows the constructor
4819             options for using randomized decision trees for the case when you have access to a
4820             very large database of training samples and you'd like to construct an ensemble of
4821             decision trees using training samples pulled randomly from the training database.
4822             The last script illustrates how you can evaluate the classification power of an
4823             ensemble of decision trees as constructed by C by classifying
4824             a large number of test samples extracted randomly from the training database.
4825              
4826              
4827             =head1 THE C DIRECTORY
4828              
4829             The C subdirectory in the main installation directory shows
4830             example scripts that you can use to become familiar with regression trees and how
4831             they can be used for nonlinear regression. If you are new to the concept of
4832             regression trees, start by executing the following scripts without changing them and
4833             see what sort of output is produced by them:
4834              
4835             regression4.pl
4836              
4837             regression5.pl
4838              
4839             regression6.pl
4840              
4841             regression8.pl
4842              
4843             The C script involves only one predictor variable and one dependent
4844             variable. The training data for this exercise is drawn from the file C.
4845             This data file contains strongly nonlinear data. When you run the script
4846             C, you will see how much better the result from tree regression is
4847             compared to what you can get with linear regression.
4848              
4849             The C script is essentially the same as the previous script except
4850             for the fact that the training datafile used in this case, C, consists
4851             of three noisy segments, as opposed to just two in the previous case.
4852              
4853             The script C deals with the case when we have two predictor variables
4854             and one dependent variable. You can think of the data as consisting of noisy height
4855             values over an C<(x1,x2)> plane. The data used in this script is drawn from the csv
4856             file C.
4857              
4858             Finally, the script C shows how you can carry out bulk prediction for
4859             all your test data records in a disk file. The script writes all the calculated
4860             predictions into another disk file whose name is derived from the name of the test
4861             data file.
4862              
4863              
4864             =head1 EXPORT
4865              
4866             None by design.
4867              
4868             =head1 BUGS
4869              
4870             Please notify the author if you encounter any bugs. When sending email, please place
4871             the string 'DecisionTree' in the subject line.
4872              
4873             =head1 INSTALLATION
4874              
4875             Download the archive from CPAN in any directory of your choice. Unpack the archive
4876             with a command that on a Linux machine would look like:
4877              
4878             tar zxvf Algorithm-DecisionTree-3.43.tar.gz
4879              
4880             This will create an installation directory for you whose name will be
4881             C. Enter this directory and execute the following
4882             commands for a standard install of the module if you have root privileges:
4883              
4884             perl Makefile.PL
4885             make
4886             make test
4887             sudo make install
4888              
4889             If you do not have root privileges, you can carry out a non-standard install the
4890             module in any directory of your choice by:
4891              
4892             perl Makefile.PL prefix=/some/other/directory
4893             make
4894             make test
4895             make install
4896              
4897             With a non-standard install, you may also have to set your PERL5LIB environment
4898             variable so that this module can find the required other modules. How you do that
4899             would depend on what platform you are working on. In order to install this module in
4900             a Linux machine on which I use tcsh for the shell, I set the PERL5LIB environment
4901             variable by
4902              
4903             setenv PERL5LIB /some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
4904              
4905             If I used bash, I'd need to declare:
4906              
4907             export PERL5LIB=/some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
4908              
4909             Another option for a non-standard install would be to place at the top of your own
4910             scripts the following invocation:
4911              
4912             BEGIN {
4913             unshift @INC, "/some/other/directory/blib/lib";
4914             }
4915              
4916              
4917             =head1 THANKS
4918              
4919             I wish to thank many users of this module for their feedback. Many of the
4920             improvements I have made to the module over the years are a result of the feedback
4921             received.
4922              
4923             I thank Slaven Rezic for pointing out that the module worked with Perl 5.14.x. For
4924             Version 2.22, I had set the required version of Perl to 5.18.0 since that's what I
4925             used for testing the module. Slaven's feedback in the form of the Bug report
4926             C<#96547> resulted in Version 2.23 of the module. Version 2.25 further downshifts
4927             the required version of Perl to 5.10.
4928              
4929             On the basis of the report posted by Slaven at C regarding Version 2.27,
4930             I am removing the Perl version restriction altogether from Version 2.30. Thanks
4931             Slaven!
4932              
4933             Version 3.43 was prompted by a bug report sent by Jan Trukenmüller regarding a
4934             missing backslash in a regex in the C function. Thanks, Jan!
4935              
4936             =head1 AUTHOR
4937              
4938             The author, Avinash Kak, recently finished a 17-year long "Objects Trilogy Project"
4939             with the publication of the book I by John-Wiley. If
4940             interested, visit his web page at Purdue to find out what this project was all
4941             about. You might like I especially if you enjoyed reading
4942             Harry Potter as a kid (or even as an adult, for that matter).
4943              
4944             If you send email regarding this module, please place the string "DecisionTree" in
4945             your subject line to get past my spam filter. Avi Kak's email address is
4946             C
4947              
4948             =head1 COPYRIGHT
4949              
4950             This library is free software; you can redistribute it and/or modify it under the
4951             same terms as Perl itself.
4952              
4953             Copyright 2017 Avinash Kak
4954              
4955             =cut
4956